[RFC v2 0/5] vfio/type1: Add support for valid iova list management

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC v2 0/5] vfio/type1: Add support for valid iova list management
@ 2018-01-12 16:45 Shameer Kolothum
  2018-01-12 16:45 ` [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
                   ` (4 more replies)
  0 siblings, 5 replies; 21+ messages in thread
From: Shameer Kolothum @ 2018-01-12 16:45 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This series introduces an iova list associated with a vfio 
iommu. The list is kept updated taking care of iommu apertures,
and reserved regions. Also this series adds checks for any conflict
with existing dma mappings whenever a new device group is attached to
the domain.

User-space can retrieve valid iova ranges using VFIO_IOMMU_GET_INFO
ioctl capability chains. Any dma map request outside the valid iova
range will be rejected.

RFC v1 --> v2
 Addressed comments from Alex:
-Introduced IOVA list management and added checks for conflicts with 
 existing dma map entries during attach/detach.

Shameer Kolothum (5):
  vfio/type1: Introduce iova list and add iommu aperture validity check
  vfio/type1: Check reserve region conflict and update iova list
  vfio/type1: check dma map request is within a valid iova range
  vfio/type1: Add IOVA range capability support
  vfio/type1: remove duplicate retrieval of reserved regions.

 drivers/vfio/vfio_iommu_type1.c | 466 +++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/vfio.h       |  23 ++
 2 files changed, 478 insertions(+), 11 deletions(-)

-- 
1.9.1

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check
  2018-01-12 16:45 [RFC v2 0/5] vfio/type1: Add support for valid iova list management Shameer Kolothum
@ 2018-01-12 16:45 ` Shameer Kolothum
  2018-01-18  0:04   ` Alex Williamson
  2018-01-12 16:45 ` [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list Shameer Kolothum
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 21+ messages in thread
From: Shameer Kolothum @ 2018-01-12 16:45 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This introduces an iova list that is valid for dma mappings. Make
sure the new iommu aperture window is valid and doesn't conflict
with any existing dma mappings during attach. Also update the iova
list with new aperture window during attach/detach.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29a..11cbd49 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,6 +60,7 @@
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct list_head	iova_list;
 	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
@@ -92,6 +93,12 @@ struct vfio_group {
 	struct list_head	next;
 };
 
+struct vfio_iova {
+	struct list_head	list;
+	phys_addr_t		start;
+	phys_addr_t		end;
+};
+
 /*
  * Guest RAM pinning working set or DMA target
  */
@@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 	return ret;
 }
 
+static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
+				struct list_head *head)
+{
+	struct vfio_iova *region;
+
+	region = kmalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&region->list);
+	region->start = start;
+	region->end = end;
+
+	list_add_tail(&region->list, head);
+	return 0;
+}
+
+/*
+ * Find whether a mem region overlaps with existing dma mappings
+ */
+static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
+				  phys_addr_t start, phys_addr_t end)
+{
+	struct rb_node *n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+
+		if (end < dma->iova)
+			break;
+		if (start >= dma->iova + dma->size)
+			continue;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Check the new iommu aperture is a valid one
+ */
+static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
+				     phys_addr_t start,
+				     phys_addr_t end)
+{
+	struct vfio_iova *first, *last;
+	struct list_head *iova = &iommu->iova_list;
+
+	if (list_empty(iova))
+		return 0;
+
+	/* Check if new one is outside the current aperture */
+	first = list_first_entry(iova, struct vfio_iova, list);
+	last = list_last_entry(iova, struct vfio_iova, list);
+	if ((start > last->end) || (end < first->start))
+		return -EINVAL;
+
+	/* Check for any existing dma mappings outside the new start */
+	if (start > first->start) {
+		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
+			return -EINVAL;
+	}
+
+	/* Check for any existing dma mappings outside the new end */
+	if (end < last->end) {
+		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Adjust the iommu aperture window if new aperture is a valid one
+ */
+static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
+				      phys_addr_t start,
+				      phys_addr_t end)
+{
+	struct vfio_iova *node, *next;
+	struct list_head *iova = &iommu->iova_list;
+
+	if (list_empty(iova))
+		return vfio_insert_iova(start, end, iova);
+
+	/* Adjust iova list start */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (start < node->start)
+			break;
+		if ((start >= node->start) && (start <= node->end)) {
+			node->start = start;
+			break;
+		}
+		/* Delete nodes before new start */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	/* Adjust iova list end */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (end > node->end)
+			continue;
+
+		if ((end >= node->start) && (end <= node->end)) {
+			node->end = end;
+			continue;
+		}
+		/* Delete nodes after new end */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	return 0;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	int ret;
 	bool resv_msi, msi_remap;
 	phys_addr_t resv_msi_base;
+	struct iommu_domain_geometry geo;
 
 	mutex_lock(&iommu->lock);
 
@@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_domain;
 
+	/* Get aperture info */
+	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
+
+	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
+					geo.aperture_end);
+	if (ret)
+		goto out_detach;
+
 	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
@@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 			goto out_detach;
 	}
 
+	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
+					  geo.aperture_end);
+	if (ret)
+		goto out_detach;
+
 	list_add(&domain->next, &iommu->domain_list);
 
 	mutex_unlock(&iommu->lock);
@@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
 	WARN_ON(iommu->notifier.head);
 }
 
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain decided the iova aperture window. Modify the
+ * iova aperture with the smallest window among existing domains.
+ */
+static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain;
+	struct iommu_domain_geometry geo;
+	struct vfio_iova *node;
+	phys_addr_t start = 0;
+	phys_addr_t end = (phys_addr_t)~0;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+				      &geo);
+			if (geo.aperture_start > start)
+				start = geo.aperture_start;
+			if (geo.aperture_end < end)
+				end = geo.aperture_end;
+	}
+
+	/* modify iova aperture limits */
+	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
+	node->start = start;
+	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
+	node->end = end;
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
 			kfree(domain);
+			vfio_iommu_iova_aper_refresh(iommu);
 		}
 		break;
 	}
@@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	}
 
 	INIT_LIST_HEAD(&iommu->domain_list);
+	INIT_LIST_HEAD(&iommu->iova_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
@@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
+	struct vfio_iova *iova, *iova_tmp;
 
 	if (iommu->external_domain) {
 		vfio_release_domain(iommu->external_domain, true);
@@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void *iommu_data)
 		list_del(&domain->next);
 		kfree(domain);
 	}
+
+	list_for_each_entry_safe(iova, iova_tmp,
+				 &iommu->iova_list, list) {
+		list_del(&iova->list);
+		kfree(iova);
+	}
+
 	kfree(iommu);
 }
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-12 16:45 [RFC v2 0/5] vfio/type1: Add support for valid iova list management Shameer Kolothum
  2018-01-12 16:45 ` [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
@ 2018-01-12 16:45 ` Shameer Kolothum
  2018-01-18  0:04   ` Alex Williamson
  2018-01-12 16:45 ` [RFC v2 3/5] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 21+ messages in thread
From: Shameer Kolothum @ 2018-01-12 16:45 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This retrieves the reserved regions associated with dev group and
checks for conflicts with any existing dma mappings. Also update
the iova list excluding the reserved regions.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 11cbd49..7609070 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -28,6 +28,7 @@
 #include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/iommu.h>
+#include <linux/list_sort.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/rbtree.h>
@@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 	return ret;
 }
 
+static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct iommu_resv_region *ra, *rb;
+
+	ra = container_of(a, struct iommu_resv_region, list);
+	rb = container_of(b, struct iommu_resv_region, list);
+
+	if (ra->start < rb->start)
+		return -1;
+	if (ra->start > rb->start)
+		return 1;
+	return 0;
+}
+
 static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
 				struct list_head *head)
 {
@@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
 }
 
 /*
+ * Check reserved region conflicts with existing dma mappings
+ */
+static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
+				struct list_head *resv_regions)
+{
+	struct iommu_resv_region *region;
+
+	/* Check for conflict with existing dma mappings */
+	list_for_each_entry(region, resv_regions, list) {
+		if (vfio_find_dma_overlap(iommu, region->start,
+				    region->start + region->length - 1))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
  * Adjust the iommu aperture window if new aperture is a valid one
  */
 static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
@@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
 	return 0;
 }
 
+/*
+ * Check and update iova region list in case a reserved region
+ * overlaps the iommu iova range
+ */
+static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
+					struct list_head *resv_regions)
+{
+	struct iommu_resv_region *resv;
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *n, *next;
+
+	list_for_each_entry(resv, resv_regions, list) {
+		phys_addr_t start, end;
+
+		start = resv->start;
+		end = resv->start + resv->length - 1;
+
+		list_for_each_entry_safe(n, next, iova, list) {
+			phys_addr_t a, b;
+			int ret = 0;
+
+			a = n->start;
+			b = n->end;
+			/* No overlap */
+			if ((start > b) || (end < a))
+				continue;
+			/* Split the current node and create holes */
+			if (start > a)
+				ret = vfio_insert_iova(a, start - 1, &n->list);
+			if (!ret && end < b)
+				ret = vfio_insert_iova(end + 1, b, &n->list);
+			if (ret)
+				return ret;
+
+			list_del(&n->list);
+			kfree(n);
+		}
+	}
+
+	if (list_empty(iova))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	bool resv_msi, msi_remap;
 	phys_addr_t resv_msi_base;
 	struct iommu_domain_geometry geo;
+	struct list_head group_resv_regions;
+	struct iommu_resv_region *resv, *resv_next;
 
 	mutex_lock(&iommu->lock);
 
@@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
+	INIT_LIST_HEAD(&group_resv_regions);
+	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
+	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
+
+	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
+	if (ret)
+		goto out_detach;
+
 	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
@@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		    d->prot == domain->prot) {
 			iommu_detach_group(domain->domain, iommu_group);
 			if (!iommu_attach_group(d->domain, iommu_group)) {
+				ret = vfio_iommu_iova_resv_adjust(iommu,
+							&group_resv_regions);
+				if (!ret)
+					goto out_domain;
+
 				list_add(&group->next, &d->group_list);
 				iommu_domain_free(domain->domain);
 				kfree(domain);
-				mutex_unlock(&iommu->lock);
-				return 0;
+				goto done;
 			}
 
 			ret = iommu_attach_group(domain->domain, iommu_group);
@@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
+	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
+	if (ret)
+		goto out_detach;
+
 	list_add(&domain->next, &iommu->domain_list);
 
+done:
+	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
+		kfree(resv);
 	mutex_unlock(&iommu->lock);
 
 	return 0;
@@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	iommu_detach_group(domain->domain, iommu_group);
 out_domain:
 	iommu_domain_free(domain->domain);
+	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
+		kfree(resv);
 out_free:
 	kfree(domain);
 	kfree(group);
@@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
 	node->end = end;
 }
 
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+   list again.
+ */
+static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *d;
+	struct vfio_group *g;
+	struct vfio_iova *node, *tmp;
+	struct iommu_resv_region *resv, *resv_next;
+	struct list_head resv_regions;
+	phys_addr_t start, end;
+
+	INIT_LIST_HEAD(&resv_regions);
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		list_for_each_entry(g, &d->group_list, next)
+			iommu_get_group_resv_regions(g->iommu_group,
+							 &resv_regions);
+	}
+
+	if (list_empty(&resv_regions))
+		return;
+
+	list_sort(NULL, &resv_regions, vfio_resv_cmp);
+
+	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
+	start = node->start;
+	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
+	end = node->end;
+
+	/* purge the iova list and create new one */
+	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
+		pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
+			__func__);
+		goto done;
+	}
+
+	/* adjust the iova with current reserved regions */
+	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
+		pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n",
+			__func__);
+done:
+	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
+		kfree(resv);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 		break;
 	}
 
+	vfio_iommu_iova_resv_refresh(iommu);
+
 detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 3/5] vfio/type1: check dma map request is within a valid iova range
  2018-01-12 16:45 [RFC v2 0/5] vfio/type1: Add support for valid iova list management Shameer Kolothum
  2018-01-12 16:45 ` [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
  2018-01-12 16:45 ` [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list Shameer Kolothum
@ 2018-01-12 16:45 ` Shameer Kolothum
  2018-01-23  8:38   ` Auger Eric
  2018-01-12 16:45 ` [RFC v2 4/5] vfio/type1: Add IOVA range capability support Shameer Kolothum
  2018-01-12 16:45 ` [RFC v2 5/5] vfio/type1: remove duplicate retrieval of reserved regions Shameer Kolothum
  4 siblings, 1 reply; 21+ messages in thread
From: Shameer Kolothum @ 2018-01-12 16:45 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This checks and rejects any dma map request outside valid iova
range.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 7609070..47ea490 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -971,6 +971,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	return ret;
 }
 
+/*
+ * Check dma map request is within a valid iova range
+ */
+static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+				phys_addr_t start, phys_addr_t end)
+{
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *node;
+
+	list_for_each_entry(node, iova, list) {
+		if ((start >= node->start) && (end <= node->end))
+			return true;
+	}
+
+	return false;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
@@ -1009,6 +1026,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		goto out_unlock;
 	}
 
+	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 	if (!dma) {
 		ret = -ENOMEM;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 4/5] vfio/type1: Add IOVA range capability support
  2018-01-12 16:45 [RFC v2 0/5] vfio/type1: Add support for valid iova list management Shameer Kolothum
                   ` (2 preceding siblings ...)
  2018-01-12 16:45 ` [RFC v2 3/5] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
@ 2018-01-12 16:45 ` Shameer Kolothum
  2018-01-23 11:16   ` Auger Eric
  2018-01-12 16:45 ` [RFC v2 5/5] vfio/type1: remove duplicate retrieval of reserved regions Shameer Kolothum
  4 siblings, 1 reply; 21+ messages in thread
From: Shameer Kolothum @ 2018-01-12 16:45 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This  allows the user-space to retrieve the supported IOVA
range(s), excluding any reserved regions. The implementation
is based on capability chains, added to VFIO_IOMMU_GET_INFO ioctl.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 91 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h       | 23 +++++++++++
 2 files changed, 114 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 47ea490..dc6ed85 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1893,6 +1893,67 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
 	return ret;
 }
 
+static int vfio_add_iova_cap(struct vfio_info_cap *caps, void *cap_type,
+			     size_t size)
+{
+	struct vfio_info_cap_header *header;
+	struct vfio_iommu_type1_info_cap_iova *iova_cap, *iova = cap_type;
+
+	header = vfio_info_cap_add(caps, size,
+				VFIO_IOMMU_TYPE1_INFO_CAP_IOVA, 1);
+	if (IS_ERR(header))
+		return PTR_ERR(header);
+
+	iova_cap = container_of(header,
+			struct vfio_iommu_type1_info_cap_iova, header);
+	iova_cap->nr_iovas = iova->nr_iovas;
+	memcpy(iova_cap->iova_ranges, iova->iova_ranges,
+			iova->nr_iovas * sizeof(*iova->iova_ranges));
+	return 0;
+}
+
+static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
+				struct vfio_info_cap *caps)
+{
+	struct vfio_iommu_type1_info_cap_iova *iova_cap;
+	struct vfio_iova *iova;
+	size_t size;
+	int iovas = 0, i = 0, ret;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry(iova, &iommu->iova_list, list)
+		iovas++;
+
+	if (!iovas) {
+		ret = EINVAL;
+		goto out_unlock;
+	}
+
+	size = sizeof(*iova_cap) + (iovas * sizeof(*iova_cap->iova_ranges));
+
+	iova_cap = kzalloc(size, GFP_KERNEL);
+	if (!iova_cap) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	iova_cap->nr_iovas = iovas;
+
+	list_for_each_entry(iova, &iommu->iova_list, list) {
+		iova_cap->iova_ranges[i].start = iova->start;
+		iova_cap->iova_ranges[i].end = iova->end;
+		i++;
+	}
+
+	ret = vfio_add_iova_cap(caps, iova_cap, size);
+
+	kfree(iova_cap);
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -1914,6 +1975,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 		}
 	} else if (cmd == VFIO_IOMMU_GET_INFO) {
 		struct vfio_iommu_type1_info info;
+		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+		int ret;
 
 		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 
@@ -1927,6 +1990,34 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
 
+		if (info.argsz == minsz)
+			goto done;
+
+		ret = vfio_build_iommu_iova_caps(iommu, &caps);
+		if (ret)
+			return ret;
+
+		if (caps.size) {
+			info.flags |= VFIO_IOMMU_INFO_CAPS;
+			minsz = offsetofend(struct vfio_iommu_type1_info,
+							 cap_offset);
+			if (info.argsz < sizeof(info) + caps.size) {
+				info.argsz = sizeof(info) + caps.size;
+				info.cap_offset = 0;
+			} else {
+				vfio_info_cap_shift(&caps, sizeof(info));
+				if (copy_to_user((void __user *)arg +
+						sizeof(info), caps.buf,
+						caps.size)) {
+					kfree(caps.buf);
+					return -EFAULT;
+				}
+				info.cap_offset = sizeof(info);
+			}
+
+			kfree(caps.buf);
+		}
+done:
 		return copy_to_user((void __user *)arg, &info, minsz) ?
 			-EFAULT : 0;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e3301db..8671448 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -517,7 +517,30 @@ struct vfio_iommu_type1_info {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
+#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
 	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
+	__u32   cap_offset;	/* Offset within info struct of first cap */
+};
+
+/*
+ * The IOVA capability allows to report the valid IOVA range(s)
+ * excluding any reserved regions associated with dev group. Any dma
+ * map attempt outside the valid iova range will return error.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA  1
+
+struct vfio_iova_range {
+	__u64	start;
+	__u64	end;
+};
+
+struct vfio_iommu_type1_info_cap_iova {
+	struct vfio_info_cap_header header;
+	__u32	nr_iovas;
+	__u32	reserved;
+	struct vfio_iova_range iova_ranges[];
 };
 
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 5/5] vfio/type1: remove duplicate retrieval of reserved regions.
  2018-01-12 16:45 [RFC v2 0/5] vfio/type1: Add support for valid iova list management Shameer Kolothum
                   ` (3 preceding siblings ...)
  2018-01-12 16:45 ` [RFC v2 4/5] vfio/type1: Add IOVA range capability support Shameer Kolothum
@ 2018-01-12 16:45 ` Shameer Kolothum
  4 siblings, 0 replies; 21+ messages in thread
From: Shameer Kolothum @ 2018-01-12 16:45 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

As we now already have the reserved regions list, just pass that into
vfio_iommu_has_sw_msi() fn.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index dc6ed85..b5cf21d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1193,15 +1193,13 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
 	return NULL;
 }
 
-static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
+static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
+						phys_addr_t *base)
 {
-	struct list_head group_resv_regions;
-	struct iommu_resv_region *region, *next;
+	struct iommu_resv_region *region;
 	bool ret = false;
 
-	INIT_LIST_HEAD(&group_resv_regions);
-	iommu_get_group_resv_regions(group, &group_resv_regions);
-	list_for_each_entry(region, &group_resv_regions, list) {
+	list_for_each_entry(region, group_resv_regions, list) {
 		/*
 		 * The presence of any 'real' MSI regions should take
 		 * precedence over the software-managed one if the
@@ -1217,8 +1215,7 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 			ret = true;
 		}
 	}
-	list_for_each_entry_safe(region, next, &group_resv_regions, list)
-		kfree(region);
+
 	return ret;
 }
 
@@ -1514,7 +1511,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
-	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
+	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
 	list_add(&group->next, &domain->group_list);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check
  2018-01-12 16:45 ` [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
@ 2018-01-18  0:04   ` Alex Williamson
  2018-01-19  9:47     ` Shameerali Kolothum Thodi
  2018-01-23  8:25     ` Auger Eric
  0 siblings, 2 replies; 21+ messages in thread
From: Alex Williamson @ 2018-01-18  0:04 UTC (permalink / raw)
  To: Shameer Kolothum
  Cc: eric.auger, pmorel, kvm, linux-kernel, linuxarm, john.garry, xuwei5

On Fri, 12 Jan 2018 16:45:27 +0000
Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> This introduces an iova list that is valid for dma mappings. Make
> sure the new iommu aperture window is valid and doesn't conflict
> with any existing dma mappings during attach. Also update the iova
> list with new aperture window during attach/detach.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 177 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29a..11cbd49 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -60,6 +60,7 @@
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct list_head	iova_list;
>  	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
> @@ -92,6 +93,12 @@ struct vfio_group {
>  	struct list_head	next;
>  };
>  
> +struct vfio_iova {
> +	struct list_head	list;
> +	phys_addr_t		start;
> +	phys_addr_t		end;
> +};

dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA
addresses, why are we using phys_addr_t?

> +
>  /*
>   * Guest RAM pinning working set or DMA target
>   */
> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>  	return ret;
>  }
>  
> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> +				struct list_head *head)
> +{
> +	struct vfio_iova *region;
> +
> +	region = kmalloc(sizeof(*region), GFP_KERNEL);
> +	if (!region)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&region->list);
> +	region->start = start;
> +	region->end = end;
> +
> +	list_add_tail(&region->list, head);
> +	return 0;
> +}

As I'm reading through this series, I'm learning that there are a lot
of assumptions and subtle details that should be documented.  For
instance, the IOMMU API only provides a single geometry and we build
upon that here as this patch creates a list, but there's only a single
entry for now.  The following patches carve that single iova range into
pieces and somewhat subtly use the list_head passed to keep the list
sorted, allowing the first/last_entry tricks used throughout.  Subtle
interfaces are prone to bugs.

> +
> +/*
> + * Find whether a mem region overlaps with existing dma mappings
> + */
> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
> +				  phys_addr_t start, phys_addr_t end)
> +{
> +	struct rb_node *n = rb_first(&iommu->dma_list);
> +
> +	for (; n; n = rb_next(n)) {
> +		struct vfio_dma *dma;
> +
> +		dma = rb_entry(n, struct vfio_dma, node);
> +
> +		if (end < dma->iova)
> +			break;
> +		if (start >= dma->iova + dma->size)
> +			continue;
> +		return true;
> +	}
> +
> +	return false;
> +}

Why do we need this in addition to the existing vfio_find_dma()?  Why
doesn't this use the tree structure of the dma_list?

> +
> +/*
> + * Check the new iommu aperture is a valid one
> + */
> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
> +				     phys_addr_t start,
> +				     phys_addr_t end)
> +{
> +	struct vfio_iova *first, *last;
> +	struct list_head *iova = &iommu->iova_list;
> +
> +	if (list_empty(iova))
> +		return 0;
> +
> +	/* Check if new one is outside the current aperture */

"Disjoint sets"

> +	first = list_first_entry(iova, struct vfio_iova, list);
> +	last = list_last_entry(iova, struct vfio_iova, list);
> +	if ((start > last->end) || (end < first->start))
> +		return -EINVAL;
> +
> +	/* Check for any existing dma mappings outside the new start */
> +	if (start > first->start) {
> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
> +			return -EINVAL;
> +	}
> +
> +	/* Check for any existing dma mappings outside the new end */
> +	if (end < last->end) {
> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}

I think this returns an int because you want to use it for the return
value below, but it really seems like a bool question, ie. does this
aperture conflict with existing mappings.  Additionally, the aperture
is valid, it was provided to us by the IOMMU API, the question is
whether it conflicts.  Please also name consistently to the other
functions in this patch, vfio_iommu_aper_xxxx().

> +
> +/*
> + * Adjust the iommu aperture window if new aperture is a valid one
> + */
> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> +				      phys_addr_t start,
> +				      phys_addr_t end)

Perhaps "resize", "prune", or "shrink" to make it more clear what is
being adjusted?

> +{
> +	struct vfio_iova *node, *next;
> +	struct list_head *iova = &iommu->iova_list;
> +
> +	if (list_empty(iova))
> +		return vfio_insert_iova(start, end, iova);
> +
> +	/* Adjust iova list start */
> +	list_for_each_entry_safe(node, next, iova, list) {
> +		if (start < node->start)
> +			break;
> +		if ((start >= node->start) && (start <= node->end)) {

start == node->end results in a zero sized node.  s/<=/</

> +			node->start = start;
> +			break;
> +		}
> +		/* Delete nodes before new start */
> +		list_del(&node->list);
> +		kfree(node);
> +	}
> +
> +	/* Adjust iova list end */
> +	list_for_each_entry_safe(node, next, iova, list) {
> +		if (end > node->end)
> +			continue;
> +
> +		if ((end >= node->start) && (end <= node->end)) {

end == node->start results in a zero sized node.  s/>=/>/

> +			node->end = end;
> +			continue;
> +		}
> +		/* Delete nodes after new end */
> +		list_del(&node->list);
> +		kfree(node);
> +	}
> +
> +	return 0;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	int ret;
>  	bool resv_msi, msi_remap;
>  	phys_addr_t resv_msi_base;
> +	struct iommu_domain_geometry geo;
>  
>  	mutex_lock(&iommu->lock);
>  
> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_domain;
>  
> +	/* Get aperture info */
> +	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
> +
> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
> +					geo.aperture_end);
> +	if (ret)
> +		goto out_detach;
> +
>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>  
>  	INIT_LIST_HEAD(&domain->group_list);
> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  			goto out_detach;
>  	}
>  
> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
> +					  geo.aperture_end);
> +	if (ret)
> +		goto out_detach;
> +
>  	list_add(&domain->next, &iommu->domain_list);
>  
>  	mutex_unlock(&iommu->lock);
> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
>  	WARN_ON(iommu->notifier.head);
>  }
>  
> +/*
> + * Called when a domain is removed in detach. It is possible that
> + * the removed domain decided the iova aperture window. Modify the
> + * iova aperture with the smallest window among existing domains.
> + */
> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *domain;
> +	struct iommu_domain_geometry geo;
> +	struct vfio_iova *node;
> +	phys_addr_t start = 0;
> +	phys_addr_t end = (phys_addr_t)~0;
> +
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
> +				      &geo);
> +			if (geo.aperture_start > start)
> +				start = geo.aperture_start;
> +			if (geo.aperture_end < end)
> +				end = geo.aperture_end;
> +	}
> +
> +	/* modify iova aperture limits */
> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> +	node->start = start;
> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> +	node->end = end;

We can do this because the new aperture is the same or bigger than the
current aperture, never smaller.  That's not fully obvious and should
be noted in the comment.  Perhaps this function should be "expand"
rather than "refresh".

> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  			iommu_domain_free(domain->domain);
>  			list_del(&domain->next);
>  			kfree(domain);
> +			vfio_iommu_iova_aper_refresh(iommu);
>  		}
>  		break;
>  	}
> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	}
>  
>  	INIT_LIST_HEAD(&iommu->domain_list);
> +	INIT_LIST_HEAD(&iommu->iova_list);
>  	iommu->dma_list = RB_ROOT;
>  	mutex_init(&iommu->lock);
>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_domain *domain, *domain_tmp;
> +	struct vfio_iova *iova, *iova_tmp;
>  
>  	if (iommu->external_domain) {
>  		vfio_release_domain(iommu->external_domain, true);
> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void *iommu_data)
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}
> +
> +	list_for_each_entry_safe(iova, iova_tmp,
> +				 &iommu->iova_list, list) {
> +		list_del(&iova->list);
> +		kfree(iova);
> +	}
> +
>  	kfree(iommu);
>  }
>  

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-12 16:45 ` [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list Shameer Kolothum
@ 2018-01-18  0:04   ` Alex Williamson
  2018-01-19  9:48     ` Shameerali Kolothum Thodi
  2018-01-23  8:32     ` Auger Eric
  0 siblings, 2 replies; 21+ messages in thread
From: Alex Williamson @ 2018-01-18  0:04 UTC (permalink / raw)
  To: Shameer Kolothum
  Cc: eric.auger, pmorel, kvm, linux-kernel, linuxarm, john.garry, xuwei5

On Fri, 12 Jan 2018 16:45:28 +0000
Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> This retrieves the reserved regions associated with dev group and
> checks for conflicts with any existing dma mappings. Also update
> the iova list excluding the reserved regions.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 159 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 11cbd49..7609070 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -28,6 +28,7 @@
>  #include <linux/device.h>
>  #include <linux/fs.h>
>  #include <linux/iommu.h>
> +#include <linux/list_sort.h>
>  #include <linux/module.h>
>  #include <linux/mm.h>
>  #include <linux/rbtree.h>
> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>  	return ret;
>  }
>  

/* list_sort helper */

> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> +	struct iommu_resv_region *ra, *rb;
> +
> +	ra = container_of(a, struct iommu_resv_region, list);
> +	rb = container_of(b, struct iommu_resv_region, list);
> +
> +	if (ra->start < rb->start)
> +		return -1;
> +	if (ra->start > rb->start)
> +		return 1;
> +	return 0;
> +}
> +
>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>  				struct list_head *head)
>  {
> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>  }
>  
>  /*
> + * Check reserved region conflicts with existing dma mappings
> + */
> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
> +				struct list_head *resv_regions)
> +{
> +	struct iommu_resv_region *region;
> +
> +	/* Check for conflict with existing dma mappings */
> +	list_for_each_entry(region, resv_regions, list) {
> +		if (vfio_find_dma_overlap(iommu, region->start,
> +				    region->start + region->length - 1))
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}

This basically does the same test as vfio_iommu_valid_aperture but
properly names it a conflict test.  Please be consistent.  Should this
also return bool, "conflict" is a yes/no answer.

> +
> +/*
>   * Adjust the iommu aperture window if new aperture is a valid one
>   */
>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>  	return 0;
>  }
>  
> +/*
> + * Check and update iova region list in case a reserved region
> + * overlaps the iommu iova range
> + */
> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
> +					struct list_head *resv_regions)

"resv_region" in previous function, just "resv" here, use consistent
names.  Also, what are we adjusting.  Maybe "exclude" is a better term.

> +{
> +	struct iommu_resv_region *resv;
> +	struct list_head *iova = &iommu->iova_list;
> +	struct vfio_iova *n, *next;
> +
> +	list_for_each_entry(resv, resv_regions, list) {
> +		phys_addr_t start, end;
> +
> +		start = resv->start;
> +		end = resv->start + resv->length - 1;
> +
> +		list_for_each_entry_safe(n, next, iova, list) {
> +			phys_addr_t a, b;
> +			int ret = 0;
> +
> +			a = n->start;
> +			b = n->end;

'a' and 'b' variables actually make this incredibly confusing.  Use
better variable names or just drop them entirely, it's much easier to
follow as n->start & n->end.

> +			/* No overlap */
> +			if ((start > b) || (end < a))
> +				continue;
> +			/* Split the current node and create holes */
> +			if (start > a)
> +				ret = vfio_insert_iova(a, start - 1, &n->list);
> +			if (!ret && end < b)
> +				ret = vfio_insert_iova(end + 1, b, &n->list);
> +			if (ret)
> +				return ret;
> +
> +			list_del(&n->list);

This is trickier than it appears and deserves some explanation.  AIUI,
we're actually inserting duplicate entries for the remainder at the
start of the range and then at the end of the range (and the order is
important here because we're inserting each before the current node),
and then we delete the current node.  So the iova_list is kept sorted
through this process, though temporarily includes some bogus, unordered
sub-sets.

> +			kfree(n);
> +		}
> +	}
> +
> +	if (list_empty(iova))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	bool resv_msi, msi_remap;
>  	phys_addr_t resv_msi_base;
>  	struct iommu_domain_geometry geo;
> +	struct list_head group_resv_regions;
> +	struct iommu_resv_region *resv, *resv_next;
>  
>  	mutex_lock(&iommu->lock);
>  
> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +	INIT_LIST_HEAD(&group_resv_regions);
> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> +
> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
> +	if (ret)
> +		goto out_detach;
> +
>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>  
>  	INIT_LIST_HEAD(&domain->group_list);
> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		    d->prot == domain->prot) {
>  			iommu_detach_group(domain->domain, iommu_group);
>  			if (!iommu_attach_group(d->domain, iommu_group)) {
> +				ret = vfio_iommu_iova_resv_adjust(iommu,
> +							&group_resv_regions);
> +				if (!ret)
> +					goto out_domain;

The above function is not without side effects if it fails, it's
altered the iova_list.  It needs to be valid for the remaining domains
if we're going to continue.

> +
>  				list_add(&group->next, &d->group_list);
>  				iommu_domain_free(domain->domain);
>  				kfree(domain);
> -				mutex_unlock(&iommu->lock);
> -				return 0;
> +				goto done;
>  			}
>  
>  			ret = iommu_attach_group(domain->domain, iommu_group);
> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
> +	if (ret)
> +		goto out_detach;

Can't we process the reserved regions once before we get here rather
than have two separate call points that do the same thing?  In order to
roll back from errors above, it seems like we need to copy iova_list
and work on the copy, installing it and deleting the original only on
success.

> +
>  	list_add(&domain->next, &iommu->domain_list);
>  
> +done:
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
>  	mutex_unlock(&iommu->lock);
>  
>  	return 0;
> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	iommu_detach_group(domain->domain, iommu_group);
>  out_domain:
>  	iommu_domain_free(domain->domain);
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
>  out_free:
>  	kfree(domain);
>  	kfree(group);
> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>  	node->end = end;
>  }
>  
> +/*
> + * Called when a group is detached. The reserved regions for that
> + * group can be part of valid iova now. But since reserved regions
> + * may be duplicated among groups, populate the iova valid regions
> +   list again.
> + */
> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *d;
> +	struct vfio_group *g;
> +	struct vfio_iova *node, *tmp;
> +	struct iommu_resv_region *resv, *resv_next;
> +	struct list_head resv_regions;
> +	phys_addr_t start, end;
> +
> +	INIT_LIST_HEAD(&resv_regions);
> +
> +	list_for_each_entry(d, &iommu->domain_list, next) {
> +		list_for_each_entry(g, &d->group_list, next)
> +			iommu_get_group_resv_regions(g->iommu_group,
> +							 &resv_regions);
> +	}
> +
> +	if (list_empty(&resv_regions))
> +		return;
> +
> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
> +
> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> +	start = node->start;
> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> +	end = node->end;

list_sort() only sorts based on ->start, we added reserved regions for
all our groups to one list, we potentially have multiple entries with
the same ->start.  How can we be sure that the last one in the list
actually has the largest ->end value?

> +
> +	/* purge the iova list and create new one */
> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
> +		list_del(&node->list);
> +		kfree(node);
> +	}
> +
> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
> +			__func__);

Map requests "will" fail.  Is this the right error strategy?  Detaching
a group cannot fail.  Aren't we better off leaving the iova_list we had
in place?  If we cannot expand the iova aperture when a group is
removed, a user can continue unscathed.

> +		goto done;
> +	}
> +
> +	/* adjust the iova with current reserved regions */
> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
> +		pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n",
> +			__func__);

Same.

> +done:
> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
> +		kfree(resv);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		break;
>  	}
>  
> +	vfio_iommu_iova_resv_refresh(iommu);
> +
>  detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check
  2018-01-18  0:04   ` Alex Williamson
@ 2018-01-19  9:47     ` Shameerali Kolothum Thodi
  2018-01-23  8:25     ` Auger Eric
  1 sibling, 0 replies; 21+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-01-19  9:47 UTC (permalink / raw)
  To: Alex Williamson
  Cc: eric.auger, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

Hi Alex,

> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, January 18, 2018 12:05 AM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: eric.auger@redhat.com; pmorel@linux.vnet.ibm.com;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
> <xuwei5@huawei.com>
> Subject: Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu
> aperture validity check
> 
> On Fri, 12 Jan 2018 16:45:27 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
> > This introduces an iova list that is valid for dma mappings. Make
> > sure the new iommu aperture window is valid and doesn't conflict
> > with any existing dma mappings during attach. Also update the iova
> > list with new aperture window during attach/detach.
> >
> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 177
> ++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 177 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> > index e30e29a..11cbd49 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -60,6 +60,7 @@
> >
> >  struct vfio_iommu {
> >  	struct list_head	domain_list;
> > +	struct list_head	iova_list;
> >  	struct vfio_domain	*external_domain; /* domain for external user
> */
> >  	struct mutex		lock;
> >  	struct rb_root		dma_list;
> > @@ -92,6 +93,12 @@ struct vfio_group {
> >  	struct list_head	next;
> >  };
> >
> > +struct vfio_iova {
> > +	struct list_head	list;
> > +	phys_addr_t		start;
> > +	phys_addr_t		end;
> > +};
> 
> dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA
> addresses, why are we using phys_addr_t?

Ok. I will change that to dma_addr_t.

> > +
> >  /*
> >   * Guest RAM pinning working set or DMA target
> >   */
> > @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct
> iommu_group *group, phys_addr_t *base)
> >  	return ret;
> >  }
> >
> > +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> > +				struct list_head *head)
> > +{
> > +	struct vfio_iova *region;
> > +
> > +	region = kmalloc(sizeof(*region), GFP_KERNEL);
> > +	if (!region)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&region->list);
> > +	region->start = start;
> > +	region->end = end;
> > +
> > +	list_add_tail(&region->list, head);
> > +	return 0;
> > +}
> 
> As I'm reading through this series, I'm learning that there are a lot
> of assumptions and subtle details that should be documented.  For
> instance, the IOMMU API only provides a single geometry and we build
> upon that here as this patch creates a list, but there's only a single
> entry for now.  The following patches carve that single iova range into
> pieces and somewhat subtly use the list_head passed to keep the list
> sorted, allowing the first/last_entry tricks used throughout.  Subtle
> interfaces are prone to bugs.

Agree. The iova list management logic needs to be documented properly.
I will address this in next revision.

> > +
> > +/*
> > + * Find whether a mem region overlaps with existing dma mappings
> > + */
> > +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
> > +				  phys_addr_t start, phys_addr_t end)
> > +{
> > +	struct rb_node *n = rb_first(&iommu->dma_list);
> > +
> > +	for (; n; n = rb_next(n)) {
> > +		struct vfio_dma *dma;
> > +
> > +		dma = rb_entry(n, struct vfio_dma, node);
> > +
> > +		if (end < dma->iova)
> > +			break;
> > +		if (start >= dma->iova + dma->size)
> > +			continue;
> > +		return true;
> > +	}
> > +
> > +	return false;
> > +}
> 
> Why do we need this in addition to the existing vfio_find_dma()?  Why
> doesn't this use the tree structure of the dma_list?

Ok. I will take a look at the vfio_find_dma().

> > +
> > +/*
> > + * Check the new iommu aperture is a valid one
> > + */
> > +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
> > +				     phys_addr_t start,
> > +				     phys_addr_t end)
> > +{
> > +	struct vfio_iova *first, *last;
> > +	struct list_head *iova = &iommu->iova_list;
> > +
> > +	if (list_empty(iova))
> > +		return 0;
> > +
> > +	/* Check if new one is outside the current aperture */
> 
> "Disjoint sets"
> 
> > +	first = list_first_entry(iova, struct vfio_iova, list);
> > +	last = list_last_entry(iova, struct vfio_iova, list);
> > +	if ((start > last->end) || (end < first->start))
> > +		return -EINVAL;
> > +
> > +	/* Check for any existing dma mappings outside the new start */
> > +	if (start > first->start) {
> > +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
> > +			return -EINVAL;
> > +	}
> > +
> > +	/* Check for any existing dma mappings outside the new end */
> > +	if (end < last->end) {
> > +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
> > +			return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> 
> I think this returns an int because you want to use it for the return
> value below, but it really seems like a bool question, ie. does this
> aperture conflict with existing mappings.  Additionally, the aperture
> is valid, it was provided to us by the IOMMU API, the question is
> whether it conflicts.  Please also name consistently to the other
> functions in this patch, vfio_iommu_aper_xxxx().

Sure.

> > +
> > +/*
> > + * Adjust the iommu aperture window if new aperture is a valid one
> > + */
> > +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> > +				      phys_addr_t start,
> > +				      phys_addr_t end)
> 
> Perhaps "resize", "prune", or "shrink" to make it more clear what is
> being adjusted?

Ok.

> > +{
> > +	struct vfio_iova *node, *next;
> > +	struct list_head *iova = &iommu->iova_list;
> > +
> > +	if (list_empty(iova))
> > +		return vfio_insert_iova(start, end, iova);
> > +
> > +	/* Adjust iova list start */
> > +	list_for_each_entry_safe(node, next, iova, list) {
> > +		if (start < node->start)
> > +			break;
> > +		if ((start >= node->start) && (start <= node->end)) {
> 
> start == node->end results in a zero sized node.  s/<=/</

Ok.

> 
> > +			node->start = start;
> > +			break;
> > +		}
> > +		/* Delete nodes before new start */
> > +		list_del(&node->list);
> > +		kfree(node);
> > +	}
> > +
> > +	/* Adjust iova list end */
> > +	list_for_each_entry_safe(node, next, iova, list) {
> > +		if (end > node->end)
> > +			continue;
> > +
> > +		if ((end >= node->start) && (end <= node->end)) {
> 
> end == node->start results in a zero sized node.  s/>=/>/

Ok.

> > +			node->end = end;
> > +			continue;
> > +		}
> > +		/* Delete nodes after new end */
> > +		list_del(&node->list);
> > +		kfree(node);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  					 struct iommu_group *iommu_group)
> >  {
> > @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  	int ret;
> >  	bool resv_msi, msi_remap;
> >  	phys_addr_t resv_msi_base;
> > +	struct iommu_domain_geometry geo;
> >
> >  	mutex_lock(&iommu->lock);
> >
> > @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  	if (ret)
> >  		goto out_domain;
> >
> > +	/* Get aperture info */
> > +	iommu_domain_get_attr(domain->domain,
> DOMAIN_ATTR_GEOMETRY, &geo);
> > +
> > +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
> > +					geo.aperture_end);
> > +	if (ret)
> > +		goto out_detach;
> > +
> >  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
> >
> >  	INIT_LIST_HEAD(&domain->group_list);
> > @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  			goto out_detach;
> >  	}
> >
> > +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
> > +					  geo.aperture_end);
> > +	if (ret)
> > +		goto out_detach;
> > +
> >  	list_add(&domain->next, &iommu->domain_list);
> >
> >  	mutex_unlock(&iommu->lock);
> > @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct
> vfio_iommu *iommu)
> >  	WARN_ON(iommu->notifier.head);
> >  }
> >
> > +/*
> > + * Called when a domain is removed in detach. It is possible that
> > + * the removed domain decided the iova aperture window. Modify the
> > + * iova aperture with the smallest window among existing domains.
> > + */
> > +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
> > +{
> > +	struct vfio_domain *domain;
> > +	struct iommu_domain_geometry geo;
> > +	struct vfio_iova *node;
> > +	phys_addr_t start = 0;
> > +	phys_addr_t end = (phys_addr_t)~0;
> > +
> > +	list_for_each_entry(domain, &iommu->domain_list, next) {
> > +		iommu_domain_get_attr(domain->domain,
> DOMAIN_ATTR_GEOMETRY,
> > +				      &geo);
> > +			if (geo.aperture_start > start)
> > +				start = geo.aperture_start;
> > +			if (geo.aperture_end < end)
> > +				end = geo.aperture_end;
> > +	}
> > +
> > +	/* modify iova aperture limits */
> > +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> > +	node->start = start;
> > +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> > +	node->end = end;
> 
> We can do this because the new aperture is the same or bigger than the
> current aperture, never smaller.  That's not fully obvious and should
> be noted in the comment.  Perhaps this function should be "expand"
> rather than "refresh".

Ok. Based on the comments from patch#2, I will remove this aperture expand
logic as reserved region conflict handling for the expanded aperture might fail.
Looks like it is better to leave the smaller aperture so that the user can continue.

Thanks for going through this.

Shameer
 
> > +}
> > +
> >  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >  					  struct iommu_group *iommu_group)
> >  {
> > @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void
> *iommu_data,
> >  			iommu_domain_free(domain->domain);
> >  			list_del(&domain->next);
> >  			kfree(domain);
> > +			vfio_iommu_iova_aper_refresh(iommu);
> >  		}
> >  		break;
> >  	}
> > @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned
> long arg)
> >  	}
> >
> >  	INIT_LIST_HEAD(&iommu->domain_list);
> > +	INIT_LIST_HEAD(&iommu->iova_list);
> >  	iommu->dma_list = RB_ROOT;
> >  	mutex_init(&iommu->lock);
> >  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> > @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void
> *iommu_data)
> >  {
> >  	struct vfio_iommu *iommu = iommu_data;
> >  	struct vfio_domain *domain, *domain_tmp;
> > +	struct vfio_iova *iova, *iova_tmp;
> >
> >  	if (iommu->external_domain) {
> >  		vfio_release_domain(iommu->external_domain, true);
> > @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void
> *iommu_data)
> >  		list_del(&domain->next);
> >  		kfree(domain);
> >  	}
> > +
> > +	list_for_each_entry_safe(iova, iova_tmp,
> > +				 &iommu->iova_list, list) {
> > +		list_del(&iova->list);
> > +		kfree(iova);
> > +	}
> > +
> >  	kfree(iommu);
> >  }
> >

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-18  0:04   ` Alex Williamson
@ 2018-01-19  9:48     ` Shameerali Kolothum Thodi
  2018-01-19 15:45       ` Alex Williamson
  2018-01-23  8:32     ` Auger Eric
  1 sibling, 1 reply; 21+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-01-19  9:48 UTC (permalink / raw)
  To: Alex Williamson
  Cc: eric.auger, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)



> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, January 18, 2018 12:05 AM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: eric.auger@redhat.com; pmorel@linux.vnet.ibm.com;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
> <xuwei5@huawei.com>
> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update
> iova list
> 
> On Fri, 12 Jan 2018 16:45:28 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
> > This retrieves the reserved regions associated with dev group and
> > checks for conflicts with any existing dma mappings. Also update
> > the iova list excluding the reserved regions.
> >
> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 161
> +++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 159 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> > index 11cbd49..7609070 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -28,6 +28,7 @@
> >  #include <linux/device.h>
> >  #include <linux/fs.h>
> >  #include <linux/iommu.h>
> > +#include <linux/list_sort.h>
> >  #include <linux/module.h>
> >  #include <linux/mm.h>
> >  #include <linux/rbtree.h>
> > @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct
> iommu_group *group, phys_addr_t *base)
> >  	return ret;
> >  }
> >
> 
> /* list_sort helper */
> 
> > +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> > +{
> > +	struct iommu_resv_region *ra, *rb;
> > +
> > +	ra = container_of(a, struct iommu_resv_region, list);
> > +	rb = container_of(b, struct iommu_resv_region, list);
> > +
> > +	if (ra->start < rb->start)
> > +		return -1;
> > +	if (ra->start > rb->start)
> > +		return 1;
> > +	return 0;
> > +}
> > +
> >  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> >  				struct list_head *head)
> >  {
> > @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct
> vfio_iommu *iommu,
> >  }
> >
> >  /*
> > + * Check reserved region conflicts with existing dma mappings
> > + */
> > +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
> > +				struct list_head *resv_regions)
> > +{
> > +	struct iommu_resv_region *region;
> > +
> > +	/* Check for conflict with existing dma mappings */
> > +	list_for_each_entry(region, resv_regions, list) {
> > +		if (vfio_find_dma_overlap(iommu, region->start,
> > +				    region->start + region->length - 1))
> > +			return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> 
> This basically does the same test as vfio_iommu_valid_aperture but
> properly names it a conflict test.  Please be consistent.  Should this
> also return bool, "conflict" is a yes/no answer.

Ok.
 
> > +
> > +/*
> >   * Adjust the iommu aperture window if new aperture is a valid one
> >   */
> >  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> > @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct
> vfio_iommu *iommu,
> >  	return 0;
> >  }
> >
> > +/*
> > + * Check and update iova region list in case a reserved region
> > + * overlaps the iommu iova range
> > + */
> > +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
> > +					struct list_head *resv_regions)
> 
> "resv_region" in previous function, just "resv" here, use consistent
> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.

Ok.

> > +{
> > +	struct iommu_resv_region *resv;
> > +	struct list_head *iova = &iommu->iova_list;
> > +	struct vfio_iova *n, *next;
> > +
> > +	list_for_each_entry(resv, resv_regions, list) {
> > +		phys_addr_t start, end;
> > +
> > +		start = resv->start;
> > +		end = resv->start + resv->length - 1;
> > +
> > +		list_for_each_entry_safe(n, next, iova, list) {
> > +			phys_addr_t a, b;
> > +			int ret = 0;
> > +
> > +			a = n->start;
> > +			b = n->end;
> 
> 'a' and 'b' variables actually make this incredibly confusing.  Use
> better variable names or just drop them entirely, it's much easier to
> follow as n->start & n->end.

I will drop the name and go with  n->start & n->end.
 
> > +			/* No overlap */
> > +			if ((start > b) || (end < a))
> > +				continue;
> > +			/* Split the current node and create holes */
> > +			if (start > a)
> > +				ret = vfio_insert_iova(a, start - 1, &n->list);
> > +			if (!ret && end < b)
> > +				ret = vfio_insert_iova(end + 1, b, &n->list);
> > +			if (ret)
> > +				return ret;
> > +
> > +			list_del(&n->list);
> 
> This is trickier than it appears and deserves some explanation.  AIUI,
> we're actually inserting duplicate entries for the remainder at the
> start of the range and then at the end of the range (and the order is
> important here because we're inserting each before the current node),
> and then we delete the current node.  So the iova_list is kept sorted
> through this process, though temporarily includes some bogus, unordered
> sub-sets.

Yes. That understanding is correct. I will add comments to make it clear.

> > +			kfree(n);
> > +		}
> > +	}
> > +
> > +	if (list_empty(iova))
> > +		return -EINVAL;

The above is also not correct. The list cannot be empty. I think as you
said below, need to work on a copy.

> > +	return 0;
> > +}
> > +
> >  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  					 struct iommu_group *iommu_group)
> >  {
> > @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  	bool resv_msi, msi_remap;
> >  	phys_addr_t resv_msi_base;
> >  	struct iommu_domain_geometry geo;
> > +	struct list_head group_resv_regions;
> > +	struct iommu_resv_region *resv, *resv_next;
> >
> >  	mutex_lock(&iommu->lock);
> >
> > @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  	if (ret)
> >  		goto out_detach;
> >
> > +	INIT_LIST_HEAD(&group_resv_regions);
> > +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> > +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> > +
> > +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
> > +	if (ret)
> > +		goto out_detach;
> > +
> >  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
> >
> >  	INIT_LIST_HEAD(&domain->group_list);
> > @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  		    d->prot == domain->prot) {
> >  			iommu_detach_group(domain->domain,
> iommu_group);
> >  			if (!iommu_attach_group(d->domain, iommu_group)) {
> > +				ret = vfio_iommu_iova_resv_adjust(iommu,
> > +
> 	&group_resv_regions);
> > +				if (!ret)
> > +					goto out_domain;
> 
> The above function is not without side effects if it fails, it's
> altered the iova_list.  It needs to be valid for the remaining domains
> if we're going to continue.
> 
> > +
> >  				list_add(&group->next, &d->group_list);
> >  				iommu_domain_free(domain->domain);
> >  				kfree(domain);
> > -				mutex_unlock(&iommu->lock);
> > -				return 0;
> > +				goto done;
> >  			}
> >
> >  			ret = iommu_attach_group(domain->domain,
> iommu_group);
> > @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  	if (ret)
> >  		goto out_detach;
> >
> > +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
> > +	if (ret)
> > +		goto out_detach;
> 
> Can't we process the reserved regions once before we get here rather
> than have two separate call points that do the same thing?  In order to
> roll back from errors above, it seems like we need to copy iova_list
> and work on the copy, installing it and deleting the original only on
> success.

Correct. In case of error, the iova list needs to be rolled back to previous
state. Yes, it looks like have to work on a copy. I will address this in next
revision.
 
> > +
> >  	list_add(&domain->next, &iommu->domain_list);
> >
> > +done:
> > +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> > +		kfree(resv);
> >  	mutex_unlock(&iommu->lock);
> >
> >  	return 0;
> > @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  	iommu_detach_group(domain->domain, iommu_group);
> >  out_domain:
> >  	iommu_domain_free(domain->domain);
> > +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> > +		kfree(resv);
> >  out_free:
> >  	kfree(domain);
> >  	kfree(group);
> > @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct
> vfio_iommu *iommu)
> >  	node->end = end;
> >  }
> >
> > +/*
> > + * Called when a group is detached. The reserved regions for that
> > + * group can be part of valid iova now. But since reserved regions
> > + * may be duplicated among groups, populate the iova valid regions
> > +   list again.
> > + */
> > +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> > +{
> > +	struct vfio_domain *d;
> > +	struct vfio_group *g;
> > +	struct vfio_iova *node, *tmp;
> > +	struct iommu_resv_region *resv, *resv_next;
> > +	struct list_head resv_regions;
> > +	phys_addr_t start, end;
> > +
> > +	INIT_LIST_HEAD(&resv_regions);
> > +
> > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > +		list_for_each_entry(g, &d->group_list, next)
> > +			iommu_get_group_resv_regions(g->iommu_group,
> > +							 &resv_regions);
> > +	}
> > +
> > +	if (list_empty(&resv_regions))
> > +		return;
> > +
> > +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
> > +
> > +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> > +	start = node->start;
> > +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> > +	end = node->end;
> 
> list_sort() only sorts based on ->start, we added reserved regions for
> all our groups to one list, we potentially have multiple entries with
> the same ->start.  How can we be sure that the last one in the list
> actually has the largest ->end value?

Hmm.. the sorting is done on the reserved list. The start and end entries 
are of the iova list which is kept updated on _attach(). So I don't think
there is a problem here.

> > +
> > +	/* purge the iova list and create new one */
> > +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
> > +		list_del(&node->list);
> > +		kfree(node);
> > +	}
> > +
> > +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
> > +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map
> request may fail\n",
> > +			__func__);
> 
> Map requests "will" fail.  Is this the right error strategy?  Detaching
> a group cannot fail.  Aren't we better off leaving the iova_list we had
> in place?  If we cannot expand the iova aperture when a group is
> removed, a user can continue unscathed.

Ok. I think that's a better strategy rather than trying to update the iova list
here. I will remove this.

Thanks,
Shameer

> > +		goto done;
> > +	}
> > +
> > +	/* adjust the iova with current reserved regions */
> > +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
> > +		pr_warn("%s: Failed to update iova list with reserve regions.
> VFIO DMA map request may fail\n",
> > +			__func__);
> 
> Same.
> 
> > +done:
> > +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
> > +		kfree(resv);
> > +}
> > +
> >  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >  					  struct iommu_group *iommu_group)
> >  {
> > @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void
> *iommu_data,
> >  		break;
> >  	}
> >
> > +	vfio_iommu_iova_resv_refresh(iommu);
> > +
> >  detach_group_done:
> >  	mutex_unlock(&iommu->lock);
> >  }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-19  9:48     ` Shameerali Kolothum Thodi
@ 2018-01-19 15:45       ` Alex Williamson
  0 siblings, 0 replies; 21+ messages in thread
From: Alex Williamson @ 2018-01-19 15:45 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi
  Cc: eric.auger, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

On Fri, 19 Jan 2018 09:48:22 +0000
Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> wrote:
> > > +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> > > +{
> > > +	struct vfio_domain *d;
> > > +	struct vfio_group *g;
> > > +	struct vfio_iova *node, *tmp;
> > > +	struct iommu_resv_region *resv, *resv_next;
> > > +	struct list_head resv_regions;
> > > +	phys_addr_t start, end;
> > > +
> > > +	INIT_LIST_HEAD(&resv_regions);
> > > +
> > > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > > +		list_for_each_entry(g, &d->group_list, next)
> > > +			iommu_get_group_resv_regions(g->iommu_group,
> > > +							 &resv_regions);
> > > +	}
> > > +
> > > +	if (list_empty(&resv_regions))
> > > +		return;
> > > +
> > > +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
> > > +
> > > +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> > > +	start = node->start;
> > > +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> > > +	end = node->end;  
> > 
> > list_sort() only sorts based on ->start, we added reserved regions for
> > all our groups to one list, we potentially have multiple entries with
> > the same ->start.  How can we be sure that the last one in the list
> > actually has the largest ->end value?  
> 
> Hmm.. the sorting is done on the reserved list. The start and end entries 
> are of the iova list which is kept updated on _attach(). So I don't think
> there is a problem here.

Oops, yes you're right.  List confusion.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check
  2018-01-18  0:04   ` Alex Williamson
  2018-01-19  9:47     ` Shameerali Kolothum Thodi
@ 2018-01-23  8:25     ` Auger Eric
  2018-01-23 10:04       ` Shameerali Kolothum Thodi
  1 sibling, 1 reply; 21+ messages in thread
From: Auger Eric @ 2018-01-23  8:25 UTC (permalink / raw)
  To: Alex Williamson, Shameer Kolothum
  Cc: pmorel, kvm, linux-kernel, linuxarm, john.garry, xuwei5

Hi Shameer,

On 18/01/18 01:04, Alex Williamson wrote:
> On Fri, 12 Jan 2018 16:45:27 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
>> This introduces an iova list that is valid for dma mappings. Make
>> sure the new iommu aperture window is valid and doesn't conflict
>> with any existing dma mappings during attach. Also update the iova
>> list with new aperture window during attach/detach.
>>
>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 177 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index e30e29a..11cbd49 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -60,6 +60,7 @@
>>  
>>  struct vfio_iommu {
>>  	struct list_head	domain_list;
>> +	struct list_head	iova_list;
>>  	struct vfio_domain	*external_domain; /* domain for external user */
>>  	struct mutex		lock;
>>  	struct rb_root		dma_list;
>> @@ -92,6 +93,12 @@ struct vfio_group {
>>  	struct list_head	next;
>>  };
>>  
>> +struct vfio_iova {
>> +	struct list_head	list;
>> +	phys_addr_t		start;
>> +	phys_addr_t		end;
>> +};
> 
> dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA
> addresses, why are we using phys_addr_t?
> 
>> +
>>  /*
>>   * Guest RAM pinning working set or DMA target
>>   */
>> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>>  	return ret;
>>  }
>>  
>> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>> +				struct list_head *head)
>> +{
>> +	struct vfio_iova *region;
>> +
>> +	region = kmalloc(sizeof(*region), GFP_KERNEL);
>> +	if (!region)
>> +		return -ENOMEM;
>> +
>> +	INIT_LIST_HEAD(&region->list);
>> +	region->start = start;
>> +	region->end = end;
>> +
>> +	list_add_tail(&region->list, head);
>> +	return 0;
>> +}
> 
> As I'm reading through this series, I'm learning that there are a lot
> of assumptions and subtle details that should be documented.  For
> instance, the IOMMU API only provides a single geometry and we build
> upon that here as this patch creates a list, but there's only a single
> entry for now.  The following patches carve that single iova range into
> pieces and somewhat subtly use the list_head passed to keep the list
> sorted, allowing the first/last_entry tricks used throughout.  Subtle
> interfaces are prone to bugs.
> 
>> +
>> +/*
>> + * Find whether a mem region overlaps with existing dma mappings
>> + */
>> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
>> +				  phys_addr_t start, phys_addr_t end)
>> +{
>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> +	for (; n; n = rb_next(n)) {
>> +		struct vfio_dma *dma;
>> +
>> +		dma = rb_entry(n, struct vfio_dma, node);
>> +
>> +		if (end < dma->iova)
>> +			break;
>> +		if (start >= dma->iova + dma->size)
>> +			continue;
>> +		return true;
>> +	}
>> +
>> +	return false;
>> +}
> 
> Why do we need this in addition to the existing vfio_find_dma()?  Why
> doesn't this use the tree structure of the dma_list?
> 
>> +
>> +/*
>> + * Check the new iommu aperture is a valid one
>> + */
>> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>> +				     phys_addr_t start,
>> +				     phys_addr_t end)
>> +{
>> +	struct vfio_iova *first, *last;
>> +	struct list_head *iova = &iommu->iova_list;
>> +
>> +	if (list_empty(iova))
>> +		return 0;
>> +
>> +	/* Check if new one is outside the current aperture */
> 
> "Disjoint sets"
> 
>> +	first = list_first_entry(iova, struct vfio_iova, list);
>> +	last = list_last_entry(iova, struct vfio_iova, list);
>> +	if ((start > last->end) || (end < first->start))
>> +		return -EINVAL;
>> +
>> +	/* Check for any existing dma mappings outside the new start */
>> +	if (start > first->start) {
>> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
>> +			return -EINVAL;
>> +	}
>> +
>> +	/* Check for any existing dma mappings outside the new end */
>> +	if (end < last->end) {
>> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
>> +			return -EINVAL;
>> +	}
>> +
>> +	return 0;
>> +}
> 
> I think this returns an int because you want to use it for the return
> value below, but it really seems like a bool question, ie. does this
> aperture conflict with existing mappings.  Additionally, the aperture
> is valid, it was provided to us by the IOMMU API, the question is
> whether it conflicts.  Please also name consistently to the other
> functions in this patch, vfio_iommu_aper_xxxx().
> 
>> +
>> +/*
>> + * Adjust the iommu aperture window if new aperture is a valid one
>> + */
>> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>> +				      phys_addr_t start,
>> +				      phys_addr_t end)
> 
> Perhaps "resize", "prune", or "shrink" to make it more clear what is
> being adjusted?
> 
>> +{
>> +	struct vfio_iova *node, *next;
>> +	struct list_head *iova = &iommu->iova_list;
>> +
>> +	if (list_empty(iova))
>> +		return vfio_insert_iova(start, end, iova);
>> +
>> +	/* Adjust iova list start */
>> +	list_for_each_entry_safe(node, next, iova, list) {
>> +		if (start < node->start)
>> +			break;
>> +		if ((start >= node->start) && (start <= node->end)) {
> 
> start == node->end results in a zero sized node.  s/<=/</
> 
>> +			node->start = start;
>> +			break;
>> +		}
>> +		/* Delete nodes before new start */
>> +		list_del(&node->list);
>> +		kfree(node);
>> +	}
>> +
>> +	/* Adjust iova list end */
>> +	list_for_each_entry_safe(node, next, iova, list) {
>> +		if (end > node->end)
>> +			continue;
>> +
>> +		if ((end >= node->start) && (end <= node->end)) {
> 
> end == node->start results in a zero sized node.  s/>=/>/
> 
>> +			node->end = end;
>> +			continue;
>> +		}
>> +		/* Delete nodes after new end */
>> +		list_del(&node->list);
>> +		kfree(node);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  					 struct iommu_group *iommu_group)
>>  {
>> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	int ret;
>>  	bool resv_msi, msi_remap;
>>  	phys_addr_t resv_msi_base;
>> +	struct iommu_domain_geometry geo;
>>  
>>  	mutex_lock(&iommu->lock);
>>  
>> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_domain;
>>  
>> +	/* Get aperture info */
>> +	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
>> +
>> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
>> +					geo.aperture_end);
>> +	if (ret)
>> +		goto out_detach;
>> +
>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>>  
>>  	INIT_LIST_HEAD(&domain->group_list);
>> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  			goto out_detach;
>>  	}
>>  
>> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
>> +					  geo.aperture_end);
>> +	if (ret)
>> +		goto out_detach;
>> +
>>  	list_add(&domain->next, &iommu->domain_list);
>>  
>>  	mutex_unlock(&iommu->lock);
>> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
>>  	WARN_ON(iommu->notifier.head);
>>  }
>>  
>> +/*
>> + * Called when a domain is removed in detach. It is possible that
>> + * the removed domain decided the iova aperture window. Modify the
>> + * iova aperture with the smallest window among existing domains.
>> + */
>> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>> +{
>> +	struct vfio_domain *domain;
>> +	struct iommu_domain_geometry geo;
>> +	struct vfio_iova *node;
>> +	phys_addr_t start = 0;
>> +	phys_addr_t end = (phys_addr_t)~0;
>> +
>> +	list_for_each_entry(domain, &iommu->domain_list, next) {
>> +		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
>> +				      &geo);
>> +			if (geo.aperture_start > start)
>> +				start = geo.aperture_start;
>> +			if (geo.aperture_end < end)
>> +				end = geo.aperture_end;
>> +	}
>> +
>> +	/* modify iova aperture limits */
>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
>> +	node->start = start;
>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
>> +	node->end = end;
> 
> We can do this because the new aperture is the same or bigger than the
> current aperture, never smaller.  That's not fully obvious and should
> be noted in the comment.  Perhaps this function should be "expand"
> rather than "refresh".
This one is not obvious to me either:
assuming you have 2 domains, resp with aperture 1 and 2, resulting into
aperture 3. Holes are created by resv regions for instance. If you
remove domain 1, don't you get 4) instead of 2)?

1)   |------------|
 +
2) |---|    |--|       |-----|
=
3)   |-|    |--|


4) |---|    |----------------|

Thanks

Eric
> 
>> +}
>> +
>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  					  struct iommu_group *iommu_group)
>>  {
>> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  			iommu_domain_free(domain->domain);
>>  			list_del(&domain->next);
>>  			kfree(domain);
>> +			vfio_iommu_iova_aper_refresh(iommu);
>>  		}
>>  		break;
>>  	}
>> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>>  	}
>>  
>>  	INIT_LIST_HEAD(&iommu->domain_list);
>> +	INIT_LIST_HEAD(&iommu->iova_list);
>>  	iommu->dma_list = RB_ROOT;
>>  	mutex_init(&iommu->lock);
>>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
>>  {
>>  	struct vfio_iommu *iommu = iommu_data;
>>  	struct vfio_domain *domain, *domain_tmp;
>> +	struct vfio_iova *iova, *iova_tmp;
>>  
>>  	if (iommu->external_domain) {
>>  		vfio_release_domain(iommu->external_domain, true);
>> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void *iommu_data)
>>  		list_del(&domain->next);
>>  		kfree(domain);
>>  	}
>> +
>> +	list_for_each_entry_safe(iova, iova_tmp,
>> +				 &iommu->iova_list, list) {
>> +		list_del(&iova->list);
>> +		kfree(iova);
>> +	}
>> +
>>  	kfree(iommu);
>>  }
>>  
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-18  0:04   ` Alex Williamson
  2018-01-19  9:48     ` Shameerali Kolothum Thodi
@ 2018-01-23  8:32     ` Auger Eric
  2018-01-23 12:16       ` Shameerali Kolothum Thodi
  1 sibling, 1 reply; 21+ messages in thread
From: Auger Eric @ 2018-01-23  8:32 UTC (permalink / raw)
  To: Alex Williamson, Shameer Kolothum
  Cc: pmorel, kvm, linux-kernel, linuxarm, john.garry, xuwei5

Hi Shameer,

On 18/01/18 01:04, Alex Williamson wrote:
> On Fri, 12 Jan 2018 16:45:28 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
>> This retrieves the reserved regions associated with dev group and
>> checks for conflicts with any existing dma mappings. Also update
>> the iova list excluding the reserved regions.
>>
>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 159 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 11cbd49..7609070 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -28,6 +28,7 @@
>>  #include <linux/device.h>
>>  #include <linux/fs.h>
>>  #include <linux/iommu.h>
>> +#include <linux/list_sort.h>
>>  #include <linux/module.h>
>>  #include <linux/mm.h>
>>  #include <linux/rbtree.h>
>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>>  	return ret;
>>  }
>>  
> 
> /* list_sort helper */
> 
>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
>> +{
>> +	struct iommu_resv_region *ra, *rb;
>> +
>> +	ra = container_of(a, struct iommu_resv_region, list);
>> +	rb = container_of(b, struct iommu_resv_region, list);
>> +
>> +	if (ra->start < rb->start)
>> +		return -1;
>> +	if (ra->start > rb->start)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>>  				struct list_head *head)
>>  {
>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>>  }
>>  
>>  /*
>> + * Check reserved region conflicts with existing dma mappings
>> + */
>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
>> +				struct list_head *resv_regions)
>> +{
>> +	struct iommu_resv_region *region;
>> +
>> +	/* Check for conflict with existing dma mappings */
>> +	list_for_each_entry(region, resv_regions, list) {
>> +		if (vfio_find_dma_overlap(iommu, region->start,
>> +				    region->start + region->length - 1))
>> +			return -EINVAL;
>> +	}
>> +
>> +	return 0;
>> +}
> 
> This basically does the same test as vfio_iommu_valid_aperture but
> properly names it a conflict test.  Please be consistent.  Should this
> also return bool, "conflict" is a yes/no answer.
> 
>> +
>> +/*
>>   * Adjust the iommu aperture window if new aperture is a valid one
>>   */
>>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>>  	return 0;
>>  }
>>  
>> +/*
>> + * Check and update iova region list in case a reserved region
>> + * overlaps the iommu iova range
>> + */
>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
>> +					struct list_head *resv_regions)
> 
> "resv_region" in previous function, just "resv" here, use consistent
> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.
> 
>> +{
>> +	struct iommu_resv_region *resv;
>> +	struct list_head *iova = &iommu->iova_list;
>> +	struct vfio_iova *n, *next;
>> +
>> +	list_for_each_entry(resv, resv_regions, list) {
>> +		phys_addr_t start, end;
>> +
>> +		start = resv->start;
>> +		end = resv->start + resv->length - 1;
>> +
>> +		list_for_each_entry_safe(n, next, iova, list) {
>> +			phys_addr_t a, b;
>> +			int ret = 0;
>> +
>> +			a = n->start;
>> +			b = n->end;
> 
> 'a' and 'b' variables actually make this incredibly confusing.  Use
> better variable names or just drop them entirely, it's much easier to
> follow as n->start & n->end.
> 
>> +			/* No overlap */
>> +			if ((start > b) || (end < a))
>> +				continue;
>> +			/* Split the current node and create holes */
>> +			if (start > a)
>> +				ret = vfio_insert_iova(a, start - 1, &n->list);
>> +			if (!ret && end < b)
>> +				ret = vfio_insert_iova(end + 1, b, &n->list);
>> +			if (ret)
>> +				return ret;
>> +
>> +			list_del(&n->list);
> 
> This is trickier than it appears and deserves some explanation.  AIUI,
> we're actually inserting duplicate entries for the remainder at the
> start of the range and then at the end of the range (and the order is
> important here because we're inserting each before the current node),
> and then we delete the current node.  So the iova_list is kept sorted
> through this process, though temporarily includes some bogus, unordered
> sub-sets.
> 
>> +			kfree(n);
>> +		}
>> +	}
>> +
>> +	if (list_empty(iova))
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  					 struct iommu_group *iommu_group)
>>  {
>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	bool resv_msi, msi_remap;
>>  	phys_addr_t resv_msi_base;
>>  	struct iommu_domain_geometry geo;
>> +	struct list_head group_resv_regions;
>> +	struct iommu_resv_region *resv, *resv_next;
>>  
>>  	mutex_lock(&iommu->lock);
>>  
>> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_detach;
>>  
>> +	INIT_LIST_HEAD(&group_resv_regions);
>> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
iommu_get_group_resv_regions returns a sorted list (see
iommu_insert_resv_regions kerneldoc comment). You can have overlapping
regions of different types though.

Thanks

Eric
>> +
>> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
>> +	if (ret)
>> +		goto out_detach;
>> +
>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>>  
>>  	INIT_LIST_HEAD(&domain->group_list);
>> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  		    d->prot == domain->prot) {
>>  			iommu_detach_group(domain->domain, iommu_group);
>>  			if (!iommu_attach_group(d->domain, iommu_group)) {
>> +				ret = vfio_iommu_iova_resv_adjust(iommu,
>> +							&group_resv_regions);
>> +				if (!ret)
>> +					goto out_domain;
> 
> The above function is not without side effects if it fails, it's
> altered the iova_list.  It needs to be valid for the remaining domains
> if we're going to continue.
> 
>> +
>>  				list_add(&group->next, &d->group_list);
>>  				iommu_domain_free(domain->domain);
>>  				kfree(domain);
>> -				mutex_unlock(&iommu->lock);
>> -				return 0;
>> +				goto done;
>>  			}
>>  
>>  			ret = iommu_attach_group(domain->domain, iommu_group);
>> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_detach;
>>  
>> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
>> +	if (ret)
>> +		goto out_detach;
> 
> Can't we process the reserved regions once before we get here rather
> than have two separate call points that do the same thing?  In order to
> roll back from errors above, it seems like we need to copy iova_list
> and work on the copy, installing it and deleting the original only on
> success.
> 
>> +
>>  	list_add(&domain->next, &iommu->domain_list);
>>  
>> +done:
>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>> +		kfree(resv);
>>  	mutex_unlock(&iommu->lock);
>>  
>>  	return 0;
>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	iommu_detach_group(domain->domain, iommu_group);
>>  out_domain:
>>  	iommu_domain_free(domain->domain);
>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>> +		kfree(resv);
>>  out_free:
>>  	kfree(domain);
>>  	kfree(group);
>> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>>  	node->end = end;
>>  }
>>  
>> +/*
>> + * Called when a group is detached. The reserved regions for that
>> + * group can be part of valid iova now. But since reserved regions
>> + * may be duplicated among groups, populate the iova valid regions
>> +   list again.
>> + */
>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
>> +{
>> +	struct vfio_domain *d;
>> +	struct vfio_group *g;
>> +	struct vfio_iova *node, *tmp;
>> +	struct iommu_resv_region *resv, *resv_next;
>> +	struct list_head resv_regions;
>> +	phys_addr_t start, end;
>> +
>> +	INIT_LIST_HEAD(&resv_regions);
>> +
>> +	list_for_each_entry(d, &iommu->domain_list, next) {
>> +		list_for_each_entry(g, &d->group_list, next)
>> +			iommu_get_group_resv_regions(g->iommu_group,
>> +							 &resv_regions);
>> +	}
>> +
>> +	if (list_empty(&resv_regions))
>> +		return;
>> +
>> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
>> +
>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
>> +	start = node->start;
>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
>> +	end = node->end;
> 
> list_sort() only sorts based on ->start, we added reserved regions for
> all our groups to one list, we potentially have multiple entries with
> the same ->start.  How can we be sure that the last one in the list
> actually has the largest ->end value?
> 
>> +
>> +	/* purge the iova list and create new one */
>> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
>> +		list_del(&node->list);
>> +		kfree(node);
>> +	}
>> +
>> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
>> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
>> +			__func__);
> 
> Map requests "will" fail.  Is this the right error strategy?  Detaching
> a group cannot fail.  Aren't we better off leaving the iova_list we had
> in place?  If we cannot expand the iova aperture when a group is
> removed, a user can continue unscathed.
> 
>> +		goto done;
>> +	}
>> +
>> +	/* adjust the iova with current reserved regions */
>> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
>> +		pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n",
>> +			__func__);
> 
> Same.
> 
>> +done:
>> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
>> +		kfree(resv);
>> +}
>> +
>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  					  struct iommu_group *iommu_group)
>>  {
>> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  		break;
>>  	}
>>  
>> +	vfio_iommu_iova_resv_refresh(iommu);
>> +
>>  detach_group_done:
>>  	mutex_unlock(&iommu->lock);
>>  }
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 3/5] vfio/type1: check dma map request is within a valid iova range
  2018-01-12 16:45 ` [RFC v2 3/5] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
@ 2018-01-23  8:38   ` Auger Eric
  0 siblings, 0 replies; 21+ messages in thread
From: Auger Eric @ 2018-01-23  8:38 UTC (permalink / raw)
  To: Shameer Kolothum, alex.williamson, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5

Hi Shameer,

On 12/01/18 17:45, Shameer Kolothum wrote:
> This checks and rejects any dma map request outside valid iova
> range.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 7609070..47ea490 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -971,6 +971,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  	return ret;
>  }
>  
> +/*
> + * Check dma map request is within a valid iova range
> + */
> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> +				phys_addr_t start, phys_addr_t end)
s/phys_addr_t/dma_addr_t here also.
> +{
> +	struct list_head *iova = &iommu->iova_list;
> +	struct vfio_iova *node;
> +
> +	list_for_each_entry(node, iova, list) {
> +		if ((start >= node->start) && (end <= node->end))
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
> @@ -1009,6 +1026,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  		goto out_unlock;
>  	}
>  
> +	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>  	if (!dma) {
>  		ret = -ENOMEM;
> 

Thanks

Eric

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check
  2018-01-23  8:25     ` Auger Eric
@ 2018-01-23 10:04       ` Shameerali Kolothum Thodi
  2018-01-23 11:20         ` Auger Eric
  0 siblings, 1 reply; 21+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-01-23 10:04 UTC (permalink / raw)
  To: Auger Eric, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

Hi Eric,

> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: Tuesday, January 23, 2018 8:25 AM
> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
> Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>
> Subject: Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu
> aperture validity check
> 
> Hi Shameer,
> 
> On 18/01/18 01:04, Alex Williamson wrote:
> > On Fri, 12 Jan 2018 16:45:27 +0000
> > Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> >
> >> This introduces an iova list that is valid for dma mappings. Make
> >> sure the new iommu aperture window is valid and doesn't conflict
> >> with any existing dma mappings during attach. Also update the iova
> >> list with new aperture window during attach/detach.
> >>
> >> Signed-off-by: Shameer Kolothum
> <shameerali.kolothum.thodi@huawei.com>
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 177
> ++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 177 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> >> index e30e29a..11cbd49 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -60,6 +60,7 @@
> >>
> >>  struct vfio_iommu {
> >>  	struct list_head	domain_list;
> >> +	struct list_head	iova_list;
> >>  	struct vfio_domain	*external_domain; /* domain for external user
> */
> >>  	struct mutex		lock;
> >>  	struct rb_root		dma_list;
> >> @@ -92,6 +93,12 @@ struct vfio_group {
> >>  	struct list_head	next;
> >>  };
> >>
> >> +struct vfio_iova {
> >> +	struct list_head	list;
> >> +	phys_addr_t		start;
> >> +	phys_addr_t		end;
> >> +};
> >
> > dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA
> > addresses, why are we using phys_addr_t?
> >
> >> +
> >>  /*
> >>   * Guest RAM pinning working set or DMA target
> >>   */
> >> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct
> iommu_group *group, phys_addr_t *base)
> >>  	return ret;
> >>  }
> >>
> >> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> >> +				struct list_head *head)
> >> +{
> >> +	struct vfio_iova *region;
> >> +
> >> +	region = kmalloc(sizeof(*region), GFP_KERNEL);
> >> +	if (!region)
> >> +		return -ENOMEM;
> >> +
> >> +	INIT_LIST_HEAD(&region->list);
> >> +	region->start = start;
> >> +	region->end = end;
> >> +
> >> +	list_add_tail(&region->list, head);
> >> +	return 0;
> >> +}
> >
> > As I'm reading through this series, I'm learning that there are a lot
> > of assumptions and subtle details that should be documented.  For
> > instance, the IOMMU API only provides a single geometry and we build
> > upon that here as this patch creates a list, but there's only a single
> > entry for now.  The following patches carve that single iova range into
> > pieces and somewhat subtly use the list_head passed to keep the list
> > sorted, allowing the first/last_entry tricks used throughout.  Subtle
> > interfaces are prone to bugs.
> >
> >> +
> >> +/*
> >> + * Find whether a mem region overlaps with existing dma mappings
> >> + */
> >> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
> >> +				  phys_addr_t start, phys_addr_t end)
> >> +{
> >> +	struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +	for (; n; n = rb_next(n)) {
> >> +		struct vfio_dma *dma;
> >> +
> >> +		dma = rb_entry(n, struct vfio_dma, node);
> >> +
> >> +		if (end < dma->iova)
> >> +			break;
> >> +		if (start >= dma->iova + dma->size)
> >> +			continue;
> >> +		return true;
> >> +	}
> >> +
> >> +	return false;
> >> +}
> >
> > Why do we need this in addition to the existing vfio_find_dma()?  Why
> > doesn't this use the tree structure of the dma_list?
> >
> >> +
> >> +/*
> >> + * Check the new iommu aperture is a valid one
> >> + */
> >> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
> >> +				     phys_addr_t start,
> >> +				     phys_addr_t end)
> >> +{
> >> +	struct vfio_iova *first, *last;
> >> +	struct list_head *iova = &iommu->iova_list;
> >> +
> >> +	if (list_empty(iova))
> >> +		return 0;
> >> +
> >> +	/* Check if new one is outside the current aperture */
> >
> > "Disjoint sets"
> >
> >> +	first = list_first_entry(iova, struct vfio_iova, list);
> >> +	last = list_last_entry(iova, struct vfio_iova, list);
> >> +	if ((start > last->end) || (end < first->start))
> >> +		return -EINVAL;
> >> +
> >> +	/* Check for any existing dma mappings outside the new start */
> >> +	if (start > first->start) {
> >> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
> >> +			return -EINVAL;
> >> +	}
> >> +
> >> +	/* Check for any existing dma mappings outside the new end */
> >> +	if (end < last->end) {
> >> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
> >> +			return -EINVAL;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >
> > I think this returns an int because you want to use it for the return
> > value below, but it really seems like a bool question, ie. does this
> > aperture conflict with existing mappings.  Additionally, the aperture
> > is valid, it was provided to us by the IOMMU API, the question is
> > whether it conflicts.  Please also name consistently to the other
> > functions in this patch, vfio_iommu_aper_xxxx().
> >
> >> +
> >> +/*
> >> + * Adjust the iommu aperture window if new aperture is a valid one
> >> + */
> >> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> >> +				      phys_addr_t start,
> >> +				      phys_addr_t end)
> >
> > Perhaps "resize", "prune", or "shrink" to make it more clear what is
> > being adjusted?
> >
> >> +{
> >> +	struct vfio_iova *node, *next;
> >> +	struct list_head *iova = &iommu->iova_list;
> >> +
> >> +	if (list_empty(iova))
> >> +		return vfio_insert_iova(start, end, iova);
> >> +
> >> +	/* Adjust iova list start */
> >> +	list_for_each_entry_safe(node, next, iova, list) {
> >> +		if (start < node->start)
> >> +			break;
> >> +		if ((start >= node->start) && (start <= node->end)) {
> >
> > start == node->end results in a zero sized node.  s/<=/</
> >
> >> +			node->start = start;
> >> +			break;
> >> +		}
> >> +		/* Delete nodes before new start */
> >> +		list_del(&node->list);
> >> +		kfree(node);
> >> +	}
> >> +
> >> +	/* Adjust iova list end */
> >> +	list_for_each_entry_safe(node, next, iova, list) {
> >> +		if (end > node->end)
> >> +			continue;
> >> +
> >> +		if ((end >= node->start) && (end <= node->end)) {
> >
> > end == node->start results in a zero sized node.  s/>=/>/
> >
> >> +			node->end = end;
> >> +			continue;
> >> +		}
> >> +		/* Delete nodes after new end */
> >> +		list_del(&node->list);
> >> +		kfree(node);
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  					 struct iommu_group *iommu_group)
> >>  {
> >> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	int ret;
> >>  	bool resv_msi, msi_remap;
> >>  	phys_addr_t resv_msi_base;
> >> +	struct iommu_domain_geometry geo;
> >>
> >>  	mutex_lock(&iommu->lock);
> >>
> >> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	if (ret)
> >>  		goto out_domain;
> >>
> >> +	/* Get aperture info */
> >> +	iommu_domain_get_attr(domain->domain,
> DOMAIN_ATTR_GEOMETRY, &geo);
> >> +
> >> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
> >> +					geo.aperture_end);
> >> +	if (ret)
> >> +		goto out_detach;
> >> +
> >>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
> >>
> >>  	INIT_LIST_HEAD(&domain->group_list);
> >> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  			goto out_detach;
> >>  	}
> >>
> >> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
> >> +					  geo.aperture_end);
> >> +	if (ret)
> >> +		goto out_detach;
> >> +
> >>  	list_add(&domain->next, &iommu->domain_list);
> >>
> >>  	mutex_unlock(&iommu->lock);
> >> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct
> vfio_iommu *iommu)
> >>  	WARN_ON(iommu->notifier.head);
> >>  }
> >>
> >> +/*
> >> + * Called when a domain is removed in detach. It is possible that
> >> + * the removed domain decided the iova aperture window. Modify the
> >> + * iova aperture with the smallest window among existing domains.
> >> + */
> >> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
> >> +{
> >> +	struct vfio_domain *domain;
> >> +	struct iommu_domain_geometry geo;
> >> +	struct vfio_iova *node;
> >> +	phys_addr_t start = 0;
> >> +	phys_addr_t end = (phys_addr_t)~0;
> >> +
> >> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> >> +		iommu_domain_get_attr(domain->domain,
> DOMAIN_ATTR_GEOMETRY,
> >> +				      &geo);
> >> +			if (geo.aperture_start > start)
> >> +				start = geo.aperture_start;
> >> +			if (geo.aperture_end < end)
> >> +				end = geo.aperture_end;
> >> +	}
> >> +
> >> +	/* modify iova aperture limits */
> >> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +	node->start = start;
> >> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +	node->end = end;
> >
> > We can do this because the new aperture is the same or bigger than the
> > current aperture, never smaller.  That's not fully obvious and should
> > be noted in the comment.  Perhaps this function should be "expand"
> > rather than "refresh".
> This one is not obvious to me either:
> assuming you have 2 domains, resp with aperture 1 and 2, resulting into
> aperture 3. Holes are created by resv regions for instance. If you
> remove domain 1, don't you get 4) instead of 2)?
> 
> 1)   |------------|
>  +
> 2) |---|    |--|       |-----|
> =
> 3)   |-|    |--|
> 
> 
> 4) |---|    |----------------|

That is true partially. But please remember that this patch is not aware of
any reserved regions yet. That is introduced in patch #2. So patch #1 and #2
together, the iova aperture might looks like 4) after this function call and once 
vfio_iommu_iova_resv_refresh() in patch #2 is done, the aperture will be
back to 2).

Hope I am clear. Please let me know.

In any case, based on comments by Alex, I will be removing this aperture/reserve
refresh functions and leave the iova list as it is when a group is detached. 

Thanks,
Shameer

> Thanks
> 
> Eric
> >
> >> +}
> >> +
> >>  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>  					  struct iommu_group *iommu_group)
> >>  {
> >> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void
> *iommu_data,
> >>  			iommu_domain_free(domain->domain);
> >>  			list_del(&domain->next);
> >>  			kfree(domain);
> >> +			vfio_iommu_iova_aper_refresh(iommu);
> >>  		}
> >>  		break;
> >>  	}
> >> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned
> long arg)
> >>  	}
> >>
> >>  	INIT_LIST_HEAD(&iommu->domain_list);
> >> +	INIT_LIST_HEAD(&iommu->iova_list);
> >>  	iommu->dma_list = RB_ROOT;
> >>  	mutex_init(&iommu->lock);
> >>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> >> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void
> *iommu_data)
> >>  {
> >>  	struct vfio_iommu *iommu = iommu_data;
> >>  	struct vfio_domain *domain, *domain_tmp;
> >> +	struct vfio_iova *iova, *iova_tmp;
> >>
> >>  	if (iommu->external_domain) {
> >>  		vfio_release_domain(iommu->external_domain, true);
> >> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void
> *iommu_data)
> >>  		list_del(&domain->next);
> >>  		kfree(domain);
> >>  	}
> >> +
> >> +	list_for_each_entry_safe(iova, iova_tmp,
> >> +				 &iommu->iova_list, list) {
> >> +		list_del(&iova->list);
> >> +		kfree(iova);
> >> +	}
> >> +
> >>  	kfree(iommu);
> >>  }
> >>
> >

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 4/5] vfio/type1: Add IOVA range capability support
  2018-01-12 16:45 ` [RFC v2 4/5] vfio/type1: Add IOVA range capability support Shameer Kolothum
@ 2018-01-23 11:16   ` Auger Eric
  2018-01-23 12:51     ` Shameerali Kolothum Thodi
  0 siblings, 1 reply; 21+ messages in thread
From: Auger Eric @ 2018-01-23 11:16 UTC (permalink / raw)
  To: Shameer Kolothum, alex.williamson, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5

Hi Shameer,

On 12/01/18 17:45, Shameer Kolothum wrote:
> This  allows the user-space to retrieve the supported IOVA
> range(s), excluding any reserved regions. The implementation
> is based on capability chains, added to VFIO_IOMMU_GET_INFO ioctl.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 91 +++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       | 23 +++++++++++
>  2 files changed, 114 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 47ea490..dc6ed85 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1893,6 +1893,67 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
>  	return ret;
>  }
>  
> +static int vfio_add_iova_cap(struct vfio_info_cap *caps, void *cap_type,
can't you pass cap_type directly as a struct vfio_iommu_type1_info_cap_iova?

Also maybe using more explicit name for cap_type such as cap_iova_ranges?
> +			     size_t size)
> +{
> +	struct vfio_info_cap_header *header;
> +	struct vfio_iommu_type1_info_cap_iova *iova_cap, *iova = cap_type;
> +
> +	header = vfio_info_cap_add(caps, size,
> +				VFIO_IOMMU_TYPE1_INFO_CAP_IOVA, 1);
> +	if (IS_ERR(header))
> +		return PTR_ERR(header);
> +
> +	iova_cap = container_of(header,
> +			struct vfio_iommu_type1_info_cap_iova, header);
> +	iova_cap->nr_iovas = iova->nr_iovas;
> +	memcpy(iova_cap->iova_ranges, iova->iova_ranges,
> +			iova->nr_iovas * sizeof(*iova->iova_ranges));
> +	return 0;
> +}
> +
> +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
> +				struct vfio_info_cap *caps)
> +{
> +	struct vfio_iommu_type1_info_cap_iova *iova_cap;
> +	struct vfio_iova *iova;
> +	size_t size;
> +	int iovas = 0, i = 0, ret;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	list_for_each_entry(iova, &iommu->iova_list, list)
> +		iovas++;
> +
> +	if (!iovas) {
> +		ret = EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	size = sizeof(*iova_cap) + (iovas * sizeof(*iova_cap->iova_ranges));
> +
> +	iova_cap = kzalloc(size, GFP_KERNEL);
> +	if (!iova_cap) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	iova_cap->nr_iovas = iovas;
> +
> +	list_for_each_entry(iova, &iommu->iova_list, list) {
> +		iova_cap->iova_ranges[i].start = iova->start;
> +		iova_cap->iova_ranges[i].end = iova->end;
> +		i++;
> +	}
> +
> +	ret = vfio_add_iova_cap(caps, iova_cap, size);
> +
> +	kfree(iova_cap);
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -1914,6 +1975,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  		}
>  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
>  		struct vfio_iommu_type1_info info;
> +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> +		int ret;
>  
>  		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
>  
> @@ -1927,6 +1990,34 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
>  
> +		if (info.argsz == minsz)
> +			goto done;
> +
> +		ret = vfio_build_iommu_iova_caps(iommu, &caps);
> +		if (ret)
> +			return ret;
> +
> +		if (caps.size) {
> +			info.flags |= VFIO_IOMMU_INFO_CAPS;
> +			minsz = offsetofend(struct vfio_iommu_type1_info,
> +							 cap_offset);
> +			if (info.argsz < sizeof(info) + caps.size) {
> +				info.argsz = sizeof(info) + caps.size;
> +				info.cap_offset = 0;
> +			} else {
> +				vfio_info_cap_shift(&caps, sizeof(info));
> +				if (copy_to_user((void __user *)arg +
> +						sizeof(info), caps.buf,
> +						caps.size)) {
> +					kfree(caps.buf);
> +					return -EFAULT;
> +				}
> +				info.cap_offset = sizeof(info);
> +			}
> +
> +			kfree(caps.buf);
> +		}
> +done:
>  		return copy_to_user((void __user *)arg, &info, minsz) ?
>  			-EFAULT : 0;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index e3301db..8671448 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -517,7 +517,30 @@ struct vfio_iommu_type1_info {
>  	__u32	argsz;
>  	__u32	flags;
>  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
>  	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> +	__u32   cap_offset;	/* Offset within info struct of first cap */
> +};
> +
> +/*
> + * The IOVA capability allows to report the valid IOVA range(s)
> + * excluding any reserved regions associated with dev group. Any dma
> + * map attempt outside the valid iova range will return error.
> + *
> + * The structures below define version 1 of this capability.
> + */
> +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA  1
VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE ?
> +
> +struct vfio_iova_range {
> +	__u64	start;
> +	__u64	end;
> +};
> +
> +struct vfio_iommu_type1_info_cap_iova {
cap_iova_ranges?
> +	struct vfio_info_cap_header header;
> +	__u32	nr_iovas;
> +	__u32	reserved;
> +	struct vfio_iova_range iova_ranges[];
>  };
>  
>  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> 
Otherwise looks good to me.

Thanks

Eric

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check
  2018-01-23 10:04       ` Shameerali Kolothum Thodi
@ 2018-01-23 11:20         ` Auger Eric
  0 siblings, 0 replies; 21+ messages in thread
From: Auger Eric @ 2018-01-23 11:20 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

Hi Shameer,

On 23/01/18 11:04, Shameerali Kolothum Thodi wrote:
> Hi Eric,
> 
>> -----Original Message-----
>> From: Auger Eric [mailto:eric.auger@redhat.com]
>> Sent: Tuesday, January 23, 2018 8:25 AM
>> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
>> Thodi <shameerali.kolothum.thodi@huawei.com>
>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>
>> Subject: Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu
>> aperture validity check
>>
>> Hi Shameer,
>>
>> On 18/01/18 01:04, Alex Williamson wrote:
>>> On Fri, 12 Jan 2018 16:45:27 +0000
>>> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
>>>
>>>> This introduces an iova list that is valid for dma mappings. Make
>>>> sure the new iommu aperture window is valid and doesn't conflict
>>>> with any existing dma mappings during attach. Also update the iova
>>>> list with new aperture window during attach/detach.
>>>>
>>>> Signed-off-by: Shameer Kolothum
>> <shameerali.kolothum.thodi@huawei.com>
>>>> ---
>>>>  drivers/vfio/vfio_iommu_type1.c | 177
>> ++++++++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 177 insertions(+)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
>> b/drivers/vfio/vfio_iommu_type1.c
>>>> index e30e29a..11cbd49 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -60,6 +60,7 @@
>>>>
>>>>  struct vfio_iommu {
>>>>  	struct list_head	domain_list;
>>>> +	struct list_head	iova_list;
>>>>  	struct vfio_domain	*external_domain; /* domain for external user
>> */
>>>>  	struct mutex		lock;
>>>>  	struct rb_root		dma_list;
>>>> @@ -92,6 +93,12 @@ struct vfio_group {
>>>>  	struct list_head	next;
>>>>  };
>>>>
>>>> +struct vfio_iova {
>>>> +	struct list_head	list;
>>>> +	phys_addr_t		start;
>>>> +	phys_addr_t		end;
>>>> +};
>>>
>>> dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA
>>> addresses, why are we using phys_addr_t?
>>>
>>>> +
>>>>  /*
>>>>   * Guest RAM pinning working set or DMA target
>>>>   */
>>>> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct
>> iommu_group *group, phys_addr_t *base)
>>>>  	return ret;
>>>>  }
>>>>
>>>> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>>>> +				struct list_head *head)
>>>> +{
>>>> +	struct vfio_iova *region;
>>>> +
>>>> +	region = kmalloc(sizeof(*region), GFP_KERNEL);
>>>> +	if (!region)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	INIT_LIST_HEAD(&region->list);
>>>> +	region->start = start;
>>>> +	region->end = end;
>>>> +
>>>> +	list_add_tail(&region->list, head);
>>>> +	return 0;
>>>> +}
>>>
>>> As I'm reading through this series, I'm learning that there are a lot
>>> of assumptions and subtle details that should be documented.  For
>>> instance, the IOMMU API only provides a single geometry and we build
>>> upon that here as this patch creates a list, but there's only a single
>>> entry for now.  The following patches carve that single iova range into
>>> pieces and somewhat subtly use the list_head passed to keep the list
>>> sorted, allowing the first/last_entry tricks used throughout.  Subtle
>>> interfaces are prone to bugs.
>>>
>>>> +
>>>> +/*
>>>> + * Find whether a mem region overlaps with existing dma mappings
>>>> + */
>>>> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
>>>> +				  phys_addr_t start, phys_addr_t end)
>>>> +{
>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);
>>>> +
>>>> +	for (; n; n = rb_next(n)) {
>>>> +		struct vfio_dma *dma;
>>>> +
>>>> +		dma = rb_entry(n, struct vfio_dma, node);
>>>> +
>>>> +		if (end < dma->iova)
>>>> +			break;
>>>> +		if (start >= dma->iova + dma->size)
>>>> +			continue;
>>>> +		return true;
>>>> +	}
>>>> +
>>>> +	return false;
>>>> +}
>>>
>>> Why do we need this in addition to the existing vfio_find_dma()?  Why
>>> doesn't this use the tree structure of the dma_list?
>>>
>>>> +
>>>> +/*
>>>> + * Check the new iommu aperture is a valid one
>>>> + */
>>>> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>>>> +				     phys_addr_t start,
>>>> +				     phys_addr_t end)
>>>> +{
>>>> +	struct vfio_iova *first, *last;
>>>> +	struct list_head *iova = &iommu->iova_list;
>>>> +
>>>> +	if (list_empty(iova))
>>>> +		return 0;
>>>> +
>>>> +	/* Check if new one is outside the current aperture */
>>>
>>> "Disjoint sets"
>>>
>>>> +	first = list_first_entry(iova, struct vfio_iova, list);
>>>> +	last = list_last_entry(iova, struct vfio_iova, list);
>>>> +	if ((start > last->end) || (end < first->start))
>>>> +		return -EINVAL;
>>>> +
>>>> +	/* Check for any existing dma mappings outside the new start */
>>>> +	if (start > first->start) {
>>>> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
>>>> +			return -EINVAL;
>>>> +	}
>>>> +
>>>> +	/* Check for any existing dma mappings outside the new end */
>>>> +	if (end < last->end) {
>>>> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
>>>> +			return -EINVAL;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>
>>> I think this returns an int because you want to use it for the return
>>> value below, but it really seems like a bool question, ie. does this
>>> aperture conflict with existing mappings.  Additionally, the aperture
>>> is valid, it was provided to us by the IOMMU API, the question is
>>> whether it conflicts.  Please also name consistently to the other
>>> functions in this patch, vfio_iommu_aper_xxxx().
>>>
>>>> +
>>>> +/*
>>>> + * Adjust the iommu aperture window if new aperture is a valid one
>>>> + */
>>>> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>>>> +				      phys_addr_t start,
>>>> +				      phys_addr_t end)
>>>
>>> Perhaps "resize", "prune", or "shrink" to make it more clear what is
>>> being adjusted?
>>>
>>>> +{
>>>> +	struct vfio_iova *node, *next;
>>>> +	struct list_head *iova = &iommu->iova_list;
>>>> +
>>>> +	if (list_empty(iova))
>>>> +		return vfio_insert_iova(start, end, iova);
>>>> +
>>>> +	/* Adjust iova list start */
>>>> +	list_for_each_entry_safe(node, next, iova, list) {
>>>> +		if (start < node->start)
>>>> +			break;
>>>> +		if ((start >= node->start) && (start <= node->end)) {
>>>
>>> start == node->end results in a zero sized node.  s/<=/</
>>>
>>>> +			node->start = start;
>>>> +			break;
>>>> +		}
>>>> +		/* Delete nodes before new start */
>>>> +		list_del(&node->list);
>>>> +		kfree(node);
>>>> +	}
>>>> +
>>>> +	/* Adjust iova list end */
>>>> +	list_for_each_entry_safe(node, next, iova, list) {
>>>> +		if (end > node->end)
>>>> +			continue;
>>>> +
>>>> +		if ((end >= node->start) && (end <= node->end)) {
>>>
>>> end == node->start results in a zero sized node.  s/>=/>/
>>>
>>>> +			node->end = end;
>>>> +			continue;
>>>> +		}
>>>> +		/* Delete nodes after new end */
>>>> +		list_del(&node->list);
>>>> +		kfree(node);
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>>>  					 struct iommu_group *iommu_group)
>>>>  {
>>>> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  	int ret;
>>>>  	bool resv_msi, msi_remap;
>>>>  	phys_addr_t resv_msi_base;
>>>> +	struct iommu_domain_geometry geo;
>>>>
>>>>  	mutex_lock(&iommu->lock);
>>>>
>>>> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  	if (ret)
>>>>  		goto out_domain;
>>>>
>>>> +	/* Get aperture info */
>>>> +	iommu_domain_get_attr(domain->domain,
>> DOMAIN_ATTR_GEOMETRY, &geo);
>>>> +
>>>> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
>>>> +					geo.aperture_end);
>>>> +	if (ret)
>>>> +		goto out_detach;
>>>> +
>>>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>>>>
>>>>  	INIT_LIST_HEAD(&domain->group_list);
>>>> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  			goto out_detach;
>>>>  	}
>>>>
>>>> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
>>>> +					  geo.aperture_end);
>>>> +	if (ret)
>>>> +		goto out_detach;
>>>> +
>>>>  	list_add(&domain->next, &iommu->domain_list);
>>>>
>>>>  	mutex_unlock(&iommu->lock);
>>>> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct
>> vfio_iommu *iommu)
>>>>  	WARN_ON(iommu->notifier.head);
>>>>  }
>>>>
>>>> +/*
>>>> + * Called when a domain is removed in detach. It is possible that
>>>> + * the removed domain decided the iova aperture window. Modify the
>>>> + * iova aperture with the smallest window among existing domains.
>>>> + */
>>>> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>>>> +{
>>>> +	struct vfio_domain *domain;
>>>> +	struct iommu_domain_geometry geo;
>>>> +	struct vfio_iova *node;
>>>> +	phys_addr_t start = 0;
>>>> +	phys_addr_t end = (phys_addr_t)~0;
>>>> +
>>>> +	list_for_each_entry(domain, &iommu->domain_list, next) {
>>>> +		iommu_domain_get_attr(domain->domain,
>> DOMAIN_ATTR_GEOMETRY,
>>>> +				      &geo);
>>>> +			if (geo.aperture_start > start)
>>>> +				start = geo.aperture_start;
>>>> +			if (geo.aperture_end < end)
>>>> +				end = geo.aperture_end;
>>>> +	}
>>>> +
>>>> +	/* modify iova aperture limits */
>>>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
>>>> +	node->start = start;
>>>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
>>>> +	node->end = end;
>>>
>>> We can do this because the new aperture is the same or bigger than the
>>> current aperture, never smaller.  That's not fully obvious and should
>>> be noted in the comment.  Perhaps this function should be "expand"
>>> rather than "refresh".
>> This one is not obvious to me either:
>> assuming you have 2 domains, resp with aperture 1 and 2, resulting into
>> aperture 3. Holes are created by resv regions for instance. If you
>> remove domain 1, don't you get 4) instead of 2)?
>>
>> 1)   |------------|
>>  +
>> 2) |---|    |--|       |-----|
>> =
>> 3)   |-|    |--|
>>
>>
>> 4) |---|    |----------------|
> 
> That is true partially. But please remember that this patch is not aware of
> any reserved regions yet. That is introduced in patch #2. So patch #1 and #2
> together, the iova aperture might looks like 4) after this function call and once 
> vfio_iommu_iova_resv_refresh() in patch #2 is done, the aperture will be
> back to 2).
> 
> Hope I am clear. Please let me know.
Ah OK.
> 
> In any case, based on comments by Alex, I will be removing this aperture/reserve
> refresh functions and leave the iova list as it is when a group is detached. 
Looking forwarding to reviewing the next version then.

Thanks

Eric
> 
> Thanks,
> Shameer
> 
>> Thanks
>>
>> Eric
>>>
>>>> +}
>>>> +
>>>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>>>>  					  struct iommu_group *iommu_group)
>>>>  {
>>>> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void
>> *iommu_data,
>>>>  			iommu_domain_free(domain->domain);
>>>>  			list_del(&domain->next);
>>>>  			kfree(domain);
>>>> +			vfio_iommu_iova_aper_refresh(iommu);
>>>>  		}
>>>>  		break;
>>>>  	}
>>>> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned
>> long arg)
>>>>  	}
>>>>
>>>>  	INIT_LIST_HEAD(&iommu->domain_list);
>>>> +	INIT_LIST_HEAD(&iommu->iova_list);
>>>>  	iommu->dma_list = RB_ROOT;
>>>>  	mutex_init(&iommu->lock);
>>>>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>>>> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void
>> *iommu_data)
>>>>  {
>>>>  	struct vfio_iommu *iommu = iommu_data;
>>>>  	struct vfio_domain *domain, *domain_tmp;
>>>> +	struct vfio_iova *iova, *iova_tmp;
>>>>
>>>>  	if (iommu->external_domain) {
>>>>  		vfio_release_domain(iommu->external_domain, true);
>>>> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void
>> *iommu_data)
>>>>  		list_del(&domain->next);
>>>>  		kfree(domain);
>>>>  	}
>>>> +
>>>> +	list_for_each_entry_safe(iova, iova_tmp,
>>>> +				 &iommu->iova_list, list) {
>>>> +		list_del(&iova->list);
>>>> +		kfree(iova);
>>>> +	}
>>>> +
>>>>  	kfree(iommu);
>>>>  }
>>>>
>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-23  8:32     ` Auger Eric
@ 2018-01-23 12:16       ` Shameerali Kolothum Thodi
  2018-01-23 12:51         ` Auger Eric
  0 siblings, 1 reply; 21+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-01-23 12:16 UTC (permalink / raw)
  To: Auger Eric, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

Hi Eric,

> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: Tuesday, January 23, 2018 8:32 AM
> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
> Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>
> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update
> iova list
> 
> Hi Shameer,
> 
> On 18/01/18 01:04, Alex Williamson wrote:
> > On Fri, 12 Jan 2018 16:45:28 +0000
> > Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> >
> >> This retrieves the reserved regions associated with dev group and
> >> checks for conflicts with any existing dma mappings. Also update
> >> the iova list excluding the reserved regions.
> >>
> >> Signed-off-by: Shameer Kolothum
> <shameerali.kolothum.thodi@huawei.com>
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 161
> +++++++++++++++++++++++++++++++++++++++-
> >>  1 file changed, 159 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> >> index 11cbd49..7609070 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -28,6 +28,7 @@
> >>  #include <linux/device.h>
> >>  #include <linux/fs.h>
> >>  #include <linux/iommu.h>
> >> +#include <linux/list_sort.h>
> >>  #include <linux/module.h>
> >>  #include <linux/mm.h>
> >>  #include <linux/rbtree.h>
> >> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct
> iommu_group *group, phys_addr_t *base)
> >>  	return ret;
> >>  }
> >>
> >
> > /* list_sort helper */
> >
> >> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> >> +{
> >> +	struct iommu_resv_region *ra, *rb;
> >> +
> >> +	ra = container_of(a, struct iommu_resv_region, list);
> >> +	rb = container_of(b, struct iommu_resv_region, list);
> >> +
> >> +	if (ra->start < rb->start)
> >> +		return -1;
> >> +	if (ra->start > rb->start)
> >> +		return 1;
> >> +	return 0;
> >> +}
> >> +
> >>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> >>  				struct list_head *head)
> >>  {
> >> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct
> vfio_iommu *iommu,
> >>  }
> >>
> >>  /*
> >> + * Check reserved region conflicts with existing dma mappings
> >> + */
> >> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
> >> +				struct list_head *resv_regions)
> >> +{
> >> +	struct iommu_resv_region *region;
> >> +
> >> +	/* Check for conflict with existing dma mappings */
> >> +	list_for_each_entry(region, resv_regions, list) {
> >> +		if (vfio_find_dma_overlap(iommu, region->start,
> >> +				    region->start + region->length - 1))
> >> +			return -EINVAL;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >
> > This basically does the same test as vfio_iommu_valid_aperture but
> > properly names it a conflict test.  Please be consistent.  Should this
> > also return bool, "conflict" is a yes/no answer.
> >
> >> +
> >> +/*
> >>   * Adjust the iommu aperture window if new aperture is a valid one
> >>   */
> >>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> >> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct
> vfio_iommu *iommu,
> >>  	return 0;
> >>  }
> >>
> >> +/*
> >> + * Check and update iova region list in case a reserved region
> >> + * overlaps the iommu iova range
> >> + */
> >> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
> >> +					struct list_head *resv_regions)
> >
> > "resv_region" in previous function, just "resv" here, use consistent
> > names.  Also, what are we adjusting.  Maybe "exclude" is a better term.
> >
> >> +{
> >> +	struct iommu_resv_region *resv;
> >> +	struct list_head *iova = &iommu->iova_list;
> >> +	struct vfio_iova *n, *next;
> >> +
> >> +	list_for_each_entry(resv, resv_regions, list) {
> >> +		phys_addr_t start, end;
> >> +
> >> +		start = resv->start;
> >> +		end = resv->start + resv->length - 1;
> >> +
> >> +		list_for_each_entry_safe(n, next, iova, list) {
> >> +			phys_addr_t a, b;
> >> +			int ret = 0;
> >> +
> >> +			a = n->start;
> >> +			b = n->end;
> >
> > 'a' and 'b' variables actually make this incredibly confusing.  Use
> > better variable names or just drop them entirely, it's much easier to
> > follow as n->start & n->end.
> >
> >> +			/* No overlap */
> >> +			if ((start > b) || (end < a))
> >> +				continue;
> >> +			/* Split the current node and create holes */
> >> +			if (start > a)
> >> +				ret = vfio_insert_iova(a, start - 1, &n->list);
> >> +			if (!ret && end < b)
> >> +				ret = vfio_insert_iova(end + 1, b, &n->list);
> >> +			if (ret)
> >> +				return ret;
> >> +
> >> +			list_del(&n->list);
> >
> > This is trickier than it appears and deserves some explanation.  AIUI,
> > we're actually inserting duplicate entries for the remainder at the
> > start of the range and then at the end of the range (and the order is
> > important here because we're inserting each before the current node),
> > and then we delete the current node.  So the iova_list is kept sorted
> > through this process, though temporarily includes some bogus, unordered
> > sub-sets.
> >
> >> +			kfree(n);
> >> +		}
> >> +	}
> >> +
> >> +	if (list_empty(iova))
> >> +		return -EINVAL;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  					 struct iommu_group *iommu_group)
> >>  {
> >> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	bool resv_msi, msi_remap;
> >>  	phys_addr_t resv_msi_base;
> >>  	struct iommu_domain_geometry geo;
> >> +	struct list_head group_resv_regions;
> >> +	struct iommu_resv_region *resv, *resv_next;
> >>
> >>  	mutex_lock(&iommu->lock);
> >>
> >> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	if (ret)
> >>  		goto out_detach;
> >>
> >> +	INIT_LIST_HEAD(&group_resv_regions);
> >> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> >> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> iommu_get_group_resv_regions returns a sorted list (see
> iommu_insert_resv_regions kerneldoc comment). You can have overlapping
> regions of different types though.

Hmm..I am not sure. It looks like it is sorted only if the regions are of same type.

"* The new element is sorted by address with respect to the other
 * regions of the same type."

So hypothetically if there are two groups with regions like,

Group 1.
 Start       size            type  
  0x0000   0x1000        1
  0x2000   0x1000        1
  0x5000   0x1000        1

Group 2
  Start       size              type
   0x2000  0x4000           2
   0x7000   0x1000          1

Then the  iommu_get_group_resv_regions() will return,

0x0000   0x1000        1
0x2000   0x1000        1
0x5000   0x1000        1
0x2000  0x4000         2
0x7000   0x1000        1  

But honestly I am not sure the above is a valid scenario or not. I am
happy to remove the sorting if such a case will never happen.

Please let me know.

Thanks,
Shameer

> Eric
> >> +
> >> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
> >> +	if (ret)
> >> +		goto out_detach;
> >> +
> >>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
> >>
> >>  	INIT_LIST_HEAD(&domain->group_list);
> >> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  		    d->prot == domain->prot) {
> >>  			iommu_detach_group(domain->domain,
> iommu_group);
> >>  			if (!iommu_attach_group(d->domain, iommu_group)) {
> >> +				ret = vfio_iommu_iova_resv_adjust(iommu,
> >> +
> 	&group_resv_regions);
> >> +				if (!ret)
> >> +					goto out_domain;
> >
> > The above function is not without side effects if it fails, it's
> > altered the iova_list.  It needs to be valid for the remaining domains
> > if we're going to continue.
> >
> >> +
> >>  				list_add(&group->next, &d->group_list);
> >>  				iommu_domain_free(domain->domain);
> >>  				kfree(domain);
> >> -				mutex_unlock(&iommu->lock);
> >> -				return 0;
> >> +				goto done;
> >>  			}
> >>
> >>  			ret = iommu_attach_group(domain->domain,
> iommu_group);
> >> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	if (ret)
> >>  		goto out_detach;
> >>
> >> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
> >> +	if (ret)
> >> +		goto out_detach;
> >
> > Can't we process the reserved regions once before we get here rather
> > than have two separate call points that do the same thing?  In order to
> > roll back from errors above, it seems like we need to copy iova_list
> > and work on the copy, installing it and deleting the original only on
> > success.
> >
> >> +
> >>  	list_add(&domain->next, &iommu->domain_list);
> >>
> >> +done:
> >> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> >> +		kfree(resv);
> >>  	mutex_unlock(&iommu->lock);
> >>
> >>  	return 0;
> >> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >>  	iommu_detach_group(domain->domain, iommu_group);
> >>  out_domain:
> >>  	iommu_domain_free(domain->domain);
> >> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> >> +		kfree(resv);
> >>  out_free:
> >>  	kfree(domain);
> >>  	kfree(group);
> >> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct
> vfio_iommu *iommu)
> >>  	node->end = end;
> >>  }
> >>
> >> +/*
> >> + * Called when a group is detached. The reserved regions for that
> >> + * group can be part of valid iova now. But since reserved regions
> >> + * may be duplicated among groups, populate the iova valid regions
> >> +   list again.
> >> + */
> >> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> >> +{
> >> +	struct vfio_domain *d;
> >> +	struct vfio_group *g;
> >> +	struct vfio_iova *node, *tmp;
> >> +	struct iommu_resv_region *resv, *resv_next;
> >> +	struct list_head resv_regions;
> >> +	phys_addr_t start, end;
> >> +
> >> +	INIT_LIST_HEAD(&resv_regions);
> >> +
> >> +	list_for_each_entry(d, &iommu->domain_list, next) {
> >> +		list_for_each_entry(g, &d->group_list, next)
> >> +			iommu_get_group_resv_regions(g->iommu_group,
> >> +							 &resv_regions);
> >> +	}
> >> +
> >> +	if (list_empty(&resv_regions))
> >> +		return;
> >> +
> >> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
> >> +
> >> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +	start = node->start;
> >> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +	end = node->end;
> >
> > list_sort() only sorts based on ->start, we added reserved regions for
> > all our groups to one list, we potentially have multiple entries with
> > the same ->start.  How can we be sure that the last one in the list
> > actually has the largest ->end value?
> >
> >> +
> >> +	/* purge the iova list and create new one */
> >> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
> >> +		list_del(&node->list);
> >> +		kfree(node);
> >> +	}
> >> +
> >> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
> >> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map
> request may fail\n",
> >> +			__func__);
> >
> > Map requests "will" fail.  Is this the right error strategy?  Detaching
> > a group cannot fail.  Aren't we better off leaving the iova_list we had
> > in place?  If we cannot expand the iova aperture when a group is
> > removed, a user can continue unscathed.
> >
> >> +		goto done;
> >> +	}
> >> +
> >> +	/* adjust the iova with current reserved regions */
> >> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
> >> +		pr_warn("%s: Failed to update iova list with reserve regions.
> VFIO DMA map request may fail\n",
> >> +			__func__);
> >
> > Same.
> >
> >> +done:
> >> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
> >> +		kfree(resv);
> >> +}
> >> +
> >>  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>  					  struct iommu_group *iommu_group)
> >>  {
> >> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void
> *iommu_data,
> >>  		break;
> >>  	}
> >>
> >> +	vfio_iommu_iova_resv_refresh(iommu);
> >> +
> >>  detach_group_done:
> >>  	mutex_unlock(&iommu->lock);
> >>  }
> >

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC v2 4/5] vfio/type1: Add IOVA range capability support
  2018-01-23 11:16   ` Auger Eric
@ 2018-01-23 12:51     ` Shameerali Kolothum Thodi
  0 siblings, 0 replies; 21+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-01-23 12:51 UTC (permalink / raw)
  To: Auger Eric, alex.williamson, pmorel
  Cc: kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)



> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: Tuesday, January 23, 2018 11:17 AM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> alex.williamson@redhat.com; pmorel@linux.vnet.ibm.com
> Cc: kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
> <xuwei5@huawei.com>
> Subject: Re: [RFC v2 4/5] vfio/type1: Add IOVA range capability support
> 
> Hi Shameer,
> 
> On 12/01/18 17:45, Shameer Kolothum wrote:
> > This  allows the user-space to retrieve the supported IOVA range(s),
> > excluding any reserved regions. The implementation is based on
> > capability chains, added to VFIO_IOMMU_GET_INFO ioctl.
> >
> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 91
> +++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h       | 23 +++++++++++
> >  2 files changed, 114 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index 47ea490..dc6ed85 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -1893,6 +1893,67 @@ static int
> vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
> >  	return ret;
> >  }
> >
> > +static int vfio_add_iova_cap(struct vfio_info_cap *caps, void
> > +*cap_type,
> can't you pass cap_type directly as a struct vfio_iommu_type1_info_cap_iova?
> 
> Also maybe using more explicit name for cap_type such as cap_iova_ranges?
> > +			     size_t size)
> > +{
> > +	struct vfio_info_cap_header *header;
> > +	struct vfio_iommu_type1_info_cap_iova *iova_cap, *iova = cap_type;
> > +
> > +	header = vfio_info_cap_add(caps, size,
> > +				VFIO_IOMMU_TYPE1_INFO_CAP_IOVA, 1);
> > +	if (IS_ERR(header))
> > +		return PTR_ERR(header);
> > +
> > +	iova_cap = container_of(header,
> > +			struct vfio_iommu_type1_info_cap_iova, header);
> > +	iova_cap->nr_iovas = iova->nr_iovas;
> > +	memcpy(iova_cap->iova_ranges, iova->iova_ranges,
> > +			iova->nr_iovas * sizeof(*iova->iova_ranges));
> > +	return 0;
> > +}
> > +
> > +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
> > +				struct vfio_info_cap *caps)
> > +{
> > +	struct vfio_iommu_type1_info_cap_iova *iova_cap;
> > +	struct vfio_iova *iova;
> > +	size_t size;
> > +	int iovas = 0, i = 0, ret;
> > +
> > +	mutex_lock(&iommu->lock);
> > +
> > +	list_for_each_entry(iova, &iommu->iova_list, list)
> > +		iovas++;
> > +
> > +	if (!iovas) {
> > +		ret = EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	size = sizeof(*iova_cap) + (iovas * sizeof(*iova_cap->iova_ranges));
> > +
> > +	iova_cap = kzalloc(size, GFP_KERNEL);
> > +	if (!iova_cap) {
> > +		ret = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> > +
> > +	iova_cap->nr_iovas = iovas;
> > +
> > +	list_for_each_entry(iova, &iommu->iova_list, list) {
> > +		iova_cap->iova_ranges[i].start = iova->start;
> > +		iova_cap->iova_ranges[i].end = iova->end;
> > +		i++;
> > +	}
> > +
> > +	ret = vfio_add_iova_cap(caps, iova_cap, size);
> > +
> > +	kfree(iova_cap);
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  				   unsigned int cmd, unsigned long arg)  { @@ -
> 1914,6 +1975,8 @@
> > static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  		}
> >  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
> >  		struct vfio_iommu_type1_info info;
> > +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> > +		int ret;
> >
> >  		minsz = offsetofend(struct vfio_iommu_type1_info,
> iova_pgsizes);
> >
> > @@ -1927,6 +1990,34 @@ static long vfio_iommu_type1_ioctl(void
> > *iommu_data,
> >
> >  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
> >
> > +		if (info.argsz == minsz)
> > +			goto done;
> > +
> > +		ret = vfio_build_iommu_iova_caps(iommu, &caps);
> > +		if (ret)
> > +			return ret;
> > +
> > +		if (caps.size) {
> > +			info.flags |= VFIO_IOMMU_INFO_CAPS;
> > +			minsz = offsetofend(struct vfio_iommu_type1_info,
> > +							 cap_offset);
> > +			if (info.argsz < sizeof(info) + caps.size) {
> > +				info.argsz = sizeof(info) + caps.size;
> > +				info.cap_offset = 0;
> > +			} else {
> > +				vfio_info_cap_shift(&caps, sizeof(info));
> > +				if (copy_to_user((void __user *)arg +
> > +						sizeof(info), caps.buf,
> > +						caps.size)) {
> > +					kfree(caps.buf);
> > +					return -EFAULT;
> > +				}
> > +				info.cap_offset = sizeof(info);
> > +			}
> > +
> > +			kfree(caps.buf);
> > +		}
> > +done:
> >  		return copy_to_user((void __user *)arg, &info, minsz) ?
> >  			-EFAULT : 0;
> >
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index e3301db..8671448 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -517,7 +517,30 @@ struct vfio_iommu_type1_info {
> >  	__u32	argsz;
> >  	__u32	flags;
> >  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> > +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
> >  	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> > +	__u32   cap_offset;	/* Offset within info struct of first cap */
> > +};
> > +
> > +/*
> > + * The IOVA capability allows to report the valid IOVA range(s)
> > + * excluding any reserved regions associated with dev group. Any dma
> > + * map attempt outside the valid iova range will return error.
> > + *
> > + * The structures below define version 1 of this capability.
> > + */
> > +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA  1
> VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE ?
> > +
> > +struct vfio_iova_range {
> > +	__u64	start;
> > +	__u64	end;
> > +};
> > +
> > +struct vfio_iommu_type1_info_cap_iova {
> cap_iova_ranges?
> > +	struct vfio_info_cap_header header;
> > +	__u32	nr_iovas;
> > +	__u32	reserved;
> > +	struct vfio_iova_range iova_ranges[];
> >  };
> >
> >  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> >
> Otherwise looks good to me.

Ok. I will take care of them.

Thanks,
Shameer

> Thanks
> 
> Eric

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-23 12:16       ` Shameerali Kolothum Thodi
@ 2018-01-23 12:51         ` Auger Eric
  2018-01-23 15:26           ` Shameerali Kolothum Thodi
  0 siblings, 1 reply; 21+ messages in thread
From: Auger Eric @ 2018-01-23 12:51 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

Hi Shameer,

On 23/01/18 13:16, Shameerali Kolothum Thodi wrote:
> Hi Eric,
> 
>> -----Original Message-----
>> From: Auger Eric [mailto:eric.auger@redhat.com]
>> Sent: Tuesday, January 23, 2018 8:32 AM
>> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
>> Thodi <shameerali.kolothum.thodi@huawei.com>
>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>
>> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update
>> iova list
>>
>> Hi Shameer,
>>
>> On 18/01/18 01:04, Alex Williamson wrote:
>>> On Fri, 12 Jan 2018 16:45:28 +0000
>>> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
>>>
>>>> This retrieves the reserved regions associated with dev group and
>>>> checks for conflicts with any existing dma mappings. Also update
>>>> the iova list excluding the reserved regions.
>>>>
>>>> Signed-off-by: Shameer Kolothum
>> <shameerali.kolothum.thodi@huawei.com>
>>>> ---
>>>>  drivers/vfio/vfio_iommu_type1.c | 161
>> +++++++++++++++++++++++++++++++++++++++-
>>>>  1 file changed, 159 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
>> b/drivers/vfio/vfio_iommu_type1.c
>>>> index 11cbd49..7609070 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -28,6 +28,7 @@
>>>>  #include <linux/device.h>
>>>>  #include <linux/fs.h>
>>>>  #include <linux/iommu.h>
>>>> +#include <linux/list_sort.h>
>>>>  #include <linux/module.h>
>>>>  #include <linux/mm.h>
>>>>  #include <linux/rbtree.h>
>>>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct
>> iommu_group *group, phys_addr_t *base)
>>>>  	return ret;
>>>>  }
>>>>
>>>
>>> /* list_sort helper */
>>>
>>>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
>>>> +{
>>>> +	struct iommu_resv_region *ra, *rb;
>>>> +
>>>> +	ra = container_of(a, struct iommu_resv_region, list);
>>>> +	rb = container_of(b, struct iommu_resv_region, list);
>>>> +
>>>> +	if (ra->start < rb->start)
>>>> +		return -1;
>>>> +	if (ra->start > rb->start)
>>>> +		return 1;
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>>>>  				struct list_head *head)
>>>>  {
>>>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct
>> vfio_iommu *iommu,
>>>>  }
>>>>
>>>>  /*
>>>> + * Check reserved region conflicts with existing dma mappings
>>>> + */
>>>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
>>>> +				struct list_head *resv_regions)
>>>> +{
>>>> +	struct iommu_resv_region *region;
>>>> +
>>>> +	/* Check for conflict with existing dma mappings */
>>>> +	list_for_each_entry(region, resv_regions, list) {
>>>> +		if (vfio_find_dma_overlap(iommu, region->start,
>>>> +				    region->start + region->length - 1))
>>>> +			return -EINVAL;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>
>>> This basically does the same test as vfio_iommu_valid_aperture but
>>> properly names it a conflict test.  Please be consistent.  Should this
>>> also return bool, "conflict" is a yes/no answer.
>>>
>>>> +
>>>> +/*
>>>>   * Adjust the iommu aperture window if new aperture is a valid one
>>>>   */
>>>>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>>>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct
>> vfio_iommu *iommu,
>>>>  	return 0;
>>>>  }
>>>>
>>>> +/*
>>>> + * Check and update iova region list in case a reserved region
>>>> + * overlaps the iommu iova range
>>>> + */
>>>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
>>>> +					struct list_head *resv_regions)
>>>
>>> "resv_region" in previous function, just "resv" here, use consistent
>>> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.
>>>
>>>> +{
>>>> +	struct iommu_resv_region *resv;
>>>> +	struct list_head *iova = &iommu->iova_list;
>>>> +	struct vfio_iova *n, *next;
>>>> +
>>>> +	list_for_each_entry(resv, resv_regions, list) {
>>>> +		phys_addr_t start, end;
>>>> +
>>>> +		start = resv->start;
>>>> +		end = resv->start + resv->length - 1;
>>>> +
>>>> +		list_for_each_entry_safe(n, next, iova, list) {
>>>> +			phys_addr_t a, b;
>>>> +			int ret = 0;
>>>> +
>>>> +			a = n->start;
>>>> +			b = n->end;
>>>
>>> 'a' and 'b' variables actually make this incredibly confusing.  Use
>>> better variable names or just drop them entirely, it's much easier to
>>> follow as n->start & n->end.
>>>
>>>> +			/* No overlap */
>>>> +			if ((start > b) || (end < a))
>>>> +				continue;
>>>> +			/* Split the current node and create holes */
>>>> +			if (start > a)
>>>> +				ret = vfio_insert_iova(a, start - 1, &n->list);
>>>> +			if (!ret && end < b)
>>>> +				ret = vfio_insert_iova(end + 1, b, &n->list);
>>>> +			if (ret)
>>>> +				return ret;
>>>> +
>>>> +			list_del(&n->list);
>>>
>>> This is trickier than it appears and deserves some explanation.  AIUI,
>>> we're actually inserting duplicate entries for the remainder at the
>>> start of the range and then at the end of the range (and the order is
>>> important here because we're inserting each before the current node),
>>> and then we delete the current node.  So the iova_list is kept sorted
>>> through this process, though temporarily includes some bogus, unordered
>>> sub-sets.
>>>
>>>> +			kfree(n);
>>>> +		}
>>>> +	}
>>>> +
>>>> +	if (list_empty(iova))
>>>> +		return -EINVAL;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>>>  					 struct iommu_group *iommu_group)
>>>>  {
>>>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  	bool resv_msi, msi_remap;
>>>>  	phys_addr_t resv_msi_base;
>>>>  	struct iommu_domain_geometry geo;
>>>> +	struct list_head group_resv_regions;
>>>> +	struct iommu_resv_region *resv, *resv_next;
>>>>
>>>>  	mutex_lock(&iommu->lock);
>>>>
>>>> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  	if (ret)
>>>>  		goto out_detach;
>>>>
>>>> +	INIT_LIST_HEAD(&group_resv_regions);
>>>> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
>>>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
>> iommu_get_group_resv_regions returns a sorted list (see
>> iommu_insert_resv_regions kerneldoc comment). You can have overlapping
>> regions of different types though.
> 
> Hmm..I am not sure. It looks like it is sorted only if the regions are of same type.
> 
> "* The new element is sorted by address with respect to the other
>  * regions of the same type."
> 
> So hypothetically if there are two groups with regions like,
> 
> Group 1.
>  Start       size            type  
>   0x0000   0x1000        1
>   0x2000   0x1000        1
>   0x5000   0x1000        1
> 
> Group 2
>   Start       size              type
>    0x2000  0x4000           2
>    0x7000   0x1000          1
> 
> Then the  iommu_get_group_resv_regions() will return,
> 
> 0x0000   0x1000        1
> 0x2000   0x1000        1
> 0x5000   0x1000        1
> 0x2000  0x4000         2
> 0x7000   0x1000        1  

Hum yes, I remember now, sorry. It was made on purpose to avoid to
display interleaved resv region types in
/sys/kernel/iommu_groups/reserved_regions. I think it gives a better
user experience.

Thanks

Eric
> 
> But honestly I am not sure the above is a valid scenario or not. I am
> happy to remove the sorting if such a case will never happen.
> 
> Please let me know.
> 
> Thanks,
> Shameer
> 
>> Eric
>>>> +
>>>> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
>>>> +	if (ret)
>>>> +		goto out_detach;
>>>> +
>>>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>>>>
>>>>  	INIT_LIST_HEAD(&domain->group_list);
>>>> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  		    d->prot == domain->prot) {
>>>>  			iommu_detach_group(domain->domain,
>> iommu_group);
>>>>  			if (!iommu_attach_group(d->domain, iommu_group)) {
>>>> +				ret = vfio_iommu_iova_resv_adjust(iommu,
>>>> +
>> 	&group_resv_regions);
>>>> +				if (!ret)
>>>> +					goto out_domain;
>>>
>>> The above function is not without side effects if it fails, it's
>>> altered the iova_list.  It needs to be valid for the remaining domains
>>> if we're going to continue.
>>>
>>>> +
>>>>  				list_add(&group->next, &d->group_list);
>>>>  				iommu_domain_free(domain->domain);
>>>>  				kfree(domain);
>>>> -				mutex_unlock(&iommu->lock);
>>>> -				return 0;
>>>> +				goto done;
>>>>  			}
>>>>
>>>>  			ret = iommu_attach_group(domain->domain,
>> iommu_group);
>>>> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  	if (ret)
>>>>  		goto out_detach;
>>>>
>>>> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
>>>> +	if (ret)
>>>> +		goto out_detach;
>>>
>>> Can't we process the reserved regions once before we get here rather
>>> than have two separate call points that do the same thing?  In order to
>>> roll back from errors above, it seems like we need to copy iova_list
>>> and work on the copy, installing it and deleting the original only on
>>> success.
>>>
>>>> +
>>>>  	list_add(&domain->next, &iommu->domain_list);
>>>>
>>>> +done:
>>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>>>> +		kfree(resv);
>>>>  	mutex_unlock(&iommu->lock);
>>>>
>>>>  	return 0;
>>>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>  	iommu_detach_group(domain->domain, iommu_group);
>>>>  out_domain:
>>>>  	iommu_domain_free(domain->domain);
>>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>>>> +		kfree(resv);
>>>>  out_free:
>>>>  	kfree(domain);
>>>>  	kfree(group);
>>>> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct
>> vfio_iommu *iommu)
>>>>  	node->end = end;
>>>>  }
>>>>
>>>> +/*
>>>> + * Called when a group is detached. The reserved regions for that
>>>> + * group can be part of valid iova now. But since reserved regions
>>>> + * may be duplicated among groups, populate the iova valid regions
>>>> +   list again.
>>>> + */
>>>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
>>>> +{
>>>> +	struct vfio_domain *d;
>>>> +	struct vfio_group *g;
>>>> +	struct vfio_iova *node, *tmp;
>>>> +	struct iommu_resv_region *resv, *resv_next;
>>>> +	struct list_head resv_regions;
>>>> +	phys_addr_t start, end;
>>>> +
>>>> +	INIT_LIST_HEAD(&resv_regions);
>>>> +
>>>> +	list_for_each_entry(d, &iommu->domain_list, next) {
>>>> +		list_for_each_entry(g, &d->group_list, next)
>>>> +			iommu_get_group_resv_regions(g->iommu_group,
>>>> +							 &resv_regions);
>>>> +	}
>>>> +
>>>> +	if (list_empty(&resv_regions))
>>>> +		return;
>>>> +
>>>> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
>>>> +
>>>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
>>>> +	start = node->start;
>>>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
>>>> +	end = node->end;
>>>
>>> list_sort() only sorts based on ->start, we added reserved regions for
>>> all our groups to one list, we potentially have multiple entries with
>>> the same ->start.  How can we be sure that the last one in the list
>>> actually has the largest ->end value?
>>>
>>>> +
>>>> +	/* purge the iova list and create new one */
>>>> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
>>>> +		list_del(&node->list);
>>>> +		kfree(node);
>>>> +	}
>>>> +
>>>> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
>>>> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map
>> request may fail\n",
>>>> +			__func__);
>>>
>>> Map requests "will" fail.  Is this the right error strategy?  Detaching
>>> a group cannot fail.  Aren't we better off leaving the iova_list we had
>>> in place?  If we cannot expand the iova aperture when a group is
>>> removed, a user can continue unscathed.
>>>
>>>> +		goto done;
>>>> +	}
>>>> +
>>>> +	/* adjust the iova with current reserved regions */
>>>> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
>>>> +		pr_warn("%s: Failed to update iova list with reserve regions.
>> VFIO DMA map request may fail\n",
>>>> +			__func__);
>>>
>>> Same.
>>>
>>>> +done:
>>>> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
>>>> +		kfree(resv);
>>>> +}
>>>> +
>>>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>>>>  					  struct iommu_group *iommu_group)
>>>>  {
>>>> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void
>> *iommu_data,
>>>>  		break;
>>>>  	}
>>>>
>>>> +	vfio_iommu_iova_resv_refresh(iommu);
>>>> +
>>>>  detach_group_done:
>>>>  	mutex_unlock(&iommu->lock);
>>>>  }
>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
  2018-01-23 12:51         ` Auger Eric
@ 2018-01-23 15:26           ` Shameerali Kolothum Thodi
  0 siblings, 0 replies; 21+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-01-23 15:26 UTC (permalink / raw)
  To: Auger Eric, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)



> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: Tuesday, January 23, 2018 12:52 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> Alex Williamson <alex.williamson@redhat.com>
> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>
> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update
> iova list
> 
> Hi Shameer,
> 
> On 23/01/18 13:16, Shameerali Kolothum Thodi wrote:
> > Hi Eric,
> >
> >> -----Original Message-----
> >> From: Auger Eric [mailto:eric.auger@redhat.com]
> >> Sent: Tuesday, January 23, 2018 8:32 AM
> >> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
> >> Thodi <shameerali.kolothum.thodi@huawei.com>
> >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>
> >> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and
> update
> >> iova list
> >>
> >> Hi Shameer,
> >>
> >> On 18/01/18 01:04, Alex Williamson wrote:
> >>> On Fri, 12 Jan 2018 16:45:28 +0000
> >>> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> >>>
> >>>> This retrieves the reserved regions associated with dev group and
> >>>> checks for conflicts with any existing dma mappings. Also update
> >>>> the iova list excluding the reserved regions.
> >>>>
> >>>> Signed-off-by: Shameer Kolothum
> >> <shameerali.kolothum.thodi@huawei.com>
> >>>> ---
> >>>>  drivers/vfio/vfio_iommu_type1.c | 161
> >> +++++++++++++++++++++++++++++++++++++++-
> >>>>  1 file changed, 159 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
> >> b/drivers/vfio/vfio_iommu_type1.c
> >>>> index 11cbd49..7609070 100644
> >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>> @@ -28,6 +28,7 @@
> >>>>  #include <linux/device.h>
> >>>>  #include <linux/fs.h>
> >>>>  #include <linux/iommu.h>
> >>>> +#include <linux/list_sort.h>
> >>>>  #include <linux/module.h>
> >>>>  #include <linux/mm.h>
> >>>>  #include <linux/rbtree.h>
> >>>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct
> >> iommu_group *group, phys_addr_t *base)
> >>>>  	return ret;
> >>>>  }
> >>>>
> >>>
> >>> /* list_sort helper */
> >>>
> >>>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head
> *b)
> >>>> +{
> >>>> +	struct iommu_resv_region *ra, *rb;
> >>>> +
> >>>> +	ra = container_of(a, struct iommu_resv_region, list);
> >>>> +	rb = container_of(b, struct iommu_resv_region, list);
> >>>> +
> >>>> +	if (ra->start < rb->start)
> >>>> +		return -1;
> >>>> +	if (ra->start > rb->start)
> >>>> +		return 1;
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> >>>>  				struct list_head *head)
> >>>>  {
> >>>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct
> >> vfio_iommu *iommu,
> >>>>  }
> >>>>
> >>>>  /*
> >>>> + * Check reserved region conflicts with existing dma mappings
> >>>> + */
> >>>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
> >>>> +				struct list_head *resv_regions)
> >>>> +{
> >>>> +	struct iommu_resv_region *region;
> >>>> +
> >>>> +	/* Check for conflict with existing dma mappings */
> >>>> +	list_for_each_entry(region, resv_regions, list) {
> >>>> +		if (vfio_find_dma_overlap(iommu, region->start,
> >>>> +				    region->start + region->length - 1))
> >>>> +			return -EINVAL;
> >>>> +	}
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>
> >>> This basically does the same test as vfio_iommu_valid_aperture but
> >>> properly names it a conflict test.  Please be consistent.  Should this
> >>> also return bool, "conflict" is a yes/no answer.
> >>>
> >>>> +
> >>>> +/*
> >>>>   * Adjust the iommu aperture window if new aperture is a valid one
> >>>>   */
> >>>>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> >>>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct
> >> vfio_iommu *iommu,
> >>>>  	return 0;
> >>>>  }
> >>>>
> >>>> +/*
> >>>> + * Check and update iova region list in case a reserved region
> >>>> + * overlaps the iommu iova range
> >>>> + */
> >>>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
> >>>> +					struct list_head *resv_regions)
> >>>
> >>> "resv_region" in previous function, just "resv" here, use consistent
> >>> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.
> >>>
> >>>> +{
> >>>> +	struct iommu_resv_region *resv;
> >>>> +	struct list_head *iova = &iommu->iova_list;
> >>>> +	struct vfio_iova *n, *next;
> >>>> +
> >>>> +	list_for_each_entry(resv, resv_regions, list) {
> >>>> +		phys_addr_t start, end;
> >>>> +
> >>>> +		start = resv->start;
> >>>> +		end = resv->start + resv->length - 1;
> >>>> +
> >>>> +		list_for_each_entry_safe(n, next, iova, list) {
> >>>> +			phys_addr_t a, b;
> >>>> +			int ret = 0;
> >>>> +
> >>>> +			a = n->start;
> >>>> +			b = n->end;
> >>>
> >>> 'a' and 'b' variables actually make this incredibly confusing.  Use
> >>> better variable names or just drop them entirely, it's much easier to
> >>> follow as n->start & n->end.
> >>>
> >>>> +			/* No overlap */
> >>>> +			if ((start > b) || (end < a))
> >>>> +				continue;
> >>>> +			/* Split the current node and create holes */
> >>>> +			if (start > a)
> >>>> +				ret = vfio_insert_iova(a, start - 1, &n->list);
> >>>> +			if (!ret && end < b)
> >>>> +				ret = vfio_insert_iova(end + 1, b, &n->list);
> >>>> +			if (ret)
> >>>> +				return ret;
> >>>> +
> >>>> +			list_del(&n->list);
> >>>
> >>> This is trickier than it appears and deserves some explanation.  AIUI,
> >>> we're actually inserting duplicate entries for the remainder at the
> >>> start of the range and then at the end of the range (and the order is
> >>> important here because we're inserting each before the current node),
> >>> and then we delete the current node.  So the iova_list is kept sorted
> >>> through this process, though temporarily includes some bogus, unordered
> >>> sub-sets.
> >>>
> >>>> +			kfree(n);
> >>>> +		}
> >>>> +	}
> >>>> +
> >>>> +	if (list_empty(iova))
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>>>  					 struct iommu_group *iommu_group)
> >>>>  {
> >>>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void
> >> *iommu_data,
> >>>>  	bool resv_msi, msi_remap;
> >>>>  	phys_addr_t resv_msi_base;
> >>>>  	struct iommu_domain_geometry geo;
> >>>> +	struct list_head group_resv_regions;
> >>>> +	struct iommu_resv_region *resv, *resv_next;
> >>>>
> >>>>  	mutex_lock(&iommu->lock);
> >>>>
> >>>> @@ -1404,6 +1484,14 @@ static int
> vfio_iommu_type1_attach_group(void
> >> *iommu_data,
> >>>>  	if (ret)
> >>>>  		goto out_detach;
> >>>>
> >>>> +	INIT_LIST_HEAD(&group_resv_regions);
> >>>> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> >>>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> >> iommu_get_group_resv_regions returns a sorted list (see
> >> iommu_insert_resv_regions kerneldoc comment). You can have overlapping
> >> regions of different types though.
> >
> > Hmm..I am not sure. It looks like it is sorted only if the regions are of same
> type.
> >
> > "* The new element is sorted by address with respect to the other
> >  * regions of the same type."
> >
> > So hypothetically if there are two groups with regions like,
> >
> > Group 1.
> >  Start       size            type
> >   0x0000   0x1000        1
> >   0x2000   0x1000        1
> >   0x5000   0x1000        1
> >
> > Group 2
> >   Start       size              type
> >    0x2000  0x4000           2
> >    0x7000   0x1000          1
> >
> > Then the  iommu_get_group_resv_regions() will return,
> >
> > 0x0000   0x1000        1
> > 0x2000   0x1000        1
> > 0x5000   0x1000        1
> > 0x2000  0x4000         2
> > 0x7000   0x1000        1
> 
> Hum yes, I remember now, sorry. It was made on purpose to avoid to
> display interleaved resv region types in
> /sys/kernel/iommu_groups/reserved_regions. I think it gives a better
> user experience.

Ok. However, I have a feeling that sorting may not be required in this 
patch. I will double check the logic in vfio_iommu_iova_resv_adjust() and if
possible will remove the sorting.

Thanks,
Shameer

> Thanks
> 
> Eric
> >
> > But honestly I am not sure the above is a valid scenario or not. I am
> > happy to remove the sorting if such a case will never happen.
> >
> > Please let me know.
> >
> > Thanks,
> > Shameer
> >
> >> Eric
> >>>> +
> >>>> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
> >>>> +	if (ret)
> >>>> +		goto out_detach;
> >>>> +
> >>>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
> >>>>
> >>>>  	INIT_LIST_HEAD(&domain->group_list);
> >>>> @@ -1434,11 +1522,15 @@ static int
> vfio_iommu_type1_attach_group(void
> >> *iommu_data,
> >>>>  		    d->prot == domain->prot) {
> >>>>  			iommu_detach_group(domain->domain,
> >> iommu_group);
> >>>>  			if (!iommu_attach_group(d->domain, iommu_group)) {
> >>>> +				ret = vfio_iommu_iova_resv_adjust(iommu,
> >>>> +
> >> 	&group_resv_regions);
> >>>> +				if (!ret)
> >>>> +					goto out_domain;
> >>>
> >>> The above function is not without side effects if it fails, it's
> >>> altered the iova_list.  It needs to be valid for the remaining domains
> >>> if we're going to continue.
> >>>
> >>>> +
> >>>>  				list_add(&group->next, &d->group_list);
> >>>>  				iommu_domain_free(domain->domain);
> >>>>  				kfree(domain);
> >>>> -				mutex_unlock(&iommu->lock);
> >>>> -				return 0;
> >>>> +				goto done;
> >>>>  			}
> >>>>
> >>>>  			ret = iommu_attach_group(domain->domain,
> >> iommu_group);
> >>>> @@ -1465,8 +1557,15 @@ static int
> vfio_iommu_type1_attach_group(void
> >> *iommu_data,
> >>>>  	if (ret)
> >>>>  		goto out_detach;
> >>>>
> >>>> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
> >>>> +	if (ret)
> >>>> +		goto out_detach;
> >>>
> >>> Can't we process the reserved regions once before we get here rather
> >>> than have two separate call points that do the same thing?  In order to
> >>> roll back from errors above, it seems like we need to copy iova_list
> >>> and work on the copy, installing it and deleting the original only on
> >>> success.
> >>>
> >>>> +
> >>>>  	list_add(&domain->next, &iommu->domain_list);
> >>>>
> >>>> +done:
> >>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> >>>> +		kfree(resv);
> >>>>  	mutex_unlock(&iommu->lock);
> >>>>
> >>>>  	return 0;
> >>>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void
> >> *iommu_data,
> >>>>  	iommu_detach_group(domain->domain, iommu_group);
> >>>>  out_domain:
> >>>>  	iommu_domain_free(domain->domain);
> >>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> >>>> +		kfree(resv);
> >>>>  out_free:
> >>>>  	kfree(domain);
> >>>>  	kfree(group);
> >>>> @@ -1559,6 +1660,60 @@ static void
> vfio_iommu_iova_aper_refresh(struct
> >> vfio_iommu *iommu)
> >>>>  	node->end = end;
> >>>>  }
> >>>>
> >>>> +/*
> >>>> + * Called when a group is detached. The reserved regions for that
> >>>> + * group can be part of valid iova now. But since reserved regions
> >>>> + * may be duplicated among groups, populate the iova valid regions
> >>>> +   list again.
> >>>> + */
> >>>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> >>>> +{
> >>>> +	struct vfio_domain *d;
> >>>> +	struct vfio_group *g;
> >>>> +	struct vfio_iova *node, *tmp;
> >>>> +	struct iommu_resv_region *resv, *resv_next;
> >>>> +	struct list_head resv_regions;
> >>>> +	phys_addr_t start, end;
> >>>> +
> >>>> +	INIT_LIST_HEAD(&resv_regions);
> >>>> +
> >>>> +	list_for_each_entry(d, &iommu->domain_list, next) {
> >>>> +		list_for_each_entry(g, &d->group_list, next)
> >>>> +			iommu_get_group_resv_regions(g->iommu_group,
> >>>> +							 &resv_regions);
> >>>> +	}
> >>>> +
> >>>> +	if (list_empty(&resv_regions))
> >>>> +		return;
> >>>> +
> >>>> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
> >>>> +
> >>>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> >>>> +	start = node->start;
> >>>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> >>>> +	end = node->end;
> >>>
> >>> list_sort() only sorts based on ->start, we added reserved regions for
> >>> all our groups to one list, we potentially have multiple entries with
> >>> the same ->start.  How can we be sure that the last one in the list
> >>> actually has the largest ->end value?
> >>>
> >>>> +
> >>>> +	/* purge the iova list and create new one */
> >>>> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
> >>>> +		list_del(&node->list);
> >>>> +		kfree(node);
> >>>> +	}
> >>>> +
> >>>> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
> >>>> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map
> >> request may fail\n",
> >>>> +			__func__);
> >>>
> >>> Map requests "will" fail.  Is this the right error strategy?  Detaching
> >>> a group cannot fail.  Aren't we better off leaving the iova_list we had
> >>> in place?  If we cannot expand the iova aperture when a group is
> >>> removed, a user can continue unscathed.
> >>>
> >>>> +		goto done;
> >>>> +	}
> >>>> +
> >>>> +	/* adjust the iova with current reserved regions */
> >>>> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
> >>>> +		pr_warn("%s: Failed to update iova list with reserve regions.
> >> VFIO DMA map request may fail\n",
> >>>> +			__func__);
> >>>
> >>> Same.
> >>>
> >>>> +done:
> >>>> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
> >>>> +		kfree(resv);
> >>>> +}
> >>>> +
> >>>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>>>  					  struct iommu_group *iommu_group)
> >>>>  {
> >>>> @@ -1617,6 +1772,8 @@ static void
> vfio_iommu_type1_detach_group(void
> >> *iommu_data,
> >>>>  		break;
> >>>>  	}
> >>>>
> >>>> +	vfio_iommu_iova_resv_refresh(iommu);
> >>>> +
> >>>>  detach_group_done:
> >>>>  	mutex_unlock(&iommu->lock);
> >>>>  }
> >>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2018-01-23 15:26 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-12 16:45 [RFC v2 0/5] vfio/type1: Add support for valid iova list management Shameer Kolothum
2018-01-12 16:45 ` [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
2018-01-18  0:04   ` Alex Williamson
2018-01-19  9:47     ` Shameerali Kolothum Thodi
2018-01-23  8:25     ` Auger Eric
2018-01-23 10:04       ` Shameerali Kolothum Thodi
2018-01-23 11:20         ` Auger Eric
2018-01-12 16:45 ` [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list Shameer Kolothum
2018-01-18  0:04   ` Alex Williamson
2018-01-19  9:48     ` Shameerali Kolothum Thodi
2018-01-19 15:45       ` Alex Williamson
2018-01-23  8:32     ` Auger Eric
2018-01-23 12:16       ` Shameerali Kolothum Thodi
2018-01-23 12:51         ` Auger Eric
2018-01-23 15:26           ` Shameerali Kolothum Thodi
2018-01-12 16:45 ` [RFC v2 3/5] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
2018-01-23  8:38   ` Auger Eric
2018-01-12 16:45 ` [RFC v2 4/5] vfio/type1: Add IOVA range capability support Shameer Kolothum
2018-01-23 11:16   ` Auger Eric
2018-01-23 12:51     ` Shameerali Kolothum Thodi
2018-01-12 16:45 ` [RFC v2 5/5] vfio/type1: remove duplicate retrieval of reserved regions Shameer Kolothum

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.