* [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: xuxiaoyang (C) @ 2020-11-21  7:58 UTC
  To: linux-kernel, kvm, Alex Williamson
  Cc: kwankhede, wu.wubin, maoming.maoming, xieyingtai, lizhengui,
	wubinfeng, xuxiaoyang (C)

vfio_pin_pages() accepts an array of unrelated iova pfns and processes
each to return the physical pfn.  When dealing with large arrays of
contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
it processes them page by page.  In this case, we can divide the iova
pfn array into multiple contiguous ranges and process each range as a
whole.  For example, when the iova pfn array is {1,5,6,7,9}, it will be
divided into three groups {1}, {5,6,7}, {9} for processing.  When
processing {5,6,7}, the number of calls to pin_user_pages_remote is
reduced from three to one.  For a single page or a large array of
discontiguous iovas, we still use vfio_pin_page_external, to limit the
performance loss caused by the refactoring.
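
The splitting itself is simple.  As a minimal illustration (a
hypothetical helper, not code from this patch; the real logic, with the
extra vfio_dma range and vpfn checks, lives in
vfio_get_contiguous_pages_length below):

	/*
	 * Return the length of the leading run of consecutive pfns in
	 * the array.  Walking {1,5,6,7,9} this way yields runs of
	 * length 1, 3 and 1 as the caller advances through the array.
	 */
	static int contiguous_run_length(const unsigned long *pfn, int npage)
	{
		int i;

		for (i = 1; i < npage; i++)
			if (pfn[i] != pfn[0] + i)
				break;
		return i;
	}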

Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>
---
v1 -> v2:
 * make vfio_iommu_type1_pin_contiguous_pages use vfio_pin_page_external
 to pin a single page when npage == 1
 * make vfio_pin_contiguous_pages_external set npage to mark consecutive
 pages as dirty; simplify the unwind logic
 * remove unnecessary checks in vfio_get_contiguous_pages_length, put
 the least costly checks first, and replace vfio_iova_get_vfio_pfn
 with vfio_find_vpfn

 drivers/vfio/vfio_iommu_type1.c | 231 ++++++++++++++++++++++++++++----
 1 file changed, 204 insertions(+), 27 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 67e827638995..080727b531c6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -628,6 +628,196 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 	return unlocked;
 }

+static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+				    int prot, long npage, unsigned long *phys_pfn)
+{
+	struct page **pages = NULL;
+	unsigned int flags = 0;
+	int i, ret;
+
+	pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	if (prot & IOMMU_WRITE)
+		flags |= FOLL_WRITE;
+
+	mmap_read_lock(mm);
+	ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
+				    pages, NULL, NULL);
+	mmap_read_unlock(mm);
+
+	for (i = 0; i < ret; i++)
+		*(phys_pfn + i) = page_to_pfn(pages[i]);
+
+	kvfree(pages);
+
+	return ret;
+}
+
+static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
+				    struct vfio_dma *dma,
+				    unsigned long *user_pfn,
+				    int npage, unsigned long *phys_pfn,
+				    bool do_accounting)
+{
+	int ret, i, j, lock_acct = 0;
+	unsigned long remote_vaddr;
+	dma_addr_t iova;
+	struct mm_struct *mm;
+	struct vfio_pfn *vpfn;
+
+	mm = get_task_mm(dma->task);
+	if (!mm)
+		return -ENODEV;
+
+	iova = user_pfn[0] << PAGE_SHIFT;
+	remote_vaddr = dma->vaddr + iova - dma->iova;
+	ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
+					    npage, phys_pfn);
+	mmput(mm);
+	if (ret <= 0)
+		return ret;
+
+	npage = ret;
+	for (i = 0; i < npage; i++) {
+		iova = user_pfn[i] << PAGE_SHIFT;
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		if (ret)
+			goto unwind;
+
+		if (!is_invalid_reserved_pfn(phys_pfn[i]))
+			lock_acct++;
+	}
+
+	if (do_accounting) {
+		ret = vfio_lock_acct(dma, lock_acct, true);
+		if (ret) {
+			if (ret == -ENOMEM)
+				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
+					__func__, dma->task->comm, task_pid_nr(dma->task),
+					task_rlimit(dma->task, RLIMIT_MEMLOCK));
+			goto unwind;
+		}
+	}
+
+	if (iommu->dirty_page_tracking) {
+		unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+		/*
+		 * Bitmap populated with the smallest supported page
+		 * size
+		 */
+		bitmap_set(dma->bitmap,
+			   ((user_pfn[0] << PAGE_SHIFT) - dma->iova) >> pgshift, npage);
+	}
+
+	return i;
+unwind:
+	for (j = 0; j < npage; j++) {
+		if (j < i) {
+			iova = user_pfn[j] << PAGE_SHIFT;
+			vpfn = vfio_find_vpfn(dma, iova);
+			vfio_iova_put_vfio_pfn(dma, vpfn);
+		} else {
+			put_pfn(phys_pfn[j], dma->prot);
+		}
+
+		phys_pfn[j] = 0;
+	}
+
+	return ret;
+}
+
+static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
+					    struct vfio_dma *dma,
+					    unsigned long *user_pfn,
+					    int npage, unsigned long *phys_pfn,
+					    bool do_accounting)
+{
+	int ret = 0, i, j;
+	unsigned long remote_vaddr;
+	dma_addr_t iova;
+
+	if (npage == 1)
+		goto pin_single_page;
+
+	ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
+				phys_pfn, do_accounting);
+	if (ret == npage)
+		return ret;
+
+	if (ret < 0)
+		ret = 0;
+
+pin_single_page:
+	for (i = ret; i < npage; i++) {
+		iova = user_pfn[i] << PAGE_SHIFT;
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
+			    do_accounting);
+		if (ret)
+			goto pin_unwind;
+
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		if (ret) {
+			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
+				vfio_lock_acct(dma, -1, true);
+			goto pin_unwind;
+		}
+
+		if (iommu->dirty_page_tracking) {
+			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+			/*
+			 * Bitmap populated with the smallest supported page
+			 * size
+			 */
+			bitmap_set(dma->bitmap,
+					   (iova - dma->iova) >> pgshift, 1);
+		}
+	}
+
+	return i;
+
+pin_unwind:
+	phys_pfn[i] = 0;
+	for (j = 0; j < i; j++) {
+		iova = user_pfn[j] << PAGE_SHIFT;
+		vfio_unpin_page_external(dma, iova, do_accounting);
+		phys_pfn[j] = 0;
+	}
+
+	return ret;
+}
+
+static int vfio_get_contiguous_pages_length(struct vfio_dma *dma,
+				    unsigned long *user_pfn, int npage)
+{
+	int i;
+	dma_addr_t iova = user_pfn[0] << PAGE_SHIFT;
+	struct vfio_pfn *vpfn;
+
+	if (npage <= 1)
+		return npage;
+
+	for (i = 1; i < npage; i++) {
+		if (user_pfn[i] != user_pfn[0] + i)
+			break;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+		if (iova >= dma->iova + dma->size ||
+				iova + PAGE_SIZE <= dma->iova)
+			break;
+
+		vpfn = vfio_find_vpfn(dma, iova);
+		if (vpfn)
+			break;
+	}
+	return i;
+}
+
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
 				      struct iommu_group *iommu_group,
 				      unsigned long *user_pfn,
@@ -637,9 +827,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_group *group;
 	int i, j, ret;
-	unsigned long remote_vaddr;
 	struct vfio_dma *dma;
 	bool do_accounting;
+	int contiguous_npage;

 	if (!iommu || !user_pfn || !phys_pfn)
 		return -EINVAL;
@@ -663,7 +853,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	 */
 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);

-	for (i = 0; i < npage; i++) {
+	for (i = 0; i < npage; i += contiguous_npage) {
 		dma_addr_t iova;
 		struct vfio_pfn *vpfn;

@@ -682,31 +872,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
 		if (vpfn) {
 			phys_pfn[i] = vpfn->pfn;
-			continue;
-		}
-
-		remote_vaddr = dma->vaddr + (iova - dma->iova);
-		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
-					     do_accounting);
-		if (ret)
-			goto pin_unwind;
-
-		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
-		if (ret) {
-			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
-				vfio_lock_acct(dma, -1, true);
-			goto pin_unwind;
-		}
-
-		if (iommu->dirty_page_tracking) {
-			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
-
-			/*
-			 * Bitmap populated with the smallest supported page
-			 * size
-			 */
-			bitmap_set(dma->bitmap,
-				   (iova - dma->iova) >> pgshift, 1);
+			contiguous_npage = 1;
+		} else {
+			ret = vfio_get_contiguous_pages_length(dma,
+					&user_pfn[i], npage - i);
+			if (ret < 0)
+				goto pin_unwind;
+
+			ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
+					dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting);
+			if (ret < 0)
+				goto pin_unwind;
+			contiguous_npage = ret;
 		}
 	}
 	ret = i;
--
2.19.1


* Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: xuxiaoyang (C) @ 2020-12-08 13:55 UTC
  To: linux-kernel, kvm, Alex Williamson
  Cc: kwankhede, wu.wubin, maoming.maoming, xieyingtai, lizhengui,
	wubinfeng, Cornelia Huck, Eric Farman, Zhenyu Wang, Zhi Wang



On 2020/11/21 15:58, xuxiaoyang (C) wrote:
> vfio_pin_pages() accepts an array of unrelated iova pfns and processes
> each to return the physical pfn.  When dealing with large arrays of
> contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
> it processes them page by page.  In this case, we can divide the iova
> pfn array into multiple contiguous ranges and process each range as a
> whole.  For example, when the iova pfn array is {1,5,6,7,9}, it will be
> divided into three groups {1}, {5,6,7}, {9} for processing.  When
> processing {5,6,7}, the number of calls to pin_user_pages_remote is
> reduced from three to one.  For a single page or a large array of
> discontiguous iovas, we still use vfio_pin_page_external, to limit the
> performance loss caused by the refactoring.
> 
> Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>
>
> (...)

Hi Cornelia Huck, Eric Farman, Zhenyu Wang, Zhi Wang,

vfio_pin_pages() accepts an array of unrelated iova pfns and processes
each to return the physical pfn.  When dealing with large arrays of
contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
it processes them page by page.  In this case, we can divide the iova
pfn array into multiple contiguous ranges and process each range as a
whole.  I have a set of performance test data for reference.

Without the patch applied:
                    1 page           512 pages
no huge pages:      1638ns           223651ns
THP:                1668ns           222330ns
HugeTLB:            1526ns           208151ns

With the patch applied:
                    1 page           512 pages
no huge pages:      1735ns           167286ns
THP:                1934ns           126900ns
HugeTLB:            1713ns           102188ns
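
For context, a rough sketch of how such a measurement can be taken
(hypothetical helper names, written against the 5.10-era vfio API; this
is an illustration, not the harness actually used for the numbers
above):

	#include <linux/iommu.h>
	#include <linux/ktime.h>
	#include <linux/slab.h>
	#include <linux/vfio.h>

	/*
	 * Time one vfio_pin_pages() call for a contiguous 512-page
	 * range.  Assumes an mdev driver context with a valid 'dev'
	 * whose container has the iova range mapped; 512 matches
	 * VFIO_PIN_PAGES_MAX_ENTRIES on 4K-page systems, the most one
	 * call may pin.
	 */
	static void time_pin_512(struct device *dev, unsigned long first_iova_pfn)
	{
		unsigned long *user_pfn, *phys_pfn;
		u64 start, elapsed;
		int i, ret;

		user_pfn = kcalloc(512, sizeof(*user_pfn), GFP_KERNEL);
		phys_pfn = kcalloc(512, sizeof(*phys_pfn), GFP_KERNEL);
		if (!user_pfn || !phys_pfn)
			goto out;

		for (i = 0; i < 512; i++)
			user_pfn[i] = first_iova_pfn + i;	/* contiguous iovas */

		start = ktime_get_ns();
		ret = vfio_pin_pages(dev, user_pfn, 512,
				     IOMMU_READ | IOMMU_WRITE, phys_pfn);
		elapsed = ktime_get_ns() - start;

		pr_info("pinned %d pages in %llu ns\n", ret, elapsed);
		if (ret > 0)
			vfio_unpin_pages(dev, user_pfn, ret);
	out:
		kfree(user_pfn);
		kfree(phys_pfn);
	}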

As Alex Williamson said, this patch still lacks proof that it works in
the real world.  I would appreciate your opinions.

Regards,
Xu



* Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: Cornelia Huck @ 2020-12-09 11:54 UTC
  To: xuxiaoyang (C), Eric Farman
  Cc: linux-kernel, kvm, Alex Williamson, kwankhede, wu.wubin,
	maoming.maoming, xieyingtai, lizhengui, wubinfeng, Zhenyu Wang,
	Zhi Wang

On Tue, 8 Dec 2020 21:55:53 +0800
"xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:

> On 2020/11/21 15:58, xuxiaoyang (C) wrote:
> > vfio_pin_pages() accepts an array of unrelated iova pfns and processes
> > each to return the physical pfn.  When dealing with large arrays of
> > contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
> > it processes them page by page.  In this case, we can divide the iova
> > pfn array into multiple contiguous ranges and process each range as a
> > whole.  For example, when the iova pfn array is {1,5,6,7,9}, it will be
> > divided into three groups {1}, {5,6,7}, {9} for processing.  When
> > processing {5,6,7}, the number of calls to pin_user_pages_remote is
> > reduced from three to one.  For a single page or a large array of
> > discontiguous iovas, we still use vfio_pin_page_external, to limit the
> > performance loss caused by the refactoring.
> > 
> > Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>

(...)

> 
> Hi Cornelia Huck, Eric Farman, Zhenyu Wang, Zhi Wang,
> 
> vfio_pin_pages() accepts an array of unrelated iova pfns and processes
> each to return the physical pfn.  When dealing with large arrays of
> contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
> it processes them page by page.  In this case, we can divide the iova
> pfn array into multiple contiguous ranges and process each range as a
> whole.  I have a set of performance test data for reference.
> 
> Without the patch applied:
>                     1 page           512 pages
> no huge pages:      1638ns           223651ns
> THP:                1668ns           222330ns
> HugeTLB:            1526ns           208151ns
> 
> With the patch applied:
>                     1 page           512 pages
> no huge pages:      1735ns           167286ns
> THP:                1934ns           126900ns
> HugeTLB:            1713ns           102188ns
> 
> As Alex Williamson said, this patch still lacks proof that it works in
> the real world.  I would appreciate your opinions.

Looking at this from the vfio-ccw angle, I'm not sure how much this
would buy us, as we deal with IDAWs, which are designed so that they
can be non-contiguous. I guess this depends a lot on what the guest
does.

Eric, any opinion? Do you maybe also happen to have a test setup that
mimics workloads actually seen in the real world?



* Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: Eric Farman @ 2020-12-09 14:42 UTC
  To: Cornelia Huck, xuxiaoyang (C)
  Cc: linux-kernel, kvm, Alex Williamson, kwankhede, wu.wubin,
	maoming.maoming, xieyingtai, lizhengui, wubinfeng, Zhenyu Wang,
	Zhi Wang



On 12/9/20 6:54 AM, Cornelia Huck wrote:
> On Tue, 8 Dec 2020 21:55:53 +0800
> "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
> 
>> On 2020/11/21 15:58, xuxiaoyang (C) wrote:
>>> (...)
> 
>>
>> Hi Cornelia Huck, Eric Farman, Zhenyu Wang, Zhi Wang,
>>
>> vfio_pin_pages() accepts an array of unrelated iova pfns and processes
>> each to return the physical pfn.  When dealing with large arrays of
>> contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
>> it processes them page by page.  In this case, we can divide the iova
>> pfn array into multiple contiguous ranges and process each range as a
>> whole.  I have a set of performance test data for reference.
>>
>> Without the patch applied:
>>                     1 page           512 pages
>> no huge pages:      1638ns           223651ns
>> THP:                1668ns           222330ns
>> HugeTLB:            1526ns           208151ns
>>
>> With the patch applied:
>>                     1 page           512 pages
>> no huge pages:      1735ns           167286ns
>> THP:                1934ns           126900ns
>> HugeTLB:            1713ns           102188ns
>>
>> As Alex Williamson said, this patch still lacks proof that it works in
>> the real world.  I would appreciate your opinions.
> 
> Looking at this from the vfio-ccw angle, I'm not sure how much this
> would buy us, as we deal with IDAWs, which are designed so that they
> can be non-contiguous. I guess this depends a lot on what the guest
> does.

This would be my concern too, but I don't have data off the top of my 
head to say one way or another...

> 
> Eric, any opinion? Do you maybe also happen to have a test setup that
> mimics workloads actually seen in the real world?
> 

...I do have some test setups, which I will try to get some data from in 
a couple of days. At the moment I've broken most of those setups trying 
to implement some other stuff, and can't easily revert. Will get back 
to this.

Eric


* Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: xuxiaoyang (C) @ 2020-12-10 13:54 UTC
  To: Cornelia Huck, Eric Farman
  Cc: linux-kernel, kvm, Alex Williamson, kwankhede, wu.wubin,
	maoming.maoming, xieyingtai, lizhengui, wubinfeng, Zhenyu Wang,
	Zhi Wang



On 2020/12/9 19:54, Cornelia Huck wrote:
> On Tue, 8 Dec 2020 21:55:53 +0800
> "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
> 
>> (...)
> 
> Looking at this from the vfio-ccw angle, I'm not sure how much this
> would buy us, as we deal with IDAWs, which are designed so that they
> can be non-contiguous. I guess this depends a lot on what the guest
> does.
> 
> Eric, any opinion? Do you maybe also happen to have a test setup that
> mimics workloads actually seen in the real world?
> 
Thank you for your reply.  The iova array constructed by
pfn_array_alloc is contiguous, so I think there will be some
performance improvement here.
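
For reference, a simplified sketch (not a verbatim copy) of how
vfio-ccw's pfn_array_alloc fills the iova pfn array; each entry is the
previous one plus one, so the array handed to vfio_pin_pages is fully
contiguous:

	pa->pa_iova_pfn[0] = pa->pa_iova >> PAGE_SHIFT;
	for (i = 1; i < pa->pa_nr; i++)
		pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1;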

Regards,
Xu


* Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: xuxiaoyang (C) @ 2020-12-10 13:56 UTC
  To: Eric Farman, Cornelia Huck
  Cc: linux-kernel, kvm, Alex Williamson, kwankhede, wu.wubin,
	maoming.maoming, xieyingtai, lizhengui, wubinfeng, Zhenyu Wang,
	Zhi Wang



On 2020/12/9 22:42, Eric Farman wrote:
> 
> 
> On 12/9/20 6:54 AM, Cornelia Huck wrote:
>> On Tue, 8 Dec 2020 21:55:53 +0800
>> "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
>>
>>> (...)
>>
>> Looking at this from the vfio-ccw angle, I'm not sure how much this
>> would buy us, as we deal with IDAWs, which are designed so that they
>> can be non-contiguous. I guess this depends a lot on what the guest
>> does.
> 
> This would be my concern too, but I don't have data off the top of my head to say one way or another...
> 
>>
>> Eric, any opinion? Do you maybe also happen to have a test setup that
>> mimics workloads actually seen in the real world?
>>
> 
> ...I do have some test setups, which I will try to get some data from in a couple days. At the moment I've broken most of those setups trying to implement some other stuff, and can't revert back at the moment. Will get back to this.
> 
> Eric

Thank you for your reply. Looking forward to your test data.

Regards,
Xu


* Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: Eric Farman @ 2020-12-14 18:58 UTC
  To: xuxiaoyang (C), Cornelia Huck
  Cc: linux-kernel, kvm, Alex Williamson, kwankhede, wu.wubin,
	maoming.maoming, xieyingtai, lizhengui, wubinfeng, Zhenyu Wang,
	Zhi Wang



On 12/10/20 8:56 AM, xuxiaoyang (C) wrote:
> 
> 
> On 2020/12/9 22:42, Eric Farman wrote:
>> (...)
> 
> Thank you for your reply. Looking forward to your test data.

Xu,

The scenario I ran was a host kernel 5.10.0-rc7 with qemu 5.2.0, with a 
Fedora 32 guest with 4 VCPU and 4GB memory. I tried this a handful of 
times across a couple of different hosts, so the likelihood that these 
numbers are outliers is pretty low. The histograms below come from a 
simple bpftrace, recording the number of pages asked to be pinned, and 
the length of time (in nanoseconds) it took to pin all those pages. I 
separated out the length of time for a request of one page versus a 
request of multiple pages, because as you will see the former far 
outnumbers the latter.

The first thing I tried was simply to boot the guest via vfio-ccw, to 
see how the patch itself behaved:

@1_page_ns      BASE             +PATCH
256, 512        12531   42.50%   12744   42.26%
512, 1K          5660   19.20%    5611   18.61%
1K, 2K           8416   28.54%    8947   29.67%
2K, 4K           2694    9.14%    2669    8.85%
4K, 8K            164    0.56%     169    0.56%
8K, 16K            14    0.05%      14    0.05%
16K, 32K            2    0.01%       3    0.01%
32K, 64K            0    0.00%       0    0.00%
64K, 128K           0    0.00%       0    0.00%

@n_pages_ns     BASE             +PATCH
256, 512            0    0.00%       0    0.00%
512, 1K            67    0.97%      48    0.68%
1K, 2K           1598   23.13%    1036   14.71%
2K, 4K           2784   40.30%    3112   44.17%
4K, 8K           1288   18.64%    1579   22.41%
8K, 16K          1011   14.63%    1032   14.65%
16K, 32K          159    2.30%     234    3.32%
32K, 64K            1    0.01%       2    0.03%
64K, 128K           0    0.00%       2    0.03%

@npage          BASE             +PATCH
1               29484   81.02%   30157   81.06%
2, 4             3298    9.06%    3385    9.10%
4, 8             1011    2.78%    1029    2.77%
8, 16            2600    7.14%    2631    7.07%


The second thing I tried was simply fio, running it for about 10 minutes 
with a few minutes each for sequential read, sequential write, random 
read, and random write. (I tried this with both the guest booted off 
vfio-ccw and virtio-blk, but the difference was negligible.) The results 
in this space are similar as well:

@1_page_ns      BASE               +PATCH
256, 512        5648104   66.79%   6615878   66.75%
512, 1K         1784047   21.10%   2082852   21.01%
1K, 2K           648877    7.67%    771964    7.79%
2K, 4K           339551    4.01%    396381    4.00%
4K, 8K            32513    0.38%     40359    0.41%
8K, 16K            2602    0.03%      2884    0.03%
16K, 32K            758    0.01%       762    0.01%
32K, 64K            434    0.01%       352    0.00%

@n_pages_ns     BASE               +PATCH
256, 512              0    0.00%         0    0.00%
512, 1K          470803   12.18%    360524    7.95%
1K, 2K          1305166   33.75%   1739183   38.37%
2K, 4K          1338277   34.61%   1471161   32.46%
4K, 8K           733480   18.97%    937341   20.68%
8K, 16K           16954    0.44%     20708    0.46%
16K, 32K           1278    0.03%      2197    0.05%
32K, 64K            707    0.02%       703    0.02%

@npage          BASE               +PATCH
1               8457107   68.62%   9911624   68.62%
2, 4            2066957   16.77%   2446462   16.94%
4, 8             359989    2.92%    417188    2.89%
8, 16           1440006   11.68%   1668482   11.55%


I tried a smattering of other tests that might be more realistic, but 
the results were all pretty similar so there's no point in appending 
them here. Across the board, the amount of time spent on a multi-page 
request grows with the supplied patch. It doesn't get me very excited.

If you are wondering why this might be, Conny's initial take about IDAWs 
being non-contiguous by design is spot on. Let's observe the page counts 
given to vfio_iommu_type1_pin_contiguous_pages() in addition to the 
counts in vfio_iommu_type1_pin_pages(). The following is an example of 
one guest boot PLUS an fio run:

vfio_iommu_type1_pin_pages npage:
1        9890332    68.64%
2, 4     2438213    16.92%
4, 8      416278     2.89%
8, 16    1663201    11.54%
Total   14408024

vfio_iommu_type1_pin_contiguous_pages npage:
1       16384925    86.89%
2, 4     1327548     7.04%
4, 8      727564     3.86%
8, 16     417182     2.21%
Total   18857219

Yup... 87% of the calls to vfio_iommu_type1_pin_contiguous_pages() do so 
with a length of just a single page.

Happy to provide more data if desired, but it doesn't look like a 
benefit to vfio-ccw's use.

Thanks,
Eric


> 
> Regards,
> Xu
> 


* Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
From: xuxiaoyang (C) @ 2020-12-15 13:13 UTC
  To: Eric Farman, Cornelia Huck
  Cc: linux-kernel, kvm, Alex Williamson, kwankhede, wu.wubin,
	maoming.maoming, xieyingtai, lizhengui, wubinfeng, Zhenyu Wang,
	Zhi Wang



On 2020/12/15 2:58, Eric Farman wrote:
> (...)
> 
> Yup... 87% of the calls to vfio_iommu_type1_pin_contiguous_pages() do so with a length of just a single page.
> 
> Happy to provide more data if desired, but it doesn't look like a benefit to vfio-ccw's use.
> 
> Thanks,
> Eric
> 
> 
Eric, in your vfio-ccw data, single-page pins account for 87%, and the
contiguous runs are very short.  In my test data the contiguous run
length was 512 pages, which is a huge difference.  So it is easy to
understand why this patch does not benefit vfio-ccw.  Finally, thank
you very much for your test data.

Regards,
Xu

