linux-kernel.vger.kernel.org archive mirror
* [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
@ 2020-11-10 13:42 xuxiaoyang (C)
  2020-11-11 15:56 ` Alex Williamson
  2020-11-13 16:44 ` Alex Williamson
  0 siblings, 2 replies; 7+ messages in thread
From: xuxiaoyang (C) @ 2020-11-10 13:42 UTC (permalink / raw)
  To: linux-kernel, kvm, alex.williamson
  Cc: kwankhede, wu.wubin, maoming.maoming, xieyingtai, lizhengui,
	wubinfeng, xuxiaoyang2

vfio_iommu_type1_pin_pages is very inefficient because
it is processed page by page when calling vfio_pin_page_external.
Added contiguous_vaddr_get_pfn to process continuous pages
to reduce the number of loops, thereby improving performance.

Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 241 ++++++++++++++++++++++++++++----
 1 file changed, 214 insertions(+), 27 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 67e827638995..935f80807527 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -628,6 +628,206 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 	return unlocked;
 }

+static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+				    int prot, long npage, unsigned long *phys_pfn)
+{
+	struct page **pages = NULL;
+	unsigned int flags = 0;
+	int i, ret;
+
+	pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	if (prot & IOMMU_WRITE)
+		flags |= FOLL_WRITE;
+
+	mmap_read_lock(mm);
+	ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
+				    pages, NULL, NULL);
+	mmap_read_unlock(mm);
+
+	for (i = 0; i < ret; i++)
+		*(phys_pfn + i) = page_to_pfn(pages[i]);
+
+	kvfree(pages);
+
+	return ret;
+}
+
+static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
+				    struct vfio_dma *dma,
+				    unsigned long *user_pfn,
+				    int npage, unsigned long *phys_pfn,
+				    bool do_accounting)
+{
+	int ret, i, j, lock_acct = 0;
+	unsigned long remote_vaddr;
+	dma_addr_t iova;
+	struct mm_struct *mm;
+	struct vfio_pfn *vpfn;
+
+	mm = get_task_mm(dma->task);
+	if (!mm)
+		return -ENODEV;
+
+	iova = user_pfn[0] << PAGE_SHIFT;
+	remote_vaddr = dma->vaddr + iova - dma->iova;
+	ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
+					    npage, phys_pfn);
+	mmput(mm);
+	if (ret <= 0)
+		return ret;
+
+	npage = ret;
+	for (i = 0; i < npage; i++) {
+		iova = user_pfn[i] << PAGE_SHIFT;
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		if (ret)
+			goto unwind;
+
+		if (!is_invalid_reserved_pfn(phys_pfn[i]))
+			lock_acct++;
+
+		if (iommu->dirty_page_tracking) {
+			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+			/*
+			 * Bitmap populated with the smallest supported page
+			 * size
+			 */
+			bitmap_set(dma->bitmap,
+				   (iova - dma->iova) >> pgshift, 1);
+		}
+	}
+
+	if (do_accounting) {
+		ret = vfio_lock_acct(dma, lock_acct, true);
+		if (ret) {
+			if (ret == -ENOMEM)
+				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
+					__func__, dma->task->comm, task_pid_nr(dma->task),
+					task_rlimit(dma->task, RLIMIT_MEMLOCK));
+			goto unwind;
+		}
+	}
+
+	return i;
+unwind:
+	for (j = 0; j < npage; j++) {
+		put_pfn(phys_pfn[j], dma->prot);
+		phys_pfn[j] = 0;
+	}
+
+	for (j = 0; j < i; j++) {
+		iova = user_pfn[j] << PAGE_SHIFT;
+		vpfn = vfio_find_vpfn(dma, iova);
+		if (vpfn)
+			vfio_remove_from_pfn_list(dma, vpfn);
+	}
+
+	return ret;
+}
+
+static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
+					    struct vfio_dma *dma,
+					    unsigned long *user_pfn,
+					    int npage, unsigned long *phys_pfn,
+					    bool do_accounting)
+{
+	int ret, i, j;
+	unsigned long remote_vaddr;
+	dma_addr_t iova;
+
+	ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
+				phys_pfn, do_accounting);
+	if (ret == npage)
+		return ret;
+
+	if (ret < 0)
+		ret = 0;
+
+	for (i = ret; i < npage; i++) {
+		iova = user_pfn[i] << PAGE_SHIFT;
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
+			    do_accounting);
+		if (ret)
+			goto pin_unwind;
+
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		if (ret) {
+			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
+				vfio_lock_acct(dma, -1, true);
+			goto pin_unwind;
+		}
+
+		if (iommu->dirty_page_tracking) {
+			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+			/*
+			 * Bitmap populated with the smallest supported page
+			 * size
+			 */
+			bitmap_set(dma->bitmap,
+					   (iova - dma->iova) >> pgshift, 1);
+		}
+	}
+
+	return i;
+
+pin_unwind:
+	phys_pfn[i] = 0;
+	for (j = 0; j < i; j++) {
+		dma_addr_t iova;
+
+		iova = user_pfn[j] << PAGE_SHIFT;
+		vfio_unpin_page_external(dma, iova, do_accounting);
+		phys_pfn[j] = 0;
+	}
+
+	return ret;
+}
+
+static int vfio_iommu_type1_get_contiguous_pages_length(struct vfio_iommu *iommu,
+				    unsigned long *user_pfn, int npage, int prot)
+{
+	struct vfio_dma *dma_base;
+	int i;
+	dma_addr_t iova;
+	struct vfio_pfn *vpfn;
+
+	if (npage <= 1)
+		return npage;
+
+	iova = user_pfn[0] << PAGE_SHIFT;
+	dma_base = vfio_find_dma(iommu, iova, PAGE_SIZE);
+	if (!dma_base)
+		return -EINVAL;
+
+	if ((dma_base->prot & prot) != prot)
+		return -EPERM;
+
+	for (i = 1; i < npage; i++) {
+		iova = user_pfn[i] << PAGE_SHIFT;
+
+		if (iova >= dma_base->iova + dma_base->size ||
+				iova + PAGE_SIZE <= dma_base->iova)
+			break;
+
+		vpfn = vfio_iova_get_vfio_pfn(dma_base, iova);
+		if (vpfn) {
+			vfio_iova_put_vfio_pfn(dma_base, vpfn);
+			break;
+		}
+
+		if (user_pfn[i] != user_pfn[0] + i)
+			break;
+	}
+	return i;
+}
+
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
 				      struct iommu_group *iommu_group,
 				      unsigned long *user_pfn,
@@ -637,9 +837,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_group *group;
 	int i, j, ret;
-	unsigned long remote_vaddr;
 	struct vfio_dma *dma;
 	bool do_accounting;
+	int contiguous_npage;

 	if (!iommu || !user_pfn || !phys_pfn)
 		return -EINVAL;
@@ -663,7 +863,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	 */
 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);

-	for (i = 0; i < npage; i++) {
+	for (i = 0; i < npage; i += contiguous_npage) {
 		dma_addr_t iova;
 		struct vfio_pfn *vpfn;

@@ -682,31 +882,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
 		if (vpfn) {
 			phys_pfn[i] = vpfn->pfn;
-			continue;
-		}
-
-		remote_vaddr = dma->vaddr + (iova - dma->iova);
-		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
-					     do_accounting);
-		if (ret)
-			goto pin_unwind;
-
-		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
-		if (ret) {
-			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
-				vfio_lock_acct(dma, -1, true);
-			goto pin_unwind;
-		}
-
-		if (iommu->dirty_page_tracking) {
-			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
-
-			/*
-			 * Bitmap populated with the smallest supported page
-			 * size
-			 */
-			bitmap_set(dma->bitmap,
-				   (iova - dma->iova) >> pgshift, 1);
+			contiguous_npage = 1;
+		} else {
+			ret = vfio_iommu_type1_get_contiguous_pages_length(iommu,
+					&user_pfn[i], npage - i, prot);
+			if (ret < 0)
+				goto pin_unwind;
+
+			ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
+					dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting);
+			if (ret < 0)
+				goto pin_unwind;
+			contiguous_npage = ret;
 		}
 	}
 	ret = i;
--
2.19.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
  2020-11-10 13:42 [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance xuxiaoyang (C)
@ 2020-11-11 15:56 ` Alex Williamson
  2020-11-12 11:49   ` xuxiaoyang (C)
  2020-11-13 16:44 ` Alex Williamson
  1 sibling, 1 reply; 7+ messages in thread
From: Alex Williamson @ 2020-11-11 15:56 UTC (permalink / raw)
  To: xuxiaoyang (C)
  Cc: linux-kernel, kvm, kwankhede, wu.wubin, maoming.maoming,
	xieyingtai, lizhengui, wubinfeng

On Tue, 10 Nov 2020 21:42:33 +0800
"xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:

> vfio_iommu_type1_pin_pages is very inefficient because
> it is processed page by page when calling vfio_pin_page_external.
> Added contiguous_vaddr_get_pfn to process continuous pages
> to reduce the number of loops, thereby improving performance.

vfio_pin_pages() accepts an array of unrelated iova pfns and processes
each to return the physical pfn.  AFAICT this proposal makes an
unfounded and unverified assumption that the caller is asking for a
range of contiguous iova pfns.  That's not the semantics of the call.
This is wrong.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
  2020-11-11 15:56 ` Alex Williamson
@ 2020-11-12 11:49   ` xuxiaoyang (C)
  0 siblings, 0 replies; 7+ messages in thread
From: xuxiaoyang (C) @ 2020-11-12 11:49 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel, kvm, kwankhede, wu.wubin, maoming.maoming,
	xieyingtai, lizhengui, wubinfeng

On 2020/11/11 23:56, Alex Williamson wrote:
> On Tue, 10 Nov 2020 21:42:33 +0800
> "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
> 
>> vfio_iommu_type1_pin_pages is very inefficient because
>> it is processed page by page when calling vfio_pin_page_external.
>> Added contiguous_vaddr_get_pfn to process continuous pages
>> to reduce the number of loops, thereby improving performance.
> 
> vfio_pin_pages() accepts an array of unrelated iova pfns and processes
> each to return the physical pfn.  AFAICT this proposal makes an
> unfounded and unverified assumption that the caller is asking for a
> range of contiguous iova pfns.  That's not the semantics of the call.
> This is wrong.  Thanks,
> 
> Alex
> 
> .
> 
Thank you for your reply.  Sorry that the commit message was too brief
and not clear enough.
The patch does not change the external behavior of the function.  What it
does is divide the iova pfn array into multiple contiguous ranges and
optimize each of them.  For example, the iova pfn array {1,5,6,7,9} is
split into three groups {1}, {5,6,7}, {9} for processing.  When processing
{5,6,7}, the number of calls to pin_user_pages_remote drops from 3 to 1.
The more contiguous the iova pfn array, the greater the performance
improvement.  Most of the iova pfn arrays passed in by callers are
contiguous, for example in pfn_array_pin and gvt_pin_guest_page, so those
callers will see the improvement.
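
For illustration, the grouping step can be sketched as a small helper
(contiguous_run_length is a hypothetical name, not part of the patch):

/*
 * Return the length of the leading run of contiguous iova pfns in
 * user_pfn[0..npage).  The real patch additionally checks that the run
 * stays inside one vfio_dma and contains no already-pinned pages.
 */
static int contiguous_run_length(const unsigned long *user_pfn, int npage)
{
        int i;

        for (i = 1; i < npage; i++) {
                if (user_pfn[i] != user_pfn[0] + i)
                        break;
        }
        return i;
}

Applied repeatedly to {1,5,6,7,9}, it returns 1 for the run starting at
index 0, 3 for the run starting at index 1, and 1 for the run at index 4.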

Regards,
Xu

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
  2020-11-10 13:42 [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance xuxiaoyang (C)
  2020-11-11 15:56 ` Alex Williamson
@ 2020-11-13 16:44 ` Alex Williamson
  2020-11-16 13:47   ` xuxiaoyang (C)
  1 sibling, 1 reply; 7+ messages in thread
From: Alex Williamson @ 2020-11-13 16:44 UTC (permalink / raw)
  To: xuxiaoyang (C)
  Cc: linux-kernel, kvm, kwankhede, wu.wubin, maoming.maoming,
	xieyingtai, lizhengui, wubinfeng

On Tue, 10 Nov 2020 21:42:33 +0800
"xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:

> vfio_iommu_type1_pin_pages is very inefficient because
> it is processed page by page when calling vfio_pin_page_external.
> Added contiguous_vaddr_get_pfn to process continuous pages
> to reduce the number of loops, thereby improving performance.
> 
> Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 241 ++++++++++++++++++++++++++++----
>  1 file changed, 214 insertions(+), 27 deletions(-)

Sorry for my previous misunderstanding of the logic here.  Still, this
adds a lot of complexity, can you quantify the performance improvement
you're seeing?  Would it instead be more worthwhile to support an
interface that pins based on an iova and range?  Further comments below.

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 67e827638995..935f80807527 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -628,6 +628,206 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>  	return unlocked;
>  }
> 
> +static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +				    int prot, long npage, unsigned long *phys_pfn)
> +{
> +	struct page **pages = NULL;
> +	unsigned int flags = 0;
> +	int i, ret;
> +
> +	pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
> +	if (!pages)
> +		return -ENOMEM;
> +
> +	if (prot & IOMMU_WRITE)
> +		flags |= FOLL_WRITE;
> +
> +	mmap_read_lock(mm);
> +	ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
> +				    pages, NULL, NULL);

This is essentially the root of the performance improvement claim,
right?  ie. we're pinning a range of pages rather than individually.
Unfortunately it means we also add a dynamic memory allocation even
when npage=1.


> +	mmap_read_unlock(mm);
> +
> +	for (i = 0; i < ret; i++)
> +		*(phys_pfn + i) = page_to_pfn(pages[i]);
> +
> +	kvfree(pages);
> +
> +	return ret;
> +}
> +
> +static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
> +				    struct vfio_dma *dma,
> +				    unsigned long *user_pfn,
> +				    int npage, unsigned long *phys_pfn,
> +				    bool do_accounting)
> +{
> +	int ret, i, j, lock_acct = 0;
> +	unsigned long remote_vaddr;
> +	dma_addr_t iova;
> +	struct mm_struct *mm;
> +	struct vfio_pfn *vpfn;
> +
> +	mm = get_task_mm(dma->task);
> +	if (!mm)
> +		return -ENODEV;
> +
> +	iova = user_pfn[0] << PAGE_SHIFT;
> +	remote_vaddr = dma->vaddr + iova - dma->iova;
> +	ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
> +					    npage, phys_pfn);
> +	mmput(mm);
> +	if (ret <= 0)
> +		return ret;
> +
> +	npage = ret;
> +	for (i = 0; i < npage; i++) {
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> +		if (ret)
> +			goto unwind;
> +
> +		if (!is_invalid_reserved_pfn(phys_pfn[i]))
> +			lock_acct++;
> +
> +		if (iommu->dirty_page_tracking) {
> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> +
> +			/*
> +			 * Bitmap populated with the smallest supported page
> +			 * size
> +			 */
> +			bitmap_set(dma->bitmap,
> +				   (iova - dma->iova) >> pgshift, 1);
> +		}

It doesn't make sense that we wouldn't also optimize this to simply set
npage bits.  There's also no unwind for this.

> +	}
> +
> +	if (do_accounting) {
> +		ret = vfio_lock_acct(dma, lock_acct, true);
> +		if (ret) {
> +			if (ret == -ENOMEM)
> +				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> +					__func__, dma->task->comm, task_pid_nr(dma->task),
> +					task_rlimit(dma->task, RLIMIT_MEMLOCK));
> +			goto unwind;
> +		}
> +	}

This algorithm allows pinning many more pages in advance of testing
whether we've exceeded the task locked memory limit than the per page
approach.


> +
> +	return i;
> +unwind:
> +	for (j = 0; j < npage; j++) {
> +		put_pfn(phys_pfn[j], dma->prot);
> +		phys_pfn[j] = 0;
> +	}
> +
> +	for (j = 0; j < i; j++) {
> +		iova = user_pfn[j] << PAGE_SHIFT;
> +		vpfn = vfio_find_vpfn(dma, iova);
> +		if (vpfn)

Seems like not finding a vpfn would be an error.

> +			vfio_remove_from_pfn_list(dma, vpfn);

It seems poor form not to use vfio_iova_put_vfio_pfn() here even if we
know we hold the only reference.


> +	}
> +
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
> +					    struct vfio_dma *dma,
> +					    unsigned long *user_pfn,
> +					    int npage, unsigned long *phys_pfn,
> +					    bool do_accounting)
> +{
> +	int ret, i, j;
> +	unsigned long remote_vaddr;
> +	dma_addr_t iova;
> +
> +	ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
> +				phys_pfn, do_accounting);
> +	if (ret == npage)
> +		return ret;
> +
> +	if (ret < 0)
> +		ret = 0;


I'm lost, why do we need the single page iteration below??

> +	for (i = ret; i < npage; i++) {
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> +			    do_accounting);
> +		if (ret)
> +			goto pin_unwind;
> +
> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> +		if (ret) {
> +			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> +				vfio_lock_acct(dma, -1, true);
> +			goto pin_unwind;
> +		}
> +
> +		if (iommu->dirty_page_tracking) {
> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> +
> +			/*
> +			 * Bitmap populated with the smallest supported page
> +			 * size
> +			 */
> +			bitmap_set(dma->bitmap,
> +					   (iova - dma->iova) >> pgshift, 1);
> +		}
> +	}
> +
> +	return i;
> +
> +pin_unwind:
> +	phys_pfn[i] = 0;
> +	for (j = 0; j < i; j++) {
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[j] << PAGE_SHIFT;
> +		vfio_unpin_page_external(dma, iova, do_accounting);
> +		phys_pfn[j] = 0;
> +	}
> +
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_get_contiguous_pages_length(struct vfio_iommu *iommu,
> +				    unsigned long *user_pfn, int npage, int prot)
> +{
> +	struct vfio_dma *dma_base;
> +	int i;
> +	dma_addr_t iova;
> +	struct vfio_pfn *vpfn;
> +
> +	if (npage <= 1)
> +		return npage;
> +
> +	iova = user_pfn[0] << PAGE_SHIFT;
> +	dma_base = vfio_find_dma(iommu, iova, PAGE_SIZE);
> +	if (!dma_base)
> +		return -EINVAL;
> +
> +	if ((dma_base->prot & prot) != prot)
> +		return -EPERM;
> +
> +	for (i = 1; i < npage; i++) {
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		if (iova >= dma_base->iova + dma_base->size ||
> +				iova + PAGE_SIZE <= dma_base->iova)
> +			break;
> +
> +		vpfn = vfio_iova_get_vfio_pfn(dma_base, iova);
> +		if (vpfn) {
> +			vfio_iova_put_vfio_pfn(dma_base, vpfn);

Why not just use vfio_find_vpfn() rather than get+put?

> +			break;
> +		}
> +
> +		if (user_pfn[i] != user_pfn[0] + i)

Shouldn't this be the first test?

> +			break;
> +	}
> +	return i;
> +}
> +
>  static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  				      struct iommu_group *iommu_group,
>  				      unsigned long *user_pfn,
> @@ -637,9 +837,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_group *group;
>  	int i, j, ret;
> -	unsigned long remote_vaddr;
>  	struct vfio_dma *dma;
>  	bool do_accounting;
> +	int contiguous_npage;
> 
>  	if (!iommu || !user_pfn || !phys_pfn)
>  		return -EINVAL;
> @@ -663,7 +863,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  	 */
>  	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> 
> -	for (i = 0; i < npage; i++) {
> +	for (i = 0; i < npage; i += contiguous_npage) {
>  		dma_addr_t iova;
>  		struct vfio_pfn *vpfn;
> 
> @@ -682,31 +882,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>  		if (vpfn) {
>  			phys_pfn[i] = vpfn->pfn;
> -			continue;
> -		}
> -
> -		remote_vaddr = dma->vaddr + (iova - dma->iova);
> -		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> -					     do_accounting);
> -		if (ret)
> -			goto pin_unwind;
> -
> -		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> -		if (ret) {
> -			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> -				vfio_lock_acct(dma, -1, true);
> -			goto pin_unwind;
> -		}
> -
> -		if (iommu->dirty_page_tracking) {
> -			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> -
> -			/*
> -			 * Bitmap populated with the smallest supported page
> -			 * size
> -			 */
> -			bitmap_set(dma->bitmap,
> -				   (iova - dma->iova) >> pgshift, 1);
> +			contiguous_npage = 1;
> +		} else {
> +			ret = vfio_iommu_type1_get_contiguous_pages_length(iommu,
> +					&user_pfn[i], npage - i, prot);


It doesn't make a lot of sense to me that this isn't more integrated
into the base function.  For example, we're passing &user_pfn[i] for
which we've already converted to an iova, found the vfio_dma associated
to that iova, and checked the protection.  This callout does all of
that again on the same.

> +			if (ret < 0)
> +				goto pin_unwind;
> +
> +			ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
> +					dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting);
> +			if (ret < 0)
> +				goto pin_unwind;
> +			contiguous_npage = ret;
>  		}
>  	}
>  	ret = i;


This all seems _way_ more complicated than it needs to be, there are
too many different ways to flow through this code and claims of a
performance improvement are not substantiated with evidence.  The type1
code is already too fragile.  Please simplify and justify.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
  2020-11-13 16:44 ` Alex Williamson
@ 2020-11-16 13:47   ` xuxiaoyang (C)
  2020-11-16 16:33     ` Alex Williamson
  0 siblings, 1 reply; 7+ messages in thread
From: xuxiaoyang (C) @ 2020-11-16 13:47 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel, kvm, kwankhede, wu.wubin, maoming.maoming,
	xieyingtai, lizhengui, wubinfeng

On 2020/11/14 0:44, Alex Williamson wrote:
> On Tue, 10 Nov 2020 21:42:33 +0800
> "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
> 
>> vfio_iommu_type1_pin_pages is very inefficient because
>> it is processed page by page when calling vfio_pin_page_external.
>> Added contiguous_vaddr_get_pfn to process continuous pages
>> to reduce the number of loops, thereby improving performance.
>>
>> Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 241 ++++++++++++++++++++++++++++----
>>  1 file changed, 214 insertions(+), 27 deletions(-)
> 
> Sorry for my previous misunderstanding of the logic here.  Still, this
> adds a lot of complexity, can you quantify the performance improvement
> you're seeing?  Would it instead be more worthwhile to support an
> interface that pins based on an iova and range?  Further comments below.
> 
Thank you for your reply.  I have a set of performance test data for reference.
The host kernel version of my test environment is 4.19.36, and the test cases
pin one page and 512 pages respectively.  When pinning 512 pages, the input is
a contiguous iova range.  The tests also vary whether the memory backend uses
huge pages.  The numbers below are the average of multiple runs.

The patch was not applied
                    1 page           512 pages
no huge pages:     1638ns           223651ns
THP:               1668ns           222330ns
HugeTLB:           1526ns           208151ns

The patch was applied
                    1 page           512 pages
no huge pages:     1735ns           167286ns
THP:               1934ns           126900ns
HugeTLB:           1713ns           102188ns

Performance drops slightly for the single page pin case, while the per-page
time for pinning contiguous ranges is roughly halved.
If you have other suggestions for testing methods, please let me know.
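
For reference, the per-call latency can be measured from an mdev vendor
driver along these lines (a rough sketch only, not the exact harness used
here; measure_pin_latency and its arguments are hypothetical):

static void measure_pin_latency(struct mdev_device *mdev,
                                unsigned long *user_pfn,
                                unsigned long *phys_pfn, int npage)
{
        u64 t0, t1;
        int ret;

        t0 = ktime_get_ns();
        ret = vfio_pin_pages(mdev_dev(mdev), user_pfn, npage,
                             IOMMU_READ | IOMMU_WRITE, phys_pfn);
        t1 = ktime_get_ns();

        pr_info("pinned %d pages in %llu ns\n", ret, t1 - t0);

        if (ret > 0)
                vfio_unpin_pages(mdev_dev(mdev), user_pfn, ret);
}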

>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 67e827638995..935f80807527 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -628,6 +628,206 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>>  	return unlocked;
>>  }
>>
>> +static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>> +				    int prot, long npage, unsigned long *phys_pfn)
>> +{
>> +	struct page **pages = NULL;
>> +	unsigned int flags = 0;
>> +	int i, ret;
>> +
>> +	pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
>> +	if (!pages)
>> +		return -ENOMEM;
>> +
>> +	if (prot & IOMMU_WRITE)
>> +		flags |= FOLL_WRITE;
>> +
>> +	mmap_read_lock(mm);
>> +	ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
>> +				    pages, NULL, NULL);
> 
> This is essentially the root of the performance improvement claim,
> right?  ie. we're pinning a range of pages rather than individually.
> Unfortunately it means we also add a dynamic memory allocation even
> when npage=1.
> 
Yes, when npage = 1 I should keep the previous single page path and use a
local variable instead of allocating the pages array.
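
For example, roughly (a sketch of contiguous_vaddr_get_pfn reworked to skip
the allocation when npage == 1):

static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
                                    int prot, long npage, unsigned long *phys_pfn)
{
        struct page *stack_page;
        struct page **pages = &stack_page;
        unsigned int flags = 0;
        int i, ret;

        /* only allocate for the multi-page case */
        if (npage > 1) {
                pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
                if (!pages)
                        return -ENOMEM;
        }

        if (prot & IOMMU_WRITE)
                flags |= FOLL_WRITE;

        mmap_read_lock(mm);
        ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
                                    pages, NULL, NULL);
        mmap_read_unlock(mm);

        for (i = 0; i < ret; i++)
                phys_pfn[i] = page_to_pfn(pages[i]);

        if (pages != &stack_page)
                kvfree(pages);

        return ret;
}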
> 
>> +	mmap_read_unlock(mm);
>> +
>> +	for (i = 0; i < ret; i++)
>> +		*(phys_pfn + i) = page_to_pfn(pages[i]);
>> +
>> +	kvfree(pages);
>> +
>> +	return ret;
>> +}
>> +
>> +static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
>> +				    struct vfio_dma *dma,
>> +				    unsigned long *user_pfn,
>> +				    int npage, unsigned long *phys_pfn,
>> +				    bool do_accounting)
>> +{
>> +	int ret, i, j, lock_acct = 0;
>> +	unsigned long remote_vaddr;
>> +	dma_addr_t iova;
>> +	struct mm_struct *mm;
>> +	struct vfio_pfn *vpfn;
>> +
>> +	mm = get_task_mm(dma->task);
>> +	if (!mm)
>> +		return -ENODEV;
>> +
>> +	iova = user_pfn[0] << PAGE_SHIFT;
>> +	remote_vaddr = dma->vaddr + iova - dma->iova;
>> +	ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
>> +					    npage, phys_pfn);
>> +	mmput(mm);
>> +	if (ret <= 0)
>> +		return ret;
>> +
>> +	npage = ret;
>> +	for (i = 0; i < npage; i++) {
>> +		iova = user_pfn[i] << PAGE_SHIFT;
>> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>> +		if (ret)
>> +			goto unwind;
>> +
>> +		if (!is_invalid_reserved_pfn(phys_pfn[i]))
>> +			lock_acct++;
>> +
>> +		if (iommu->dirty_page_tracking) {
>> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
>> +
>> +			/*
>> +			 * Bitmap populated with the smallest supported page
>> +			 * size
>> +			 */
>> +			bitmap_set(dma->bitmap,
>> +				   (iova - dma->iova) >> pgshift, 1);
>> +		}
> 
> It doesn't make sense that we wouldn't also optimize this to simply set
> npage bits.  There's also no unwind for this.
> 
Thank you for your correction, I should set all npage bits with a single
bitmap_set() call.  There is also no unwind for the dirty bitmap on the
current master branch.  Is this a bug?
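
For example, roughly (sketch, assuming the whole run was pinned and lies
within a single vfio_dma):

        if (iommu->dirty_page_tracking) {
                unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
                dma_addr_t iova = user_pfn[0] << PAGE_SHIFT;

                /* mark all npage pages dirty in one call */
                bitmap_set(dma->bitmap, (iova - dma->iova) >> pgshift, npage);
        }
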
>> +	}
>> +
>> +	if (do_accounting) {
>> +		ret = vfio_lock_acct(dma, lock_acct, true);
>> +		if (ret) {
>> +			if (ret == -ENOMEM)
>> +				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
>> +					__func__, dma->task->comm, task_pid_nr(dma->task),
>> +					task_rlimit(dma->task, RLIMIT_MEMLOCK));
>> +			goto unwind;
>> +		}
>> +	}
> 
> This algorithm allows pinning many more pages in advance of testing
> whether we've exceeded the task locked memory limit than the per page
> approach.
> 
Up to VFIO_PIN_PAGES_MAX_ENTRIES pages instead of one.  But after a
failure, all pinned pages are released.  Is there a big impact here?
> 
>> +
>> +	return i;
>> +unwind:
>> +	for (j = 0; j < npage; j++) {
>> +		put_pfn(phys_pfn[j], dma->prot);
>> +		phys_pfn[j] = 0;
>> +	}
>> +
>> +	for (j = 0; j < i; j++) {
>> +		iova = user_pfn[j] << PAGE_SHIFT;
>> +		vpfn = vfio_find_vpfn(dma, iova);
>> +		if (vpfn)
> 
> Seems like not finding a vpfn would be an error.
> 
I think the logic above ensures that vpfn is not NULL here, so I will
delete this check.
>> +			vfio_remove_from_pfn_list(dma, vpfn);
> 
> It seems poor form not to use vfio_iova_put_vfio_pfn() here even if we
> know we hold the only reference.
> 
The logic here can be reduced to a single loop: when j < i, call
vfio_iova_put_vfio_pfn() so that the pin and the pfn list entry are
dropped together.
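
i.e. something like this helper (sketch only; vfio_unwind_pinned_pfns is a
hypothetical name, not an existing function):

static void vfio_unwind_pinned_pfns(struct vfio_dma *dma,
                                    unsigned long *user_pfn,
                                    unsigned long *phys_pfn,
                                    int pinned, int added)
{
        struct vfio_pfn *vpfn;
        int j;

        for (j = 0; j < pinned; j++) {
                if (j < added) {
                        /* drop the pfn list entry and the page pin together */
                        vpfn = vfio_find_vpfn(dma, user_pfn[j] << PAGE_SHIFT);
                        vfio_iova_put_vfio_pfn(dma, vpfn);
                } else {
                        /* pinned by pin_user_pages_remote() but never added */
                        put_pfn(phys_pfn[j], dma->prot);
                }
                phys_pfn[j] = 0;
        }
}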
> 
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
>> +					    struct vfio_dma *dma,
>> +					    unsigned long *user_pfn,
>> +					    int npage, unsigned long *phys_pfn,
>> +					    bool do_accounting)
>> +{
>> +	int ret, i, j;
>> +	unsigned long remote_vaddr;
>> +	dma_addr_t iova;
>> +
>> +	ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
>> +				phys_pfn, do_accounting);
>> +	if (ret == npage)
>> +		return ret;
>> +
>> +	if (ret < 0)
>> +		ret = 0;
> 
> 
> I'm lost, why do we need the single page iteration below??
> 
Since there is no retry in contiguous_vaddr_get_pfn, once an error occurs
the remaining pages fall back to being processed one page at a time.
Would it be better to add a retry in contiguous_vaddr_get_pfn instead?
>> +	for (i = ret; i < npage; i++) {
>> +		iova = user_pfn[i] << PAGE_SHIFT;
>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>> +
>> +		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
>> +			    do_accounting);
>> +		if (ret)
>> +			goto pin_unwind;
>> +
>> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>> +		if (ret) {
>> +			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
>> +				vfio_lock_acct(dma, -1, true);
>> +			goto pin_unwind;
>> +		}
>> +
>> +		if (iommu->dirty_page_tracking) {
>> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
>> +
>> +			/*
>> +			 * Bitmap populated with the smallest supported page
>> +			 * size
>> +			 */
>> +			bitmap_set(dma->bitmap,
>> +					   (iova - dma->iova) >> pgshift, 1);
>> +		}
>> +	}
>> +
>> +	return i;
>> +
>> +pin_unwind:
>> +	phys_pfn[i] = 0;
>> +	for (j = 0; j < i; j++) {
>> +		dma_addr_t iova;
>> +
>> +		iova = user_pfn[j] << PAGE_SHIFT;
>> +		vfio_unpin_page_external(dma, iova, do_accounting);
>> +		phys_pfn[j] = 0;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static int vfio_iommu_type1_get_contiguous_pages_length(struct vfio_iommu *iommu,
>> +				    unsigned long *user_pfn, int npage, int prot)
>> +{
>> +	struct vfio_dma *dma_base;
>> +	int i;
>> +	dma_addr_t iova;
>> +	struct vfio_pfn *vpfn;
>> +
>> +	if (npage <= 1)
>> +		return npage;
>> +
>> +	iova = user_pfn[0] << PAGE_SHIFT;
>> +	dma_base = vfio_find_dma(iommu, iova, PAGE_SIZE);
>> +	if (!dma_base)
>> +		return -EINVAL;
>> +
>> +	if ((dma_base->prot & prot) != prot)
>> +		return -EPERM;
>> +
>> +	for (i = 1; i < npage; i++) {
>> +		iova = user_pfn[i] << PAGE_SHIFT;
>> +
>> +		if (iova >= dma_base->iova + dma_base->size ||
>> +				iova + PAGE_SIZE <= dma_base->iova)
>> +			break;
>> +
>> +		vpfn = vfio_iova_get_vfio_pfn(dma_base, iova);
>> +		if (vpfn) {
>> +			vfio_iova_put_vfio_pfn(dma_base, vpfn);
> 
> Why not just use vfio_find_vpfn() rather than get+put?
> 
Thank you for your correction, I should use vfio_find_vpfn here.
>> +			break;
>> +		}
>> +
>> +		if (user_pfn[i] != user_pfn[0] + i)
> 
> Shouldn't this be the first test?
> 
Thank you for your correction, the cheapest check should indeed be the
first test.
>> +			break;
>> +	}
>> +	return i;
>> +}
>> +
>>  static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>  				      struct iommu_group *iommu_group,
>>  				      unsigned long *user_pfn,
>> @@ -637,9 +837,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>  	struct vfio_iommu *iommu = iommu_data;
>>  	struct vfio_group *group;
>>  	int i, j, ret;
>> -	unsigned long remote_vaddr;
>>  	struct vfio_dma *dma;
>>  	bool do_accounting;
>> +	int contiguous_npage;
>>
>>  	if (!iommu || !user_pfn || !phys_pfn)
>>  		return -EINVAL;
>> @@ -663,7 +863,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>  	 */
>>  	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
>>
>> -	for (i = 0; i < npage; i++) {
>> +	for (i = 0; i < npage; i += contiguous_npage) {
>>  		dma_addr_t iova;
>>  		struct vfio_pfn *vpfn;
>>
>> @@ -682,31 +882,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>  		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>>  		if (vpfn) {
>>  			phys_pfn[i] = vpfn->pfn;
>> -			continue;
>> -		}
>> -
>> -		remote_vaddr = dma->vaddr + (iova - dma->iova);
>> -		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
>> -					     do_accounting);
>> -		if (ret)
>> -			goto pin_unwind;
>> -
>> -		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>> -		if (ret) {
>> -			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
>> -				vfio_lock_acct(dma, -1, true);
>> -			goto pin_unwind;
>> -		}
>> -
>> -		if (iommu->dirty_page_tracking) {
>> -			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
>> -
>> -			/*
>> -			 * Bitmap populated with the smallest supported page
>> -			 * size
>> -			 */
>> -			bitmap_set(dma->bitmap,
>> -				   (iova - dma->iova) >> pgshift, 1);
>> +			contiguous_npage = 1;
>> +		} else {
>> +			ret = vfio_iommu_type1_get_contiguous_pages_length(iommu,
>> +					&user_pfn[i], npage - i, prot);
> 
> 
> It doesn't make a lot of sense to me that this isn't more integrated
> into the base function.  For example, we're passing &user_pfn[i] for
> which we've already converted to an iova, found the vfio_dma associated
> to that iova, and checked the protection.  This callout does all of
> that again on the same.
> 
Thanks for your correction, I will delete the redundant lookups in
vfio_iommu_type1_get_contiguous_pages_length and simplify it to
vfio_get_contiguous_pages_length(struct vfio_dma *dma, unsigned long *user_pfn,
int npage), operating on the vfio_dma the caller has already found.
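
For illustration, a sketch of what that simplified helper could look like
(assuming the caller has already looked up the vfio_dma for user_pfn[0] and
checked the protection):

static int vfio_get_contiguous_pages_length(struct vfio_dma *dma,
                                            unsigned long *user_pfn, int npage)
{
        dma_addr_t iova;
        int i;

        for (i = 1; i < npage; i++) {
                /* cheapest check first: stop at the first non-contiguous pfn */
                if (user_pfn[i] != user_pfn[0] + i)
                        break;

                iova = user_pfn[i] << PAGE_SHIFT;

                /* stay within the vfio_dma the caller already looked up */
                if (iova >= dma->iova + dma->size)
                        break;

                /* an already-pinned page ends the contiguous run */
                if (vfio_find_vpfn(dma, iova))
                        break;
        }
        return i;
}
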
>> +			if (ret < 0)
>> +				goto pin_unwind;
>> +
>> +			ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
>> +					dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting);
>> +			if (ret < 0)
>> +				goto pin_unwind;
>> +			contiguous_npage = ret;
>>  		}
>>  	}
>>  	ret = i;
> 
> 
> This all seems _way_ more complicated than it needs to be, there are
> too many different ways to flow through this code and claims of a
> performance improvement are not substantiated with evidence.  The type1
> code is already too fragile.  Please simplify and justify.  Thanks,
> 
> Alex
> 
> .
> 
The main issue is the balance between performance and complexity.
The test data has been given above, please tell me your opinion.

Regards,
Xu

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
  2020-11-16 13:47   ` xuxiaoyang (C)
@ 2020-11-16 16:33     ` Alex Williamson
  2020-11-17  3:31       ` xuxiaoyang (C)
  0 siblings, 1 reply; 7+ messages in thread
From: Alex Williamson @ 2020-11-16 16:33 UTC (permalink / raw)
  To: xuxiaoyang (C)
  Cc: linux-kernel, kvm, kwankhede, wu.wubin, maoming.maoming,
	xieyingtai, lizhengui, wubinfeng

On Mon, 16 Nov 2020 21:47:33 +0800
"xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:

> On 2020/11/14 0:44, Alex Williamson wrote:
> > On Tue, 10 Nov 2020 21:42:33 +0800
> > "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
> >   
> >> vfio_iommu_type1_pin_pages is very inefficient because
> >> it is processed page by page when calling vfio_pin_page_external.
> >> Added contiguous_vaddr_get_pfn to process continuous pages
> >> to reduce the number of loops, thereby improving performance.
> >>
> >> Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 241 ++++++++++++++++++++++++++++----
> >>  1 file changed, 214 insertions(+), 27 deletions(-)  
> > 
> > Sorry for my previous misunderstanding of the logic here.  Still, this
> > adds a lot of complexity, can you quantify the performance improvement
> > you're seeing?  Would it instead be more worthwhile to support an
> > interface that pins based on an iova and range?  Further comments below.
> >   
> Thank you for your reply.  I have a set of performance test data for reference.
> The host kernel version of my test environment is 4.19.36, and the test cases
> are for pin one page and 512 pages.  When pin 512pages, the input is a
> continuous iova address.  At the same time increase the measurement factor of
> whether the mem backend uses large pages.  The following is the average of
> multiple tests.
> 
> The patch was not applied
>                     1 page           512 pages
> no huge pages:     1638ns           223651ns
> THP:               1668ns           222330ns
> HugeTLB:           1526ns           208151ns
> 
> The patch was applied
>                     1 page           512 pages
> no huge pages       1735ns           167286ns
> THP:               1934ns           126900ns
> HugeTLB:           1713ns           102188ns
> 
> The performance will be reduced when the page is fixed with a single pin,
> while the page time of continuous pins can be reduced to half of the original.
> If you have other suggestions for testing methods, please let me know.

These are artificial test results, which in fact show a performance
decrease for the single page use cases.  What do we see in typical real
world scenarios?  Do those real world scenarios tend more towards large
arrays of contiguous IOVAs or single pages, or large arrays of
discontiguous IOVAs?  What's the resulting change in device
performance?  Are pages pinned at a high enough frequency that pinning
latency results in a measurable benefit at the device or to the
overhead on the host?

> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 67e827638995..935f80807527 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -628,6 +628,206 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> >>  	return unlocked;
> >>  }
> >>
> >> +static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> >> +				    int prot, long npage, unsigned long *phys_pfn)
> >> +{
> >> +	struct page **pages = NULL;
> >> +	unsigned int flags = 0;
> >> +	int i, ret;
> >> +
> >> +	pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
> >> +	if (!pages)
> >> +		return -ENOMEM;
> >> +
> >> +	if (prot & IOMMU_WRITE)
> >> +		flags |= FOLL_WRITE;
> >> +
> >> +	mmap_read_lock(mm);
> >> +	ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
> >> +				    pages, NULL, NULL);  
> > 
> > This is essentially the root of the performance improvement claim,
> > right?  ie. we're pinning a range of pages rather than individually.
> > Unfortunately it means we also add a dynamic memory allocation even
> > when npage=1.
> >   
> Yes, when npage = 1, I should keep the previous scheme of the scene
> and use local variables to save time in allocated memory
> >   
> >> +	mmap_read_unlock(mm);
> >> +
> >> +	for (i = 0; i < ret; i++)
> >> +		*(phys_pfn + i) = page_to_pfn(pages[i]);
> >> +
> >> +	kvfree(pages);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
> >> +				    struct vfio_dma *dma,
> >> +				    unsigned long *user_pfn,
> >> +				    int npage, unsigned long *phys_pfn,
> >> +				    bool do_accounting)
> >> +{
> >> +	int ret, i, j, lock_acct = 0;
> >> +	unsigned long remote_vaddr;
> >> +	dma_addr_t iova;
> >> +	struct mm_struct *mm;
> >> +	struct vfio_pfn *vpfn;
> >> +
> >> +	mm = get_task_mm(dma->task);
> >> +	if (!mm)
> >> +		return -ENODEV;
> >> +
> >> +	iova = user_pfn[0] << PAGE_SHIFT;
> >> +	remote_vaddr = dma->vaddr + iova - dma->iova;
> >> +	ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
> >> +					    npage, phys_pfn);
> >> +	mmput(mm);
> >> +	if (ret <= 0)
> >> +		return ret;
> >> +
> >> +	npage = ret;
> >> +	for (i = 0; i < npage; i++) {
> >> +		iova = user_pfn[i] << PAGE_SHIFT;
> >> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> >> +		if (ret)
> >> +			goto unwind;
> >> +
> >> +		if (!is_invalid_reserved_pfn(phys_pfn[i]))
> >> +			lock_acct++;
> >> +
> >> +		if (iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> >> +
> >> +			/*
> >> +			 * Bitmap populated with the smallest supported page
> >> +			 * size
> >> +			 */
> >> +			bitmap_set(dma->bitmap,
> >> +				   (iova - dma->iova) >> pgshift, 1);
> >> +		}  
> > 
> > It doesn't make sense that we wouldn't also optimize this to simply set
> > npage bits.  There's also no unwind for this.
> >   
> Thank you for your correction, I should set npage to simplify.
> There is no unwind process on the current master branch.  Is this a bug?

It's a bit tricky to know when it's ok to clear a bit in the dirty
bitmap, it's much, much worse to accidentally clear a bit to
incorrectly report a page as clean than it is to fail to unwind and
leave pages marked as dirty.  Given that this is an error path, we
might be willing to incorrectly leave pages marked dirty rather than
risk the alternative.

> >> +	}
> >> +
> >> +	if (do_accounting) {
> >> +		ret = vfio_lock_acct(dma, lock_acct, true);
> >> +		if (ret) {
> >> +			if (ret == -ENOMEM)
> >> +				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> >> +					__func__, dma->task->comm, task_pid_nr(dma->task),
> >> +					task_rlimit(dma->task, RLIMIT_MEMLOCK));
> >> +			goto unwind;
> >> +		}
> >> +	}  
> > 
> > This algorithm allows pinning many more pages in advance of testing
> > whether we've exceeded the task locked memory limit than the per page
> > approach.
> >   
> More than 1~VFIO_PIN_PAGES_MAX_ENTRIES. But after failure, all pinned
> pages will be released. Is there a big impact here?
> >   
> >> +
> >> +	return i;
> >> +unwind:
> >> +	for (j = 0; j < npage; j++) {
> >> +		put_pfn(phys_pfn[j], dma->prot);
> >> +		phys_pfn[j] = 0;
> >> +	}
> >> +
> >> +	for (j = 0; j < i; j++) {
> >> +		iova = user_pfn[j] << PAGE_SHIFT;
> >> +		vpfn = vfio_find_vpfn(dma, iova);
> >> +		if (vpfn)  
> > 
> > Seems like not finding a vpfn would be an error.
> >   
> I think the above process can ensure that vpfn is not NULL
> and delete this judgment.
> >> +			vfio_remove_from_pfn_list(dma, vpfn);  
> > 
> > It seems poor form not to use vfio_iova_put_vfio_pfn() here even if we
> > know we hold the only reference.
> >   
> The logic here can be reduced to a loop.
> When j < i, call vfio_iova_put_vfio_pfn.
> >   
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
> >> +					    struct vfio_dma *dma,
> >> +					    unsigned long *user_pfn,
> >> +					    int npage, unsigned long *phys_pfn,
> >> +					    bool do_accounting)
> >> +{
> >> +	int ret, i, j;
> >> +	unsigned long remote_vaddr;
> >> +	dma_addr_t iova;
> >> +
> >> +	ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
> >> +				phys_pfn, do_accounting);
> >> +	if (ret == npage)
> >> +		return ret;
> >> +
> >> +	if (ret < 0)
> >> +		ret = 0;  
> > 
> > 
> > I'm lost, why do we need the single page iteration below??
> >   
> Since there is no retry in contiguous_vaddr_get_pfn, oncean error occurs,
> the remaining pages will be processed in a single page.
> Is it better to increase retry in contiguous_vaddr_get_pfn?

Do we expect it to work if we call it again?  Do we expect the below
single page iteration to work if the npage pinning failed?  Why?


> >> +	for (i = ret; i < npage; i++) {
> >> +		iova = user_pfn[i] << PAGE_SHIFT;
> >> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> >> +
> >> +		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> >> +			    do_accounting);
> >> +		if (ret)
> >> +			goto pin_unwind;
> >> +
> >> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> >> +		if (ret) {
> >> +			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> >> +				vfio_lock_acct(dma, -1, true);
> >> +			goto pin_unwind;
> >> +		}
> >> +
> >> +		if (iommu->dirty_page_tracking) {
> >> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> >> +
> >> +			/*
> >> +			 * Bitmap populated with the smallest supported page
> >> +			 * size
> >> +			 */
> >> +			bitmap_set(dma->bitmap,
> >> +					   (iova - dma->iova) >> pgshift, 1);
> >> +		}
> >> +	}
> >> +
> >> +	return i;
> >> +
> >> +pin_unwind:
> >> +	phys_pfn[i] = 0;
> >> +	for (j = 0; j < i; j++) {
> >> +		dma_addr_t iova;
> >> +
> >> +		iova = user_pfn[j] << PAGE_SHIFT;
> >> +		vfio_unpin_page_external(dma, iova, do_accounting);
> >> +		phys_pfn[j] = 0;
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int vfio_iommu_type1_get_contiguous_pages_length(struct vfio_iommu *iommu,
> >> +				    unsigned long *user_pfn, int npage, int prot)
> >> +{
> >> +	struct vfio_dma *dma_base;
> >> +	int i;
> >> +	dma_addr_t iova;
> >> +	struct vfio_pfn *vpfn;
> >> +
> >> +	if (npage <= 1)
> >> +		return npage;
> >> +
> >> +	iova = user_pfn[0] << PAGE_SHIFT;
> >> +	dma_base = vfio_find_dma(iommu, iova, PAGE_SIZE);
> >> +	if (!dma_base)
> >> +		return -EINVAL;
> >> +
> >> +	if ((dma_base->prot & prot) != prot)
> >> +		return -EPERM;
> >> +
> >> +	for (i = 1; i < npage; i++) {
> >> +		iova = user_pfn[i] << PAGE_SHIFT;
> >> +
> >> +		if (iova >= dma_base->iova + dma_base->size ||
> >> +				iova + PAGE_SIZE <= dma_base->iova)
> >> +			break;
> >> +
> >> +		vpfn = vfio_iova_get_vfio_pfn(dma_base, iova);
> >> +		if (vpfn) {
> >> +			vfio_iova_put_vfio_pfn(dma_base, vpfn);  
> > 
> > Why not just use vfio_find_vpfn() rather than get+put?
> >   
> Thank you for your correction, I should use vfio_find_vpfn here.
> >> +			break;
> >> +		}
> >> +
> >> +		if (user_pfn[i] != user_pfn[0] + i)  
> > 
> > Shouldn't this be the first test?
> >   
> Thank you for your correction, the least costly judgment should be
> the first test.
> >> +			break;
> >> +	}
> >> +	return i;
> >> +}
> >> +
> >>  static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>  				      struct iommu_group *iommu_group,
> >>  				      unsigned long *user_pfn,
> >> @@ -637,9 +837,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>  	struct vfio_iommu *iommu = iommu_data;
> >>  	struct vfio_group *group;
> >>  	int i, j, ret;
> >> -	unsigned long remote_vaddr;
> >>  	struct vfio_dma *dma;
> >>  	bool do_accounting;
> >> +	int contiguous_npage;
> >>
> >>  	if (!iommu || !user_pfn || !phys_pfn)
> >>  		return -EINVAL;
> >> @@ -663,7 +863,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>  	 */
> >>  	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> >>
> >> -	for (i = 0; i < npage; i++) {
> >> +	for (i = 0; i < npage; i += contiguous_npage) {
> >>  		dma_addr_t iova;
> >>  		struct vfio_pfn *vpfn;
> >>
> >> @@ -682,31 +882,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>  		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> >>  		if (vpfn) {
> >>  			phys_pfn[i] = vpfn->pfn;
> >> -			continue;
> >> -		}
> >> -
> >> -		remote_vaddr = dma->vaddr + (iova - dma->iova);
> >> -		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> >> -					     do_accounting);
> >> -		if (ret)
> >> -			goto pin_unwind;
> >> -
> >> -		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> >> -		if (ret) {
> >> -			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> >> -				vfio_lock_acct(dma, -1, true);
> >> -			goto pin_unwind;
> >> -		}
> >> -
> >> -		if (iommu->dirty_page_tracking) {
> >> -			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> >> -
> >> -			/*
> >> -			 * Bitmap populated with the smallest supported page
> >> -			 * size
> >> -			 */
> >> -			bitmap_set(dma->bitmap,
> >> -				   (iova - dma->iova) >> pgshift, 1);
> >> +			contiguous_npage = 1;
> >> +		} else {
> >> +			ret = vfio_iommu_type1_get_contiguous_pages_length(iommu,
> >> +					&user_pfn[i], npage - i, prot);  
> > 
> > 
> > It doesn't make a lot of sense to me that this isn't more integrated
> > into the base function.  For example, we're passing &user_pfn[i] for
> > which we've already converted to an iova, found the vfio_dma associated
> > to that iova, and checked the protection.  This callout does all of
> > that again on the same.
> >   
> Thanks for your correction, I will delete the redundant check in
> vfio_iommu_type1_get_contiguous_pages_length and simplify the function to
> static int vfio_get_contiguous_pages_length (struct vfio_dma * dma,
> unsigned long * user_pfn, int npage)
> >> +			if (ret < 0)
> >> +				goto pin_unwind;
> >> +
> >> +			ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
> >> +					dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting);
> >> +			if (ret < 0)
> >> +				goto pin_unwind;
> >> +			contiguous_npage = ret;
> >>  		}
> >>  	}
> >>  	ret = i;  
> > 
> > 
> > This all seems _way_ more complicated than it needs to be, there are
> > too many different ways to flow through this code and claims of a
> > performance improvement are not substantiated with evidence.  The type1
> > code is already too fragile.  Please simplify and justify.  Thanks,
> > 
> > Alex
> > 
> > .
> >   
> The main issue is the balance between performance and complexity.
> The test data has been given above, please tell me your opinion.

I think the implementation here is overly complicated, there are too
many code paths and it's not clear what real world improvement to
expect.  The test data only shows us the theoretical best case
improvement of optimizing for a specific use case, without indicating
how prevalent or frequent that use case occurs in operation of an
actual device.  I suspect that it's possible to make the optimization
you're trying to achieve without this degree of complexity.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance
  2020-11-16 16:33     ` Alex Williamson
@ 2020-11-17  3:31       ` xuxiaoyang (C)
  0 siblings, 0 replies; 7+ messages in thread
From: xuxiaoyang (C) @ 2020-11-17  3:31 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel, kvm, kwankhede, wu.wubin, maoming.maoming,
	xieyingtai, lizhengui, wubinfeng



On 2020/11/17 0:33, Alex Williamson wrote:
> On Mon, 16 Nov 2020 21:47:33 +0800
> "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
> 
>> On 2020/11/14 0:44, Alex Williamson wrote:
>>> On Tue, 10 Nov 2020 21:42:33 +0800
>>> "xuxiaoyang (C)" <xuxiaoyang2@huawei.com> wrote:
>>>   
>>>> vfio_iommu_type1_pin_pages is very inefficient because
>>>> it is processed page by page when calling vfio_pin_page_external.
>>>> Added contiguous_vaddr_get_pfn to process continuous pages
>>>> to reduce the number of loops, thereby improving performance.
>>>>
>>>> Signed-off-by: Xiaoyang Xu <xuxiaoyang2@huawei.com>
>>>> ---
>>>>  drivers/vfio/vfio_iommu_type1.c | 241 ++++++++++++++++++++++++++++----
>>>>  1 file changed, 214 insertions(+), 27 deletions(-)  
>>>
>>> Sorry for my previous misunderstanding of the logic here.  Still, this
>>> adds a lot of complexity, can you quantify the performance improvement
>>> you're seeing?  Would it instead be more worthwhile to support an
>>> interface that pins based on an iova and range?  Further comments below.
>>>   
>> Thank you for your reply.  I have a set of performance test data for reference.
>> The host kernel version of my test environment is 4.19.36, and the test cases
>> are for pin one page and 512 pages.  When pin 512pages, the input is a
>> continuous iova address.  At the same time increase the measurement factor of
>> whether the mem backend uses large pages.  The following is the average of
>> multiple tests.
>>
>> The patch was not applied
>>                     1 page           512 pages
>> no huge pages:     1638ns           223651ns
>> THP:               1668ns           222330ns
>> HugeTLB:           1526ns           208151ns
>>
>> The patch was applied
>>                     1 page           512 pages
>> no huge pages       1735ns           167286ns
>> THP:               1934ns           126900ns
>> HugeTLB:           1713ns           102188ns
>>
>> The performance will be reduced when the page is fixed with a single pin,
>> while the page time of continuous pins can be reduced to half of the original.
>> If you have other suggestions for testing methods, please let me know.
> 
> These are artificial test results, which in fact show a performance
> decrease for the single page use cases.  What do we see in typical real
> world scenarios?  Do those real world scenarios tend more towards large
> arrays of contiguous IOVAs or single pages, or large arrays of
> discontiguous IOVAs?  What's the resulting change in device
> performance?  Are pages pinned at a high enough frequency that pinning
> latency results in a measurable benefit at the device or to the
> overhead on the host?
> 
Thank you for your reply.  It is difficult for me to construct those
use cases.  When the device is performing DMA, frequent pin/unpin calls
are a big overhead, so we pin a large part of the memory up front to
create a memory pool.  Therefore we care most about the performance of
pinning contiguous pages.  Handling a single page through
contiguous_vaddr_get_pfn reduces performance, so we can keep the
existing vfio_pin_page_external path for that case.  As long as
vfio_get_contiguous_pages_length returns quickly, the performance loss
for single page pins will be minimal.
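
For example, the dispatch in vfio_iommu_type1_pin_pages could look roughly
like this (sketch of a fragment inside the existing pin loop):

        contiguous_npage = vfio_get_contiguous_pages_length(dma, &user_pfn[i],
                                                            npage - i);
        if (contiguous_npage == 1) {
                /*
                 * Keep the existing single page path:
                 * vfio_pin_page_external() followed by the existing
                 * vfio_add_to_pfn_list() and dirty tracking steps.
                 */
                remote_vaddr = dma->vaddr + (iova - dma->iova);
                ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
                                             do_accounting);
        } else {
                ret = vfio_iommu_type1_pin_contiguous_pages(iommu, dma,
                                &user_pfn[i], contiguous_npage, &phys_pfn[i],
                                do_accounting);
        }
        if (ret < 0)
                goto pin_unwind;
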
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>> index 67e827638995..935f80807527 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -628,6 +628,206 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
>>>>  	return unlocked;
>>>>  }
>>>>
>>>> +static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>>>> +				    int prot, long npage, unsigned long *phys_pfn)
>>>> +{
>>>> +	struct page **pages = NULL;
>>>> +	unsigned int flags = 0;
>>>> +	int i, ret;
>>>> +
>>>> +	pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
>>>> +	if (!pages)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	if (prot & IOMMU_WRITE)
>>>> +		flags |= FOLL_WRITE;
>>>> +
>>>> +	mmap_read_lock(mm);
>>>> +	ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
>>>> +				    pages, NULL, NULL);  
>>>
>>> This is essentially the root of the performance improvement claim,
>>> right?  ie. we're pinning a range of pages rather than individually.
>>> Unfortunately it means we also add a dynamic memory allocation even
>>> when npage=1.
>>>   
>> Yes, when npage = 1, I should keep the previous scheme of the scene
>> and use local variables to save time in allocated memory
>>>   
>>>> +	mmap_read_unlock(mm);
>>>> +
>>>> +	for (i = 0; i < ret; i++)
>>>> +		*(phys_pfn + i) = page_to_pfn(pages[i]);
>>>> +
>>>> +	kvfree(pages);
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
>>>> +				    struct vfio_dma *dma,
>>>> +				    unsigned long *user_pfn,
>>>> +				    int npage, unsigned long *phys_pfn,
>>>> +				    bool do_accounting)
>>>> +{
>>>> +	int ret, i, j, lock_acct = 0;
>>>> +	unsigned long remote_vaddr;
>>>> +	dma_addr_t iova;
>>>> +	struct mm_struct *mm;
>>>> +	struct vfio_pfn *vpfn;
>>>> +
>>>> +	mm = get_task_mm(dma->task);
>>>> +	if (!mm)
>>>> +		return -ENODEV;
>>>> +
>>>> +	iova = user_pfn[0] << PAGE_SHIFT;
>>>> +	remote_vaddr = dma->vaddr + iova - dma->iova;
>>>> +	ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
>>>> +					    npage, phys_pfn);
>>>> +	mmput(mm);
>>>> +	if (ret <= 0)
>>>> +		return ret;
>>>> +
>>>> +	npage = ret;
>>>> +	for (i = 0; i < npage; i++) {
>>>> +		iova = user_pfn[i] << PAGE_SHIFT;
>>>> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>>>> +		if (ret)
>>>> +			goto unwind;
>>>> +
>>>> +		if (!is_invalid_reserved_pfn(phys_pfn[i]))
>>>> +			lock_acct++;
>>>> +
>>>> +		if (iommu->dirty_page_tracking) {
>>>> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
>>>> +
>>>> +			/*
>>>> +			 * Bitmap populated with the smallest supported page
>>>> +			 * size
>>>> +			 */
>>>> +			bitmap_set(dma->bitmap,
>>>> +				   (iova - dma->iova) >> pgshift, 1);
>>>> +		}  
>>>
>>> It doesn't make sense that we wouldn't also optimize this to simply set
>>> npage bits.  There's also no unwind for this.
>>>   
>> Thank you for your correction, I should set npage to simplify.
>> There is no unwind process on the current master branch.  Is this a bug?
> 
> It's a bit tricky to know when it's ok to clear a bit in the dirty
> bitmap, it's much, much worse to accidentally clear a bit to
> incorrectly report a page as clean than it is to fail to unwind and
> leave pages marked as dirty.  Given that this is an error path, we
> might be willing to incorrectly leave pages marked dirty rather than
> risk the alternative.
> 
>>>> +	}
>>>> +
>>>> +	if (do_accounting) {
>>>> +		ret = vfio_lock_acct(dma, lock_acct, true);
>>>> +		if (ret) {
>>>> +			if (ret == -ENOMEM)
>>>> +				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
>>>> +					__func__, dma->task->comm, task_pid_nr(dma->task),
>>>> +					task_rlimit(dma->task, RLIMIT_MEMLOCK));
>>>> +			goto unwind;
>>>> +		}
>>>> +	}  
>>>
>>> This algorithm allows pinning many more pages in advance of testing
>>> whether we've exceeded the task locked memory limit than the per page
>>> approach.
>>>   
>> More than 1~VFIO_PIN_PAGES_MAX_ENTRIES. But after failure, all pinned
>> pages will be released. Is there a big impact here?
>>>   
>>>> +
>>>> +	return i;
>>>> +unwind:
>>>> +	for (j = 0; j < npage; j++) {
>>>> +		put_pfn(phys_pfn[j], dma->prot);
>>>> +		phys_pfn[j] = 0;
>>>> +	}
>>>> +
>>>> +	for (j = 0; j < i; j++) {
>>>> +		iova = user_pfn[j] << PAGE_SHIFT;
>>>> +		vpfn = vfio_find_vpfn(dma, iova);
>>>> +		if (vpfn)  
>>>
>>> Seems like not finding a vpfn would be an error.
>>>   
>> I think the flow above guarantees that vpfn is not NULL, so I will
>> drop this check.
>>>> +			vfio_remove_from_pfn_list(dma, vpfn);  
>>>
>>> It seems poor form not to use vfio_iova_put_vfio_pfn() here even if we
>>> know we hold the only reference.
>>>   
>> The unwind logic here can be reduced to a single loop, calling
>> vfio_iova_put_vfio_pfn() when j < i.
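
Roughly like this (a sketch only; it relies on vfio_iova_put_vfio_pfn()
dropping the page pin when the last reference goes away):

unwind:
	for (j = 0; j < npage; j++) {
		if (j < i) {
			/* already on the pfn list: dropping the reference
			 * also releases the pinned page */
			iova = user_pfn[j] << PAGE_SHIFT;
			vpfn = vfio_find_vpfn(dma, iova);
			if (vpfn)
				vfio_iova_put_vfio_pfn(dma, vpfn);
		} else {
			/* pinned but never added to the pfn list */
			put_pfn(phys_pfn[j], dma->prot);
		}
		phys_pfn[j] = 0;
	}

	return ret;
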
>>>   
>>>> +	}
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
>>>> +					    struct vfio_dma *dma,
>>>> +					    unsigned long *user_pfn,
>>>> +					    int npage, unsigned long *phys_pfn,
>>>> +					    bool do_accounting)
>>>> +{
>>>> +	int ret, i, j;
>>>> +	unsigned long remote_vaddr;
>>>> +	dma_addr_t iova;
>>>> +
>>>> +	ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
>>>> +				phys_pfn, do_accounting);
>>>> +	if (ret == npage)
>>>> +		return ret;
>>>> +
>>>> +	if (ret < 0)
>>>> +		ret = 0;  
>>>
>>>
>>> I'm lost, why do we need the single page iteration below??
>>>   
>> Since there is no retry in contiguous_vaddr_get_pfn, once an error occurs
>> the remaining pages are processed page by page.
>> Would it be better to add a retry to contiguous_vaddr_get_pfn?
> 
> Do we expect it to work if we call it again?  Do we expect the below
> single page iteration to work if the npage pinning failed?  Why?
> 
vaddr_get_pfn() contains error handling for VM_PFNMAP mappings, and in
that case we want the per-page path to keep working.  As mentioned
above, the npage == 1 case will be handled directly by
vfio_pin_page_external().
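
For context, the existing per-page helper roughly does the following
(heavily simplified from the current vaddr_get_pfn(), not verbatim),
which is why the page-by-page retry can still make progress where the
batched pin_user_pages_remote() call fails:

	ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
				    page, NULL, NULL);
	if (ret == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	/* gup failed: the address may sit in a VM_PFNMAP vma (e.g. device
	 * MMIO mapped into the guest), which cannot be pinned but whose
	 * pfn can still be resolved from the vma. */
	vma = find_vma_intersection(mm, vaddr, vaddr + 1);
	if (vma && (vma->vm_flags & VM_PFNMAP))
		ret = follow_fault_pfn(vma, mm, vaddr, pfn,
				       prot & IOMMU_WRITE);
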
> 
>>>> +	for (i = ret; i < npage; i++) {
>>>> +		iova = user_pfn[i] << PAGE_SHIFT;
>>>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>>>> +
>>>> +		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
>>>> +			    do_accounting);
>>>> +		if (ret)
>>>> +			goto pin_unwind;
>>>> +
>>>> +		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>>>> +		if (ret) {
>>>> +			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
>>>> +				vfio_lock_acct(dma, -1, true);
>>>> +			goto pin_unwind;
>>>> +		}
>>>> +
>>>> +		if (iommu->dirty_page_tracking) {
>>>> +			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
>>>> +
>>>> +			/*
>>>> +			 * Bitmap populated with the smallest supported page
>>>> +			 * size
>>>> +			 */
>>>> +			bitmap_set(dma->bitmap,
>>>> +					   (iova - dma->iova) >> pgshift, 1);
>>>> +		}
>>>> +	}
>>>> +
>>>> +	return i;
>>>> +
>>>> +pin_unwind:
>>>> +	phys_pfn[i] = 0;
>>>> +	for (j = 0; j < i; j++) {
>>>> +		dma_addr_t iova;
>>>> +
>>>> +		iova = user_pfn[j] << PAGE_SHIFT;
>>>> +		vfio_unpin_page_external(dma, iova, do_accounting);
>>>> +		phys_pfn[j] = 0;
>>>> +	}
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static int vfio_iommu_type1_get_contiguous_pages_length(struct vfio_iommu *iommu,
>>>> +				    unsigned long *user_pfn, int npage, int prot)
>>>> +{
>>>> +	struct vfio_dma *dma_base;
>>>> +	int i;
>>>> +	dma_addr_t iova;
>>>> +	struct vfio_pfn *vpfn;
>>>> +
>>>> +	if (npage <= 1)
>>>> +		return npage;
>>>> +
>>>> +	iova = user_pfn[0] << PAGE_SHIFT;
>>>> +	dma_base = vfio_find_dma(iommu, iova, PAGE_SIZE);
>>>> +	if (!dma_base)
>>>> +		return -EINVAL;
>>>> +
>>>> +	if ((dma_base->prot & prot) != prot)
>>>> +		return -EPERM;
>>>> +
>>>> +	for (i = 1; i < npage; i++) {
>>>> +		iova = user_pfn[i] << PAGE_SHIFT;
>>>> +
>>>> +		if (iova >= dma_base->iova + dma_base->size ||
>>>> +				iova + PAGE_SIZE <= dma_base->iova)
>>>> +			break;
>>>> +
>>>> +		vpfn = vfio_iova_get_vfio_pfn(dma_base, iova);
>>>> +		if (vpfn) {
>>>> +			vfio_iova_put_vfio_pfn(dma_base, vpfn);  
>>>
>>> Why not just use vfio_find_vpfn() rather than get+put?
>>>   
>> Thank you for your correction, I should use vfio_find_vpfn here.
>>>> +			break;
>>>> +		}
>>>> +
>>>> +		if (user_pfn[i] != user_pfn[0] + i)  
>>>
>>> Shouldn't this be the first test?
>>>   
>> Thank you for the correction, the cheapest check should be
>> the first test.
>>>> +			break;
>>>> +	}
>>>> +	return i;
>>>> +}
>>>> +
>>>>  static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>  				      struct iommu_group *iommu_group,
>>>>  				      unsigned long *user_pfn,
>>>> @@ -637,9 +837,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>  	struct vfio_iommu *iommu = iommu_data;
>>>>  	struct vfio_group *group;
>>>>  	int i, j, ret;
>>>> -	unsigned long remote_vaddr;
>>>>  	struct vfio_dma *dma;
>>>>  	bool do_accounting;
>>>> +	int contiguous_npage;
>>>>
>>>>  	if (!iommu || !user_pfn || !phys_pfn)
>>>>  		return -EINVAL;
>>>> @@ -663,7 +863,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>  	 */
>>>>  	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
>>>>
>>>> -	for (i = 0; i < npage; i++) {
>>>> +	for (i = 0; i < npage; i += contiguous_npage) {
>>>>  		dma_addr_t iova;
>>>>  		struct vfio_pfn *vpfn;
>>>>
>>>> @@ -682,31 +882,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>  		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>>>>  		if (vpfn) {
>>>>  			phys_pfn[i] = vpfn->pfn;
>>>> -			continue;
>>>> -		}
>>>> -
>>>> -		remote_vaddr = dma->vaddr + (iova - dma->iova);
>>>> -		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
>>>> -					     do_accounting);
>>>> -		if (ret)
>>>> -			goto pin_unwind;
>>>> -
>>>> -		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
>>>> -		if (ret) {
>>>> -			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
>>>> -				vfio_lock_acct(dma, -1, true);
>>>> -			goto pin_unwind;
>>>> -		}
>>>> -
>>>> -		if (iommu->dirty_page_tracking) {
>>>> -			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
>>>> -
>>>> -			/*
>>>> -			 * Bitmap populated with the smallest supported page
>>>> -			 * size
>>>> -			 */
>>>> -			bitmap_set(dma->bitmap,
>>>> -				   (iova - dma->iova) >> pgshift, 1);
>>>> +			contiguous_npage = 1;
>>>> +		} else {
>>>> +			ret = vfio_iommu_type1_get_contiguous_pages_length(iommu,
>>>> +					&user_pfn[i], npage - i, prot);  
>>>
>>>
>>> It doesn't make a lot of sense to me that this isn't more integrated
>>> into the base function.  For example, we're passing &user_pfn[i] for
>>> which we've already converted to an iova, found the vfio_dma associated
>>> to that iova, and checked the protection.  This callout does all of
>>> that again on the same.
>>>   
>> Thanks for the correction, I will drop the redundant checks in
>> vfio_iommu_type1_get_contiguous_pages_length and simplify the function to
>> static int vfio_get_contiguous_pages_length(struct vfio_dma *dma,
>>                                             unsigned long *user_pfn, int npage)
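
As a sketch (assuming the caller has already looked up the vfio_dma and
validated the protection bits, and folding in the earlier comments about
vfio_find_vpfn() and doing the cheapest check first; not the final code):

static int vfio_get_contiguous_pages_length(struct vfio_dma *dma,
					    unsigned long *user_pfn, int npage)
{
	int i;

	for (i = 1; i < npage; i++) {
		dma_addr_t iova;

		/* cheapest test first: user pfns must be consecutive */
		if (user_pfn[i] != user_pfn[0] + i)
			break;

		iova = user_pfn[i] << PAGE_SHIFT;

		/* stay inside the vfio_dma the first page belongs to */
		if (iova >= dma->iova + dma->size)
			break;

		/* stop at any page that is already pinned and tracked */
		if (vfio_find_vpfn(dma, iova))
			break;
	}

	return i;
}
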
>>>> +			if (ret < 0)
>>>> +				goto pin_unwind;
>>>> +
>>>> +			ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
>>>> +					dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting);
>>>> +			if (ret < 0)
>>>> +				goto pin_unwind;
>>>> +			contiguous_npage = ret;
>>>>  		}
>>>>  	}
>>>>  	ret = i;  
>>>
>>>
>>> This all seems _way_ more complicated than it needs to be, there are
>>> too many different ways to flow through this code and claims of a
>>> performance improvement are not substantiated with evidence.  The type1
>>> code is already too fragile.  Please simplify and justify.  Thanks,
>>>
>>> Alex
>>>
>>> .
>>>   
>> The main issue is the balance between performance and complexity.
>> The test data was given above; please let me know what you think.
> 
> I think the implementation here is overly complicated, there are too
> many code paths and it's not clear what real world improvement to
> expect.  The test data only shows us the theoretical best case
> improvement of optimizing for a specific use case, without indicating
> how prevalent or frequent that use case occurs in operation of an
> actual device.  I suspect that it's possible to make the optimization
> you're trying to achieve without this degree of complexity.  Thanks,
> 
> Alex
> 
> .
> 
Assessing further real-world improvements may require feedback from
others.  I will post another patch that fixes the earlier bug and
improves the performance of page pinning; perhaps others can build
further improvements on top of it.

Regards,
Xu

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2020-11-17  3:32 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-10 13:42 [PATCH] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance xuxiaoyang (C)
2020-11-11 15:56 ` Alex Williamson
2020-11-12 11:49   ` xuxiaoyang (C)
2020-11-13 16:44 ` Alex Williamson
2020-11-16 13:47   ` xuxiaoyang (C)
2020-11-16 16:33     ` Alex Williamson
2020-11-17  3:31       ` xuxiaoyang (C)
