linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] vfio iommu type1: improve memory pinning process for raw PFN mapping
@ 2018-02-24  5:44 jason
  2018-02-26 19:19 ` Alex Williamson
  0 siblings, 1 reply; 5+ messages in thread
From: jason @ 2018-02-24  5:44 UTC (permalink / raw)
  To: jason, alex.williamson, pbonzini, kvm, linux-kernel, linux-mm; +Cc: gnehzuil

When using vfio to pass through a PCIe device (e.g. a GPU card) that
has a huge BAR (e.g. 16GB), a lot of cycles are wasted on memory
pinning because PFNs of PCI BAR are not backed by struct page, and
the corresponding VMA has flags VM_IO|VM_PFNMAP.

With this change, memory pinning process will firstly try to figure
out whether the corresponding region is a raw PFN mapping, and if so
it can skip unnecessary user memory pinning process.

Even though it comes with a little overhead, finding vma and testing
flags, on each call, it can significantly improve VM's boot up time
when passing through devices via VFIO.
---
 drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29ae4819..1a471ece3f9c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -374,6 +374,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
        return ret;
 }

+static int try_io_pfnmap(struct mm_struct *mm, unsigned long vaddr, long npage,
+                        unsigned long *pfn)
+{
+       struct vm_area_struct *vma;
+       int pinned = 0;
+
+       down_read(&mm->mmap_sem);
+       vma = find_vma_intersection(mm, vaddr, vaddr + 1);
+       if (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+               *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+               if (is_invalid_reserved_pfn(*pfn))
+                       pinned = min(npage, (long)vma_pages(vma));
+       }
+       up_read(&mm->mmap_sem);
+
+       return pinned;
+}
+
 /*
  * Attempt to pin pages.  We really don't want to track all the pfns and
  * the iommu can only map chunks of consecutive pfns anyway, so get the
@@ -392,6 +410,10 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
        if (!current->mm)
                return -ENODEV;

+       ret = try_io_pfnmap(current->mm, vaddr, npage, pfn_base);
+       if (ret)
+               return ret;
+
        ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
        if (ret)
                return ret;
--
2.13.6

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC] vfio iommu type1: improve memory pinning process for raw PFN mapping
  2018-02-24  5:44 [RFC] vfio iommu type1: improve memory pinning process for raw PFN mapping jason
@ 2018-02-26 19:19 ` Alex Williamson
  2018-02-27  7:44   ` Jason Cai (Xiang Feng)
  2018-03-03 12:10   ` [RFC v2] " Jason Cai (Xiang Feng)
  0 siblings, 2 replies; 5+ messages in thread
From: Alex Williamson @ 2018-02-26 19:19 UTC (permalink / raw)
  To: jason; +Cc: pbonzini, kvm, linux-kernel, linux-mm, gnehzuil

On Sat, 24 Feb 2018 13:44:07 +0800
jason <jason.cai@linux.alibaba.com> wrote:

> When using vfio to pass through a PCIe device (e.g. a GPU card) that
> has a huge BAR (e.g. 16GB), a lot of cycles are wasted on memory
> pinning because PFNs of PCI BAR are not backed by struct page, and
> the corresponding VMA has flags VM_IO|VM_PFNMAP.
> 
> With this change, memory pinning process will firstly try to figure
> out whether the corresponding region is a raw PFN mapping, and if so
> it can skip unnecessary user memory pinning process.
> 
> Even though it comes with a little overhead, finding vma and testing
> flags, on each call, it can significantly improve VM's boot up time
> when passing through devices via VFIO.

Needs a Sign-off, see Documentation/process/submitting-patches.rst

> ---
>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29ae4819..1a471ece3f9c 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -374,6 +374,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>         return ret;
>  }
> 
> +static int try_io_pfnmap(struct mm_struct *mm, unsigned long vaddr, long npage,
> +                        unsigned long *pfn)
> +{
> +       struct vm_area_struct *vma;
> +       int pinned = 0;
> +
> +       down_read(&mm->mmap_sem);
> +       vma = find_vma_intersection(mm, vaddr, vaddr + 1);
> +       if (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> +               *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> +               if (is_invalid_reserved_pfn(*pfn))
> +                       pinned = min(npage, (long)vma_pages(vma));
> +       }
> +       up_read(&mm->mmap_sem);
> +
> +       return pinned;
> +}
> +
>  /*
>   * Attempt to pin pages.  We really don't want to track all the pfns and
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
> @@ -392,6 +410,10 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>         if (!current->mm)
>                 return -ENODEV;
> 
> +       ret = try_io_pfnmap(current->mm, vaddr, npage, pfn_base);
> +       if (ret)
> +               return ret;
> +
>         ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
>         if (ret)
>                 return ret;

I like the idea, but couldn't we integrate it better?  For instance,
does it really make sense to test for this first, the majority of users
are going to have more regular mappings than PFNMAP mappings.  If we
were to do the above optimization, doesn't the rsvd bits in the
remainder of the code become cruft?  What if we optimized from the point
where we test the return of vaddr_get_pfn() for a reserved/invalid page?
Perhaps something like the below (untested, uncompiled) patch.  Also
curious why the above tests VM_IO|VM_PFNMAP while vaddr_get_pfn() only
tests VM_PFNMAP, we should at least be consistent, but also correct the
existing function if it's missing a case. Thanks,

Alex

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e113b2c43be2..425922393316 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -399,7 +399,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 {
 	unsigned long pfn = 0;
 	long ret, pinned = 0, lock_acct = 0;
-	bool rsvd;
 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
 
 	/* This code path is only user initiated */
@@ -410,14 +409,23 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	if (ret)
 		return ret;
 
+	if (is_invalid_reserved_pfn(*pfn_base)) {
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		vma = find_vma_intersection(mm, vaddr, vaddr + 1);
+		pinned = min(npage, (long)vma_pages(vma));
+		up_read(&mm->mmap_sem);
+		return pinned;
+	}
+
 	pinned++;
-	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
 	/*
 	 * Reserved pages aren't counted against the user, externally pinned
 	 * pages are already counted against the user.
 	 */
-	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+	if (!vfio_find_vpfn(dma, iova)) {
 		if (!lock_cap && current->mm->locked_vm + 1 > limit) {
 			put_pfn(*pfn_base, dma->prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
@@ -437,13 +445,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 		if (ret)
 			break;
 
-		if (pfn != *pfn_base + pinned ||
-		    rsvd != is_invalid_reserved_pfn(pfn)) {
+		if (pfn != *pfn_base + pinned) {
 			put_pfn(pfn, dma->prot);
 			break;
 		}
 
-		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+		if (!vfio_find_vpfn(dma, iova)) {
 			if (!lock_cap &&
 			    current->mm->locked_vm + lock_acct + 1 > limit) {
 				put_pfn(pfn, dma->prot);
@@ -461,10 +468,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 
 unpin_out:
 	if (ret) {
-		if (!rsvd) {
-			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
-				put_pfn(pfn, dma->prot);
-		}
+		for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
+			put_pfn(pfn, dma->prot);
 
 		return ret;
 	}

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC] vfio iommu type1: improve memory pinning process for raw PFN mapping
  2018-02-26 19:19 ` Alex Williamson
@ 2018-02-27  7:44   ` Jason Cai (Xiang Feng)
  2018-03-03 12:10   ` [RFC v2] " Jason Cai (Xiang Feng)
  1 sibling, 0 replies; 5+ messages in thread
From: Jason Cai (Xiang Feng) @ 2018-02-27  7:44 UTC (permalink / raw)
  To: Alex Williamson; +Cc: pbonzini, kvm, linux-kernel, linux-mm, gnehzuil

On 27 Feb 2018, at 3:19 AM, Alex Williamson <alex.williamson@redhat.com> wrote:

> On Sat, 24 Feb 2018 13:44:07 +0800
> jason <jason.cai@linux.alibaba.com> wrote:
> 
>> When using vfio to pass through a PCIe device (e.g. a GPU card) that
>> has a huge BAR (e.g. 16GB), a lot of cycles are wasted on memory
>> pinning because PFNs of PCI BAR are not backed by struct page, and
>> the corresponding VMA has flags VM_IO|VM_PFNMAP.
>> 
>> With this change, memory pinning process will firstly try to figure
>> out whether the corresponding region is a raw PFN mapping, and if so
>> it can skip unnecessary user memory pinning process.
>> 
>> Even though it comes with a little overhead, finding vma and testing
>> flags, on each call, it can significantly improve VM's boot up time
>> when passing through devices via VFIO.
> 
> Needs a Sign-off, see Documentation/process/submitting-patches.rst
> 
>> ---
>> drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>> 1 file changed, 22 insertions(+)
>> 
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index e30e29ae4819..1a471ece3f9c 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -374,6 +374,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>>        return ret;
>> }
>> 
>> +static int try_io_pfnmap(struct mm_struct *mm, unsigned long vaddr, long npage,
>> +                        unsigned long *pfn)
>> +{
>> +       struct vm_area_struct *vma;
>> +       int pinned = 0;
>> +
>> +       down_read(&mm->mmap_sem);
>> +       vma = find_vma_intersection(mm, vaddr, vaddr + 1);
>> +       if (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)) {
>> +               *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> +               if (is_invalid_reserved_pfn(*pfn))
>> +                       pinned = min(npage, (long)vma_pages(vma));
>> +       }
>> +       up_read(&mm->mmap_sem);
>> +
>> +       return pinned;
>> +}
>> +
>> /*
>>  * Attempt to pin pages.  We really don't want to track all the pfns and
>>  * the iommu can only map chunks of consecutive pfns anyway, so get the
>> @@ -392,6 +410,10 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>        if (!current->mm)
>>                return -ENODEV;
>> 
>> +       ret = try_io_pfnmap(current->mm, vaddr, npage, pfn_base);
>> +       if (ret)
>> +               return ret;
>> +
>>        ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
>>        if (ret)
>>                return ret;
> 
> I like the idea, but couldn't we integrate it better?  For instance,
> does it really make sense to test for this first, the majority of users
> are going to have more regular mappings than PFNMAP mappings.  If we
> were to do the above optimization, doesn't the rsvd bits in the
> remainder of the code become cruft?  What if we optimized from the point
> where we test the return of vaddr_get_pfn() for a reserved/invalid page?
> Perhaps something like the below (untested, uncompiled) patch.  Also
> curious why the above tests VM_IO|VM_PFNMAP while vaddr_get_pfn() only
> tests VM_PFNMAP, we should at least be consistent, but also correct the
> existing function if it's missing a case. Thanks,
> 
> Alex
> 

Good points! The patch has been updated according to these points.
And I’ve also tested with it. Please see it at the end. Thanks.

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e113b2c43be2..425922393316 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -399,7 +399,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> {
> 	unsigned long pfn = 0;
> 	long ret, pinned = 0, lock_acct = 0;
> -	bool rsvd;
> 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> 
> 	/* This code path is only user initiated */
> @@ -410,14 +409,23 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> 	if (ret)
> 		return ret;
> 
> +	if (is_invalid_reserved_pfn(*pfn_base)) {
> +		struct vm_area_struct *vma;
> +
> +		down_read(&mm->mmap_sem);
> +		vma = find_vma_intersection(mm, vaddr, vaddr + 1);
> +		pinned = min(npage, (long)vma_pages(vma));
> +		up_read(&mm->mmap_sem);
> +		return pinned;
> +	}
> +
> 	pinned++;
> -	rsvd = is_invalid_reserved_pfn(*pfn_base);
> 
> 	/*
> 	 * Reserved pages aren't counted against the user, externally pinned
> 	 * pages are already counted against the user.
> 	 */
> -	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> +	if (!vfio_find_vpfn(dma, iova)) {
> 		if (!lock_cap && current->mm->locked_vm + 1 > limit) {
> 			put_pfn(*pfn_base, dma->prot);
> 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> @@ -437,13 +445,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> 		if (ret)
> 			break;
> 
> -		if (pfn != *pfn_base + pinned ||
> -		    rsvd != is_invalid_reserved_pfn(pfn)) {
> +		if (pfn != *pfn_base + pinned) {
> 			put_pfn(pfn, dma->prot);
> 			break;
> 		}
> 
> -		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> +		if (!vfio_find_vpfn(dma, iova)) {
> 			if (!lock_cap &&
> 			    current->mm->locked_vm + lock_acct + 1 > limit) {
> 				put_pfn(pfn, dma->prot);
> @@ -461,10 +468,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> 
> unpin_out:
> 	if (ret) {
> -		if (!rsvd) {
> -			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> -				put_pfn(pfn, dma->prot);
> -		}
> +		for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> +			put_pfn(pfn, dma->prot);
> 
> 		return ret;
> 	}


When using vfio to pass through a PCIe device (e.g. a GPU card) that
has a huge BAR (e.g. 16GB), a lot of cycles are wasted on memory
pinning because PFNs of PCI BAR are not backed by struct page, and
the corresponding VMA has flag VM_PFNMAP.

With this change, when pinning a region which is a raw PFN mapping,
it can skip unnecessary user memory pinning process. Thus, it can
significantly improve VM's boot up time when passing through devices
via VFIO.

Signed-off-by: Jason Cai (Xiang Feng) <jason.cai@linux.alibaba.com>
---
 drivers/vfio/vfio_iommu_type1.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29ae4819..82ccfa350315 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -385,7 +385,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 {
        unsigned long pfn = 0;
        long ret, pinned = 0, lock_acct = 0;
-       bool rsvd;
        dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

        /* This code path is only user initiated */
@@ -396,14 +395,22 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
        if (ret)
                return ret;

+       if (is_invalid_reserved_pfn(*pfn_base)) {
+               struct vm_area_struct *vma;
+               down_read(&current->mm->mmap_sem);
+               vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+               pinned = min(npage, (long)vma_pages(vma));
+               up_read(&current->mm->mmap_sem);
+               return pinned;
+       }
+
        pinned++;
-       rsvd = is_invalid_reserved_pfn(*pfn_base);

        /*
         * Reserved pages aren't counted against the user, externally pinned
         * pages are already counted against the user.
         */
-       if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+       if (!vfio_find_vpfn(dma, iova)) {
                if (!lock_cap && current->mm->locked_vm + 1 > limit) {
                        put_pfn(*pfn_base, dma->prot);
                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
@@ -423,13 +430,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                if (ret)
                        break;

-               if (pfn != *pfn_base + pinned ||
-                   rsvd != is_invalid_reserved_pfn(pfn)) {
+               if (pfn != *pfn_base + pinned) {
                        put_pfn(pfn, dma->prot);
                        break;
                }

-               if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+               if (!vfio_find_vpfn(dma, iova)) {
                        if (!lock_cap &&
                            current->mm->locked_vm + lock_acct + 1 > limit) {
                                put_pfn(pfn, dma->prot);
@@ -447,10 +453,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,

 unpin_out:
        if (ret) {
-               if (!rsvd) {
-                       for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
-                               put_pfn(pfn, dma->prot);
-               }
+               for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
+                       put_pfn(pfn, dma->prot);

                return ret;
        }
--
2.13.6

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [RFC v2] vfio iommu type1: improve memory pinning process for raw PFN mapping
  2018-02-26 19:19 ` Alex Williamson
  2018-02-27  7:44   ` Jason Cai (Xiang Feng)
@ 2018-03-03 12:10   ` Jason Cai (Xiang Feng)
  2018-03-12 22:06     ` Alex Williamson
  1 sibling, 1 reply; 5+ messages in thread
From: Jason Cai (Xiang Feng) @ 2018-03-03 12:10 UTC (permalink / raw)
  To: Alex Williamson; +Cc: pbonzini, kvm, linux-kernel, linux-mm, gnehzuil

When using vfio to pass through a PCIe device (e.g. a GPU card) that
has a huge BAR (e.g. 16GB), a lot of cycles are wasted on memory
pinning because PFNs of PCI BAR are not backed by struct page, and
the corresponding VMA has flag VM_PFNMAP.

With this change, when pinning a region which is a raw PFN mapping,
it can skip unnecessary user memory pinning process. Thus, it can
significantly improve VM's boot up time when passing through devices
via VFIO.

Signed-off-by: Jason Cai (Xiang Feng) <jason.cai@linux.alibaba.com>
---
 drivers/vfio/vfio_iommu_type1.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29ae4819..82ccfa350315 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -385,7 +385,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 {
        unsigned long pfn = 0;
        long ret, pinned = 0, lock_acct = 0;
-       bool rsvd;
        dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

        /* This code path is only user initiated */
@@ -396,14 +395,22 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
        if (ret)
                return ret;

+       if (is_invalid_reserved_pfn(*pfn_base)) {
+               struct vm_area_struct *vma;
+               down_read(&current->mm->mmap_sem);
+               vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+               pinned = min(npage, (long)vma_pages(vma));
+               up_read(&current->mm->mmap_sem);
+               return pinned;
+       }
+
        pinned++;
-       rsvd = is_invalid_reserved_pfn(*pfn_base);

        /*
         * Reserved pages aren't counted against the user, externally pinned
         * pages are already counted against the user.
         */
-       if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+       if (!vfio_find_vpfn(dma, iova)) {
                if (!lock_cap && current->mm->locked_vm + 1 > limit) {
                        put_pfn(*pfn_base, dma->prot);
                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
@@ -423,13 +430,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                if (ret)
                        break;

-               if (pfn != *pfn_base + pinned ||
-                   rsvd != is_invalid_reserved_pfn(pfn)) {
+               if (pfn != *pfn_base + pinned) {
                        put_pfn(pfn, dma->prot);
                        break;
                }

-               if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+               if (!vfio_find_vpfn(dma, iova)) {
                        if (!lock_cap &&
                            current->mm->locked_vm + lock_acct + 1 > limit) {
                                put_pfn(pfn, dma->prot);
@@ -447,10 +453,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,

 unpin_out:
        if (ret) {
-               if (!rsvd) {
-                       for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
-                               put_pfn(pfn, dma->prot);
-               }
+               for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
+                       put_pfn(pfn, dma->prot);

                return ret;
        }
--
2.13.6

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC v2] vfio iommu type1: improve memory pinning process for raw PFN mapping
  2018-03-03 12:10   ` [RFC v2] " Jason Cai (Xiang Feng)
@ 2018-03-12 22:06     ` Alex Williamson
  0 siblings, 0 replies; 5+ messages in thread
From: Alex Williamson @ 2018-03-12 22:06 UTC (permalink / raw)
  To: Jason Cai (Xiang Feng); +Cc: pbonzini, kvm, linux-kernel, linux-mm, gnehzuil

On Sat, 3 Mar 2018 20:10:33 +0800
"Jason Cai (Xiang Feng)" <jason.cai@linux.alibaba.com> wrote:

> When using vfio to pass through a PCIe device (e.g. a GPU card) that
> has a huge BAR (e.g. 16GB), a lot of cycles are wasted on memory
> pinning because PFNs of PCI BAR are not backed by struct page, and
> the corresponding VMA has flag VM_PFNMAP.
> 
> With this change, when pinning a region which is a raw PFN mapping,
> it can skip unnecessary user memory pinning process. Thus, it can
> significantly improve VM's boot up time when passing through devices
> via VFIO.
> 
> Signed-off-by: Jason Cai (Xiang Feng) <jason.cai@linux.alibaba.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 24 ++++++++++++++----------
>  1 file changed, 14 insertions(+), 10 deletions(-)


It looks reasonable to me, is this still really an RFC?  It would also
be interesting to include performance data in the commit log, how much
faster is it to map that 16GB BAR with this change?  Thanks,

Alex

 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29ae4819..82ccfa350315 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -385,7 +385,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  {
>         unsigned long pfn = 0;
>         long ret, pinned = 0, lock_acct = 0;
> -       bool rsvd;
>         dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> 
>         /* This code path is only user initiated */
> @@ -396,14 +395,22 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>         if (ret)
>                 return ret;
> 
> +       if (is_invalid_reserved_pfn(*pfn_base)) {
> +               struct vm_area_struct *vma;
> +               down_read(&current->mm->mmap_sem);
> +               vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +               pinned = min(npage, (long)vma_pages(vma));
> +               up_read(&current->mm->mmap_sem);
> +               return pinned;
> +       }
> +
>         pinned++;
> -       rsvd = is_invalid_reserved_pfn(*pfn_base);
> 
>         /*
>          * Reserved pages aren't counted against the user, externally pinned
>          * pages are already counted against the user.
>          */
> -       if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> +       if (!vfio_find_vpfn(dma, iova)) {
>                 if (!lock_cap && current->mm->locked_vm + 1 > limit) {
>                         put_pfn(*pfn_base, dma->prot);
>                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> @@ -423,13 +430,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>                 if (ret)
>                         break;
> 
> -               if (pfn != *pfn_base + pinned ||
> -                   rsvd != is_invalid_reserved_pfn(pfn)) {
> +               if (pfn != *pfn_base + pinned) {
>                         put_pfn(pfn, dma->prot);
>                         break;
>                 }
> 
> -               if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> +               if (!vfio_find_vpfn(dma, iova)) {
>                         if (!lock_cap &&
>                             current->mm->locked_vm + lock_acct + 1 > limit) {
>                                 put_pfn(pfn, dma->prot);
> @@ -447,10 +453,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> 
>  unpin_out:
>         if (ret) {
> -               if (!rsvd) {
> -                       for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> -                               put_pfn(pfn, dma->prot);
> -               }
> +               for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> +                       put_pfn(pfn, dma->prot);
> 
>                 return ret;
>         }
> --
> 2.13.6

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-03-12 22:06 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-24  5:44 [RFC] vfio iommu type1: improve memory pinning process for raw PFN mapping jason
2018-02-26 19:19 ` Alex Williamson
2018-02-27  7:44   ` Jason Cai (Xiang Feng)
2018-03-03 12:10   ` [RFC v2] " Jason Cai (Xiang Feng)
2018-03-12 22:06     ` Alex Williamson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).