linux-kernel.vger.kernel.org archive mirror
* [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
@ 2020-07-20  8:29 Jay Zhou
  2020-07-20 14:24 ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Jay Zhou @ 2020-07-20  8:29 UTC (permalink / raw)
  To: linux-kernel, kvm
  Cc: alex.williamson, cohuck, maoming.maoming, jianjay.zhou, weidong.huang

From: Ming Mao <maoming.maoming@huawei.com>

Hi all,
I'm working on starting many large Virtual Machines (memory > 128GB)
with VFIO devices, and I've run into the problem that the waiting time
to start all of the Virtual Machines is too long. Analyzing the startup
log, I found that the time spent pinning/unpinning pages could be
reduced.

In the original process, to make sure the pages are contiguous,
we have to check all pages one by one. I think we can use hugetlbfs
pages to skip this step, so I created a patch to do that.
According to my test, the result of this patch is quite good.

Virtual Machine: 50G memory, 32 CPU, 1 VFIO-device, 1G hugetlbfs page
        original   after optimization
pin time   700ms          0.1ms

I suppose that:
1) the hugetlbfs page should not be split
2) PG_reserved is not relevant for hugetlbfs pages
3) we can delete the for loops and use bulk operations
   (such as atomic_add, page_ref_add) instead, as sketched below
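
In rough pseudo-code (a simplified sketch, not the actual diff), the
change amounts to:

	/* before: pin and check physical contiguity page by page */
	for (i = 0; i < npage; i++) {
		vaddr_get_pfn(mm, vaddr + i * PAGE_SIZE, prot, &pfn);
		if (pfn != base_pfn + i)
			break;
	}

	/* after: for a hugetlbfs page, update the compound head once */
	head = compound_head(pfn_to_page(base_pfn));
	atomic_add(contiguous_npage, compound_pincount_ptr(head));
	page_ref_add(head, contiguous_npage);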

Please correct me if I am wrong.

Thanks.

Signed-off-by: Ming Mao <maoming.maoming@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 236 ++++++++++++++++++++++++++++++--
 include/linux/vfio.h            |  20 +++
 2 files changed, 246 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5e556ac91..42e25752e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -415,6 +415,46 @@ static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
+/*
+ * put pfns for a hugetlbfs page
+ * @start:the 4KB-page we start to put,can be any page in this hugetlbfs page
+ * @npage:the number of 4KB-pages need to put
+ * @prot:IOMMU_READ/WRITE
+ */
+static int hugetlb_put_pfn(unsigned long start, unsigned int npage, int prot)
+{
+	struct page *page = NULL;
+	struct page *head = NULL;
+
+	if (!npage || !pfn_valid(start))
+		return 0;
+
+	page = pfn_to_page(start);
+	if (!page || !PageHuge(page))
+		return 0;
+	head = compound_head(page);
+	/*
+	 * The last page should be in this hugetlbfs page.
+	 * The number of putting pages should be equal to the number
+	 * of getting pages.So the hugepage pinned refcount and the normal
+	 * page refcount can not be smaller than npage.
+	 */
+	if ((head != compound_head(pfn_to_page(start + npage - 1)))
+	    || (page_ref_count(head) < npage)
+	    || (compound_pincount(page) < npage))
+		return 0;
+
+	if ((prot & IOMMU_WRITE) && !PageDirty(page))
+		set_page_dirty_lock(page);
+
+	atomic_sub(npage, compound_pincount_ptr(head));
+	if (page_ref_sub_and_test(head, npage))
+		__put_page(head);
+
+	mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_RELEASED, npage);
+	return 1;
+}
+
 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 			    unsigned long vaddr, unsigned long *pfn,
 			    bool write_fault)
@@ -479,6 +519,90 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 	return ret;
 }
 
+struct vfio_hupetlbpage_info vfio_hugetlbpage_info[HUGE_MAX_HSTATE] = {
+	{vfio_hugetlbpage_2M, PMD_SIZE, ~((1ULL << HPAGE_PMD_SHIFT) - 1)},
+	{vfio_hugetlbpage_1G, PUD_SIZE, ~((1ULL << HPAGE_PUD_SHIFT) - 1)},
+};
+
+static bool is_hugetlbpage(unsigned long pfn, enum vfio_hugetlbpage_type *type)
+{
+	struct page *page = NULL;
+
+	if (!pfn_valid(pfn) || !type)
+		return false;
+
+	page = pfn_to_page(pfn);
+	/* only check for hugetlbfs pages */
+	if (!page || !PageHuge(page))
+		return false;
+
+	switch (compound_order(compound_head(page))) {
+	case PMD_ORDER:
+		*type = vfio_hugetlbpage_2M;
+		break;
+	case PUD_ORDER:
+		*type = vfio_hugetlbpage_1G;
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+/* Is the addr in the last page in hugetlbfs pages? */
+static bool hugetlb_is_last_page(unsigned long addr, enum vfio_hugetlbpage_type type)
+{
+	unsigned int num = 0;
+
+	num = hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type);
+
+	if (num == 1)
+		return true;
+	else
+		return false;
+}
+
+static bool hugetlb_page_is_pinned(struct vfio_dma *dma,
+				unsigned long start,
+				unsigned long npages)
+{
+	struct vfio_pfn *vpfn = NULL;
+	struct rb_node *node = rb_first(&dma->pfn_list);
+	unsigned long end = start + npages - 1;
+
+	for (; node; node = rb_next(node)) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if ((vpfn->pfn >= start) && (vpfn->pfn <= end))
+			return true;
+	}
+
+	return false;
+}
+
+static unsigned int hugetlb_get_contiguous_pages_num(struct vfio_dma *dma,
+						unsigned long pfn,
+						unsigned long resdual_npage,
+						unsigned long max_npage)
+{
+	unsigned int num = 0;
+
+	if (!dma)
+		return 0;
+
+	num = resdual_npage < max_npage ? resdual_npage : max_npage;
+	/*
+	 * If there is only one page, it is no need to optimize them.
+	 * Maybe some pages have been pinned and inserted into dma->pfn_list by others.
+	 * In this case, we just goto the slow path simply.
+	 */
+	if ((num < 2) || hugetlb_page_is_pinned(dma, pfn, num))
+		return 0;
+
+	return num;
+}
+
 /*
  * Attempt to pin pages.  We really don't want to track all the pfns and
  * the iommu can only map chunks of consecutive pfns anyway, so get the
@@ -492,6 +616,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	long ret, pinned = 0, lock_acct = 0;
 	bool rsvd;
 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
+	enum vfio_hugetlbpage_type type;
 
 	/* This code path is only user initiated */
 	if (!current->mm)
@@ -521,6 +646,55 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	if (unlikely(disable_hugepages))
 		goto out;
 
+	/*
+	 * It is no need to get pages one by one for hugetlbfs pages.
+	 * 4KB-pages in hugetlbfs pages are contiguous.
+	 * But if the vaddr is in the last 4KB-page, we just goto the slow path.
+	 */
+	if (is_hugetlbpage(*pfn_base, &type) && !hugetlb_is_last_page(vaddr, type)) {
+		unsigned long hugetlb_resdual_npage = 0;
+		unsigned long contiguous_npage = 0;
+		struct page *head = NULL;
+
+		hugetlb_resdual_npage =
+			hugetlb_get_resdual_pages((vaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1), type);
+		/*
+		 * Maybe the hugetlb_resdual_npage is invalid.
+		 * For example, hugetlb_resdual_npage > (npage - 1) or
+		 * some pages of this hugetlbfs page have been pinned.
+		 */
+		contiguous_npage = hugetlb_get_contiguous_pages_num(dma, *pfn_base + 1,
+						hugetlb_resdual_npage, npage - 1);
+		if (!contiguous_npage)
+			goto slow_path;
+
+		/*
+		 * Unlike THP, the splitting should not happen for hugetlbfs pages.
+		 * Since PG_reserved is not relevant for compound pages, and the pfn of
+		 * 4KB-page which in hugetlbfs pages is valid,
+		 * it is no need to check rsvd for hugetlbfs pages.
+		 */
+		if (!dma->lock_cap &&
+		    current->mm->locked_vm + lock_acct + contiguous_npage > limit) {
+			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+				 __func__, limit << PAGE_SHIFT);
+			ret = -ENOMEM;
+			goto unpin_out;
+		}
+		/*
+		 * We got a hugetlbfs page using vaddr_get_pfn alreadly.
+		 * In this case,we do not need to alloc pages and we can finish all
+		 * work by a single operation to the head page.
+		 */
+		lock_acct += contiguous_npage;
+		head = compound_head(pfn_to_page(*pfn_base));
+		atomic_add(contiguous_npage, compound_pincount_ptr(head));
+		page_ref_add(head, contiguous_npage);
+		mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED, contiguous_npage);
+		pinned += contiguous_npage;
+		goto out;
+	}
+slow_path:
 	/* Lock all the consecutive pages from pfn_base */
 	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
 	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
@@ -569,7 +743,30 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 {
 	long unlocked = 0, locked = 0;
 	long i;
+	enum vfio_hugetlbpage_type type;
+
+	if (is_hugetlbpage(pfn, &type)) {
+		unsigned long hugetlb_resdual_npage = 0;
+		unsigned long contiguous_npage = 0;
 
+		hugetlb_resdual_npage = hugetlb_get_resdual_pages(iova & ~(PAGE_SIZE - 1), type);
+		contiguous_npage = hugetlb_get_contiguous_pages_num(dma, pfn,
+						hugetlb_resdual_npage, npage);
+		/*
+		 * There is not enough contiguous pages or this hugetlbfs page
+		 * has been pinned.
+		 * Let's try the slow path.
+		 */
+		if (!contiguous_npage)
+			goto slow_path;
+
+		/* try the slow path if failed */
+		if (hugetlb_put_pfn(pfn, contiguous_npage, dma->prot)) {
+			unlocked = contiguous_npage;
+			goto out;
+		}
+	}
+slow_path:
 	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
 		if (put_pfn(pfn++, dma->prot)) {
 			unlocked++;
@@ -578,6 +775,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 		}
 	}
 
+out:
 	if (do_accounting)
 		vfio_lock_acct(dma, locked - unlocked, true);
 
@@ -867,6 +1065,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	struct iommu_iotlb_gather iotlb_gather;
 	int unmapped_region_cnt = 0;
 	long unlocked = 0;
+	enum vfio_hugetlbpage_type type;
 
 	if (!dma->size)
 		return 0;
@@ -900,16 +1099,33 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 			continue;
 		}
 
-		/*
-		 * To optimize for fewer iommu_unmap() calls, each of which
-		 * may require hardware cache flushing, try to find the
-		 * largest contiguous physical memory chunk to unmap.
-		 */
-		for (len = PAGE_SIZE;
-		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
-			next = iommu_iova_to_phys(domain->domain, iova + len);
-			if (next != phys + len)
-				break;
+		if (is_hugetlbpage((phys >> PAGE_SHIFT), &type)
+		    && (!domain->fgsp)) {
+			unsigned long hugetlb_resdual_npage = 0;
+			unsigned long contiguous_npage = 0;
+
+			hugetlb_resdual_npage =
+				hugetlb_get_resdual_pages(iova & ~(PAGE_SIZE - 1), type);
+			/*
+			 * The number of contiguous page can not be larger than dma->size
+			 * which is the number of pages pinned.
+			 */
+			contiguous_npage = ((dma->size >> PAGE_SHIFT) > hugetlb_resdual_npage) ?
+				hugetlb_resdual_npage : (dma->size >> PAGE_SHIFT);
+
+			len = contiguous_npage * PAGE_SIZE;
+		} else {
+			/*
+			 * To optimize for fewer iommu_unmap() calls, each of which
+			 * may require hardware cache flushing, try to find the
+			 * largest contiguous physical memory chunk to unmap.
+			 */
+			for (len = PAGE_SIZE;
+			     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+				next = iommu_iova_to_phys(domain->domain, iova + len);
+				if (next != phys + len)
+					break;
+			}
 		}
 
 		/*
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 38d3c6a8d..91ef2058f 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -214,4 +214,24 @@ extern int vfio_virqfd_enable(void *opaque,
 			      void *data, struct virqfd **pvirqfd, int fd);
 extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
 
+enum vfio_hugetlbpage_type {
+	vfio_hugetlbpage_2M,
+	vfio_hugetlbpage_1G,
+};
+
+struct vfio_hupetlbpage_info {
+	enum vfio_hugetlbpage_type type;
+	unsigned long size;
+	unsigned long mask;
+};
+
+#define PMD_ORDER 9
+#define PUD_ORDER 18
+/*
+ * get the number of resdual 4KB-pages in a hugetlbfs page
+ * (including the page which pointed by this address)
+ */
+#define hugetlb_get_resdual_pages(address, type)				\
+		((vfio_hugetlbpage_info[type].size				\
+		- (address & ~vfio_hugetlbpage_info[type].mask)) >> PAGE_SHIFT)
 #endif /* VFIO_H */
-- 
2.23.0




* Re: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
  2020-07-20  8:29 [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages Jay Zhou
@ 2020-07-20 14:24 ` kernel test robot
  2020-07-20 22:46 ` Alex Williamson
  2020-07-20 23:38 ` kernel test robot
  2 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2020-07-20 14:24 UTC (permalink / raw)
  To: Jay Zhou, linux-kernel, kvm
  Cc: kbuild-all, alex.williamson, cohuck, maoming.maoming,
	jianjay.zhou, weidong.huang

[-- Attachment #1: Type: text/plain, Size: 2619 bytes --]

Hi Jay,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on vfio/next]
[also build test ERROR on linux/master linus/master v5.8-rc6 next-20200720]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jay-Zhou/vfio-dma_map-unmap-optimized-for-hugetlbfs-pages/20200720-163157
base:   https://github.com/awilliam/linux-vfio.git next
config: i386-randconfig-m021-20200720 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-14) 9.3.0
reproduce (this is a W=1 build):
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from include/linux/mm.h:679,
                    from include/linux/scatterlist.h:8,
                    from include/linux/iommu.h:10,
                    from drivers/vfio/vfio_iommu_type1.c:27:
>> include/linux/huge_mm.h:319:25: error: braced-group within expression allowed only inside a function
     319 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
         |                         ^
   drivers/vfio/vfio_iommu_type1.c:523:45: note: in expansion of macro 'HPAGE_PMD_SHIFT'
     523 |  {vfio_hugetlbpage_2M, PMD_SIZE, ~((1ULL << HPAGE_PMD_SHIFT) - 1)},
         |                                             ^~~~~~~~~~~~~~~
   include/linux/huge_mm.h:323:25: error: braced-group within expression allowed only inside a function
     323 | #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
         |                         ^
   drivers/vfio/vfio_iommu_type1.c:524:45: note: in expansion of macro 'HPAGE_PUD_SHIFT'
     524 |  {vfio_hugetlbpage_1G, PUD_SIZE, ~((1ULL << HPAGE_PUD_SHIFT) - 1)},
         |                                             ^~~~~~~~~~~~~~~

vim +319 include/linux/huge_mm.h

87eaceb3faa59b Yang Shi         2019-09-23  317  
71e3aac0724ffe Andrea Arcangeli 2011-01-13  318  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
d8c37c480678eb Naoya Horiguchi  2012-03-21 @319  #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
d8c37c480678eb Naoya Horiguchi  2012-03-21  320  #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
d8c37c480678eb Naoya Horiguchi  2012-03-21  321  #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
71e3aac0724ffe Andrea Arcangeli 2011-01-13  322  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31291 bytes --]


* Re: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
  2020-07-20  8:29 [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages Jay Zhou
  2020-07-20 14:24 ` kernel test robot
@ 2020-07-20 22:46 ` Alex Williamson
  2020-07-21 13:28   ` Zhoujian (jay)
  2020-07-22  0:48   ` Peter Xu
  2020-07-20 23:38 ` kernel test robot
  2 siblings, 2 replies; 7+ messages in thread
From: Alex Williamson @ 2020-07-20 22:46 UTC (permalink / raw)
  To: Jay Zhou
  Cc: linux-kernel, kvm, cohuck, maoming.maoming, weidong.huang,
	Peter Xu, Andrea Arcangeli

On Mon, 20 Jul 2020 16:29:47 +0800
Jay Zhou <jianjay.zhou@huawei.com> wrote:

> From: Ming Mao <maoming.maoming@huawei.com>
> 
> Hi all,
> I'm working on starting lots of big size
> Virtual Machines(memory: >128GB) with VFIO-devices. And I
> encounter a problem that is the waiting time of starting
> all Virtual Machines is too long. I analyze the startup log
> and find that the time of pinning/unpinning pages could be reduced.
> 
> In the original process, to make sure the pages are contiguous,
> we have to check all pages one by one. I think maybe we can use
> hugetlbfs pages which can skip this step.
> So I create a patch to do this.
> According to my test, the result of this patch is pretty well.
> 
> Virtual Machine: 50G memory, 32 CPU, 1 VFIO-device, 1G hugetlbfs page
>         original   after optimization
> pin time   700ms          0.1ms
> 
> I Suppose that:
> 1)the hugetlbfs page should not be split
> 2)PG_reserved is not relevant for hugetlbfs pages
> 3)we can delete the for loops and use some operations
> (such as atomic_add,page_ref_add) instead
> 
> please correct me if I am wrong.
> 
> Thanks.
> 
> Signed-off-by: Ming Mao <maoming.maoming@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 236 ++++++++++++++++++++++++++++++--
>  include/linux/vfio.h            |  20 +++
>  2 files changed, 246 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 5e556ac91..42e25752e 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -415,6 +415,46 @@ static int put_pfn(unsigned long pfn, int prot)
>  	return 0;
>  }
>  
> +/*
> + * put pfns for a hugetlbfs page
> + * @start:the 4KB-page we start to put,can be any page in this hugetlbfs page
> + * @npage:the number of 4KB-pages need to put

This code supports systems where PAGE_SIZE is not 4KB.

> + * @prot:IOMMU_READ/WRITE
> + */
> +static int hugetlb_put_pfn(unsigned long start, unsigned int npage, int prot)
> +{
> +	struct page *page = NULL;
> +	struct page *head = NULL;

Unnecessary initialization.

> +
> +	if (!npage || !pfn_valid(start))
> +		return 0;
> +
> +	page = pfn_to_page(start);
> +	if (!page || !PageHuge(page))
> +		return 0;
> +	head = compound_head(page);
> +	/*
> +	 * The last page should be in this hugetlbfs page.
> +	 * The number of putting pages should be equal to the number
> +	 * of getting pages.So the hugepage pinned refcount and the normal
> +	 * page refcount can not be smaller than npage.
> +	 */
> +	if ((head != compound_head(pfn_to_page(start + npage - 1)))
> +	    || (page_ref_count(head) < npage)
> +	    || (compound_pincount(page) < npage))
> +		return 0;
> +
> +	if ((prot & IOMMU_WRITE) && !PageDirty(page))
> +		set_page_dirty_lock(page);
> +
> +	atomic_sub(npage, compound_pincount_ptr(head));
> +	if (page_ref_sub_and_test(head, npage))
> +		__put_page(head);
> +
> +	mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_RELEASED, npage);
> +	return 1;
> +}
> +
>  static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
>  			    unsigned long vaddr, unsigned long *pfn,
>  			    bool write_fault)
> @@ -479,6 +519,90 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>  	return ret;
>  }
>  
> +struct vfio_hupetlbpage_info vfio_hugetlbpage_info[HUGE_MAX_HSTATE] = {
> +	{vfio_hugetlbpage_2M, PMD_SIZE, ~((1ULL << HPAGE_PMD_SHIFT) - 1)},
> +	{vfio_hugetlbpage_1G, PUD_SIZE, ~((1ULL << HPAGE_PUD_SHIFT) - 1)},

Other architectures support more huge page sizes, also 0-day identified
these #defines don't exist when THP is not configured.  But why
couldn't we figure out all of these from the compound_order()?  order
== shift, size = PAGE_SIZE << order.
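
For instance (untested sketch), everything the per-size table provides
could be derived from the compound page itself:

	struct page *head = compound_head(pfn_to_page(pfn));
	unsigned int order = compound_order(head);
	unsigned long size = PAGE_SIZE << order;	/* huge page size */
	unsigned long mask = ~(size - 1);		/* huge page mask */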

> +};
> +
> +static bool is_hugetlbpage(unsigned long pfn, enum vfio_hugetlbpage_type *type)
> +{
> +	struct page *page = NULL;

Unnecessary initialization.

> +
> +	if (!pfn_valid(pfn) || !type)
> +		return false;
> +
> +	page = pfn_to_page(pfn);
> +	/* only check for hugetlbfs pages */
> +	if (!page || !PageHuge(page))
> +		return false;
> +
> +	switch (compound_order(compound_head(page))) {
> +	case PMD_ORDER:
> +		*type = vfio_hugetlbpage_2M;
> +		break;
> +	case PUD_ORDER:
> +		*type = vfio_hugetlbpage_1G;
> +		break;
> +	default:
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +/* Is the addr in the last page in hugetlbfs pages? */
> +static bool hugetlb_is_last_page(unsigned long addr, enum vfio_hugetlbpage_type type)
> +{
> +	unsigned int num = 0;

Unnecessary initialization, and in fact unnecessary variable altogether.
ie.

	return hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type) == 1;


> +
> +	num = hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type);

residual?

> +
> +	if (num == 1)
> +		return true;
> +	else
> +		return false;
> +}
> +
> +static bool hugetlb_page_is_pinned(struct vfio_dma *dma,
> +				unsigned long start,
> +				unsigned long npages)
> +{
> +	struct vfio_pfn *vpfn = NULL;

Unnecessary initialization.

> +	struct rb_node *node = rb_first(&dma->pfn_list);
> +	unsigned long end = start + npages - 1;
> +
> +	for (; node; node = rb_next(node)) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if ((vpfn->pfn >= start) && (vpfn->pfn <= end))
> +			return true;

This function could be named better, it suggests the hugetlbfs page is
pinned, but really we're only looking for any pfn_list pinnings
overlapping the pfn range.
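
e.g. something like (the name below is only a suggestion):

	static bool vfio_pfn_list_overlaps(struct vfio_dma *dma,
					   unsigned long start,
					   unsigned long npages);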

> +	}
> +
> +	return false;
> +}
> +
> +static unsigned int hugetlb_get_contiguous_pages_num(struct vfio_dma *dma,
> +						unsigned long pfn,
> +						unsigned long resdual_npage,
> +						unsigned long max_npage)
> +{
> +	unsigned int num = 0;

Unnecessary initialization

> +
> +	if (!dma)
> +		return 0;
> +
> +	num = resdual_npage < max_npage ? resdual_npage : max_npage;

min(resdual_npage, max_npage)


> +	/*
> +	 * If there is only one page, it is no need to optimize them.

s/no need/not necessary/

> +	 * Maybe some pages have been pinned and inserted into dma->pfn_list by others.
> +	 * In this case, we just goto the slow path simply.
> +	 */
> +	if ((num < 2) || hugetlb_page_is_pinned(dma, pfn, num))
> +		return 0;

Why does having pinnings in the pfn_list disqualify it from hugetlbfs pinning?

Testing for the last page here is redundant to the pinning path, should
it only be done here?  Can num be zero?  

Maybe better to return -errno for this and above zero conditions rather
than return a value that doesn't reflect the purpose of the function?
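
e.g. a rough sketch of that shape (the return type would need to become
signed, and the errno values below are only illustrative):

	num = min(resdual_npage, max_npage);
	if (num < 2)
		return -ERANGE;
	if (hugetlb_page_is_pinned(dma, pfn, num))
		return -EBUSY;
	return num;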

> +
> +	return num;
> +}
> +
>  /*
>   * Attempt to pin pages.  We really don't want to track all the pfns and
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
> @@ -492,6 +616,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	long ret, pinned = 0, lock_acct = 0;
>  	bool rsvd;
>  	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> +	enum vfio_hugetlbpage_type type;
>  
>  	/* This code path is only user initiated */
>  	if (!current->mm)
> @@ -521,6 +646,55 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>  	if (unlikely(disable_hugepages))
>  		goto out;
>  
> +	/*
> +	 * It is no need to get pages one by one for hugetlbfs pages.

s/no need/not necessary/

> +	 * 4KB-pages in hugetlbfs pages are contiguous.
> +	 * But if the vaddr is in the last 4KB-page, we just goto the slow path.


s/4KB-/PAGE_SIZE /

Please explain the significance of vaddr being in the last PAGE_SIZE
page of a hugetlbfs page.  Is it simply that we should take the slow
path for mapping a single page before the end of the hugetlbfs page?
Is this optimization worthwhile?  Isn't it more consistent to handle
all mappings over hugetlbfs pages the same?  Should we be operating on
vaddr here for the hugetlbfs page alignment or base_pfn?

> +	 */
> +	if (is_hugetlbpage(*pfn_base, &type) && !hugetlb_is_last_page(vaddr, type)) {
> +		unsigned long hugetlb_resdual_npage = 0;
> +		unsigned long contiguous_npage = 0;
> +		struct page *head = NULL;
> +
> +		hugetlb_resdual_npage =
> +			hugetlb_get_resdual_pages((vaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1), type);

~(PAGE_SIZE - 1) is PAGE_MASK, but that whole operation looks like a
PAGE_ALIGN(vaddr)

This is trying to get the number of pages after this page to the end of
the hugetlbfs page, right?  Rather than the hugetlb_is_last_page()
above, shouldn't we have recorded the number of pages there and used
math to get this value?
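
e.g. something along these lines (sketch, reusing the earlier result):

	unsigned long remainder = hugetlb_get_resdual_pages(vaddr & PAGE_MASK, type);

	if (remainder == 1)	/* vaddr is in the last PAGE_SIZE page */
		goto slow_path;
	/* pages following the current one within the same hugetlbfs page */
	hugetlb_resdual_npage = remainder - 1;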

> +		/*
> +		 * Maybe the hugetlb_resdual_npage is invalid.
> +		 * For example, hugetlb_resdual_npage > (npage - 1) or
> +		 * some pages of this hugetlbfs page have been pinned.
> +		 */
> +		contiguous_npage = hugetlb_get_contiguous_pages_num(dma, *pfn_base + 1,
> +						hugetlb_resdual_npage, npage - 1);
> +		if (!contiguous_npage)
> +			goto slow_path;
> +
> +		/*
> +		 * Unlike THP, the splitting should not happen for hugetlbfs pages.
> +		 * Since PG_reserved is not relevant for compound pages, and the pfn of
> +		 * 4KB-page which in hugetlbfs pages is valid,

s/4KB/PAGE_SIZE/

> +		 * it is no need to check rsvd for hugetlbfs pages.

s/no need/not necessary/

> +		 */
> +		if (!dma->lock_cap &&
> +		    current->mm->locked_vm + lock_acct + contiguous_npage > limit) {
> +			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> +				 __func__, limit << PAGE_SHIFT);
> +			ret = -ENOMEM;
> +			goto unpin_out;
> +		}
> +		/*
> +		 * We got a hugetlbfs page using vaddr_get_pfn alreadly.
> +		 * In this case,we do not need to alloc pages and we can finish all
> +		 * work by a single operation to the head page.
> +		 */
> +		lock_acct += contiguous_npage;
> +		head = compound_head(pfn_to_page(*pfn_base));
> +		atomic_add(contiguous_npage, compound_pincount_ptr(head));
> +		page_ref_add(head, contiguous_npage);
> +		mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED, contiguous_npage);
> +		pinned += contiguous_npage;
> +		goto out;

I'm hoping Peter or Andrea understand this, but I think we still have
pfn_base pinned separately and I don't see that we've done an unpin
anywhere, so are we leaking the pin of the first page??

> +	}
> +slow_path:
>  	/* Lock all the consecutive pages from pfn_base */
>  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
>  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> @@ -569,7 +743,30 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
>  {
>  	long unlocked = 0, locked = 0;
>  	long i;
> +	enum vfio_hugetlbpage_type type;
> +
> +	if (is_hugetlbpage(pfn, &type)) {
> +		unsigned long hugetlb_resdual_npage = 0;
> +		unsigned long contiguous_npage = 0;

Unnecessary initialization...

>  
> +		hugetlb_resdual_npage = hugetlb_get_resdual_pages(iova & ~(PAGE_SIZE - 1), type);

PAGE_MASK

Like above, is it pfn or iova that we should be using when looking at
hugetlbfs alignment?

> +		contiguous_npage = hugetlb_get_contiguous_pages_num(dma, pfn,
> +						hugetlb_resdual_npage, npage);
> +		/*
> +		 * There is not enough contiguous pages or this hugetlbfs page
> +		 * has been pinned.
> +		 * Let's try the slow path.
> +		 */
> +		if (!contiguous_npage)
> +			goto slow_path;
> +
> +		/* try the slow path if failed */
> +		if (hugetlb_put_pfn(pfn, contiguous_npage, dma->prot)) {
> +			unlocked = contiguous_npage;
> +			goto out;
> +		}

Should probably break the pin path into a separate get_pfn function for symmetry.
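
e.g. a sketch of the counterpart (mirroring hugetlb_put_pfn(), untested):

	static void hugetlb_get_pfn(unsigned long pfn, unsigned int npage)
	{
		struct page *head = compound_head(pfn_to_page(pfn));

		atomic_add(npage, compound_pincount_ptr(head));
		page_ref_add(head, npage);
		mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED,
				    npage);
	}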


> +	}
> +slow_path:
>  	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
>  		if (put_pfn(pfn++, dma->prot)) {
>  			unlocked++;
> @@ -578,6 +775,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
>  		}
>  	}
>  
> +out:
>  	if (do_accounting)
>  		vfio_lock_acct(dma, locked - unlocked, true);
>  
> @@ -867,6 +1065,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  	struct iommu_iotlb_gather iotlb_gather;
>  	int unmapped_region_cnt = 0;
>  	long unlocked = 0;
> +	enum vfio_hugetlbpage_type type;
>  
>  	if (!dma->size)
>  		return 0;
> @@ -900,16 +1099,33 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  			continue;
>  		}
>  
> -		/*
> -		 * To optimize for fewer iommu_unmap() calls, each of which
> -		 * may require hardware cache flushing, try to find the
> -		 * largest contiguous physical memory chunk to unmap.
> -		 */
> -		for (len = PAGE_SIZE;
> -		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
> -			next = iommu_iova_to_phys(domain->domain, iova + len);
> -			if (next != phys + len)
> -				break;
> +		if (is_hugetlbpage((phys >> PAGE_SHIFT), &type)
> +		    && (!domain->fgsp)) {

Reverse the order of these tests.
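
ie.

	if (!domain->fgsp && is_hugetlbpage(phys >> PAGE_SHIFT, &type)) {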

> +			unsigned long hugetlb_resdual_npage = 0;
> +			unsigned long contiguous_npage = 0;


Unnecessary...

> +			hugetlb_resdual_npage =
> +				hugetlb_get_resdual_pages(iova & ~(PAGE_SIZE - 1), type);

PAGE_MASK

> +			/*
> +			 * The number of contiguous page can not be larger than dma->size
> +			 * which is the number of pages pinned.
> +			 */
> +			contiguous_npage = ((dma->size >> PAGE_SHIFT) > hugetlb_resdual_npage) ?
> +				hugetlb_resdual_npage : (dma->size >> PAGE_SHIFT);

min()

> +
> +			len = contiguous_npage * PAGE_SIZE;
> +		} else {
> +			/*
> +			 * To optimize for fewer iommu_unmap() calls, each of which
> +			 * may require hardware cache flushing, try to find the
> +			 * largest contiguous physical memory chunk to unmap.
> +			 */
> +			for (len = PAGE_SIZE;
> +			     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
> +				next = iommu_iova_to_phys(domain->domain, iova + len);
> +				if (next != phys + len)
> +					break;
> +			}
>  		}
>  
>  		/*
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 38d3c6a8d..91ef2058f 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -214,4 +214,24 @@ extern int vfio_virqfd_enable(void *opaque,
>  			      void *data, struct virqfd **pvirqfd, int fd);
>  extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
>  
> +enum vfio_hugetlbpage_type {
> +	vfio_hugetlbpage_2M,
> +	vfio_hugetlbpage_1G,
> +};
> +
> +struct vfio_hupetlbpage_info {
> +	enum vfio_hugetlbpage_type type;

The enum within the structure serves no purpose.

> +	unsigned long size;
> +	unsigned long mask;
> +};
> +
> +#define PMD_ORDER 9
> +#define PUD_ORDER 18

Architecture specific.
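
e.g., if the orders are needed at all, derive them from the arch's own
shifts instead of hard-coding x86 values:

	#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
	#define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)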

> +/*
> + * get the number of resdual 4KB-pages in a hugetlbfs page
> + * (including the page which pointed by this address)

s/4KB/PAGE_SIZE/

> + */
> +#define hugetlb_get_resdual_pages(address, type)				\
> +		((vfio_hugetlbpage_info[type].size				\
> +		- (address & ~vfio_hugetlbpage_info[type].mask)) >> PAGE_SHIFT)
>  #endif /* VFIO_H */



* Re: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
  2020-07-20  8:29 [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages Jay Zhou
  2020-07-20 14:24 ` kernel test robot
  2020-07-20 22:46 ` Alex Williamson
@ 2020-07-20 23:38 ` kernel test robot
  2 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2020-07-20 23:38 UTC (permalink / raw)
  To: Jay Zhou, linux-kernel, kvm
  Cc: kbuild-all, alex.williamson, cohuck, maoming.maoming,
	jianjay.zhou, weidong.huang

[-- Attachment #1: Type: text/plain, Size: 4305 bytes --]

Hi Jay,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on vfio/next]
[also build test ERROR on linux/master linus/master v5.8-rc6 next-20200720]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jay-Zhou/vfio-dma_map-unmap-optimized-for-hugetlbfs-pages/20200720-163157
base:   https://github.com/awilliam/linux-vfio.git next
config: arm-allyesconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arm 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

>> drivers/vfio/vfio_iommu_type1.c:522:52: error: 'HUGE_MAX_HSTATE' undeclared here (not in a function)
     522 | struct vfio_hupetlbpage_info vfio_hugetlbpage_info[HUGE_MAX_HSTATE] = {
         |                                                    ^~~~~~~~~~~~~~~
   In file included from include/linux/mm.h:679,
                    from include/linux/scatterlist.h:8,
                    from include/linux/iommu.h:10,
                    from drivers/vfio/vfio_iommu_type1.c:27:
   include/linux/huge_mm.h:319:25: error: braced-group within expression allowed only inside a function
     319 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
         |                         ^
   drivers/vfio/vfio_iommu_type1.c:523:45: note: in expansion of macro 'HPAGE_PMD_SHIFT'
     523 |  {vfio_hugetlbpage_2M, PMD_SIZE, ~((1ULL << HPAGE_PMD_SHIFT) - 1)},
         |                                             ^~~~~~~~~~~~~~~
   include/linux/huge_mm.h:323:25: error: braced-group within expression allowed only inside a function
     323 | #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
         |                         ^
   drivers/vfio/vfio_iommu_type1.c:524:45: note: in expansion of macro 'HPAGE_PUD_SHIFT'
     524 |  {vfio_hugetlbpage_1G, PUD_SIZE, ~((1ULL << HPAGE_PUD_SHIFT) - 1)},
         |                                             ^~~~~~~~~~~~~~~
   drivers/vfio/vfio_iommu_type1.c: In function 'hugetlb_is_last_page':
>> drivers/vfio/vfio_iommu_type1.c:554:81: warning: parameter 'type' set but not used [-Wunused-but-set-parameter]
     554 | static bool hugetlb_is_last_page(unsigned long addr, enum vfio_hugetlbpage_type type)
         |                                                      ~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~

vim +/HUGE_MAX_HSTATE +522 drivers/vfio/vfio_iommu_type1.c

   521	
 > 522	struct vfio_hupetlbpage_info vfio_hugetlbpage_info[HUGE_MAX_HSTATE] = {
   523		{vfio_hugetlbpage_2M, PMD_SIZE, ~((1ULL << HPAGE_PMD_SHIFT) - 1)},
   524		{vfio_hugetlbpage_1G, PUD_SIZE, ~((1ULL << HPAGE_PUD_SHIFT) - 1)},
   525	};
   526	
   527	static bool is_hugetlbpage(unsigned long pfn, enum vfio_hugetlbpage_type *type)
   528	{
   529		struct page *page = NULL;
   530	
   531		if (!pfn_valid(pfn) || !type)
   532			return false;
   533	
   534		page = pfn_to_page(pfn);
   535		/* only check for hugetlbfs pages */
   536		if (!page || !PageHuge(page))
   537			return false;
   538	
   539		switch (compound_order(compound_head(page))) {
   540		case PMD_ORDER:
   541			*type = vfio_hugetlbpage_2M;
   542			break;
   543		case PUD_ORDER:
   544			*type = vfio_hugetlbpage_1G;
   545			break;
   546		default:
   547			return false;
   548		}
   549	
   550		return true;
   551	}
   552	
   553	/* Is the addr in the last page in hugetlbfs pages? */
 > 554	static bool hugetlb_is_last_page(unsigned long addr, enum vfio_hugetlbpage_type type)
   555	{
   556		unsigned int num = 0;
   557	
   558		num = hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type);
   559	
   560		if (num == 1)
   561			return true;
   562		else
   563			return false;
   564	}
   565	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 74593 bytes --]


* RE: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
  2020-07-20 22:46 ` Alex Williamson
@ 2020-07-21 13:28   ` Zhoujian (jay)
  2020-08-13  6:11     ` Re: " Maoming (maoming, Cloud Infrastructure Service Product Dept.)
  2020-07-22  0:48   ` Peter Xu
  1 sibling, 1 reply; 7+ messages in thread
From: Zhoujian (jay) @ 2020-07-21 13:28 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel, kvm, cohuck, Maoming (maoming,
	Cloud Infrastructure Service Product Dept.), Huangweidong (C),
	Peter Xu, Andrea Arcangeli

Thanks for taking a close look at the code, Alex.
We'll check them one by one ASAP.

> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, July 21, 2020 6:46 AM
> To: Zhoujian (jay) <jianjay.zhou@huawei.com>
> Cc: linux-kernel@vger.kernel.org; kvm@vger.kernel.org; cohuck@redhat.com;
> Maoming (maoming, Cloud Infrastructure Service Product Dept.)
> <maoming.maoming@huawei.com>; Huangweidong (C)
> <weidong.huang@huawei.com>; Peter Xu <peterx@redhat.com>; Andrea
> Arcangeli <aarcange@redhat.com>
> Subject: Re: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
> 
> On Mon, 20 Jul 2020 16:29:47 +0800
> Jay Zhou <jianjay.zhou@huawei.com> wrote:
> 
> > From: Ming Mao <maoming.maoming@huawei.com>
> >
> > Hi all,
> > I'm working on starting lots of big size Virtual Machines(memory:
> > >128GB) with VFIO-devices. And I encounter a problem that is the
> > waiting time of starting all Virtual Machines is too long. I analyze
> > the startup log and find that the time of pinning/unpinning pages
> > could be reduced.
> >
> > In the original process, to make sure the pages are contiguous, we
> > have to check all pages one by one. I think maybe we can use hugetlbfs
> > pages which can skip this step.
> > So I create a patch to do this.
> > According to my test, the result of this patch is pretty well.
> >
> > Virtual Machine: 50G memory, 32 CPU, 1 VFIO-device, 1G hugetlbfs page
> >         original   after optimization
> > pin time   700ms          0.1ms
> >
> > I Suppose that:
> > 1)the hugetlbfs page should not be split 2)PG_reserved is not relevant
> > for hugetlbfs pages 3)we can delete the for loops and use some
> > operations (such as atomic_add,page_ref_add) instead
> >
> > please correct me if I am wrong.
> >
> > Thanks.
> >
> > Signed-off-by: Ming Mao <maoming.maoming@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 236 ++++++++++++++++++++++++++++++--
> >  include/linux/vfio.h            |  20 +++
> >  2 files changed, 246 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index 5e556ac91..42e25752e 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -415,6 +415,46 @@ static int put_pfn(unsigned long pfn, int prot)
> >  	return 0;
> >  }
> >
> > +/*
> > + * put pfns for a hugetlbfs page
> > + * @start:the 4KB-page we start to put,can be any page in this
> > +hugetlbfs page
> > + * @npage:the number of 4KB-pages need to put
> 
> This code supports systems where PAGE_SIZE is not 4KB.
> 
> > + * @prot:IOMMU_READ/WRITE
> > + */
> > +static int hugetlb_put_pfn(unsigned long start, unsigned int npage,
> > +int prot) {
> > +	struct page *page = NULL;
> > +	struct page *head = NULL;
> 
> Unnecessary initialization.
> 
> > +
> > +	if (!npage || !pfn_valid(start))
> > +		return 0;
> > +
> > +	page = pfn_to_page(start);
> > +	if (!page || !PageHuge(page))
> > +		return 0;
> > +	head = compound_head(page);
> > +	/*
> > +	 * The last page should be in this hugetlbfs page.
> > +	 * The number of putting pages should be equal to the number
> > +	 * of getting pages.So the hugepage pinned refcount and the normal
> > +	 * page refcount can not be smaller than npage.
> > +	 */
> > +	if ((head != compound_head(pfn_to_page(start + npage - 1)))
> > +	    || (page_ref_count(head) < npage)
> > +	    || (compound_pincount(page) < npage))
> > +		return 0;
> > +
> > +	if ((prot & IOMMU_WRITE) && !PageDirty(page))
> > +		set_page_dirty_lock(page);
> > +
> > +	atomic_sub(npage, compound_pincount_ptr(head));
> > +	if (page_ref_sub_and_test(head, npage))
> > +		__put_page(head);
> > +
> > +	mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_RELEASED,
> npage);
> > +	return 1;
> > +}
> > +
> >  static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct
> *mm,
> >  			    unsigned long vaddr, unsigned long *pfn,
> >  			    bool write_fault)
> > @@ -479,6 +519,90 @@ static int vaddr_get_pfn(struct mm_struct *mm,
> unsigned long vaddr,
> >  	return ret;
> >  }
> >
> > +struct vfio_hupetlbpage_info vfio_hugetlbpage_info[HUGE_MAX_HSTATE] = {
> > +	{vfio_hugetlbpage_2M, PMD_SIZE, ~((1ULL << HPAGE_PMD_SHIFT) - 1)},
> > +	{vfio_hugetlbpage_1G, PUD_SIZE, ~((1ULL << HPAGE_PUD_SHIFT) - 1)},
> 
> Other architectures support more huge page sizes, also 0-day identified these
> #defines don't exist when THP is not configured.  But why couldn't we figure out
> all of these form the compound_order()?  order == shift, size = PAGE_SIZE <<
> order.
> 
> > +};
> > +
> > +static bool is_hugetlbpage(unsigned long pfn, enum
> > +vfio_hugetlbpage_type *type) {
> > +	struct page *page = NULL;
> 
> Unnecessary initialization.
> 
> > +
> > +	if (!pfn_valid(pfn) || !type)
> > +		return false;
> > +
> > +	page = pfn_to_page(pfn);
> > +	/* only check for hugetlbfs pages */
> > +	if (!page || !PageHuge(page))
> > +		return false;
> > +
> > +	switch (compound_order(compound_head(page))) {
> > +	case PMD_ORDER:
> > +		*type = vfio_hugetlbpage_2M;
> > +		break;
> > +	case PUD_ORDER:
> > +		*type = vfio_hugetlbpage_1G;
> > +		break;
> > +	default:
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +/* Is the addr in the last page in hugetlbfs pages? */ static bool
> > +hugetlb_is_last_page(unsigned long addr, enum vfio_hugetlbpage_type
> > +type) {
> > +	unsigned int num = 0;
> 
> Unnecessary initialization, and in fact unnecessary variable altogether.
> ie.
> 
> 	return hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type) == 1;
> 
> 
> > +
> > +	num = hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type);
> 
> residual?
> 
> > +
> > +	if (num == 1)
> > +		return true;
> > +	else
> > +		return false;
> > +}
> > +
> > +static bool hugetlb_page_is_pinned(struct vfio_dma *dma,
> > +				unsigned long start,
> > +				unsigned long npages)
> > +{
> > +	struct vfio_pfn *vpfn = NULL;
> 
> Unnecessary initialization.
> 
> > +	struct rb_node *node = rb_first(&dma->pfn_list);
> > +	unsigned long end = start + npages - 1;
> > +
> > +	for (; node; node = rb_next(node)) {
> > +		vpfn = rb_entry(node, struct vfio_pfn, node);
> > +
> > +		if ((vpfn->pfn >= start) && (vpfn->pfn <= end))
> > +			return true;
> 
> This function could be named better, it suggests the hugetlbfs page is pinned, but
> really we're only looking for any pfn_list pinnings overlapping the pfn range.
> 
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +static unsigned int hugetlb_get_contiguous_pages_num(struct vfio_dma
> *dma,
> > +						unsigned long pfn,
> > +						unsigned long resdual_npage,
> > +						unsigned long max_npage)
> > +{
> > +	unsigned int num = 0;
> 
> Unnecessary initialization
> 
> > +
> > +	if (!dma)
> > +		return 0;
> > +
> > +	num = resdual_npage < max_npage ? resdual_npage : max_npage;
> 
> min(resdual_npage, max_npage)
> 
> 
> > +	/*
> > +	 * If there is only one page, it is no need to optimize them.
> 
> s/no need/not necessary/
> 
> > +	 * Maybe some pages have been pinned and inserted into dma->pfn_list by
> others.
> > +	 * In this case, we just goto the slow path simply.
> > +	 */
> > +	if ((num < 2) || hugetlb_page_is_pinned(dma, pfn, num))
> > +		return 0;
> 
> Why does having pinnings in the pfn_list disqualify it from hugetlbfs pinning?
> 
> Testing for the last page here is redundant to the pinning path, should it only be
> done here?  Can num be zero?
> 
> Maybe better to return -errno for this and above zero conditions rather than
> return a value that doesn't reflect the purpose of the function?
> 
> > +
> > +	return num;
> > +}
> > +
> >  /*
> >   * Attempt to pin pages.  We really don't want to track all the pfns and
> >   * the iommu can only map chunks of consecutive pfns anyway, so get
> > the @@ -492,6 +616,7 @@ static long vfio_pin_pages_remote(struct vfio_dma
> *dma, unsigned long vaddr,
> >  	long ret, pinned = 0, lock_acct = 0;
> >  	bool rsvd;
> >  	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> > +	enum vfio_hugetlbpage_type type;
> >
> >  	/* This code path is only user initiated */
> >  	if (!current->mm)
> > @@ -521,6 +646,55 @@ static long vfio_pin_pages_remote(struct vfio_dma
> *dma, unsigned long vaddr,
> >  	if (unlikely(disable_hugepages))
> >  		goto out;
> >
> > +	/*
> > +	 * It is no need to get pages one by one for hugetlbfs pages.
> 
> s/no need/not necessary/
> 
> > +	 * 4KB-pages in hugetlbfs pages are contiguous.
> > +	 * But if the vaddr is in the last 4KB-page, we just goto the slow path.
> 
> 
> s/4KB-/PAGE_SIZE /
> 
> Please explain the significance of vaddr being in the last PAGE_SIZE page of a
> hugetlbfs page.  Is it simply that we should take the slow path for mapping a
> single page before the end of the hugetlbfs page?
> Is this optimization worthwhile?  Isn't it more consistent to handle all mappings
> over hugetlbfs pages the same?  Should we be operating on vaddr here for the
> hugetlbfs page alignment or base_pfn?
> 
> > +	 */
> > +	if (is_hugetlbpage(*pfn_base, &type) && !hugetlb_is_last_page(vaddr,
> type)) {
> > +		unsigned long hugetlb_resdual_npage = 0;
> > +		unsigned long contiguous_npage = 0;
> > +		struct page *head = NULL;
> > +
> > +		hugetlb_resdual_npage =
> > +			hugetlb_get_resdual_pages((vaddr + PAGE_SIZE) & ~(PAGE_SIZE -
> 1),
> > +type);
> 
> ~(PAGE_SIZE - 1) is PAGE_MASK, but that whole operation looks like a
> PAGE_ALIGN(vaddr)
> 
> This is trying to get the number of pages after this page to the end of the
> hugetlbfs page, right?  Rather than the hugetlb_is_last_page() above, shouldn't
> we have recorded the number of pages there and used math to get this value?
> 
> > +		/*
> > +		 * Maybe the hugetlb_resdual_npage is invalid.
> > +		 * For example, hugetlb_resdual_npage > (npage - 1) or
> > +		 * some pages of this hugetlbfs page have been pinned.
> > +		 */
> > +		contiguous_npage = hugetlb_get_contiguous_pages_num(dma,
> *pfn_base + 1,
> > +						hugetlb_resdual_npage, npage - 1);
> > +		if (!contiguous_npage)
> > +			goto slow_path;
> > +
> > +		/*
> > +		 * Unlike THP, the splitting should not happen for hugetlbfs pages.
> > +		 * Since PG_reserved is not relevant for compound pages, and the pfn
> of
> > +		 * 4KB-page which in hugetlbfs pages is valid,
> 
> s/4KB/PAGE_SIZE/
> 
> > +		 * it is no need to check rsvd for hugetlbfs pages.
> 
> s/no need/not necessary/
> 
> > +		 */
> > +		if (!dma->lock_cap &&
> > +		    current->mm->locked_vm + lock_acct + contiguous_npage > limit)
> {
> > +			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > +				 __func__, limit << PAGE_SHIFT);
> > +			ret = -ENOMEM;
> > +			goto unpin_out;
> > +		}
> > +		/*
> > +		 * We got a hugetlbfs page using vaddr_get_pfn alreadly.
> > +		 * In this case,we do not need to alloc pages and we can finish all
> > +		 * work by a single operation to the head page.
> > +		 */
> > +		lock_acct += contiguous_npage;
> > +		head = compound_head(pfn_to_page(*pfn_base));
> > +		atomic_add(contiguous_npage, compound_pincount_ptr(head));
> > +		page_ref_add(head, contiguous_npage);
> > +		mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED,
> contiguous_npage);
> > +		pinned += contiguous_npage;
> > +		goto out;
> 
> I'm hoping Peter or Andrea understand this, but I think we still have pfn_base
> pinned separately and I don't see that we've done an unpin anywhere, so are we
> leaking the pin of the first page??
> 
> > +	}
> > +slow_path:
> >  	/* Lock all the consecutive pages from pfn_base */
> >  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> >  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { @@ -569,7
> > +743,30 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma,
> > dma_addr_t iova,  {
> >  	long unlocked = 0, locked = 0;
> >  	long i;
> > +	enum vfio_hugetlbpage_type type;
> > +
> > +	if (is_hugetlbpage(pfn, &type)) {
> > +		unsigned long hugetlb_resdual_npage = 0;
> > +		unsigned long contiguous_npage = 0;
> 
> Unnecessary initialization...
> 
> >
> > +		hugetlb_resdual_npage = hugetlb_get_resdual_pages(iova &
> > +~(PAGE_SIZE - 1), type);
> 
> PAGE_MASK
> 
> Like above, is it pfn or iova that we should be using when looking at hugetlbfs
> alignment?
> 
> > +		contiguous_npage = hugetlb_get_contiguous_pages_num(dma, pfn,
> > +						hugetlb_resdual_npage, npage);
> > +		/*
> > +		 * There is not enough contiguous pages or this hugetlbfs page
> > +		 * has been pinned.
> > +		 * Let's try the slow path.
> > +		 */
> > +		if (!contiguous_npage)
> > +			goto slow_path;
> > +
> > +		/* try the slow path if failed */
> > +		if (hugetlb_put_pfn(pfn, contiguous_npage, dma->prot)) {
> > +			unlocked = contiguous_npage;
> > +			goto out;
> > +		}
> 
> Should probably break the pin path into a separate get_pfn function for
> symmetry.
> 
> 
> > +	}
> > +slow_path:
> >  	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> >  		if (put_pfn(pfn++, dma->prot)) {
> >  			unlocked++;
> > @@ -578,6 +775,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma
> *dma, dma_addr_t iova,
> >  		}
> >  	}
> >
> > +out:
> >  	if (do_accounting)
> >  		vfio_lock_acct(dma, locked - unlocked, true);
> >
> > @@ -867,6 +1065,7 @@ static long vfio_unmap_unpin(struct vfio_iommu
> *iommu, struct vfio_dma *dma,
> >  	struct iommu_iotlb_gather iotlb_gather;
> >  	int unmapped_region_cnt = 0;
> >  	long unlocked = 0;
> > +	enum vfio_hugetlbpage_type type;
> >
> >  	if (!dma->size)
> >  		return 0;
> > @@ -900,16 +1099,33 @@ static long vfio_unmap_unpin(struct vfio_iommu
> *iommu, struct vfio_dma *dma,
> >  			continue;
> >  		}
> >
> > -		/*
> > -		 * To optimize for fewer iommu_unmap() calls, each of which
> > -		 * may require hardware cache flushing, try to find the
> > -		 * largest contiguous physical memory chunk to unmap.
> > -		 */
> > -		for (len = PAGE_SIZE;
> > -		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
> > -			next = iommu_iova_to_phys(domain->domain, iova + len);
> > -			if (next != phys + len)
> > -				break;
> > +		if (is_hugetlbpage((phys >> PAGE_SHIFT), &type)
> > +		    && (!domain->fgsp)) {
> 
> Reverse the order of these tests.
> 
> > +			unsigned long hugetlb_resdual_npage = 0;
> > +			unsigned long contiguous_npage = 0;
> 
> 
> Unnecessary...
> 
> > +			hugetlb_resdual_npage =
> > +				hugetlb_get_resdual_pages(iova & ~(PAGE_SIZE - 1), type);
> 
> PAGE_MASK
> 
> > +			/*
> > +			 * The number of contiguous page can not be larger than
> dma->size
> > +			 * which is the number of pages pinned.
> > +			 */
> > +			contiguous_npage = ((dma->size >> PAGE_SHIFT) >
> hugetlb_resdual_npage) ?
> > +				hugetlb_resdual_npage : (dma->size >> PAGE_SHIFT);
> 
> min()
> 
> > +
> > +			len = contiguous_npage * PAGE_SIZE;
> > +		} else {
> > +			/*
> > +			 * To optimize for fewer iommu_unmap() calls, each of which
> > +			 * may require hardware cache flushing, try to find the
> > +			 * largest contiguous physical memory chunk to unmap.
> > +			 */
> > +			for (len = PAGE_SIZE;
> > +			     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
> > +				next = iommu_iova_to_phys(domain->domain, iova + len);
> > +				if (next != phys + len)
> > +					break;
> > +			}
> >  		}
> >
> >  		/*
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h index
> > 38d3c6a8d..91ef2058f 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -214,4 +214,24 @@ extern int vfio_virqfd_enable(void *opaque,
> >  			      void *data, struct virqfd **pvirqfd, int fd);  extern void
> > vfio_virqfd_disable(struct virqfd **pvirqfd);
> >
> > +enum vfio_hugetlbpage_type {
> > +	vfio_hugetlbpage_2M,
> > +	vfio_hugetlbpage_1G,
> > +};
> > +
> > +struct vfio_hupetlbpage_info {
> > +	enum vfio_hugetlbpage_type type;
> 
> The enum within the structure serves no purpose.
> 
> > +	unsigned long size;
> > +	unsigned long mask;
> > +};
> > +
> > +#define PMD_ORDER 9
> > +#define PUD_ORDER 18
> 
> Architecture specific.
> 
> > +/*
> > + * get the number of resdual 4KB-pages in a hugetlbfs page
> > + * (including the page which pointed by this address)
> 
> s/4KB/PAGE_SIZE/
> 
> > + */
> > +#define hugetlb_get_resdual_pages(address, type)				\
> > +		((vfio_hugetlbpage_info[type].size				\
> > +		- (address & ~vfio_hugetlbpage_info[type].mask)) >> PAGE_SHIFT)
> >  #endif /* VFIO_H */



* Re: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
  2020-07-20 22:46 ` Alex Williamson
  2020-07-21 13:28   ` Zhoujian (jay)
@ 2020-07-22  0:48   ` Peter Xu
  1 sibling, 0 replies; 7+ messages in thread
From: Peter Xu @ 2020-07-22  0:48 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jay Zhou, linux-kernel, kvm, cohuck, maoming.maoming,
	weidong.huang, Andrea Arcangeli

On Mon, Jul 20, 2020 at 04:46:03PM -0600, Alex Williamson wrote:
> > +		/*
> > +		 * We got a hugetlbfs page using vaddr_get_pfn alreadly.
> > +		 * In this case,we do not need to alloc pages and we can finish all
> > +		 * work by a single operation to the head page.
> > +		 */
> > +		lock_acct += contiguous_npage;
> > +		head = compound_head(pfn_to_page(*pfn_base));
> > +		atomic_add(contiguous_npage, compound_pincount_ptr(head));
> > +		page_ref_add(head, contiguous_npage);
> > +		mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED, contiguous_npage);
> > +		pinned += contiguous_npage;
> > +		goto out;
> 
> I'm hoping Peter or Andrea understand this, but I think we still have
> pfn_base pinned separately and I don't see that we've done an unpin
> anywhere, so are we leaking the pin of the first page??

I'm not very familiar with that either, however it seems to me most of the
above chunk was already done in the gup core.  For hugetlbfs, imho it should
be where follow_hugetlb_page() calls try_grab_page().  Thanks,

-- 
Peter Xu



* Re: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
  2020-07-21 13:28   ` Zhoujian (jay)
@ 2020-08-13  6:11     ` Maoming (maoming, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 7+ messages in thread
From: Maoming (maoming, Cloud Infrastructure Service Product Dept.) @ 2020-08-13  6:11 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel, kvm, cohuck, Huangweidong (C),
	Peter Xu, Andrea Arcangeli, Zhoujian (jay)

Hi, Alex
Thanks for taking a look. Also, my apologies for the slow reply.
Some replies below:
-----Original Message-----
From: Zhoujian (jay) 
Sent: July 21, 2020 21:28
To: Alex Williamson <alex.williamson@redhat.com>
Cc: linux-kernel@vger.kernel.org; kvm@vger.kernel.org; cohuck@redhat.com; Maoming (maoming, Cloud Infrastructure Service Product Dept.) <maoming.maoming@huawei.com>; Huangweidong (C) <weidong.huang@huawei.com>; Peter Xu <peterx@redhat.com>; Andrea Arcangeli <aarcange@redhat.com>
Subject: RE: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages

Thanks for taking a close look at the code, Alex.
We'll check them one by one ASAP.

> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, July 21, 2020 6:46 AM
> To: Zhoujian (jay) <jianjay.zhou@huawei.com>
> Cc: linux-kernel@vger.kernel.org; kvm@vger.kernel.org; 
> cohuck@redhat.com; Maoming (maoming, Cloud Infrastructure Service 
> Product Dept.) <maoming.maoming@huawei.com>; Huangweidong (C) 
> <weidong.huang@huawei.com>; Peter Xu <peterx@redhat.com>; Andrea 
> Arcangeli <aarcange@redhat.com>
> Subject: Re: [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages
> 
> On Mon, 20 Jul 2020 16:29:47 +0800
> Jay Zhou <jianjay.zhou@huawei.com> wrote:
> 
> > From: Ming Mao <maoming.maoming@huawei.com>
> >
> > Hi all,
> > I'm working on starting lots of big size Virtual Machines(memory:
> > >128GB) with VFIO-devices. And I encounter a problem that is the
> > waiting time of starting all Virtual Machines is too long. I analyze 
> > the startup log and find that the time of pinning/unpinning pages 
> > could be reduced.
> >
> > In the original process, to make sure the pages are contiguous, we 
> > have to check all pages one by one. I think maybe we can use 
> > hugetlbfs pages which can skip this step.
> > So I create a patch to do this.
> > According to my test, the result of this patch is pretty well.
> >
> > Virtual Machine: 50G memory, 32 CPU, 1 VFIO-device, 1G hugetlbfs page
> >         original   after optimization
> > pin time   700ms          0.1ms
> >
> > I Suppose that:
> > 1)the hugetlbfs page should not be split 2)PG_reserved is not 
> > relevant for hugetlbfs pages 3)we can delete the for loops and use 
> > some operations (such as atomic_add,page_ref_add) instead
> >
> > please correct me if I am wrong.
> >
> > Thanks.
> >
> > Signed-off-by: Ming Mao <maoming.maoming@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 236 ++++++++++++++++++++++++++++++--
> >  include/linux/vfio.h            |  20 +++
> >  2 files changed, 246 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c 
> > b/drivers/vfio/vfio_iommu_type1.c index 5e556ac91..42e25752e 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -415,6 +415,46 @@ static int put_pfn(unsigned long pfn, int prot)
> >  	return 0;
> >  }
> >
> > +/*
> > + * put pfns for a hugetlbfs page
> > + * @start:the 4KB-page we start to put,can be any page in this 
> > +hugetlbfs page
> > + * @npage:the number of 4KB-pages need to put
> 
> This code supports systems where PAGE_SIZE is not 4KB.
> 

Yes, I will fix this problem


> > + * @prot:IOMMU_READ/WRITE
> > + */
> > +static int hugetlb_put_pfn(unsigned long start, unsigned int npage, 
> > +int prot) {
> > +	struct page *page = NULL;
> > +	struct page *head = NULL;
> 
> Unnecessary initialization.
>
Yes, you are right.


 
> > +
> > +	if (!npage || !pfn_valid(start))
> > +		return 0;
> > +
> > +	page = pfn_to_page(start);
> > +	if (!page || !PageHuge(page))
> > +		return 0;
> > +	head = compound_head(page);
> > +	/*
> > +	 * The last page should be in this hugetlbfs page.
> > +	 * The number of putting pages should be equal to the number
> > +	 * of getting pages.So the hugepage pinned refcount and the normal
> > +	 * page refcount can not be smaller than npage.
> > +	 */
> > +	if ((head != compound_head(pfn_to_page(start + npage - 1)))
> > +	    || (page_ref_count(head) < npage)
> > +	    || (compound_pincount(page) < npage))
> > +		return 0;
> > +
> > +	if ((prot & IOMMU_WRITE) && !PageDirty(page))
> > +		set_page_dirty_lock(page);
> > +
> > +	atomic_sub(npage, compound_pincount_ptr(head));
> > +	if (page_ref_sub_and_test(head, npage))
> > +		__put_page(head);
> > +
> > +	mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_RELEASED,
> npage);
> > +	return 1;
> > +}
> > +
> >  static int follow_fault_pfn(struct vm_area_struct *vma, struct 
> > mm_struct
> *mm,
> >  			    unsigned long vaddr, unsigned long *pfn,
> >  			    bool write_fault)
> > @@ -479,6 +519,90 @@ static int vaddr_get_pfn(struct mm_struct *mm,
> unsigned long vaddr,
> >  	return ret;
> >  }
> >
> > +struct vfio_hupetlbpage_info vfio_hugetlbpage_info[HUGE_MAX_HSTATE] = {
> > +	{vfio_hugetlbpage_2M, PMD_SIZE, ~((1ULL << HPAGE_PMD_SHIFT) - 1)},
> > +	{vfio_hugetlbpage_1G, PUD_SIZE, ~((1ULL << HPAGE_PUD_SHIFT) - 1)},
> 
> Other architectures support more huge page sizes, also 0-day 
> identified these #defines don't exist when THP is not configured.  But 
> why couldn't we figure out all of these form the compound_order()?  
> order == shift, size = PAGE_SIZE << order.
> 

Yes, compound_order() is better.
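
For illustration, a minimal sketch of that approach (a hypothetical helper, not part
of the posted patch), deriving the huge page size and mask from the head page instead
of the fixed 2M/1G table:

	/* hypothetical replacement for the vfio_hugetlbpage_info table */
	static bool vfio_hugetlb_geometry(unsigned long pfn,
					  unsigned long *size,
					  unsigned long *mask)
	{
		struct page *head;
		unsigned int order;

		if (!pfn_valid(pfn))
			return false;

		head = compound_head(pfn_to_page(pfn));
		if (!PageHuge(head))
			return false;

		order = compound_order(head);	/* huge_page_shift - PAGE_SHIFT */
		*size = PAGE_SIZE << order;	/* works for any hugetlbfs size */
		*mask = ~(*size - 1UL);		/* huge-page-aligned address mask */
		return true;
	}

This would also cover architectures with huge page sizes other than 2M/1G, and it
does not depend on the THP HPAGE_* defines.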


> > +};
> > +
> > +static bool is_hugetlbpage(unsigned long pfn, enum 
> > +vfio_hugetlbpage_type *type) {
> > +	struct page *page = NULL;
> 
> Unnecessary initialization.
> 
Yes, you are right.



> > +
> > +	if (!pfn_valid(pfn) || !type)
> > +		return false;
> > +
> > +	page = pfn_to_page(pfn);
> > +	/* only check for hugetlbfs pages */
> > +	if (!page || !PageHuge(page))
> > +		return false;
> > +
> > +	switch (compound_order(compound_head(page))) {
> > +	case PMD_ORDER:
> > +		*type = vfio_hugetlbpage_2M;
> > +		break;
> > +	case PUD_ORDER:
> > +		*type = vfio_hugetlbpage_1G;
> > +		break;
> > +	default:
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +/* Is the addr in the last page in hugetlbfs pages? */ static bool 
> > +hugetlb_is_last_page(unsigned long addr, enum vfio_hugetlbpage_type
> > +type) {
> > +	unsigned int num = 0;
> 
> Unnecessary initialization, and in fact unnecessary variable altogether.
> ie.
> 
> 	return hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type) == 1;
> 
> 
> > +
> > +	num = hugetlb_get_resdual_pages(addr & ~(PAGE_SIZE - 1), type);
> 
> residual?
> 

Yes, I will fix this problem


> > +
> > +	if (num == 1)
> > +		return true;
> > +	else
> > +		return false;
> > +}
> > +
> > +static bool hugetlb_page_is_pinned(struct vfio_dma *dma,
> > +				unsigned long start,
> > +				unsigned long npages)
> > +{
> > +	struct vfio_pfn *vpfn = NULL;
> 
> Unnecessary initialization.
> 
Yes, you are right.


> > +	struct rb_node *node = rb_first(&dma->pfn_list);
> > +	unsigned long end = start + npages - 1;
> > +
> > +	for (; node; node = rb_next(node)) {
> > +		vpfn = rb_entry(node, struct vfio_pfn, node);
> > +
> > +		if ((vpfn->pfn >= start) && (vpfn->pfn <= end))
> > +			return true;
> 
> This function could be named better, it suggests the hugetlbfs page is 
> pinned, but really we're only looking for any pfn_list pinnings overlapping the pfn range.
> 

Yes, you are right.




> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +static unsigned int hugetlb_get_contiguous_pages_num(struct 
> > +vfio_dma
> *dma,
> > +						unsigned long pfn,
> > +						unsigned long resdual_npage,
> > +						unsigned long max_npage)
> > +{
> > +	unsigned int num = 0;
> 
> Unnecessary initialization
> 


Yes, you are right.



> > +
> > +	if (!dma)
> > +		return 0;
> > +
> > +	num = resdual_npage < max_npage ? resdual_npage : max_npage;
> 
> min(resdual_npage, max_npage)
> 
> 

Yes, min() is better.



> > +	/*
> > +	 * If there is only one page, it is no need to optimize them.
> 
> s/no need/not necessary/
> 

I will fix this problem

> > +	 * Maybe some pages have been pinned and inserted into 
> > +dma->pfn_list by
> others.
> > +	 * In this case, we just goto the slow path simply.
> > +	 */
> > +	if ((num < 2) || hugetlb_page_is_pinned(dma, pfn, num))
> > +		return 0;
> 
> Why does having pinnings in the pfn_list disqualify it from hugetlbfs pinning?
> 
> Testing for the last page here is redundant to the pinning path, 
> should it only be done here?  Can num be zero?
> 
> Maybe better to return -errno for this and above zero conditions 
> rather than return a value that doesn't reflect the purpose of the function?
> 


All pages found in pfn_list have already been pinned and counted externally,
so lock_acct should not be incremented for them.
I thought it would take a long time to figure out how many pages of this hugetlb
page have already been pinned.
But since dma->pfn_list is empty in most cases, that cost does not matter,
so I will modify the code to count the number of pages pinned externally.
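
For illustration, a minimal sketch of such a count (a hypothetical helper built on the
same rb-tree walk as hugetlb_page_is_pinned(); the name is made up):

	/* count pfn_list entries that fall inside [start, start + npages) */
	static unsigned long vfio_count_external_pins(struct vfio_dma *dma,
						      unsigned long start,
						      unsigned long npages)
	{
		struct rb_node *node;
		unsigned long end = start + npages - 1;
		unsigned long count = 0;

		for (node = rb_first(&dma->pfn_list); node; node = rb_next(node)) {
			struct vfio_pfn *vpfn = rb_entry(node, struct vfio_pfn, node);

			/* already pinned and accounted via vfio_pin_pages() */
			if (vpfn->pfn >= start && vpfn->pfn <= end)
				count++;
		}

		return count;
	}

The result could then be subtracted from the pages added to lock_acct on the fast path.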

It is a special case when the vaddr is in the last page, because the next page is in
another hugetlb page, and hugetlb pages may not be contiguous.
But, as you said, the check is redundant with the pinning path.
I will rewrite this function.


> > +
> > +	return num;
> > +}
> > +
> >  /*
> >   * Attempt to pin pages.  We really don't want to track all the pfns and
> >   * the iommu can only map chunks of consecutive pfns anyway, so get 
> > the @@ -492,6 +616,7 @@ static long vfio_pin_pages_remote(struct 
> > vfio_dma
> *dma, unsigned long vaddr,
> >  	long ret, pinned = 0, lock_acct = 0;
> >  	bool rsvd;
> >  	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> > +	enum vfio_hugetlbpage_type type;
> >
> >  	/* This code path is only user initiated */
> >  	if (!current->mm)
> > @@ -521,6 +646,55 @@ static long vfio_pin_pages_remote(struct 
> > vfio_dma
> *dma, unsigned long vaddr,
> >  	if (unlikely(disable_hugepages))
> >  		goto out;
> >
> > +	/*
> > +	 * It is no need to get pages one by one for hugetlbfs pages.
> 
> s/no need/not necessary/
> 
I will fix this problem


> > +	 * 4KB-pages in hugetlbfs pages are contiguous.
> > +	 * But if the vaddr is in the last 4KB-page, we just goto the slow path.
> 
> 
> s/4KB-/PAGE_SIZE /
> 
I will fix this problem


> Please explain the significance of vaddr being in the last PAGE_SIZE 
> page of a hugetlbfs page.  Is it simply that we should take the slow 
> path for mapping a single page before the end of the hugetlbfs page?
> Is this optimization worthwhile?  Isn't it more consistent to handle 
> all mappings over hugetlbfs pages the same?  Should we be operating on 
> vaddr here for the hugetlbfs page alignment or base_pfn?
> 


If the vaddr is in the last page, the next page is in another hugetlb page.
In the new patch, I will check the addresses of all hugetlb pages. If the vaddr is in the
last page of a hugetlb page and the next hugetlb page is physically contiguous, I will take
the fast path.

Yes, the vaddr should be aligned.


> > +	 */
> > +	if (is_hugetlbpage(*pfn_base, &type) && 
> > +!hugetlb_is_last_page(vaddr,
> type)) {
> > +		unsigned long hugetlb_resdual_npage = 0;
> > +		unsigned long contiguous_npage = 0;
> > +		struct page *head = NULL;
> > +
> > +		hugetlb_resdual_npage =
> > +			hugetlb_get_resdual_pages((vaddr + PAGE_SIZE) & ~(PAGE_SIZE -
> 1),
> > +type);
> 
> ~(PAGE_SIZE - 1) is PAGE_MASK, but that whole operation looks like a
> PAGE_ALIGN(vaddr)
> 
> This is trying to get the number of pages after this page to the end 
> of the hugetlbfs page, right?  Rather than the hugetlb_is_last_page() 
> above, shouldn't we have recorded the number of pages there and used math to get this value?
> 

Yes, you are right. I will fix this problem
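
A minimal sketch of that arithmetic (assuming the standard PAGE_MASK macro;
following_npage is an illustrative name), computing the residual count once at the
hugetlb_is_last_page() check and reusing it here:

	/* pages left in the hugetlbfs page, including the one holding vaddr */
	resdual_npage = hugetlb_get_resdual_pages(vaddr & PAGE_MASK, type);

	/*
	 * Pages that follow vaddr inside the same hugetlbfs page; the
	 * "last page" test then becomes (resdual_npage == 1) and no second
	 * lookup on the page-aligned address is needed.
	 */
	following_npage = resdual_npage - 1;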


> > +		/*
> > +		 * Maybe the hugetlb_resdual_npage is invalid.
> > +		 * For example, hugetlb_resdual_npage > (npage - 1) or
> > +		 * some pages of this hugetlbfs page have been pinned.
> > +		 */
> > +		contiguous_npage = hugetlb_get_contiguous_pages_num(dma,
> *pfn_base + 1,
> > +						hugetlb_resdual_npage, npage - 1);
> > +		if (!contiguous_npage)
> > +			goto slow_path;
> > +
> > +		/*
> > +		 * Unlike THP, the splitting should not happen for hugetlbfs pages.
> > +		 * Since PG_reserved is not relevant for compound pages, and the 
> > +pfn
> of
> > +		 * 4KB-page which in hugetlbfs pages is valid,
> 
> s/4KB/PAGE_SIZE/
> 

I will fix this problem

> > +		 * it is no need to check rsvd for hugetlbfs pages.
> 
> s/no need/not necessary/
> 

I will fix this problem

> > +		 */
> > +		if (!dma->lock_cap &&
> > +		    current->mm->locked_vm + lock_acct + contiguous_npage > 
> > +limit)
> {
> > +			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > +				 __func__, limit << PAGE_SHIFT);
> > +			ret = -ENOMEM;
> > +			goto unpin_out;
> > +		}
> > +		/*
> > +		 * We got a hugetlbfs page using vaddr_get_pfn alreadly.
> > +		 * In this case,we do not need to alloc pages and we can finish all
> > +		 * work by a single operation to the head page.
> > +		 */
> > +		lock_acct += contiguous_npage;
> > +		head = compound_head(pfn_to_page(*pfn_base));
> > +		atomic_add(contiguous_npage, compound_pincount_ptr(head));
> > +		page_ref_add(head, contiguous_npage);
> > +		mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED,
> contiguous_npage);
> > +		pinned += contiguous_npage;
> > +		goto out;
> 
> I'm hoping Peter or Andrea understand this, but I think we still have 
> pfn_base pinned separately and I don't see that we've done an unpin 
> anywhere, so are we leaking the pin of the first page??
> 

Do you mean a case like this?
(1) ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base)
(2) rsvd is true
(3) something goes wrong and we goto unpin_out
(4) put_pfn() cannot be called for pfn_base because of rsvd



> > +	}
> > +slow_path:
> >  	/* Lock all the consecutive pages from pfn_base */
> >  	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> >  	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { @@ -569,7
> > +743,30 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma,
> > dma_addr_t iova,  {
> >  	long unlocked = 0, locked = 0;
> >  	long i;
> > +	enum vfio_hugetlbpage_type type;
> > +
> > +	if (is_hugetlbpage(pfn, &type)) {
> > +		unsigned long hugetlb_resdual_npage = 0;
> > +		unsigned long contiguous_npage = 0;
> 
> Unnecessary initialization...
> 
> >

Yes, you are right

> > +		hugetlb_resdual_npage = hugetlb_get_resdual_pages(iova & 
> > +~(PAGE_SIZE - 1), type);
> 
> PAGE_MASK
> 

I will fix this problem


> Like above, is it pfn or iova that we should be using when looking at 
> hugetlbfs alignment?
> 

Yes, I will fix this problem



> > +		contiguous_npage = hugetlb_get_contiguous_pages_num(dma, pfn,
> > +						hugetlb_resdual_npage, npage);
> > +		/*
> > +		 * There is not enough contiguous pages or this hugetlbfs page
> > +		 * has been pinned.
> > +		 * Let's try the slow path.
> > +		 */
> > +		if (!contiguous_npage)
> > +			goto slow_path;
> > +
> > +		/* try the slow path if failed */
> > +		if (hugetlb_put_pfn(pfn, contiguous_npage, dma->prot)) {
> > +			unlocked = contiguous_npage;
> > +			goto out;
> > +		}
> 
> Should probably break the pin path into a separate get_pfn function 
> for symmetry.
> 
> 

Yes, I will fix this problem
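
For illustration, a minimal sketch of such a helper (the name hugetlb_get_pfn() is
hypothetical), mirroring hugetlb_put_pfn() and the head-page accounting the pin path
currently does inline:

	/* pin @npage PAGE_SIZE pages of the hugetlbfs page backing @pfn */
	static int hugetlb_get_pfn(unsigned long pfn, unsigned int npage)
	{
		struct page *head;

		if (!npage || !pfn_valid(pfn))
			return 0;

		head = compound_head(pfn_to_page(pfn));
		if (!PageHuge(head))
			return 0;

		/* one update on the head page covers the whole contiguous range */
		atomic_add(npage, compound_pincount_ptr(head));
		page_ref_add(head, npage);
		mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED, npage);
		return 1;
	}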


> > +	}
> > +slow_path:
> >  	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> >  		if (put_pfn(pfn++, dma->prot)) {
> >  			unlocked++;
> > @@ -578,6 +775,7 @@ static long vfio_unpin_pages_remote(struct 
> > vfio_dma
> *dma, dma_addr_t iova,
> >  		}
> >  	}
> >
> > +out:
> >  	if (do_accounting)
> >  		vfio_lock_acct(dma, locked - unlocked, true);
> >
> > @@ -867,6 +1065,7 @@ static long vfio_unmap_unpin(struct vfio_iommu
> *iommu, struct vfio_dma *dma,
> >  	struct iommu_iotlb_gather iotlb_gather;
> >  	int unmapped_region_cnt = 0;
> >  	long unlocked = 0;
> > +	enum vfio_hugetlbpage_type type;
> >
> >  	if (!dma->size)
> >  		return 0;
> > @@ -900,16 +1099,33 @@ static long vfio_unmap_unpin(struct 
> > vfio_iommu
> *iommu, struct vfio_dma *dma,
> >  			continue;
> >  		}
> >
> > -		/*
> > -		 * To optimize for fewer iommu_unmap() calls, each of which
> > -		 * may require hardware cache flushing, try to find the
> > -		 * largest contiguous physical memory chunk to unmap.
> > -		 */
> > -		for (len = PAGE_SIZE;
> > -		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
> > -			next = iommu_iova_to_phys(domain->domain, iova + len);
> > -			if (next != phys + len)
> > -				break;
> > +		if (is_hugetlbpage((phys >> PAGE_SHIFT), &type)
> > +		    && (!domain->fgsp)) {
> 
> Reverse the order of these tests.
> 
OK, I will fix this problem


> > +			unsigned long hugetlb_resdual_npage = 0;
> > +			unsigned long contiguous_npage = 0;
> 
> 
> Unnecessary...
> 

Yes, I will fix this problem

> > +			hugetlb_resdual_npage =
> > +				hugetlb_get_resdual_pages(iova & ~(PAGE_SIZE - 1), type);
> 
> PAGE_MASK
> 

Yes, I will fix this problem


> > +			/*
> > +			 * The number of contiguous page can not be larger than
> dma->size
> > +			 * which is the number of pages pinned.
> > +			 */
> > +			contiguous_npage = ((dma->size >> PAGE_SHIFT) >
> hugetlb_resdual_npage) ?
> > +				hugetlb_resdual_npage : (dma->size >> PAGE_SHIFT);
> 
> min()
> 


Yes, min() is better.
> > +
> > +			len = contiguous_npage * PAGE_SIZE;
> > +		} else {
> > +			/*
> > +			 * To optimize for fewer iommu_unmap() calls, each of which
> > +			 * may require hardware cache flushing, try to find the
> > +			 * largest contiguous physical memory chunk to unmap.
> > +			 */
> > +			for (len = PAGE_SIZE;
> > +			     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
> > +				next = iommu_iova_to_phys(domain->domain, iova + len);
> > +				if (next != phys + len)
> > +					break;
> > +			}
> >  		}
> >
> >  		/*
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 
> > 38d3c6a8d..91ef2058f 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -214,4 +214,24 @@ extern int vfio_virqfd_enable(void *opaque,
> >  			      void *data, struct virqfd **pvirqfd, int fd);  extern void 
> > vfio_virqfd_disable(struct virqfd **pvirqfd);
> >
> > +enum vfio_hugetlbpage_type {
> > +	vfio_hugetlbpage_2M,
> > +	vfio_hugetlbpage_1G,
> > +};
> > +
> > +struct vfio_hupetlbpage_info {
> > +	enum vfio_hugetlbpage_type type;
> 
> The enum within the structure serves no purpose.
> 

Yes, I will delete it


> > +	unsigned long size;
> > +	unsigned long mask;
> > +};
> > +
> > +#define PMD_ORDER 9
> > +#define PUD_ORDER 18
> 
> Architecture specific.
> 

I will use compound_order()

> > +/*
> > + * get the number of resdual 4KB-pages in a hugetlbfs page
> > + * (including the page which pointed by this address)
> 
> s/4KB/PAGE_SIZE/
> 

Yes, I will fix this problem


> > + */
> > +#define hugetlb_get_resdual_pages(address, type)				\
> > +		((vfio_hugetlbpage_info[type].size				\
> > +		- (address & ~vfio_hugetlbpage_info[type].mask)) >> PAGE_SHIFT)
> >  #endif /* VFIO_H */


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2020-08-13  6:11 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-20  8:29 [PATCH] vfio dma_map/unmap: optimized for hugetlbfs pages Jay Zhou
2020-07-20 14:24 ` kernel test robot
2020-07-20 22:46 ` Alex Williamson
2020-07-21 13:28   ` Zhoujian (jay)
2020-08-13  6:11     ` Re: " Maoming (maoming, Cloud Infrastructure Service Product Dept.)
2020-07-22  0:48   ` Peter Xu
2020-07-20 23:38 ` kernel test robot
