KVM Archive on lore.kernel.org
 help / color / Atom feed
From: Daniel Jordan <daniel.m.jordan@oracle.com>
To: linux-mm@kvack.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: aarcange@redhat.com, aaron.lu@intel.com,
	akpm@linux-foundation.org, alex.williamson@redhat.com,
	bsd@redhat.com, daniel.m.jordan@oracle.com,
	darrick.wong@oracle.com, dave.hansen@linux.intel.com,
	jgg@mellanox.com, jwadams@google.com, jiangshanlai@gmail.com,
	mhocko@kernel.org, mike.kravetz@oracle.com,
	Pavel.Tatashin@microsoft.com, prasad.singamsetty@oracle.com,
	rdunlap@infradead.org, steven.sistare@oracle.com,
	tim.c.chen@intel.com, tj@kernel.org, vbabka@suse.cz
Subject: [RFC PATCH v4 06/13] vfio: parallelize vfio_pin_map_dma
Date: Mon,  5 Nov 2018 11:55:51 -0500
Message-ID: <20181105165558.11698-7-daniel.m.jordan@oracle.com> (raw)
In-Reply-To: <20181105165558.11698-1-daniel.m.jordan@oracle.com>

When starting a large-memory kvm guest, it takes an excessively long
time to start the boot process because qemu must pin all guest pages to
accommodate DMA when VFIO is in use.  Currently just one CPU is
responsible for the page pinning, which usually boils down to page
clearing time-wise, so the ways to optimize this are buying a faster
CPU ;-) or using more of the CPUs you already have.

Parallelize with ktask.  Refactor so workqueue workers pin with the mm
of the calling thread, and to enable an undo callback for ktask to
handle errors during page pinning.

Performance results appear later in the series.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 drivers/vfio/vfio_iommu_type1.c | 106 +++++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 30 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d9fd3188615d..e7cfbf0c8071 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -41,6 +41,7 @@
 #include <linux/notifier.h>
 #include <linux/dma-iommu.h>
 #include <linux/irqdomain.h>
+#include <linux/ktask.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -395,7 +396,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
  */
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 				  long npage, unsigned long *pfn_base,
-				  unsigned long limit)
+				  unsigned long limit, struct mm_struct *mm)
 {
 	unsigned long pfn = 0;
 	long ret, pinned = 0, lock_acct = 0;
@@ -403,10 +404,10 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
 
 	/* This code path is only user initiated */
-	if (!current->mm)
+	if (!mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
+	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
 	if (ret)
 		return ret;
 
@@ -418,7 +419,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	 * pages are already counted against the user.
 	 */
 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
+		if (!dma->lock_cap && mm->locked_vm + 1 > limit) {
 			put_pfn(*pfn_base, dma->prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 					limit << PAGE_SHIFT);
@@ -433,7 +434,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	/* Lock all the consecutive pages from pfn_base */
 	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
 	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
-		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
+		ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
 		if (ret)
 			break;
 
@@ -445,7 +446,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 
 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 			if (!dma->lock_cap &&
-			    current->mm->locked_vm + lock_acct + 1 > limit) {
+			    mm->locked_vm + lock_acct + 1 > limit) {
 				put_pfn(pfn, dma->prot);
 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 					__func__, limit << PAGE_SHIFT);
@@ -752,15 +753,15 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
 }
 
 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			     dma_addr_t iova, dma_addr_t end,
 			     bool do_accounting)
 {
-	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 	struct vfio_domain *domain, *d;
 	LIST_HEAD(unmapped_region_list);
 	int unmapped_region_cnt = 0;
 	long unlocked = 0;
 
-	if (!dma->size)
+	if (iova == end)
 		return 0;
 
 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
@@ -777,7 +778,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 				      struct vfio_domain, next);
 
 	list_for_each_entry_continue(d, &iommu->domain_list, next) {
-		iommu_unmap(d->domain, dma->iova, dma->size);
+		iommu_unmap(d->domain, iova, end - iova);
 		cond_resched();
 	}
 
@@ -818,8 +819,6 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		}
 	}
 
-	dma->iommu_mapped = false;
-
 	if (unmapped_region_cnt)
 		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
 
@@ -830,14 +829,21 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	return unlocked;
 }
 
-static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
+static void vfio_remove_dma_finish(struct vfio_iommu *iommu,
+				   struct vfio_dma *dma)
 {
-	vfio_unmap_unpin(iommu, dma, true);
+	dma->iommu_mapped = false;
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);
 	kfree(dma);
 }
 
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
+{
+	vfio_unmap_unpin(iommu, dma, dma->iova, dma->iova + dma->size, true);
+	vfio_remove_dma_finish(iommu, dma);
+}
+
 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
 	struct vfio_domain *domain;
@@ -1031,20 +1037,29 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 	return ret;
 }
 
-static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
-			    size_t map_size)
+struct vfio_pin_args {
+	struct vfio_iommu *iommu;
+	struct vfio_dma *dma;
+	unsigned long limit;
+	struct mm_struct *mm;
+};
+
+static int vfio_pin_map_dma_chunk(unsigned long start_vaddr,
+				  unsigned long end_vaddr,
+				  struct vfio_pin_args *args)
 {
-	dma_addr_t iova = dma->iova;
-	unsigned long vaddr = dma->vaddr;
-	size_t size = map_size;
+	struct vfio_dma *dma = args->dma;
+	dma_addr_t iova = dma->iova + (start_vaddr - dma->vaddr);
+	unsigned long unmapped_size = end_vaddr - start_vaddr;
+	unsigned long pfn, mapped_size = 0;
 	long npage;
-	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	int ret = 0;
 
-	while (size) {
+	while (unmapped_size) {
 		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
-					      size >> PAGE_SHIFT, &pfn, limit);
+		npage = vfio_pin_pages_remote(dma, start_vaddr + mapped_size,
+					      unmapped_size >> PAGE_SHIFT,
+					      &pfn, args->limit, args->mm);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -1052,22 +1067,50 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		}
 
 		/* Map it! */
-		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
-				     dma->prot);
+		ret = vfio_iommu_map(args->iommu, iova + mapped_size, pfn,
+				     npage, dma->prot);
 		if (ret) {
-			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
+			vfio_unpin_pages_remote(dma, iova + mapped_size, pfn,
 						npage, true);
 			break;
 		}
 
-		size -= npage << PAGE_SHIFT;
-		dma->size += npage << PAGE_SHIFT;
+		unmapped_size -= npage << PAGE_SHIFT;
+		mapped_size   += npage << PAGE_SHIFT;
 	}
 
+	return (ret == 0) ? KTASK_RETURN_SUCCESS : ret;
+}
+
+static void vfio_pin_map_dma_undo(unsigned long start_vaddr,
+				  unsigned long end_vaddr,
+				  struct vfio_pin_args *args)
+{
+	struct vfio_dma *dma = args->dma;
+	dma_addr_t iova = dma->iova + (start_vaddr - dma->vaddr);
+	dma_addr_t end  = dma->iova + (end_vaddr   - dma->vaddr);
+
+	vfio_unmap_unpin(args->iommu, args->dma, iova, end, true);
+}
+
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    size_t map_size)
+{
+	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	int ret = 0;
+	struct vfio_pin_args args = { iommu, dma, limit, current->mm };
+	/* Stay on PMD boundary in case THP is being used. */
+	DEFINE_KTASK_CTL(ctl, vfio_pin_map_dma_chunk, &args, PMD_SIZE);
+
+	ktask_ctl_set_undo_func(&ctl, vfio_pin_map_dma_undo);
+	ret = ktask_run((void *)dma->vaddr, map_size, &ctl);
+
 	dma->iommu_mapped = true;
 
 	if (ret)
-		vfio_remove_dma(iommu, dma);
+		vfio_remove_dma_finish(iommu, dma);
+	else
+		dma->size += map_size;
 
 	return ret;
 }
@@ -1229,7 +1272,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 				npage = vfio_pin_pages_remote(dma, vaddr,
 							      n >> PAGE_SHIFT,
-							      &pfn, limit);
+							      &pfn, limit,
+							      current->mm);
 				if (npage <= 0) {
 					WARN_ON(!npage);
 					ret = (int)npage;
@@ -1497,7 +1541,9 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
 		long locked = 0, unlocked = 0;
 
 		dma = rb_entry(n, struct vfio_dma, node);
-		unlocked += vfio_unmap_unpin(iommu, dma, false);
+		unlocked += vfio_unmap_unpin(iommu, dma, dma->iova,
+					     dma->iova + dma->size, false);
+		dma->iommu_mapped = false;
 		p = rb_first(&dma->pfn_list);
 		for (; p; p = rb_next(p)) {
 			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
-- 
2.19.1

  parent reply index

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-05 16:55 [RFC PATCH v4 00/13] ktask: multithread CPU-intensive kernel work Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 01/13] ktask: add documentation Daniel Jordan
2018-11-05 21:19   ` Randy Dunlap
2018-11-06  2:27     ` Daniel Jordan
2018-11-06  8:49   ` Peter Zijlstra
2018-11-06 20:34     ` Daniel Jordan
2018-11-06 20:51       ` Jason Gunthorpe
2018-11-07 10:27         ` Peter Zijlstra
2018-11-07 20:21           ` Daniel Jordan
2018-11-07 10:35       ` Peter Zijlstra
2018-11-07 21:20         ` Daniel Jordan
2018-11-08 17:26   ` Jonathan Corbet
2018-11-08 19:15     ` Daniel Jordan
2018-11-08 19:24       ` Jonathan Corbet
2018-11-27 19:50   ` Pavel Machek
2018-11-28 16:56     ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 02/13] ktask: multithread CPU-intensive kernel work Daniel Jordan
2018-11-05 20:51   ` Randy Dunlap
2018-11-06  2:24     ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 03/13] ktask: add undo support Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 04/13] ktask: run helper threads at MAX_NICE Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 05/13] workqueue, ktask: renice helper threads to prevent starvation Daniel Jordan
2018-11-13 16:34   ` Tejun Heo
2018-11-19 16:45     ` Daniel Jordan
2018-11-20 16:33       ` Tejun Heo
2018-11-20 17:03         ` Daniel Jordan
2018-11-05 16:55 ` Daniel Jordan [this message]
2018-11-05 21:51   ` [RFC PATCH v4 06/13] vfio: parallelize vfio_pin_map_dma Alex Williamson
2018-11-06  2:42     ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 07/13] mm: change locked_vm's type from unsigned long to atomic_long_t Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 08/13] vfio: remove unnecessary mmap_sem writer acquisition around locked_vm Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 09/13] vfio: relieve mmap_sem reader cacheline bouncing by holding it longer Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 10/13] mm: enlarge type of offset argument in mem_map_offset and mem_map_next Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 11/13] mm: parallelize deferred struct page initialization within each node Daniel Jordan
2018-11-10  3:48   ` Elliott, Robert (Persistent Memory)
2018-11-12 16:54     ` Daniel Jordan
2018-11-12 22:15       ` Elliott, Robert (Persistent Memory)
2018-11-19 16:01         ` Daniel Jordan
2018-11-27  0:12           ` Elliott, Robert (Persistent Memory)
2018-11-27 20:23             ` Daniel Jordan
2018-11-19 16:29       ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 12/13] mm: parallelize clear_gigantic_page Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 13/13] hugetlbfs: parallelize hugetlbfs_fallocate with ktask Daniel Jordan
2018-11-05 17:29 ` [RFC PATCH v4 00/13] ktask: multithread CPU-intensive kernel work Michal Hocko
2018-11-06  1:29   ` Daniel Jordan
2018-11-06  9:21     ` Michal Hocko
2018-11-07 20:17       ` Daniel Jordan
2018-11-05 18:49 ` Zi Yan
2018-11-06  2:20   ` Daniel Jordan
2018-11-06  2:48     ` Zi Yan
2018-11-06 19:00       ` Daniel Jordan
2018-11-30 19:18 ` Tejun Heo
2018-12-01  0:13   ` Daniel Jordan
2018-12-03 16:16     ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181105165558.11698-7-daniel.m.jordan@oracle.com \
    --to=daniel.m.jordan@oracle.com \
    --cc=Pavel.Tatashin@microsoft.com \
    --cc=aarcange@redhat.com \
    --cc=aaron.lu@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex.williamson@redhat.com \
    --cc=bsd@redhat.com \
    --cc=darrick.wong@oracle.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=jgg@mellanox.com \
    --cc=jiangshanlai@gmail.com \
    --cc=jwadams@google.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mike.kravetz@oracle.com \
    --cc=prasad.singamsetty@oracle.com \
    --cc=rdunlap@infradead.org \
    --cc=steven.sistare@oracle.com \
    --cc=tim.c.chen@intel.com \
    --cc=tj@kernel.org \
    --cc=vbabka@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

KVM Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/kvm/0 kvm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 kvm kvm/ https://lore.kernel.org/kvm \
		kvm@vger.kernel.org
	public-inbox-index kvm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.kvm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git