linux-kernel.vger.kernel.org archive mirror
From: Andrea Arcangeli <aarcange@redhat.com>
To: qemu-devel@nongnu.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-api@vger.kernel.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>,
	Andres Lagar-Cavilla <andreslc@google.com>,
	Dave Hansen <dave@sr71.net>, Paolo Bonzini <pbonzini@redhat.com>,
	Rik van Riel <riel@redhat.com>, Mel Gorman <mgorman@suse.de>,
	Andy Lutomirski <luto@amacapital.net>,
	Andrew Morton <akpm@linux-foundation.org>,
	Sasha Levin <sasha.levin@oracle.com>,
	Hugh Dickins <hughd@google.com>,
	Peter Feiner <pfeiner@google.com>,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>,
	Christopher Covington <cov@codeaurora.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Android Kernel Team <kernel-team@android.com>,
	Robert Love <rlove@google.com>,
	Dmitry Adamushko <dmitry.adamushko@gmail.com>,
	Neil Brown <neilb@suse.de>, Mike Hommey <mh@glandium.org>,
	Taras Glek <tglek@mozilla.com>, Jan Kara <jack@suse.cz>,
	KOSAKI Motohiro <kosaki.motohiro@gmail.com>,
	Michel Lespinasse <walken@google.com>,
	Minchan Kim <minchan@kernel.org>,
	Keith Packard <keithp@keithp.com>,
	"Huangpeng (Peter)" <peter.huangpeng@huawei.com>,
	Isaku Yamahata <yamahata@valinux.co.jp>,
	Anthony Liguori <anthony@codemonkey.ws>,
	Stefan Hajnoczi <stefanha@gmail.com>,
	Wenchao Xia <wenchaoqemu@gmail.com>,
	Andrew Jones <drjones@redhat.com>,
	Juan Quintela <quintela@redhat.com>
Subject: [PATCH 04/17] mm: gup: make get_user_pages_fast and __get_user_pages_fast latency conscious
Date: Fri,  3 Oct 2014 19:07:54 +0200
Message-ID: <1412356087-16115-5-git-send-email-aarcange@redhat.com>
In-Reply-To: <1412356087-16115-1-git-send-email-aarcange@redhat.com>

This teaches get_user_pages_fast() and __get_user_pages_fast() to
re-enable irqs and call cond_resched(), where possible, after every
BATCH_PAGES pages, so the irq-off latency no longer grows with the
number of pages requested.

The same batching must be implemented by the other archs as well, and
it is a requirement before converting more get_user_pages() callers to
get_user_pages_fast() as an optimization (instead of using
get_user_pages_unlocked(), which would be slower).
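
To make the control flow concrete, here is a minimal sketch of the
batching pattern (not the exact patch code: walk_batch_irq_off() is a
hypothetical stand-in for the per-batch irq-off pagetable walk that the
patch implements as get_user_pages_fast_batch()):

#define BATCH_PAGES	512

/*
 * Sketch only: pin at most BATCH_PAGES pages per irq-off critical
 * section; between batches irqs are back on and we can reschedule.
 */
static int gup_fast_batched(unsigned long start, int nr_pages,
			    int write, struct page **pages)
{
	int nr, batch, ret = 0;
	unsigned long len;

	for (;;) {
		batch = min(nr_pages, BATCH_PAGES);
		len = (unsigned long) batch << PAGE_SHIFT;
		nr = walk_batch_irq_off(start, start + len, write, pages);
		nr_pages -= nr;
		ret += nr;
		if (!nr_pages || nr != batch)
			break;	/* done, or fault: fall back to slow gup */
		start += len;
		pages += batch;
		cond_resched();	/* irqs are on again here */
	}
	return ret;
}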

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/mm/gup.c | 234 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 149 insertions(+), 85 deletions(-)

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 2ab183b..917d8c1 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -12,6 +12,12 @@
 
 #include <asm/pgtable.h>
 
+/*
+ * Keep irq disabled for no more than BATCH_PAGES pages.
+ * Matches PTRS_PER_PTE (or half in non-PAE kernels).
+ */
+#define BATCH_PAGES	512
+
 static inline pte_t gup_get_pte(pte_t *ptep)
 {
 #ifndef CONFIG_X86_PAE
@@ -250,6 +256,40 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 	return 1;
 }
 
+static inline int __get_user_pages_fast_batch(unsigned long start,
+					      unsigned long end,
+					      int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long next;
+	unsigned long flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the page and take a ref on it.
+	 */
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, start);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(start, end);
+		if (pgd_none(pgd))
+			break;
+		if (!gup_pud_range(pgd, start, next, write, pages, &nr))
+			break;
+	} while (pgdp++, start = next, start != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
 /*
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
@@ -257,31 +297,55 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			  struct page **pages)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	unsigned long flags;
-	pgd_t *pgdp;
-	int nr = 0;
+	unsigned long len, end, batch_pages;
+	int nr, ret;
 
 	start &= PAGE_MASK;
-	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
+	/*
+	 * get_user_pages() handles nr_pages == 0 gracefully, but
+	 * gup_fast starts walking the first pagetable in a do {}
+	 * while() fashion so it's not robust to handle nr_pages ==
+	 * 0. There's no point in being permissive about end < start
+	 * either. So this check verifies both nr_pages being non
+	 * zero, and that "end" didn't overflow.
+	 */
+	VM_BUG_ON(end <= start);
 	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
 					(void __user *)start, len)))
 		return 0;
 
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch size
-	 * will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
+	ret = 0;
+	for (;;) {
+		batch_pages = nr_pages;
+		if (batch_pages > BATCH_PAGES && !irqs_disabled())
+			batch_pages = BATCH_PAGES;
+		len = (unsigned long) batch_pages << PAGE_SHIFT;
+		end = start + len;
+		nr = __get_user_pages_fast_batch(start, end, write, pages);
+		VM_BUG_ON(nr > batch_pages);
+		nr_pages -= nr;
+		ret += nr;
+		if (!nr_pages || nr != batch_pages)
+			break;
+		start += len;
+		pages += batch_pages;
+	}
+
+	return ret;
+}
+
+static inline int get_user_pages_fast_batch(unsigned long start,
+					    unsigned long end,
+					    int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+	unsigned long orig_start = start;
+
 	/*
 	 * This doesn't prevent pagetable teardown, but does prevent
 	 * the pagetables and pages from being freed on x86.
@@ -290,18 +354,24 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	 * (which we do on x86, with the above PAE exception), we can follow the
 	 * address down to the the page and take a ref on it.
 	 */
-	local_irq_save(flags);
-	pgdp = pgd_offset(mm, addr);
+	local_irq_disable();
+	pgdp = pgd_offset(mm, start);
 	do {
 		pgd_t pgd = *pgdp;
 
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
+		next = pgd_addr_end(start, end);
+		if (pgd_none(pgd)) {
+			VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
 			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		}
+		if (!gup_pud_range(pgd, start, next, write, pages, &nr)) {
+			VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
 			break;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
+		}
+	} while (pgdp++, start = next, start != end);
+	local_irq_enable();
+
+	cond_resched();
 
 	return nr;
 }
@@ -326,80 +396,74 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	pgd_t *pgdp;
-	int nr = 0;
+	unsigned long len, end, batch_pages;
+	int nr, ret;
+	unsigned long orig_start;
 
 	start &= PAGE_MASK;
-	addr = start;
+	orig_start = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 
 	end = start + len;
-	if (end < start)
-		goto slow_irqon;
+	/*
+	 * get_user_pages() handles nr_pages == 0 gracefully, but
+	 * gup_fast starts walking the first pagetable in a do {}
+	 * while() fashion so it's not robust to handle nr_pages ==
+	 * 0. There's no point in being permissive about end < start
+	 * either. So this check verifies both nr_pages being non
+	 * zero, and that "end" didn't overflow.
+	 */
+	VM_BUG_ON(end <= start);
 
+	nr = ret = 0;
 #ifdef CONFIG_X86_64
 	if (end >> __VIRTUAL_MASK_SHIFT)
 		goto slow_irqon;
 #endif
+	for (;;) {
+		batch_pages = min(nr_pages, BATCH_PAGES);
+		len = (unsigned long) batch_pages << PAGE_SHIFT;
+		end = start + len;
+		nr = get_user_pages_fast_batch(start, end, write, pages);
+		VM_BUG_ON(nr > batch_pages);
+		nr_pages -= nr;
+		ret += nr;
+		if (!nr_pages)
+			break;
+		if (nr < batch_pages)
+			goto slow_irqon;
+		start += len;
+		pages += batch_pages;
+	}
 
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch size
-	 * will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
-	/*
-	 * This doesn't prevent pagetable teardown, but does prevent
-	 * the pagetables and pages from being freed on x86.
-	 *
-	 * So long as we atomically load page table pointers versus teardown
-	 * (which we do on x86, with the above PAE exception), we can follow the
-	 * address down to the the page and take a ref on it.
-	 */
-	local_irq_disable();
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			goto slow;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
-			goto slow;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_enable();
-
-	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
-	return nr;
-
-	{
-		int ret;
+	VM_BUG_ON(ret != (end - orig_start) >> PAGE_SHIFT);
+	return ret;
 
-slow:
-		local_irq_enable();
 slow_irqon:
-		/* Try to get the remaining pages with get_user_pages */
-		start += nr << PAGE_SHIFT;
-		pages += nr;
-
-		ret = get_user_pages_unlocked(current, mm, start,
-					      (end - start) >> PAGE_SHIFT,
-					      write, 0, pages);
-
-		/* Have to be a bit careful with return values */
-		if (nr > 0) {
-			if (ret < 0)
-				ret = nr;
-			else
-				ret += nr;
-		}
+	/* Try to get the remaining pages with get_user_pages */
+	start += nr << PAGE_SHIFT;
+	pages += nr;
 
-		return ret;
+	/*
+	 * "nr" was the get_user_pages_fast_batch last retval, "ret"
+	 * was the sum of all get_user_pages_fast_batch retvals, now
+	 * "nr" becomes the sum of all get_user_pages_fast_batch
+	 * retvals and "ret" will become the get_user_pages_unlocked
+	 * retval.
+	 */
+	nr = ret;
+
+	ret = get_user_pages_unlocked(current, mm, start,
+				      (end - start) >> PAGE_SHIFT,
+				      write, 0, pages);
+
+	/* Have to be a bit careful with return values */
+	if (nr > 0) {
+		if (ret < 0)
+			ret = nr;
+		else
+			ret += nr;
 	}
+
+	return ret;
 }
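
Note on the return value merging in the slow_irqon path above: if the
fast batches pinned some pages and get_user_pages_unlocked() then
fails, the caller must still learn how many pages were already pinned
(it is responsible for releasing them). As a stand-alone illustration
of that rule (hypothetical helper, not part of the patch):

/* nr: pages pinned by the fast batches; ret: slow-path retval */
static int merge_gup_retvals(int nr, int ret)
{
	if (nr > 0)
		return ret < 0 ? nr : nr + ret;
	return ret;
}

merge_gup_retvals(64, -EFAULT) reports the 64 pages already pinned,
while merge_gup_retvals(0, -EFAULT) propagates the error.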

Thread overview: 71+ messages
2014-10-03 17:07 [PATCH 00/17] RFC: userfault v2 Andrea Arcangeli
2014-10-03 17:07 ` [PATCH 01/17] mm: gup: add FOLL_TRIED Andrea Arcangeli
2014-10-03 18:15   ` Linus Torvalds
2014-10-03 20:55     ` Paolo Bonzini
2014-10-03 17:07 ` [PATCH 02/17] mm: gup: add get_user_pages_locked and get_user_pages_unlocked Andrea Arcangeli
2014-10-03 17:07 ` [PATCH 03/17] mm: gup: use get_user_pages_unlocked within get_user_pages_fast Andrea Arcangeli
2014-10-03 17:07 ` Andrea Arcangeli [this message]
2014-10-03 18:23   ` [PATCH 04/17] mm: gup: make get_user_pages_fast and __get_user_pages_fast latency conscious Linus Torvalds
2014-10-06 14:14     ` Andrea Arcangeli
2014-10-03 17:07 ` [PATCH 05/17] mm: gup: use get_user_pages_fast and get_user_pages_unlocked Andrea Arcangeli
2014-10-03 17:07 ` [PATCH 06/17] kvm: Faults which trigger IO release the mmap_sem Andrea Arcangeli
2014-10-03 17:07 ` [PATCH 07/17] mm: madvise MADV_USERFAULT: prepare vm_flags to allow more than 32bits Andrea Arcangeli
2014-10-07  9:03   ` Kirill A. Shutemov
2014-11-06 20:08   ` Konstantin Khlebnikov
2014-10-03 17:07 ` [PATCH 08/17] mm: madvise MADV_USERFAULT Andrea Arcangeli
2014-10-03 23:13   ` Mike Hommey
2014-10-06 17:24     ` Andrea Arcangeli
2014-10-07 10:36   ` Kirill A. Shutemov
2014-10-07 10:46     ` Dr. David Alan Gilbert
2014-10-07 10:52       ` [Qemu-devel] " Kirill A. Shutemov
2014-10-07 11:01         ` Dr. David Alan Gilbert
2014-10-07 11:30           ` Kirill A. Shutemov
2014-10-07 13:24     ` Andrea Arcangeli
2014-10-07 15:21       ` Kirill A. Shutemov
2014-10-03 17:07 ` [PATCH 09/17] mm: PT lock: export double_pt_lock/unlock Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 10/17] mm: rmap preparation for remap_anon_pages Andrea Arcangeli
2014-10-03 18:31   ` Linus Torvalds
2014-10-06  8:55     ` Dr. David Alan Gilbert
2014-10-06 16:41       ` Andrea Arcangeli
2014-10-07 12:47         ` Linus Torvalds
2014-10-07 14:19           ` Andrea Arcangeli
2014-10-07 15:52             ` Andrea Arcangeli
2014-10-07 15:54               ` Andy Lutomirski
2014-10-07 16:13               ` Peter Feiner
2014-10-07 16:56             ` Linus Torvalds
2014-10-07 17:07           ` Dr. David Alan Gilbert
2014-10-07 17:14             ` Paolo Bonzini
2014-10-07 17:25               ` Dr. David Alan Gilbert
2014-10-07 11:10   ` [Qemu-devel] " Kirill A. Shutemov
2014-10-07 13:37     ` Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 11/17] mm: swp_entry_swapcount Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 12/17] mm: sys_remap_anon_pages Andrea Arcangeli
2014-10-04 13:13   ` Andi Kleen
2014-10-06 17:00     ` Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 13/17] waitqueue: add nr wake parameter to __wake_up_locked_key Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 14/17] userfaultfd: add new syscall to provide memory externalization Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 15/17] userfaultfd: make userfaultfd_write non blocking Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 16/17] powerpc: add remap_anon_pages and userfaultfd Andrea Arcangeli
2014-10-03 17:08 ` [PATCH 17/17] userfaultfd: implement USERFAULTFD_RANGE_REGISTER|UNREGISTER Andrea Arcangeli
2014-10-27  9:32 ` [PATCH 00/17] RFC: userfault v2 zhanghailiang
2014-10-29 17:46   ` Andrea Arcangeli
2014-10-29 17:56     ` [Qemu-devel] " Peter Maydell
2014-11-21 20:14       ` Andrea Arcangeli
2014-11-21 23:05         ` Peter Maydell
2014-11-25 19:45           ` Andrea Arcangeli
2014-10-30 11:31     ` zhanghailiang
2014-10-30 12:49       ` Dr. David Alan Gilbert
2014-10-31  1:26         ` zhanghailiang
2014-11-19 18:49           ` Andrea Arcangeli
2014-11-20  2:54             ` zhanghailiang
2014-11-20 17:38               ` Andrea Arcangeli
2014-11-21  7:19                 ` zhanghailiang
2014-10-31  2:23       ` Peter Feiner
2014-10-31  3:29         ` zhanghailiang
2014-10-31  4:38           ` zhanghailiang
2014-10-31  5:17             ` Andres Lagar-Cavilla
2014-10-31  8:11               ` zhanghailiang
2014-10-31 19:39           ` Peter Feiner
2014-11-01  8:48             ` zhanghailiang
2014-11-20 17:29             ` Andrea Arcangeli
2014-11-12  7:18       ` zhanghailiang
