linux-mips.vger.kernel.org archive mirror
* RFC: switch the remaining architectures to use generic GUP
@ 2019-05-25 13:31 Christoph Hellwig
  2019-05-25 13:31 ` [PATCH 1/6] MIPS: use the generic get_user_pages_fast code Christoph Hellwig
                   ` (6 more replies)
  0 siblings, 7 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 13:31 UTC (permalink / raw)
  To: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller
  Cc: Nicholas Piggin, linux-mips, linux-sh, sparclinux, linux-mm,
	linux-kernel

Hi Linus and maintainers,

below is a series to switch mips, sh and sparc64 to use the generic
GUP code so that we only have one codebase to touch for further
improvements to this code.  I don't have hardware for any of these
architectures, and generally no clue about their page table
management, so handle with care.  But it at least survives a
basic defconfig compile test.


* [PATCH 1/6] MIPS: use the generic get_user_pages_fast code
  2019-05-25 13:31 RFC: switch the remaining architectures to use generic GUP Christoph Hellwig
@ 2019-05-25 13:31 ` Christoph Hellwig
  2019-05-25 13:31 ` [PATCH 2/6] sh: add a missing pud_page definition Christoph Hellwig
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 13:31 UTC (permalink / raw)
  To: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller
  Cc: Nicholas Piggin, linux-mips, linux-sh, sparclinux, linux-mm,
	linux-kernel

The mips code is mostly equivalent to the generic one, minus various
bugfixes and one and a half arch overrides that this patch adds to
pgtable.h.

Note that this defines ARCH_HAS_PTE_SPECIAL for mips as mips has
pte_special and pte_mkspecial implemented and used in the existing
gup code.  They are no-op stubs, though, which makes me a little unsure
if this is really the right thing to do.
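
For reference, the stubs in question are (roughly) just:

	static inline int pte_special(pte_t pte)	{ return 0; }
	static inline pte_t pte_mkspecial(pte_t pte)	{ return pte; }

so with ARCH_HAS_PTE_SPECIAL selected the generic code will never
actually treat a pte as special on mips.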

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/mips/Kconfig               |   2 +
 arch/mips/include/asm/pgtable.h |  28 ++++
 arch/mips/mm/Makefile           |   1 -
 arch/mips/mm/gup.c              | 303 --------------------------------
 4 files changed, 30 insertions(+), 304 deletions(-)
 delete mode 100644 arch/mips/mm/gup.c

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 70d3200476bf..53a103cdc352 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -6,6 +6,7 @@ config MIPS
 	select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
 	select ARCH_CLOCKSOURCE_DATA
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_SUPPORTS_UPROBES
@@ -55,6 +56,7 @@ config MIPS
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
+	select HAVE_GENERIC_GUP
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 4ccb465ef3f2..a6fd98563837 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -20,6 +20,7 @@
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 #include <asm/pgtable-bits.h>
+#include <asm/cpu-features.h>
 
 struct mm_struct;
 struct vm_area_struct;
@@ -651,4 +652,31 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  */
 #define pgtable_cache_init()	do { } while (0)
 
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages)
+{
+	unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
+	unsigned long end = start + len;
+
+	return !cpu_has_dc_aliases && end >= start;
+}
+#define gup_fast_permitted gup_fast_permitted
+
+#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32)
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+}
+#define gup_get_pte gup_get_pte
+#endif
+
 #endif /* _ASM_PGTABLE_H */
diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile
index f34d7ff5eb60..1e8d335025d7 100644
--- a/arch/mips/mm/Makefile
+++ b/arch/mips/mm/Makefile
@@ -7,7 +7,6 @@ obj-y				+= cache.o
 obj-y				+= context.o
 obj-y				+= extable.o
 obj-y				+= fault.o
-obj-y				+= gup.o
 obj-y				+= init.o
 obj-y				+= mmap.o
 obj-y				+= page.o
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
deleted file mode 100644
index 4c2b4483683c..000000000000
--- a/arch/mips/mm/gup.c
+++ /dev/null
@@ -1,303 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Lockless get_user_pages_fast for MIPS
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- * Copyright (C) 2011 Ralf Baechle
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/hugetlb.h>
-
-#include <asm/cpu-features.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32)
-	pte_t pte;
-
-retry:
-	pte.pte_low = ptep->pte_low;
-	smp_rmb();
-	pte.pte_high = ptep->pte_high;
-	smp_rmb();
-	if (unlikely(pte.pte_low != ptep->pte_low))
-		goto retry;
-
-	return pte;
-#else
-	return READ_ONCE(*ptep);
-#endif
-}
-
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	pte_t *ptep = pte_offset_map(&pmd, addr);
-	do {
-		pte_t pte = gup_get_pte(ptep);
-		struct page *page;
-
-		if (!pte_present(pte) ||
-		    pte_special(pte) || (write && !pte_write(pte))) {
-			pte_unmap(ptep);
-			return 0;
-		}
-		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-		page = pte_page(pte);
-		get_page(page);
-		SetPageReferenced(page);
-		pages[*nr] = page;
-		(*nr)++;
-
-	} while (ptep++, addr += PAGE_SIZE, addr != end);
-
-	pte_unmap(ptep - 1);
-	return 1;
-}
-
-static inline void get_head_page_multiple(struct page *page, int nr)
-{
-	VM_BUG_ON(page != compound_head(page));
-	VM_BUG_ON(page_count(page) == 0);
-	page_ref_add(page, nr);
-	SetPageReferenced(page);
-}
-
-static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	pte_t pte = *(pte_t *)&pmd;
-	struct page *head, *page;
-	int refs;
-
-	if (write && !pte_write(pte))
-		return 0;
-	/* hugepages are never "special" */
-	VM_BUG_ON(pte_special(pte));
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	refs = 0;
-	head = pte_page(pte);
-	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-
-	get_head_page_multiple(head, refs);
-	return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pmd_t *pmdp;
-
-	pmdp = pmd_offset(&pud, addr);
-	do {
-		pmd_t pmd = *pmdp;
-
-		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
-			return 0;
-		if (unlikely(pmd_huge(pmd))) {
-			if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
-				return 0;
-		} else {
-			if (!gup_pte_range(pmd, addr, next, write, pages,nr))
-				return 0;
-		}
-	} while (pmdp++, addr = next, addr != end);
-
-	return 1;
-}
-
-static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	pte_t pte = *(pte_t *)&pud;
-	struct page *head, *page;
-	int refs;
-
-	if (write && !pte_write(pte))
-		return 0;
-	/* hugepages are never "special" */
-	VM_BUG_ON(pte_special(pte));
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	refs = 0;
-	head = pte_page(pte);
-	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-
-	get_head_page_multiple(head, refs);
-	return 1;
-}
-
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pud_t *pudp;
-
-	pudp = pud_offset(&pgd, addr);
-	do {
-		pud_t pud = *pudp;
-
-		next = pud_addr_end(addr, end);
-		if (pud_none(pud))
-			return 0;
-		if (unlikely(pud_huge(pud))) {
-			if (!gup_huge_pud(pud, addr, next, write, pages,nr))
-				return 0;
-		} else {
-			if (!gup_pmd_range(pud, addr, next, write, pages,nr))
-				return 0;
-		}
-	} while (pudp++, addr = next, addr != end);
-
-	return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
-			  struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	unsigned long flags;
-	pgd_t *pgdp;
-	int nr = 0;
-
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-	end = start + len;
-	if (unlikely(!access_ok((void __user *)start, len)))
-		return 0;
-
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch
-	 * size will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
-	/*
-	 * This doesn't prevent pagetable teardown, but does prevent
-	 * the pagetables and pages from being freed.
-	 *
-	 * So long as we atomically load page table pointers versus teardown,
-	 * we can follow the address down to the page and take a ref on it.
-	 */
-	local_irq_save(flags);
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
-			break;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
-
-	return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start:	starting user address
- * @nr_pages:	number of pages from start to pin
- * @gup_flags:	flags modifying pin behaviour
- * @pages:	array that receives pointers to the pages pinned.
- *		Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages,
-			unsigned int gup_flags, struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	pgd_t *pgdp;
-	int ret, nr = 0;
-
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-
-	end = start + len;
-	if (end < start || cpu_has_dc_aliases)
-		goto slow_irqon;
-
-	/* XXX: batch / limit 'nr' */
-	local_irq_disable();
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			goto slow;
-		if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
-				   pages, &nr))
-			goto slow;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_enable();
-
-	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
-	return nr;
-slow:
-	local_irq_enable();
-
-slow_irqon:
-	/* Try to get the remaining pages with get_user_pages */
-	start += nr << PAGE_SHIFT;
-	pages += nr;
-
-	ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
-				      pages, gup_flags);
-
-	/* Have to be a bit careful with return values */
-	if (nr > 0) {
-		if (ret < 0)
-			ret = nr;
-		else
-			ret += nr;
-	}
-	return ret;
-}
-- 
2.20.1



* [PATCH 2/6] sh: add a missing pud_page definition
  2019-05-25 13:31 RFC: switch the remaining architectures to use generic GUP Christoph Hellwig
  2019-05-25 13:31 ` [PATCH 1/6] MIPS: use the generic get_user_pages_fast code Christoph Hellwig
@ 2019-05-25 13:31 ` Christoph Hellwig
  2019-05-25 13:32 ` [PATCH 3/6] sh: use the generic get_user_pages_fast code Christoph Hellwig
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 13:31 UTC (permalink / raw)
  To: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller
  Cc: Nicholas Piggin, linux-mips, linux-sh, sparclinux, linux-mm,
	linux-kernel

sh oddly enough had pud_page_vaddr, but not pud_page.
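
The generic code needs it, e.g. in the huge pud path, which does
(roughly, from mm/gup.c):

	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);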

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/sh/include/asm/pgtable-3level.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h
index 7d8587eb65ff..8ff6fb6b4d19 100644
--- a/arch/sh/include/asm/pgtable-3level.h
+++ b/arch/sh/include/asm/pgtable-3level.h
@@ -37,6 +37,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
 {
 	return pud_val(pud);
 }
+#define pud_page(pud)		virt_to_page((void *)pud_page_vaddr(pud))
 
 #define pmd_index(address)	(((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
-- 
2.20.1



* [PATCH 3/6] sh: use the generic get_user_pages_fast code
  2019-05-25 13:31 RFC: switch the remaining architectures to use generic GUP Christoph Hellwig
  2019-05-25 13:31 ` [PATCH 1/6] MIPS: use the generic get_user_pages_fast code Christoph Hellwig
  2019-05-25 13:31 ` [PATCH 2/6] sh: add a missing pud_page definition Christoph Hellwig
@ 2019-05-25 13:32 ` Christoph Hellwig
  2019-05-25 13:32 ` [PATCH 4/6] mm: add a gup_fixup_start_addr hook Christoph Hellwig
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 13:32 UTC (permalink / raw)
  To: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller
  Cc: Nicholas Piggin, linux-mips, linux-sh, sparclinux, linux-mm,
	linux-kernel

The sh code is mostly equivalent to the generic one, minus various
bugfixes and two arch overrides that this patch adds to pgtable.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/sh/Kconfig               |   1 +
 arch/sh/include/asm/pgtable.h |  86 +++++++++++
 arch/sh/mm/Makefile           |   2 +-
 arch/sh/mm/gup.c              | 277 ----------------------------------
 4 files changed, 88 insertions(+), 278 deletions(-)
 delete mode 100644 arch/sh/mm/gup.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b77f512bb176..2fd8c12ca128 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -14,6 +14,7 @@ config SUPERH
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_PERF_EVENTS
 	select HAVE_DEBUG_BUGVERBOSE
+	select HAVE_GENERIC_GUP
 	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 3587103afe59..d3c177144f90 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -149,6 +149,92 @@ extern void paging_init(void);
 extern void page_table_range_init(unsigned long start, unsigned long end,
 				  pgd_t *pgd);
 
+static inline bool __pte_access_permitted(pte_t pte, u64 prot)
+{
+	return (pte_val(pte) & (prot | _PAGE_SPECIAL)) == prot;
+}
+
+#ifdef CONFIG_X2TLB
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+	/*
+	 * With get_user_pages_fast, we walk down the pagetables without
+	 * taking any locks.  For this we would like to load the pointers
+	 * atomically, but that is not possible with 64-bit PTEs.  What
+	 * we do have is the guarantee that a pte will only either go
+	 * from not present to present, or present to not present or both
+	 * -- it will not switch to a completely different present page
+	 * without a TLB flush in between; something that we are blocking
+	 * by holding interrupts off.
+	 *
+	 * Setting ptes from not present to present goes:
+	 * ptep->pte_high = h;
+	 * smp_wmb();
+	 * ptep->pte_low = l;
+	 *
+	 * And present to not present goes:
+	 * ptep->pte_low = 0;
+	 * smp_wmb();
+	 * ptep->pte_high = 0;
+	 *
+	 * We must ensure here that the load of pte_low sees l iff pte_high
+	 * sees h. We load pte_high *after* loading pte_low, which ensures we
+	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
+	 * which ensures that we haven't picked up a changed pte high. We might
+	 * have got rubbish values from pte_low and pte_high, but we are
+	 * guaranteed that pte_low will not have the present bit set *unless*
+	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+	 * we're safe.
+	 *
+	 * gup_get_pte should not be used or copied outside gup.c without being
+	 * very careful -- it does not atomically load the pte or anything that
+	 * is likely to be useful for you.
+	 */
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+}
+#define gup_get_pte gup_get_pte
+
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+	u64 prot = _PAGE_PRESENT;
+
+	prot |= _PAGE_EXT(_PAGE_EXT_KERN_READ | _PAGE_EXT_USER_READ);
+	if (write)
+		prot |= _PAGE_EXT(_PAGE_EXT_KERN_WRITE | _PAGE_EXT_USER_WRITE);
+	return __pte_access_permitted(pte, prot);
+}
+#elif defined(CONFIG_SUPERH64)
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+	u64 prot = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ;
+
+	if (write)
+		prot |= _PAGE_WRITE;
+	return __pte_access_permitted(pte, prot);
+}
+#else
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+	u64 prot = _PAGE_PRESENT | _PAGE_USER;
+
+	if (write)
+		prot |= _PAGE_RW;
+	return __pte_access_permitted(pte, prot);
+}
+#endif
+
+#define pte_access_permitted pte_access_permitted
+
 /* arch/sh/mm/mmap.c */
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile
index fbe5e79751b3..5051b38fd5b6 100644
--- a/arch/sh/mm/Makefile
+++ b/arch/sh/mm/Makefile
@@ -17,7 +17,7 @@ cacheops-$(CONFIG_CPU_SHX3)		+= cache-shx3.o
 obj-y			+= $(cacheops-y)
 
 mmu-y			:= nommu.o extable_32.o
-mmu-$(CONFIG_MMU)	:= extable_$(BITS).o fault.o gup.o ioremap.o kmap.o \
+mmu-$(CONFIG_MMU)	:= extable_$(BITS).o fault.o ioremap.o kmap.o \
 			   pgtable.o tlbex_$(BITS).o tlbflush_$(BITS).o
 
 obj-y			+= $(mmu-y)
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
deleted file mode 100644
index 277c882f7489..000000000000
--- a/arch/sh/mm/gup.c
+++ /dev/null
@@ -1,277 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Lockless get_user_pages_fast for SuperH
- *
- * Copyright (C) 2009 - 2010  Paul Mundt
- *
- * Cloned from the x86 and PowerPC versions, by:
- *
- *	Copyright (C) 2008 Nick Piggin
- *	Copyright (C) 2008 Novell Inc.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#ifndef CONFIG_X2TLB
-	return READ_ONCE(*ptep);
-#else
-	/*
-	 * With get_user_pages_fast, we walk down the pagetables without
-	 * taking any locks.  For this we would like to load the pointers
-	 * atomically, but that is not possible with 64-bit PTEs.  What
-	 * we do have is the guarantee that a pte will only either go
-	 * from not present to present, or present to not present or both
-	 * -- it will not switch to a completely different present page
-	 * without a TLB flush in between; something that we are blocking
-	 * by holding interrupts off.
-	 *
-	 * Setting ptes from not present to present goes:
-	 * ptep->pte_high = h;
-	 * smp_wmb();
-	 * ptep->pte_low = l;
-	 *
-	 * And present to not present goes:
-	 * ptep->pte_low = 0;
-	 * smp_wmb();
-	 * ptep->pte_high = 0;
-	 *
-	 * We must ensure here that the load of pte_low sees l iff pte_high
-	 * sees h. We load pte_high *after* loading pte_low, which ensures we
-	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
-	 * which ensures that we haven't picked up a changed pte high. We might
-	 * have got rubbish values from pte_low and pte_high, but we are
-	 * guaranteed that pte_low will not have the present bit set *unless*
-	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
-	 * we're safe.
-	 *
-	 * gup_get_pte should not be used or copied outside gup.c without being
-	 * very careful -- it does not atomically load the pte or anything that
-	 * is likely to be useful for you.
-	 */
-	pte_t pte;
-
-retry:
-	pte.pte_low = ptep->pte_low;
-	smp_rmb();
-	pte.pte_high = ptep->pte_high;
-	smp_rmb();
-	if (unlikely(pte.pte_low != ptep->pte_low))
-		goto retry;
-
-	return pte;
-#endif
-}
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	u64 mask, result;
-	pte_t *ptep;
-
-#ifdef CONFIG_X2TLB
-	result = _PAGE_PRESENT | _PAGE_EXT(_PAGE_EXT_KERN_READ | _PAGE_EXT_USER_READ);
-	if (write)
-		result |= _PAGE_EXT(_PAGE_EXT_KERN_WRITE | _PAGE_EXT_USER_WRITE);
-#elif defined(CONFIG_SUPERH64)
-	result = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ;
-	if (write)
-		result |= _PAGE_WRITE;
-#else
-	result = _PAGE_PRESENT | _PAGE_USER;
-	if (write)
-		result |= _PAGE_RW;
-#endif
-
-	mask = result | _PAGE_SPECIAL;
-
-	ptep = pte_offset_map(&pmd, addr);
-	do {
-		pte_t pte = gup_get_pte(ptep);
-		struct page *page;
-
-		if ((pte_val(pte) & mask) != result) {
-			pte_unmap(ptep);
-			return 0;
-		}
-		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-		page = pte_page(pte);
-		get_page(page);
-		__flush_anon_page(page, addr);
-		flush_dcache_page(page);
-		pages[*nr] = page;
-		(*nr)++;
-
-	} while (ptep++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(ptep - 1);
-
-	return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pmd_t *pmdp;
-
-	pmdp = pmd_offset(&pud, addr);
-	do {
-		pmd_t pmd = *pmdp;
-
-		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
-			return 0;
-		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
-			return 0;
-	} while (pmdp++, addr = next, addr != end);
-
-	return 1;
-}
-
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pud_t *pudp;
-
-	pudp = pud_offset(&pgd, addr);
-	do {
-		pud_t pud = *pudp;
-
-		next = pud_addr_end(addr, end);
-		if (pud_none(pud))
-			return 0;
-		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
-			return 0;
-	} while (pudp++, addr = next, addr != end);
-
-	return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
-			  struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	unsigned long flags;
-	pgd_t *pgdp;
-	int nr = 0;
-
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-	end = start + len;
-	if (unlikely(!access_ok((void __user *)start, len)))
-		return 0;
-
-	/*
-	 * This doesn't prevent pagetable teardown, but does prevent
-	 * the pagetables and pages from being freed.
-	 */
-	local_irq_save(flags);
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
-			break;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
-
-	return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start:	starting user address
- * @nr_pages:	number of pages from start to pin
- * @gup_flags:	flags modifying pin behaviour
- * @pages:	array that receives pointers to the pages pinned.
- *		Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages,
-			unsigned int gup_flags, struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	pgd_t *pgdp;
-	int nr = 0;
-
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-
-	end = start + len;
-	if (end < start)
-		goto slow_irqon;
-
-	local_irq_disable();
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			goto slow;
-		if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
-				   pages, &nr))
-			goto slow;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_enable();
-
-	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
-	return nr;
-
-	{
-		int ret;
-
-slow:
-		local_irq_enable();
-slow_irqon:
-		/* Try to get the remaining pages with get_user_pages */
-		start += nr << PAGE_SHIFT;
-		pages += nr;
-
-		ret = get_user_pages_unlocked(start,
-			(end - start) >> PAGE_SHIFT, pages,
-			gup_flags);
-
-		/* Have to be a bit careful with return values */
-		if (nr > 0) {
-			if (ret < 0)
-				ret = nr;
-			else
-				ret += nr;
-		}
-
-		return ret;
-	}
-}
-- 
2.20.1



* [PATCH 4/6] mm: add a gup_fixup_start_addr hook
  2019-05-25 13:31 RFC: switch the remaining architectures to use generic GUP Christoph Hellwig
                   ` (2 preceding siblings ...)
  2019-05-25 13:32 ` [PATCH 3/6] sh: use the generic get_user_pages_fast code Christoph Hellwig
@ 2019-05-25 13:32 ` Christoph Hellwig
  2019-05-25 17:05   ` Linus Torvalds
  2019-05-29  8:19   ` Catalin Marinas
  2019-05-25 13:32 ` [PATCH 5/6] sparc64: use the generic get_user_pages_fast code Christoph Hellwig
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 13:32 UTC (permalink / raw)
  To: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller
  Cc: Nicholas Piggin, linux-mips, linux-sh, sparclinux, linux-mm,
	linux-kernel

This will allow sparc64 to strip its ADI tags in __get_user_pages_fast
and get_user_pages_fast.  I have no idea why this is not required for
plain old get_user_pages, but it keeps the existing sparc64 behavior.
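
The hook uses the usual override pattern: the generic code provides an
identity fallback, and an architecture opts in with a static inline
plus a #define of the same name.  As a sketch, with a hypothetical
strip_tag() helper standing in for whatever the architecture needs to
drop:

	/* arch/xxx/include/asm/pgtable.h, hypothetical override: */
	static inline unsigned long gup_fixup_start_addr(unsigned long start)
	{
		return strip_tag(start);	/* drop arch-specific tag bits */
	}
	#define gup_fixup_start_addr gup_fixup_start_addr

The real sparc64 override follows in the next patch.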

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 mm/gup.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index f173fcbaf1b2..1c21ecfbf38b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2117,6 +2117,10 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
 	} while (pgdp++, addr = next, addr != end);
 }
 
+#ifndef gup_fixup_start_addr
+#define gup_fixup_start_addr(start)	(start)
+#endif
+
 #ifndef gup_fast_permitted
 /*
  * Check if it's allowed to use __get_user_pages_fast() for the range, or
@@ -2145,7 +2149,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	unsigned long flags;
 	int nr = 0;
 
-	start &= PAGE_MASK;
+	start = gup_fixup_start_addr(start) & PAGE_MASK;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
 
@@ -2218,7 +2222,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
 	unsigned long addr, len, end;
 	int nr = 0, ret = 0;
 
-	start &= PAGE_MASK;
+	start = gup_fixup_start_addr(start) & PAGE_MASK;
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-- 
2.20.1



* [PATCH 5/6] sparc64: use the generic get_user_pages_fast code
  2019-05-25 13:31 RFC: switch the remaining architectures to use generic GUP Christoph Hellwig
                   ` (3 preceding siblings ...)
  2019-05-25 13:32 ` [PATCH 4/6] mm: add a gup_fixup_start_addr hook Christoph Hellwig
@ 2019-05-25 13:32 ` Christoph Hellwig
  2019-05-25 16:55   ` David Miller
  2019-05-25 13:32 ` [PATCH 6/6] mm: don't allow non-generic get_user_pages_fast implementations Christoph Hellwig
  2019-05-25 17:07 ` RFC: switch the remaining architectures to use generic GUP Linus Torvalds
  6 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 13:32 UTC (permalink / raw)
  To: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller
  Cc: Nicholas Piggin, linux-mips, linux-sh, sparclinux, linux-mm,
	linux-kernel

The sparc64 code is mostly equivalent to the generic one, minus various
bugfixes and two arch overrides that this patch adds to pgtable.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/sparc/Kconfig                  |   1 +
 arch/sparc/include/asm/pgtable_64.h |  40 ++++
 arch/sparc/mm/Makefile              |   2 +-
 arch/sparc/mm/gup.c                 | 340 ----------------------------
 4 files changed, 42 insertions(+), 341 deletions(-)
 delete mode 100644 arch/sparc/mm/gup.c

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 26ab6f5bbaaf..22435471f942 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -28,6 +28,7 @@ config SPARC
 	select RTC_DRV_M48T59
 	select RTC_SYSTOHC
 	select HAVE_ARCH_JUMP_LABEL if SPARC64
+	select HAVE_GENERIC_GUP if SPARC64
 	select GENERIC_IRQ_SHOW
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select GENERIC_PCI_IOMAP
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 22500c3be7a9..753d1417bae1 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1075,6 +1075,46 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
 }
 #define io_remap_pfn_range io_remap_pfn_range 
 
+static inline unsigned long gup_fixup_start_addr(unsigned long start)
+{
+	if (adi_capable()) {
+		long addr = start;
+
+		/* If userspace has passed a versioned address, kernel
+		 * will not find it in the VMAs since it does not store
+		 * the version tags in the list of VMAs. Storing version
+		 * tags in list of VMAs is impractical since they can be
+		 * changed any time from userspace without dropping into
+		 * kernel. Any address search in VMAs will be done with
+		 * non-versioned addresses. Ensure the ADI version bits
+		 * are dropped here by sign extending the last bit before
+		 * ADI bits. IOMMU does not implement version tags.
+		 */
+		return (addr << (long)adi_nbits()) >> (long)adi_nbits();
+	}
+
+	return start;
+}
+#define gup_fixup_start_addr gup_fixup_start_addr
+
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+	u64 prot;
+
+	if (tlb_type == hypervisor) {
+		prot = _PAGE_PRESENT_4V | _PAGE_P_4V;
+		if (write)
+			prot |= _PAGE_WRITE_4V;
+	} else {
+		prot = _PAGE_PRESENT_4U | _PAGE_P_4U;
+		if (write)
+			prot |= _PAGE_WRITE_4U;
+	}
+
+	return (pte_val(pte) & (prot | _PAGE_SPECIAL)) == prot;
+}
+#define pte_access_permitted pte_access_permitted
+
 #include <asm/tlbflush.h>
 #include <asm-generic/pgtable.h>
 
diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile
index d39075b1e3b7..b078205b70e0 100644
--- a/arch/sparc/mm/Makefile
+++ b/arch/sparc/mm/Makefile
@@ -5,7 +5,7 @@
 asflags-y := -ansi
 ccflags-y := -Werror
 
-obj-$(CONFIG_SPARC64)   += ultra.o tlb.o tsb.o gup.o
+obj-$(CONFIG_SPARC64)   += ultra.o tlb.o tsb.o
 obj-y                   += fault_$(BITS).o
 obj-y                   += init_$(BITS).o
 obj-$(CONFIG_SPARC32)   += extable.o srmmu.o iommu.o io-unit.o
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
deleted file mode 100644
index 1e770a517d4a..000000000000
--- a/arch/sparc/mm/gup.c
+++ /dev/null
@@ -1,340 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Lockless get_user_pages_fast for sparc, cribbed from powerpc
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/pagemap.h>
-#include <linux/rwsem.h>
-#include <asm/pgtable.h>
-#include <asm/adi.h>
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	unsigned long mask, result;
-	pte_t *ptep;
-
-	if (tlb_type == hypervisor) {
-		result = _PAGE_PRESENT_4V|_PAGE_P_4V;
-		if (write)
-			result |= _PAGE_WRITE_4V;
-	} else {
-		result = _PAGE_PRESENT_4U|_PAGE_P_4U;
-		if (write)
-			result |= _PAGE_WRITE_4U;
-	}
-	mask = result | _PAGE_SPECIAL;
-
-	ptep = pte_offset_kernel(&pmd, addr);
-	do {
-		struct page *page, *head;
-		pte_t pte = *ptep;
-
-		if ((pte_val(pte) & mask) != result)
-			return 0;
-		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-		/* The hugepage case is simplified on sparc64 because
-		 * we encode the sub-page pfn offsets into the
-		 * hugepage PTEs.  We could optimize this in the future
-		 * use page_cache_add_speculative() for the hugepage case.
-		 */
-		page = pte_page(pte);
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
-			return 0;
-		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-			put_page(head);
-			return 0;
-		}
-
-		pages[*nr] = page;
-		(*nr)++;
-	} while (ptep++, addr += PAGE_SIZE, addr != end);
-
-	return 1;
-}
-
-static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
-			unsigned long end, int write, struct page **pages,
-			int *nr)
-{
-	struct page *head, *page;
-	int refs;
-
-	if (!(pmd_val(pmd) & _PAGE_VALID))
-		return 0;
-
-	if (write && !pmd_write(pmd))
-		return 0;
-
-	refs = 0;
-	page = pmd_page(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	head = compound_head(page);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-
-	if (!page_cache_add_speculative(head, refs)) {
-		*nr -= refs;
-		return 0;
-	}
-
-	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
-		*nr -= refs;
-		while (refs--)
-			put_page(head);
-		return 0;
-	}
-
-	return 1;
-}
-
-static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
-			unsigned long end, int write, struct page **pages,
-			int *nr)
-{
-	struct page *head, *page;
-	int refs;
-
-	if (!(pud_val(pud) & _PAGE_VALID))
-		return 0;
-
-	if (write && !pud_write(pud))
-		return 0;
-
-	refs = 0;
-	page = pud_page(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-	head = compound_head(page);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-
-	if (!page_cache_add_speculative(head, refs)) {
-		*nr -= refs;
-		return 0;
-	}
-
-	if (unlikely(pud_val(pud) != pud_val(*pudp))) {
-		*nr -= refs;
-		while (refs--)
-			put_page(head);
-		return 0;
-	}
-
-	return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pmd_t *pmdp;
-
-	pmdp = pmd_offset(&pud, addr);
-	do {
-		pmd_t pmd = *pmdp;
-
-		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
-			return 0;
-		if (unlikely(pmd_large(pmd))) {
-			if (!gup_huge_pmd(pmdp, pmd, addr, next,
-					  write, pages, nr))
-				return 0;
-		} else if (!gup_pte_range(pmd, addr, next, write,
-					  pages, nr))
-			return 0;
-	} while (pmdp++, addr = next, addr != end);
-
-	return 1;
-}
-
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
-{
-	unsigned long next;
-	pud_t *pudp;
-
-	pudp = pud_offset(&pgd, addr);
-	do {
-		pud_t pud = *pudp;
-
-		next = pud_addr_end(addr, end);
-		if (pud_none(pud))
-			return 0;
-		if (unlikely(pud_large(pud))) {
-			if (!gup_huge_pud(pudp, pud, addr, next,
-					  write, pages, nr))
-				return 0;
-		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
-			return 0;
-	} while (pudp++, addr = next, addr != end);
-
-	return 1;
-}
-
-/*
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
-			  struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next, flags;
-	pgd_t *pgdp;
-	int nr = 0;
-
-#ifdef CONFIG_SPARC64
-	if (adi_capable()) {
-		long addr = start;
-
-		/* If userspace has passed a versioned address, kernel
-		 * will not find it in the VMAs since it does not store
-		 * the version tags in the list of VMAs. Storing version
-		 * tags in list of VMAs is impractical since they can be
-		 * changed any time from userspace without dropping into
-		 * kernel. Any address search in VMAs will be done with
-		 * non-versioned addresses. Ensure the ADI version bits
-		 * are dropped here by sign extending the last bit before
-		 * ADI bits. IOMMU does not implement version tags.
-		 */
-		addr = (addr << (long)adi_nbits()) >> (long)adi_nbits();
-		start = addr;
-	}
-#endif
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-	end = start + len;
-
-	local_irq_save(flags);
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
-			break;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
-
-	return nr;
-}
-
-int get_user_pages_fast(unsigned long start, int nr_pages,
-			unsigned int gup_flags, struct page **pages)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	pgd_t *pgdp;
-	int nr = 0;
-
-#ifdef CONFIG_SPARC64
-	if (adi_capable()) {
-		long addr = start;
-
-		/* If userspace has passed a versioned address, kernel
-		 * will not find it in the VMAs since it does not store
-		 * the version tags in the list of VMAs. Storing version
-		 * tags in list of VMAs is impractical since they can be
-		 * changed any time from userspace without dropping into
-		 * kernel. Any address search in VMAs will be done with
-		 * non-versioned addresses. Ensure the ADI version bits
-		 * are dropped here by sign extending the last bit before
-		 * ADI bits. IOMMU does not implements version tags,
-		 */
-		addr = (addr << (long)adi_nbits()) >> (long)adi_nbits();
-		start = addr;
-	}
-#endif
-	start &= PAGE_MASK;
-	addr = start;
-	len = (unsigned long) nr_pages << PAGE_SHIFT;
-	end = start + len;
-
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch size
-	 * will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
-	/*
-	 * This doesn't prevent pagetable teardown, but does prevent
-	 * the pagetables from being freed on sparc.
-	 *
-	 * So long as we atomically load page table pointers versus teardown,
-	 * we can follow the address down to the the page and take a ref on it.
-	 */
-	local_irq_disable();
-
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			goto slow;
-		if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
-				   pages, &nr))
-			goto slow;
-	} while (pgdp++, addr = next, addr != end);
-
-	local_irq_enable();
-
-	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
-	return nr;
-
-	{
-		int ret;
-
-slow:
-		local_irq_enable();
-
-		/* Try to get the remaining pages with get_user_pages */
-		start += nr << PAGE_SHIFT;
-		pages += nr;
-
-		ret = get_user_pages_unlocked(start,
-			(end - start) >> PAGE_SHIFT, pages,
-			gup_flags);
-
-		/* Have to be a bit careful with return values */
-		if (nr > 0) {
-			if (ret < 0)
-				ret = nr;
-			else
-				ret += nr;
-		}
-
-		return ret;
-	}
-}
-- 
2.20.1



* [PATCH 6/6] mm: don't allow non-generic get_user_pages_fast implementations
  2019-05-25 13:31 RFC: switch the remaining architectures to use generic GUP Christoph Hellwig
                   ` (4 preceding siblings ...)
  2019-05-25 13:32 ` [PATCH 5/6] sparc64: use the generic get_user_pages_fast code Christoph Hellwig
@ 2019-05-25 13:32 ` Christoph Hellwig
  2019-05-25 17:07 ` RFC: switch the remaining architectures to use generic GUP Linus Torvalds
  6 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 13:32 UTC (permalink / raw)
  To: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller
  Cc: Nicholas Piggin, linux-mips, linux-sh, sparclinux, linux-mm,
	linux-kernel

Add an explicit ifdef instead of the weak functions for the stubs, so
that we can't let a new non-generic get_user_pages_fast implementation
slip in.
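
The problem with __weak is that the linker resolves it silently: any
strong definition elsewhere simply wins.  As a toy illustration (not
kernel code):

	/* stub.c */
	int __attribute__((weak)) foo(void) { return 0; }	/* default */

	/* arch.c - linking this in quietly replaces the weak stub */
	int foo(void) { return 1; }

With the explicit ifdef, an architecture that defines its own
get_user_pages_fast without selecting HAVE_GENERIC_GUP now clashes
with the stubs in mm/util.c at link time instead.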

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 mm/util.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/util.c b/mm/util.c
index 91682a2090ee..74ae737ffd95 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -300,6 +300,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 }
 #endif
 
+#ifndef CONFIG_HAVE_GENERIC_GUP
 /*
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
@@ -308,8 +309,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
  * If the architecture does not support this function, simply return with no
  * pages pinned.
  */
-int __weak __get_user_pages_fast(unsigned long start,
-				 int nr_pages, int write, struct page **pages)
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+		struct page **pages)
 {
 	return 0;
 }
@@ -339,13 +340,13 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
  * requested. If nr_pages is 0 or negative, returns 0. If no pages
  * were pinned, returns -errno.
  */
-int __weak get_user_pages_fast(unsigned long start,
-				int nr_pages, unsigned int gup_flags,
-				struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+		unsigned int gup_flags, struct page **pages)
 {
 	return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
+#endif /* !CONFIG_HAVE_GENERIC_GUP */
 
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
-- 
2.20.1



* Re: [PATCH 5/6] sparc64: use the generic get_user_pages_fast code
  2019-05-25 13:32 ` [PATCH 5/6] sparc64: use the generic get_user_pages_fast code Christoph Hellwig
@ 2019-05-25 16:55   ` David Miller
  0 siblings, 0 replies; 14+ messages in thread
From: David Miller @ 2019-05-25 16:55 UTC (permalink / raw)
  To: hch
  Cc: torvalds, paul.burton, jhogan, ysato, dalias, npiggin,
	linux-mips, linux-sh, sparclinux, linux-mm, linux-kernel

From: Christoph Hellwig <hch@lst.de>
Date: Sat, 25 May 2019 15:32:02 +0200

> The sparc64 code is mostly equivalent to the generic one, minus various
> bugfixes and two arch overrides that this patch adds to pgtable.h.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Acked-by: David S. Miller <davem@davemloft.net>


* Re: [PATCH 4/6] mm: add a gup_fixup_start_addr hook
  2019-05-25 13:32 ` [PATCH 4/6] mm: add a gup_fixup_start_addr hook Christoph Hellwig
@ 2019-05-25 17:05   ` Linus Torvalds
  2019-05-28 15:57     ` Khalid Aziz
  2019-05-29  8:19   ` Catalin Marinas
  1 sibling, 1 reply; 14+ messages in thread
From: Linus Torvalds @ 2019-05-25 17:05 UTC (permalink / raw)
  To: Christoph Hellwig, Khalid Aziz
  Cc: Paul Burton, James Hogan, Yoshinori Sato, Rich Felker,
	David S. Miller, Nicholas Piggin, linux-mips, Linux-sh list,
	sparclinux, Linux-MM, Linux List Kernel Mailing

[ Adding Khalid, who added the sparc64 code ]

On Sat, May 25, 2019 at 6:32 AM Christoph Hellwig <hch@lst.de> wrote:
>
> This will allow sparc64 to override its ADI tags for
> get_user_pages and get_user_pages_fast.  I have no idea why this
> is not required for plain old get_user_pages, but it keeps the
> existing sparc64 behavior.

This is actually generic. ARM64 has tagged pointers too. Right now the
system call interfaces are all supposed to mask off the tags, but
there's been noise about having the kernel understand them.

That said:

> +#ifndef gup_fixup_start_addr
> +#define gup_fixup_start_addr(start)    (start)
> +#endif

I'd rather name this much more specifically (ie make it very much
about "clean up pointer tags") and I'm also not clear on why sparc64
actually wants this. I thought the sparc64 rules were the same as the
(current) arm64 rules: any addresses passed to the kernel have to be
the non-tagged ones.

As you say, nothing *else* in the kernel does that address cleanup,
why should get_user_pages_fast() do it?

David? Khalid? Why does sparc64 actually need this? It looks like the
generic get_user_pages() doesn't do it.

                Linus


* Re: RFC: switch the remaining architectures to use generic GUP
  2019-05-25 13:31 RFC: switch the remaining architectures to use generic GUP Christoph Hellwig
                   ` (5 preceding siblings ...)
  2019-05-25 13:32 ` [PATCH 6/6] mm: don't allow non-generic get_user_pages_fast implementations Christoph Hellwig
@ 2019-05-25 17:07 ` Linus Torvalds
  2019-05-25 17:39   ` Christoph Hellwig
  6 siblings, 1 reply; 14+ messages in thread
From: Linus Torvalds @ 2019-05-25 17:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Paul Burton, James Hogan, Yoshinori Sato, Rich Felker,
	David S. Miller, Nicholas Piggin, linux-mips, Linux-sh list,
	sparclinux, Linux-MM, Linux List Kernel Mailing

Looks good to me apart from the question about sparc64 (that you also
raised) and requesting that interface to be re-named if it is really
needed.

Let's just do it (but presumably for 5.3), and any architecture that
doesn't react to this and gets broken because it wasn't tested can get
fixed up later when/if they notice.

              Linus


* Re: RFC: switch the remaining architectures to use generic GUP
  2019-05-25 17:07 ` RFC: switch the remaining architectures to use generic GUP Linus Torvalds
@ 2019-05-25 17:39   ` Christoph Hellwig
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-25 17:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Hellwig, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller, Nicholas Piggin, linux-mips,
	Linux-sh list, sparclinux, Linux-MM, Linux List Kernel Mailing

On Sat, May 25, 2019 at 10:07:32AM -0700, Linus Torvalds wrote:
> Looks good to me apart from the question about sparc64 (that you also
> raised) and requesting that interface to be re-named if it is really
> needed.
> 
> Let's just do it (but presumably for 5.3), and any architecture that
> doesn't react to this and gets broken because it wasn't tested can get
> fixed up later when/if they notice.

FYI, my compile testing was very basic and a few issues showed up
from the build bot later on.  I'll keep the branch here up to date
for now:

	http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/generic-gup

and won't resend until we make progress on the pointer tagging
thing.  I've also got a few follow on patches on top, so they might
be ready by then as well.


* Re: [PATCH 4/6] mm: add a gup_fixup_start_addr hook
  2019-05-25 17:05   ` Linus Torvalds
@ 2019-05-28 15:57     ` Khalid Aziz
  2019-05-29  7:26       ` Christoph Hellwig
  0 siblings, 1 reply; 14+ messages in thread
From: Khalid Aziz @ 2019-05-28 15:57 UTC (permalink / raw)
  To: Linus Torvalds, Christoph Hellwig
  Cc: Paul Burton, James Hogan, Yoshinori Sato, Rich Felker,
	David S. Miller, Nicholas Piggin, linux-mips, Linux-sh list,
	sparclinux, Linux-MM, Linux List Kernel Mailing

On 5/25/19 11:05 AM, Linus Torvalds wrote:
> [ Adding Khalid, who added the sparc64 code ]
> 
> On Sat, May 25, 2019 at 6:32 AM Christoph Hellwig <hch@lst.de> wrote:
>>
>> This will allow sparc64 to override its ADI tags for
>> get_user_pages and get_user_pages_fast.  I have no idea why this
>> is not required for plain old get_user_pages, but it keeps the
>> existing sparc64 behavior.
> 
> This is actually generic. ARM64 has tagged pointers too. Right now the
> system call interfaces are all supposed to mask off the tags, but
> there's been noise about having the kernel understand them.
> 
> That said:
> 
>> +#ifndef gup_fixup_start_addr
>> +#define gup_fixup_start_addr(start)    (start)
>> +#endif
> 
> I'd rather name this much more specifically (ie make it very much
> about "clean up pointer tags") and I'm also not clear on why sparc64
> actually wants this. I thought the sparc64 rules were the same as the
> (current) arm64 rules: any addresses passed to the kernel have to be
> the non-tagged ones.
> 
> As you say, nothing *else* in the kernel does that address cleanup,
> why should get_user_pages_fast() do it?
> 
> David? Khalid? Why does sparc64 actually need this? It looks like the
> generic get_user_pages() doesn't do it.
> 


There is another discussion going on about tagged pointers on ARM64 and
its intersection with the sparc64 code. I agree there is a generic need
to mask off tags for kernel use now that ARM64 is also looking into
supporting memory tagging. The need comes from sparc64 not storing
tagged addresses in VMAs. It is not practical to store tagged addresses
in VMAs because manipulation of address tags is done entirely in
userspace on sparc64. Userspace is free to change tags on an address
range at any time without involving the kernel, and constantly rotating
tags is actually even a security feature. This makes it impractical for
the kernel to try to keep up with constantly changing tagged addresses
in VMAs. Untagged addresses in VMAs mean that find_vma() and its
brethren need to be passed an untagged address.

On sparc64, my intent was to support address tagging for dynamically
allocated data buffers only (malloc, mmap and shm specifically) and not
for any generic system calls, which limited the scope and amount of
untagging needed in the kernel. ARM64 is working to add transparent
tagged address support at the C library level. Adding tagged addresses
to the C library requires every possible call into the kernel to either
handle tagged addresses or untag the address at some point. Andrey
found out it is not as easy as untagging addresses in the functions
that search through VMAs: callers of find_vma() and friends tend to do
address arithmetic on the address stored in the VMA that is returned.
This requires a more complex solution than just stripping tags in the
VMA lookup routines.

Since untagging addresses is a generic need that goes far beyond gup,
I prefer the way Andrey wrote it:
<https://patchwork.kernel.org/patch/10923637/>
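
From memory, the shape of that patch is roughly an identity
untagged_addr() in include/linux/mm.h that architectures override:

	/* include/linux/mm.h: no-op unless the arch overrides it */
	#ifndef untagged_addr
	#define untagged_addr(addr)	(addr)
	#endif

	/* arm64, roughly: the tag lives in the top byte, so sign-extend
	 * from bit 55 to clear it */
	#define untagged_addr(addr)	sign_extend64(addr, 55)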

--
Khalid





* Re: [PATCH 4/6] mm: add a gup_fixup_start_addr hook
  2019-05-28 15:57     ` Khalid Aziz
@ 2019-05-29  7:26       ` Christoph Hellwig
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2019-05-29  7:26 UTC (permalink / raw)
  To: Khalid Aziz
  Cc: Linus Torvalds, Christoph Hellwig, Paul Burton, James Hogan,
	Yoshinori Sato, Rich Felker, David S. Miller, Nicholas Piggin,
	linux-mips, Linux-sh list, sparclinux, Linux-MM,
	Linux List Kernel Mailing

On Tue, May 28, 2019 at 09:57:25AM -0600, Khalid Aziz wrote:
> Since untagging addresses is a generic need required for far more than
> gup, I prefer the way Andrey wrote it -
> <https://patchwork.kernel.org/patch/10923637/>

Linus, what do you think of picking up that trivial prep patch for
5.2?  That way the arm64 and get_user_pages series can progress
independently for 5.3.


* Re: [PATCH 4/6] mm: add a gup_fixup_start_addr hook
  2019-05-25 13:32 ` [PATCH 4/6] mm: add a gup_fixup_start_addr hook Christoph Hellwig
  2019-05-25 17:05   ` Linus Torvalds
@ 2019-05-29  8:19   ` Catalin Marinas
  1 sibling, 0 replies; 14+ messages in thread
From: Catalin Marinas @ 2019-05-29  8:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Linus Torvalds, Paul Burton, James Hogan, Yoshinori Sato,
	Rich Felker, David S. Miller, Nicholas Piggin, linux-mips,
	linux-sh, sparclinux, linux-mm, Linux Kernel Mailing List

Hi Christoph,

On Sat, 25 May 2019 at 14:33, Christoph Hellwig <hch@lst.de> wrote:
> diff --git a/mm/gup.c b/mm/gup.c
> index f173fcbaf1b2..1c21ecfbf38b 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2117,6 +2117,10 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
>         } while (pgdp++, addr = next, addr != end);
>  }
>
> +#ifndef gup_fixup_start_addr
> +#define gup_fixup_start_addr(start)    (start)
> +#endif

As you pointed out in a subsequent reply, we could use the
untagged_addr() macro from Andrey (or a shorter "untag_addr" if you
want it to look like a verb).

>  #ifndef gup_fast_permitted
>  /*
>   * Check if it's allowed to use __get_user_pages_fast() for the range, or
> @@ -2145,7 +2149,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>         unsigned long flags;
>         int nr = 0;
>
> -       start &= PAGE_MASK;
> +       start = gup_fixup_start_addr(start) & PAGE_MASK;
>         len = (unsigned long) nr_pages << PAGE_SHIFT;
>         end = start + len;
>
> @@ -2218,7 +2222,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
>         unsigned long addr, len, end;
>         int nr = 0, ret = 0;
>
> -       start &= PAGE_MASK;
> +       start = gup_fixup_start_addr(start) & PAGE_MASK;
>         addr = start;
>         len = (unsigned long) nr_pages << PAGE_SHIFT;
>         end = start + len;

In Andrey's patch [1] we don't fix __get_user_pages_fast(), only
__get_user_pages() as it needs to do a find_vma() search. I wonder
whether this is actually necessary for the *_fast() versions. If the
top byte is non-zero (i.e. tagged address), 'end' would also have the
same tag. The page table macros like pgd_index() and pgd_addr_end()
already take care of masking out the top bits (at least for arm64)
since they need to work on kernel addresses with the top bits all 1. So
gup_pgd_range() should cope with tagged addresses already.

[1] https://lore.kernel.org/lkml/d234cd71774f35229bdfc0a793c34d6712b73093.1557160186.git.andreyknvl@google.com/
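
To illustrate: the generic

	#define pgd_index(addr)	(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

with e.g. arm64 48-bit VAs and 4K pages (PGDIR_SHIFT == 39,
PTRS_PER_PGD == 512) shifts a tag in bits 63:56 down to bits 24:17,
where the & (PTRS_PER_PGD - 1) mask discards it, so the tagged and
untagged address index the same pgd slot.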

-- 
Catalin
