All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] mm: adds MAP_NOSIGBUS extension for shmem read
@ 2021-06-01 23:22 Ming Lin
  2021-06-01 23:22 ` [PATCH 1/2] mm: make "vm_flags" be an u64 Ming Lin
  2021-06-01 23:22 ` [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read Ming Lin
  0 siblings, 2 replies; 29+ messages in thread
From: Ming Lin @ 2021-06-01 23:22 UTC (permalink / raw)
  To: Linus Torvalds, Hugh Dickins, Simon Ser; +Cc: linux-mm, linux-kernel, Ming Lin

These 2 patches are based on the discussion of "Sealed memfd & no-fault mmap"
at https://bit.ly/3pdwOGR

patch 1: make "vm_flags" a u64, so that enough flag bits are available on
32-bit architectures. This allows adding VM_NOSIGBUS, which uses bit 38.

patch 2: support no-fault mmap for shmem read

Ming Lin (2):
  mm: make "vm_flags" be an u64
  mm: adds NOSIGBUS extension for out-of-band shmem read

 arch/arm64/Kconfig                     |   1 -
 arch/powerpc/Kconfig                   |   1 -
 arch/x86/Kconfig                       |   1 -
 include/linux/mm.h                     | 102 ++++++++++++++++-----------------
 include/linux/mm_types.h               |   4 +-
 include/linux/mman.h                   |   5 +-
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/Kconfig                             |   2 -
 mm/memory.c                            |   2 +-
 mm/mmap.c                              |   5 +-
 mm/shmem.c                             |  17 +++++-
 11 files changed, 76 insertions(+), 65 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 1/2] mm: make "vm_flags" be an u64
  2021-06-01 23:22 [PATCH 0/2] mm: adds MAP_NOSIGBUS extension for shmem read Ming Lin
@ 2021-06-01 23:22 ` Ming Lin
  2021-06-02  1:58   ` kernel test robot
  2021-06-02  2:06   ` kernel test robot
  2021-06-01 23:22 ` [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read Ming Lin
  1 sibling, 2 replies; 29+ messages in thread
From: Ming Lin @ 2021-06-01 23:22 UTC (permalink / raw)
  To: Linus Torvalds, Hugh Dickins, Simon Ser; +Cc: linux-mm, linux-kernel, Ming Lin

Make "vm_flags" a u64 instead of an unsigned long, so that flag bits
above bit 31 are also available on 32-bit architectures.

Signed-off-by: Ming Lin <mlin@kernel.org>
---
 arch/arm64/Kconfig       |   1 -
 arch/powerpc/Kconfig     |   1 -
 arch/x86/Kconfig         |   1 -
 include/linux/mm.h       | 100 ++++++++++++++++++++++-------------------------
 include/linux/mm_types.h |   4 +-
 include/linux/mman.h     |   4 +-
 mm/Kconfig               |   2 -
 mm/memory.c              |   2 +-
 mm/mmap.c                |   2 +-
 9 files changed, 53 insertions(+), 64 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f1d856..c6960ea 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1658,7 +1658,6 @@ config ARM64_MTE
 	depends on AS_HAS_LSE_ATOMICS
 	# Required for tag checking in the uaccess routines
 	depends on ARM64_PAN
-	select ARCH_USES_HIGH_VMA_FLAGS
 	help
 	  Memory Tagging (part of the ARMv8.5 Extensions) provides
 	  architectural support for run-time, always-on detection of
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 088dd2a..5c1b49e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -940,7 +940,6 @@ config PPC_MEM_KEYS
 	prompt "PowerPC Memory Protection Keys"
 	def_bool y
 	depends on PPC_BOOK3S_64
-	select ARCH_USES_HIGH_VMA_FLAGS
 	select ARCH_HAS_PKEYS
 	help
 	  Memory Protection Keys provides a mechanism for enforcing
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b..a885336 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1874,7 +1874,6 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
 	def_bool y
 	# Note: only available in 64-bit mode
 	depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD)
-	select ARCH_USES_HIGH_VMA_FLAGS
 	select ARCH_HAS_PKEYS
 	help
 	  Memory Protection Keys provides a mechanism for enforcing
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c274f75..e9d67bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -264,73 +264,68 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 extern unsigned int kobjsize(const void *objp);
 #endif
 
+#define VM_FLAGS_BIT(N)	(1ULL << (N))
+
 /*
  * vm_flags in vm_area_struct, see mm_types.h.
  * When changing, update also include/trace/events/mmflags.h
  */
 #define VM_NONE		0x00000000
 
-#define VM_READ		0x00000001	/* currently active flags */
-#define VM_WRITE	0x00000002
-#define VM_EXEC		0x00000004
-#define VM_SHARED	0x00000008
+#define VM_READ		VM_FLAGS_BIT(0)	 /* currently active flags */
+#define VM_WRITE	VM_FLAGS_BIT(1)
+#define VM_EXEC		VM_FLAGS_BIT(2)
+#define VM_SHARED	VM_FLAGS_BIT(3)
 
 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
-#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
-#define VM_MAYWRITE	0x00000020
-#define VM_MAYEXEC	0x00000040
-#define VM_MAYSHARE	0x00000080
-
-#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
-#define VM_UFFD_MISSING	0x00000200	/* missing pages tracking */
-#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
-#define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
-#define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */
-
-#define VM_LOCKED	0x00002000
-#define VM_IO           0x00004000	/* Memory mapped I/O or similar */
-
-					/* Used by sys_madvise() */
-#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
-#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
-
-#define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
-#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
-#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
-#define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
-#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
-#define VM_SYNC		0x00800000	/* Synchronous page faults */
-#define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
-#define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
-#define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
+#define VM_MAYREAD	VM_FLAGS_BIT(4)	 /* limits for mprotect() etc */
+#define VM_MAYWRITE	VM_FLAGS_BIT(5)
+#define VM_MAYEXEC	VM_FLAGS_BIT(6)
+#define VM_MAYSHARE	VM_FLAGS_BIT(7)
+
+#define VM_GROWSDOWN	VM_FLAGS_BIT(8)	 /* general info on the segment */
+#define VM_UFFD_MISSING	VM_FLAGS_BIT(9)	 /* missing pages tracking */
+#define VM_PFNMAP	VM_FLAGS_BIT(10) /* Page-ranges managed without "struct page", just pure PFN */
+#define VM_DENYWRITE	VM_FLAGS_BIT(11) /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP	VM_FLAGS_BIT(12) /* wrprotect pages tracking */
+
+#define VM_LOCKED	VM_FLAGS_BIT(13)
+#define VM_IO           VM_FLAGS_BIT(14) /* Memory mapped I/O or similar */
+
+					 /* Used by sys_madvise() */
+#define VM_SEQ_READ	VM_FLAGS_BIT(15) /* App will access data sequentially */
+#define VM_RAND_READ	VM_FLAGS_BIT(16) /* App will not benefit from clustered reads */
+
+#define VM_DONTCOPY	VM_FLAGS_BIT(17) /* Do not copy this vma on fork */
+#define VM_DONTEXPAND	VM_FLAGS_BIT(18) /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT	VM_FLAGS_BIT(19) /* Lock the pages covered when they are faulted in */
+#define VM_ACCOUNT	VM_FLAGS_BIT(20) /* Is a VM accounted object */
+#define VM_NORESERVE	VM_FLAGS_BIT(21) /* should the VM suppress accounting */
+#define VM_HUGETLB	VM_FLAGS_BIT(22) /* Huge TLB Page VM */
+#define VM_SYNC		VM_FLAGS_BIT(23) /* Synchronous page faults */
+#define VM_ARCH_1	VM_FLAGS_BIT(24) /* Architecture-specific flag */
+#define VM_WIPEONFORK	VM_FLAGS_BIT(25) /* Wipe VMA contents in child. */
+#define VM_DONTDUMP	VM_FLAGS_BIT(26) /* Do not include in the core dump */
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
-# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
+# define VM_SOFTDIRTY	VM_FLAGS_BIT(27) /* Not soft dirty clean area */
 #else
 # define VM_SOFTDIRTY	0
 #endif
 
-#define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
-#define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
-#define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
-#define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
-
-#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
-#define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
-#define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
-#define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
-#define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
-#define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
-#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
+#define VM_MIXEDMAP	VM_FLAGS_BIT(28) /* Can contain "struct page" and pure PFN pages */
+#define VM_HUGEPAGE	VM_FLAGS_BIT(29) /* MADV_HUGEPAGE marked this vma */
+#define VM_NOHUGEPAGE	VM_FLAGS_BIT(30) /* MADV_NOHUGEPAGE marked this vma */
+#define VM_MERGEABLE	VM_FLAGS_BIT(31) /* KSM may merge identical pages */
+
+#define VM_HIGH_ARCH_0	VM_FLAGS_BIT(32)
+#define VM_HIGH_ARCH_1	VM_FLAGS_BIT(33)
+#define VM_HIGH_ARCH_2	VM_FLAGS_BIT(34)
+#define VM_HIGH_ARCH_3	VM_FLAGS_BIT(35)
+#define VM_HIGH_ARCH_4	VM_FLAGS_BIT(36)
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
+# define VM_PKEY_SHIFT	32
 # define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
 # define VM_PKEY_BIT1	VM_HIGH_ARCH_1	/* on x86 and 5-bit value on ppc64   */
 # define VM_PKEY_BIT2	VM_HIGH_ARCH_2
@@ -373,8 +368,7 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT	37
-# define VM_UFFD_MINOR		BIT(VM_UFFD_MINOR_BIT)	/* UFFD minor faults */
+# define VM_UFFD_MINOR		VM_FLAGS_BIT(37)	/* UFFD minor faults */
 #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 # define VM_UFFD_MINOR		VM_NONE
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5aacc1c..5347293 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -264,7 +264,7 @@ struct page_frag_cache {
 	bool pfmemalloc;
 };
 
-typedef unsigned long vm_flags_t;
+typedef u64 vm_flags_t;
 
 /*
  * A region containing a mapping of a non-memory backed file under NOMMU
@@ -330,7 +330,7 @@ struct vm_area_struct {
 	 * See vmf_insert_mixed_prot() for discussion.
 	 */
 	pgprot_t vm_page_prot;
-	unsigned long vm_flags;		/* Flags, see mm.h. */
+	vm_flags_t vm_flags;			/* Flags, see mm.h. */
 
 	/*
 	 * For areas with an address space and backing store,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc..b2cbae9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -135,7 +135,7 @@ static inline bool arch_validate_flags(unsigned long flags)
 /*
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
-static inline unsigned long
+static inline vm_flags_t
 calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
 	return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
@@ -147,7 +147,7 @@ static inline bool arch_validate_flags(unsigned long flags)
 /*
  * Combine the mmap "flags" argument into "vm_flags" used internally.
  */
-static inline unsigned long
+static inline vm_flags_t
 calc_vm_flag_bits(unsigned long flags)
 {
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
diff --git a/mm/Kconfig b/mm/Kconfig
index 02d44e3..aa8efba 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -830,8 +830,6 @@ config DEVICE_PRIVATE
 config VMAP_PFN
 	bool
 
-config ARCH_USES_HIGH_VMA_FLAGS
-	bool
 config ARCH_HAS_PKEYS
 	bool
 
diff --git a/mm/memory.c b/mm/memory.c
index 730daa0..eff2a47 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -550,7 +550,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
 	if (page)
 		dump_page(page, "bad pte");
-	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
+	pr_alert("addr:%px vm_flags:%08llx anon_vma:%px mapping:%px index:%lx\n",
 		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
 	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
 		 vma->vm_file,
diff --git a/mm/mmap.c b/mm/mmap.c
index 0584e54..096bba4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1353,7 +1353,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 }
 
 static inline int mlock_future_check(struct mm_struct *mm,
-				     unsigned long flags,
+				     vm_flags_t flags,
 				     unsigned long len)
 {
 	unsigned long locked, lock_limit;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-01 23:22 [PATCH 0/2] mm: adds MAP_NOSIGBUS extension for shmem read Ming Lin
  2021-06-01 23:22 ` [PATCH 1/2] mm: make "vm_flags" be an u64 Ming Lin
@ 2021-06-01 23:22 ` Ming Lin
  2021-06-02  0:16     ` Linus Torvalds
                     ` (3 more replies)
  1 sibling, 4 replies; 29+ messages in thread
From: Ming Lin @ 2021-06-01 23:22 UTC (permalink / raw)
  To: Linus Torvalds, Hugh Dickins, Simon Ser; +Cc: linux-mm, linux-kernel, Ming Lin

Add a new mmap() flag, MAP_NOSIGBUS, to request the behavior
"don't SIGBUS on read beyond i_size". This flag is only allowed
for read-only shmem mappings.

If you use MAP_NOSIGBUS and access pages that don't have a backing
store, you will get zero pages, and they will NOT BE SYNCHRONIZED with
the backing store if it is updated later.

Any user that uses MAP_NOSIGBUS had better just accept that it's not
compatible with expanding the shmem backing store later.

Signed-off-by: Ming Lin <mlin@kernel.org>
---
 include/linux/mm.h                     |  2 ++
 include/linux/mman.h                   |  1 +
 include/uapi/asm-generic/mman-common.h |  1 +
 mm/mmap.c                              |  3 +++
 mm/shmem.c                             | 17 ++++++++++++++++-
 5 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e9d67bc..5d0e0dc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 # define VM_UFFD_MINOR		VM_NONE
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 
+#define VM_NOSIGBUS		VM_FLAGS_BIT(38)	/* Do not SIGBUS on out-of-band shmem read */
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
 
diff --git a/include/linux/mman.h b/include/linux/mman.h
index b2cbae9..c966b08 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
 	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
+	       _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
 	       arch_calc_vm_flag_bits(flags);
 }
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d..55f4be0 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -29,6 +29,7 @@
 #define MAP_HUGETLB		0x040000	/* create a huge page mapping */
 #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_NOSIGBUS		0x200000	/* do not SIGBUS on out-of-band shmem read */
 
 #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
 					 * uninitialized */
diff --git a/mm/mmap.c b/mm/mmap.c
index 096bba4..69cd856 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1419,6 +1419,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (!len)
 		return -EINVAL;
 
+	if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
+		return -EINVAL;
+
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
 	 *
diff --git a/mm/shmem.c b/mm/shmem.c
index 5d46611..5d15b08 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1812,7 +1812,22 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 repeat:
 	if (sgp <= SGP_CACHE &&
 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
-		return -EINVAL;
+		if (!vma || !(vma->vm_flags & VM_NOSIGBUS))
+			return -EINVAL;
+
+		vma->vm_flags |= VM_MIXEDMAP;
+		/*
+		 * Get zero page for MAP_NOSIGBUS mapping, which isn't
+                 * coherent wrt shmem contents that are expanded and
+		 * filled in later.
+		 */
+		error = vm_insert_page(vma, (unsigned long)vmf->address,
+					ZERO_PAGE(0));
+		if (error)
+			return error;
+
+		*fault_type = VM_FAULT_NOPAGE;
+		return 0;
 	}
 
 	sbinfo = SHMEM_SB(inode->i_sb);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-01 23:22 ` [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read Ming Lin
@ 2021-06-02  0:16     ` Linus Torvalds
  2021-06-02  2:02   ` kernel test robot
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-02  0:16 UTC (permalink / raw)
  To: Ming Lin; +Cc: Hugh Dickins, Simon Ser, Linux-MM, Linux Kernel Mailing List

This series passes my "looks fine, is simple and straightforward" test.

One nit:

On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <mlin@kernel.org> wrote:
>
> +               error = vm_insert_page(vma, (unsigned long)vmf->address,
> +                                       ZERO_PAGE(0));

On architectures where this matters - bad virtual caches - it would be
better to use ZERO_PAGE(vmf->address).

It doesn't make a difference on any sane architecture, but it's the
RightThing(tm) to do.

            Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-02  0:16     ` Linus Torvalds
  0 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-02  0:16 UTC (permalink / raw)
  To: Ming Lin; +Cc: Hugh Dickins, Simon Ser, Linux-MM, Linux Kernel Mailing List

This series passes my "looks fine, is simple and straightforward" test.

One nit:

On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <mlin@kernel.org> wrote:
>
> +               error = vm_insert_page(vma, (unsigned long)vmf->address,
> +                                       ZERO_PAGE(0));

On architectures where this matters - bad virtual caches - it would be
better to use ZERO_PAGE(vmf->address).

It doesn't make a difference on any sane architecture, but it's the
RightThing(tm) to do.

            Linus


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-02  0:16     ` Linus Torvalds
@ 2021-06-02  1:06       ` Ming Lin
  -1 siblings, 0 replies; 29+ messages in thread
From: Ming Lin @ 2021-06-02  1:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hugh Dickins, Simon Ser, Linux-MM, Linux Kernel Mailing List

On 6/1/2021 5:16 PM, Linus Torvalds wrote:
> This series passes my "looks fine, is simple and straightforward" test.
>
> One nit:
>
> On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <mlin@kernel.org> wrote:
>>
>> +               error = vm_insert_page(vma, (unsigned long)vmf->address,
>> +                                       ZERO_PAGE(0));
>
> On architectures where this matters - bad virtual caches - it would be
> better to use ZERO_PAGE(vmf->address).
>
> It doesn't make a difference on any sane architecture, but it's the
> RightThing(tm) to do.


grep -Rn ZERO_PAGE linux/arch/ | grep define

s390 and mips do use the "address" of ZERO_PAGE(address)

Fixed.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-02  1:06       ` Ming Lin
  0 siblings, 0 replies; 29+ messages in thread
From: Ming Lin @ 2021-06-02  1:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hugh Dickins, Simon Ser, Linux-MM, Linux Kernel Mailing List

On 6/1/2021 5:16 PM, Linus Torvalds wrote:
> This series passes my "looks fine, is simple and straightforward" test.
>
> One nit:
>
> On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <mlin@kernel.org> wrote:
>>
>> +               error = vm_insert_page(vma, (unsigned long)vmf->address,
>> +                                       ZERO_PAGE(0));
>
> On architectures where this matters - bad virtual caches - it would be
> better to use ZERO_PAGE(vmf->address).
>
> It doesn't make a difference on any sane architecture, but it's the
> RightThing(tm) to do.


grep -Rn ZERO_PAGE linux/arch/ | grep define

s390 and mips do use the "address" of ZERO_PAGE(address)

Fixed.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/2] mm: make "vm_flags" be an u64
  2021-06-01 23:22 ` [PATCH 1/2] mm: make "vm_flags" be an u64 Ming Lin
@ 2021-06-02  1:58   ` kernel test robot
  2021-06-02  2:06   ` kernel test robot
  1 sibling, 0 replies; 29+ messages in thread
From: kernel test robot @ 2021-06-02  1:58 UTC (permalink / raw)
  To: Ming Lin-SSI, Linus Torvalds, Hugh Dickins, Simon Ser
  Cc: kbuild-all, LKML, linux-mm, Ming Lin-SSI

[-- Attachment #1: Type: text/plain, Size: 36720 bytes --]

Hi Ming,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: x86_64-allyesconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/0b6b8b44f566199698248899d0fef7466ba6b0f3
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
        git checkout 0b6b8b44f566199698248899d0fef7466ba6b0f3
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from drivers/infiniband/hw/hfi1/trace.h:57,
                    from drivers/infiniband/hw/hfi1/file_ops.c:61:
   drivers/infiniband/hw/hfi1/file_ops.c: In function 'hfi1_file_mmap':
>> drivers/infiniband/hw/hfi1/file_ops.c:572:5: warning: format '%lx' expects argument of type 'long unsigned int', but argument 11 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
     572 |     "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
         |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     573 |       ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
     574 |       vma->vm_end - vma->vm_start, vma->vm_flags);
         |                                    ~~~~~~~~~~~~~
         |                                       |
         |                                       vm_flags_t {aka long long unsigned int}
   drivers/infiniband/hw/hfi1/trace_dbg.h:133:33: note: in definition of macro 'hfi1_cdbg'
     133 |  __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
         |                                 ^~~
   drivers/infiniband/hw/hfi1/file_ops.c:572:70: note: format string is defined here
     572 |     "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
         |                                                                    ~~^
         |                                                                      |
         |                                                                      long unsigned int
         |                                                                    %llx
--
   In file included from include/linux/device.h:15,
                    from include/linux/pci.h:37,
                    from drivers/infiniband/hw/qib/qib_file_ops.c:35:
   drivers/infiniband/hw/qib/qib_file_ops.c: In function 'mmap_rcvegrbufs':
>> drivers/infiniband/hw/qib/qib_file_ops.c:849:4: warning: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
     849 |    "Can't map eager buffers as writable (flags=%lx)\n",
         |    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/dev_printk.h:19:22: note: in definition of macro 'dev_fmt'
      19 | #define dev_fmt(fmt) fmt
         |                      ^~~
   drivers/infiniband/hw/qib/qib.h:1472:2: note: in expansion of macro 'dev_info'
    1472 |  dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__)
         |  ^~~~~~~~
   drivers/infiniband/hw/qib/qib_file_ops.c:848:3: note: in expansion of macro 'qib_devinfo'
     848 |   qib_devinfo(dd->pcidev,
         |   ^~~~~~~~~~~
   drivers/infiniband/hw/qib/qib_file_ops.c:849:50: note: format string is defined here
     849 |    "Can't map eager buffers as writable (flags=%lx)\n",
         |                                                ~~^
         |                                                  |
         |                                                  long unsigned int
         |                                                %llx
   In file included from include/linux/device.h:15,
                    from include/linux/pci.h:37,
                    from drivers/infiniband/hw/qib/qib_file_ops.c:35:
   drivers/infiniband/hw/qib/qib_file_ops.c: In function 'mmap_kvaddr':
   drivers/infiniband/hw/qib/qib_file_ops.c:938:6: warning: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
     938 |      "Can't map eager buffers as writable (flags=%lx)\n",
         |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/dev_printk.h:19:22: note: in definition of macro 'dev_fmt'
      19 | #define dev_fmt(fmt) fmt
         |                      ^~~
   drivers/infiniband/hw/qib/qib.h:1472:2: note: in expansion of macro 'dev_info'
    1472 |  dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__)
         |  ^~~~~~~~
   drivers/infiniband/hw/qib/qib_file_ops.c:937:4: note: in expansion of macro 'qib_devinfo'
     937 |    qib_devinfo(dd->pcidev,
         |    ^~~~~~~~~~~
   drivers/infiniband/hw/qib/qib_file_ops.c:938:52: note: format string is defined here
     938 |      "Can't map eager buffers as writable (flags=%lx)\n",
         |                                                  ~~^
         |                                                    |
         |                                                    long unsigned int
         |                                                  %llx
--
   In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_priv.h:48,
                    from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c:38:
   drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c: In function 'kfd_mmio_mmap':
>> drivers/gpu/drm/amd/amdgpu/../amdgpu/amdgpu.h:35:21: warning: format '%lX' expects argument of type 'long unsigned int', but argument 6 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
      35 | #define pr_fmt(fmt) "amdgpu: " fmt
         |                     ^~~~~~~~~~
   include/linux/dynamic_debug.h:129:15: note: in expansion of macro 'pr_fmt'
     129 |   func(&id, ##__VA_ARGS__);  \
         |               ^~~~~~~~~~~
   include/linux/dynamic_debug.h:147:2: note: in expansion of macro '__dynamic_func_call'
     147 |  __dynamic_func_call(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~~~
   include/linux/dynamic_debug.h:157:2: note: in expansion of macro '_dynamic_func_call'
     157 |  _dynamic_func_call(fmt, __dynamic_pr_debug,  \
         |  ^~~~~~~~~~~~~~~~~~
   include/linux/printk.h:424:2: note: in expansion of macro 'dynamic_pr_debug'
     424 |  dynamic_pr_debug(fmt, ##__VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~
   drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c:1957:2: note: in expansion of macro 'pr_debug'
    1957 |  pr_debug("pasid 0x%x mapping mmio page\n"
         |  ^~~~~~~~
   drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c:1960:39: note: format string is defined here
    1960 |    "     vm_flags            == 0x%04lX\n"
         |                                   ~~~~^
         |                                       |
         |                                       long unsigned int
         |                                   %04llX


vim +572 drivers/infiniband/hw/hfi1/file_ops.c

7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  347  
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  348  static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  349  {
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c  Ira Weiny          2015-10-30  350  	struct hfi1_filedata *fd = fp->private_data;
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c  Ira Weiny          2015-10-30  351  	struct hfi1_ctxtdata *uctxt = fd->uctxt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  352  	struct hfi1_devdata *dd;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  353  	unsigned long flags;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  354  	u64 token = vma->vm_pgoff << PAGE_SHIFT,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  355  		memaddr = 0;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  356  	void *memvirt = NULL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  357  	u8 subctxt, mapio = 0, vmf = 0, type;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  358  	ssize_t memlen = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  359  	int ret = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  360  	u16 ctxt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  361  
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  362  	if (!is_valid_mmap(token) || !uctxt ||
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  363  	    !(vma->vm_flags & VM_SHARED)) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  364  		ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  365  		goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  366  	}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  367  	dd = uctxt->dd;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  368  	ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  369  	subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  370  	type = HFI1_MMAP_TOKEN_GET(TYPE, token);
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c  Ira Weiny          2015-10-30  371  	if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  372  		ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  373  		goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  374  	}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  375  
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  376  	flags = vma->vm_flags;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  377  
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  378  	switch (type) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  379  	case PIO_BUFS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  380  	case PIO_BUFS_SOP:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  381  		memaddr = ((dd->physaddr + TXE_PIO_SEND) +
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  382  				/* chip pio base */
d32cf44a62716d drivers/staging/rdma/hfi1/file_ops.c  Amitoj Kaur Chawla 2015-10-16  383  			   (uctxt->sc->hw_context * BIT(16))) +
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  384  				/* 64K PIO space / ctxt */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  385  			(type == PIO_BUFS_SOP ?
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  386  				(TXE_PIO_SIZE / 2) : 0); /* sop? */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  387  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  388  		 * Map only the amount allocated to the context, not the
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  389  		 * entire available context's PIO space.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  390  		 */
437b29d1159af1 drivers/staging/rdma/hfi1/file_ops.c  Amitoj Kaur Chawla 2016-03-04  391  		memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  392  		flags &= ~VM_MAYREAD;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  393  		flags |= VM_DONTCOPY | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  394  		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  395  		mapio = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  396  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  397  	case PIO_CRED:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  398  		if (flags & VM_WRITE) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  399  			ret = -EPERM;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  400  			goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  401  		}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  402  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  403  		 * The credit return location for this context could be on the
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  404  		 * second or third page allocated for credit returns (if number
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  405  		 * of enabled contexts > 64 and 128 respectively).
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  406  		 */
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  407  		memvirt = dd->cr_base[uctxt->numa_id].va;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  408  		memaddr = virt_to_phys(memvirt) +
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  409  			(((u64)uctxt->sc->hw_free -
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  410  			  (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  411  		memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  412  		flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  413  		flags |= VM_DONTCOPY | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  414  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  415  		 * The driver has already allocated memory for credit
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  416  		 * returns and programmed it into the chip. Has that
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  417  		 * memory been flagged as non-cached?
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  418  		 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  419  		/* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  420  		mapio = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  421  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  422  	case RCV_HDRQ:
b25784312840bc drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn   2018-06-20  423  		memlen = rcvhdrq_size(uctxt);
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  424  		memvirt = uctxt->rcvhdrq;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  425  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  426  	case RCV_EGRBUF: {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  427  		unsigned long addr;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  428  		int i;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  429  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  430  		 * The RcvEgr buffer needs to be handled differently
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  431  		 * as multiple non-contiguous pages need to be mapped
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  432  		 * into the user process.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  433  		 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  434  		memlen = uctxt->egrbufs.size;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  435  		if ((vma->vm_end - vma->vm_start) != memlen) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  436  			dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  437  				   (vma->vm_end - vma->vm_start), memlen);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  438  			ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  439  			goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  440  		}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  441  		if (vma->vm_flags & VM_WRITE) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  442  			ret = -EPERM;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  443  			goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  444  		}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  445  		vma->vm_flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  446  		addr = vma->vm_start;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  447  		for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  448  			memlen = uctxt->egrbufs.buffers[i].len;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  449  			memvirt = uctxt->egrbufs.buffers[i].addr;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  450  			ret = remap_pfn_range(
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  451  				vma, addr,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  452  				/*
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  453  				 * virt_to_pfn() does the same, but
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  454  				 * it's not available on x86_64
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  455  				 * when CONFIG_MMU is enabled.
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  456  				 */
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  457  				PFN_DOWN(__pa(memvirt)),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  458  				memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  459  				vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  460  			if (ret < 0)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  461  				goto done;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  462  			addr += memlen;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  463  		}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  464  		ret = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  465  		goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  466  	}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  467  	case UREGS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  468  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  469  		 * Map only the page that contains this context's user
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  470  		 * registers.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  471  		 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  472  		memaddr = (unsigned long)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  473  			(dd->physaddr + RXE_PER_CONTEXT_USER)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  474  			+ (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  475  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  476  		 * TidFlow table is on the same page as the rest of the
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  477  		 * user registers.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  478  		 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  479  		memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  480  		flags |= VM_DONTCOPY | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  481  		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  482  		mapio = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  483  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  484  	case EVENTS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  485  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  486  		 * Use the page where this context's flags are. User level
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  487  		 * knows where its own bitmap is within the page.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  488  		 */
21e5acc06403f6 drivers/infiniband/hw/hfi1/file_ops.c Michael J. Ruhl    2017-09-26  489  		memaddr = (unsigned long)
21e5acc06403f6 drivers/infiniband/hw/hfi1/file_ops.c Michael J. Ruhl    2017-09-26  490  			(dd->events + uctxt_offset(uctxt)) & PAGE_MASK;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  491  		memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  492  		/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  493  		 * v3.7 removes VM_RESERVED but the effect is kept by
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  494  		 * using VM_IO.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  495  		 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  496  		flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  497  		vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  498  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  499  	case STATUS:
7709b0dc265f28 drivers/infiniband/hw/hfi1/file_ops.c Michael J. Ruhl    2019-01-17  500  		if (flags & VM_WRITE) {
12220267645cb7 drivers/infiniband/hw/hfi1/file_ops.c Ira Weiny          2017-04-09  501  			ret = -EPERM;
12220267645cb7 drivers/infiniband/hw/hfi1/file_ops.c Ira Weiny          2017-04-09  502  			goto done;
12220267645cb7 drivers/infiniband/hw/hfi1/file_ops.c Ira Weiny          2017-04-09  503  		}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  504  		memaddr = kvirt_to_phys((void *)dd->status);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  505  		memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  506  		flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  507  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  508  	case RTAIL:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  509  		if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  510  			/*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  511  			 * If the memory allocation failed, the context alloc
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  512  			 * also would have failed, so we would never get here
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  513  			 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  514  			ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  515  			goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  516  		}
2fb3b5ae1ca771 drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn   2019-12-19  517  		if ((flags & VM_WRITE) || !hfi1_rcvhdrtail_kvaddr(uctxt)) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  518  			ret = -EPERM;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  519  			goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  520  		}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  521  		memlen = PAGE_SIZE;
2fb3b5ae1ca771 drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn   2019-12-19  522  		memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  523  		flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  524  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  525  	case SUBCTXT_UREGS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  526  		memaddr = (u64)uctxt->subctxt_uregbase;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  527  		memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  528  		flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  529  		vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  530  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  531  	case SUBCTXT_RCV_HDRQ:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  532  		memaddr = (u64)uctxt->subctxt_rcvhdr_base;
b25784312840bc drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn   2018-06-20  533  		memlen = rcvhdrq_size(uctxt) * uctxt->subctxt_cnt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  534  		flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  535  		vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  536  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  537  	case SUBCTXT_EGRBUF:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  538  		memaddr = (u64)uctxt->subctxt_rcvegrbuf;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  539  		memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  540  		flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  541  		flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  542  		vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  543  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  544  	case SDMA_COMP: {
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c  Ira Weiny          2015-10-30  545  		struct hfi1_user_sdma_comp_q *cq = fd->cq;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  546  
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c  Ira Weiny          2015-10-30  547  		if (!cq) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  548  			ret = -EFAULT;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  549  			goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  550  		}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  551  		memaddr = (u64)cq->comps;
437b29d1159af1 drivers/staging/rdma/hfi1/file_ops.c  Amitoj Kaur Chawla 2016-03-04  552  		memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  553  		flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  554  		vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  555  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  556  	}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  557  	default:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  558  		ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  559  		break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  560  	}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  561  
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  562  	if ((vma->vm_end - vma->vm_start) != memlen) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  563  		hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c  Ira Weiny          2015-10-30  564  			  uctxt->ctxt, fd->subctxt,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  565  			  (vma->vm_end - vma->vm_start), memlen);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  566  		ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  567  		goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  568  	}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  569  
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  570  	vma->vm_flags = flags;
6c63e4238acad0 drivers/staging/rdma/hfi1/file_ops.c  Sebastian Sanchez  2015-11-06  571  	hfi1_cdbg(PROC,
6c63e4238acad0 drivers/staging/rdma/hfi1/file_ops.c  Sebastian Sanchez  2015-11-06 @572  		  "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
6c63e4238acad0 drivers/staging/rdma/hfi1/file_ops.c  Sebastian Sanchez  2015-11-06  573  		    ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  574  		    vma->vm_end - vma->vm_start, vma->vm_flags);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  575  	if (vmf) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  576  		vma->vm_pgoff = PFN_DOWN(memaddr);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  577  		vma->vm_ops = &vm_ops;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  578  		ret = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  579  	} else if (mapio) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  580  		ret = io_remap_pfn_range(vma, vma->vm_start,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  581  					 PFN_DOWN(memaddr),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  582  					 memlen,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  583  					 vma->vm_page_prot);
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  584  	} else if (memvirt) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  585  		ret = remap_pfn_range(vma, vma->vm_start,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  586  				      PFN_DOWN(__pa(memvirt)),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  587  				      memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  588  				      vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  589  	} else {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  590  		ret = remap_pfn_range(vma, vma->vm_start,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  591  				      PFN_DOWN(memaddr),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan   2016-09-06  592  				      memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  593  				      vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  594  	}
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  595  done:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  596  	return ret;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  597  }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c  Mike Marciniszyn   2015-07-30  598  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 65642 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-01 23:22 ` [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read Ming Lin
  2021-06-02  0:16     ` Linus Torvalds
@ 2021-06-02  2:02   ` kernel test robot
  2021-06-02  3:49     ` Hugh Dickins
  2021-06-02  9:30   ` kernel test robot
  3 siblings, 0 replies; 29+ messages in thread
From: kernel test robot @ 2021-06-02  2:02 UTC (permalink / raw)
  To: Ming Lin-SSI, Linus Torvalds, Hugh Dickins, Simon Ser
  Cc: kbuild-all, LKML, linux-mm, Ming Lin-SSI

[-- Attachment #1: Type: text/plain, Size: 7115 bytes --]

Hi Ming,

Thank you for the patch! However, there is still something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: parisc-randconfig-r015-20210601 (attached as .config)
compiler: hppa-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/c14d1ac79e68e85a2ff97e19c36100990b09a7c3
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
        git checkout c14d1ac79e68e85a2ff97e19c36100990b09a7c3
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=parisc 

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from mm/filemap.c:24:
   include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
   include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
--
   In file included from mm/util.c:15:
   include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
   include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
   mm/util.c: In function 'page_mapping':
   mm/util.c:700:15: warning: variable 'entry' set but not used [-Wunused-but-set-variable]
     700 |   swp_entry_t entry;
         |               ^~~~~
--
   In file included from mm/mmap.c:18:
   include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
   include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
   mm/mmap.c: In function 'do_mmap':
>> mm/mmap.c:1422:15: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
    1422 |  if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
         |               ^~~~~~~~~~~~
         |               VM_NOSIGBUS
   In file included from mm/mmap.c:18:
   include/linux/mman.h: In function 'calc_vm_flag_bits':
   include/linux/mman.h:159:1: error: control reaches end of non-void function [-Werror=return-type]
     159 | }
         | ^
   cc1: some warnings being treated as errors
--
   In file included from drivers/char/mem.c:16:
   include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
   include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
     157 |         _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
         |                               ^~~~~~~~~~~~
   include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
     131 |   ((!(bit1) || !(bit2)) ? 0 : \
         |       ^~~~
   drivers/char/mem.c: At top level:
   drivers/char/mem.c:95:29: warning: no previous prototype for 'unxlate_dev_mem_ptr' [-Wmissing-prototypes]
      95 | #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
         |                             ^~~~~~~~~~~~~~~~~~~
   drivers/char/mem.c:96:13: note: in expansion of macro 'unxlate_dev_mem_ptr'
      96 | void __weak unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
         |             ^~~~~~~~~~~~~~~~~~~


vim +157 include/linux/mman.h

   146	
   147	/*
   148	 * Combine the mmap "flags" argument into "vm_flags" used internally.
   149	 */
   150	static inline vm_flags_t
   151	calc_vm_flag_bits(unsigned long flags)
   152	{
   153		return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
   154		       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
   155		       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
   156		       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
 > 157		       _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
   158		       arch_calc_vm_flag_bits(flags);
   159	}
   160	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 21291 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/2] mm: make "vm_flags" be an u64
  2021-06-01 23:22 ` [PATCH 1/2] mm: make "vm_flags" be an u64 Ming Lin
  2021-06-02  1:58   ` kernel test robot
@ 2021-06-02  2:06   ` kernel test robot
  1 sibling, 0 replies; 29+ messages in thread
From: kernel test robot @ 2021-06-02  2:06 UTC (permalink / raw)
  To: Ming Lin-SSI, Linus Torvalds, Hugh Dickins, Simon Ser
  Cc: kbuild-all, LKML, linux-mm, Ming Lin-SSI

[-- Attachment #1: Type: text/plain, Size: 3902 bytes --]

Hi Ming,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: s390-randconfig-r011-20210601 (attached as .config)
compiler: s390-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/0b6b8b44f566199698248899d0fef7466ba6b0f3
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
        git checkout 0b6b8b44f566199698248899d0fef7466ba6b0f3
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=s390 

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/s390/mm/gmap.c: In function 'gmap_mark_unmergeable':
>> arch/s390/mm/gmap.c:2577:25: error: passing argument 5 of 'ksm_madvise' from incompatible pointer type [-Werror=incompatible-pointer-types]
    2577 |       MADV_UNMERGEABLE, &vma->vm_flags);
         |                         ^~~~~~~~~~~~~~
         |                         |
         |                         vm_flags_t * {aka long long unsigned int *}
   In file included from arch/s390/mm/gmap.c:18:
   include/linux/ksm.h:70:49: note: expected 'long unsigned int *' but argument is of type 'vm_flags_t *' {aka 'long long unsigned int *'}
      70 |   unsigned long end, int advice, unsigned long *vm_flags)
         |                                  ~~~~~~~~~~~~~~~^~~~~~~~
   cc1: some warnings being treated as errors


vim +/ksm_madvise +2577 arch/s390/mm/gmap.c

1e133ab296f3ff Martin Schwidefsky    2016-03-08  2568  
fa0c5eabbdd330 Janosch Frank         2019-07-16  2569  int gmap_mark_unmergeable(void)
fa0c5eabbdd330 Janosch Frank         2019-07-16  2570  {
fa0c5eabbdd330 Janosch Frank         2019-07-16  2571  	struct mm_struct *mm = current->mm;
fa0c5eabbdd330 Janosch Frank         2019-07-16  2572  	struct vm_area_struct *vma;
7a2653612bb6f1 Christian Borntraeger 2020-03-27  2573  	int ret;
fa0c5eabbdd330 Janosch Frank         2019-07-16  2574  
fa0c5eabbdd330 Janosch Frank         2019-07-16  2575  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
7a2653612bb6f1 Christian Borntraeger 2020-03-27  2576  		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
7a2653612bb6f1 Christian Borntraeger 2020-03-27 @2577  				  MADV_UNMERGEABLE, &vma->vm_flags);
7a2653612bb6f1 Christian Borntraeger 2020-03-27  2578  		if (ret)
7a2653612bb6f1 Christian Borntraeger 2020-03-27  2579  			return ret;
fa0c5eabbdd330 Janosch Frank         2019-07-16  2580  	}
fa0c5eabbdd330 Janosch Frank         2019-07-16  2581  	mm->def_flags &= ~VM_MERGEABLE;
fa0c5eabbdd330 Janosch Frank         2019-07-16  2582  	return 0;
fa0c5eabbdd330 Janosch Frank         2019-07-16  2583  }
fa0c5eabbdd330 Janosch Frank         2019-07-16  2584  EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
fa0c5eabbdd330 Janosch Frank         2019-07-16  2585  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 16533 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-02  0:16     ` Linus Torvalds
@ 2021-06-02  2:13       ` Hugh Dickins
  -1 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-02  2:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ming Lin, Hugh Dickins, Simon Ser, Linux-MM, Linux Kernel Mailing List

On Tue, 1 Jun 2021, Linus Torvalds wrote:

> This series passes my "looks fine, is simple and straightforward" test.

I'm sorry, but it also passes my "hack that we do not want in shmem.c"
test. I'll say more in response to the preceding mail.

Hugh

> 
> One nit:
> 
> On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <mlin@kernel.org> wrote:
> >
> > +               error = vm_insert_page(vma, (unsigned long)vmf->address,
> > +                                       ZERO_PAGE(0));
> 
> On architectures where this matters - bad virtual caches - it would be
> better to use ZERO_PAGE(vmf->address).
> 
> It doesn't make a difference on any sane architecture, but it's the
> RightThing(tm) to do.
> 
>             Linus
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-02  2:13       ` Hugh Dickins
  0 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-02  2:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ming Lin, Hugh Dickins, Simon Ser, Linux-MM, Linux Kernel Mailing List

On Tue, 1 Jun 2021, Linus Torvalds wrote:

> This series passes my "looks fine, is simple and straightforward" test.

I'm sorry, but it also passes my "hack that we do not want in shmem.c"
test. I'll say more in response to the preceding mail.

Hugh

> 
> One nit:
> 
> On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <mlin@kernel.org> wrote:
> >
> > +               error = vm_insert_page(vma, (unsigned long)vmf->address,
> > +                                       ZERO_PAGE(0));
> 
> On architectures where this matters - bad virtual caches - it would be
> better to use ZERO_PAGE(vmf->address).
> 
> It doesn't make a difference on any sane architecture, but it's the
> RightThing(tm) to do.
> 
>             Linus
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-01 23:22 ` [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read Ming Lin
@ 2021-06-02  3:49     ` Hugh Dickins
  2021-06-02  2:02   ` kernel test robot
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-02  3:49 UTC (permalink / raw)
  To: Ming Lin
  Cc: Linus Torvalds, Hugh Dickins, Simon Ser, linux-mm, linux-kernel,
	linux-fsdevel, linux-api

On Tue, 1 Jun 2021, Ming Lin wrote:

> Adds new flag MAP_NOSIGBUS of mmap() to specify the behavior of
> "don't SIGBUS on read beyond i_size". This flag is only allowed
> for read only shmem mapping.
> 
> If you use MAP_NOSIGBUS, and you access pages that don't have a backing
> store, you will get zero pages, and they will NOT BE SYNCHRONIZED with
> the backing store possibly later being updated.
> 
> Any user that uses MAP_NOSIGBUS had better just accept that it's not
> compatible with expanding the shmem backing store later.
> 
> Signed-off-by: Ming Lin <mlin@kernel.org>

I disagree with Linus on this: I think it's a mistake,
and is being targeted at tmpfs to avoid wider scrutiny.
Though I have a more constructive suggestion under your mmap.c mod.

I've added linux-fsdevel and linux-api to the Cc list:
linux-api definitely needed to approve any MAP_NOSIGBUS semantics;
linux-fsdevel shouldn't be affected, but they need to know about it.

The prior discussion on "Sealed memfd & no-fault mmap" is at
https://lore.kernel.org/linux-mm/vs1Us2sm4qmfvLOqNat0-r16GyfmWzqUzQ4KHbXJwEcjhzeoQ4sBTxx7QXDG9B6zk5AeT7FsNb3CSr94LaKy6Novh1fbbw8D_BBxYsbPLms=@emersion.fr/

I've not yet seen a response from Simon Ser, as to whether this
kind of "opaque blob of zeroes" implementation would be of any
use to Wayland: you expected it to be a problem, and we shouldn't
waste any time on it if it's not going to be useful to someone.

Maybe there will be other takers (certainly SIGBUS is unpopular).

> ---
>  include/linux/mm.h                     |  2 ++
>  include/linux/mman.h                   |  1 +
>  include/uapi/asm-generic/mman-common.h |  1 +
>  mm/mmap.c                              |  3 +++
>  mm/shmem.c                             | 17 ++++++++++++++++-
>  5 files changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e9d67bc..5d0e0dc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
>  # define VM_UFFD_MINOR		VM_NONE
>  #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
>  
> +#define VM_NOSIGBUS		VM_FLAGS_BIT(38)	/* Do not SIGBUS on out-of-band shmem read */

"out-of-band shmem read" means nothing to me: "Do not SIGBUS on fault".

> +
>  /* Bits set in the VMA until the stack is in its final location */
>  #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
>  
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index b2cbae9..c966b08 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
>  	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
>  	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
>  	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
> +	       _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
>  	       arch_calc_vm_flag_bits(flags);
>  }
>  
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index f94f65d..55f4be0 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -29,6 +29,7 @@
>  #define MAP_HUGETLB		0x040000	/* create a huge page mapping */
>  #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
>  #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
> +#define MAP_NOSIGBUS		0x200000	/* do not SIGBUS on out-of-band shmem read */

Ditto.

>  
>  #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
>  					 * uninitialized */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 096bba4..69cd856 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1419,6 +1419,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  	if (!len)
>  		return -EINVAL;
>  
> +	if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
> +		return -EINVAL;
> +

No, for several reasons.

This has nothing to do with shmem really, that's just where this patch
hacks it in - and where you have a first user in mind.  If this goes
forward, please modify mm/memory.c not mm/shmem.c, to make
VM_FAULT_SIGBUS on fault to VM_NOSIGBUS vma do the mapping of zero page.

(prot & PROT_WRITE) tells you about the mmap() flags, but says nothing
about what mprotect() could do later on.  Look out for VM_SHARED and
VM_MAYSHARE and VM_MAYWRITE further down; and beware the else (!file)
block below them, shared anonymous would need more protection too.

Constructive comment: I guess much of my objection to this feature
comes from allowing it in the MAP_SHARED case.  If you restrict it
to MAP_PRIVATE mapping of file, then it's less objectionable, and
you won't have to worry (so much?) about write protection.  Copy
on write is normal there, and it's well established that subsequent
changes in the file will not be shared; you'd just be extending that
behaviour from writes to sigbusy reads.

And by restricting to MAP_PRIVATE, you would allow for adding a
proper MAP_SHARED implementation later, if it's thought useful
(that being the implementation which can subsequently unmap a
zero page to let new page cache be mapped).

>  	/*
>  	 * Does the application expect PROT_READ to imply PROT_EXEC?
>  	 *
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5d46611..5d15b08 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1812,7 +1812,22 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>  repeat:
>  	if (sgp <= SGP_CACHE &&
>  	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
> -		return -EINVAL;
> +		if (!vma || !(vma->vm_flags & VM_NOSIGBUS))
> +			return -EINVAL;
> +
> +		vma->vm_flags |= VM_MIXEDMAP;

No.  Presumably you hit the BUG_ON(mmap_read_trylock(vma->vm_mm))
in vm_insert_page(), so decided to modify the vm_flags here: no,
that BUG is saying you need mmap_write_lock() to write vm_flags.

And I have no idea of the ramifications of shmem in a VM_MIXEDMAP
vma; perhaps it works out fine, but I'd have to research that.
I'd rather not.

> +		/*
> +		 * Get zero page for MAP_NOSIGBUS mapping, which isn't
> +                 * coherent wrt shmem contents that are expanded and
> +		 * filled in later.
> +		 */
> +		error = vm_insert_page(vma, (unsigned long)vmf->address,
> +					ZERO_PAGE(0));
> +		if (error)
> +			return error;
> +
> +		*fault_type = VM_FAULT_NOPAGE;
> +		return 0;

But there are other ways in which shmem_getpage_gfp() can fail and
shmem_fault() end up returning VM_FAULT_SIGBUS.  Notably -ENOSPC.
It's trivial for someone to pass the MAP_NOSIGBUS user the fd of a
sparse file in a full filesystem, causing SIGBUS on access despite
MAP_NOSIGBUS.  On shmem or some other filesystem.

I say the VM_FAULT_SIGBUS->map-in-zero-page handling should be back
in mm/memory.c, where it calls ->fault(): where others can review it.

One other thing while it crosses my mind.  You'll need to decide
what truncating or hole-punching the file does to the zero pages
in its userspace mappings.  I may turn out wrong, but I think you'll
find that truncation removes them, but hole-punch leaves them, and
ought to be modified to remove them too (it's a matter of how the
"even_cows" arg to unmap_mapping_range() is treated).

Hugh

>  	}
>  
>  	sbinfo = SHMEM_SB(inode->i_sb);
> -- 
> 1.8.3.1

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-02  3:49     ` Hugh Dickins
  0 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-02  3:49 UTC (permalink / raw)
  To: Ming Lin
  Cc: Linus Torvalds, Hugh Dickins, Simon Ser, linux-mm, linux-kernel,
	linux-fsdevel, linux-api

On Tue, 1 Jun 2021, Ming Lin wrote:

> Adds new flag MAP_NOSIGBUS of mmap() to specify the behavior of
> "don't SIGBUS on read beyond i_size". This flag is only allowed
> for read only shmem mapping.
> 
> If you use MAP_NOSIGBUS, and you access pages that don't have a backing
> store, you will get zero pages, and they will NOT BE SYNCHRONIZED with
> the backing store possibly later being updated.
> 
> Any user that uses MAP_NOSIGBUS had better just accept that it's not
> compatible with expanding the shmem backing store later.
> 
> Signed-off-by: Ming Lin <mlin@kernel.org>

I disagree with Linus on this: I think it's a mistake,
and is being targeted at tmpfs to avoid wider scrutiny.
Though I have a more constructive suggestion under your mmap.c mod.

I've added linux-fsdevel and linux-api to the Cc list:
linux-api definitely needed to approve any MAP_NOSIGBUS semantics;
linux-fsdevel shouldn't be affected, but they need to know about it.

The prior discussion on "Sealed memfd & no-fault mmap" is at
https://lore.kernel.org/linux-mm/vs1Us2sm4qmfvLOqNat0-r16GyfmWzqUzQ4KHbXJwEcjhzeoQ4sBTxx7QXDG9B6zk5AeT7FsNb3CSr94LaKy6Novh1fbbw8D_BBxYsbPLms=@emersion.fr/

I've not yet seen a response from Simon Ser, as to whether this
kind of "opaque blob of zeroes" implementation would be of any
use to Wayland: you expected it to be a problem, and we shouldn't
waste any time on it if it's not going to be useful to someone.

Maybe there will be other takers (certainly SIGBUS is unpopular).

> ---
>  include/linux/mm.h                     |  2 ++
>  include/linux/mman.h                   |  1 +
>  include/uapi/asm-generic/mman-common.h |  1 +
>  mm/mmap.c                              |  3 +++
>  mm/shmem.c                             | 17 ++++++++++++++++-
>  5 files changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e9d67bc..5d0e0dc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
>  # define VM_UFFD_MINOR		VM_NONE
>  #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
>  
> +#define VM_NOSIGBUS		VM_FLAGS_BIT(38)	/* Do not SIGBUS on out-of-band shmem read */

"out-of-band shmem read" means nothing to me: "Do not SIGBUS on fault".

> +
>  /* Bits set in the VMA until the stack is in its final location */
>  #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
>  
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index b2cbae9..c966b08 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
>  	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
>  	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
>  	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
> +	       _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
>  	       arch_calc_vm_flag_bits(flags);
>  }
>  
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index f94f65d..55f4be0 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -29,6 +29,7 @@
>  #define MAP_HUGETLB		0x040000	/* create a huge page mapping */
>  #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
>  #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
> +#define MAP_NOSIGBUS		0x200000	/* do not SIGBUS on out-of-band shmem read */

Ditto.

>  
>  #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
>  					 * uninitialized */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 096bba4..69cd856 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1419,6 +1419,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  	if (!len)
>  		return -EINVAL;
>  
> +	if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
> +		return -EINVAL;
> +

No, for several reasons.

This has nothing to do with shmem really, that's just where this patch
hacks it in - and where you have a first user in mind.  If this goes
forward, please modify mm/memory.c not mm/shmem.c, to make
VM_FAULT_SIGBUS on fault to VM_NOSIGBUS vma do the mapping of zero page.

(prot & PROT_WRITE) tells you about the mmap() flags, but says nothing
about what mprotect() could do later on.  Look out for VM_SHARED and
VM_MAYSHARE and VM_MAYWRITE further down; and beware the else (!file)
block below them, shared anonymous would need more protection too.

Constructive comment: I guess much of my objection to this feature
comes from allowing it in the MAP_SHARED case.  If you restrict it
to MAP_PRIVATE mapping of file, then it's less objectionable, and
you won't have to worry (so much?) about write protection.  Copy
on write is normal there, and it's well established that subsequent
changes in the file will not be shared; you'd just be extending that
behaviour from writes to sigbusy reads.

And by restricting to MAP_PRIVATE, you would allow for adding a
proper MAP_SHARED implementation later, if it's thought useful
(that being the implementation which can subsequently unmap a
zero page to let new page cache be mapped).

>  	/*
>  	 * Does the application expect PROT_READ to imply PROT_EXEC?
>  	 *
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5d46611..5d15b08 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1812,7 +1812,22 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>  repeat:
>  	if (sgp <= SGP_CACHE &&
>  	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
> -		return -EINVAL;
> +		if (!vma || !(vma->vm_flags & VM_NOSIGBUS))
> +			return -EINVAL;
> +
> +		vma->vm_flags |= VM_MIXEDMAP;

No.  Presumably you hit the BUG_ON(mmap_read_trylock(vma->vm_mm))
in vm_insert_page(), so decided to modify the vm_flags here: no,
that BUG is saying you need mmap_write_lock() to write vm_flags.

And I have no idea of the ramifications of shmem in a VM_MIXEDMAP
vma; perhaps it works out fine, but I'd have to research that.
I'd rather not.

> +		/*
> +		 * Get zero page for MAP_NOSIGBUS mapping, which isn't
> +                 * coherent wrt shmem contents that are expanded and
> +		 * filled in later.
> +		 */
> +		error = vm_insert_page(vma, (unsigned long)vmf->address,
> +					ZERO_PAGE(0));
> +		if (error)
> +			return error;
> +
> +		*fault_type = VM_FAULT_NOPAGE;
> +		return 0;

But there are other ways in which shmem_getpage_gfp() can fail and
shmem_fault() end up returning VM_FAULT_SIGBUS.  Notably -ENOSPC.
It's trivial for someone to pass the MAP_NOSIGBUS user the fd of a
sparse file in a full filesystem, causing SIGBUS on access despite
MAP_NOSIGBUS.  On shmem or some other filesystem.

I say the VM_FAULT_SIGBUS->map-in-zero-page handling should be back
in mm/memory.c, where it calls ->fault(): where others can review it.

One other thing while it crosses my mind.  You'll need to decide
what truncating or hole-punching the file does to the zero pages
in its userspace mappings.  I may turn out wrong, but I think you'll
find that truncation removes them, but hole-punch leaves them, and
ought to be modified to remove them too (it's a matter of how the
"even_cows" arg to unmap_mapping_range() is treated).

Hugh

>  	}
>  
>  	sbinfo = SHMEM_SB(inode->i_sb);
> -- 
> 1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-01 23:22 ` [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read Ming Lin
                     ` (2 preceding siblings ...)
  2021-06-02  3:49     ` Hugh Dickins
@ 2021-06-02  9:30   ` kernel test robot
  3 siblings, 0 replies; 29+ messages in thread
From: kernel test robot @ 2021-06-02  9:30 UTC (permalink / raw)
  To: Ming Lin-SSI, Linus Torvalds, Hugh Dickins, Simon Ser
  Cc: kbuild-all, LKML, linux-mm, Ming Lin-SSI

Hi Ming,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
cd tools/perf && ./check-headers.sh

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


perfheadercheck warnings: (new ones prefixed by >>)
>> Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/mman-common.h' differs from latest version at 'include/uapi/asm-generic/mman-common.h':   32> #define MAP_NOSIGBUS		0x200000	/* do not SIGBUS on out-of-band shmem read */

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-02  3:49     ` Hugh Dickins
  (?)
@ 2021-06-03  0:05     ` Ming Lin
  2021-06-03  0:46         ` Hugh Dickins
  -1 siblings, 1 reply; 29+ messages in thread
From: Ming Lin @ 2021-06-03  0:05 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Simon Ser, linux-mm, linux-kernel, linux-fsdevel,
	linux-api

On 6/1/2021 8:49 PM, Hugh Dickins wrote:

>> index 096bba4..69cd856 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -1419,6 +1419,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>>   	if (!len)
>>   		return -EINVAL;
>>   
>> +	if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
>> +		return -EINVAL;
>> +
> 
> No, for several reasons.
> 
> This has nothing to do with shmem really, that's just where this patch
> hacks it in - and where you have a first user in mind.  If this goes
> forward, please modify mm/memory.c not mm/shmem.c, to make
> VM_FAULT_SIGBUS on fault to VM_NOSIGBUS vma do the mapping of zero page.
> 
> (prot & PROT_WRITE) tells you about the mmap() flags, but says nothing
> about what mprotect() could do later on.  Look out for VM_SHARED and
> VM_MAYSHARE and VM_MAYWRITE further down; and beware the else (!file)
> block below them, shared anonymous would need more protection too.
> 
> Constructive comment: I guess much of my objection to this feature
> comes from allowing it in the MAP_SHARED case.  If you restrict it
> to MAP_PRIVATE mapping of file, then it's less objectionable, and
> you won't have to worry (so much?) about write protection.  Copy
> on write is normal there, and it's well established that subsequent
> changes in the file will not be shared; you'd just be extending that
> behaviour from writes to sigbusy reads.
> 
> And by restricting to MAP_PRIVATE, you would allow for adding a
> proper MAP_SHARED implementation later, if it's thought useful
> (that being the implementation which can subsequently unmap a
> zero page to let new page cache be mapped).

This is what I wrote so far.

---
  include/linux/mm.h                     |  2 ++
  include/linux/mman.h                   |  1 +
  include/uapi/asm-generic/mman-common.h |  1 +
  mm/memory.c                            | 12 ++++++++++++
  mm/mmap.c                              |  4 ++++
  5 files changed, 20 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e9d67bc..af9e277 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
  # define VM_UFFD_MINOR		VM_NONE
  #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
  
+#define VM_NOSIGBUS		VM_FLAGS_BIT(38)	/* Do not SIGBUS on fault */
+
  /* Bits set in the VMA until the stack is in its final location */
  #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
  
diff --git a/include/linux/mman.h b/include/linux/mman.h
index b2cbae9..c966b08 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
  	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
  	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
  	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
+	       _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
  	       arch_calc_vm_flag_bits(flags);
  }
  
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d..a2a5333 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -29,6 +29,7 @@
  #define MAP_HUGETLB		0x040000	/* create a huge page mapping */
  #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
  #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_NOSIGBUS		0x200000	/* do not SIGBUS on fault */
  
  #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
  					 * uninitialized */
diff --git a/mm/memory.c b/mm/memory.c
index eff2a47..7195dac 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3676,6 +3676,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
  	}
  
  	ret = vma->vm_ops->fault(vmf);
+	if (unlikely(ret & VM_FAULT_SIGBUS) && (vma->vm_flags & VM_NOSIGBUS)) {
+		/*
+		 * Get zero page for MAP_NOSIGBUS mapping, which isn't
+		 * coherent wrt shmem contents that are expanded and
+		 * filled in later.
+		 */
+		vma->vm_flags |= VM_MIXEDMAP;
+		if (!vm_insert_page(vma, (unsigned long)vmf->address,
+				ZERO_PAGE(vmf->address)))
+			return VM_FAULT_NOPAGE;
+	}
+
  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
  			    VM_FAULT_DONE_COW)))
  		return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 096bba4..74fb49a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1419,6 +1419,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
  	if (!len)
  		return -EINVAL;
  
+	/* Restrict MAP_NOSIGBUS to MAP_PRIVATE mapping */
+	if ((flags & MAP_NOSIGBUS) && !(flags & MAP_PRIVATE))
+		return -EINVAL;
+
  	/*
  	 * Does the application expect PROT_READ to imply PROT_EXEC?
  	 *

> 
>>   	/*
>>   	 * Does the application expect PROT_READ to imply PROT_EXEC?
>>   	 *
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 5d46611..5d15b08 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1812,7 +1812,22 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>>   repeat:
>>   	if (sgp <= SGP_CACHE &&
>>   	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
>> -		return -EINVAL;
>> +		if (!vma || !(vma->vm_flags & VM_NOSIGBUS))
>> +			return -EINVAL;
>> +
>> +		vma->vm_flags |= VM_MIXEDMAP;
> 
> No.  Presumably you hit the BUG_ON(mmap_read_trylock(vma->vm_mm))
> in vm_insert_page(), so decided to modify the vm_flags here: no,
> that BUG is saying you need mmap_write_lock() to write vm_flags.

But the comments above vm_insert_page() told me to set VM_MIXEDMAP on vma

  * Usually this function is called from f_op->mmap() handler
  * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
  * Caller must set VM_MIXEDMAP on vma if it wants to call this
  * function from other places, for example from page-fault handler.

> 
> One other thing while it crosses my mind.  You'll need to decide
> what truncating or hole-punching the file does to the zero pages
> in its userspace mappings.  I may turn out wrong, but I think you'll
> find that truncation removes them, but hole-punch leaves them, and
> ought to be modified to remove them too (it's a matter of how the
> "even_cows" arg to unmap_mapping_range() is treated).

I did a quick test: after inserting zero pages, it seems that truncation
also leaves the mappings.

I'm still reading code to learn this part ...

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03  0:05     ` Ming Lin
@ 2021-06-03  0:46         ` Hugh Dickins
  0 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-03  0:46 UTC (permalink / raw)
  To: Ming Lin
  Cc: Hugh Dickins, Linus Torvalds, Simon Ser, linux-mm, linux-kernel,
	linux-fsdevel, linux-api

On Wed, 2 Jun 2021, Ming Lin wrote:
> 
> This is what I wrote so far.
> 
> ---
>  include/linux/mm.h                     |  2 ++
>  include/linux/mman.h                   |  1 +
>  include/uapi/asm-generic/mman-common.h |  1 +
>  mm/memory.c                            | 12 ++++++++++++
>  mm/mmap.c                              |  4 ++++
>  5 files changed, 20 insertions(+)

I have not looked at the rest, just looking at mm/memory.c:

> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3676,6 +3676,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
>  	}
>   	ret = vma->vm_ops->fault(vmf);
> +	if (unlikely(ret & VM_FAULT_SIGBUS) && (vma->vm_flags & VM_NOSIGBUS))
> {
> +		/*
> +		 * Get zero page for MAP_NOSIGBUS mapping, which isn't
> +		 * coherent wrt shmem contents that are expanded and
> +		 * filled in later.
> +		 */
> +		vma->vm_flags |= VM_MIXEDMAP;
> +		if (!vm_insert_page(vma, (unsigned long)vmf->address,
> +				ZERO_PAGE(vmf->address)))
> +			return VM_FAULT_NOPAGE;
> +	}
> +
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY
> |
>  			    VM_FAULT_DONE_COW)))
>  		return ret;

Sorry, I directed you to mm/memory.c without indicating what's
appropriate here.  Please don't attempt to use VM_MIXEDMAP and
vm_insert_page(): they're for special driver mmaps, they're no
better here than they were in mm/shmem.c.

It's do_anonymous_page()'s business to map in the zero page on
read fault (see "my_zero_pfn(vmf->address)" in there), or fill
a freshly allocated page with zeroes on write fault - and now
you're sticking to MAP_PRIVATE, write faults in VM_WRITE areas
are okay for VM_NOSIGBUS.

Ideally you can simply call do_anonymous_page() from __do_fault()
in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.  That's what to start
from anyway: but look to see if there's state to be adjusted to
achieve that; and it won't be surprising if somewhere down in
do_anonymous_page() or something it calls, there's a BUG on it
being called when vma->vm_file is set, or something like that.
May need some tweaking.

Hugh

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-03  0:46         ` Hugh Dickins
  0 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-03  0:46 UTC (permalink / raw)
  To: Ming Lin
  Cc: Hugh Dickins, Linus Torvalds, Simon Ser, linux-mm, linux-kernel,
	linux-fsdevel, linux-api

On Wed, 2 Jun 2021, Ming Lin wrote:
> 
> This is what I wrote so far.
> 
> ---
>  include/linux/mm.h                     |  2 ++
>  include/linux/mman.h                   |  1 +
>  include/uapi/asm-generic/mman-common.h |  1 +
>  mm/memory.c                            | 12 ++++++++++++
>  mm/mmap.c                              |  4 ++++
>  5 files changed, 20 insertions(+)

I have not looked at the rest, just looking at mm/memory.c:

> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3676,6 +3676,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
>  	}
>   	ret = vma->vm_ops->fault(vmf);
> +	if (unlikely(ret & VM_FAULT_SIGBUS) && (vma->vm_flags & VM_NOSIGBUS))
> {
> +		/*
> +		 * Get zero page for MAP_NOSIGBUS mapping, which isn't
> +		 * coherent wrt shmem contents that are expanded and
> +		 * filled in later.
> +		 */
> +		vma->vm_flags |= VM_MIXEDMAP;
> +		if (!vm_insert_page(vma, (unsigned long)vmf->address,
> +				ZERO_PAGE(vmf->address)))
> +			return VM_FAULT_NOPAGE;
> +	}
> +
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY
> |
>  			    VM_FAULT_DONE_COW)))
>  		return ret;

Sorry, I directed you to mm/memory.c without indicating what's
appropriate here.  Please don't attempt to use VM_MIXEDMAP and
vm_insert_page(): they're for special driver mmaps, they're no
better here than they were in mm/shmem.c.

It's do_anonymous_page()'s business to map in the zero page on
read fault (see "my_zero_pfn(vmf->address)" in there), or fill
a freshly allocated page with zeroes on write fault - and now
you're sticking to MAP_PRIVATE, write faults in VM_WRITE areas
are okay for VM_NOSIGBUS.

Ideally you can simply call do_anonymous_page() from __do_fault()
in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.  That's what to start
from anyway: but look to see if there's state to be adjusted to
achieve that; and it won't be surprising if somewhere down in
do_anonymous_page() or something it calls, there's a BUG on it
being called when vma->vm_file is set, or something like that.
May need some tweaking.

Hugh


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03  0:46         ` Hugh Dickins
@ 2021-06-03 18:25           ` Linus Torvalds
  -1 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-03 18:25 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Ming Lin, Simon Ser, Linux-MM, Linux Kernel Mailing List,
	linux-fsdevel, Linux API

[-- Attachment #1: Type: text/plain, Size: 1973 bytes --]

On Wed, Jun 2, 2021 at 5:46 PM Hugh Dickins <hughd@google.com> wrote:
>
> Ideally you can simply call do_anonymous_page() from __do_fault()
> in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.

Heh.

We're actually then back to my original patch.

That one doesn't handle shared mappings (even read-only ones), for the
simple reason that do_anonymous_page() refuses to insert anonymous
pages into a shared mapping, and has

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

at the very top.

But yes, if we just remove that check, I think my original patch
should actually "JustWork(tm)".

I'm attaching it again, with old name and old commentary (ie that

    /* FIXME! We don't have a VM_NOFAULT bit */

should just be replaced with that VM_NOSIGBUS bit instead, and the
#if'ed out region should be enabled).

Oh, and we need to think hard about one more case: mprotect().

In particular, I think the attached patch fails horribly for the case
of a shared mapping that starts out read-only, then inserts a zero
page, then somebody does mprotect(PROT_WRITE), and then writes to the
page. I haven't checked what the write protect fault handler does, but
I think that for a shared mapping it will just make the page dirty and
writable.

Which would be horribly wrong for VM_NOSIGBUS.

So that support infrastructure that adds MAP_NOSIGBUS, and checks that
it is only done on a read-only mapping, also has to make sure that it
clears the VM_MAYWRITE bit when it sets VM_NOSIGBUS.

That way mprotect can't then later make it writable.

Hugh, comments on this approach?

Again: this patch is my *OLD* one, I didn't try to update it to the
new world order. It requires

 - Ming's MAP_NOSIGBUS code

 - removal of that "File mapping without ->vm_ops" case

 - that FIXME fixed and name updated

 - and that VM_MAYWRITE clearing if VM_NOSIGBUS is set, to avoid the
mprotect issue.

Hmm?

                  Linus

[-- Attachment #2: VM_NOSIGBUS.patch --]
[-- Type: text/x-patch, Size: 904 bytes --]

 mm/memory.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 550405fc3b5e..bbede6b52f7a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4312,10 +4312,21 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	}
 
 	if (!vmf->pte) {
-		if (vma_is_anonymous(vmf->vma))
-			return do_anonymous_page(vmf);
-		else
-			return do_fault(vmf);
+		if (!vma_is_anonymous(vmf->vma)) {
+			vm_fault_t ret = do_fault(vmf);
+			if (ret & VM_FAULT_RETRY)
+				return ret;
+			if (!(ret & VM_FAULT_SIGBUS))
+				return ret;
+/* FIXME! We don't have a VM_NOFAULT bit */
+#if 0
+			/* See if we should turn a SIGBUS into an anonymous page */
+			if (!(vma->vm_flags & VM_NOFAULT))
+				return ret;
+#endif
+/* Fall back on do_anonymous_page() instead of SIGBUS */
+		}
+		return do_anonymous_page(vmf);
 	}
 
 	if (!pte_present(vmf->orig_pte))

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-03 18:25           ` Linus Torvalds
  0 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-03 18:25 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Ming Lin, Simon Ser, Linux-MM, Linux Kernel Mailing List,
	linux-fsdevel, Linux API

[-- Attachment #1: Type: text/plain, Size: 1973 bytes --]

On Wed, Jun 2, 2021 at 5:46 PM Hugh Dickins <hughd@google.com> wrote:
>
> Ideally you can simply call do_anonymous_page() from __do_fault()
> in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.

Heh.

We're actually then back to my original patch.

That one doesn't handle shared mappings (even read-only ones), for the
simple reason that do_anonymous_page() refuses to insert anonymous
pages into a shared mapping, and has

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

at the very top.

But yes, if we just remove that check, I think my original patch
should actually "JustWork(tm)".

I'm attaching it again, with old name and old commentary (ie that

    /* FIXME! We don't have a VM_NOFAULT bit */

should just be replaced with that VM_NOSIGBUS bit instead, and the
#if'ed out region should be enabled).

Oh, and we need to think hard about one more case: mprotect().

In particular, I think the attached patch fails horribly for the case
of a shared mapping that starts out read-only, then inserts a zero
page, then somebody does mprotect(PROT_WRITE), and then writes to the
page. I haven't checked what the write protect fault handler does, but
I think that for a shared mapping it will just make the page dirty and
writable.

Which would be horribly wrong for VM_NOSIGBUS.

So that support infrastructure that adds MAP_NOSIGBUS, and checks that
it is only done on a read-only mapping, also has to make sure that it
clears the VM_MAYWRITE bit when it sets VM_NOSIGBUS.

That way mprotect can't then later make it writable.

Hugh, comments on this approach?

Again: this patch is my *OLD* one, I didn't try to update it to the
new world order. It requires

 - Ming's MAP_NOSIGBUS code

 - removal of that "File mapping without ->vm_ops" case

 - that FIXME fixed and name updated

 - and that VM_MAYWRITE clearing if VM_NOSIGBUS is set, to avoid the
mprotect issue.

Hmm?

                  Linus

[-- Attachment #2: VM_NOSIGBUS.patch --]
[-- Type: text/x-patch, Size: 904 bytes --]

 mm/memory.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 550405fc3b5e..bbede6b52f7a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4312,10 +4312,21 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	}
 
 	if (!vmf->pte) {
-		if (vma_is_anonymous(vmf->vma))
-			return do_anonymous_page(vmf);
-		else
-			return do_fault(vmf);
+		if (!vma_is_anonymous(vmf->vma)) {
+			vm_fault_t ret = do_fault(vmf);
+			if (ret & VM_FAULT_RETRY)
+				return ret;
+			if (!(ret & VM_FAULT_SIGBUS))
+				return ret;
+/* FIXME! We don't have a VM_NOFAULT bit */
+#if 0
+			/* See if we should turn a SIGBUS into an anonymous page */
+			if (!(vma->vm_flags & VM_NOFAULT))
+				return ret;
+#endif
+/* Fall back on do_anonymous_page() instead of SIGBUS */
+		}
+		return do_anonymous_page(vmf);
 	}
 
 	if (!pte_present(vmf->orig_pte))

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03 18:25           ` Linus Torvalds
@ 2021-06-03 19:07             ` Hugh Dickins
  -1 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-03 19:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hugh Dickins, Ming Lin, Simon Ser, Linux-MM,
	Linux Kernel Mailing List, linux-fsdevel, Linux API

On Thu, 3 Jun 2021, Linus Torvalds wrote:
> On Wed, Jun 2, 2021 at 5:46 PM Hugh Dickins <hughd@google.com> wrote:
> >
> > Ideally you can simply call do_anonymous_page() from __do_fault()
> > in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.
> 
> Heh.
> 
> We're actually then back to my original patch.
> 
> That one doesn't handle shared mappings (even read-only ones), for the
> simple reason that do_anonymous_page() refuses to insert anonymous
> pages into a shared mapping, and has
> 
>         /* File mapping without ->vm_ops ? */
>         if (vma->vm_flags & VM_SHARED)
>                 return VM_FAULT_SIGBUS;
> 
> at the very top.
> 
> But yes, if we just remove that check, I think my original patch
> should actually "JustWork(tm)".

But no!

Sorry, I don't have time for this at present, so haven't looked at
your original patch.

But the point that we've arrived at, that I'm actually now fairly
happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.

I didn't check the placement yet, easy to get wrong, but I believe
Ming Lin is now enforcing that over at the mmap() end.

On a MAP_PRIVATE mapping, the nasty opaque blob of zeroes can
claim some precedent in what already happens with COW'ed pages.

Which leaves MAP_NOSIGBUS on MAP_SHARED as currently unsupported,
perhaps never supported on anything, perhaps one day supported on
shmem; but if it's ever supported then that one will naturally be
transparent to future changes in page cache - we call that "shared".

Of course, internally, there's the in-between case of MAP_SHARED
without PROT_WRITE and without writable fd: VM_MAYSHARE without
VM_SHARED or VM_MAYWRITE.  We *could* let that one accept
MAP_NOSIGBUS, but who wants to write the manpage for it?

Please stick to MAP_PRIVATE: that's good enough.

> 
> I'm attaching it again, with old name and old commentary (ie that
> 
>     /* FIXME! We don't have a VM_NOFAULT bit */
> 
> should just be replaced with that VM_NOSIGBUS bit instead, and the
> #if'ed out region should be enabled).
> 
> Oh, and we need to think hard about one more case: mprotect().
> 
> In particular, I think the attached patch fails horribly for the case
> of a shared mapping that starts out read-only, then inserts a zero
> page, then somebody does mprotect(PROT_WRITE), and then writes to the
> page. I haven't checked what the write protect fault handler does, but
> I think that for a shared mapping it will just make the page dirty and
> writable.

Obviously the finished patch will need to be scrutinized carefully, but
I think the mprotect() questions vanish when restricted to MAP_PRIVATE.

> 
> Which would be horribly wrong for VM_NOSIGBUS.
> 
> So that support infrastructure that adds MAP_NOSIGBUS, and checks that
> it is only done on a read-only mapping, also has to make sure that it
> clears the VM_MAYWRITE bit when it sets VM_NOSIGBUS.
> 
> That way mprotect can't then later make it writable.
> 
> Hugh, comments on this approach?

Comments above, just stick to MAP_PRIVATE.

Hugh

> 
> Again: this patch is my *OLD* one, I didn't try to update it to the
> new world order. It requires
> 
>  - Ming's MAP_NOSIGBUS code
> 
>  - removal of that "File mapping without ->vm_ops" case
> 
>  - that FIXME fixed and name updated
> 
>  - and that VM_MAYWRITE clearing if VM_NOSIGBUS is set, to avoid the
> mprotect issue.
> 
> Hmm?
> 
>                   Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-03 19:07             ` Hugh Dickins
  0 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2021-06-03 19:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hugh Dickins, Ming Lin, Simon Ser, Linux-MM,
	Linux Kernel Mailing List, linux-fsdevel, Linux API

On Thu, 3 Jun 2021, Linus Torvalds wrote:
> On Wed, Jun 2, 2021 at 5:46 PM Hugh Dickins <hughd@google.com> wrote:
> >
> > Ideally you can simply call do_anonymous_page() from __do_fault()
> > in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.
> 
> Heh.
> 
> We're actually then back to my original patch.
> 
> That one doesn't handle shared mappings (even read-only ones), for the
> simple reason that do_anonymous_page() refuses to insert anonymous
> pages into a shared mapping, and has
> 
>         /* File mapping without ->vm_ops ? */
>         if (vma->vm_flags & VM_SHARED)
>                 return VM_FAULT_SIGBUS;
> 
> at the very top.
> 
> But yes, if we just remove that check, I think my original patch
> should actually "JustWork(tm)".

But no!

Sorry, I don't have time for this at present, so haven't looked at
your original patch.

But the point that we've arrived at, that I'm actually now fairly
happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.

I didn't check the placement yet, easy to get wrong, but I believe
Ming Lin is now enforcing that over at the mmap() end.

On a MAP_PRIVATE mapping, the nasty opaque blob of zeroes can
claim some precedent in what already happens with COW'ed pages.

Which leaves MAP_NOSIGBUS on MAP_SHARED as currently unsupported,
perhaps never supported on anything, perhaps one day supported on
shmem; but if it's ever supported then that one will naturally be
transparent to future changes in page cache - we call that "shared".

Of course, internally, there's the in-between case of MAP_SHARED
without PROT_WRITE and without writable fd: VM_MAYSHARE without
VM_SHARED or VM_MAYWRITE.  We *could* let that one accept
MAP_NOSIGBUS, but who wants to write the manpage for it?

Please stick to MAP_PRIVATE: that's good enough.

> 
> I'm attaching it again, with old name and old commentary (ie that
> 
>     /* FIXME! We don't have a VM_NOFAULT bit */
> 
> should just be replaced with that VM_NOSIGBUS bit instead, and the
> #if'ed out region should be enabled).
> 
> Oh, and we need to think hard about one more case: mprotect().
> 
> In particular, I think the attached patch fails horribly for the case
> of a shared mapping that starts out read-only, then inserts a zero
> page, then somebody does mprotect(PROT_WRITE), and then writes to the
> page. I haven't checked what the write protect fault handler does, but
> I think that for a shared mapping it will just make the page dirty and
> writable.

Obviously the finished patch will need to be scrutinized carefully, but
I think the mprotect() questions vanish when restricted to MAP_PRIVATE.

> 
> Which would be horribly wrong for VM_NOSIGBUS.
> 
> So that support infrastructure that adds MAP_NOSIGBUS, and checks that
> it is only done on a read-only mapping, also has to make sure that it
> clears the VM_MAYWRITE bit when it sets VM_NOSIGBUS.
> 
> That way mprotect can't then later make it writable.
> 
> Hugh, comments on this approach?

Comments above, just stick to MAP_PRIVATE.

Hugh

> 
> Again: this patch is my *OLD* one, I didn't try to update it to the
> new world order. It requires
> 
>  - Ming's MAP_NOSIGBUS code
> 
>  - removal of that "File mapping without ->vm_ops" case
> 
>  - that FIXME fixed and name updated
> 
>  - and that VM_MAYWRITE clearing if VM_NOSIGBUS is set, to avoid the
> mprotect issue.
> 
> Hmm?
> 
>                   Linus


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03 19:07             ` Hugh Dickins
@ 2021-06-03 19:12               ` Linus Torvalds
  -1 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-03 19:12 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Ming Lin, Simon Ser, Linux-MM, Linux Kernel Mailing List,
	linux-fsdevel, Linux API

On Thu, Jun 3, 2021 at 12:07 PM Hugh Dickins <hughd@google.com> wrote:
>
> But the point that we've arrived at, that I'm actually now fairly
> happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.

Yeah, if that's sufficient, then that original patch should just work as-is.

But there was some reason why people didn't like that patch
originally, and I think it was literally about how it only worked on
private mappings (the "we don't have a flag for it in the vm_flags"
part was just a small detail).

I guess that objection ended up changing over time.

            Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-03 19:12               ` Linus Torvalds
  0 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-03 19:12 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Ming Lin, Simon Ser, Linux-MM, Linux Kernel Mailing List,
	linux-fsdevel, Linux API

On Thu, Jun 3, 2021 at 12:07 PM Hugh Dickins <hughd@google.com> wrote:
>
> But the point that we've arrived at, that I'm actually now fairly
> happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.

Yeah, if that's sufficient, then that original patch should just work as-is.

But there was some reason why people didn't like that patch
originally, and I think it was literally about how it only worked on
private mappings (the "we don't have a flag for it in the vm_flags"
part was just a small detail).

I guess that objection ended up changing over time.

            Linus


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03 19:12               ` Linus Torvalds
@ 2021-06-03 19:15                 ` Linus Torvalds
  -1 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-03 19:15 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Ming Lin, Simon Ser, Linux-MM, Linux Kernel Mailing List,
	linux-fsdevel, Linux API

On Thu, Jun 3, 2021 at 12:12 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Yeah, if that's sufficient, then that original patch should just work as-is.

To clarify: it obviously needs the VM_xyz flags things, but the
VM_SHARED check in do_anonymous_page() is fine, and the whole issue
with VM_MAYWRITE is entirely moot.

MAP_PRIVATE works fine with zero pages even when writable - they get
COW'ed properly, of course.

               Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
@ 2021-06-03 19:15                 ` Linus Torvalds
  0 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2021-06-03 19:15 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Ming Lin, Simon Ser, Linux-MM, Linux Kernel Mailing List,
	linux-fsdevel, Linux API

On Thu, Jun 3, 2021 at 12:12 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Yeah, if that's sufficient, then that original patch should just work as-is.

To clarify: it obviously needs the VM_xyz flags things, but the
VM_SHARED check in do_anonymous_page() is fine, and the whole issue
with VM_MAYWRITE is entirely moot.

MAP_PRIVATE works fine with zero pages even when writable - they get
COW'ed properly, of course.

               Linus


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03 19:12               ` Linus Torvalds
  (?)
  (?)
@ 2021-06-03 19:24               ` Andy Lutomirski
  2021-06-03 19:35                 ` Simon Ser
  -1 siblings, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2021-06-03 19:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hugh Dickins, Ming Lin, Simon Ser, Linux-MM,
	Linux Kernel Mailing List, linux-fsdevel, Linux API



> On Jun 3, 2021, at 12:14 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> On Thu, Jun 3, 2021 at 12:07 PM Hugh Dickins <hughd@google.com> wrote:
>> 
>> But the point that we've arrived at, that I'm actually now fairly
>> happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.
> 
> Yeah, if that's sufficient, then that original patch should just work as-is.
> 
> But there was some reason why people didn't like that patch
> originally, and I think it was literally about how it only worked on
> private mappings (the "we don't have a flag for it in the vm_flags"
> part was just a small detail).
> 
> I guess that objection ended up changing over time.
> 
> 

I don’t understand the use case well enough to comment on whether MAP_PRIVATE is sufficient, but I’m with Hugh: if this feature is implemented for MAP_SHARED, it should be fully coherent.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03 19:24               ` Andy Lutomirski
@ 2021-06-03 19:35                 ` Simon Ser
  0 siblings, 0 replies; 29+ messages in thread
From: Simon Ser @ 2021-06-03 19:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Linus Torvalds, Hugh Dickins, Ming Lin, Linux-MM,
	Linux Kernel Mailing List, linux-fsdevel, Linux API

On Thursday, June 3rd, 2021 at 9:24 PM, Andy Lutomirski <luto@amacapital.net> wrote:

> I don’t understand the use case well enough to comment on whether MAP_PRIVATE
> is sufficient, but I’m with Hugh: if this feature is implemented for
> MAP_SHARED, it should be fully coherent.

I've tried to explain what we'd need from user-space PoV in [1].
tl;dr the MAP_PRIVATE restriction would get us pretty far, even if it
won't allow us to have all of the bells and whistles.

[1]: https://lore.kernel.org/linux-mm/vs1Us2sm4qmfvLOqNat0-r16GyfmWzqUzQ4KHbXJwEcjhzeoQ4sBTxx7QXDG9B6zk5AeT7FsNb3CSr94LaKy6Novh1fbbw8D_BBxYsbPLms=@emersion.fr/T/#mb321a8d39e824740877ba95f1df780ffd52c3862

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read
  2021-06-03  0:46         ` Hugh Dickins
  (?)
  (?)
@ 2021-06-03 19:57         ` Ming Lin
  -1 siblings, 0 replies; 29+ messages in thread
From: Ming Lin @ 2021-06-03 19:57 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Simon Ser, linux-mm, linux-kernel, linux-fsdevel,
	linux-api

On 6/2/2021 5:46 PM, Hugh Dickins wrote:
> 
> It's do_anonymous_page()'s business to map in the zero page on
> read fault (see "my_zero_pfn(vmf->address)" in there), or fill
> a freshly allocated page with zeroes on write fault - and now
> you're sticking to MAP_PRIVATE, write faults in VM_WRITE areas
> are okay for VM_NOSIGBUS.
> 
> Ideally you can simply call do_anonymous_page() from __do_fault()
> in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.  That's what to start
> from anyway: but look to see if there's state to be adjusted to
> achieve that; and it won't be surprising if somewhere down in
> do_anonymous_page() or something it calls, there's a BUG on it
> being called when vma->vm_file is set, or something like that.
> May need some tweaking.

do_anonymous_page() works nicely for read fault and write fault.
I didn't see any BUG() thing in my test.

But I'm still struggling with how to do "punch hole should remove the mapping of zero page".
Here is the hack I have now.

diff --git a/mm/memory.c b/mm/memory.c
index 46ecda5..6b5a897 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1241,7 +1241,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         struct page *page;
  
                         page = vm_normal_page(vma, addr, ptent);
-                       if (unlikely(details) && page) {
+                       if (unlikely(details) && page && !(vma->vm_flags & VM_NOSIGBUS)) {
                                 /*
                                  * unmap_shared_mapping_pages() wants to
                                  * invalidate cache without truncating:


The other parts of the patch are as follows:

----

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e9d67bc..af9e277 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
  # define VM_UFFD_MINOR		VM_NONE
  #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
  
+#define VM_NOSIGBUS		VM_FLAGS_BIT(38)	/* Do not SIGBUS on fault */
+
  /* Bits set in the VMA until the stack is in its final location */
  #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
  
diff --git a/include/linux/mman.h b/include/linux/mman.h
index b2cbae9..c966b08 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
  	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
  	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
  	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
+	       _calc_vm_trans(flags, MAP_NOSIGBUS,   VM_NOSIGBUS  ) |
  	       arch_calc_vm_flag_bits(flags);
  }
  
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d..a2a5333 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -29,6 +29,7 @@
  #define MAP_HUGETLB		0x040000	/* create a huge page mapping */
  #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
  #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_NOSIGBUS		0x200000	/* do not SIGBUS on fault */
  
  #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
  					 * uninitialized */
diff --git a/mm/memory.c b/mm/memory.c
index eff2a47..46ecda5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3676,6 +3676,17 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
  	}
  
  	ret = vma->vm_ops->fault(vmf);
+	if (unlikely(ret & VM_FAULT_SIGBUS) && (vma->vm_flags & VM_NOSIGBUS)) {
+		/*
+		 * For MAP_NOSIGBUS mapping, map in the zero page on read fault
+		 * or fill a freshly allocated page with zeroes on write fault
+		 */
+		ret = do_anonymous_page(vmf);
+		if (!ret)
+			ret = VM_FAULT_NOPAGE;
+		return ret;
+	}
+
  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
  			    VM_FAULT_DONE_COW)))
  		return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 096bba4..74fb49a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1419,6 +1419,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
  	if (!len)
  		return -EINVAL;
  
+	/* MAP_NOSIGBUS is only supported for MAP_PRIVATE mappings */
+	if ((flags & MAP_NOSIGBUS) && !(flags & MAP_PRIVATE))
+		return -EINVAL;
+
  	/*
  	 * Does the application expect PROT_READ to imply PROT_EXEC?
  	 *

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2021-06-03 19:57 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-01 23:22 [PATCH 0/2] mm: adds MAP_NOSIGBUS extension for shmem read Ming Lin
2021-06-01 23:22 ` [PATCH 1/2] mm: make "vm_flags" be an u64 Ming Lin
2021-06-02  1:58   ` kernel test robot
2021-06-02  2:06   ` kernel test robot
2021-06-01 23:22 ` [PATCH 2/2] mm: adds NOSIGBUS extension for out-of-band shmem read Ming Lin
2021-06-02  0:16   ` Linus Torvalds
2021-06-02  0:16     ` Linus Torvalds
2021-06-02  1:06     ` Ming Lin
2021-06-02  1:06       ` Ming Lin
2021-06-02  2:13     ` Hugh Dickins
2021-06-02  2:13       ` Hugh Dickins
2021-06-02  2:02   ` kernel test robot
2021-06-02  3:49   ` Hugh Dickins
2021-06-02  3:49     ` Hugh Dickins
2021-06-03  0:05     ` Ming Lin
2021-06-03  0:46       ` Hugh Dickins
2021-06-03  0:46         ` Hugh Dickins
2021-06-03 18:25         ` Linus Torvalds
2021-06-03 18:25           ` Linus Torvalds
2021-06-03 19:07           ` Hugh Dickins
2021-06-03 19:07             ` Hugh Dickins
2021-06-03 19:12             ` Linus Torvalds
2021-06-03 19:12               ` Linus Torvalds
2021-06-03 19:15               ` Linus Torvalds
2021-06-03 19:15                 ` Linus Torvalds
2021-06-03 19:24               ` Andy Lutomirski
2021-06-03 19:35                 ` Simon Ser
2021-06-03 19:57         ` Ming Lin
2021-06-02  9:30   ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.