[PATCH RFC] userfaultfd: introduce UFFDIO_COPY_MODE_YOUNG

* [PATCH RFC] userfaultfd: introduce UFFDIO_COPY_MODE_YOUNG
@ 2022-06-13 20:40 Nadav Amit
  2022-06-14 15:22 ` David Hildenbrand
  0 siblings, 1 reply; 18+ messages in thread
From: Nadav Amit @ 2022-06-13 20:40 UTC (permalink / raw)
  To: Peter Xu
  Cc: linux-mm, Nadav Amit, Mike Kravetz, Hugh Dickins, Andrew Morton,
	Axel Rasmussen, David Hildenbrand, Mike Rapoport

From: Nadav Amit <namit@vmware.com>

As we know, the first access through a PTE on x86 whose access-bit (aka
young-bit) is cleared takes ~600 cycles more than when the access-bit is
already set. At the same time, setting the access-bit for memory that is
not actually used (e.g., prefetched memory) can introduce even greater
overheads, as that memory is reclaimed later than it should be.

Userfaultfd currently does not set the access-bit (excluding the
huge-pages case). Arguably, it is best to let the uffd monitor control
whether the access-bit should be set or not. The expected use is for the
monitor to request userfaultfd to set the access-bit when the copy
operation is done to resolve a page-fault, and not to set the young-bit
when the memory is prefetched.

Introduce UFFDIO_COPY_MODE_YOUNG to enable userspace to request the
young bit to be set. For UFFDIO_CONTINUE and UFFDIO_ZEROPAGE, set the bit
unconditionally, since the former is only used to resolve page-faults and
the latter would not benefit from leaving the access-bit clear.

Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Nadav Amit <namit@vmware.com>

---

There are 2 possible enhancements:

1. Use the flag to also decide whether to mark the PTE as dirty (for
writable PTEs). I guess that setting the dirty-bit is as expensive as
setting the access-bit, and setting it introduces tradeoffs similar to
those mentioned above.

2. Introduce a similar mode for write-protect and use this information
for setting both the young and dirty bits. Makes one wonder whether
mprotect() should also set the bit in certain cases...
---
 fs/userfaultfd.c                 |  3 +-
 include/linux/hugetlb.h          |  6 ++--
 include/linux/shmem_fs.h         |  6 ++--
 include/linux/userfaultfd_k.h    |  3 +-
 include/uapi/linux/userfaultfd.h | 14 ++++++++-
 mm/hugetlb.c                     |  7 +++--
 mm/shmem.c                       |  6 ++--
 mm/userfaultfd.c                 | 50 ++++++++++++++++++++++----------
 8 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index aa0c47cb0d16..888a1514b9af 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1721,7 +1721,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	ret = -EINVAL;
 	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
 		goto out;
-	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
+	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
+				 UFFDIO_COPY_MODE_YOUNG))
 		goto out;
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ac2a1d758a80..da8276d78464 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -160,7 +160,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				unsigned long dst_addr,
 				unsigned long src_addr,
 				enum mcopy_atomic_mode mode,
-				struct page **pagep);
+				struct page **pagep,
+				bool young);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
@@ -356,7 +357,8 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 						unsigned long dst_addr,
 						unsigned long src_addr,
 						enum mcopy_atomic_mode mode,
-						struct page **pagep)
+						struct page **pagep,
+						bool young)
 {
 	BUG();
 	return 0;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index ab51d3cd39bd..99565bcbe4ba 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -146,10 +146,12 @@ extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				  unsigned long dst_addr,
 				  unsigned long src_addr,
 				  bool zeropage,
-				  struct page **pagep);
+				  struct page **pagep,
+				  bool young);
 #else /* !CONFIG_SHMEM */
 #define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
-			       src_addr, zeropage, pagep)       ({ BUG(); 0; })
+			       src_addr, zeropage, pagep, young) \
+								({ BUG(); 0; })
 #endif /* CONFIG_SHMEM */
 #endif /* CONFIG_USERFAULTFD */
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 33cea484d1ad..1ad4dc9668b4 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -56,7 +56,8 @@ enum mcopy_atomic_mode {
 extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				    struct vm_area_struct *dst_vma,
 				    unsigned long dst_addr, struct page *page,
-				    bool newly_allocated, bool wp_copy);
+				    bool newly_allocated, bool wp_copy,
+				    bool young);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 			    unsigned long src_start, unsigned long len,
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index ef739054cb1c..4d86829a732a 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -33,7 +33,8 @@
 			   UFFD_FEATURE_THREAD_ID |		\
 			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
 			   UFFD_FEATURE_MINOR_SHMEM |		\
-			   UFFD_FEATURE_EXACT_ADDRESS)
+			   UFFD_FEATURE_EXACT_ADDRESS |		\
+			   UFFD_FEATURE_YOUNG)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -194,6 +195,9 @@ struct uffdio_api {
 	 * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page
 	 * faults would be provided and the offset within the page would not be
 	 * masked.
+	 *
+	 * UFFD_FEATURE_YOUNG indicates that UFFDIO_COPY supports the
+	 * UFFDIO_COPY_MODE_YOUNG mode.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -207,6 +211,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_MINOR_HUGETLBFS		(1<<9)
 #define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
 #define UFFD_FEATURE_EXACT_ADDRESS		(1<<11)
+#define UFFD_FEATURE_YOUNG			(1<<12)
 	__u64 features;
 
 	__u64 ioctls;
@@ -250,6 +255,13 @@ struct uffdio_copy {
 	 * copy_from_user will not read the last 8 bytes.
 	 */
 	__s64 copy;
+	/*
+	 * UFFDIO_COPY_MODE_YOUNG marks the mapped page as young. This can
+	 * reduce the time that the first access to the page takes. Yet, if set
+	 * opportunistically on memory that is not used, it might lengthen the
+	 * time before the unused memory pages are reclaimed.
+	 */
+#define UFFDIO_COPY_MODE_YOUNG			((__u64)1<<2)
 };
 
 struct uffdio_zeropage {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3fc721789743..405651b117e6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5746,7 +5746,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long dst_addr,
 			    unsigned long src_addr,
 			    enum mcopy_atomic_mode mode,
-			    struct page **pagep)
+			    struct page **pagep,
+			    bool young)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct hstate *h = hstate_vma(dst_vma);
@@ -5895,7 +5896,9 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	_dst_pte = make_huge_pte(dst_vma, page, writable);
 	if (writable)
 		_dst_pte = huge_pte_mkdirty(_dst_pte);
-	_dst_pte = pte_mkyoung(_dst_pte);
+
+	if (young)
+		_dst_pte = pte_mkyoung(_dst_pte);
 
 	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 4b2fea33158e..36ccc9609cea 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2319,7 +2319,8 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 			   unsigned long dst_addr,
 			   unsigned long src_addr,
 			   bool zeropage,
-			   struct page **pagep)
+			   struct page **pagep,
+			   bool young)
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2391,7 +2392,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 		goto out_release;
 
 	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       page, true, false);
+				       page, true, false, young);
 	if (ret)
 		goto out_delete_from_cache;
 
@@ -2412,6 +2413,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	shmem_inode_unacct_blocks(inode, 1);
 	return ret;
 }
+
 #endif /* CONFIG_USERFAULTFD */
 
 #ifdef CONFIG_TMPFS
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e9bb6db002aa..9181fe4442c7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -57,7 +57,8 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 			     struct vm_area_struct *dst_vma,
 			     unsigned long dst_addr, struct page *page,
-			     bool newly_allocated, bool wp_copy)
+			     bool newly_allocated, bool wp_copy,
+			     bool young)
 {
 	int ret;
 	pte_t _dst_pte, *dst_pte;
@@ -82,6 +83,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 	else if (writable)
 		_dst_pte = pte_mkwrite(_dst_pte);
 
+	if (young)
+		pte_mkyoung(_dst_pte);
+
 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
 
 	if (vma_is_shmem(dst_vma)) {
@@ -130,7 +134,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long dst_addr,
 			    unsigned long src_addr,
 			    struct page **pagep,
-			    bool wp_copy)
+			    bool wp_copy, bool young)
 {
 	void *page_kaddr;
 	int ret;
@@ -174,7 +178,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 		goto out_release;
 
 	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       page, true, wp_copy);
+				       page, true, wp_copy, young);
 	if (ret)
 		goto out_release;
 out:
@@ -187,7 +191,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 static int mfill_zeropage_pte(struct mm_struct *dst_mm,
 			      pmd_t *dst_pmd,
 			      struct vm_area_struct *dst_vma,
-			      unsigned long dst_addr)
+			      unsigned long dst_addr,
+			      bool young)
 {
 	pte_t _dst_pte, *dst_pte;
 	spinlock_t *ptl;
@@ -210,6 +215,10 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
 	ret = -EEXIST;
 	if (!pte_none(*dst_pte))
 		goto out_unlock;
+
+	if (young)
+		pte_mkyoung(_dst_pte);
+
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
@@ -245,7 +254,7 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
 	}
 
 	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       page, false, wp_copy);
+				       page, false, wp_copy, false);
 	if (ret)
 		goto out_release;
 
@@ -290,7 +299,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      enum mcopy_atomic_mode mode)
+					      enum mcopy_atomic_mode mode,
+					      bool young)
 {
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
@@ -386,7 +396,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		}
 
 		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-					       dst_addr, src_addr, mode, &page);
+					       dst_addr, src_addr, mode, &page,
+					       young);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
@@ -441,7 +452,8 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 				      unsigned long dst_start,
 				      unsigned long src_start,
 				      unsigned long len,
-				      enum mcopy_atomic_mode mode);
+				      enum mcopy_atomic_mode mode,
+				      bool young);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -451,7 +463,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 						unsigned long src_addr,
 						struct page **page,
 						enum mcopy_atomic_mode mode,
-						bool wp_copy)
+						bool wp_copy,
+						bool young)
 {
 	ssize_t err;
 
@@ -474,16 +487,16 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 		if (mode == MCOPY_ATOMIC_NORMAL)
 			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
 					       dst_addr, src_addr, page,
-					       wp_copy);
+					       wp_copy, young);
 		else
 			err = mfill_zeropage_pte(dst_mm, dst_pmd,
-						 dst_vma, dst_addr);
+						 dst_vma, dst_addr, young);
 	} else {
 		VM_WARN_ON_ONCE(wp_copy);
 		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
 					     dst_addr, src_addr,
 					     mode != MCOPY_ATOMIC_NORMAL,
-					     page);
+					     page, young);
 	}
 
 	return err;
@@ -504,6 +517,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	long copied;
 	struct page *page;
 	bool wp_copy;
+	bool young;
 
 	/*
 	 * Sanitize the command parameters:
@@ -557,12 +571,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
 		goto out_unlock;
 
+	young = mode & UFFDIO_COPY_MODE_YOUNG;
+
 	/*
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
 		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-						src_start, len, mcopy_mode);
+						src_start, len, mcopy_mode,
+						young);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
@@ -614,7 +631,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
 		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       src_addr, &page, mcopy_mode, wp_copy);
+				       src_addr, &page, mcopy_mode, wp_copy,
+				       young);
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {
@@ -672,14 +690,14 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
 		       unsigned long len, atomic_t *mmap_changing)
 {
 	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
-			      mmap_changing, 0);
+			      mmap_changing, UFFDIO_COPY_MODE_YOUNG);
 }
 
 ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
 		       unsigned long len, atomic_t *mmap_changing)
 {
 	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
-			      mmap_changing, 0);
+			      mmap_changing, UFFDIO_COPY_MODE_YOUNG);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 18+ messages in thread