All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Xu <peterx@redhat.com>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	peterx@redhat.com, Jerome Glisse <jglisse@redhat.com>,
	"Kirill A . Shutemov" <kirill@shutemov.name>,
	Hugh Dickins <hughd@google.com>,
	Axel Rasmussen <axelrasmussen@google.com>,
	Matthew Wilcox <willy@infradead.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Nadav Amit <nadav.amit@gmail.com>
Subject: [PATCH RFC 13/30] shmem/userfaultfd: Persist uffd-wp bit across zapping for file-backed
Date: Fri, 15 Jan 2021 12:08:50 -0500	[thread overview]
Message-ID: <20210115170907.24498-14-peterx@redhat.com> (raw)
In-Reply-To: <20210115170907.24498-1-peterx@redhat.com>

File-backed memory is prone to being unmapped at any time.  It means all
information in the pte will be dropped, including the uffd-wp flag.

Since the uffd-wp info cannot be stored in page cache or swap cache, persist
this wr-protect information by installing the special uffd-wp marker pte when
we're going to unmap a uffd wr-protected pte.  When the pte is accessed again,
we will know it's previously wr-protected by recognizing the special pte.

Meanwhile add a new flag ZAP_FLAG_DROP_FILE_UFFD_WP when we don't want to
persist such an information.  For example, when destroying the whole vma, or
punching a hole in a shmem file.  For the latter, we can only drop the uffd-wp
bit when holding the page lock.  It means the unmap_mapping_range() in
shmem_fallocate() still reuqires to zap without ZAP_FLAG_DROP_FILE_UFFD_WP
because that's still racy with the page faults.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/mm.h        | 11 ++++++++++
 include/linux/mm_inline.h | 43 +++++++++++++++++++++++++++++++++++++++
 mm/memory.c               | 42 +++++++++++++++++++++++++++++++++++++-
 mm/rmap.c                 |  8 ++++++++
 mm/truncate.c             |  8 +++++++-
 5 files changed, 110 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57bb3d680844..e4aba745be62 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1634,6 +1634,8 @@ extern void user_shm_unlock(size_t, struct user_struct *);
 #define  ZAP_FLAG_CHECK_MAPPING             BIT(0)
 /* Whether to skip zapping swap entries */
 #define  ZAP_FLAG_SKIP_SWAP                 BIT(1)
+/* Whether to completely drop uffd-wp entries for file-backed memory */
+#define  ZAP_FLAG_DROP_FILE_UFFD_WP         BIT(2)
 
 /*
  * Parameter block passed down to zap_pte_range in exceptional cases.
@@ -1666,6 +1668,15 @@ zap_skip_swap(struct zap_details *details)
 	return details->zap_flags & ZAP_FLAG_SKIP_SWAP;
 }
 
+static inline bool
+zap_drop_file_uffd_wp(struct zap_details *details)
+{
+	if (!details)
+		return false;
+
+	return details->zap_flags & ZAP_FLAG_DROP_FILE_UFFD_WP;
+}
+
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8fc71e9d7bb0..da4c710859e6 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -4,6 +4,8 @@
 
 #include <linux/huge_mm.h>
 #include <linux/swap.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/swapops.h>
 
 /**
  * page_is_file_lru - should the page be on a file LRU or anon LRU?
@@ -125,4 +127,45 @@ static __always_inline enum lru_list page_lru(struct page *page)
 	}
 	return lru;
 }
+
+/*
+ * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
+ * replace a none pte.  NOTE!  This should only be called when *pte is already
+ * cleared so we will never accidentally replace something valuable.  Meanwhile
+ * none pte also means we are not demoting the pte so if tlb flushed then we
+ * don't need to do it again; otherwise if tlb flush is postponed then it's
+ * even better.
+ *
+ * Must be called with pgtable lock held.
+ */
+static inline void
+pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
+			      pte_t *pte, pte_t pteval)
+{
+#ifdef CONFIG_USERFAULTFD
+	bool arm_uffd_pte = false;
+
+	/* The current status of the pte should be "cleared" before calling */
+	WARN_ON_ONCE(!pte_none(*pte));
+
+	if (vma_is_anonymous(vma))
+		return;
+
+	/* A uffd-wp wr-protected normal pte */
+	if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval)))
+		arm_uffd_pte = true;
+
+	/*
+	 * A uffd-wp wr-protected swap pte.  Note: this should even work for
+	 * pte_swp_uffd_wp_special() too.
+	 */
+	if (unlikely(is_swap_pte(pteval) && pte_swp_uffd_wp(pteval)))
+		arm_uffd_pte = true;
+
+	if (unlikely(arm_uffd_pte))
+		set_pte_at(vma->vm_mm, addr, pte,
+			   pte_swp_mkuffd_wp_special(vma));
+#endif
+}
+
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index afe09fccdee1..f87b5a8a098e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -73,6 +73,7 @@
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
 
 #include <trace/events/kmem.h>
 
@@ -1194,6 +1195,21 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 	return ret;
 }
 
+/*
+ * This function makes sure that we'll replace the none pte with an uffd-wp
+ * swap special pte marker when necessary. Must be with the pgtable lock held.
+ */
+static inline void
+zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
+			      unsigned long addr, pte_t *pte,
+			      struct zap_details *details, pte_t pteval)
+{
+	if (zap_drop_file_uffd_wp(details))
+		return;
+
+	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -1231,6 +1247,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
+			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
+						      ptent);
 			if (unlikely(!page))
 				continue;
 
@@ -1255,6 +1273,22 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		}
 
+		/*
+		 * If this is a special uffd-wp marker pte... Drop it only if
+		 * enforced to do so.
+		 */
+		if (unlikely(is_swap_special_pte(ptent))) {
+			WARN_ON_ONCE(!pte_swp_uffd_wp_special(ptent));
+			/*
+			 * If this is a common unmap of ptes, keep this as is.
+			 * Drop it only if this is a whole-vma destruction.
+			 */
+			if (zap_drop_file_uffd_wp(details))
+				ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			continue;
+		}
+
 		entry = pte_to_swp_entry(ptent);
 		if (is_device_private_entry(entry)) {
 			struct page *page = device_private_entry_to_page(entry);
@@ -1265,6 +1299,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			rss[mm_counter(page)]--;
 			page_remove_rmap(page, false);
 			put_page(page);
+			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
+						      ptent);
 			continue;
 		}
 
@@ -1282,6 +1318,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		if (unlikely(!free_swap_and_cache(entry)))
 			print_bad_pte(vma, addr, ptent, NULL);
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	add_mm_rss_vec(mm, rss);
@@ -1481,12 +1518,15 @@ void unmap_vmas(struct mmu_gather *tlb,
 		unsigned long end_addr)
 {
 	struct mmu_notifier_range range;
+	struct zap_details details = {
+		.zap_flags = ZAP_FLAG_DROP_FILE_UFFD_WP,
+	};
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
 				start_addr, end_addr);
 	mmu_notifier_invalidate_range_start(&range);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
-		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
+		unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
 	mmu_notifier_invalidate_range_end(&range);
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 31b29321adfe..f6cc0b9b1963 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,7 @@
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
 
@@ -1560,6 +1561,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			pteval = ptep_clear_flush(vma, address, pvmw.pte);
 		}
 
+		/*
+		 * Now the pte is cleared.  If this is uffd-wp armed pte, we
+		 * may want to replace a none pte with a marker pte if it's
+		 * file-backed, so we don't lose the tracking information.
+		 */
+		pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
+
 		/* Move the dirty bit to the page. Now the pte is gone. */
 		if (pte_dirty(pteval))
 			set_page_dirty(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index dac66749e400..35df3b1d301e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -179,7 +179,13 @@ truncate_cleanup_page(struct address_space *mapping, struct page *page)
 	if (page_mapped(page)) {
 		unsigned int nr = thp_nr_pages(page);
 		unmap_mapping_pages(mapping, page->index, nr,
-				    ZAP_FLAG_CHECK_MAPPING);
+				    ZAP_FLAG_CHECK_MAPPING |
+				    /*
+				     * Now it's safe to drop uffd-wp because
+				     * we're with page lock, and the page is
+				     * being truncated.
+				     */
+				    ZAP_FLAG_DROP_FILE_UFFD_WP);
 	}
 
 	if (page_has_private(page))
-- 
2.26.2


  parent reply	other threads:[~2021-01-15 17:13 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-15 17:08 [PATCH RFC 00/30] userfaultfd-wp: Support shmem and hugetlbfs Peter Xu
2021-01-15 17:08 ` [PATCH RFC 01/30] mm/thp: Simplify copying of huge zero page pmd when fork Peter Xu
2021-01-15 17:08 ` [PATCH RFC 02/30] mm/userfaultfd: Fix uffd-wp special cases for fork() Peter Xu
2021-01-15 17:08 ` [PATCH RFC 03/30] mm/userfaultfd: Fix a few thp pmd missing uffd-wp bit Peter Xu
2021-01-15 17:08 ` [PATCH RFC 04/30] shmem/userfaultfd: Take care of UFFDIO_COPY_MODE_WP Peter Xu
2021-01-15 17:08 ` [PATCH RFC 05/30] mm: Clear vmf->pte after pte_unmap_same() returns Peter Xu
2021-01-15 17:08 ` [PATCH RFC 06/30] mm/userfaultfd: Introduce special pte for unmapped file-backed mem Peter Xu
2021-01-15 17:08 ` [PATCH RFC 07/30] mm/swap: Introduce the idea of special swap ptes Peter Xu
2021-01-18 19:40   ` Jason Gunthorpe
2021-01-19 14:24     ` Peter Xu
2021-01-15 17:08 ` [PATCH RFC 08/30] shmem/userfaultfd: Handle uffd-wp special pte in page fault handler Peter Xu
2021-01-15 19:51   ` kernel test robot
2021-01-15 21:01   ` kernel test robot
2021-01-15 17:08 ` [PATCH RFC 09/30] mm: Drop first_index/last_index in zap_details Peter Xu
2021-01-15 17:08 ` [PATCH RFC 10/30] mm: Introduce zap_details.zap_flags Peter Xu
2021-01-15 17:08 ` [PATCH RFC 11/30] mm: Introduce ZAP_FLAG_SKIP_SWAP Peter Xu
2021-01-15 17:08 ` [PATCH RFC 12/30] mm: Pass zap_flags into unmap_mapping_pages() Peter Xu
2021-01-15 17:08 ` Peter Xu [this message]
2021-01-15 17:08 ` [PATCH RFC 14/30] shmem/userfaultfd: Allow wr-protect none pte for file-backed mem Peter Xu
2021-01-15 17:08 ` [PATCH RFC 15/30] shmem/userfaultfd: Allows file-back mem to be uffd wr-protected on thps Peter Xu
2021-01-15 17:08 ` [PATCH RFC 16/30] shmem/userfaultfd: Handle the left-overed special swap ptes Peter Xu
2021-01-15 17:08 ` [PATCH RFC 17/30] shmem/userfaultfd: Pass over uffd-wp special swap pte when fork() Peter Xu
2021-01-15 17:08 ` [PATCH RFC 18/30] hugetlb/userfaultfd: Hook page faults for uffd write protection Peter Xu
2021-01-15 17:08 ` [PATCH RFC 19/30] hugetlb/userfaultfd: Take care of UFFDIO_COPY_MODE_WP Peter Xu
2021-01-15 17:08 ` [PATCH RFC 20/30] hugetlb/userfaultfd: Handle UFFDIO_WRITEPROTECT Peter Xu
2021-01-15 17:08 ` [PATCH RFC 21/30] hugetlb: Pass vma into huge_pte_alloc() Peter Xu
2021-01-28 22:59   ` Axel Rasmussen
2021-01-28 22:59     ` Axel Rasmussen
2021-01-29 22:31     ` Peter Xu
2021-01-30  8:08       ` Axel Rasmussen
2021-01-30  8:08         ` Axel Rasmussen
2021-01-15 17:08 ` [PATCH RFC 22/30] hugetlb/userfaultfd: Forbid huge pmd sharing when uffd enabled Peter Xu
2021-01-16 10:02   ` kernel test robot
2021-01-15 17:09 ` [PATCH RFC 23/30] mm/hugetlb: Introduce huge version of special swap pte helpers Peter Xu
2021-01-15 17:09 ` [PATCH RFC 24/30] mm/hugetlb: Move flush_hugetlb_tlb_range() into hugetlb.h Peter Xu
2021-01-15 17:09 ` [PATCH RFC 25/30] hugetlb/userfaultfd: Unshare all pmds for hugetlbfs when register wp Peter Xu
2021-01-15 23:05   ` kernel test robot
2021-01-15 17:09 ` [PATCH RFC 26/30] hugetlb/userfaultfd: Handle uffd-wp special pte in hugetlb pf handler Peter Xu
2021-01-15 17:09 ` [PATCH RFC 27/30] hugetlb/userfaultfd: Allow wr-protect none ptes Peter Xu
2021-01-15 17:09 ` [PATCH RFC 28/30] hugetlb/userfaultfd: Only drop uffd-wp special pte if required Peter Xu
2021-01-15 17:09 ` [PATCH RFC 29/30] userfaultfd: Enable write protection for shmem & hugetlbfs Peter Xu
2021-01-15 17:12 ` [PATCH RFC 30/30] userfaultfd/selftests: Enable uffd-wp for shmem/hugetlbfs Peter Xu
2021-01-29 22:49 ` [PATCH RFC 00/30] userfaultfd-wp: Support shmem and hugetlbfs Peter Xu
2021-02-05 21:53   ` Mike Kravetz
2021-02-06  2:36     ` Peter Xu
2021-02-09 19:29       ` Mike Kravetz
2021-02-09 22:00         ` Peter Xu
2021-02-05 22:21   ` Hugh Dickins
2021-02-05 22:21     ` Hugh Dickins
2021-02-06  2:47     ` Peter Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210115170907.24498-14-peterx@redhat.com \
    --to=peterx@redhat.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=hughd@google.com \
    --cc=jglisse@redhat.com \
    --cc=kirill@shutemov.name \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mike.kravetz@oracle.com \
    --cc=nadav.amit@gmail.com \
    --cc=rppt@linux.vnet.ibm.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.