linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: David Hildenbrand <david@redhat.com>
To: linux-kernel@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Hugh Dickins <hughd@google.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	David Rientjes <rientjes@google.com>,
	Shakeel Butt <shakeelb@google.com>,
	John Hubbard <jhubbard@nvidia.com>,
	Jason Gunthorpe <jgg@nvidia.com>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	Mike Rapoport <rppt@linux.ibm.com>,
	Yang Shi <shy828301@gmail.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Matthew Wilcox <willy@infradead.org>,
	Vlastimil Babka <vbabka@suse.cz>, Jann Horn <jannh@google.com>,
	Michal Hocko <mhocko@kernel.org>, Nadav Amit <namit@vmware.com>,
	Rik van Riel <riel@surriel.com>, Roman Gushchin <guro@fb.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Peter Xu <peterx@redhat.com>, Donald Dutile <ddutile@redhat.com>,
	Christoph Hellwig <hch@lst.de>, Oleg Nesterov <oleg@redhat.com>,
	Jan Kara <jack@suse.cz>,
	linux-mm@kvack.org, linux-kselftest@vger.kernel.org,
	linux-doc@vger.kernel.org, David Hildenbrand <david@redhat.com>
Subject: [PATCH v1 08/11] mm: hugetlb: support GUP-triggered unsharing via FAULT_FLAG_UNSHARE
Date: Fri, 17 Dec 2021 12:30:46 +0100	[thread overview]
Message-ID: <20211217113049.23850-9-david@redhat.com> (raw)
In-Reply-To: <20211217113049.23850-1-david@redhat.com>

Let's support FAULT_FLAG_UNSHARE to implement GUP-triggered unsharing,
preparing for its use in the GUP paths when there is a need to unshare a
shared anonymous hugetlb page.

We'll make use of it next by setting FAULT_FLAG_UNSHARE in case we
detect that unsharing is necessary.

This commit is based on a prototype patch by Andrea.

Co-developed-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/hugetlb.c | 86 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 63 insertions(+), 23 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1baa198519a..5f2863b046ef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5130,14 +5130,15 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /*
- * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * __wp_hugetlb() should be called with page lock of the original hugepage held.
  * Called with hugetlb_fault_mutex_table held and pte_page locked so we
  * cannot race with other handlers or page migration.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
-static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-		       unsigned long address, pte_t *ptep,
-		       struct page *pagecache_page, spinlock_t *ptl)
+static __always_inline vm_fault_t
+__wp_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma,
+	     unsigned long address, pte_t *ptep, struct page *pagecache_page,
+	     spinlock_t *ptl, bool unshare)
 {
 	pte_t pte;
 	struct hstate *h = hstate_vma(vma);
@@ -5151,11 +5152,21 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	old_page = pte_page(pte);
 
 retry_avoidcopy:
-	/* If no-one else is actually using this page, avoid the copy
-	 * and just make the page writable */
-	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
-		page_move_anon_rmap(old_page, vma);
-		set_huge_ptep_writable(vma, haddr, ptep);
+	if (!unshare) {
+		/*
+		 * If no-one else is actually using this page, avoid the copy
+		 * and just make the page writable.
+		 */
+		if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+			page_move_anon_rmap(old_page, vma);
+			set_huge_ptep_writable(vma, haddr, ptep);
+			return 0;
+		}
+	} else if (!PageAnon(old_page) || page_mapcount(old_page) == 1) {
+		/*
+		 * GUP-triggered unsharing only applies to shared anonymous
+		 * pages. If that no longer applies, there is nothing to do.
+		 */
 		return 0;
 	}
 
@@ -5256,11 +5267,11 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
 		ClearHPageRestoreReserve(new_page);
 
-		/* Break COW */
+		/* Break COW or unshare */
 		huge_ptep_clear_flush(vma, haddr, ptep);
 		mmu_notifier_invalidate_range(mm, range.start, range.end);
 		set_huge_pte_at(mm, haddr, ptep,
-				make_huge_pte(vma, new_page, 1));
+				make_huge_pte(vma, new_page, !unshare));
 		page_remove_rmap(old_page, true);
 		hugepage_add_new_anon_rmap(new_page, vma, haddr);
 		SetHPageMigratable(new_page);
@@ -5270,7 +5281,10 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(&range);
 out_release_all:
-	/* No restore in case of successful pagetable update (Break COW) */
+	/*
+	 * No restore in case of successful pagetable update (Break COW or
+	 * unshare)
+	 */
 	if (new_page != old_page)
 		restore_reserve_on_error(h, vma, haddr, new_page);
 	put_page(new_page);
@@ -5281,6 +5295,23 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+static vm_fault_t
+wp_hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+	       unsigned long address, pte_t *ptep, struct page *pagecache_page,
+	       spinlock_t *ptl)
+{
+	return __wp_hugetlb(mm, vma, address, ptep, pagecache_page, ptl,
+			    false);
+}
+
+static vm_fault_t
+wp_hugetlb_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+		   unsigned long address, pte_t *ptep,
+		   struct page *pagecache_page, spinlock_t *ptl)
+{
+	return __wp_hugetlb(mm, vma, address, ptep, pagecache_page, ptl, true);
+}
+
 /* Return the pagecache page at a given address within a VMA */
 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
@@ -5393,7 +5424,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	/*
 	 * Currently, we are forced to kill the process in the event the
 	 * original mapper has unmapped pages from the child due to a failed
-	 * COW. Warn that such a situation has occurred as it may not be obvious
+	 * COW/unsharing. Warn that such a situation has occurred as it may not
+	 * be obvious.
 	 */
 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
 		pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@ -5519,7 +5551,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	hugetlb_count_add(pages_per_huge_page(h), mm);
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+		ret = wp_hugetlb_cow(mm, vma, address, ptep, page, ptl);
 	}
 
 	spin_unlock(ptl);
@@ -5649,14 +5681,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_mutex;
 
 	/*
-	 * If we are going to COW the mapping later, we examine the pending
-	 * reservations for this page now. This will ensure that any
+	 * If we are going to COW/unshare the mapping later, we examine the
+	 * pending reservations for this page now. This will ensure that any
 	 * allocations necessary to record that reservation occur outside the
 	 * spinlock. For private mappings, we also lookup the pagecache
 	 * page now as it is used to determine if a reservation has been
 	 * consumed.
 	 */
-	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+	    !huge_pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, haddr) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
@@ -5671,14 +5704,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptl = huge_pte_lock(h, mm, ptep);
 
-	/* Check for a racing update before calling hugetlb_cow */
+	/*
+	 * Check for a racing update before calling wp_hugetlb_cow /
+	 * wp_hugetlb_unshare
+	 */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
 		goto out_ptl;
 
 	/*
-	 * hugetlb_cow() requires page locks of pte_page(entry) and
-	 * pagecache_page, so here we need take the former one
-	 * when page != pagecache_page or !pagecache_page.
+	 * wp_hugetlb_cow()/wp_hugetlb_unshare() require page locks of
+	 * pte_page(entry) and pagecache_page, so here we need to take the
+	 * former one when page != pagecache_page or !pagecache_page.
 	 */
 	page = pte_page(entry);
 	if (page != pagecache_page)
@@ -5691,11 +5727,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!huge_pte_write(entry)) {
-			ret = hugetlb_cow(mm, vma, address, ptep,
-					  pagecache_page, ptl);
+			ret = wp_hugetlb_cow(mm, vma, address, ptep,
+					     pagecache_page, ptl);
 			goto out_put_page;
 		}
 		entry = huge_pte_mkdirty(entry);
+	} else if (flags & FAULT_FLAG_UNSHARE && !huge_pte_write(entry)) {
+		ret = wp_hugetlb_unshare(mm, vma, address, ptep, pagecache_page,
+					 ptl);
+		goto out_put_page;
 	}
 	entry = pte_mkyoung(entry);
 	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
-- 
2.31.1


  parent reply	other threads:[~2021-12-17 11:34 UTC|newest]

Thread overview: 137+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-12-17 11:30 [PATCH v1 00/11] mm: COW fixes part 1: fix the COW security issue for THP and hugetlb David Hildenbrand
2021-12-17 11:30 ` [PATCH v1 01/11] seqlock: provide lockdep-free raw_seqcount_t variant David Hildenbrand
2021-12-17 17:02   ` Nadav Amit
2021-12-17 17:29     ` David Hildenbrand
2021-12-17 17:49       ` David Hildenbrand
2021-12-17 18:01         ` Nadav Amit
2021-12-17 21:28   ` Thomas Gleixner
2021-12-17 22:02     ` David Hildenbrand
2021-12-17 11:30 ` [PATCH v1 02/11] mm: thp: consolidate mapcount logic on THP split David Hildenbrand
2021-12-17 19:06   ` Yang Shi
2021-12-18 14:24   ` Kirill A. Shutemov
2021-12-17 11:30 ` [PATCH v1 03/11] mm: simplify hugetlb and file-THP handling in __page_mapcount() David Hildenbrand
2021-12-17 17:16   ` Nadav Amit
2021-12-17 17:30     ` David Hildenbrand
2021-12-17 18:06   ` Mike Kravetz
2021-12-17 18:11     ` David Hildenbrand
2021-12-17 19:07   ` Yang Shi
2021-12-18 14:31   ` Kirill A. Shutemov
2021-12-17 11:30 ` [PATCH v1 04/11] mm: thp: simlify total_mapcount() David Hildenbrand
2021-12-17 19:12   ` Yang Shi
2021-12-18 14:35   ` Kirill A. Shutemov
2021-12-17 11:30 ` [PATCH v1 05/11] mm: thp: allow for reading the THP mapcount atomically via a raw_seqlock_t David Hildenbrand
2021-12-17 11:30 ` [PATCH v1 06/11] mm: support GUP-triggered unsharing via FAULT_FLAG_UNSHARE (!hugetlb) David Hildenbrand
2021-12-17 19:04   ` Linus Torvalds
2021-12-17 19:22     ` Linus Torvalds
2021-12-17 20:17       ` David Hildenbrand
2021-12-17 20:36         ` Linus Torvalds
2021-12-17 20:39           ` Linus Torvalds
2021-12-17 20:43             ` Linus Torvalds
2021-12-17 20:42           ` David Hildenbrand
2021-12-17 20:45             ` Linus Torvalds
2021-12-18 22:52               ` Kirill A. Shutemov
2021-12-18 23:05                 ` Linus Torvalds
2021-12-17 20:47           ` Jason Gunthorpe
2021-12-17 20:56             ` Linus Torvalds
2021-12-17 21:17               ` David Hildenbrand
2021-12-17 21:04             ` David Hildenbrand
2021-12-18  0:50               ` Jason Gunthorpe
2021-12-17 21:15             ` Nadav Amit
2021-12-17 21:20               ` David Hildenbrand
2021-12-18  0:50               ` Jason Gunthorpe
2021-12-18  1:53               ` Linus Torvalds
2021-12-18  2:17                 ` Linus Torvalds
2021-12-18  2:42                   ` Linus Torvalds
2021-12-18  3:36                     ` Linus Torvalds
2021-12-18 10:06                     ` David Hildenbrand
2021-12-18  3:05                 ` Jason Gunthorpe
2021-12-18  3:30                   ` Nadav Amit
2021-12-18  3:38                     ` Linus Torvalds
2021-12-18 18:42                       ` Jason Gunthorpe
2021-12-18 18:49                         ` David Hildenbrand
2021-12-18 21:48                         ` Nadav Amit
2021-12-18 22:53                           ` Linus Torvalds
2021-12-19  0:19                             ` Nadav Amit
2021-12-19  0:35                               ` Linus Torvalds
2021-12-19  6:02                                 ` Nadav Amit
2021-12-19  8:01                                   ` John Hubbard
2021-12-19 11:30                                     ` Matthew Wilcox
2021-12-19 17:27                                   ` Linus Torvalds
2021-12-19 17:44                                     ` David Hildenbrand
2021-12-19 17:44                                     ` Linus Torvalds
2021-12-19 17:59                                       ` David Hildenbrand
2021-12-19 21:12                                         ` Matthew Wilcox
2021-12-19 21:27                                           ` Linus Torvalds
2021-12-19 21:47                                             ` Matthew Wilcox
2021-12-19 21:53                                               ` Linus Torvalds
2021-12-19 22:02                                                 ` Matthew Wilcox
2021-12-19 22:12                                                   ` Linus Torvalds
2021-12-19 22:26                                                     ` Matthew Wilcox
2021-12-20 18:37                                           ` Matthew Wilcox
2021-12-20 18:52                                             ` Matthew Wilcox
2021-12-20 19:38                                               ` Linus Torvalds
2021-12-20 19:15                                             ` Linus Torvalds
2021-12-20 21:02                                               ` Matthew Wilcox
2021-12-20 21:27                                                 ` Linus Torvalds
2021-12-21  1:03                                         ` Jason Gunthorpe
2021-12-21  3:29                                           ` Matthew Wilcox
2021-12-21  8:58                                           ` David Hildenbrand
2021-12-21 14:28                                             ` Jason Gunthorpe
2021-12-21 15:19                                               ` David Hildenbrand
2021-12-21 23:54                                                 ` Jason Gunthorpe
2021-12-21 17:05                                             ` Linus Torvalds
2021-12-21 17:40                                               ` David Hildenbrand
2021-12-21 18:00                                                 ` Linus Torvalds
2021-12-21 18:28                                                   ` David Hildenbrand
2021-12-21 21:11                                                     ` John Hubbard
2021-12-21 18:07                                                 ` Jan Kara
2021-12-21 18:30                                                   ` Linus Torvalds
2021-12-21 18:51                                                     ` David Hildenbrand
2021-12-21 18:58                                                       ` Linus Torvalds
2021-12-21 21:16                                                     ` John Hubbard
2021-12-21 19:07                                                 ` Jason Gunthorpe
2021-12-22  8:51                                                   ` David Hildenbrand
2021-12-22  9:58                                                     ` David Hildenbrand
2021-12-22 12:41                                                       ` Jan Kara
2021-12-22 13:09                                                         ` David Hildenbrand
2021-12-22 14:42                                                           ` Jan Kara
2021-12-22 14:48                                                             ` David Hildenbrand
2021-12-22 16:08                                                               ` Jan Kara
2021-12-22 16:44                                                                 ` Matthew Wilcox
2021-12-22 18:40                                                                 ` Linus Torvalds
2021-12-23 12:54                                                                   ` Jan Kara
2021-12-23 17:18                                                                     ` Linus Torvalds
2021-12-23  0:21                                                           ` Matthew Wilcox
2021-12-24  2:53                                                             ` Jason Gunthorpe
2021-12-24  4:53                                                               ` Matthew Wilcox
2022-01-04  0:33                                                                 ` Jason Gunthorpe
2021-12-21 23:59                                                 ` Jason Gunthorpe
2021-12-22  8:30                                                   ` David Hildenbrand
2021-12-22 12:44                                                   ` Jan Kara
2021-12-17 20:45     ` David Hildenbrand
2021-12-17 20:51       ` Linus Torvalds
2021-12-17 20:55         ` David Hildenbrand
2021-12-17 21:36           ` Linus Torvalds
2021-12-17 21:47             ` David Hildenbrand
2021-12-17 21:50               ` Linus Torvalds
2021-12-17 22:29                 ` David Hildenbrand
2021-12-17 22:58                   ` Linus Torvalds
2021-12-17 23:29                     ` David Hildenbrand
2021-12-17 23:53                       ` Nadav Amit
2021-12-18  4:02                         ` Linus Torvalds
2021-12-18  4:52                           ` Nadav Amit
2021-12-18  5:03                             ` Matthew Wilcox
2021-12-18  5:23                               ` Nadav Amit
2021-12-18 18:37                               ` Linus Torvalds
2021-12-17 22:18               ` Linus Torvalds
2021-12-17 22:43                 ` David Hildenbrand
2021-12-17 23:20                   ` Linus Torvalds
2021-12-18  9:57                     ` David Hildenbrand
2021-12-18 19:21                       ` Linus Torvalds
2021-12-18 19:52                         ` Linus Torvalds
2021-12-19  8:43                           ` David Hildenbrand
2021-12-17 11:30 ` [PATCH v1 07/11] mm: gup: trigger unsharing via FAULT_FLAG_UNSHARE when required (!hugetlb) David Hildenbrand
2021-12-17 11:30 ` David Hildenbrand [this message]
2021-12-17 11:30 ` [PATCH v1 09/11] mm: gup: trigger unsharing via FAULT_FLAG_UNSHARE when required (hugetlb) David Hildenbrand
2021-12-17 11:30 ` [PATCH v1 10/11] mm: thp: introduce and use page_trans_huge_anon_shared() David Hildenbrand
2021-12-17 11:30 ` [PATCH v1 11/11] selftests/vm: add tests for the known COW security issues David Hildenbrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211217113049.23850-9-david@redhat.com \
    --to=david@redhat.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=ddutile@redhat.com \
    --cc=guro@fb.com \
    --cc=hch@lst.de \
    --cc=hughd@google.com \
    --cc=jack@suse.cz \
    --cc=jannh@google.com \
    --cc=jgg@nvidia.com \
    --cc=jhubbard@nvidia.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mike.kravetz@oracle.com \
    --cc=namit@vmware.com \
    --cc=oleg@redhat.com \
    --cc=peterx@redhat.com \
    --cc=riel@surriel.com \
    --cc=rientjes@google.com \
    --cc=rppt@linux.ibm.com \
    --cc=shakeelb@google.com \
    --cc=shy828301@gmail.com \
    --cc=torvalds@linux-foundation.org \
    --cc=vbabka@suse.cz \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).