From: Qi Zheng <zhengqi.arch@bytedance.com>
To: akpm@linux-foundation.org, tglx@linutronix.de,
	kirill.shutemov@linux.intel.com, mika.penttila@nextfour.com,
	david@redhat.com, jgg@nvidia.com
Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, songmuchun@bytedance.com,
	zhouchengming@bytedance.com,
	Qi Zheng <zhengqi.arch@bytedance.com>
Subject: [PATCH v3 10/15] mm/pte_ref: add support for page fault path
Date: Wed, 10 Nov 2021 18:54:23 +0800	[thread overview]
Message-ID: <20211110105428.32458-11-zhengqi.arch@bytedance.com> (raw)
In-Reply-To: <20211110105428.32458-1-zhengqi.arch@bytedance.com>

In the page fault path, we need to take a reference on the
PTE page table page if the pmd entry is not none, which
ensures that the PTE page table page will not be released by
other threads.
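
A minimal sketch of that pattern, using the pte_tryget_map()
helper introduced earlier in this series (the full context is
in the mm/memory.c hunk below):

	/*
	 * If the pmd entry is not none, try to grab a reference on
	 * the PTE page table page; a NULL return means the page was
	 * freed concurrently and the pmd entry must be re-checked.
	 */
	vmf->pte = pte_tryget_map(vmf->pmd, vmf->address);
	if (!vmf->pte)
		goto retry;	/* pmd entry changed under us */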

In addition, the mmap_lock may be unlocked early in some cases
under handle_pte_fault(); once that happens, the pmd entry is
no longer stable:

	thread A		thread B
	page fault		collapse_huge_page
	==========		==================

	mmap_read_unlock()
				mmap_write_lock()
				pgtable_trans_huge_deposit()
				set_pmd_at()
	/* pmd entry is changed! */
	pte_put()

So we should call pte_put() before dropping the mmap_lock.
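
For the paths that drop the lock, the rule is roughly the
following (a sketch, assuming the pte_put_vmf() helper added
earlier in this series):

	/*
	 * Release the PTE page table page reference while the pmd
	 * entry is still stable, then drop the lock; afterwards
	 * *vmf->pmd may change at any time.
	 */
	pte_put_vmf(vmf);
	mmap_read_unlock(mm);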

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 fs/userfaultfd.c |  1 +
 mm/filemap.c     |  2 ++
 mm/internal.h    |  1 +
 mm/khugepaged.c  |  8 +++++++-
 mm/memory.c      | 33 ++++++++++++++++++++++++---------
 5 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 22bf14ab2d16..ddbcefa7e0a6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -509,6 +509,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
 						       vmf->address,
 						       vmf->flags, reason);
+	pte_put_vmf(vmf);
 	mmap_read_unlock(mm);
 
 	if (likely(must_wait && !READ_ONCE(ctx->released))) {
diff --git a/mm/filemap.c b/mm/filemap.c
index aa47ee11a3d8..4fdc74dc6736 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1708,6 +1708,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
 		if (flags & FAULT_FLAG_RETRY_NOWAIT)
 			return false;
 
+		pte_put_vmf(vmf);
 		mmap_read_unlock(mm);
 		if (flags & FAULT_FLAG_KILLABLE)
 			folio_wait_locked_killable(folio);
@@ -1720,6 +1721,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
 
 		ret = __folio_lock_killable(folio);
 		if (ret) {
+			pte_put_vmf(vmf);
 			mmap_read_unlock(mm);
 			return false;
 		}
diff --git a/mm/internal.h b/mm/internal.h
index 474d6e3443f8..460418828a76 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -488,6 +488,7 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
 	if (fault_flag_allow_retry_first(flags) &&
 	    !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
 		fpin = get_file(vmf->vma->vm_file);
+		pte_put_vmf(vmf);
 		mmap_read_unlock(vmf->vma->vm_mm);
 	}
 	return fpin;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e99101162f1a..92b0494f4a00 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1019,10 +1019,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 			.pmd = pmd,
 		};
 
-		vmf.pte = pte_offset_map(pmd, address);
+		vmf.pte = pte_tryget_map(pmd, address);
+		if (!vmf.pte)
+			continue;
 		vmf.orig_pte = *vmf.pte;
 		if (!is_swap_pte(vmf.orig_pte)) {
 			pte_unmap(vmf.pte);
+			pte_put_vmf(&vmf);
 			continue;
 		}
 		swapped_in++;
@@ -1041,7 +1044,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 				return false;
 			}
+		} else {
+			pte_put_vmf(&vmf);
 		}
+
 		if (ret & VM_FAULT_ERROR) {
 			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 			return false;
diff --git a/mm/memory.c b/mm/memory.c
index ea4d651ac8c7..5cc4ce0af665 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4571,8 +4571,10 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
 	pte_t entry;
+	vm_fault_t ret;
 
-	if (unlikely(pmd_none(*vmf->pmd))) {
+retry:
+	if (unlikely(pmd_none(READ_ONCE(*vmf->pmd)))) {
 		/*
 		 * Leave __pte_alloc() until later: because vm_ops->fault may
 		 * want to allocate huge page, and if we expose page table
@@ -4595,13 +4597,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		 */
 		if (pmd_devmap_trans_unstable(vmf->pmd))
 			return 0;
+
 		/*
 		 * A regular pmd is established and it can't morph into a huge
 		 * pmd from under us anymore at this point because we hold the
 		 * mmap_lock read mode and khugepaged takes it in write mode.
 		 * So now it's safe to run pte_offset_map().
 		 */
-		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+		vmf->pte = pte_tryget_map(vmf->pmd, vmf->address);
+		if (!vmf->pte)
+			goto retry;
 		vmf->orig_pte = *vmf->pte;
 
 		/*
@@ -4616,6 +4621,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		if (pte_none(vmf->orig_pte)) {
 			pte_unmap(vmf->pte);
 			vmf->pte = NULL;
+			pte_put_vmf(vmf);
 		}
 	}
 
@@ -4626,11 +4632,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 			return do_fault(vmf);
 	}
 
-	if (!pte_present(vmf->orig_pte))
-		return do_swap_page(vmf);
+	if (!pte_present(vmf->orig_pte)) {
+		ret = do_swap_page(vmf);
+		goto put;
+	}
 
-	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
-		return do_numa_page(vmf);
+	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+		ret = do_numa_page(vmf);
+		goto put;
+	}
 
 	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
 	spin_lock(vmf->ptl);
@@ -4640,8 +4650,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		goto unlock;
 	}
 	if (vmf->flags & FAULT_FLAG_WRITE) {
-		if (!pte_write(entry))
-			return do_wp_page(vmf);
+		if (!pte_write(entry)) {
+			ret = do_wp_page(vmf);
+			goto put;
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
@@ -4663,7 +4675,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	}
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-	return 0;
+	ret = 0;
+put:
+	pte_put_vmf(vmf);
+	return ret;
 }
 
 /*
-- 
2.11.0


Thread overview: 43+ messages
2021-11-10 10:54 [PATCH v3 00/15] Free user PTE page table pages Qi Zheng
2021-11-10 10:54 ` [PATCH v3 01/15] mm: do code cleanups to filemap_map_pmd() Qi Zheng
2021-11-10 10:54 ` [PATCH v3 02/15] mm: introduce is_huge_pmd() helper Qi Zheng
2021-11-11 13:46   ` kernel test robot
2021-11-10 10:54 ` [PATCH v3 03/15] mm: move pte_offset_map_lock() to pgtable.h Qi Zheng
2021-11-10 10:54 ` [PATCH v3 04/15] mm: rework the parameter of lock_page_or_retry() Qi Zheng
2021-11-10 10:54 ` [PATCH v3 05/15] mm: add pmd_installed_type return for __pte_alloc() and other friends Qi Zheng
2021-11-10 10:54 ` [PATCH v3 06/15] mm: introduce refcount for user PTE page table page Qi Zheng
2021-11-11  0:37   ` kernel test robot
2021-11-10 10:54 ` [PATCH v3 07/15] mm/pte_ref: add support for user PTE page table page allocation Qi Zheng
2021-11-11 15:17   ` kernel test robot
2021-11-10 10:54 ` [PATCH v3 08/15] mm/pte_ref: initialize the refcount of the withdrawn PTE page table page Qi Zheng
2021-11-10 10:54 ` [PATCH v3 09/15] mm/pte_ref: add support for the map/unmap of user " Qi Zheng
2021-11-10 10:54 ` Qi Zheng [this message]
2021-11-10 10:54 ` [PATCH v3 11/15] mm/pte_ref: take a refcount before accessing the " Qi Zheng
2021-11-10 10:54 ` [PATCH v3 12/15] mm/pte_ref: update the pmd entry in move_normal_pmd() Qi Zheng
2021-11-10 10:54 ` [PATCH v3 13/15] mm/pte_ref: free user PTE page table pages Qi Zheng
2021-11-14 14:43   ` [mm/pte_ref] afcc9fb874: kernel_BUG_at_include/linux/pte_ref.h kernel test robot
2021-11-10 10:54 ` [PATCH v3 14/15] Documentation: add document for pte_ref Qi Zheng
2021-11-10 14:39   ` Jonathan Corbet
2021-11-11  5:40     ` Qi Zheng
2021-11-10 10:54 ` [PATCH v3 15/15] mm/pte_ref: use mmu_gather to free PTE page table pages Qi Zheng
2021-11-10 12:56 ` [PATCH v3 00/15] Free user " Jason Gunthorpe
2021-11-10 13:25   ` David Hildenbrand
2021-11-10 13:59     ` Qi Zheng
2021-11-10 14:38     ` Jason Gunthorpe
2021-11-10 15:37       ` David Hildenbrand
2021-11-10 16:39         ` Jason Gunthorpe
2021-11-10 17:37           ` David Hildenbrand
2021-11-10 17:49             ` Jason Gunthorpe
2021-11-11  3:58             ` Qi Zheng
2021-11-11  9:22               ` David Hildenbrand
2021-11-11 11:08                 ` Qi Zheng
2021-11-11 11:19                   ` David Hildenbrand
2021-11-11 12:00                     ` Qi Zheng
2021-11-11 12:20                       ` David Hildenbrand
2021-11-11 12:32                         ` Qi Zheng
2021-11-11 12:51                           ` David Hildenbrand
2021-11-11 13:01                             ` Qi Zheng
2021-11-10 16:49         ` Matthew Wilcox
2021-11-10 16:53           ` David Hildenbrand
2021-11-10 16:56             ` Jason Gunthorpe
2021-11-10 13:54   ` Qi Zheng
