linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Hugh Dickins <hughd@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>,
	Mike Rapoport <rppt@kernel.org>,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Matthew Wilcox <willy@infradead.org>,
	David Hildenbrand <david@redhat.com>,
	Suren Baghdasaryan <surenb@google.com>,
	Qi Zheng <zhengqi.arch@bytedance.com>,
	Yang Shi <shy828301@gmail.com>,
	Mel Gorman <mgorman@techsingularity.net>,
	Peter Xu <peterx@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Will Deacon <will@kernel.org>, Yu Zhao <yuzhao@google.com>,
	Alistair Popple <apopple@nvidia.com>,
	Ralph Campbell <rcampbell@nvidia.com>,
	Ira Weiny <ira.weiny@intel.com>,
	Steven Price <steven.price@arm.com>,
	SeongJae Park <sj@kernel.org>,
	Naoya Horiguchi <naoya.horiguchi@nec.com>,
	Christophe Leroy <christophe.leroy@csgroup.eu>,
	Zack Rusin <zackr@vmware.com>, Jason Gunthorpe <jgg@ziepe.ca>,
	Axel Rasmussen <axelrasmussen@google.com>,
	Anshuman Khandual <anshuman.khandual@arm.com>,
	Pasha Tatashin <pasha.tatashin@soleen.com>,
	Miaohe Lin <linmiaohe@huawei.com>,
	Minchan Kim <minchan@kernel.org>,
	Christoph Hellwig <hch@infradead.org>, Song Liu <song@kernel.org>,
	Thomas Hellstrom <thomas.hellstrom@linux.intel.com>,
	Russell King <linux@armlinux.org.uk>,
	"David S. Miller" <davem@davemloft.net>,
	Michael Ellerman <mpe@ellerman.id.au>,
	"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>,
	Heiko Carstens <hca@linux.ibm.com>,
	Christian Borntraeger <borntraeger@linux.ibm.com>,
	Claudio Imbrenda <imbrenda@linux.ibm.com>,
	Alexander Gordeev <agordeev@linux.ibm.com>,
	Jann Horn <jannh@google.com>,
	linux-arm-kernel@lists.infradead.org, sparclinux@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH 10/12] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()
Date: Sun, 28 May 2023 23:26:48 -0700 (PDT)	[thread overview]
Message-ID: <563340a4-7ac9-7cc8-33d8-f7cc6ef19ea6@google.com> (raw)
In-Reply-To: <35e983f5-7ed3-b310-d949-9ae8b130cdab@google.com>

Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().
It does need mmap_read_lock(), but it does not need mmap_write_lock(),
nor vma_start_write() nor i_mmap lock nor anon_vma lock.  All racing
paths are relying on pte_offset_map_lock() and pmd_lock(), so use those.
Follow the pattern in retract_page_tables(); and using pte_free_defer()
removes the need for tlb_remove_table_sync_one() here.

Confirm the preliminary find_pmd_or_thp_or_none() once page lock has been
acquired and the page looks suitable: from then on its state is stable.

However, collapse_pte_mapped_thp() was doing something others don't:
freeing a page table still containing "valid" entries.  i_mmap lock did
stop a racing truncate from double-freeing those pages, but we prefer
collapse_pte_mapped_thp() to clear the entries as usual.  Their TLB
flush can wait until the pmdp_collapse_flush() which follows, but the
mmu_notifier_invalidate_range_start() has to be done earlier.

Some cleanup while rearranging: rename "count" to "nr_ptes";
and "step 2" does not need to duplicate the checks in "step 1".

Signed-off-by: Hugh Dickins <hughd@google.com>
---
 mm/khugepaged.c | 131 +++++++++++++++---------------------------------
 1 file changed, 41 insertions(+), 90 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4fd408154692..2999500abdd5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1485,7 +1485,7 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
 	return ret;
 }
 
-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 			pmd_t *pmdp, struct page *hpage)
 {
@@ -1497,7 +1497,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 	};
 
 	VM_BUG_ON(!PageTransHuge(hpage));
-	mmap_assert_write_locked(vma->vm_mm);
+	mmap_assert_locked(vma->vm_mm);
 
 	if (do_set_pmd(&vmf, hpage))
 		return SCAN_FAIL;
@@ -1506,48 +1506,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return SCAN_SUCCEED;
 }
 
-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- *  - modifying terminal entries (ones that point to a data page, not to another
- *    page table)
- *  - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
-				  unsigned long addr, pmd_t *pmdp)
-{
-	pmd_t pmd;
-	struct mmu_notifier_range range;
-
-	mmap_assert_write_locked(mm);
-	if (vma->vm_file)
-		lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
-	/*
-	 * All anon_vmas attached to the VMA have the same root and are
-	 * therefore locked by the same lock.
-	 */
-	if (vma->anon_vma)
-		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
-				addr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-	pmd = pmdp_collapse_flush(vma, addr, pmdp);
-	tlb_remove_table_sync_one();
-	mmu_notifier_invalidate_range_end(&range);
-	mm_dec_nr_ptes(mm);
-	page_table_check_pte_clear_range(mm, addr, pmd);
-	pte_free(mm, pmd_pgtable(pmd));
-}
-
 /**
  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
  * address haddr.
@@ -1563,16 +1521,17 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 			    bool install_pmd)
 {
+	struct mmu_notifier_range range;
 	unsigned long haddr = addr & HPAGE_PMD_MASK;
 	struct vm_area_struct *vma = vma_lookup(mm, haddr);
 	struct page *hpage;
 	pte_t *start_pte, *pte;
-	pmd_t *pmd;
-	spinlock_t *ptl;
-	int count = 0, result = SCAN_FAIL;
+	pmd_t *pmd, pgt_pmd;
+	spinlock_t *pml, *ptl;
+	int nr_ptes = 0, result = SCAN_FAIL;
 	int i;
 
-	mmap_assert_write_locked(mm);
+	mmap_assert_locked(mm);
 
 	/* Fast check before locking page if already PMD-mapped */
 	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
@@ -1612,6 +1571,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
 	switch (result) {
 	case SCAN_SUCCEED:
 		break;
@@ -1625,27 +1585,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
-	/* Lock the vma before taking i_mmap and page table locks */
-	vma_start_write(vma);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 
-	/*
-	 * We need to lock the mapping so that from here on, only GUP-fast and
-	 * hardware page walks can access the parts of the page tables that
-	 * we're operating on.
-	 * See collapse_and_free_pmd().
-	 */
-	i_mmap_lock_write(vma->vm_file->f_mapping);
-
-	/*
-	 * This spinlock should be unnecessary: Nobody else should be accessing
-	 * the page tables under spinlock protection here, only
-	 * lockless_pages_from_mm() and the hardware page walker can access page
-	 * tables while all the high-level locks are held in write mode.
-	 */
 	result = SCAN_FAIL;
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
-	if (!start_pte)
-		goto drop_immap;
+	if (!start_pte)		/* mmap_lock + page lock should prevent this */
+		goto abort;
 
 	/* step 1: check all mapped PTEs are to the right huge page */
 	for (i = 0, addr = haddr, pte = start_pte;
@@ -1671,40 +1618,44 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		 */
 		if (hpage + i != page)
 			goto abort;
-		count++;
+		nr_ptes++;
 	}
 
-	/* step 2: adjust rmap */
+	/* step 2: clear page table and adjust rmap */
 	for (i = 0, addr = haddr, pte = start_pte;
 	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
-		struct page *page;
-
 		if (pte_none(*pte))
 			continue;
-		page = vm_normal_page(vma, addr, *pte);
-		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
-			goto abort;
-		page_remove_rmap(page, vma, false);
+
+		/* Must clear entry, or a racing truncate may re-remove it */
+		pte_clear(mm, addr, pte);
+		page_remove_rmap(hpage + i, vma, false);
 	}
 
 	pte_unmap_unlock(start_pte, ptl);
 
 	/* step 3: set proper refcount and mm_counters. */
-	if (count) {
-		page_ref_sub(hpage, count);
-		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+	if (nr_ptes) {
+		page_ref_sub(hpage, nr_ptes);
+		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -nr_ptes);
 	}
 
-	/* step 4: remove pte entries */
-	/* we make no change to anon, but protect concurrent anon page lookup */
-	if (vma->anon_vma)
-		anon_vma_lock_write(vma->anon_vma);
+	/* step 4: remove page table */
 
-	collapse_and_free_pmd(mm, vma, haddr, pmd);
+	/* Huge page lock is still held, so page table must remain empty */
+	pml = pmd_lock(mm, pmd);
+	if (ptl != pml)
+		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+	if (ptl != pml)
+		spin_unlock(ptl);
+	spin_unlock(pml);
 
-	if (vma->anon_vma)
-		anon_vma_unlock_write(vma->anon_vma);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	mmu_notifier_invalidate_range_end(&range);
+
+	mm_dec_nr_ptes(mm);
+	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
+	pte_free_defer(mm, pmd_pgtable(pgt_pmd));
 
 maybe_install_pmd:
 	/* step 5: install pmd entry */
@@ -1718,9 +1669,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	return result;
 
 abort:
-	pte_unmap_unlock(start_pte, ptl);
-drop_immap:
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	if (start_pte)
+		pte_unmap_unlock(start_pte, ptl);
+	mmu_notifier_invalidate_range_end(&range);
 	goto drop_hpage;
 }
 
@@ -2842,9 +2793,9 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		case SCAN_PTE_MAPPED_HUGEPAGE:
 			BUG_ON(mmap_locked);
 			BUG_ON(*prev);
-			mmap_write_lock(mm);
+			mmap_read_lock(mm);
 			result = collapse_pte_mapped_thp(mm, addr, true);
-			mmap_write_unlock(mm);
+			mmap_locked = true;
 			goto handle_result;
 		/* Whitelisted set of results where continuing OK */
 		case SCAN_PMD_NULL:
-- 
2.35.3


  parent reply	other threads:[~2023-05-29  6:27 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-29  6:11 [PATCH 00/12] mm: free retracted page table by RCU Hugh Dickins
2023-05-29  6:14 ` [PATCH 01/12] mm/pgtable: add rcu_read_lock() and rcu_read_unlock()s Hugh Dickins
2023-05-31 17:06   ` Jann Horn
2023-06-02  2:50     ` Hugh Dickins
2023-06-02 14:21       ` Jann Horn
2023-05-29  6:16 ` [PATCH 02/12] mm/pgtable: add PAE safety to __pte_offset_map() Hugh Dickins
2023-05-29 13:56   ` Matthew Wilcox
     [not found]   ` <ZHeg3oRljRn6wlLX@ziepe.ca>
2023-06-02  5:35     ` Hugh Dickins
2023-05-29  6:17 ` [PATCH 03/12] arm: adjust_pte() use pte_offset_map_nolock() Hugh Dickins
2023-05-29  6:18 ` [PATCH 04/12] powerpc: assert_pte_locked() " Hugh Dickins
2023-05-29  6:20 ` [PATCH 05/12] powerpc: add pte_free_defer() for pgtables sharing page Hugh Dickins
2023-05-29 14:02   ` Matthew Wilcox
2023-05-29 14:36     ` Hugh Dickins
2023-06-01 13:57       ` Gerald Schaefer
2023-06-02  6:38         ` Hugh Dickins
2023-06-02 14:20     ` Jason Gunthorpe
2023-06-06  3:40       ` Hugh Dickins
2023-06-06 18:23         ` Jason Gunthorpe
2023-06-06 19:03           ` Peter Xu
2023-06-06 19:08             ` Jason Gunthorpe
2023-06-07  3:49               ` Hugh Dickins
2023-05-29  6:21 ` [PATCH 06/12] sparc: " Hugh Dickins
2023-06-06  3:46   ` Hugh Dickins
2023-05-29  6:22 ` [PATCH 07/12] s390: add pte_free_defer(), with use of mmdrop_async() Hugh Dickins
2023-06-06  5:11   ` Hugh Dickins
2023-06-06 18:39     ` Jason Gunthorpe
2023-06-08  2:46       ` Hugh Dickins
2023-06-06 19:40     ` Gerald Schaefer
2023-06-08  3:35       ` Hugh Dickins
2023-06-08 13:58         ` Jason Gunthorpe
2023-06-08 15:47         ` Gerald Schaefer
2023-05-29  6:23 ` [PATCH 08/12] mm/pgtable: add pte_free_defer() for pgtable as page Hugh Dickins
2023-06-01 13:31   ` Jann Horn
     [not found]   ` <ZHekpAKJ05cr/GLl@ziepe.ca>
2023-06-02  6:03     ` Hugh Dickins
2023-06-02 12:15       ` Jason Gunthorpe
2023-05-29  6:25 ` [PATCH 09/12] mm/khugepaged: retract_page_tables() without mmap or vma lock Hugh Dickins
2023-05-29 23:26   ` Peter Xu
2023-05-31  0:38     ` Hugh Dickins
2023-05-31 15:34   ` Jann Horn
     [not found]     ` <ZHe0A079X9B8jWlH@x1n>
2023-05-31 22:18       ` Jann Horn
2023-06-01 14:06         ` Jason Gunthorpe
2023-06-06  6:18     ` Hugh Dickins
2023-05-29  6:26 ` Hugh Dickins [this message]
2023-05-31 17:25   ` [PATCH 10/12] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock() Jann Horn
2023-06-02  5:11     ` Hugh Dickins
2023-05-29  6:28 ` [PATCH 11/12] mm/khugepaged: delete khugepaged_collapse_pte_mapped_thps() Hugh Dickins
2023-05-29  6:30 ` [PATCH 12/12] mm: delete mmap_write_trylock() and vma_try_start_write() Hugh Dickins
2023-05-31 17:59 ` [PATCH 00/12] mm: free retracted page table by RCU Jann Horn
2023-06-02  4:37   ` Hugh Dickins
2023-06-02 15:26     ` Jann Horn
2023-06-06  6:28       ` Hugh Dickins

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=563340a4-7ac9-7cc8-33d8-f7cc6ef19ea6@google.com \
    --to=hughd@google.com \
    --cc=agordeev@linux.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=aneesh.kumar@linux.ibm.com \
    --cc=anshuman.khandual@arm.com \
    --cc=apopple@nvidia.com \
    --cc=axelrasmussen@google.com \
    --cc=borntraeger@linux.ibm.com \
    --cc=christophe.leroy@csgroup.eu \
    --cc=davem@davemloft.net \
    --cc=david@redhat.com \
    --cc=hca@linux.ibm.com \
    --cc=hch@infradead.org \
    --cc=imbrenda@linux.ibm.com \
    --cc=ira.weiny@intel.com \
    --cc=jannh@google.com \
    --cc=jgg@ziepe.ca \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linmiaohe@huawei.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=linux@armlinux.org.uk \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=mgorman@techsingularity.net \
    --cc=mike.kravetz@oracle.com \
    --cc=minchan@kernel.org \
    --cc=mpe@ellerman.id.au \
    --cc=naoya.horiguchi@nec.com \
    --cc=pasha.tatashin@soleen.com \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rcampbell@nvidia.com \
    --cc=rppt@kernel.org \
    --cc=shy828301@gmail.com \
    --cc=sj@kernel.org \
    --cc=song@kernel.org \
    --cc=sparclinux@vger.kernel.org \
    --cc=steven.price@arm.com \
    --cc=surenb@google.com \
    --cc=thomas.hellstrom@linux.intel.com \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=yuzhao@google.com \
    --cc=zackr@vmware.com \
    --cc=zhengqi.arch@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).