From: Xu Yu <xuyu@linux.alibaba.com>
To: linux-mm@kvack.org
Cc: david@redhat.com
Subject: [PATCH v3 1/2] mm/khugepaged: map RO non-exclusive pte-mapped anon THPs by pmds
Date: Mon, 18 Dec 2023 15:06:32 +0800
Message-ID: <1fecc331345653b8a3ab1dc2cfb24b5f946f5569.1702882426.git.xuyu@linux.alibaba.com>
In-Reply-To: <cover.1702882426.git.xuyu@linux.alibaba.com>

In the anonymous collapse path, khugepaged always collapses a pte-mapped
hugepage by allocating a new hugepage and copying into it.

In some scenarios, we need only update the page tables that map an
anonymous pte-mapped THP, in the same way as for file/shmem-backed
pte-mapped THPs; see commit 58ac9a8993a1 ("mm/khugepaged: attempt to
map file/shmem-backed pte-mapped THPs by pmds").

The simplest scenario that satisfies these conditions, as David points out,
is when no subpage is PageAnonExclusive (all PTEs must be R/O); in that
case we can collapse into a R/O PMD without further action.

Let's start from this simplest scenario.
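
For readers of this changelog, here is a condensed sketch of the remapping
performed by the new collapse_pte_mapped_anon_thp() below; the locking,
revalidation, anon_vma, mmu-notifier and TLB steps are omitted, so the diff
remains the authoritative sequence:

  /* Detach the PTE table by clearing the pmd, take a compound rmap. */
  pmdval = pmdp_collapse_flush(vma, haddr, pmd);
  folio_get(folio);
  page_add_anon_rmap(&folio->page, vma, haddr, RMAP_COMPOUND);

  /* Drop the per-subpage PTE mappings, rmaps and references. */
  start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
  for (i = 0, addr = haddr, pte = start_pte;
       i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
          pte_t pteval = ptep_get(pte);

          ptep_clear(mm, addr, pte);
          page_remove_rmap(vm_normal_page(vma, addr, pteval), vma, false);
  }
  pte_unmap_unlock(start_pte, ptl);
  folio_ref_sub(folio, HPAGE_PMD_NR);

  /* Reuse the old PTE table as the deposited pgtable, then map the PMD. */
  pgtable_trans_huge_deposit(mm, pmd, pmd_pgtable(pmdval));
  set_pmd_at(mm, haddr, pmd, mk_huge_pmd(&folio->page, vma->vm_page_prot));
  update_mmu_cache_pmd(vma, haddr, pmd);

No new hugepage is allocated and no data is copied; the existing THP is
re-mapped by a PMD and the old PTE table is deposited for a later split.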

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..57e261387124 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1237,6 +1237,196 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	return result;
 }
 
+static struct folio *find_lock_pte_mapped_folio(struct vm_area_struct *vma,
+						unsigned long addr, pmd_t *pmd)
+{
+	pte_t *pte, pteval;
+	struct folio *folio = NULL;
+
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return NULL;
+
+	pteval = ptep_get_lockless(pte);
+	if (pte_none(pteval) || !pte_present(pteval))
+		goto out;
+
+	folio = vm_normal_folio(vma, addr, pteval);
+	if (unlikely(!folio) || unlikely(folio_is_zone_device(folio)))
+		goto out;
+
+	if (!folio_trylock(folio)) {
+		folio = NULL;
+		goto out;
+	}
+
+	if (!folio_try_get(folio)) {
+		folio_unlock(folio);
+		folio = NULL;
+		goto out;
+	}
+
+out:
+	pte_unmap(pte);
+	return folio;
+}
+
+static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				unsigned long haddr, bool *mmap_locked,
+				struct collapse_control *cc)
+{
+	struct mmu_notifier_range range;
+	struct folio *folio;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, pmdval;
+	spinlock_t *pml, *ptl;
+	pgtable_t pgtable;
+	unsigned long addr;
+	int exclusive = 0;
+	bool writable = false;
+	int result, i;
+
+	/* Fast check before locking folio if already PMD-mapped */
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+	if (result == SCAN_PMD_MAPPED)
+		return result;
+
+	folio = find_lock_pte_mapped_folio(vma, haddr, pmd);
+	if (!folio)
+		return SCAN_PAGE_NULL;
+	if (!folio_test_large(folio)) {
+		result = SCAN_FAIL;
+		goto drop_folio;
+	}
+	if (folio_order(folio) != HPAGE_PMD_ORDER) {
+		result = SCAN_PAGE_COMPOUND;
+		goto drop_folio;
+	}
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+
+	/* Prevent all access to pagetables */
+	mmap_write_lock(mm);
+	vma_start_write(vma);
+
+	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	result = check_pmd_still_valid(mm, haddr, pmd);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/* Recheck with mmap write lock */
+	result = SCAN_SUCCEED;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)
+		goto up_write;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		if (pte_none(pteval) || !pte_present(pteval)) {
+			result = SCAN_PTE_NON_PRESENT;
+			break;
+		}
+
+		if (pte_uffd_wp(pteval)) {
+			result = SCAN_PTE_UFFD_WP;
+			break;
+		}
+
+		if (pte_write(pteval))
+			writable = true;
+
+		subpage = vm_normal_page(vma, addr, pteval);
+
+		if (unlikely(!subpage) ||
+		    unlikely(is_zone_device_page(subpage))) {
+			result = SCAN_PAGE_NULL;
+			break;
+		}
+
+		if (folio_page(folio, i) != subpage) {
+			result = SCAN_FAIL;
+			break;
+		}
+
+		if (PageAnonExclusive(subpage))
+			exclusive++;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/*
+	 * Case 1:
+	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
+	 * collapse into a R/O PMD without further action.
+	 */
+	if (!(exclusive == 0 && !writable))
+		goto up_write;
+
+	/* Collapse pmd entry */
+	anon_vma_lock_write(vma->anon_vma);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	pml = pmd_lock(mm, pmd); /* probably unnecessary */
+	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(pml);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	/*
+	 * Obtain a new pmd rmap before dropping pte rmaps to avoid
+	 * false-negative page_mapped().
+	 */
+	folio_get(folio);
+	page_add_anon_rmap(&folio->page, vma, haddr, RMAP_COMPOUND);
+
+	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		ptep_clear(mm, addr, pte);
+		subpage = vm_normal_page(vma, addr, pteval);
+		page_remove_rmap(subpage, vma, false);
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	folio_ref_sub(folio, HPAGE_PMD_NR);
+
+	/* Install pmd entry */
+	pgtable = pmd_pgtable(pmdval);
+	pmdval = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	spin_lock(pml);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, pmdval);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	result = SCAN_SUCCEED;
+
+up_write:
+	mmap_write_unlock(mm);
+
+drop_folio:
+	folio_unlock(folio);
+	folio_put(folio);
+
+	/* TODO: tracepoints */
+	return result;
+}
+
 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
 				   unsigned long address, bool *mmap_locked,
@@ -1251,6 +1441,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
 	bool writable = false;
+	int exclusive = 0;
+	bool is_hpage = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1333,8 +1525,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
+		if (PageAnonExclusive(page))
+			exclusive++;
+
 		page = compound_head(page);
 
+		if (compound_order(page) == HPAGE_PMD_ORDER)
+			is_hpage = true;
+
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1396,7 +1594,21 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+
+	if (is_hpage && (exclusive == 0 && !writable)) {
+		int res;
+
+		res = collapse_pte_mapped_anon_thp(mm, vma, address,
+						   mmap_locked, cc);
+		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
+			result = res;
+			goto out;
+		}
+	}
+
 	if (result == SCAN_SUCCEED) {
+		if (!*mmap_locked)
+			mmap_read_lock(mm);
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */
-- 
2.37.1




Thread overview: 16+ messages
2023-11-13  9:05 [PATCH 0/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds Xu Yu
2023-11-13  9:05 ` [PATCH 1/1] " Xu Yu
2023-11-13  9:26   ` David Hildenbrand
2023-11-13  9:33     ` Xu Yu
2023-11-13 10:10       ` David Hildenbrand
2023-12-07  3:09 ` [PATCH v2 0/2] attempt to " Xu Yu
2023-12-07  3:09   ` [PATCH v2 1/2] mm/khugepaged: " Xu Yu
2023-12-07  7:47     ` Xu Yu
2023-12-07 10:37     ` David Hildenbrand
2023-12-18  2:45       ` Xu Yu
2023-12-07  3:09   ` [PATCH v2 2/2] mm/khugepaged: add case for mapping " Xu Yu
2023-12-18  7:06 ` [PATCH v3 0/2] attempt to map " Xu Yu
2023-12-18  7:06   ` Xu Yu [this message]
2023-12-18  7:06   ` [PATCH v3 2/2] mm/khugepaged: map exclusive " Xu Yu
2023-12-21 20:40   ` [PATCH v3 0/2] attempt to map " Zach O'Keefe
2023-12-21 20:54     ` David Hildenbrand
