* [PATCH 0/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds
@ 2023-11-13  9:05 Xu Yu
  2023-11-13  9:05 ` [PATCH 1/1] " Xu Yu
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Xu Yu @ 2023-11-13  9:05 UTC (permalink / raw)
  To: linux-mm; +Cc: akpm, zokeefe, song, shy828301

I wrote a testcase[1] to demonstrate this scenario: an anonymous THP gets
pte-mapped (here, mprotect() splits the PMD), and khugepaged later
re-collapses it by allocating and copying into a brand new hugepage.

vanilla kernel (v6.6):
# ./test
[mmap rw    ] vaddr: 0x7f2000000000, pfn: 0x110000, is_thp: 1, is_thp_mapped: 1
[mprotect ro] vaddr: 0x7f2000000000, pfn: 0x110000, is_thp: 1, is_thp_mapped: 0
          ^^ __split_huge_pmd() is called in change_protection()
[mprotect rw] vaddr: 0x7f2000000000, pfn: 0x110000, is_thp: 1, is_thp_mapped: 0
          ^^ vma_merge()
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x110000, is_thp: 1, is_thp_mapped: 0
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x110000, is_thp: 1, is_thp_mapped: 0
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x117a00, is_thp: 1, is_thp_mapped: 1
         ^^ hpage_collapse_scan_pmd       ^^^^^^^^ new hugepage is allocated
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x117a00, is_thp: 1, is_thp_mapped: 1

patched kernel:
# ./test
[mmap rw    ] vaddr: 0x7f2000000000, pfn: 0x128400, is_thp: 1, is_thp_mapped: 1
[mprotect ro] vaddr: 0x7f2000000000, pfn: 0x128400, is_thp: 1, is_thp_mapped: 0
[mprotect rw] vaddr: 0x7f2000000000, pfn: 0x128400, is_thp: 1, is_thp_mapped: 0
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x128400, is_thp: 1, is_thp_mapped: 0
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x128400, is_thp: 1, is_thp_mapped: 0
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x128400, is_thp: 1, is_thp_mapped: 1
         ^^ hpage_collapse_scan_pmd       ^^^^^^^^ old hugepage
[khugepaged ] vaddr: 0x7f2000000000, pfn: 0x128400, is_thp: 1, is_thp_mapped: 1

[1] testcase source code
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <signal.h>

#define PAGE_SHIFT	12
#define SIZE_2M		(2<<20)
#define ADDRESS		0x7f2000000000

#define PAGEMAP_PRESENT		63
#define PAGEMAP_PFN_MASK	0x007FFFFFFFFFFFFFUL    /* bits 54:0 */
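/* One 64-bit pagemap entry per virtual page: bit 63 is the present bit,
 * and bits 0-54 hold the pfn when the page is present. */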

#define KPF_SIZE	8
#define KPF_THP		22
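/* /proc/kpageflags has one 64-bit flags word per pfn; bit KPF_THP marks
 * pages that belong to a transparent hugepage. */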

static bool is_thp(unsigned long pfn)
{
	int kpageflags_fd;
	unsigned long flag;
	int ret;

	kpageflags_fd = open("/proc/kpageflags", O_RDONLY);
	if (kpageflags_fd < 0)
		return false;
	ret = pread(kpageflags_fd, &flag, KPF_SIZE, KPF_SIZE * pfn);
	close(kpageflags_fd);

	if (ret != KPF_SIZE)
		return false;

	return flag & (1 << KPF_THP);
}

static bool is_thp_mapped(void *addr)
{
	char path[64], line[128], pattern[64];
	int pid;
	FILE *fp;
	int anon_huge_pages = 0;

	snprintf(pattern, 64, "%lx", (unsigned long)addr);
	pid = getpid();
	snprintf(path, 64, "/proc/%u/smaps", pid);
	fp = fopen(path, "r");
	if (!fp)
		return false;

	while (fgets(line, sizeof(line), fp) != NULL) {
		if (strstr(line, pattern))
			break;
	}

	while (fgets(line, sizeof(line), fp) != NULL) {
		if (1 == sscanf(line, "AnonHugePages:      %d kB", &anon_huge_pages))
			break;
	}
	fclose(fp);

	return anon_huge_pages != 0;
}

static unsigned long get_pfn(unsigned long vaddr)
{
	char path[64];
	int pid, pagemap_fd;
	off64_t offset;
	unsigned long pfn = 0;
	int ret;

	offset = (vaddr >> PAGE_SHIFT) * sizeof(unsigned long);
	pid = getpid();
	snprintf(path, 64, "/proc/%u/pagemap", pid);

	pagemap_fd = open(path, O_RDONLY);
	if (pagemap_fd < 0)
		return 0;
	lseek64(pagemap_fd, offset, SEEK_SET);
	ret = read(pagemap_fd, &pfn, sizeof(pfn));
	close(pagemap_fd);

	if (ret < 0)
		return -1;

	if (!(pfn & (1UL << PAGEMAP_PRESENT)))
		return 0;

	return (pfn & PAGEMAP_PFN_MASK);
}

static void show_addr_info(void *addr, char *msg)
{
	unsigned long pfn;

	pfn = get_pfn((unsigned long)addr);
	printf("[%s] vaddr: 0x%p, pfn: 0x%lx, is_thp: %d, is_thp_mapped: %d\n",
	       msg, addr, pfn, is_thp(pfn), is_thp_mapped(addr));
}

int main(int argc, char *argv[])
{
	void *addr;

	addr = mmap((void *)ADDRESS, SIZE_2M, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	show_addr_info(addr, "mmap rw    ");

	mprotect(addr, SIZE_2M / 2, PROT_READ);
	show_addr_info(addr, "mprotect ro");

	mprotect(addr, SIZE_2M / 2, PROT_READ | PROT_WRITE);
	show_addr_info(addr, "mprotect rw");

	while (1) {
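		/* Give khugepaged time to scan and (re)collapse this range. */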
		show_addr_info(addr, "khugepaged ");
		sleep(5);
	}

	return 0;
}

Xu Yu (1):
  mm/khugepaged: map anonymous pte-mapped THPs by pmds

 mm/khugepaged.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 180 insertions(+), 7 deletions(-)

-- 
2.37.1




* [PATCH 1/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds
  2023-11-13  9:05 [PATCH 0/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds Xu Yu
@ 2023-11-13  9:05 ` Xu Yu
  2023-11-13  9:26   ` David Hildenbrand
  2023-12-07  3:09 ` [PATCH v2 0/2] attempt to " Xu Yu
  2023-12-18  7:06 ` [PATCH v3 0/2] attempt to map " Xu Yu
  2 siblings, 1 reply; 16+ messages in thread
From: Xu Yu @ 2023-11-13  9:05 UTC (permalink / raw)
  To: linux-mm; +Cc: akpm, zokeefe, song, shy828301

In the anonymous collapse path, khugepaged collapses pte-mapped
hugepages by allocating a new hugepage and copying to it, which is
suboptimal.

In fact, we only need to update the mapping page tables for anonymous
pte-mapped THPs, in the same way as file/shmem-backed pte-mapped THPs,
as shown in commit 58ac9a8993a1 ("mm/khugepaged: attempt to map
file/shmem-backed pte-mapped THPs by pmds").

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 180 insertions(+), 7 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..14069dedebdc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1384,6 +1384,12 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		     PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
 								     address)))
 			referenced++;
+
+		if (compound_order(page) == HPAGE_PMD_ORDER &&
+		    !is_huge_zero_page(page)) {
+			result = SCAN_PTE_MAPPED_HUGEPAGE;
+			goto out_unmap;
+		}
 	}
 	if (!writable) {
 		result = SCAN_PAGE_RO;
@@ -1402,6 +1408,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		/* collapse_huge_page will return with the mmap_lock released */
 		*mmap_locked = false;
 	}
+	if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
+		/* adapt to calling convention of collapse_pte_mapped_thp() */
+		mmap_read_unlock(mm);
+		*mmap_locked = false;
+	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
 				     none_or_zero, result, unmapped);
@@ -1454,6 +1465,140 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return SCAN_SUCCEED;
 }
 
+static struct page *find_lock_pte_mapped_page_unsafe(struct vm_area_struct *vma,
+						unsigned long addr, pmd_t *pmd)
+{
+	pte_t *pte, pteval;
+	struct page *page = NULL;
+
+	/* caller should recheck with ptl. */
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return NULL;
+
+	pteval = ptep_get_lockless(pte);
+	if (pte_none(pteval) || !pte_present(pteval))
+		goto out;
+
+	page = vm_normal_page(vma, addr, pteval);
+	if (unlikely(!page) || unlikely(is_zone_device_page(page)))
+		goto out;
+
+	page = compound_head(page);
+
+	if (!trylock_page(page)) {
+		page = NULL;
+		goto out;
+	}
+
+	if (!get_page_unless_zero(page)) {
+		unlock_page(page);
+		page = NULL;
+		goto out;
+	}
+
+out:
+	pte_unmap(pte);
+	return page;
+}
+
+/* call with mmap write lock, and hpage is PG_locked. */
+static noinline int collapse_pte_mapped_thp_anon(struct mm_struct *mm,
+					struct vm_area_struct *vma,
+					unsigned long haddr, struct page *hpage)
+{
+	struct mmu_notifier_range range;
+	unsigned long addr;
+	pmd_t *pmd, pmdval;
+	pte_t *start_pte, *pte;
+	spinlock_t *pml, *ptl;
+	pgtable_t pgtable;
+	int result, i;
+
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+	if (result != SCAN_SUCCEED)
+		goto out;
+
+	result = SCAN_FAIL;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)		/* mmap_lock + page lock should prevent this */
+		goto out;
+	/* step 1: check all mapped PTEs are to the right huge page */
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+		pte_t pteval = ptep_get(pte);
+
+		if (pte_none(pteval) || !pte_present(pteval)) {
+			result = SCAN_PTE_NON_PRESENT;
+			goto out_unmap;
+		}
+
+		page = vm_normal_page(vma, addr, pteval);
+		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+			page = NULL;
+		/*
+		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
+		 * page table, but the new page will not be a subpage of hpage.
+		 */
+		if (hpage + i != page)
+			goto out_unmap;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	/* step 2: clear page table and adjust rmap */
+	vma_start_write(vma);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	pml = pmd_lock(mm, pmd);
+	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
+
+	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+	if (!start_pte)
+		goto abort;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+		pte_t pteval = ptep_get(pte);
+
+		page = vm_normal_page(vma, addr, pteval);
+		page_remove_rmap(page, vma, false);
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	/* step 3: install pmd entry */
+	pgtable = pmd_pgtable(pmdval);
+
+	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
+	pmdval = maybe_pmd_mkwrite(pmd_mkdirty(pmdval), vma);
+
+	spin_lock(pml);
+	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, pmdval);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	result = SCAN_SUCCEED;
+	return result;
+abort:
+	spin_lock(pml);
+	pmd_populate(mm, pmd, pmd_pgtable(pmdval));
+	spin_unlock(pml);
+out_unmap:
+	if (start_pte)
+		pte_unmap_unlock(start_pte, ptl);
+out:
+	return result;
+}
+
 /**
  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
  * address haddr.
@@ -1479,14 +1624,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	spinlock_t *pml = NULL, *ptl;
 	int nr_ptes = 0, result = SCAN_FAIL;
 	int i;
+	bool file;
 
 	mmap_assert_locked(mm);
 
 	/* First check VMA found, in case page tables are being torn down */
-	if (!vma || !vma->vm_file ||
-	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
+	if (!vma || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
 		return SCAN_VMA_CHECK;
 
+	file = !!vma->vm_file;
+
 	/* Fast check before locking page if already PMD-mapped */
 	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
 	if (result == SCAN_PMD_MAPPED)
@@ -1506,8 +1653,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (userfaultfd_wp(vma))
 		return SCAN_PTE_UFFD_WP;
 
-	hpage = find_lock_page(vma->vm_file->f_mapping,
-			       linear_page_index(vma, haddr));
+	if (file)
+		hpage = find_lock_page(vma->vm_file->f_mapping,
+				linear_page_index(vma, haddr));
+	else
+		hpage = find_lock_pte_mapped_page_unsafe(vma, haddr, pmd);
 	if (!hpage)
 		return SCAN_PAGE_NULL;
 
@@ -1521,6 +1671,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
+	if (!file) {
+		result = collapse_pte_mapped_thp_anon(mm, vma, haddr, hpage);
+		goto drop_hpage;
+	}
+
 	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
 	switch (result) {
 	case SCAN_SUCCEED:
@@ -2415,6 +2570,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			} else {
 				*result = hpage_collapse_scan_pmd(mm, vma,
 					khugepaged_scan.address, &mmap_locked, cc);
+				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
+					mmap_write_lock(mm);
+					if (hpage_collapse_test_exit(mm)) {
+						mmap_write_unlock(mm);
+						goto breakouterloop_mmap_lock;
+					}
+					*result = collapse_pte_mapped_thp(mm,
+						khugepaged_scan.address, true);
+					if (*result == SCAN_PMD_MAPPED)
+						*result = SCAN_SUCCEED;
+					mmap_write_unlock(mm);
+				}
 			}
 
 			if (*result == SCAN_SUCCEED)
@@ -2764,9 +2931,15 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		case SCAN_PTE_MAPPED_HUGEPAGE:
 			BUG_ON(mmap_locked);
 			BUG_ON(*prev);
-			mmap_read_lock(mm);
-			result = collapse_pte_mapped_thp(mm, addr, true);
-			mmap_read_unlock(mm);
+			if (vma->vm_file) {
+				mmap_read_lock(mm);
+				result = collapse_pte_mapped_thp(mm, addr, true);
+				mmap_read_unlock(mm);
+			} else {
+				mmap_write_lock(mm);
+				result = collapse_pte_mapped_thp(mm, addr, true);
+				mmap_write_unlock(mm);
+			}
 			goto handle_result;
 		/* Whitelisted set of results where continuing OK */
 		case SCAN_PMD_NULL:
-- 
2.37.1




* Re: [PATCH 1/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds
  2023-11-13  9:05 ` [PATCH 1/1] " Xu Yu
@ 2023-11-13  9:26   ` David Hildenbrand
  2023-11-13  9:33     ` Xu Yu
  0 siblings, 1 reply; 16+ messages in thread
From: David Hildenbrand @ 2023-11-13  9:26 UTC (permalink / raw)
  To: Xu Yu, linux-mm; +Cc: akpm, zokeefe, song, shy828301

On 13.11.23 10:05, Xu Yu wrote:
> In the anonymous collapse path, khugepaged collapses pte-mapped
> hugepages by allocating and copying to a new hugepage, which is
> suboptimally.
> 
> In fact, we only need to update the mapping page tables for anonymous
> pte-mapped THPs, in the same way as file/shmem-backed pte-mapped THPs,
> as shown in commit 58ac9a8993a1 ("mm/khugepaged: attempt to map
> file/shmem-backed pte-mapped THPs by pmds").

Somewhere along that patch set discussion, there was a discussion about
how to deal with PageAnonExclusive properly.

Quite a lot of information is documented in:

commit dee2ad120571f38433211098cd6b95a59bdfc8c7
Author: David Hildenbrand <david@redhat.com>
Date:   Wed Jan 4 15:49:05 2023 +0100

     selftests/vm: cow: add COW tests for collapsing of PTE-mapped anon THP
     
     Currently, anonymous PTE-mapped THPs cannot be collapsed in-place:
     collapsing (e.g., via MADV_COLLAPSE) implies allocating a fresh THP and
     mapping that new THP via a PMD: as it's a fresh anon THP, it will get the
     exclusive flag set on the head page and everybody is happy.
...



I purely speculate that you didn't consider it and got it wrong, sorry :P

Fortunately, I wrote a test case for that where it's documented (above)

tools/testing/selftests/mm/cow.c tests:

# [RUN] Basic COW after fork() when collapsing before fork()
ok 169 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (fully shared)
ok 170 # SKIP MADV_COLLAPSE failed: Invalid argument
# [RUN] Basic COW after fork() when collapsing after fork() (lower shared)
ok 171 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (upper shared)
ok 172 No leak from parent into child


If your implementation supports MADV_COLLAPSE, it might reveal any problems.
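
For example, a minimal way to exercise that path from the testcase in your
cover letter might be something along these lines (assuming MADV_COLLAPSE is
exposed by the libc or kernel UAPI headers you build against):

	/* Hypothetical addition to the cover-letter testcase: try an
	 * in-place collapse of the range, then re-check the mapping. */
	if (madvise(addr, SIZE_2M, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");
	show_addr_info(addr, "collapse   ");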

-- 
Cheers,

David / dhildenb




* Re: [PATCH 1/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds
  2023-11-13  9:26   ` David Hildenbrand
@ 2023-11-13  9:33     ` Xu Yu
  2023-11-13 10:10       ` David Hildenbrand
  0 siblings, 1 reply; 16+ messages in thread
From: Xu Yu @ 2023-11-13  9:33 UTC (permalink / raw)
  To: David Hildenbrand, linux-mm; +Cc: akpm, zokeefe, song, shy828301

On 11/13/23 5:26 PM, David Hildenbrand wrote:
> On 13.11.23 10:05, Xu Yu wrote:
>> In the anonymous collapse path, khugepaged collapses pte-mapped
>> hugepages by allocating and copying to a new hugepage, which is
>> suboptimally.
>>
>> In fact, we only need to update the mapping page tables for anonymous
>> pte-mapped THPs, in the same way as file/shmem-backed pte-mapped THPs,
>> as shown in commit 58ac9a8993a1 ("mm/khugepaged: attempt to map
>> file/shmem-backed pte-mapped THPs by pmds").
> 
> Somewhere along that patch set discussion, there was a discussion about
> how to deal with PageAnonExclusive properly.
> 
> Quite a lot of information is documented in:
> 
> commit dee2ad120571f38433211098cd6b95a59bdfc8c7
> Author: David Hildenbrand <david@redhat.com>
> Date:   Wed Jan 4 15:49:05 2023 +0100
> 
>      selftests/vm: cow: add COW tests for collapsing of PTE-mapped anon THP
>      Currently, anonymous PTE-mapped THPs cannot be collapsed in-place:
>      collapsing (e.g., via MADV_COLLAPSE) implies allocating a fresh THP and
>      mapping that new THP via a PMD: as it's a fresh anon THP, it will get the
>      exclusive flag set on the head page and everybody is happy.
> ...
> 
> 
> 
> I purely speculate that you didn't consider it and got it wrong, sorry :P

As you said, I did not consider it. Actually, I didn't even know that. :(

> 
> Fortunately, I wrote a test case for that where it's documented (above)

Thanks very much! I will dig into it.

> 
> tools/testing/selftests/mm/cow.c tests:
> 
> # [RUN] Basic COW after fork() when collapsing before fork()
> ok 169 No leak from parent into child
> # [RUN] Basic COW after fork() when collapsing after fork() (fully shared)
> ok 170 # SKIP MADV_COLLAPSE failed: Invalid argument
> # [RUN] Basic COW after fork() when collapsing after fork() (lower shared)
> ok 171 No leak from parent into child
> # [RUN] Basic COW after fork() when collapsing after fork() (upper shared)
> ok 172 No leak from parent into child
> 
> 
> If your implementation supports MADV_COLLAPSE, it might reveal any problems.
> 
-- 
Thanks,
Yu




* Re: [PATCH 1/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds
  2023-11-13  9:33     ` Xu Yu
@ 2023-11-13 10:10       ` David Hildenbrand
  0 siblings, 0 replies; 16+ messages in thread
From: David Hildenbrand @ 2023-11-13 10:10 UTC (permalink / raw)
  To: Xu Yu, linux-mm; +Cc: akpm, zokeefe, song, shy828301

On 13.11.23 10:33, Xu Yu wrote:
> On 11/13/23 5:26 PM, David Hildenbrand wrote:
>> On 13.11.23 10:05, Xu Yu wrote:
>>> In the anonymous collapse path, khugepaged collapses pte-mapped
>>> hugepages by allocating and copying to a new hugepage, which is
>>> suboptimally.
>>>
>>> In fact, we only need to update the mapping page tables for anonymous
>>> pte-mapped THPs, in the same way as file/shmem-backed pte-mapped THPs,
>>> as shown in commit 58ac9a8993a1 ("mm/khugepaged: attempt to map
>>> file/shmem-backed pte-mapped THPs by pmds").
>>
>> Somewhere along that patch set discussion, there was a discussion about
>> how to deal with PageAnonExclusive properly.
>>
>> Quite a lot of information is documented in:
>>
>> commit dee2ad120571f38433211098cd6b95a59bdfc8c7
>> Author: David Hildenbrand <david@redhat.com>
>> Date:   Wed Jan 4 15:49:05 2023 +0100
>>
>>       selftests/vm: cow: add COW tests for collapsing of PTE-mapped anon THP
>>       Currently, anonymous PTE-mapped THPs cannot be collapsed in-place:
>>       collapsing (e.g., via MADV_COLLAPSE) implies allocating a fresh THP and
>>       mapping that new THP via a PMD: as it's a fresh anon THP, it will get the
>>       exclusive flag set on the head page and everybody is happy.
>> ...
>>
>>
>>
>> I purely speculate that you didn't consider it and got it wrong, sorry :P
> 
> As you said, I did not consider it. Actually, I didn't even know that. :(

I think these are the two simplest cases to handle, and the ones we should 
implement:

no subpages are PageAnonExclusive (PTEs must be R/O) -> we can collapse 
into a R/O PMD without further action.

all subpages are PageAnonExclusive (PTEs may be either R/O or R/W) -> 
clear PageAnonExclusive on all tail pages but the first (head) page and 
collapse to a R/W PMD with VM_WRITE or a R/O PMD without VM_WRITE.

We must check PageAnonExclusive with the page table lock held, and 
recheck in case we dropped it (IOW, what existing collapse code already 
does).

softdirty tracking and uffd-wp checks should likely be the same as for 
existing collapse handling.
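
Roughly, as a sketch only (assuming "exclusive" counts the PageAnonExclusive
subpages and "writable" records whether any PTE is writable, both collected
under the page table lock, with hpage/pmdval/vma as in the collapse code),
the distinction would be:

	if (exclusive == 0 && !writable) {
		/* Case 1: all PTEs R/O, no subpage exclusive: install a
		 * R/O PMD, nothing else to do. */
	} else if (exclusive == HPAGE_PMD_NR) {
		/* Case 2: every subpage is exclusive: keep the flag on the
		 * head page only and clear it on the tail pages ... */
		for (i = 1; i < HPAGE_PMD_NR; i++)
			ClearPageAnonExclusive(hpage + i);
		/* ... and let the PMD become writable only with VM_WRITE. */
		pmdval = maybe_pmd_mkwrite(pmd_mkdirty(pmdval), vma);
	} else {
		/* Mixed: fall back to the copy-based collapse. */
	}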

-- 
Cheers,

David / dhildenb




* [PATCH v2 0/2] attempt to map anonymous pte-mapped THPs by pmds
  2023-11-13  9:05 [PATCH 0/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds Xu Yu
  2023-11-13  9:05 ` [PATCH 1/1] " Xu Yu
@ 2023-12-07  3:09 ` Xu Yu
  2023-12-07  3:09   ` [PATCH v2 1/2] mm/khugepaged: " Xu Yu
  2023-12-07  3:09   ` [PATCH v2 2/2] mm/khugepaged: add case for mapping " Xu Yu
  2023-12-18  7:06 ` [PATCH v3 0/2] attempt to map " Xu Yu
  2 siblings, 2 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-07  3:09 UTC (permalink / raw)
  To: linux-mm; +Cc: david

Result of tools/testing/selftests/mm/cow.c tests:
# [RUN] Basic COW after fork() when collapsing before fork()
ok 145 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (fully shared)
ok 146 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (lower shared)
ok 147 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (upper shared)
ok 148 No leak from parent into child

Debug messages (via printk for now, tracepoints will be added later) show
that the THP in cases 145 and 146 is remapped by a pmd in place.

To be honest, I'm not sure whether some of the conditions, such as page
pinning and uffd, are handled properly, so I am sending out this working
patchset to collect more comments. Many thanks!
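
For the pin concern specifically, one option (illustration only, not
implemented in this series; hpage as in collapse_pte_mapped_anon_thp())
might be to refuse the in-place collapse when the hugepage may hold extra
pins, e.g.:

	/* Illustration only: bail out if the compound page may be pinned
	 * via pin_user_pages() and friends; such ranges can still go
	 * through the regular copy-based collapse path. */
	if (page_maybe_dma_pinned(hpage))
		return SCAN_PAGE_COUNT;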


Changes since v1:
- Deal with PageAnonExclusive properly, as suggested by David.

Xu Yu (2):
  mm/khugepaged: attempt to map anonymous pte-mapped THPs by pmds
  mm/khugepaged: add case for mapping anonymous pte-mapped THPs by pmds

 mm/khugepaged.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)

-- 
2.37.1




* [PATCH v2 1/2] mm/khugepaged: attempt to map anonymous pte-mapped THPs by pmds
  2023-12-07  3:09 ` [PATCH v2 0/2] attempt to " Xu Yu
@ 2023-12-07  3:09   ` Xu Yu
  2023-12-07  7:47     ` Xu Yu
  2023-12-07 10:37     ` David Hildenbrand
  2023-12-07  3:09   ` [PATCH v2 2/2] mm/khugepaged: add case for mapping " Xu Yu
  1 sibling, 2 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-07  3:09 UTC (permalink / raw)
  To: linux-mm; +Cc: david

In the anonymous collapse path, khugepaged always collapses pte-mapped
hugepages by allocating a new hugepage and copying to it.

In some scenarios, we can simply update the mapping page tables for
anonymous pte-mapped THPs instead, in the same way as file/shmem-backed
pte-mapped THPs, as done in commit 58ac9a8993a1 ("mm/khugepaged:
attempt to map file/shmem-backed pte-mapped THPs by pmds").

The simplest scenario that satisfies the conditions, as David points out,
is when no subpages are PageAnonExclusive (PTEs must be R/O); in that case
we can collapse into a R/O PMD without further action.

Let's start from this simplest scenario.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 214 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..85c7a2ab44ce 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1237,6 +1237,197 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	return result;
 }
 
+static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
+					      unsigned long addr, pmd_t *pmd)
+{
+	pte_t *pte, pteval;
+	struct page *page = NULL;
+
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return NULL;
+
+	pteval = ptep_get_lockless(pte);
+	if (pte_none(pteval) || !pte_present(pteval))
+		goto out;
+
+	page = vm_normal_page(vma, addr, pteval);
+	if (unlikely(!page) || unlikely(is_zone_device_page(page)))
+		goto out;
+
+	page = compound_head(page);
+
+	if (!trylock_page(page)) {
+		page = NULL;
+		goto out;
+	}
+
+	if (!get_page_unless_zero(page)) {
+		unlock_page(page);
+		page = NULL;
+		goto out;
+	}
+
+out:
+	pte_unmap(pte);
+	return page;
+}
+
+static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				unsigned long haddr, bool *mmap_locked,
+				struct collapse_control *cc)
+{
+	struct mmu_notifier_range range;
+	struct page *hpage;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, pmdval;
+	spinlock_t *pml, *ptl;
+	pgtable_t pgtable;
+	unsigned long addr;
+	int exclusive = 0;
+	bool writable = false;
+	int result, i;
+
+	/* Fast check before locking page if already PMD-mapped */
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+	if (result == SCAN_PMD_MAPPED)
+		return result;
+
+	hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
+	if (!hpage)
+		return SCAN_PAGE_NULL;
+	if (!PageHead(hpage)) {
+		result = SCAN_FAIL;
+		goto drop_hpage;
+	}
+	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+		result = SCAN_PAGE_COMPOUND;
+		goto drop_hpage;
+	}
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+
+	/* Prevent all access to pagetables */
+	mmap_write_lock(mm);
+
+	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	result = check_pmd_still_valid(mm, haddr, pmd);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/* Recheck with mmap write lock */
+	result = SCAN_SUCCEED;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)
+		goto drop_hpage;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+		pte_t pteval = ptep_get(pte);
+
+		if (pte_none(pteval) || !pte_present(pteval)) {
+			result = SCAN_PTE_NON_PRESENT;
+			break;
+		}
+
+		if (pte_uffd_wp(pteval)) {
+			result = SCAN_PTE_UFFD_WP;
+			break;
+		}
+
+		if (pte_write(pteval))
+			writable = true;
+
+		page = vm_normal_page(vma, addr, pteval);
+
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+			result = SCAN_PAGE_NULL;
+			break;
+		}
+
+		if (hpage + i != page) {
+			result = SCAN_FAIL;
+			break;
+		}
+
+		if (PageAnonExclusive(page))
+			exclusive++;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	if (result != SCAN_SUCCEED)
+		goto drop_hpage;
+
+	/*
+	 * Case 1:
+	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
+	 * collapse into a R/O PMD without further action.
+	 */
+	if (!(exclusive == 0 && !writable))
+		goto drop_hpage;
+
+	/* Collapse pmd entry */
+	vma_start_write(vma);
+	anon_vma_lock_write(vma->anon_vma);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	pml = pmd_lock(mm, pmd); /* probably unnecessary */
+	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(pml);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+	if (!start_pte)
+		goto rollback;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+		pte_t pteval = ptep_get(pte);
+
+		page = vm_normal_page(vma, addr, pteval);
+		page_remove_rmap(page, vma, false);
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	/* Install pmd entry */
+	pgtable = pmd_pgtable(pmdval);
+	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
+	spin_lock(pml);
+	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, pmdval);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	result = SCAN_SUCCEED;
+	goto up_write;
+
+rollback:
+	spin_lock(pml);
+	pmd_populate(mm, pmd, pmd_pgtable(pmdval));
+	spin_unlock(pml);
+
+up_write:
+	mmap_write_unlock(mm);
+
+drop_hpage:
+	unlock_page(hpage);
+	put_page(hpage);
+
+	/* TODO: tracepoints */
+	return result;
+}
+
 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
 				   unsigned long address, bool *mmap_locked,
@@ -1251,6 +1442,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
 	bool writable = false;
+	int exclusive = 0;
+	bool is_hpage = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1333,8 +1526,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
+		if (PageAnonExclusive(page))
+			exclusive++;
+
 		page = compound_head(page);
 
+		if (compound_order(page) == HPAGE_PMD_ORDER)
+			is_hpage = true;
+
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1396,7 +1595,22 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+
+	if (is_hpage && (exclusive == 0 && !writable)) {
+		int res;
+
+		res = collapse_pte_mapped_anon_thp(mm, vma, address,
+						   mmap_locked, cc);
+		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
+			result = res;
+			goto out;
+		}
+
+	}
+
 	if (result == SCAN_SUCCEED) {
+		if (!*mmap_locked)
+			mmap_read_lock(mm);
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */
-- 
2.37.1




* [PATCH v2 2/2] mm/khugepaged: add case for mapping anonymous pte-mapped THPs by pmds
  2023-12-07  3:09 ` [PATCH v2 0/2] attempt to " Xu Yu
  2023-12-07  3:09   ` [PATCH v2 1/2] mm/khugepaged: " Xu Yu
@ 2023-12-07  3:09   ` Xu Yu
  1 sibling, 0 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-07  3:09 UTC (permalink / raw)
  To: linux-mm; +Cc: david

This adds another case, pointed out by David, that is suitable for
mapping anonymous pte-mapped THPs by pmds: when all subpages are
PageAnonExclusive (PTEs may be either R/O or R/W), we can clear
PageAnonExclusive on all tail pages but the first (head) page and
collapse to a R/W PMD with VM_WRITE or a R/O PMD without VM_WRITE.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 85c7a2ab44ce..5c51b09cb291 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1366,8 +1366,14 @@ static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
 	 * Case 1:
 	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
 	 * collapse into a R/O PMD without further action.
+	 *
+	 * Case 2:
+	 * All subpages are PageAnonExclusive (PTEs may be either R/O or R/W),
+	 * we clear PageAnonExclusive on all tail pages but the head page and
+	 * collapse to a R/W PMD with VM_WRITE or a R/O PMD without VM_WRITE.
 	 */
-	if (!(exclusive == 0 && !writable))
+	if (!((exclusive == 0 && !writable) ||
+	      (exclusive == HPAGE_PMD_NR)))
 		goto drop_hpage;
 
 	/* Collapse pmd entry */
@@ -1396,12 +1402,21 @@ static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
 
 		page = vm_normal_page(vma, addr, pteval);
 		page_remove_rmap(page, vma, false);
+
+		if (exclusive == HPAGE_PMD_NR)
+			ClearPageAnonExclusive(page);
 	}
 	pte_unmap_unlock(start_pte, ptl);
 
 	/* Install pmd entry */
 	pgtable = pmd_pgtable(pmdval);
 	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
+
+	if (exclusive == HPAGE_PMD_NR) {
+		SetPageAnonExclusive(hpage);
+		pmdval = maybe_pmd_mkwrite(pmd_mkdirty(pmdval), vma);
+	}
+
 	spin_lock(pml);
 	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
@@ -1596,7 +1611,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 
-	if (is_hpage && (exclusive == 0 && !writable)) {
+	if (is_hpage &&
+	    ((exclusive == 0 && !writable) ||
+	     (exclusive == HPAGE_PMD_NR))) {
 		int res;
 
 		res = collapse_pte_mapped_anon_thp(mm, vma, address,
-- 
2.37.1




* Re: [PATCH v2 1/2] mm/khugepaged: attempt to map anonymous pte-mapped THPs by pmds
  2023-12-07  3:09   ` [PATCH v2 1/2] mm/khugepaged: " Xu Yu
@ 2023-12-07  7:47     ` Xu Yu
  2023-12-07 10:37     ` David Hildenbrand
  1 sibling, 0 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-07  7:47 UTC (permalink / raw)
  To: linux-mm; +Cc: david

On 12/7/23 11:09 AM, Xu Yu wrote:
> In the anonymous collapse path, khugepaged always collapses pte-mapped
> hugepage by allocating and copying to a new hugepage.
> 
> In some scenarios, we can only update the mapping page tables for
> anonymous pte-mapped THPs, in the same way as file/shmem-backed
> pte-mapped THPs, as shown in commit 58ac9a8993a1 ("mm/khugepaged:
> attempt to map file/shmem-backed pte-mapped THPs by pmds")
> 
> The simplest scenario that satisfies the conditions, as David points out,
> is when no subpages are PageAnonExclusive (PTEs must be R/O), we can
> collapse into a R/O PMD without further action.
> 
> Let's start from this simplest scenario.
> 
> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
> ---
>   mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 214 insertions(+)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 88433cc25d8a..85c7a2ab44ce 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1237,6 +1237,197 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   	return result;
>   }
>   
> +static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
> +					      unsigned long addr, pmd_t *pmd)
> +{
> +	pte_t *pte, pteval;
> +	struct page *page = NULL;
> +
> +	pte = pte_offset_map(pmd, addr);
> +	if (!pte)
> +		return NULL;
> +
> +	pteval = ptep_get_lockless(pte);
> +	if (pte_none(pteval) || !pte_present(pteval))
> +		goto out;
> +
> +	page = vm_normal_page(vma, addr, pteval);
> +	if (unlikely(!page) || unlikely(is_zone_device_page(page)))
> +		goto out;
> +
> +	page = compound_head(page);
> +
> +	if (!trylock_page(page)) {
> +		page = NULL;
> +		goto out;
> +	}
> +
> +	if (!get_page_unless_zero(page)) {
> +		unlock_page(page);
> +		page = NULL;
> +		goto out;
> +	}
> +
> +out:
> +	pte_unmap(pte);
> +	return page;
> +}
> +
> +static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
> +				struct vm_area_struct *vma,
> +				unsigned long haddr, bool *mmap_locked,
> +				struct collapse_control *cc)
> +{
> +	struct mmu_notifier_range range;
> +	struct page *hpage;
> +	pte_t *start_pte, *pte;
> +	pmd_t *pmd, pmdval;
> +	spinlock_t *pml, *ptl;
> +	pgtable_t pgtable;
> +	unsigned long addr;
> +	int exclusive = 0;
> +	bool writable = false;
> +	int result, i;
> +
> +	/* Fast check before locking page if already PMD-mapped */
> +	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
> +	if (result == SCAN_PMD_MAPPED)
> +		return result;
> +
> +	hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
> +	if (!hpage)
> +		return SCAN_PAGE_NULL;
> +	if (!PageHead(hpage)) {
> +		result = SCAN_FAIL;
> +		goto drop_hpage;
> +	}
> +	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
> +		result = SCAN_PAGE_COMPOUND;
> +		goto drop_hpage;
> +	}
> +
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;
> +
> +	/* Prevent all access to pagetables */
> +	mmap_write_lock(mm);
> +
> +	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	result = check_pmd_still_valid(mm, haddr, pmd);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	/* Recheck with mmap write lock */
> +	result = SCAN_SUCCEED;
> +	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
> +	if (!start_pte)
> +		goto drop_hpage;
                      ^^^^^^^^^^ should be up_write.
> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		if (pte_none(pteval) || !pte_present(pteval)) {
> +			result = SCAN_PTE_NON_PRESENT;
> +			break;
> +		}
> +
> +		if (pte_uffd_wp(pteval)) {
> +			result = SCAN_PTE_UFFD_WP;
> +			break;
> +		}
> +
> +		if (pte_write(pteval))
> +			writable = true;
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> +			result = SCAN_PAGE_NULL;
> +			break;
> +		}
> +
> +		if (hpage + i != page) {
> +			result = SCAN_FAIL;
> +			break;
> +		}
> +
> +		if (PageAnonExclusive(page))
> +			exclusive++;
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +	if (result != SCAN_SUCCEED)
> +		goto drop_hpage;
                      ^^^^^^^^^^ should be up_write.
> +
> +	/*
> +	 * Case 1:
> +	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
> +	 * collapse into a R/O PMD without further action.
> +	 */
> +	if (!(exclusive == 0 && !writable))
> +		goto drop_hpage;
                      ^^^^^^^^^^ should be up_write.
> +
> +	/* Collapse pmd entry */
> +	vma_start_write(vma);
> +	anon_vma_lock_write(vma->anon_vma);
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +				haddr, haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +
> +	pml = pmd_lock(mm, pmd); /* probably unnecessary */
> +	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
> +	spin_unlock(pml);
> +	mmu_notifier_invalidate_range_end(&range);
> +	tlb_remove_table_sync_one();
> +
> +	anon_vma_unlock_write(vma->anon_vma);
> +
> +	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
> +	if (!start_pte)
> +		goto rollback;
> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +		page_remove_rmap(page, vma, false);
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +
> +	/* Install pmd entry */
> +	pgtable = pmd_pgtable(pmdval);
> +	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
> +	spin_lock(pml);
> +	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
> +	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +	set_pmd_at(mm, haddr, pmd, pmdval);
> +	update_mmu_cache_pmd(vma, haddr, pmd);
> +	spin_unlock(pml);
> +
> +	result = SCAN_SUCCEED;
> +	goto up_write;
> +
> +rollback:
> +	spin_lock(pml);
> +	pmd_populate(mm, pmd, pmd_pgtable(pmdval));
> +	spin_unlock(pml);
> +
> +up_write:
> +	mmap_write_unlock(mm);
> +
> +drop_hpage:
> +	unlock_page(hpage);
> +	put_page(hpage);
> +
> +	/* TODO: tracepoints */
> +	return result;
> +}
> +
>   static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   				   struct vm_area_struct *vma,
>   				   unsigned long address, bool *mmap_locked,
> @@ -1251,6 +1442,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   	spinlock_t *ptl;
>   	int node = NUMA_NO_NODE, unmapped = 0;
>   	bool writable = false;
> +	int exclusive = 0;
> +	bool is_hpage = false;
>   
>   	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>   
> @@ -1333,8 +1526,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   			}
>   		}
>   
> +		if (PageAnonExclusive(page))
> +			exclusive++;
> +
>   		page = compound_head(page);
>   
> +		if (compound_order(page) == HPAGE_PMD_ORDER)
> +			is_hpage = true;
> +
>   		/*
>   		 * Record which node the original page is from and save this
>   		 * information to cc->node_load[].
> @@ -1396,7 +1595,22 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   	}
>   out_unmap:
>   	pte_unmap_unlock(pte, ptl);
> +
> +	if (is_hpage && (exclusive == 0 && !writable)) {
> +		int res;
> +
> +		res = collapse_pte_mapped_anon_thp(mm, vma, address,
> +						   mmap_locked, cc);
> +		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
> +			result = res;
> +			goto out;
> +		}
> +
> +	}
> +
>   	if (result == SCAN_SUCCEED) {
> +		if (!*mmap_locked)
> +			mmap_read_lock(mm);
>   		result = collapse_huge_page(mm, address, referenced,
>   					    unmapped, cc);
>   		/* collapse_huge_page will return with the mmap_lock released */
-- 
Thanks,
Yu




* Re: [PATCH v2 1/2] mm/khugepaged: attempt to map anonymous pte-mapped THPs by pmds
  2023-12-07  3:09   ` [PATCH v2 1/2] mm/khugepaged: " Xu Yu
  2023-12-07  7:47     ` Xu Yu
@ 2023-12-07 10:37     ` David Hildenbrand
  2023-12-18  2:45       ` Xu Yu
  1 sibling, 1 reply; 16+ messages in thread
From: David Hildenbrand @ 2023-12-07 10:37 UTC (permalink / raw)
  To: Xu Yu, linux-mm

On 07.12.23 04:09, Xu Yu wrote:
> In the anonymous collapse path, khugepaged always collapses pte-mapped
> hugepage by allocating and copying to a new hugepage.
> 
> In some scenarios, we can only update the mapping page tables for
> anonymous pte-mapped THPs, in the same way as file/shmem-backed
> pte-mapped THPs, as shown in commit 58ac9a8993a1 ("mm/khugepaged:
> attempt to map file/shmem-backed pte-mapped THPs by pmds")
> 
> The simplest scenario that satisfies the conditions, as David points out,
> is when no subpages are PageAnonExclusive (PTEs must be R/O), we can
> collapse into a R/O PMD without further action.
> 
> Let's start from this simplest scenario.
> 
> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
> ---
>   mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 214 insertions(+)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 88433cc25d8a..85c7a2ab44ce 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1237,6 +1237,197 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   	return result;
>   }
>   
> +static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
> +					      unsigned long addr, pmd_t *pmd)
> +{
> +	pte_t *pte, pteval;
> +	struct page *page = NULL;
> +
> +	pte = pte_offset_map(pmd, addr);
> +	if (!pte)
> +		return NULL;
> +
> +	pteval = ptep_get_lockless(pte);
> +	if (pte_none(pteval) || !pte_present(pteval))
> +		goto out;
> +
> +	page = vm_normal_page(vma, addr, pteval);
> +	if (unlikely(!page) || unlikely(is_zone_device_page(page)))
> +		goto out;
> +
> +	page = compound_head(page);
> +
> +	if (!trylock_page(page)) {
> +		page = NULL;
> +		goto out;
> +	}
> +
> +	if (!get_page_unless_zero(page)) {
> +		unlock_page(page);
> +		page = NULL;
> +		goto out;
> +	}
> +
> +out:
> +	pte_unmap(pte);
> +	return page;
> +}
> +
> +static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
> +				struct vm_area_struct *vma,
> +				unsigned long haddr, bool *mmap_locked,
> +				struct collapse_control *cc)
> +{
> +	struct mmu_notifier_range range;
> +	struct page *hpage;
> +	pte_t *start_pte, *pte;
> +	pmd_t *pmd, pmdval;
> +	spinlock_t *pml, *ptl;
> +	pgtable_t pgtable;
> +	unsigned long addr;
> +	int exclusive = 0;
> +	bool writable = false;
> +	int result, i;
> +
> +	/* Fast check before locking page if already PMD-mapped */
> +	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
> +	if (result == SCAN_PMD_MAPPED)
> +		return result;
> +
> +	hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
> +	if (!hpage)
> +		return SCAN_PAGE_NULL;
> +	if (!PageHead(hpage)) {
> +		result = SCAN_FAIL;
> +		goto drop_hpage;
> +	}
> +	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
> +		result = SCAN_PAGE_COMPOUND;
> +		goto drop_hpage;
> +	}
> +
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;
> +
> +	/* Prevent all access to pagetables */
> +	mmap_write_lock(mm);
> +
> +	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	result = check_pmd_still_valid(mm, haddr, pmd);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	/* Recheck with mmap write lock */
> +	result = SCAN_SUCCEED;
> +	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
> +	if (!start_pte)
> +		goto drop_hpage;
> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		if (pte_none(pteval) || !pte_present(pteval)) {
> +			result = SCAN_PTE_NON_PRESENT;
> +			break;
> +		}
> +
> +		if (pte_uffd_wp(pteval)) {
> +			result = SCAN_PTE_UFFD_WP;
> +			break;
> +		}
> +
> +		if (pte_write(pteval))
> +			writable = true;
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> +			result = SCAN_PAGE_NULL;
> +			break;
> +		}
> +
> +		if (hpage + i != page) {
> +			result = SCAN_FAIL;
> +			break;
> +		}
> +
> +		if (PageAnonExclusive(page))
> +			exclusive++;
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +	if (result != SCAN_SUCCEED)
> +		goto drop_hpage;
> +

I'm wondering if anybody can now modify the page table. Likely you also 
need the VMA write lock. The mmap write lock is no longer sufficient.

> +	/*
> +	 * Case 1:
> +	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
> +	 * collapse into a R/O PMD without further action.
> +	 */
> +	if (!(exclusive == 0 && !writable))
> +		goto drop_hpage;
> +
> +	/* Collapse pmd entry */
> +	vma_start_write(vma);
> +	anon_vma_lock_write(vma->anon_vma);
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +				haddr, haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +
> +	pml = pmd_lock(mm, pmd); /* probably unnecessary */
> +	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
> +	spin_unlock(pml);
> +	mmu_notifier_invalidate_range_end(&range);
> +	tlb_remove_table_sync_one();
> +
> +	anon_vma_unlock_write(vma->anon_vma);
> +
> +	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
> +	if (!start_pte)
> +		goto rollback;

How can that happen?

> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +		page_remove_rmap(page, vma, false);
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +
> +	/* Install pmd entry */
> +	pgtable = pmd_pgtable(pmdval);
> +	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
> +	spin_lock(pml);
> +	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
> +	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +	set_pmd_at(mm, haddr, pmd, pmdval);
> +	update_mmu_cache_pmd(vma, haddr, pmd);
> +	spin_unlock(pml);

1) Looks like you are missing the refcount update.

You should take one reference before adding the new rmap, and drop the 
other references when dropping the rmaps.

2) You should drop the old rmaps after obtaining the new rmap (+reference)

3) You should use folios in the new code.

4) You'll soon be able to use rmap batching [1] and avoid looping over
    the pages once more manually.


Based on [1], the code flow likely should look something like this:


first_page = &folio->page;
...
folio_get(folio);
folio_add_anon_rmap_pmd(folio, first_page, RMAP_NONE);
...
folio_remove_rmap_ptes(folio, first_page, HPAGE_PMD_NR, vma);
folio_ref_sub(folio, HPAGE_PMD_NR);
...


[1] https://lkml.kernel.org/r/20231204142146.91437-24-david@redhat.com

-- 
Cheers,

David / dhildenb




* Re: [PATCH v2 1/2] mm/khugepaged: attempt to map anonymous pte-mapped THPs by pmds
  2023-12-07 10:37     ` David Hildenbrand
@ 2023-12-18  2:45       ` Xu Yu
  0 siblings, 0 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-18  2:45 UTC (permalink / raw)
  To: David Hildenbrand, linux-mm

On 12/7/23 6:37 PM, David Hildenbrand wrote:
> On 07.12.23 04:09, Xu Yu wrote:
>> In the anonymous collapse path, khugepaged always collapses pte-mapped
>> hugepage by allocating and copying to a new hugepage.
>>
>> In some scenarios, we can only update the mapping page tables for
>> anonymous pte-mapped THPs, in the same way as file/shmem-backed
>> pte-mapped THPs, as shown in commit 58ac9a8993a1 ("mm/khugepaged:
>> attempt to map file/shmem-backed pte-mapped THPs by pmds")
>>
>> The simplest scenario that satisfies the conditions, as David points out,
>> is when no subpages are PageAnonExclusive (PTEs must be R/O), we can
>> collapse into a R/O PMD without further action.
>>
>> Let's start from this simplest scenario.
>>
>> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
>> ---
>>   mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 214 insertions(+)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 88433cc25d8a..85c7a2ab44ce 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1237,6 +1237,197 @@ static int collapse_huge_page(struct mm_struct *mm, 
>> unsigned long address,
>>       return result;
>>   }
>> +static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
>> +                          unsigned long addr, pmd_t *pmd)
>> +{
>> +    pte_t *pte, pteval;
>> +    struct page *page = NULL;
>> +
>> +    pte = pte_offset_map(pmd, addr);
>> +    if (!pte)
>> +        return NULL;
>> +
>> +    pteval = ptep_get_lockless(pte);
>> +    if (pte_none(pteval) || !pte_present(pteval))
>> +        goto out;
>> +
>> +    page = vm_normal_page(vma, addr, pteval);
>> +    if (unlikely(!page) || unlikely(is_zone_device_page(page)))
>> +        goto out;
>> +
>> +    page = compound_head(page);
>> +
>> +    if (!trylock_page(page)) {
>> +        page = NULL;
>> +        goto out;
>> +    }
>> +
>> +    if (!get_page_unless_zero(page)) {
>> +        unlock_page(page);
>> +        page = NULL;
>> +        goto out;
>> +    }
>> +
>> +out:
>> +    pte_unmap(pte);
>> +    return page;
>> +}
>> +
>> +static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
>> +                struct vm_area_struct *vma,
>> +                unsigned long haddr, bool *mmap_locked,
>> +                struct collapse_control *cc)
>> +{
>> +    struct mmu_notifier_range range;
>> +    struct page *hpage;
>> +    pte_t *start_pte, *pte;
>> +    pmd_t *pmd, pmdval;
>> +    spinlock_t *pml, *ptl;
>> +    pgtable_t pgtable;
>> +    unsigned long addr;
>> +    int exclusive = 0;
>> +    bool writable = false;
>> +    int result, i;
>> +
>> +    /* Fast check before locking page if already PMD-mapped */
>> +    result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
>> +    if (result == SCAN_PMD_MAPPED)
>> +        return result;
>> +
>> +    hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
>> +    if (!hpage)
>> +        return SCAN_PAGE_NULL;
>> +    if (!PageHead(hpage)) {
>> +        result = SCAN_FAIL;
>> +        goto drop_hpage;
>> +    }
>> +    if (compound_order(hpage) != HPAGE_PMD_ORDER) {
>> +        result = SCAN_PAGE_COMPOUND;
>> +        goto drop_hpage;
>> +    }
>> +
>> +    mmap_read_unlock(mm);
>> +    *mmap_locked = false;
>> +
>> +    /* Prevent all access to pagetables */
>> +    mmap_write_lock(mm);
>> +
>> +    result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
>> +    if (result != SCAN_SUCCEED)
>> +        goto up_write;
>> +
>> +    result = check_pmd_still_valid(mm, haddr, pmd);
>> +    if (result != SCAN_SUCCEED)
>> +        goto up_write;
>> +
>> +    /* Recheck with mmap write lock */
>> +    result = SCAN_SUCCEED;
>> +    start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
>> +    if (!start_pte)
>> +        goto drop_hpage;
>> +    for (i = 0, addr = haddr, pte = start_pte;
>> +         i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>> +        struct page *page;
>> +        pte_t pteval = ptep_get(pte);
>> +
>> +        if (pte_none(pteval) || !pte_present(pteval)) {
>> +            result = SCAN_PTE_NON_PRESENT;
>> +            break;
>> +        }
>> +
>> +        if (pte_uffd_wp(pteval)) {
>> +            result = SCAN_PTE_UFFD_WP;
>> +            break;
>> +        }
>> +
>> +        if (pte_write(pteval))
>> +            writable = true;
>> +
>> +        page = vm_normal_page(vma, addr, pteval);
>> +
>> +        if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>> +            result = SCAN_PAGE_NULL;
>> +            break;
>> +        }
>> +
>> +        if (hpage + i != page) {
>> +            result = SCAN_FAIL;
>> +            break;
>> +        }
>> +
>> +        if (PageAnonExclusive(page))
>> +            exclusive++;
>> +    }
>> +    pte_unmap_unlock(start_pte, ptl);
>> +    if (result != SCAN_SUCCEED)
>> +        goto drop_hpage;
>> +
> 
> I'm wondering if anybody can now modify the page table. Likely you also need the 
> VMA write lock. The mmap write lock is no longer sufficient.
> 
>> +    /*
>> +     * Case 1:
>> +     * No subpages are PageAnonExclusive (PTEs must be R/O), we can
>> +     * collapse into a R/O PMD without further action.
>> +     */
>> +    if (!(exclusive == 0 && !writable))
>> +        goto drop_hpage;
>> +
>> +    /* Collapse pmd entry */
>> +    vma_start_write(vma);

Will move vma_start_write() to the front, just after mmap write lock.

>> +    anon_vma_lock_write(vma->anon_vma);
>> +
>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
>> +                haddr, haddr + HPAGE_PMD_SIZE);
>> +    mmu_notifier_invalidate_range_start(&range);
>> +
>> +    pml = pmd_lock(mm, pmd); /* probably unnecessary */
>> +    pmdval = pmdp_collapse_flush(vma, haddr, pmd);
>> +    spin_unlock(pml);
>> +    mmu_notifier_invalidate_range_end(&range);
>> +    tlb_remove_table_sync_one();
>> +
>> +    anon_vma_unlock_write(vma->anon_vma);
>> +
>> +    start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
>> +    if (!start_pte)
>> +        goto rollback;
> 
> How can that happen?

I referred to the abort logic in collapse_pte_mapped_thp(); I'm not sure
whether anyone can modify the page table at the same time, so I added a
redundant check.

If mmap write lock + vma write lock + page lock is sufficient, I should delete
this check.

> 
>> +    for (i = 0, addr = haddr, pte = start_pte;
>> +         i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>> +        struct page *page;
>> +        pte_t pteval = ptep_get(pte);
>> +
>> +        page = vm_normal_page(vma, addr, pteval);
>> +        page_remove_rmap(page, vma, false);
>> +    }
>> +    pte_unmap_unlock(start_pte, ptl);
>> +
>> +    /* Install pmd entry */
>> +    pgtable = pmd_pgtable(pmdval);
>> +    pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
>> +    spin_lock(pml);
>> +    page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
>> +    pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> +    set_pmd_at(mm, haddr, pmd, pmdval);
>> +    update_mmu_cache_pmd(vma, haddr, pmd);
>> +    spin_unlock(pml);
> 
> 1) Looks like you are missing to update the refcount.
> 
> You should take one reference before adding the new rmap, and drop the other 
> references when dropping the rmaps.
> 
> 2) You should drop the old rmaps after obtaining the new rmap (+reference)
> 
> 3) You should use folios in the new code.

Many thanks! Will do in v3.

> 
> 4) You'll soon be able to use rmap batching [1] and avoid looping over
>     the pages once more manually.
> 
> 
> Based on [1], the code flow likely should look something like this:

Thanks again! I notice that [1] is in the patchset ("mm/rmap: interface
overhaul"), which is still under review. Should I wait for that patchset
to be merged into mm-unstable or mm-stable, or can I send out v3 without
this helper first?

If you don't mind, I will send out v3 first.

> 
> 
> first_page = &folio->page;
> ...
> folio_get(folio);
> folio_add_anon_rmap_pmd(folio, first_page, RMAP_NONE);
> ...
> folio_remove_rmap_ptes(folio, first_page, HPAGE_PMD_NR, vma);
> folio_ref_sub(folio, HPAGE_PMD_NR);
> ...
> 
> 
> [1] https://lkml.kernel.org/r/20231204142146.91437-24-david@redhat.com
> 
-- 
Thanks,
Yu




* [PATCH v3 0/2] attempt to map anonymous pte-mapped THPs by pmds
  2023-11-13  9:05 [PATCH 0/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds Xu Yu
  2023-11-13  9:05 ` [PATCH 1/1] " Xu Yu
  2023-12-07  3:09 ` [PATCH v2 0/2] attempt to " Xu Yu
@ 2023-12-18  7:06 ` Xu Yu
  2023-12-18  7:06   ` [PATCH v3 1/2] mm/khugepaged: map RO non-exclusive pte-mapped anon " Xu Yu
                     ` (2 more replies)
  2 siblings, 3 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-18  7:06 UTC (permalink / raw)
  To: linux-mm; +Cc: david

Result of tools/testing/selftests/mm/cow.c tests:
# [RUN] Basic COW after fork() when collapsing before fork()
ok 145 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (fully shared)
ok 146 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (lower shared)
ok 147 No leak from parent into child
# [RUN] Basic COW after fork() when collapsing after fork() (upper shared)
ok 148 No leak from parent into child

A long run (w/ CONFIG_DEBUG_VM enabled) shows no panics or memory leaks.

Changes since v2:
- Use folios in the new code, as suggested by David.
- Handle folio refcount and rmap properly, as suggested by David.
- Minor modifications: 1) take the vma write lock earlier, 2) remove the
  redundant rollback logic, 3) clear the old ptes in the pgtable before
  depositing it.

Changes since v1:
- Deal with PageAnonExclusive properly, as suggested by David.

Xu Yu (2):
  mm/khugepaged: map RO non-exclusive pte-mapped anon THPs by pmds
  mm/khugepaged: map exclusive anonymous pte-mapped THPs by pmds

 mm/khugepaged.c | 229 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 229 insertions(+)

-- 
2.37.1



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v3 1/2] mm/khugepaged: map RO non-exclusive pte-mapped anon THPs by pmds
  2023-12-18  7:06 ` [PATCH v3 0/2] attempt to map " Xu Yu
@ 2023-12-18  7:06   ` Xu Yu
  2023-12-18  7:06   ` [PATCH v3 2/2] mm/khugepaged: map exclusive anonymous pte-mapped " Xu Yu
  2023-12-21 20:40   ` [PATCH v3 0/2] attempt to map " Zach O'Keefe
  2 siblings, 0 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-18  7:06 UTC (permalink / raw)
  To: linux-mm; +Cc: david

In the anonymous collapse path, khugepaged always collapses a pte-mapped
hugepage by allocating a new hugepage and copying into it.

In some scenarios, we only need to update the page tables mapping an
anonymous pte-mapped THP, in the same way as for file/shmem-backed
pte-mapped THPs; see commit 58ac9a8993a1 ("mm/khugepaged: attempt to map
file/shmem-backed pte-mapped THPs by pmds").

The simplest scenario that satisfies the conditions, as David points out,
is when no subpages are PageAnonExclusive (PTEs must be R/O); in that case
we can collapse into a R/O PMD without further action.

Let's start from this simplest scenario.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..57e261387124 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1237,6 +1237,196 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	return result;
 }
 
+static struct folio *find_lock_pte_mapped_folio(struct vm_area_struct *vma,
+						unsigned long addr, pmd_t *pmd)
+{
+	pte_t *pte, pteval;
+	struct folio *folio = NULL;
+
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return NULL;
+
+	pteval = ptep_get_lockless(pte);
+	if (pte_none(pteval) || !pte_present(pteval))
+		goto out;
+
+	folio = vm_normal_folio(vma, addr, pteval);
+	if (unlikely(!folio) || unlikely(folio_is_zone_device(folio)))
+		goto out;
+
+	if (!folio_trylock(folio)) {
+		folio = NULL;
+		goto out;
+	}
+
+	if (!folio_try_get(folio)) {
+		folio_unlock(folio);
+		folio = NULL;
+		goto out;
+	}
+
+out:
+	pte_unmap(pte);
+	return folio;
+}
+
+static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				unsigned long haddr, bool *mmap_locked,
+				struct collapse_control *cc)
+{
+	struct mmu_notifier_range range;
+	struct folio *folio;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, pmdval;
+	spinlock_t *pml, *ptl;
+	pgtable_t pgtable;
+	unsigned long addr;
+	int exclusive = 0;
+	bool writable = false;
+	int result, i;
+
+	/* Fast check before locking folio if already PMD-mapped */
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+	if (result == SCAN_PMD_MAPPED)
+		return result;
+
+	folio = find_lock_pte_mapped_folio(vma, haddr, pmd);
+	if (!folio)
+		return SCAN_PAGE_NULL;
+	if (!folio_test_large(folio)) {
+		result = SCAN_FAIL;
+		goto drop_folio;
+	}
+	if (folio_order(folio) != HPAGE_PMD_ORDER) {
+		result = SCAN_PAGE_COMPOUND;
+		goto drop_folio;
+	}
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+
+	/* Prevent all access to pagetables */
+	mmap_write_lock(mm);
+	vma_start_write(vma);
+
+	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	result = check_pmd_still_valid(mm, haddr, pmd);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/* Recheck with mmap write lock */
+	result = SCAN_SUCCEED;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)
+		goto up_write;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		if (pte_none(pteval) || !pte_present(pteval)) {
+			result = SCAN_PTE_NON_PRESENT;
+			break;
+		}
+
+		if (pte_uffd_wp(pteval)) {
+			result = SCAN_PTE_UFFD_WP;
+			break;
+		}
+
+		if (pte_write(pteval))
+			writable = true;
+
+		subpage = vm_normal_page(vma, addr, pteval);
+
+		if (unlikely(!subpage) ||
+		    unlikely(is_zone_device_page(subpage))) {
+			result = SCAN_PAGE_NULL;
+			break;
+		}
+
+		if (folio_page(folio, i) != subpage) {
+			result = SCAN_FAIL;
+			break;
+		}
+
+		if (PageAnonExclusive(subpage))
+			exclusive++;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/*
+	 * Case 1:
+	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
+	 * collapse into a R/O PMD without further action.
+	 */
+	if (!(exclusive == 0 && !writable))
+		goto up_write;
+
+	/* Collapse pmd entry */
+	anon_vma_lock_write(vma->anon_vma);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	pml = pmd_lock(mm, pmd); /* probably unnecessary */
+	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(pml);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	/*
+	 * Obtain a new pmd rmap before dropping pte rmaps to avoid
+	 * false-negative page_mapped().
+	 */
+	folio_get(folio);
+	page_add_anon_rmap(&folio->page, vma, haddr, RMAP_COMPOUND);
+
+	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		ptep_clear(mm, addr, pte);
+		subpage = vm_normal_page(vma, addr, pteval);
+		page_remove_rmap(subpage, vma, false);
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	folio_ref_sub(folio, HPAGE_PMD_NR);
+
+	/* Install pmd entry */
+	pgtable = pmd_pgtable(pmdval);
+	pmdval = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	spin_lock(pml);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, pmdval);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	result = SCAN_SUCCEED;
+
+up_write:
+	mmap_write_unlock(mm);
+
+drop_folio:
+	folio_unlock(folio);
+	folio_put(folio);
+
+	/* TODO: tracepoints */
+	return result;
+}
+
 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
 				   unsigned long address, bool *mmap_locked,
@@ -1251,6 +1441,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
 	bool writable = false;
+	int exclusive = 0;
+	bool is_hpage = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1333,8 +1525,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
+		if (PageAnonExclusive(page))
+			exclusive++;
+
 		page = compound_head(page);
 
+		if (compound_order(page) == HPAGE_PMD_ORDER)
+			is_hpage = true;
+
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1396,7 +1594,21 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+
+	if (is_hpage && (exclusive == 0 && !writable)) {
+		int res;
+
+		res = collapse_pte_mapped_anon_thp(mm, vma, address,
+						   mmap_locked, cc);
+		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
+			result = res;
+			goto out;
+		}
+	}
+
 	if (result == SCAN_SUCCEED) {
+		if (!*mmap_locked)
+			mmap_read_lock(mm);
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */
-- 
2.37.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v3 2/2] mm/khugepaged: map exclusive anonymous pte-mapped THPs by pmds
  2023-12-18  7:06 ` [PATCH v3 0/2] attempt to map " Xu Yu
  2023-12-18  7:06   ` [PATCH v3 1/2] mm/khugepaged: map RO non-exclusive pte-mapped anon " Xu Yu
@ 2023-12-18  7:06   ` Xu Yu
  2023-12-21 20:40   ` [PATCH v3 0/2] attempt to map " Zach O'Keefe
  2 siblings, 0 replies; 16+ messages in thread
From: Xu Yu @ 2023-12-18  7:06 UTC (permalink / raw)
  To: linux-mm; +Cc: david

This adds another case, as David points out, which is suitable for
mapping anonymous pte-mapped THPs by pmds: when all subpages are
PageAnonExclusive (PTEs may be either R/O or R/W), we can clear
PageAnonExclusive on all subpages but the first (head) page and collapse
to a R/W PMD if VM_WRITE is set, or a R/O PMD otherwise.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 57e261387124..d843c1e3ec39 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1285,6 +1285,7 @@ static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
 	unsigned long addr;
 	int exclusive = 0;
 	bool writable = false;
+	rmap_t rmap_flags = RMAP_COMPOUND;
 	int result, i;
 
 	/* Fast check before locking folio if already PMD-mapped */
@@ -1366,8 +1367,14 @@ static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
 	 * Case 1:
 	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
 	 * collapse into a R/O PMD without further action.
+	 *
+	 * Case 2:
+	 * All subpages are PageAnonExclusive (PTEs may be either R/O or R/W),
+	 * we clear PageAnonExclusive on all subpages but the head page and
+	 * collapse to a R/W PMD with VM_WRITE or a R/O PMD without VM_WRITE.
 	 */
-	if (!(exclusive == 0 && !writable))
+	if (!((exclusive == 0 && !writable) ||
+	      (exclusive == HPAGE_PMD_NR)))
 		goto up_write;
 
 	/* Collapse pmd entry */
@@ -1390,7 +1397,9 @@ static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
 	 * false-negative page_mapped().
 	 */
 	folio_get(folio);
-	page_add_anon_rmap(&folio->page, vma, haddr, RMAP_COMPOUND);
+	if (exclusive == HPAGE_PMD_NR)
+		rmap_flags |= RMAP_EXCLUSIVE;
+	page_add_anon_rmap(&folio->page, vma, haddr, rmap_flags);
 
 	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
 	for (i = 0, addr = haddr, pte = start_pte;
@@ -1401,6 +1410,10 @@ static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
 		ptep_clear(mm, addr, pte);
 		subpage = vm_normal_page(vma, addr, pteval);
 		page_remove_rmap(subpage, vma, false);
+
+		/* Clear PageAnonExclusive on tail pages */
+		if (exclusive == HPAGE_PMD_NR && i != 0)
+			ClearPageAnonExclusive(subpage);
 	}
 	pte_unmap_unlock(start_pte, ptl);
 	folio_ref_sub(folio, HPAGE_PMD_NR);
@@ -1408,6 +1421,8 @@ static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
 	/* Install pmd entry */
 	pgtable = pmd_pgtable(pmdval);
 	pmdval = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	if (exclusive == HPAGE_PMD_NR)
+		pmdval = maybe_pmd_mkwrite(pmd_mkdirty(pmdval), vma);
 	spin_lock(pml);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, pmdval);
@@ -1595,7 +1610,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 
-	if (is_hpage && (exclusive == 0 && !writable)) {
+	if (is_hpage &&
+	    ((exclusive == 0 && !writable) ||
+	     (exclusive == HPAGE_PMD_NR))) {
 		int res;
 
 		res = collapse_pte_mapped_anon_thp(mm, vma, address,
-- 
2.37.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 0/2] attempt to map anonymous pte-mapped THPs by pmds
  2023-12-18  7:06 ` [PATCH v3 0/2] attempt to map " Xu Yu
  2023-12-18  7:06   ` [PATCH v3 1/2] mm/khugepaged: map RO non-exclusive pte-mapped anon " Xu Yu
  2023-12-18  7:06   ` [PATCH v3 2/2] mm/khugepaged: map exclusive anonymous pte-mapped " Xu Yu
@ 2023-12-21 20:40   ` Zach O'Keefe
  2023-12-21 20:54     ` David Hildenbrand
  2 siblings, 1 reply; 16+ messages in thread
From: Zach O'Keefe @ 2023-12-21 20:40 UTC (permalink / raw)
  To: Xu Yu; +Cc: linux-mm, david

Hey Xu,

Thanks for the patches.

As a precursor, can you help me understand what the use case is for these
patches? In-place collapse of anon memory is something I've thought
about before, but the opportunity has never been especially clear.

In particular, your patches take an order-9 compound page, and just
try to see if we can update the mappings to it (like we do with
file/shmem). Functionally this seems fine, but the difference is that
with file/shmem, it's quite easy to have a pte-mapped-hugepage arise
naturally (the formation of the hugepage happening in the pagecache
being logically separate from the pmd-mapping of whatever task is mapping
it).

For anonymous memory, the only time I can see us having a pte-mapped
hugepage (that isn't destined for splitting on deferred split list)
that we want to remap by a pmd is if we cause a VMA split + remerge by
mucking with VMA attributes.

In my mind, what I had been thinking of w.r.t in-place anon collapse
was for the case where we've split a THP with MADV_FREE/MADV_DONTNEED
(i.e. to subrelease pages back to kernel), but later want to reform
the THP. In particular, if, for example, we only subrelease O(10s) of
order-0 pages, it seems wasteful to have to reallocate a fresh
hugepage, then copy over O(100s) of pages, on collapse. If we were
able to attempt to first migrate-away any of those previously
subreleased pages (now possibly backing some other memory entirely),
it could save us from having to allocate a fresh order-9 page. Under
memory pressure / fragmentation, this could mean the difference
between success and failure.
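
For concreteness, a minimal userspace sketch of the subrelease pattern
described above (illustrative only, not taken from the patches; error
handling omitted, and whether the region is actually backed by a THP
depends on the system's THP settings):

#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>

#define SIZE_2M	(2UL << 20)
#define SIZE_4K	(4UL << 10)

int main(void)
{
	/* Over-allocate so a 2M-aligned window can be carved out. */
	char *raw = mmap(NULL, 2 * SIZE_2M, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *addr = (char *)(((unsigned long)raw + SIZE_2M - 1) & ~(SIZE_2M - 1));

	madvise(addr, SIZE_2M, MADV_HUGEPAGE);
	memset(addr, 1, SIZE_2M);	/* fault in, hopefully as a PMD-mapped THP */

	/* Subrelease a single 4K page: the PMD mapping is split (and the
	 * folio is queued for splitting). */
	madvise(addr + SIZE_4K, SIZE_4K, MADV_DONTNEED);

	/*
	 * Reforming a hugepage at addr now requires khugepaged to allocate
	 * a fresh order-9 page and copy the remaining subpages.
	 */
	return 0;
}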

Thanks for your help here,
Zach


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 0/2] attempt to map anonymous pte-mapped THPs by pmds
  2023-12-21 20:40   ` [PATCH v3 0/2] attempt to map " Zach O'Keefe
@ 2023-12-21 20:54     ` David Hildenbrand
  0 siblings, 0 replies; 16+ messages in thread
From: David Hildenbrand @ 2023-12-21 20:54 UTC (permalink / raw)
  To: Zach O'Keefe, Xu Yu; +Cc: linux-mm

On 21.12.23 21:40, Zach O'Keefe wrote:
> Hey Xu,
> 
> Thanks for the patches.
> 
> As a precursor, can you help understand what the use case is for these
> patches? In-place collapse of anon memory is something I've thought
> about before, but the opportunity has never been especially clear.
> 
> In particular, your patches take an order-9 compound page, and just
> try to see if we can update the mappings to it (like we do with
> file/shmem). Functionally this seems fine, but the difference is that
> with file/shmem, it's quite easy to have a pte-mapped-hugepage arise
> naturally (the formation of the hugepage happening in the pagecache
> being logically separate from the pmd-mapping of w/e task is mapping
> it).\
> 
> For anonymous memory, the only time I can see us having a pte-mapped
> hugepage (that isn't destined for splitting on deferred split list)
> that we want to remap by a pmd is if we cause a VMA split + remerge by
> mucking with VMA attributes.

Yes, mostly because of madvise(), mprotect(), mremap(). But also when
putting a THP into the swap cache right now: when refaulting, you get a
PTE-mapped THP.
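
A rough way to exercise that swap-cache route, assuming addr points at a
2M-aligned, THP-backed anonymous region like the sketch earlier in the
thread, swap is configured, and the libc exposes MADV_PAGEOUT (Linux 5.4+);
whether the refault actually finds the large folio still in the swap cache
depends entirely on timing, so treat this as a sketch rather than a
reliable reproducer:

#include <stddef.h>
#include <sys/mman.h>

/* Hypothetical helper, not part of the patches or selftests. */
static void pageout_and_refault(char *addr, size_t len)
{
	/* Ask reclaim to swap the range out; needs swap space configured. */
	madvise(addr, len, MADV_PAGEOUT);

	/*
	 * Touch the range again. If the 2M folio is still in the swap cache
	 * at this point, the refault maps it back with PTEs, i.e. a
	 * PTE-mapped THP that khugepaged could later try to re-collapse.
	 */
	(void)*(volatile char *)addr;
}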

There are some other odd cases, and there might be more in the future
(below).

> 
> In my mind, what I had been thinking of w.r.t in-place anon collapse
> was for the case where we've split a THP with MADV_FREE/MADV_DONTNEED
> (i.e. to subrelease pages back to kernel), but later want to reform
> the THP. In particular, if, for example, we only subrelease O(10s) of

Right, and in-place collapse even works if the folio has been pinned, 
which is nice.

> order-0 pages, it seems wasteful to have to reallocate a fresh
> hugepage, then copy over O(100s) of pages, on collapse. If we were
> able to attempt to first migrate-away any of those previously
> subreleased pages (now possibly backing some other memory entirely),
> it could save us from having to allocate a fresh order-9 page. Under
> memory pressure / fragmentation, this could mean the difference
> between success and failure.
> 

One thing that popped up a couple of times already is that we might want 
to PTE-map a PMD-sized THP for a couple of reasons (IIRC, FreeBSD does 
some of that). For example:

* Lazily zero the pages of the folio on demand, keeping all non-zeroed
   parts protnone. At a certain time (e.g., all zeroed), simply remap
   using a PMD.
* Detecting sub-page access by temporarily mapping the THP using PTEs.

Maybe also some uffd optimizations, whereby protnone parts are not 
faulted in yet.

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread

Thread overview: 16+ messages
2023-11-13  9:05 [PATCH 0/1] mm/khugepaged: map anonymous pte-mapped THPs by pmds Xu Yu
2023-11-13  9:05 ` [PATCH 1/1] " Xu Yu
2023-11-13  9:26   ` David Hildenbrand
2023-11-13  9:33     ` Xu Yu
2023-11-13 10:10       ` David Hildenbrand
2023-12-07  3:09 ` [PATCH v2 0/2] attempt to " Xu Yu
2023-12-07  3:09   ` [PATCH v2 1/2] mm/khugepaged: " Xu Yu
2023-12-07  7:47     ` Xu Yu
2023-12-07 10:37     ` David Hildenbrand
2023-12-18  2:45       ` Xu Yu
2023-12-07  3:09   ` [PATCH v2 2/2] mm/khugepaged: add case for mapping " Xu Yu
2023-12-18  7:06 ` [PATCH v3 0/2] attempt to map " Xu Yu
2023-12-18  7:06   ` [PATCH v3 1/2] mm/khugepaged: map RO non-exclusive pte-mapped anon " Xu Yu
2023-12-18  7:06   ` [PATCH v3 2/2] mm/khugepaged: map exclusive anonymous pte-mapped " Xu Yu
2023-12-21 20:40   ` [PATCH v3 0/2] attempt to map " Zach O'Keefe
2023-12-21 20:54     ` David Hildenbrand
