From: James Houghton <jthoughton@google.com>
To: Mike Kravetz <mike.kravetz@oracle.com>,
	Muchun Song <songmuchun@bytedance.com>,
	Peter Xu <peterx@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>,
	David Rientjes <rientjes@google.com>,
	Axel Rasmussen <axelrasmussen@google.com>,
	Mina Almasry <almasrymina@google.com>,
	"Zach O'Keefe" <zokeefe@google.com>,
	Manish Mishra <manish.mishra@nutanix.com>,
	Naoya Horiguchi <naoya.horiguchi@nec.com>,
	"Dr . David Alan Gilbert" <dgilbert@redhat.com>,
	"Matthew Wilcox (Oracle)" <willy@infradead.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	Baolin Wang <baolin.wang@linux.alibaba.com>,
	Miaohe Lin <linmiaohe@huawei.com>, Yang Shi <shy828301@gmail.com>,
	Frank van der Linden <fvdl@google.com>,
	Jiaqi Yan <jiaqiyan@google.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	James Houghton <jthoughton@google.com>
Subject: [PATCH v2 22/46] hugetlb: add HGM support to copy_hugetlb_page_range
Date: Sat, 18 Feb 2023 00:27:55 +0000	[thread overview]
Message-ID: <20230218002819.1486479-23-jthoughton@google.com> (raw)
In-Reply-To: <20230218002819.1486479-1-jthoughton@google.com>

This allows fork() to work with high-granularity mappings. The page
table structure is copied so that regions that are partially mapped in
the parent remain partially mapped, at the same granularity, in the
child process.
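
To make "at the same granularity" concrete, here is a small
userspace-only sketch (the struct and function names are illustrative,
not the kernel's): the child ends up with page-table entries of exactly
the same sizes, at the same offsets, as the parent.

#include <stdio.h>

/* One page-table entry mapping part of a hugepage: its offset within
 * the hugepage and the size of the mapping (e.g. 2M for a PMD). */
struct hgm_entry {
	unsigned long offset;
	unsigned long size;
};

/* fork()-time copy: the child mirrors the parent's entries one-to-one,
 * so a partially mapped region stays partially mapped at the same
 * granularity. */
static void copy_entries(const struct hgm_entry *parent,
			 struct hgm_entry *child, int n)
{
	for (int i = 0; i < n; i++)
		child[i] = parent[i];
}

int main(void)
{
	/* Parent maps only the first 4M of a 1G page, using two 2M PMDs. */
	struct hgm_entry parent[] = {
		{ 0UL,        2UL << 20 },
		{ 2UL << 20,  2UL << 20 },
	};
	struct hgm_entry child[2];

	copy_entries(parent, child, 2);
	for (int i = 0; i < 2; i++)
		printf("child entry: offset=%#lx size=%luM\n",
		       child[i].offset, child[i].size >> 20);
	return 0;
}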

A page's reference count is incremented for *each* portion of it that
is mapped in the page table. For example, if you have a PMD-mapped 1G
page, the reference count will be incremented by 512.
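
As a sanity check on that arithmetic, a standalone sketch (not kernel
code): one reference is taken per page-table entry that maps a piece of
the page, so fully mapping a 1G page at the 2M (PMD) level takes
1G / 2M = 512 references.

#include <stdio.h>

int main(void)
{
	unsigned long hugepage_size = 1UL << 30;	/* 1G hugepage */
	unsigned long mapping_size  = 1UL << 21;	/* mapped with 2M PMDs */

	/* One reference per page-table entry that maps part of the page. */
	unsigned long refs = hugepage_size / mapping_size;

	printf("references taken: %lu\n", refs);	/* prints 512 */
	return 0;
}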

mapcount is handled similarly to THPs: if a hugepage is mapped in its
entirety, its compound_mapcount is incremented. If only part of it is
mapped, the subpages that are getting mapped have their mapcounts
incremented.
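
That policy is implemented by hugetlb_add_file_rmap() in the diff
below; the userspace-only model here (illustrative, not the kernel
code) just counts how many mapcount updates each case produces.

#include <stdio.h>

#define PAGE_SHIFT 12UL

/* Number of mapcount increments caused by mapping a hugepage whose
 * hstate-level shift is hstate_shift with an entry of size 1 << shift:
 * a full hstate-level mapping bumps compound_mapcount once; a partial
 * mapping bumps the mapcount of each covered subpage. */
static unsigned long mapcount_increments(unsigned long shift,
					 unsigned long hstate_shift)
{
	if (shift == hstate_shift)
		return 1;			/* compound_mapcount */
	return 1UL << (shift - PAGE_SHIFT);	/* one per subpage */
}

int main(void)
{
	/* 1G hstate (shift 30) mapped with a single 1G entry. */
	printf("%lu\n", mapcount_increments(30, 30));	/* 1 */
	/* 1G hstate mapped with one 2M (shift 21) entry. */
	printf("%lu\n", mapcount_increments(21, 30));	/* 512 */
	return 0;
}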

Signed-off-by: James Houghton <jthoughton@google.com>

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1a1a71868dfd..2fe1eb6897d4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -162,6 +162,8 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);
 
 void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
 			 struct hstate *h, struct vm_area_struct *vma);
+void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
+			   struct hstate *h, struct vm_area_struct *vma);
 
 void hugetlb_dup_vma_private(struct vm_area_struct *vma);
 void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 693332b7e186..210c6f2b16a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -141,6 +141,37 @@ void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
 			page_remove_rmap(subpage, vma, false);
 	}
 }
+/*
+ * hugetlb_add_file_rmap() - increment the mapcounts for file-backed hugetlb
+ * pages appropriately.
+ *
+ * For pages that are being mapped with their hstate-level PTE (e.g., a 1G page
+ * being mapped with a 1G PUD), then we increment the compound_mapcount for the
+ * head page.
+ *
+ * For pages that are being mapped with high-granularity, we increment the
+ * mapcounts for the individual subpages that are getting mapped.
+ */
+void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
+			   struct hstate *h, struct vm_area_struct *vma)
+{
+	struct page *hpage = compound_head(subpage);
+
+	if (shift == huge_page_shift(h)) {
+		VM_BUG_ON_PAGE(subpage != hpage, subpage);
+		page_add_file_rmap(hpage, vma, true);
+	} else {
+		unsigned long nr_subpages = 1UL << (shift - PAGE_SHIFT);
+		struct page *final_page = &subpage[nr_subpages];
+
+		VM_BUG_ON_PAGE(HPageVmemmapOptimized(hpage), hpage);
+		/*
+		 * Increment the mapcount on each page that is getting mapped.
+		 */
+		for (; subpage < final_page; ++subpage)
+			page_add_file_rmap(subpage, vma, false);
+	}
+}
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -5210,7 +5241,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *src_vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
+	struct hugetlb_pte src_hpte, dst_hpte;
+	struct page *ptepage, *hpage;
 	unsigned long addr;
 	bool cow = is_cow_mapping(src_vma->vm_flags);
 	struct hstate *h = hstate_vma(src_vma);
@@ -5238,18 +5270,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	}
 
 	last_addr_mask = hugetlb_mask_last_page(h);
-	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
+	addr = src_vma->vm_start;
+	while (addr < src_vma->vm_end) {
 		spinlock_t *src_ptl, *dst_ptl;
-		src_pte = hugetlb_walk(src_vma, addr, sz);
-		if (!src_pte) {
-			addr |= last_addr_mask;
+		unsigned long hpte_sz;
+
+		if (hugetlb_full_walk(&src_hpte, src_vma, addr)) {
+			addr = (addr | last_addr_mask) + sz;
 			continue;
 		}
-		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
-		if (!dst_pte) {
-			ret = -ENOMEM;
+		ret = hugetlb_full_walk_alloc(&dst_hpte, dst_vma, addr,
+				hugetlb_pte_size(&src_hpte));
+		if (ret)
 			break;
-		}
+
+		src_pte = src_hpte.ptep;
+		dst_pte = dst_hpte.ptep;
+
+		hpte_sz = hugetlb_pte_size(&src_hpte);
 
 		/*
 		 * If the pagetables are shared don't copy or take references.
@@ -5259,13 +5297,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		 * another vma. So page_count of ptep page is checked instead
 		 * to reliably determine whether pte is shared.
 		 */
-		if (page_count(virt_to_page(dst_pte)) > 1) {
-			addr |= last_addr_mask;
+		if (hugetlb_pte_size(&dst_hpte) == sz &&
+		    page_count(virt_to_page(dst_pte)) > 1) {
+			addr = (addr | last_addr_mask) + sz;
 			continue;
 		}
 
-		dst_ptl = huge_pte_lock(h, dst, dst_pte);
-		src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
+		dst_ptl = hugetlb_pte_lock(&dst_hpte);
+		src_ptl = hugetlb_pte_lockptr(&src_hpte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		entry = huge_ptep_get(src_pte);
 again:
@@ -5309,10 +5348,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			 */
 			if (userfaultfd_wp(dst_vma))
 				set_huge_pte_at(dst, addr, dst_pte, entry);
+		} else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) {
+			/* Retry the walk. */
+			spin_unlock(src_ptl);
+			spin_unlock(dst_ptl);
+			continue;
 		} else {
-			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
-			get_page(ptepage);
+			hpage = compound_head(ptepage);
+			get_page(hpage);
 
 			/*
 			 * Failing to duplicate the anon rmap is a rare case
@@ -5324,13 +5368,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			 * need to be without the pgtable locks since we could
 			 * sleep during the process.
 			 */
-			if (!PageAnon(ptepage)) {
-				page_add_file_rmap(ptepage, src_vma, true);
-			} else if (page_try_dup_anon_rmap(ptepage, true,
+			if (!PageAnon(hpage)) {
+				hugetlb_add_file_rmap(ptepage,
+						src_hpte.shift, h, src_vma);
+			}
+			/*
+			 * It is currently impossible to get anonymous HugeTLB
+			 * high-granularity mappings, so we use 'hpage' here.
+			 *
+			 * This will need to be changed when HGM support for
+			 * anon mappings is added.
+			 */
+			else if (page_try_dup_anon_rmap(hpage, true,
 							  src_vma)) {
 				pte_t src_pte_old = entry;
 				struct folio *new_folio;
 
+				/*
+				 * If we are mapped at high granularity, we
+				 * may end up allocating lots and lots of
+				 * hugepages when we only need one. Bail out
+				 * now.
+				 */
+				if (hugetlb_pte_size(&src_hpte) != sz) {
+					put_page(hpage);
+					ret = -EINVAL;
+					break;
+				}
+
 				spin_unlock(src_ptl);
 				spin_unlock(dst_ptl);
 				/* Do not use reserve as it's private owned */
@@ -5342,7 +5407,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				}
 				copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
 						    npages);
-				put_page(ptepage);
+				put_page(hpage);
 
 				/* Install the new hugetlb folio if src pte stable */
 				dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -5360,6 +5425,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
 				spin_unlock(src_ptl);
 				spin_unlock(dst_ptl);
+				addr += hugetlb_pte_size(&src_hpte);
 				continue;
 			}
 
@@ -5376,10 +5442,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			}
 
 			set_huge_pte_at(dst, addr, dst_pte, entry);
-			hugetlb_count_add(npages, dst);
+			hugetlb_count_add(
+					hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
+					dst);
 		}
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
+		addr += hugetlb_pte_size(&src_hpte);
 	}
 
 	if (cow) {
-- 
2.39.2.637.g21b0678d19-goog

