Linux-mm Archive on lore.kernel.org
From: Zi Yan <zi.yan@sent.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Dave Hansen <dave.hansen@linux.intel.com>,
	Michal Hocko <mhocko@kernel.org>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	Mel Gorman <mgorman@techsingularity.net>,
	John Hubbard <jhubbard@nvidia.com>,
	Mark Hairgrove <mhairgrove@nvidia.com>,
	Nitin Gupta <nigupta@nvidia.com>,
	David Nellans <dnellans@nvidia.com>, Zi Yan <ziy@nvidia.com>
Subject: [RFC PATCH 30/31] mm: mem_defrag: thp: PMD THP and PUD THP in-place promotion support.
Date: Fri, 15 Feb 2019 14:08:55 -0800
Message-ID: <20190215220856.29749-31-zi.yan@sent.com> (raw)
In-Reply-To: <20190215220856.29749-1-zi.yan@sent.com>

From: Zi Yan <ziy@nvidia.com>

PMD THPs will get PMD page table entry promotion as well.
PUD THPs only get PUD page table entry promotion when the corresponding
toggle is on, which is off by default, since 1GB THPs do not perform
well due to the shortage of 1GB TLB entries.
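
For reference, the promotion gate added below boils down to alignment
mask checks on both ends of a defragmented range, followed by a
two-step promotion (first assemble the compound page in place, then
collapse the page table mapping). A minimal stand-alone sketch of the
mask arithmetic only; the size constants assume x86-64 and the helper
name is hypothetical, not part of this patch:

  #include <stdbool.h>

  #define HPAGE_PMD_SIZE (2UL << 20)  /* 2MB PMD huge page, assumed x86-64 */
  #define HPAGE_PUD_SIZE (1UL << 30)  /* 1GB PUD huge page, assumed x86-64 */

  /*
   * Mirrors the !(addr & (HPAGE_P?D_SIZE-1)) checks in mem_defrag.c:
   * a range can only be promoted in place when both of its endpoints
   * sit on a huge-page-size boundary.
   */
  static bool range_promotable(unsigned long begin, unsigned long end,
                               unsigned long hpage_size)
  {
          return !(begin & (hpage_size - 1)) &&
                 !(end & (hpage_size - 1));
  }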

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/mem_defrag.c | 79 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/mm/mem_defrag.c b/mm/mem_defrag.c
index 4d458b125c95..d7a579924d12 100644
--- a/mm/mem_defrag.c
+++ b/mm/mem_defrag.c
@@ -56,6 +56,7 @@ struct defrag_result_stats {
 	unsigned long dst_non_lru_failed;
 	unsigned long dst_non_moveable_failed;
 	unsigned long not_defrag_vpn;
+	unsigned int aligned_max_order;
 };
 
 enum {
@@ -689,6 +690,10 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		page_size = get_contig_page_size(scan_page);
 
+		if (compound_order(compound_head(scan_page)) == HPAGE_PUD_ORDER) {
+			defrag_stats->aligned_max_order = HPAGE_PUD_ORDER;
+			goto quit_defrag;
+		}
 		/* PTE-mapped THP not allowed  */
 		if ((scan_page == compound_head(scan_page)) &&
 			PageTransHuge(scan_page) && !PageHuge(scan_page))
@@ -714,6 +719,8 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* already in the contiguous pos  */
 		if (page_dist == (long long)(scan_page - anchor_page)) {
 			defrag_stats->aligned += (page_size/PAGE_SIZE);
+			defrag_stats->aligned_max_order = max(defrag_stats->aligned_max_order,
+				compound_order(scan_page));
 			continue;
 		} else { /* migrate pages according to the anchor pages in the vma.  */
 			struct page *dest_page = anchor_page + page_dist;
@@ -901,6 +908,10 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 			} else { /* exchange  */
 				int err = -EBUSY;
 
+				if (compound_order(compound_head(dest_page)) == HPAGE_PUD_ORDER) {
+					defrag_stats->aligned_max_order = HPAGE_PUD_ORDER;
+					goto quit_defrag;
+				}
 				/* PTE-mapped THP not allowed  */
 				if ((dest_page == compound_head(dest_page)) &&
 					PageTransHuge(dest_page) && !PageHuge(dest_page))
@@ -1486,10 +1497,13 @@ static int kmem_defragd_scan_mm(struct defrag_scan_control *sc)
 				up_read(&vma->vm_mm->mmap_sem);
 			} else if (sc->action == MEM_DEFRAG_DO_DEFRAG) {
 				/* go to nearest 1GB aligned address  */
+				unsigned long defrag_begin = *scan_address;
 				unsigned long defrag_end = min_t(unsigned long,
 							(*scan_address + HPAGE_PUD_SIZE) & HPAGE_PUD_MASK,
 							vend);
 				int defrag_result;
+				int nr_fails_in_1gb_range = 0;
+				int skip_promotion = 0;
 
 				anchor_node = get_anchor_page_node_from_vma(vma, *scan_address);
 
@@ -1583,14 +1597,47 @@ static int kmem_defragd_scan_mm(struct defrag_scan_control *sc)
 					 * skip the page which cannot be defragged and restart
 					 * from the next page
 					 */
-					if (defrag_stats.not_defrag_vpn &&
-						defrag_stats.not_defrag_vpn < defrag_sub_chunk_end) {
+					if (defrag_stats.not_defrag_vpn) {
 						VM_BUG_ON(defrag_sub_chunk_end != defrag_end &&
 							defrag_stats.not_defrag_vpn > defrag_sub_chunk_end);
-
-						*scan_address = defrag_stats.not_defrag_vpn;
-						defrag_stats.not_defrag_vpn = 0;
-						goto continue_defrag;
+						find_anchor_pages_in_vma(mm, vma, defrag_stats.not_defrag_vpn);
+
+						nr_fails_in_1gb_range += 1;
+						if (defrag_stats.not_defrag_vpn < defrag_sub_chunk_end) {
+							/* reset and continue  */
+							*scan_address = defrag_stats.not_defrag_vpn;
+							defrag_stats.not_defrag_vpn = 0;
+							goto continue_defrag;
+						}
+					} else {
+						/* defrag works for the whole chunk,
+						 * promote to THP in place
+						 */
+						if (!defrag_result &&
+							/* skip existing THPs */
+							defrag_stats.aligned_max_order < HPAGE_PMD_ORDER &&
+							!(*scan_address & (HPAGE_PMD_SIZE-1)) &&
+							!(defrag_sub_chunk_end & (HPAGE_PMD_SIZE-1))) {
+							int ret = 0;
+							/* find a range to promote pmd */
+							down_write(&mm->mmap_sem);
+							ret = promote_huge_page_address(vma, *scan_address);
+							if (!ret) {
+								/*
+								 * promote to 2MB THP successful, but it is
+								 * still PTE pointed
+								 */
+								/* promote PTE-mapped THP to PMD-mapped */
+								promote_huge_pmd_address(vma, *scan_address);
+							}
+							up_write(&mm->mmap_sem);
+						}
+						/* skip PUD pages */
+						if (defrag_stats.aligned_max_order == HPAGE_PUD_ORDER) {
+							*scan_address = defrag_end;
+							skip_promotion = 1;
+							continue;
+						}
 					}
 
 					/* Done with current 2MB chunk */
@@ -1606,6 +1653,26 @@ static int kmem_defragd_scan_mm(struct defrag_scan_control *sc)
 					}
 				}
 
+				/* defrag works for the whole chunk, promote to PUD THP in place */
+				if (!nr_fails_in_1gb_range &&
+					!skip_promotion && /* avoid existing THP */
+					!(defrag_begin & (HPAGE_PUD_SIZE-1)) &&
+					!(defrag_end & (HPAGE_PUD_SIZE-1))) {
+					int ret = 0;
+					/* find a range to promote pud */
+					down_write(&mm->mmap_sem);
+					ret = promote_huge_pud_page_address(vma, defrag_begin);
+					if (!ret) {
+						/*
+						 * promote to 1GB THP successful, but it is
+						 * still PMD pointed
+						 */
+						/* promote PMD-mapped THP to PUD-mapped */
+						if (mem_defrag_promote_1gb_thp)
+							promote_huge_pud_address(vma, defrag_begin);
+					}
+					up_write(&mm->mmap_sem);
+				}
 			}
 		}
 done_one_vma:
-- 
2.20.1


Thread overview: 49+ messages
2019-02-15 22:08 [RFC PATCH 00/31] Generating physically contiguous memory after page allocation Zi Yan
2019-02-15 22:08 ` [RFC PATCH 01/31] mm: migrate: Add exchange_pages to exchange two lists of pages Zi Yan
2019-02-17 11:29   ` Matthew Wilcox
2019-02-18 17:31     ` Zi Yan
2019-02-18 17:42       ` Vlastimil Babka
2019-02-18 17:51         ` Zi Yan
2019-02-18 17:52           ` Matthew Wilcox
2019-02-18 17:59             ` Zi Yan
2019-02-19  7:42               ` Anshuman Khandual
2019-02-19 12:56                 ` Matthew Wilcox
2019-02-20  4:38                   ` Anshuman Khandual
2019-03-14  2:39                     ` Zi Yan
2019-02-21 21:10   ` Jerome Glisse
2019-02-21 21:25     ` Zi Yan
2019-02-15 22:08 ` [RFC PATCH 02/31] mm: migrate: Add THP exchange support Zi Yan
2019-02-15 22:08 ` [RFC PATCH 03/31] mm: migrate: Add tmpfs " Zi Yan
2019-02-15 22:08 ` [RFC PATCH 04/31] mm: add mem_defrag functionality Zi Yan
2019-02-15 22:08 ` [RFC PATCH 05/31] mem_defrag: split a THP if either src or dst is THP only Zi Yan
2019-02-15 22:08 ` [RFC PATCH 06/31] mm: Make MAX_ORDER configurable in Kconfig for buddy allocator Zi Yan
2019-02-15 22:08 ` [RFC PATCH 07/31] mm: deallocate pages with order > MAX_ORDER Zi Yan
2019-02-15 22:08 ` [RFC PATCH 08/31] mm: add pagechain container for storing multiple pages Zi Yan
2019-02-15 22:08 ` [RFC PATCH 09/31] mm: thp: 1GB anonymous page implementation Zi Yan
2019-02-15 22:08 ` [RFC PATCH 10/31] mm: proc: add 1GB THP kpageflag Zi Yan
2019-02-15 22:08 ` [RFC PATCH 11/31] mm: debug: print compound page order in dump_page() Zi Yan
2019-02-15 22:08 ` [RFC PATCH 12/31] mm: stats: Separate PMD THP and PUD THP stats Zi Yan
2019-02-15 22:08 ` [RFC PATCH 13/31] mm: thp: 1GB THP copy on write implementation Zi Yan
2019-02-15 22:08 ` [RFC PATCH 14/31] mm: thp: handling 1GB THP reference bit Zi Yan
2019-02-15 22:08 ` [RFC PATCH 15/31] mm: thp: add 1GB THP split_huge_pud_page() function Zi Yan
2019-02-15 22:08 ` [RFC PATCH 16/31] mm: thp: check compound_mapcount of PMD-mapped PUD THPs at free time Zi Yan
2019-02-15 22:08 ` [RFC PATCH 17/31] mm: thp: split properly PMD-mapped PUD THP to PTE-mapped PUD THP Zi Yan
2019-02-15 22:08 ` [RFC PATCH 18/31] mm: page_vma_walk: teach it about PMD-mapped " Zi Yan
2019-02-15 22:08 ` [RFC PATCH 19/31] mm: thp: 1GB THP support in try_to_unmap() Zi Yan
2019-02-15 22:08 ` [RFC PATCH 20/31] mm: thp: split 1GB THPs at page reclaim Zi Yan
2019-02-15 22:08 ` [RFC PATCH 21/31] mm: thp: 1GB zero page shrinker Zi Yan
2019-02-15 22:08 ` [RFC PATCH 22/31] mm: thp: 1GB THP follow_p*d_page() support Zi Yan
2019-02-15 22:08 ` [RFC PATCH 23/31] mm: support 1GB THP pagemap support Zi Yan
2019-02-15 22:08 ` [RFC PATCH 24/31] sysctl: add an option to only print the head page virtual address Zi Yan
2019-02-15 22:08 ` [RFC PATCH 25/31] mm: thp: add a knob to enable/disable 1GB THPs Zi Yan
2019-02-15 22:08 ` [RFC PATCH 26/31] mm: thp: promote PTE-mapped THP to PMD-mapped THP Zi Yan
2019-02-15 22:08 ` [RFC PATCH 27/31] mm: thp: promote PMD-mapped PUD pages to PUD-mapped PUD pages Zi Yan
2019-02-15 22:08 ` [RFC PATCH 28/31] mm: vmstats: add page promotion stats Zi Yan
2019-02-15 22:08 ` [RFC PATCH 29/31] mm: madvise: add madvise options to split PMD and PUD THPs Zi Yan
2019-02-15 22:08 ` Zi Yan [this message]
2019-02-15 22:08 ` [RFC PATCH 31/31] sysctl: toggle to promote PUD-mapped 1GB THP or not Zi Yan
2019-02-20  1:42 ` [RFC PATCH 00/31] Generating physically contiguous memory after page allocation Mike Kravetz
2019-02-20  2:33   ` Zi Yan
2019-02-20  3:18     ` Mike Kravetz
2019-02-20  5:19       ` Zi Yan
2019-02-20  5:27         ` Mike Kravetz
