From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: Yang Shi <yang.shi@linux.alibaba.com>
Cc: akpm@linux-foundation.org, Andrea Arcangeli <aarcange@redhat.com>,
	Zi Yan <ziy@nvidia.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: Re: [PATCHv2 6/8] khugepaged: Allow to collapse PTE-mapped compound pages
Date: Thu, 9 Apr 2020 16:47:31 +0300
Message-ID: <20200409134731.hm5a7ye7tljdguge@box>
In-Reply-To: <90b803eb-e3ba-bcf7-7987-6f0e33f76d4c@linux.alibaba.com>

On Wed, Apr 08, 2020 at 11:57:27AM -0700, Yang Shi wrote:
> 
> 
> On 4/8/20 6:29 AM, Kirill A. Shutemov wrote:
> > On Mon, Apr 06, 2020 at 02:29:35PM -0700, Yang Shi wrote:
> > > 
> > > On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
> > > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > > handling them more than once: lock/unlock the page only once if it is
> > > > present in the PMD range multiple times, since it is handled at the
> > > > compound level. The same goes for LRU isolation and putback.
> > > > 
> > > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > > ---
> > > >    mm/khugepaged.c | 103 ++++++++++++++++++++++++++++++++----------------
> > > >    1 file changed, 68 insertions(+), 35 deletions(-)
> > > > 
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index 1e7e6543ebca..49e56e4e30d1 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -515,23 +515,37 @@ void __khugepaged_exit(struct mm_struct *mm)
> > > >    static void release_pte_page(struct page *page)
> > > >    {
> > > > -	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > > > +	mod_node_page_state(page_pgdat(page),
> > > > +			NR_ISOLATED_ANON + page_is_file_cache(page),
> > > > +			-compound_nr(page));
> > > >    	unlock_page(page);
> > > >    	putback_lru_page(page);
> > > >    }
> > > > -static void release_pte_pages(pte_t *pte, pte_t *_pte)
> > > > +static void release_pte_pages(pte_t *pte, pte_t *_pte,
> > > > +		struct list_head *compound_pagelist)
> > > >    {
> > > > +	struct page *page, *tmp;
> > > > +
> > > >    	while (--_pte >= pte) {
> > > >    		pte_t pteval = *_pte;
> > > > -		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
> > > > -			release_pte_page(pte_page(pteval));
> > > > +
> > > > +		page = pte_page(pteval);
> > > > +		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
> > > > +				!PageCompound(page))
> > > > +			release_pte_page(page);
> > > > +	}
> > > > +
> > > > +	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
> > > > +		list_del(&page->lru);
> > > > +		release_pte_page(page);
> > > >    	}
> > > >    }
> > > >    static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > >    					unsigned long address,
> > > > -					pte_t *pte)
> > > > +					pte_t *pte,
> > > > +					struct list_head *compound_pagelist)
> > > >    {
> > > >    	struct page *page = NULL;
> > > >    	pte_t *_pte;
> > > > @@ -561,13 +575,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > >    			goto out;
> > > >    		}
> > > > -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> > > > +		VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > > +
> > > >    		if (PageCompound(page)) {
> > > > -			result = SCAN_PAGE_COMPOUND;
> > > > -			goto out;
> > > > -		}
> > > > +			struct page *p;
> > > > +			page = compound_head(page);
> > > > -		VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > > +			/*
> > > > +			 * Check if we have dealt with the compound page
> > > > +			 * already
> > > > +			 */
> > > > +			list_for_each_entry(p, compound_pagelist, lru) {
> > > > +				if (page == p)
> > > > +					goto next;
> > > > +			}
> > > > +		}
> > > >    		/*
> > > >    		 * We can do it before isolate_lru_page because the
> > > > @@ -597,19 +619,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > >    			result = SCAN_PAGE_COUNT;
> > > >    			goto out;
> > > >    		}
> > > > -		if (pte_write(pteval)) {
> > > > -			writable = true;
> > > > -		} else {
> > > > -			if (PageSwapCache(page) &&
> > > > -			    !reuse_swap_page(page, NULL)) {
> > > > -				unlock_page(page);
> > > > -				result = SCAN_SWAP_CACHE_PAGE;
> > > > -				goto out;
> > > > -			}
> > > > +		if (!pte_write(pteval) && PageSwapCache(page) &&
> > > > +				!reuse_swap_page(page, NULL)) {
> > > >    			/*
> > > > -			 * Page is not in the swap cache. It can be collapsed
> > > > -			 * into a THP.
> > > > +			 * Page is in the swap cache and cannot be re-used.
> > > > +			 * It cannot be collapsed into a THP.
> > > >    			 */
> > > > +			unlock_page(page);
> > > > +			result = SCAN_SWAP_CACHE_PAGE;
> > > > +			goto out;
> > > >    		}
> > > >    		/*
> > > > @@ -621,16 +639,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > >    			result = SCAN_DEL_PAGE_LRU;
> > > >    			goto out;
> > > >    		}
> > > > -		inc_node_page_state(page,
> > > > -				NR_ISOLATED_ANON + page_is_file_cache(page));
> > > > +		mod_node_page_state(page_pgdat(page),
> > > > +				NR_ISOLATED_ANON + page_is_file_cache(page),
> > > > +				compound_nr(page));
> > > >    		VM_BUG_ON_PAGE(!PageLocked(page), page);
> > > >    		VM_BUG_ON_PAGE(PageLRU(page), page);
> > > > +		if (PageCompound(page))
> > > > +			list_add_tail(&page->lru, compound_pagelist);
> > > > +next:
> > > >    		/* There should be enough young pte to collapse the page */
> > > >    		if (pte_young(pteval) ||
> > > >    		    page_is_young(page) || PageReferenced(page) ||
> > > >    		    mmu_notifier_test_young(vma->vm_mm, address))
> > > >    			referenced++;
> > > > +
> > > > +		if (pte_write(pteval))
> > > > +			writable = true;
> > > >    	}
> > > >    	if (likely(writable)) {
> > > >    		if (likely(referenced)) {
> > > > @@ -644,7 +669,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > >    	}
> > > >    out:
> > > > -	release_pte_pages(pte, _pte);
> > > > +	release_pte_pages(pte, _pte, compound_pagelist);
> > > >    	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
> > > >    					    referenced, writable, result);
> > > >    	return 0;
> > > > @@ -653,13 +678,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > >    static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> > > >    				      struct vm_area_struct *vma,
> > > >    				      unsigned long address,
> > > > -				      spinlock_t *ptl)
> > > > +				      spinlock_t *ptl,
> > > > +				      struct list_head *compound_pagelist)
> > > >    {
> > > > +	struct page *src_page, *tmp;
> > > >    	pte_t *_pte;
> > > >    	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> > > >    				_pte++, page++, address += PAGE_SIZE) {
> > > >    		pte_t pteval = *_pte;
> > > > -		struct page *src_page;
> > > >    		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> > > >    			clear_user_highpage(page, address);
> > > > @@ -679,7 +705,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> > > >    		} else {
> > > >    			src_page = pte_page(pteval);
> > > >    			copy_user_highpage(page, src_page, address, vma);
> > > > -			release_pte_page(src_page);
> > > >    			/*
> > > >    			 * ptl mostly unnecessary, but preempt has to
> > > >    			 * be disabled to update the per-cpu stats
> > > > @@ -693,9 +718,18 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> > > >    			pte_clear(vma->vm_mm, address, _pte);
> > > >    			page_remove_rmap(src_page, false);
> > > >    			spin_unlock(ptl);
> > > > -			free_page_and_swap_cache(src_page);
> > > > +			if (!PageCompound(src_page)) {
> > > > +				release_pte_page(src_page);
> > > > +				free_page_and_swap_cache(src_page);
> > > > +			}
> > > >    		}
> > > >    	}
> > > > +
> > > > +	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
> > > > +		list_del(&src_page->lru);
> > > > +		release_pte_page(src_page);
> > > > +		free_page_and_swap_cache(src_page);
> > > > +	}
> > > It looks like this may mess up the PTE-mapped THP's refcount if it has
> > > multiple PTE-mapped subpages, since put_page() is not called for every
> > > PTE-mapped subpage.
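
A minimal sketch of the refcount accounting behind this concern,
assuming a THP whose N subpages are all PTE-mapped in the collapsed
range (the ledger is illustrative, not from the patch itself):

	/*
	 * __split_huge_pmd_locked() did page_ref_add(page, N - 1) when
	 * the THP was remapped with PTEs, so the compound page carries
	 * one reference per PTE mapping, plus one taken by
	 * isolate_lru_page().
	 *
	 *   refs on entry:                  N (PTE maps) + 1 (isolation)
	 *   page_remove_rmap() x N:         mapcount only, refs unchanged
	 *   free_page_and_swap_cache() x 1: -1  (once per compound page)
	 *   putback_lru_page() x 1:         -1  (drops the isolation pin)
	 *
	 * Net: N - 1 references remain, so the THP is never freed.
	 */
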
> > Good catch!
> > 
> > I *think* something like this should fix the issue (untested):
> 
> Is this an incremental fix?

Yes. I will fold it in before the next submission.

> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index bfb6155f1d69..9a96f9bff798 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -570,6 +570,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> >   	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
> >   		list_del(&page->lru);
> >   		release_pte_page(page);
> > +		put_page(page);
> >   	}
> >   }
> > @@ -682,8 +683,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >   		VM_BUG_ON_PAGE(!PageLocked(page), page);
> >   		VM_BUG_ON_PAGE(PageLRU(page), page);
> > -		if (PageCompound(page))
> > +		if (PageCompound(page)) {
> >   			list_add_tail(&page->lru, compound_pagelist);
> > +			get_page(page);
> 
> I don't get why we have to bump the refcount here. The THP won't go away
> since it is pinned by isolation. I suppose calling
> free_page_and_swap_cache(src_page) in each loop iteration looks good
> enough to decrement the refcount for each PTE-mapped subpage.
> 
> Then calling release_pte_page() for each entry on compound_pagelist
> outside the loop releases the pin from isolation. The compound page
> would finally get freed via the pagevec.

Fair enough.
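
For completeness, a minimal, untested sketch of how Yang's scheme could
look in __collapse_huge_page_copy(); this is an assumption about the
eventual fold-in, not the final patch:

		} else {
			src_page = pte_page(pteval);
			copy_user_highpage(page, src_page, address, vma);
			spin_lock(ptl);
			pte_clear(vma->vm_mm, address, _pte);
			page_remove_rmap(src_page, false);
			spin_unlock(ptl);
			if (!PageCompound(src_page))
				release_pte_page(src_page);
			/*
			 * Drop one reference per PTE mapping, for small
			 * and compound pages alike: each PTE held a pin.
			 */
			free_page_and_swap_cache(src_page);
		}
	}

	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
		list_del(&src_page->lru);
		/*
		 * Only the isolation pin is left: release_pte_page()
		 * unlocks the page and putback_lru_page() drops the
		 * last extra reference, so the compound page is freed
		 * via the pagevec once nothing else holds it.
		 */
		release_pte_page(src_page);
	}

This keeps put_page() balanced against the page_ref_add() done at PMD
split time, without the extra get_page()/put_page() pair from the
incremental diff above. Note that free_swap_cache() only trylocks the
page, so calling free_page_and_swap_cache() while the compound page is
still locked is safe.
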

-- 
 Kirill A. Shutemov

Thread overview: 45+ messages
2020-04-03 11:29 [PATCHv2 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
2020-04-03 11:29 ` [PATCHv2 1/8] khugepaged: Add self test Kirill A. Shutemov
2020-04-06 14:59   ` Zi Yan
2020-04-06 15:20     ` Kirill A. Shutemov
2020-04-06 18:50       ` Zi Yan
2020-04-08 14:21         ` Kirill A. Shutemov
2020-04-08 15:53           ` Zi Yan
2020-04-10 11:47     ` Kirill A. Shutemov
2020-04-10 14:36       ` Zi Yan
2020-04-10 14:58         ` Kirill A. Shutemov
2020-04-10 15:03           ` Zi Yan
2020-04-06 18:53   ` Ralph Campbell
2020-04-03 11:29 ` [PATCHv2 2/8] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
2020-04-06 18:13   ` Yang Shi
2020-04-06 19:53   ` Ralph Campbell
2020-04-09 13:34     ` Kirill A. Shutemov
2020-04-03 11:29 ` [PATCHv2 3/8] khugepaged: Drain all LRU caches before scanning pages Kirill A. Shutemov
2020-04-06 18:15   ` Yang Shi
2020-04-03 11:29 ` [PATCHv2 4/8] khugepaged: Drain LRU add pagevec after swapin Kirill A. Shutemov
2020-04-06 13:11   ` Zi Yan
2020-04-06 18:29   ` Yang Shi
2020-04-08 13:05     ` Kirill A. Shutemov
2020-04-08 18:42       ` Yang Shi
2020-04-03 11:29 ` [PATCHv2 5/8] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
2020-04-06 20:15   ` Ralph Campbell
2020-04-06 20:50   ` Yang Shi
2020-04-08 13:10     ` Kirill A. Shutemov
2020-04-08 18:51       ` Yang Shi
2020-04-10  0:03         ` Yang Shi
2020-04-10 15:56           ` Kirill A. Shutemov
2020-04-06 21:30   ` John Hubbard
2020-04-10 15:55     ` Kirill A. Shutemov
2020-04-10 20:59       ` John Hubbard
2020-04-13  9:42         ` Kirill A. Shutemov
2020-04-03 11:29 ` [PATCHv2 6/8] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
2020-04-06 21:29   ` Yang Shi
2020-04-08 13:29     ` Kirill A. Shutemov
2020-04-08 18:57       ` Yang Shi
2020-04-09 13:47         ` Kirill A. Shutemov [this message]
2020-04-03 11:29 ` [PATCHv2 7/8] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
2020-04-07  7:57   ` [thp] db001b7115: vm-scalability.median 8.9% improvement kernel test robot
2020-04-03 11:29 ` [PATCHv2 8/8] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov
2020-04-06 13:17   ` Zi Yan
2020-04-05 23:40 ` [PATCHv2 0/8] thp/khugepaged improvements and CoW semantics William Kucharski
