All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: Zi Yan <ziy@nvidia.com>
Cc: linux-mm@kvack.org, Roman Gushchin <guro@fb.com>,
	Rik van Riel <riel@surriel.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Matthew Wilcox <willy@infradead.org>,
	Shakeel Butt <shakeelb@google.com>,
	Yang Shi <yang.shi@linux.alibaba.com>,
	David Nellans <dnellans@nvidia.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [RFC PATCH 05/16] mm: thp: handling 1GB THP reference bit.
Date: Wed, 9 Sep 2020 17:09:12 +0300	[thread overview]
Message-ID: <20200909140912.g2s4y22li2xwfttr@box> (raw)
In-Reply-To: <20200902180628.4052244-6-zi.yan@sent.com>

On Wed, Sep 02, 2020 at 02:06:17PM -0400, Zi Yan wrote:
> From: Zi Yan <ziy@nvidia.com>
> 
> Add PUD-level TLB flush ops and teach page_vma_mapped_talk about 1GB
> THPs.
> 
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>  arch/x86/include/asm/pgtable.h |  3 +++
>  arch/x86/mm/pgtable.c          | 13 +++++++++++++
>  include/linux/mmu_notifier.h   | 13 +++++++++++++
>  include/linux/pgtable.h        | 14 ++++++++++++++
>  include/linux/rmap.h           |  1 +
>  mm/page_vma_mapped.c           | 33 +++++++++++++++++++++++++++++----
>  mm/rmap.c                      | 12 +++++++++---
>  7 files changed, 82 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 26255cac78c0..15334f5ba172 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -1127,6 +1127,9 @@ extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
>  extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
>  				  unsigned long address, pmd_t *pmdp);
>  
> +#define __HAVE_ARCH_PUDP_CLEAR_YOUNG_FLUSH
> +extern int pudp_clear_flush_young(struct vm_area_struct *vma,
> +				  unsigned long address, pud_t *pudp);
>  
>  #define pmd_write pmd_write
>  static inline int pmd_write(pmd_t pmd)
> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
> index 7be73aee6183..e4a2dffcc418 100644
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -633,6 +633,19 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
>  
>  	return young;
>  }
> +int pudp_clear_flush_young(struct vm_area_struct *vma,
> +			   unsigned long address, pud_t *pudp)
> +{
> +	int young;
> +
> +	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
> +
> +	young = pudp_test_and_clear_young(vma, address, pudp);
> +	if (young)
> +		flush_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
> +
> +	return young;
> +}
>  #endif
>  
>  /**
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index b8200782dede..4ffa179e654f 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -557,6 +557,19 @@ static inline void mmu_notifier_range_init_migrate(
>  	__young;							\
>  })
>  
> +#define pudp_clear_flush_young_notify(__vma, __address, __pudp)		\
> +({									\
> +	int __young;							\
> +	struct vm_area_struct *___vma = __vma;				\
> +	unsigned long ___address = __address;				\
> +	__young = pudp_clear_flush_young(___vma, ___address, __pudp);	\
> +	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
> +						  ___address,		\
> +						  ___address +		\
> +							PUD_SIZE);	\
> +	__young;							\
> +})
> +
>  #define ptep_clear_young_notify(__vma, __address, __ptep)		\
>  ({									\
>  	int __young;							\
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 255275d5b73e..8ef358c386af 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -240,6 +240,20 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  #endif
>  
> +#ifndef __HAVE_ARCH_PUDP_CLEAR_YOUNG_FLUSH
> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> +extern int pudp_clear_flush_young(struct vm_area_struct *vma,
> +				  unsigned long address, pud_t *pudp);
> +#else
> +int pudp_clear_flush_young(struct vm_area_struct *vma,
> +				  unsigned long address, pud_t *pudp)
> +{
> +	BUILD_BUG();
> +	return 0;
> +}
> +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD  */
> +#endif
> +
>  #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
>  static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>  				       unsigned long address,
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 3a6adfa70fb0..0af61dd193d2 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -206,6 +206,7 @@ struct page_vma_mapped_walk {
>  	struct page *page;
>  	struct vm_area_struct *vma;
>  	unsigned long address;
> +	pud_t *pud;
>  	pmd_t *pmd;
>  	pte_t *pte;
>  	spinlock_t *ptl;
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 5e77b269c330..d9d39ec06e21 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -145,9 +145,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	struct page *page = pvmw->page;
>  	pgd_t *pgd;
>  	p4d_t *p4d;
> -	pud_t *pud;
> +	pud_t pude;
>  	pmd_t pmde;
>  
> +	if (!pvmw->pte && !pvmw->pmd && pvmw->pud)
> +		return not_found(pvmw);
> +
>  	/* The only possible pmd mapping has been handled on last iteration */
>  	if (pvmw->pmd && !pvmw->pte)
>  		return not_found(pvmw);
> @@ -174,10 +177,31 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	p4d = p4d_offset(pgd, pvmw->address);
>  	if (!p4d_present(*p4d))
>  		return false;
> -	pud = pud_offset(p4d, pvmw->address);
> -	if (!pud_present(*pud))
> +	pvmw->pud = pud_offset(p4d, pvmw->address);
> +
> +	/*
> +	 * Make sure the pud value isn't cached in a register by the
> +	 * compiler and used as a stale value after we've observed a
> +	 * subsequent update.
> +	 */
> +	pude = READ_ONCE(*pvmw->pud);
> +	if (pud_trans_huge(pude)) {
> +		pvmw->ptl = pud_lock(mm, pvmw->pud);
> +		if (likely(pud_trans_huge(*pvmw->pud))) {
> +			if (pvmw->flags & PVMW_MIGRATION)
> +				return not_found(pvmw);
> +			if (pud_page(*pvmw->pud) != page)
> +				return not_found(pvmw);
> +			return true;
> +		} else {
> +			/* THP pud was split under us: handle on pmd level */
> +			spin_unlock(pvmw->ptl);
> +			pvmw->ptl = NULL;

Hm. What makes you sure the pmd table is established here?

I have not looked at PUD THP handling of  MADV_DONTNEED yet, but for PMD
THP can became pmd_none() at any point (unless ptl is locked).

> +		}
> +	} else if (!pud_present(pude))
>  		return false;
> -	pvmw->pmd = pmd_offset(pud, pvmw->address);
> +
> +	pvmw->pmd = pmd_offset(pvmw->pud, pvmw->address);
>  	/*
>  	 * Make sure the pmd value isn't cached in a register by the
>  	 * compiler and used as a stale value after we've observed a
> @@ -213,6 +237,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	} else if (!pmd_present(pmde)) {
>  		return false;
>  	}
> +
>  	if (!map_pte(pvmw))
>  		goto next_pte;
>  	while (1) {
> diff --git a/mm/rmap.c b/mm/rmap.c

Why?

> index 10195a2421cf..77cec0658b76 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -803,9 +803,15 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
>  					referenced++;
>  			}
>  		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> -			if (pmdp_clear_flush_young_notify(vma, address,
> -						pvmw.pmd))
> -				referenced++;
> +			if (pvmw.pmd) {
> +				if (pmdp_clear_flush_young_notify(vma, address,
> +							pvmw.pmd))
> +					referenced++;
> +			} else if (pvmw.pud) {
> +				if (pudp_clear_flush_young_notify(vma, address,
> +							pvmw.pud))
> +					referenced++;
> +			}
>  		} else {
>  			/* unexpected pmd-mapped page? */
>  			WARN_ON_ONCE(1);
> -- 
> 2.28.0
> 
> 

-- 
 Kirill A. Shutemov

  reply	other threads:[~2020-09-09 16:46 UTC|newest]

Thread overview: 85+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-02 18:06 [RFC PATCH 00/16] 1GB THP support on x86_64 Zi Yan
2020-09-02 18:06 ` [RFC PATCH 01/16] mm: add pagechain container for storing multiple pages Zi Yan
2020-09-02 20:29   ` Randy Dunlap
2020-09-02 20:48     ` Zi Yan
2020-09-03  3:15   ` Matthew Wilcox
2020-09-07 12:22   ` Kirill A. Shutemov
2020-09-07 15:11     ` Zi Yan
2020-09-09 13:46       ` Kirill A. Shutemov
2020-09-09 14:15         ` Zi Yan
2020-09-02 18:06 ` [RFC PATCH 02/16] mm: thp: 1GB anonymous page implementation Zi Yan
2020-09-02 18:06 ` [RFC PATCH 03/16] mm: proc: add 1GB THP kpageflag Zi Yan
2020-09-09 13:46   ` Kirill A. Shutemov
2020-09-02 18:06 ` [RFC PATCH 04/16] mm: thp: 1GB THP copy on write implementation Zi Yan
2020-09-02 18:06 ` [RFC PATCH 05/16] mm: thp: handling 1GB THP reference bit Zi Yan
2020-09-09 14:09   ` Kirill A. Shutemov [this message]
2020-09-09 14:36     ` Zi Yan
2020-09-02 18:06 ` [RFC PATCH 06/16] mm: thp: add 1GB THP split_huge_pud_page() function Zi Yan
2020-09-09 14:18   ` Kirill A. Shutemov
2020-09-09 14:19     ` Zi Yan
2020-09-02 18:06 ` [RFC PATCH 07/16] mm: stats: make smap stats understand PUD THPs Zi Yan
2020-09-02 18:06 ` [RFC PATCH 08/16] mm: page_vma_walk: teach it about PMD-mapped PUD THP Zi Yan
2020-09-02 18:06 ` [RFC PATCH 09/16] mm: thp: 1GB THP support in try_to_unmap() Zi Yan
2020-09-02 18:06 ` [RFC PATCH 10/16] mm: thp: split 1GB THPs at page reclaim Zi Yan
2020-09-02 18:06 ` [RFC PATCH 11/16] mm: thp: 1GB THP follow_p*d_page() support Zi Yan
2020-09-02 18:06 ` [RFC PATCH 12/16] mm: support 1GB THP pagemap support Zi Yan
2020-09-02 18:06 ` [RFC PATCH 13/16] mm: thp: add a knob to enable/disable 1GB THPs Zi Yan
2020-09-02 18:06 ` [RFC PATCH 14/16] mm: page_alloc: >=MAX_ORDER pages allocation an deallocation Zi Yan
2020-09-02 18:06 ` [RFC PATCH 15/16] hugetlb: cma: move cma reserve function to cma.c Zi Yan
2020-09-02 18:06 ` [RFC PATCH 16/16] mm: thp: use cma reservation for pud thp allocation Zi Yan
2020-09-02 18:40 ` [RFC PATCH 00/16] 1GB THP support on x86_64 Jason Gunthorpe
2020-09-02 18:45   ` Zi Yan
2020-09-02 18:48     ` Jason Gunthorpe
2020-09-02 19:05       ` Zi Yan
2020-09-02 19:57         ` Jason Gunthorpe
2020-09-02 20:29           ` Zi Yan
2020-09-03 16:40             ` Jason Gunthorpe
2020-09-03 16:55               ` Matthew Wilcox
2020-09-03 17:08                 ` Jason Gunthorpe
2020-09-03  7:32 ` Michal Hocko
2020-09-03 16:25   ` Roman Gushchin
2020-09-03 16:50     ` Jason Gunthorpe
2020-09-03 17:01       ` Matthew Wilcox
2020-09-03 17:18         ` Jason Gunthorpe
2020-09-03 20:57     ` Mike Kravetz
2020-09-03 21:06       ` Roman Gushchin
2020-09-04  7:42     ` Michal Hocko
2020-09-04 21:10       ` Roman Gushchin
2020-09-07  7:20         ` Michal Hocko
2020-09-08 15:09           ` Zi Yan
2020-09-08 19:58             ` Roman Gushchin
2020-09-09  4:01               ` John Hubbard
2020-09-09  7:15               ` Michal Hocko
2020-09-03 14:23 ` Kirill A. Shutemov
2020-09-03 16:30   ` Roman Gushchin
2020-09-08 11:57     ` David Hildenbrand
2020-09-08 14:05       ` Zi Yan
2020-09-08 14:22         ` David Hildenbrand
2020-09-08 15:36           ` Zi Yan
2020-09-08 14:27         ` Matthew Wilcox
2020-09-08 15:50           ` Zi Yan
2020-09-09 12:11           ` Jason Gunthorpe
2020-09-09 12:32             ` Matthew Wilcox
2020-09-09 13:14               ` Jason Gunthorpe
2020-09-09 13:27                 ` David Hildenbrand
2020-09-10 10:02                   ` William Kucharski
2020-09-08 14:35         ` Michal Hocko
2020-09-08 14:41           ` Rik van Riel
2020-09-08 14:41             ` Rik van Riel
2020-09-08 15:02             ` David Hildenbrand
2020-09-09  7:04             ` Michal Hocko
2020-09-09 13:19               ` Rik van Riel
2020-09-09 13:43                 ` David Hildenbrand
2020-09-09 13:49                   ` Rik van Riel
2020-09-09 13:49                     ` Rik van Riel
2020-09-09 13:54                     ` David Hildenbrand
2020-09-10  7:32                   ` Michal Hocko
2020-09-10  8:27                     ` David Hildenbrand
2020-09-10 14:21                       ` Zi Yan
2020-09-10 14:34                         ` David Hildenbrand
2020-09-10 14:41                           ` Zi Yan
2020-09-10 15:15                             ` David Hildenbrand
2020-09-10 13:32                     ` Rik van Riel
2020-09-10 13:32                       ` Rik van Riel
2020-09-10 14:30                       ` Zi Yan
2020-09-09 13:59                 ` Michal Hocko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200909140912.g2s4y22li2xwfttr@box \
    --to=kirill@shutemov.name \
    --cc=dnellans@nvidia.com \
    --cc=guro@fb.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=riel@surriel.com \
    --cc=shakeelb@google.com \
    --cc=willy@infradead.org \
    --cc=yang.shi@linux.alibaba.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.