From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: Boris Ostrovsky <boris.ostrovsky@oracle.com>,
	"Zhang, Xiong Y" <xiong.y.zhang@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	x86@kernel.org, Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Arnd Bergmann <arnd@arndb.de>,
	"H. Peter Anvin" <hpa@zytor.com>, Andi Kleen <ak@linux.intel.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Andy Lutomirski <luto@amacapital.net>,
	linux-arch@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, Juergen Gross <jgross@suse.com>,
	xen-devel <xen-devel@lists.xen.org>
Subject: Re: [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d
Date: Tue, 7 Mar 2017 16:00:09 +0300	[thread overview]
Message-ID: <20170307130009.GA2154@node> (raw)
In-Reply-To: <ab2868ea-1dd1-d51b-4c5a-921ef5c9a427@oracle.com>

On Mon, Mar 06, 2017 at 03:48:24PM -0500, Boris Ostrovsky wrote:
> 
> > +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
> > +		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
> > +		bool last, unsigned long limit)
> > +{
> > +	int i, nr, flush = 0;
> > +
> > +	nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
> > +	for (i = 0; i < nr; i++) {
> > +		pud_t *pud;
> > +
> > +		if (p4d_none(p4d[i]))
> > +			continue;
> > +
> > +		pud = pud_offset(&p4d[i], 0);
> > +		if (PTRS_PER_PUD > 1)
> > +			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
> > +		xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
> > +	}
> > +	return flush;
> > +}
> 
> ..
> 
> > +		p4d = p4d_offset(&pgd[i], 0);
> > +		if (PTRS_PER_P4D > 1)
> > +			flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
> > +		xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
> 
> 
> We are losing the flush status at all levels, so we need something like
> 
> flush |= xen_XXX_walk(...)

+ Xiong.

Thanks for noticing this. The fixup is below.

Please test; I don't have a setup for this.

> 
> 
> 
> >  	}
> >  
> > -out:
> >  	/* Do the top level last, so that the callbacks can use it as
> >  	   a cue to do final things like tlb flushes. */
> >  	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
> > @@ -1150,57 +1161,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
> >  	xen_free_ro_pages(pa, PAGE_SIZE);
> >  }
> >  
> > +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
> > +{
> > +	unsigned long pa;
> > +	pte_t *pte_tbl;
> > +	int i;
> > +
> > +	if (pmd_large(*pmd)) {
> > +		pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > +		xen_free_ro_pages(pa, PMD_SIZE);
> > +		return;
> > +	}
> > +
> > +	pte_tbl = pte_offset_kernel(pmd, 0);
> > +	for (i = 0; i < PTRS_PER_PTE; i++) {
> > +		if (pte_none(pte_tbl[i]))
> > +			continue;
> > +		pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
> > +		xen_free_ro_pages(pa, PAGE_SIZE);
> > +	}
> > +	set_pmd(pmd, __pmd(0));
> > +	xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
> > +{
> > +	unsigned long pa;
> > +	pmd_t *pmd_tbl;
> > +	int i;
> > +
> > +	if (pud_large(*pud)) {
> > +		pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > +		xen_free_ro_pages(pa, PUD_SIZE);
> > +		return;
> > +	}
> > +
> > +	pmd_tbl = pmd_offset(pud, 0);
> > +	for (i = 0; i < PTRS_PER_PMD; i++) {
> > +		if (pmd_none(pmd_tbl[i]))
> > +			continue;
> > +		xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
> > +	}
> > +	set_pud(pud, __pud(0));
> > +	xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
> > +{
> > +	unsigned long pa;
> > +	pud_t *pud_tbl;
> > +	int i;
> > +
> > +	if (p4d_large(*p4d)) {
> > +		pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
> > +		xen_free_ro_pages(pa, P4D_SIZE);
> > +		return;
> > +	}
> > +
> > +	pud_tbl = pud_offset(p4d, 0);
> > +	for (i = 0; i < PTRS_PER_PUD; i++) {
> > +		if (pud_none(pud_tbl[i]))
> > +			continue;
> > +		xen_cleanmfnmap_pud(pud_tbl + i, unpin);
> > +	}
> > +	set_p4d(p4d, __p4d(0));
> > +	xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
> > +}
> > +
> >  /*
> >   * Since it is well isolated we can (and since it is perhaps large we should)
> >   * also free the page tables mapping the initial P->M table.
> >   */
> >  static void __init xen_cleanmfnmap(unsigned long vaddr)
> >  {
> > -	unsigned long va = vaddr & PMD_MASK;
> > -	unsigned long pa;
> > -	pgd_t *pgd = pgd_offset_k(va);
> > -	pud_t *pud_page = pud_offset(pgd, 0);
> > -	pud_t *pud;
> > -	pmd_t *pmd;
> > -	pte_t *pte;
> > +	pgd_t *pgd;
> > +	p4d_t *p4d;
> >  	unsigned int i;
> >  	bool unpin;
> >  
> >  	unpin = (vaddr == 2 * PGDIR_SIZE);
> > -	set_pgd(pgd, __pgd(0));
> > -	do {
> > -		pud = pud_page + pud_index(va);
> > -		if (pud_none(*pud)) {
> > -			va += PUD_SIZE;
> > -		} else if (pud_large(*pud)) {
> > -			pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > -			xen_free_ro_pages(pa, PUD_SIZE);
> > -			va += PUD_SIZE;
> > -		} else {
> > -			pmd = pmd_offset(pud, va);
> > -			if (pmd_large(*pmd)) {
> > -				pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > -				xen_free_ro_pages(pa, PMD_SIZE);
> > -			} else if (!pmd_none(*pmd)) {
> > -				pte = pte_offset_kernel(pmd, va);
> > -				set_pmd(pmd, __pmd(0));
> > -				for (i = 0; i < PTRS_PER_PTE; ++i) {
> > -					if (pte_none(pte[i]))
> > -						break;
> > -					pa = pte_pfn(pte[i]) << PAGE_SHIFT;
> > -					xen_free_ro_pages(pa, PAGE_SIZE);
> > -				}
> > -				xen_cleanmfnmap_free_pgtbl(pte, unpin);
> > -			}
> > -			va += PMD_SIZE;
> > -			if (pmd_index(va))
> > -				continue;
> > -			set_pud(pud, __pud(0));
> > -			xen_cleanmfnmap_free_pgtbl(pmd, unpin);
> > -		}
> > -
> > -	} while (pud_index(va) || pmd_index(va));
> > -	xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
> > +	vaddr &= PMD_MASK;
> > +	pgd = pgd_offset_k(vaddr);
> > +	p4d = p4d_offset(pgd, 0);
> > +	for (i = 0; i < PTRS_PER_P4D; i++) {
> > +		if (p4d_none(p4d[i]))
> > +			continue;
> > +		xen_cleanmfnmap_p4d(p4d + i, unpin);
> > +	}
> 
> Don't we need to pass vaddr down to all routines so that they select
> the appropriate tables? You seem to always be choosing the first one.

IIUC, we clear the whole page-table subtree covered by one pgd entry.
So, no, there's no need to pass vaddr down; a pointer to the page-table
entry is enough.

But I know virtually nothing about Xen. Please re-check my reasoning.
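To spell the reasoning out (a condensed sketch mirroring the
xen_cleanmfnmap_p4d() loop quoted above, for illustration only):

	/*
	 * Tearing down the whole subtree under one p4d entry: every
	 * slot 0..PTRS_PER_PUD-1 is visited, so no vaddr-derived
	 * starting index is needed -- the table pointer alone
	 * identifies the subtree completely.
	 */
	pud_t *pud_tbl = pud_offset(p4d, 0);
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		if (pud_none(pud_tbl[i]))
			continue;
		xen_cleanmfnmap_pud(pud_tbl + i, unpin);
	}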

I would also appreciate help with getting the x86 Xen code to work with
5-level paging enabled. For now, I've made CONFIG_XEN depend on
!CONFIG_X86_5LEVEL.
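For reference, the dependency would look roughly like this in
arch/x86/xen/Kconfig (a sketch; the exact surrounding option text is an
assumption, not a quote from the tree):

	config XEN
		bool "Xen guest support"
		depends on PARAVIRT
		depends on !X86_5LEVEL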

Fixup:

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a4079cfab007..d66b7e79781a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -629,7 +629,8 @@ static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
 		pmd = pmd_offset(&pud[i], 0);
 		if (PTRS_PER_PMD > 1)
 			flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
-		xen_pmd_walk(mm, pmd, func, last && i == nr - 1, limit);
+		flush |= xen_pmd_walk(mm, pmd, func,
+				last && i == nr - 1, limit);
 	}
 	return flush;
 }
@@ -650,7 +651,8 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
 		pud = pud_offset(&p4d[i], 0);
 		if (PTRS_PER_PUD > 1)
 			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
-		xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
+		flush |= xen_pud_walk(mm, pud, func,
+				last && i == nr - 1, limit);
 	}
 	return flush;
 }
@@ -706,7 +708,7 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 		p4d = p4d_offset(&pgd[i], 0);
 		if (PTRS_PER_P4D > 1)
 			flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
-		xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
+		flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
 	}
 
 	/* Do the top level last, so that the callbacks can use it as
-- 
 Kirill A. Shutemov

Thread overview: 48+ messages
2017-03-06 13:53 [PATCHv4 00/33] 5-level paging Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 01/33] x86/cpufeature: Add 5-level paging detection Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 02/33] asm-generic: introduce 5level-fixup.h Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 03/33] asm-generic: introduce __ARCH_USE_5LEVEL_HACK Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 04/33] arch, mm: convert all architectures to use 5level-fixup.h Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 05/33] asm-generic: introduce <asm-generic/pgtable-nop4d.h> Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 06/33] mm: convert generic code to 5-level paging Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 07/33] mm: introduce __p4d_alloc() Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 08/33] x86: basic changes into headers for 5-level paging Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 09/33] x86: trivial portion of 5-level paging conversion Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 10/33] x86/gup: add 5-level paging support Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 11/33] x86/ident_map: " Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 12/33] x86/mm: add support of p4d_t in vmalloc_fault() Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 13/33] x86/power: support p4d_t in hibernate code Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 14/33] x86/kexec: support p4d_t Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 15/33] x86/efi: handle p4d in EFI pagetables Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 16/33] x86/mm/pat: handle additional page table Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 17/33] x86/kasan: prepare clear_pgds() to switch to <asm-generic/pgtable-nop4d.h> Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d Kirill A. Shutemov
2017-03-06 20:48   ` Boris Ostrovsky
2017-03-07 13:00     ` Kirill A. Shutemov [this message]
2017-03-07 18:18       ` Boris Ostrovsky
2017-03-07 18:26         ` [Xen-devel] " Andrew Cooper
2017-03-07 18:45           ` Boris Ostrovsky
2017-03-06 13:53 ` [PATCHv4 19/33] x86: convert the rest of the code to support p4d_t Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 20/33] x86: detect 5-level paging support Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 21/33] x86/asm: remove __VIRTUAL_MASK_SHIFT==47 assert Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 22/33] x86/mm: define virtual memory map for 5-level paging Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 23/33] x86/paravirt: make paravirt code support " Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 24/33] x86/mm: basic defines/helpers for CONFIG_X86_5LEVEL Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 25/33] x86/dump_pagetables: support 5-level paging Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 26/33] x86/kasan: extend to " Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 27/33] x86/espfix: " Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 28/33] x86/mm: add support of additional page table level during early boot Kirill A. Shutemov
2017-03-06 20:05   ` Boris Ostrovsky
2017-03-06 20:23     ` Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 29/33] x86/mm: add sync_global_pgds() for configuration with 5-level paging Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 30/33] x86/mm: make kernel_physical_mapping_init() support " Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 31/33] x86/mm: add support for 5-level paging for KASLR Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 32/33] x86: enable 5-level paging support Kirill A. Shutemov
2017-03-06 13:53 ` [PATCHv4 33/33] x86/mm: allow to have userspace mappings above 47-bits Kirill A. Shutemov
2017-03-06 18:27 ` [PATCHv4 00/33] 5-level paging Linus Torvalds
2017-03-06 18:42   ` Thomas Gleixner
2017-03-06 19:03     ` Linus Torvalds
2017-03-06 19:09       ` Kirill A. Shutemov
2017-03-06 19:35         ` Linus Torvalds
2017-03-07  0:41     ` Stephen Rothwell
2017-03-07  9:32       ` Thomas Gleixner
