From: Janosch Frank <frankja@linux.ibm.com>
To: Claudio Imbrenda <imbrenda@linux.ibm.com>, kvm@vger.kernel.org
Cc: linux-s390@vger.kernel.org, david@redhat.com, thuth@redhat.com,
cohuck@redhat.com
Subject: Re: [kvm-unit-tests PATCH v5 6/7] s390x: mmu: add support for large pages
Date: Fri, 18 Jun 2021 09:36:08 +0200 [thread overview]
Message-ID: <ac930fdd-53e9-cbc5-687d-8d99d968a3a1@linux.ibm.com> (raw)
In-Reply-To: <20210611140705.553307-7-imbrenda@linux.ibm.com>
On 6/11/21 4:07 PM, Claudio Imbrenda wrote:
> Add support for 1M and 2G pages.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Acked-by: Janosch Frank <frankja@de.ibm.com>
> ---
> lib/s390x/mmu.h | 84 +++++++++++++++-
> lib/s390x/mmu.c | 262 +++++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 320 insertions(+), 26 deletions(-)
>
> diff --git a/lib/s390x/mmu.h b/lib/s390x/mmu.h
> index b995f85b..ab35d782 100644
> --- a/lib/s390x/mmu.h
> +++ b/lib/s390x/mmu.h
> @@ -10,9 +10,89 @@
> #ifndef _S390X_MMU_H_
> #define _S390X_MMU_H_
>
> -void protect_page(void *vaddr, unsigned long prot);
> +enum pgt_level {
> + pgtable_level_pgd = 1,
> + pgtable_level_p4d,
> + pgtable_level_pud,
> + pgtable_level_pmd,
> + pgtable_level_pte,
> +};
> +
> +/*
> + * Splits the pagetables down to the given DAT tables level.
> + * Returns a pointer to the DAT table entry of the given level.
> + * @pgtable root of the page table tree
> + * @vaddr address whose page tables are to split
> + * @level 3 (for 2GB pud), 4 (for 1MB pmd) or 5 (for 4kB pages)
> + */
> +void *split_page(pgd_t *pgtable, void *vaddr, enum pgt_level level);
> +
> +/*
> + * Applies the given protection bits to the given DAT tables level,
> + * splitting if necessary.
> + * @pgtable root of the page table tree
> + * @vaddr address whose protection bits are to be changed
> + * @prot the protection bits to set
> + * @level 3 (for 2GB pud), 4 (for 1MB pmd) or 5 (for 4KB pages)
> + */
> +void protect_dat_entry(void *vaddr, unsigned long prot, enum pgt_level level);
> +
> +/*
> + * Clears the given protection bits from the given DAT tables level,
> + * splitting if necessary.
> + * @pgtable root of the page table tree
> + * @vaddr address whose protection bits are to be changed
> + * @prot the protection bits to clear
> + * @level 3 (for 2GB pud), 4 (for 1MB pmd) or 5 (for 4kB pages)
> + */
> +void unprotect_dat_entry(void *vaddr, unsigned long prot, enum pgt_level level);
> +
> +/*
> + * Applies the given protection bits to the given 4kB pages range,
> + * splitting if necessary.
> + * @start starting address whose protection bits are to be changed
> + * @len size in bytes
> + * @prot the protection bits to set
> + */
> void protect_range(void *start, unsigned long len, unsigned long prot);
> -void unprotect_page(void *vaddr, unsigned long prot);
> +
> +/*
> + * Clears the given protection bits from the given 4kB pages range,
> + * splitting if necessary.
> + * @start starting address whose protection bits are to be changed
> + * @len size in bytes
> + * @prot the protection bits to clear
> + */
> void unprotect_range(void *start, unsigned long len, unsigned long prot);
>
> +/* Similar to install_page, maps the virtual address to the physical address
> + * for the given page tables, using 1MB large pages.
> + * Returns a pointer to the DAT table entry.
> + * @pgtable root of the page table tree
> + * @phys physical address to map, must be 1MB aligned!
> + * @vaddr virtual address to map, must be 1MB aligned!
> + */
> +pmdval_t *install_large_page(pgd_t *pgtable, phys_addr_t phys, void *vaddr);
> +
> +/* Similar to install_page, maps the virtual address to the physical address
> + * for the given page tables, using 2GB huge pages.
> + * Returns a pointer to the DAT table entry.
> + * @pgtable root of the page table tree
> + * @phys physical address to map, must be 2GB aligned!
> + * @vaddr virtual address to map, must be 2GB aligned!
> + */
> +pudval_t *install_huge_page(pgd_t *pgtable, phys_addr_t phys, void *vaddr);
> +
> +static inline void protect_page(void *vaddr, unsigned long prot)
> +{
> + protect_dat_entry(vaddr, prot, pgtable_level_pte);
> +}
> +
> +static inline void unprotect_page(void *vaddr, unsigned long prot)
> +{
> + unprotect_dat_entry(vaddr, prot, pgtable_level_pte);
> +}
> +
> +void *get_dat_entry(pgd_t *pgtable, void *vaddr, unsigned int level);
> +
> #endif /* _ASMS390X_MMU_H_ */
> diff --git a/lib/s390x/mmu.c b/lib/s390x/mmu.c
> index 5c517366..c973443b 100644
> --- a/lib/s390x/mmu.c
> +++ b/lib/s390x/mmu.c
> @@ -15,6 +15,18 @@
> #include <vmalloc.h>
> #include "mmu.h"
>
> +/*
> + * The naming convention used here is the same as used in the Linux kernel;
> + * this is the correspondence between the s390x architectural names and the
> + * Linux ones:
> + *
> + * pgd - region 1 table entry
> + * p4d - region 2 table entry
> + * pud - region 3 table entry
> + * pmd - segment table entry
> + * pte - page table entry
> + */
> +
> static pgd_t *table_root;
>
> void configure_dat(int enable)
> @@ -46,54 +58,256 @@ static void mmu_enable(pgd_t *pgtable)
> lc->pgm_new_psw.mask |= PSW_MASK_DAT;
> }
>
> -static pteval_t *get_pte(pgd_t *pgtable, uintptr_t vaddr)
> +/*
> + * Get the pud (region 3) DAT table entry for the given address and root,
> + * allocating it if necessary
> + */
> +static inline pud_t *get_pud(pgd_t *pgtable, uintptr_t vaddr)
> {
> pgd_t *pgd = pgd_offset(pgtable, vaddr);
> p4d_t *p4d = p4d_alloc(pgd, vaddr);
> pud_t *pud = pud_alloc(p4d, vaddr);
> - pmd_t *pmd = pmd_alloc(pud, vaddr);
> - pte_t *pte = pte_alloc(pmd, vaddr);
>
> - return &pte_val(*pte);
> + return pud;
> +}
> +
> +/*
> + * Get the pmd (segment) DAT table entry for the given address and pud,
> + * allocating it if necessary.
> + * The pud must not be huge.
> + */
> +static inline pmd_t *get_pmd(pud_t *pud, uintptr_t vaddr)
> +{
> + pmd_t *pmd;
> +
> + assert(!pud_huge(*pud));
> + pmd = pmd_alloc(pud, vaddr);
> + return pmd;
> +}
> +
> +/*
> + * Get the pte (page) DAT table entry for the given address and pmd,
> + * allocating it if necessary.
> + * The pmd must not be large.
> + */
> +static inline pte_t *get_pte(pmd_t *pmd, uintptr_t vaddr)
> +{
> + pte_t *pte;
> +
> + assert(!pmd_large(*pmd));
> + pte = pte_alloc(pmd, vaddr);
> + return pte;
> +}
> +
> +/*
> + * Splits a large pmd (segment) DAT table entry into equivalent 4kB small
> + * pages.
> + * @pmd The pmd to split, it must be large.
> + * @va the virtual address corresponding to this pmd.
> + */
> +static void split_pmd(pmd_t *pmd, uintptr_t va)
> +{
> + phys_addr_t pa = pmd_val(*pmd) & SEGMENT_ENTRY_SFAA;
> + unsigned long i, prot;
> + pte_t *pte;
> +
> + assert(pmd_large(*pmd));
> + pte = alloc_pages(PAGE_TABLE_ORDER);
> + prot = pmd_val(*pmd) & (SEGMENT_ENTRY_IEP | SEGMENT_ENTRY_P);
> + for (i = 0; i < PAGE_TABLE_ENTRIES; i++)
> + pte_val(pte[i]) = pa | PAGE_SIZE * i | prot;
> + idte_pmdp(va, &pmd_val(*pmd));
> + pmd_val(*pmd) = __pa(pte) | SEGMENT_ENTRY_TT_SEGMENT;
> +
> +}
> +
> +/*
> + * Splits a huge pud (region 3) DAT table entry into equivalent 1MB large
> + * pages.
> + * @pud The pud to split, it must be huge.
> + * @va the virtual address corresponding to this pud.
> + */
> +static void split_pud(pud_t *pud, uintptr_t va)
> +{
> + phys_addr_t pa = pud_val(*pud) & REGION3_ENTRY_RFAA;
> + unsigned long i, prot;
> + pmd_t *pmd;
> +
> + assert(pud_huge(*pud));
> + pmd = alloc_pages(SEGMENT_TABLE_ORDER);
> + prot = pud_val(*pud) & (REGION3_ENTRY_IEP | REGION_ENTRY_P);
> + for (i = 0; i < SEGMENT_TABLE_ENTRIES; i++)
> + pmd_val(pmd[i]) = pa | SZ_1M * i | prot | SEGMENT_ENTRY_FC | SEGMENT_ENTRY_TT_SEGMENT;
> + idte_pudp(va, &pud_val(*pud));
> + pud_val(*pud) = __pa(pmd) | REGION_ENTRY_TT_REGION3 | REGION_TABLE_LENGTH;
> +}
> +
> +void *get_dat_entry(pgd_t *pgtable, void *vaddr, enum pgt_level level)
> +{
> + uintptr_t va = (uintptr_t)vaddr;
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + assert(level && (level <= 5));
> + pgd = pgd_offset(pgtable, va);
> + if (level == pgtable_level_pgd)
> + return pgd;
> + p4d = p4d_alloc(pgd, va);
> + if (level == pgtable_level_p4d)
> + return p4d;
> + pud = pud_alloc(p4d, va);
> +
> + if (level == pgtable_level_pud)
> + return pud;
> + if (!pud_none(*pud) && pud_huge(*pud))
> + split_pud(pud, va);
> + pmd = get_pmd(pud, va);
> + if (level == pgtable_level_pmd)
> + return pmd;
> + if (!pmd_none(*pmd) && pmd_large(*pmd))
> + split_pmd(pmd, va);
> + return get_pte(pmd, va);
> +}
> +
> +void *split_page(pgd_t *pgtable, void *vaddr, enum pgt_level level)
> +{
> + assert((level >= 3) && (level <= 5));
> + return get_dat_entry(pgtable ? pgtable : table_root, vaddr, level);
> }
>
> phys_addr_t virt_to_pte_phys(pgd_t *pgtable, void *vaddr)
> {
> - return (*get_pte(pgtable, (uintptr_t)vaddr) & PAGE_MASK) +
> - ((unsigned long)vaddr & ~PAGE_MASK);
> + uintptr_t va = (uintptr_t)vaddr;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> +
> + pud = get_pud(pgtable, va);
> + if (pud_huge(*pud))
> + return (pud_val(*pud) & REGION3_ENTRY_RFAA) | (va & ~REGION3_ENTRY_RFAA);
> + pmd = get_pmd(pud, va);
> + if (pmd_large(*pmd))
> + return (pmd_val(*pmd) & SEGMENT_ENTRY_SFAA) | (va & ~SEGMENT_ENTRY_SFAA);
> + pte = get_pte(pmd, va);
> + return (pte_val(*pte) & PAGE_MASK) | (va & ~PAGE_MASK);
> +}
> +
> +/*
> + * Get the DAT table entry of the given level for the given address,
> + * splitting if necessary. If the entry was not invalid, invalidate it, and
> + * return the pointer to the entry and, if requested, its old value.
> + * @pgtable root of the page tables
> + * @vaddr virtual address
> + * @level 3 (for 2GB pud), 4 (for 1MB pmd) or 5 (for 4kB pages)
> + * @old if not NULL, will be written with the old value of the DAT table
> + * entry before invalidation
> + */
> +static void *dat_get_and_invalidate(pgd_t *pgtable, void *vaddr, enum pgt_level level, unsigned long *old)
> +{
> + unsigned long va = (unsigned long)vaddr;
> + void *ptr;
> +
> + ptr = get_dat_entry(pgtable, vaddr, level);
> + if (old)
> + *old = *(unsigned long *)ptr;
> + if ((level == pgtable_level_pgd) && !pgd_none(*(pgd_t *)ptr))
> + idte_pgdp(va, ptr);
> + else if ((level == pgtable_level_p4d) && !p4d_none(*(p4d_t *)ptr))
> + idte_p4dp(va, ptr);
> + else if ((level == pgtable_level_pud) && !pud_none(*(pud_t *)ptr))
> + idte_pudp(va, ptr);
> + else if ((level == pgtable_level_pmd) && !pmd_none(*(pmd_t *)ptr))
> + idte_pmdp(va, ptr);
> + else if (!pte_none(*(pte_t *)ptr))
> + ipte(va, ptr);
> + return ptr;
> }
>
> -static pteval_t *set_pte(pgd_t *pgtable, pteval_t val, void *vaddr)
> +static void cleanup_pmd(pmd_t *pmd)
> {
> - pteval_t *p_pte = get_pte(pgtable, (uintptr_t)vaddr);
> + /* was invalid or large, nothing to do */
> + if (pmd_none(*pmd) || pmd_large(*pmd))
> + return;
> + /* was not large, free the corresponding page table */
> + free_pages((void *)(pmd_val(*pmd) & PAGE_MASK));
> +}
>
> - /* first flush the old entry (if we're replacing anything) */
> - if (!(*p_pte & PAGE_ENTRY_I))
> - ipte((uintptr_t)vaddr, p_pte);
> +static void cleanup_pud(pud_t *pud)
> +{
> + unsigned long i;
> + pmd_t *pmd;
>
> - *p_pte = val;
> - return p_pte;
> + /* was invalid or huge, nothing to do */
> + if (pud_none(*pud) || pud_huge(*pud))
> + return;
> + /* recursively clean up all pmds if needed */
> + pmd = (pmd_t *)(pud_val(*pud) & PAGE_MASK);
> + for (i = 0; i < SEGMENT_TABLE_ENTRIES; i++)
> + cleanup_pmd(pmd + i);
> + /* free the corresponding segment table */
> + free_pages(pmd);
> +}
> +
> +/*
> + * Set the DAT entry for the given level of the given virtual address. If a
> + * mapping already existed, it is overwritten. If an existing mapping with
> + * smaller pages existed, all the lower tables are freed.
> + * Returns the pointer to the DAT table entry.
> + * @pgtable root of the page tables
> + * @val the new value for the DAT table entry
> + * @vaddr the virtual address
> + * @level 3 for pud (region 3), 4 for pmd (segment) and 5 for pte (pages)
> + */
> +static void *set_dat_entry(pgd_t *pgtable, unsigned long val, void *vaddr, enum pgt_level level)
> +{
> + unsigned long old, *res;
> +
> + res = dat_get_and_invalidate(pgtable, vaddr, level, &old);
> + if (level == pgtable_level_pmd)
> + cleanup_pmd((pmd_t *)&old);
> + if (level == pgtable_level_pud)
> + cleanup_pud((pud_t *)&old);
> + *res = val;
> + return res;
> }
>
> pteval_t *install_page(pgd_t *pgtable, phys_addr_t phys, void *vaddr)
> {
> - return set_pte(pgtable, __pa(phys), vaddr);
> + assert(IS_ALIGNED(phys, PAGE_SIZE));
> + assert(IS_ALIGNED((uintptr_t)vaddr, PAGE_SIZE));
> + return set_dat_entry(pgtable, phys, vaddr, pgtable_level_pte);
> +}
> +
> +pmdval_t *install_large_page(pgd_t *pgtable, phys_addr_t phys, void *vaddr)
> +{
> + assert(IS_ALIGNED(phys, SZ_1M));
> + assert(IS_ALIGNED((uintptr_t)vaddr, SZ_1M));
> + return set_dat_entry(pgtable, phys | SEGMENT_ENTRY_FC, vaddr, pgtable_level_pmd);
> +}
> +
> +pudval_t *install_huge_page(pgd_t *pgtable, phys_addr_t phys, void *vaddr)
> +{
> + assert(IS_ALIGNED(phys, SZ_2G));
> + assert(IS_ALIGNED((uintptr_t)vaddr, SZ_2G));
> + return set_dat_entry(pgtable, phys | REGION3_ENTRY_FC | REGION_ENTRY_TT_REGION3, vaddr, pgtable_level_pud);
> }
>
> -void protect_page(void *vaddr, unsigned long prot)
> +void protect_dat_entry(void *vaddr, unsigned long prot, enum pgt_level level)
> {
> - pteval_t *p_pte = get_pte(table_root, (uintptr_t)vaddr);
> - pteval_t n_pte = *p_pte | prot;
> + unsigned long old, *ptr;
>
> - set_pte(table_root, n_pte, vaddr);
> + ptr = dat_get_and_invalidate(table_root, vaddr, level, &old);
> + *ptr = old | prot;
> }
>
> -void unprotect_page(void *vaddr, unsigned long prot)
> +void unprotect_dat_entry(void *vaddr, unsigned long prot, enum pgt_level level)
> {
> - pteval_t *p_pte = get_pte(table_root, (uintptr_t)vaddr);
> - pteval_t n_pte = *p_pte & ~prot;
> + unsigned long old, *ptr;
>
> - set_pte(table_root, n_pte, vaddr);
> + ptr = dat_get_and_invalidate(table_root, vaddr, level, &old);
> + *ptr = old & ~prot;
> }
>
> void protect_range(void *start, unsigned long len, unsigned long prot)
> @@ -102,7 +316,7 @@ void protect_range(void *start, unsigned long len, unsigned long prot)
>
> len &= PAGE_MASK;
> for (; len; len -= PAGE_SIZE, curr += PAGE_SIZE)
> - protect_page((void *)curr, prot);
> + protect_dat_entry((void *)curr, prot, 5);
> }
>
> void unprotect_range(void *start, unsigned long len, unsigned long prot)
> @@ -111,7 +325,7 @@ void unprotect_range(void *start, unsigned long len, unsigned long prot)
>
> len &= PAGE_MASK;
> for (; len; len -= PAGE_SIZE, curr += PAGE_SIZE)
> - unprotect_page((void *)curr, prot);
> + unprotect_dat_entry((void *)curr, prot, 5);
> }
>
> static void setup_identity(pgd_t *pgtable, phys_addr_t start_addr,
>
next prev parent reply other threads:[~2021-06-18 7:36 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-06-11 14:06 [kvm-unit-tests PATCH v5 0/7] s390: Add support for large pages Claudio Imbrenda
2021-06-11 14:06 ` [kvm-unit-tests PATCH v5 1/7] s390x: lib: add and use macros for control register bits Claudio Imbrenda
2021-06-11 14:07 ` [kvm-unit-tests PATCH v5 2/7] libcflat: add SZ_1M and SZ_2G Claudio Imbrenda
2021-06-11 14:07 ` [kvm-unit-tests PATCH v5 3/7] s390x: lib: fix pgtable.h Claudio Imbrenda
2021-06-11 14:07 ` [kvm-unit-tests PATCH v5 4/7] s390x: lib: Add idte and other huge pages functions/macros Claudio Imbrenda
2021-06-11 14:07 ` [kvm-unit-tests PATCH v5 5/7] s390x: lib: add teid union and clear teid from lowcore Claudio Imbrenda
2021-06-11 14:07 ` [kvm-unit-tests PATCH v5 6/7] s390x: mmu: add support for large pages Claudio Imbrenda
2021-06-18 7:36 ` Janosch Frank [this message]
2021-06-11 14:07 ` [kvm-unit-tests PATCH v5 7/7] s390x: edat test Claudio Imbrenda
2021-06-18 7:36 ` [kvm-unit-tests PATCH v5 0/7] s390: Add support for large pages Janosch Frank
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=ac930fdd-53e9-cbc5-687d-8d99d968a3a1@linux.ibm.com \
--to=frankja@linux.ibm.com \
--cc=cohuck@redhat.com \
--cc=david@redhat.com \
--cc=imbrenda@linux.ibm.com \
--cc=kvm@vger.kernel.org \
--cc=linux-s390@vger.kernel.org \
--cc=thuth@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).