* [PATCH v3] mm: pagewalk: Fix walk for hugepage tables
@ 2021-06-25  5:10 Christophe Leroy
  2021-06-28  1:12 ` Andrew Morton
  2021-06-28  6:03 ` Aneesh Kumar K.V
  0 siblings, 2 replies; 5+ messages in thread
From: Christophe Leroy @ 2021-06-25  5:10 UTC (permalink / raw)
  To: Steven Price, akpm, linux-mm
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, dja,
	Oliver O'Halloran, linux-arch, linuxppc-dev, linux-kernel

Pagewalk ignores hugepd entries and walks down the tables as if
they were traditional entries, leading to crazy results.

Add walk_hugepd_range() and use it to walk hugepage tables.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Steven Price <steven.price@arm.com>
---
v3:
- Rebased on next-20210624 (no change since v2)
- Added Steven's Reviewed-by
- Sent as standalone for merge via mm

v2:
- Add a guard for NULL ops->pte_entry
- Take mm->page_table_lock when walking hugepage table, as suggested by follow_huge_pd()
---
 mm/pagewalk.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..9b3db11a4d1d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+			     unsigned long end, struct mm_walk *walk, int pdshift)
+{
+	int err = 0;
+	const struct mm_walk_ops *ops = walk->ops;
+	int shift = hugepd_shift(*phpd);
+	int page_size = 1 << shift;
+
+	if (!ops->pte_entry)
+		return 0;
+
+	if (addr & (page_size - 1))
+		return 0;
+
+	for (;;) {
+		pte_t *pte;
+
+		spin_lock(&walk->mm->page_table_lock);
+		pte = hugepte_offset(*phpd, addr, pdshift);
+		err = ops->pte_entry(pte, addr, addr + page_size, walk);
+		spin_unlock(&walk->mm->page_table_lock);
+
+		if (err)
+			break;
+		if (addr >= end - page_size)
+			break;
+		addr += page_size;
+	}
+	return err;
+}
+#else
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+			     unsigned long end, struct mm_walk *walk, int pdshift)
+{
+	return 0;
+}
+#endif
+
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			  struct mm_walk *walk)
 {
@@ -108,7 +147,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 				goto again;
 		}
 
-		err = walk_pte_range(pmd, addr, next, walk);
+		if (is_hugepd(__hugepd(pmd_val(*pmd))))
+			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
+		else
+			err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
@@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 		if (pud_none(*pud))
 			goto again;
 
-		err = walk_pmd_range(pud, addr, next, walk);
+		if (is_hugepd(__hugepd(pud_val(*pud))))
+			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
+		else
+			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
@@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 			if (err)
 				break;
 		}
-		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+		if (is_hugepd(__hugepd(p4d_val(*p4d))))
+			err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
+		else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
 			err = walk_pud_range(p4d, addr, next, walk);
 		if (err)
 			break;
@@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
 			if (err)
 				break;
 		}
-		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
-		    ops->pte_entry)
+		if (is_hugepd(__hugepd(pgd_val(*pgd))))
+			err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
+		else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
 			err = walk_p4d_range(pgd, addr, next, walk);
 		if (err)
 			break;
-- 
2.25.0


* Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables
  2021-06-25  5:10 [PATCH v3] mm: pagewalk: Fix walk for hugepage tables Christophe Leroy
@ 2021-06-28  1:12 ` Andrew Morton
  2021-06-28  5:56   ` Christophe Leroy
  2021-06-28  6:03 ` Aneesh Kumar K.V
  1 sibling, 1 reply; 5+ messages in thread
From: Andrew Morton @ 2021-06-28  1:12 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Steven Price, linux-mm, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, dja, Oliver O'Halloran, linux-arch,
	linuxppc-dev, linux-kernel

On Fri, 25 Jun 2021 05:10:12 +0000 (UTC) Christophe Leroy <christophe.leroy@csgroup.eu> wrote:

> Pagewalk ignores hugepd entries and walks down the tables as if
> they were traditional entries, leading to crazy results.
> 
> Add walk_hugepd_range() and use it to walk hugepage tables.

More details, please?  I assume "crazy results" are userspace visible?
For how long has this bug existed?  Is a -stable backport needed?  Has
a Fixes: commit been identified?  etcetera!

* Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables
  2021-06-28  1:12 ` Andrew Morton
@ 2021-06-28  5:56   ` Christophe Leroy
  0 siblings, 0 replies; 5+ messages in thread
From: Christophe Leroy @ 2021-06-28  5:56 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Steven Price, linux-mm, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, dja, Oliver O'Halloran, linux-arch,
	linuxppc-dev, linux-kernel



On 28/06/2021 at 03:12, Andrew Morton wrote:
> On Fri, 25 Jun 2021 05:10:12 +0000 (UTC) Christophe Leroy <christophe.leroy@csgroup.eu> wrote:
> 
>> Pagewalk ignores hugepd entries and walks down the tables as if
>> they were traditional entries, leading to crazy results.
>>
>> Add walk_hugepd_range() and use it to walk hugepage tables.
> 
> More details, please?  I assume "crazy results" are userspace visible?
> For how long has this bug existed?  Is a -stable backport needed?  Has
> a Fixes: commit been identified?  etcetera!
> 

I discovered the problem while porting powerpc to the generic page table
dump. The generic page table dump uses walk_page_range_novma().

Yes, "crazy result" is that when dumping /sys/kernel/debug/kernel_page_tables, you get random 
entries because at the time being the pagewalk code sees huge page directories as standard page tables.
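
Roughly, the dump drives the walker like this (a minimal sketch in the
spirit of mm/ptdump.c, not the actual code; pr_info() and the format
string are only for illustration):

	#include <linux/mm.h>
	#include <linux/pagewalk.h>

	/* Report every PTE-level entry the walker hands us. */
	static int dump_pte_entry(pte_t *pte, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
	{
		pr_info("0x%lx-0x%lx: pte 0x%lx\n", addr, next,
			(unsigned long)pte_val(*pte));
		return 0;
	}

	static const struct mm_walk_ops dump_ops = {
		.pte_entry = dump_pte_entry,
	};

	static void dump_kernel_range(unsigned long start, unsigned long end)
	{
		mmap_read_lock(&init_mm);
		walk_page_range_novma(&init_mm, start, end, &dump_ops,
				      NULL, NULL);
		mmap_read_unlock(&init_mm);
	}

With a huge page directory in the range, that pte_entry callback gets
handed random bits interpreted as PTEs.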

The bug has always existed as far as I can see, but since no architecture
other than powerpc uses huge page directories, it only shows up now that
powerpc is trying to use the generic page walking code.

So I don't think it is worth a backport to -stable, and I am not sure a
Fixes: tag is needed.

IIUC, hugepd was first introduced into mm by commit cbd34da7dc9a ("mm:
move the powerpc hugepd code to mm/gup.c").

Before that, hugepd was internal to powerpc.

I guess you are asking about a Fixes: tag and backporting because of the
patch subject. Should I reword the patch subject to something like "mm:
enable the generic page walk code to walk huge page directories"?

* Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables
  2021-06-25  5:10 [PATCH v3] mm: pagewalk: Fix walk for hugepage tables Christophe Leroy
  2021-06-28  1:12 ` Andrew Morton
@ 2021-06-28  6:03 ` Aneesh Kumar K.V
  2021-06-28  6:19   ` Christophe Leroy
  1 sibling, 1 reply; 5+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-28  6:03 UTC (permalink / raw)
  To: Christophe Leroy, Steven Price, akpm, linux-mm
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, dja,
	Oliver O'Halloran, linux-arch, linuxppc-dev, linux-kernel

Christophe Leroy <christophe.leroy@csgroup.eu> writes:

> Pagewalk ignores hugepd entries and walks down the tables as if
> they were traditional entries, leading to crazy results.

But we do handle hugetlb separately:

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

Are we using the hugepd format for non-hugetlb entries?


* Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables
  2021-06-28  6:03 ` Aneesh Kumar K.V
@ 2021-06-28  6:19   ` Christophe Leroy
  0 siblings, 0 replies; 5+ messages in thread
From: Christophe Leroy @ 2021-06-28  6:19 UTC (permalink / raw)
  To: Aneesh Kumar K.V, Steven Price, akpm, linux-mm
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, dja,
	Oliver O'Halloran, linux-arch, linuxppc-dev, linux-kernel



On 28/06/2021 at 08:03, Aneesh Kumar K.V wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> 
>> Pagewalk ignores hugepd entries and walks down the tables as if
>> they were traditional entries, leading to crazy results.
> 
> But we do handle hugetlb separately
> 
> 	if (vma && is_vm_hugetlb_page(vma)) {
> 		if (ops->hugetlb_entry)
> 			err = walk_hugetlb_range(start, end, walk);
> 	} else
> 		err = walk_pgd_range(start, end, walk);
> 
> Are we using hugepd format for non hugetlb entries?

Yes, on the 8xx we use hugepd for 8M pages for the linear mapping and for
the kasan shadow mapping (see commit bb5f33c06940 ("Merge "Use hugepages
to map kernel mem on 8xx" into next")).

And I'm working on implementing huge VMAP with 8M pages, which will also
make use of hugepd.
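
With this patch, the PMD-level dispatch becomes (illustrative: for an 8M
page hugepd_shift() returns 23, so walk_hugepd_range() computes
page_size = 1 << 23 and reports the whole 8M range through a single
->pte_entry() call instead of mis-reading the directory as a table of
4k PTEs):

	if (is_hugepd(__hugepd(pmd_val(*pmd))))
		err = walk_hugepd_range((hugepd_t *)pmd, addr, next,
					walk, PMD_SHIFT);
	else
		err = walk_pte_range(pmd, addr, next, walk);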
