linuxppc-dev.lists.ozlabs.org archive mirror
* [RFC PATCH v0] powerpc: Fix BUG_ON during memory unplug on radix
@ 2019-06-19  7:45 Bharata B Rao
  2019-06-19  9:06 ` Aneesh Kumar K.V
  2019-06-19 10:17 ` Nicholas Piggin
  0 siblings, 2 replies; 5+ messages in thread
From: Bharata B Rao @ 2019-06-19  7:45 UTC
  To: linuxppc-dev; +Cc: aneesh.kumar, npiggin, Bharata B Rao

We hit the following BUG_ON when memory hotplugged before reboot
is unplugged after reboot:

kernel BUG at arch/powerpc/mm/pgtable-frag.c:113!

 remove_pagetable+0x594/0x6a0
 (unreliable)
 remove_pagetable+0x94/0x6a0
 vmemmap_free+0x394/0x410
 sparse_remove_one_section+0x26c/0x2e8
 __remove_pages+0x428/0x540
 arch_remove_memory+0xd0/0x170
 __remove_memory+0xd4/0x1a0
 dlpar_remove_lmb+0xbc/0x110
 dlpar_memory+0xa80/0xd20
 handle_dlpar_errorlog+0xa8/0x160
 pseries_hp_work_fn+0x2c/0x60
 process_one_work+0x46c/0x860
 worker_thread+0x364/0x5e0
 kthread+0x1b0/0x1c0
 ret_from_kernel_thread+0x5c/0x68

This occurs because, during reboot-after-hotplug, the hotplugged
memory range gets initialized as regular memory and its page
tables are set up using the memblock allocator. This means that
the PMD and PTE fragment counts for those PMD and PTE pages
were never initialized.

Fixing this involves three aspects:

- Walk the init_mm page tables from mem_init() and initialize
  the PMD and PTE fragment counts appropriately.
- When doing early allocation of PMD (and PUD) pages, allocate
  at PAGE_SIZE granularity so that the complete page is
  available for us to set the fragment count, which is part of
  struct page.
- When a PMD or PTE page is freed, check whether it came from
  the memblock allocator and free it appropriately.

Reported-by: Srikanth Aithal <sraithal@linux.vnet.ibm.com>
Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |  1 +
 arch/powerpc/include/asm/sparsemem.h       |  1 +
 arch/powerpc/mm/book3s64/pgtable.c         | 12 +++-
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 67 +++++++++++++++++++++-
 arch/powerpc/mm/mem.c                      |  5 ++
 arch/powerpc/mm/pgtable-frag.c             |  5 +-
 6 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 574eca33f893..4320f2790e8d 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -285,6 +285,7 @@ static inline unsigned long radix__get_tree_size(void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
 int radix__remove_section_mapping(unsigned long start, unsigned long end);
+void radix__fixup_pgtable_fragments(void);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index 3192d454a733..e662f9232d35 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -15,6 +15,7 @@
 #ifdef CONFIG_MEMORY_HOTPLUG
 extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
 extern int remove_section_mapping(unsigned long start, unsigned long end);
+void fixup_pgtable_fragments(void);
 
 #ifdef CONFIG_PPC_BOOK3S_64
 extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 01bc9663360d..7efe9cc16b39 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
 
 	return hash__remove_section_mapping(start, end);
 }
+
+void fixup_pgtable_fragments(void)
+{
+	if (radix_enabled())
+		radix__fixup_pgtable_fragments();
+}
+
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 void __init mmu_partition_table_init(void)
@@ -320,7 +327,10 @@ void pmd_fragment_free(unsigned long *pmd)
 	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
 	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
 		pgtable_pmd_page_dtor(page);
-		__free_page(page);
+		if (PageReserved(page))
+			free_reserved_page(page);
+		else
+			__free_page(page);
 	}
 }
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 273ae66a9a45..402e8da28cab 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -32,6 +32,69 @@
 unsigned int mmu_pid_bits;
 unsigned int mmu_base_pid;
 
+static void fixup_pmd_fragments(pmd_t *pmd)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
+		pte_t *pte;
+		struct page *page;
+
+		if (pmd_none(*pmd))
+			continue;
+		if (pmd_huge(*pmd))
+			continue;
+
+		pte = pte_offset_kernel(pmd, 0);
+		if (!pte_none(*pte)) {
+			page = virt_to_page(pte);
+			atomic_inc(&page->pt_frag_refcount);
+		}
+	}
+}
+
+static void fixup_pud_fragments(pud_t *pud)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+		pmd_t *pmd;
+		struct page *page;
+
+		if (pud_none(*pud))
+			continue;
+		if (pud_huge(*pud))
+			continue;
+
+		pmd = pmd_offset(pud, 0);
+		if (!pmd_none(*pmd)) {
+			page = virt_to_page(pmd);
+			atomic_inc(&page->pt_frag_refcount);
+		}
+		fixup_pmd_fragments(pmd);
+	}
+}
+
+void radix__fixup_pgtable_fragments(void)
+{
+	int i;
+	pgd_t *pgd = pgd_offset_k(0UL);
+
+	spin_lock(&init_mm.page_table_lock);
+	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
+		pud_t *pud;
+
+		if (pgd_none(*pgd))
+			continue;
+		if (pgd_huge(*pgd))
+			continue;
+
+		pud = pud_offset(pgd, 0);
+		fixup_pud_fragments(pud);
+	}
+	spin_unlock(&init_mm.page_table_lock);
+}
+
 static int native_register_process_table(unsigned long base, unsigned long pg_sz,
 					 unsigned long table_size)
 {
@@ -80,7 +143,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
 
 	pgdp = pgd_offset_k(ea);
 	if (pgd_none(*pgdp)) {
-		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
 						region_start, region_end);
 		pgd_populate(&init_mm, pgdp, pudp);
 	}
@@ -90,7 +153,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
 		goto set_the_pte;
 	}
 	if (pud_none(*pudp)) {
-		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
+		pmdp = early_alloc_pgtable(PAGE_SIZE, nid,
 						region_start, region_end);
 		pud_populate(&init_mm, pudp, pmdp);
 	}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index cba29131bccc..a8788b404266 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -51,6 +51,10 @@
 
 #include <mm/mmu_decl.h>
 
+void __weak fixup_pgtable_fragments(void)
+{
+}
+
 #ifndef CPU_FTR_COHERENT_ICACHE
 #define CPU_FTR_COHERENT_ICACHE	0	/* XXX for now */
 #define CPU_FTR_NOEXECUTE	0
@@ -276,6 +280,7 @@ void __init mem_init(void)
 	set_max_mapnr(max_pfn);
 	memblock_free_all();
 
+	fixup_pgtable_fragments();
 #ifdef CONFIG_HIGHMEM
 	{
 		unsigned long pfn, highmem_mapnr;
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index a7b05214760c..694de7c731aa 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -114,6 +114,9 @@ void pte_fragment_free(unsigned long *table, int kernel)
 	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
 		if (!kernel)
 			pgtable_page_dtor(page);
-		__free_page(page);
+		if (PageReserved(page))
+			free_reserved_page(page);
+		else
+			__free_page(page);
 	}
 }
-- 
2.17.1



* Re: [RFC PATCH v0] powerpc: Fix BUG_ON during memory unplug on radix
  2019-06-19  7:45 [RFC PATCH v0] powerpc: Fix BUG_ON during memory unplug on radix Bharata B Rao
@ 2019-06-19  9:06 ` Aneesh Kumar K.V
  2019-06-19 14:40   ` Bharata B Rao
  2019-06-19 10:17 ` Nicholas Piggin
  1 sibling, 1 reply; 5+ messages in thread
From: Aneesh Kumar K.V @ 2019-06-19  9:06 UTC
  To: Bharata B Rao, linuxppc-dev; +Cc: aneesh.kumar, npiggin, Bharata B Rao

Bharata B Rao <bharata@linux.ibm.com> writes:

> We hit the following BUG_ON when memory hotplugged before reboot
> is unplugged after reboot:
>
> kernel BUG at arch/powerpc/mm/pgtable-frag.c:113!
>
>  remove_pagetable+0x594/0x6a0
>  (unreliable)
>  remove_pagetable+0x94/0x6a0
>  vmemmap_free+0x394/0x410
>  sparse_remove_one_section+0x26c/0x2e8
>  __remove_pages+0x428/0x540
>  arch_remove_memory+0xd0/0x170
>  __remove_memory+0xd4/0x1a0
>  dlpar_remove_lmb+0xbc/0x110
>  dlpar_memory+0xa80/0xd20
>  handle_dlpar_errorlog+0xa8/0x160
>  pseries_hp_work_fn+0x2c/0x60
>  process_one_work+0x46c/0x860
>  worker_thread+0x364/0x5e0
>  kthread+0x1b0/0x1c0
>  ret_from_kernel_thread+0x5c/0x68
>
> This occurs because, during reboot-after-hotplug, the hotplugged
> memory range gets initialized as regular memory and its page
> tables are set up using the memblock allocator. This means that
> the PMD and PTE fragment counts for those PMD and PTE pages
> were never initialized.
>
> Fixing this involves three aspects:
>
> - Walk the init_mm page tables from mem_init() and initialize
>   the PMD and PTE fragment counts appropriately.
> - When doing early allocation of PMD (and PUD) pages, allocate
>   at PAGE_SIZE granularity so that the complete page is
>   available for us to set the fragment count, which is part of
>   struct page.


That is an important change now. For early page tables we now allocate
PAGE_SIZE tables and hence we consider them as pages with fragment
count 1. You may also want to explain why here. I guess the challenge
is that we can't clearly control how the rest of the page will get
used, and we are not sure it will all be allocated for backing page
table pages.

> - When a PMD or PTE page is freed, check whether it came from
>   the memblock allocator and free it appropriately.
>
> Reported-by: Srikanth Aithal <sraithal@linux.vnet.ibm.com>
> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |  1 +
>  arch/powerpc/include/asm/sparsemem.h       |  1 +
>  arch/powerpc/mm/book3s64/pgtable.c         | 12 +++-
>  arch/powerpc/mm/book3s64/radix_pgtable.c   | 67 +++++++++++++++++++++-
>  arch/powerpc/mm/mem.c                      |  5 ++
>  arch/powerpc/mm/pgtable-frag.c             |  5 +-
>  6 files changed, 87 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 574eca33f893..4320f2790e8d 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -285,6 +285,7 @@ static inline unsigned long radix__get_tree_size(void)
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
>  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index 3192d454a733..e662f9232d35 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -15,6 +15,7 @@
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
>  extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>
>  #ifdef CONFIG_PPC_BOOK3S_64
>  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 01bc9663360d..7efe9cc16b39 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>
>  	return hash__remove_section_mapping(start, end);
>  }
> +
> +void fixup_pgtable_fragments(void)
> +{
> +	if (radix_enabled())
> +		radix__fixup_pgtable_fragments();
> +}
> +
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>
>  void __init mmu_partition_table_init(void)
> @@ -320,7 +327,10 @@ void pmd_fragment_free(unsigned long *pmd)
>  	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
>  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
>  		pgtable_pmd_page_dtor(page);
> -		__free_page(page);
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else
> +			__free_page(page);
>  	}
>  }
>
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 273ae66a9a45..402e8da28cab 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -32,6 +32,69 @@
>  unsigned int mmu_pid_bits;
>  unsigned int mmu_base_pid;
>
> +static void fixup_pmd_fragments(pmd_t *pmd)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> +		pte_t *pte;
> +		struct page *page;
> +
> +		if (pmd_none(*pmd))
> +			continue;
> +		if (pmd_huge(*pmd))
> +			continue;
> +
> +		pte = pte_offset_kernel(pmd, 0);
> +		if (!pte_none(*pte)) {
> +			page = virt_to_page(pte);
> +			atomic_inc(&page->pt_frag_refcount);
> +		}
> +	}
> +}


That naming is confusing. You are fixing up fragments used for the
level-4 table here. I would call them pte fragments?


> +
> +static void fixup_pud_fragments(pud_t *pud)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +		pmd_t *pmd;
> +		struct page *page;
> +
> +		if (pud_none(*pud))
> +			continue;
> +		if (pud_huge(*pud))
> +			continue;
> +
> +		pmd = pmd_offset(pud, 0);
> +		if (!pmd_none(*pmd)) {
> +			page = virt_to_page(pmd);
> +			atomic_inc(&page->pt_frag_refcount);
> +		}
> +		fixup_pmd_fragments(pmd);
> +	}
> +}
> +


How do we handle pud table free? That is going to be tricky: in the
general allocation case they are allocated out of slab and we free them
back to slab. With pages allocated out of memblock, we need to
special-case them?
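Something like the below, maybe (a rough, untested sketch: it assumes
PageReserved() identifies the memblock-allocated tables, and that an
early pud table now occupies a full page)?

	void pud_free(struct mm_struct *mm, pud_t *pud)
	{
		struct page *page = virt_to_page(pud);

		/*
		 * Early pud tables come from memblock, not from the
		 * PGT_CACHE slab, so they can't go back to the slab.
		 */
		if (PageReserved(page))
			free_reserved_page(page);
		else
			kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
	}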


> +void radix__fixup_pgtable_fragments(void)
> +{
> +	int i;
> +	pgd_t *pgd = pgd_offset_k(0UL);
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +		pud_t *pud;
> +
> +		if (pgd_none(*pgd))
> +			continue;
> +		if (pgd_huge(*pgd))
> +			continue;
> +
> +		pud = pud_offset(pgd, 0);
> +		fixup_pud_fragments(pud);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
>  static int native_register_process_table(unsigned long base, unsigned long pg_sz,
>  					 unsigned long table_size)
>  {
> @@ -80,7 +143,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>
>  	pgdp = pgd_offset_k(ea);
>  	if (pgd_none(*pgdp)) {
> -		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> +		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
>  						region_start, region_end);
>  		pgd_populate(&init_mm, pgdp, pudp);
>  	}
> @@ -90,7 +153,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>  		goto set_the_pte;
>  	}
>  	if (pud_none(*pudp)) {
> -		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> +		pmdp = early_alloc_pgtable(PAGE_SIZE, nid,
>  						region_start, region_end);
>  		pud_populate(&init_mm, pudp, pmdp);
>  	}
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index cba29131bccc..a8788b404266 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -51,6 +51,10 @@
>
>  #include <mm/mmu_decl.h>
>
> +void __weak fixup_pgtable_fragments(void)
> +{
> +}
> +
>  #ifndef CPU_FTR_COHERENT_ICACHE
>  #define CPU_FTR_COHERENT_ICACHE	0	/* XXX for now */
>  #define CPU_FTR_NOEXECUTE	0
> @@ -276,6 +280,7 @@ void __init mem_init(void)
>  	set_max_mapnr(max_pfn);
>  	memblock_free_all();
>
> +	fixup_pgtable_fragments();
>  #ifdef CONFIG_HIGHMEM
>  	{
>  		unsigned long pfn, highmem_mapnr;
> diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
> index a7b05214760c..694de7c731aa 100644
> --- a/arch/powerpc/mm/pgtable-frag.c
> +++ b/arch/powerpc/mm/pgtable-frag.c
> @@ -114,6 +114,9 @@ void pte_fragment_free(unsigned long *table, int kernel)
>  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
>  		if (!kernel)
>  			pgtable_page_dtor(page);
> -		__free_page(page);
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else
> +			__free_page(page);
>  	}
>  }
> -- 
> 2.17.1



* Re: [RFC PATCH v0] powerpc: Fix BUG_ON during memory unplug on radix
  2019-06-19  7:45 [RFC PATCH v0] powerpc: Fix BUG_ON during memory unplug on radix Bharata B Rao
  2019-06-19  9:06 ` Aneesh Kumar K.V
@ 2019-06-19 10:17 ` Nicholas Piggin
  2019-06-19 14:36   ` Bharata B Rao
  1 sibling, 1 reply; 5+ messages in thread
From: Nicholas Piggin @ 2019-06-19 10:17 UTC
  To: Bharata B Rao, linuxppc-dev; +Cc: aneesh.kumar

Bharata B Rao's message of June 19, 2019 5:45 pm:
> We hit the following BUG_ON when memory hotplugged before reboot
> is unplugged after reboot:
> 
> kernel BUG at arch/powerpc/mm/pgtable-frag.c:113!
> 
>  remove_pagetable+0x594/0x6a0
>  (unreliable)
>  remove_pagetable+0x94/0x6a0
>  vmemmap_free+0x394/0x410
>  sparse_remove_one_section+0x26c/0x2e8
>  __remove_pages+0x428/0x540
>  arch_remove_memory+0xd0/0x170
>  __remove_memory+0xd4/0x1a0
>  dlpar_remove_lmb+0xbc/0x110
>  dlpar_memory+0xa80/0xd20
>  handle_dlpar_errorlog+0xa8/0x160
>  pseries_hp_work_fn+0x2c/0x60
>  process_one_work+0x46c/0x860
>  worker_thread+0x364/0x5e0
>  kthread+0x1b0/0x1c0
>  ret_from_kernel_thread+0x5c/0x68
> 
> This occurs because, during reboot-after-hotplug, the hotplugged
> memory range gets initialized as regular memory and its page
> tables are set up using the memblock allocator. This means that
> the PMD and PTE fragment counts for those PMD and PTE pages
> were never initialized.
> 
> Fixing this involves three aspects:
> 
> - Walk the init_mm page tables from mem_init() and initialize
>   the PMD and PTE fragment counts appropriately.
> - When doing early allocation of PMD (and PUD) pages, allocate
>   at PAGE_SIZE granularity so that the complete page is
>   available for us to set the fragment count, which is part of
>   struct page.
> - When a PMD or PTE page is freed, check whether it came from
>   the memblock allocator and free it appropriately.
> 
> Reported-by: Srikanth Aithal <sraithal@linux.vnet.ibm.com>
> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |  1 +
>  arch/powerpc/include/asm/sparsemem.h       |  1 +
>  arch/powerpc/mm/book3s64/pgtable.c         | 12 +++-
>  arch/powerpc/mm/book3s64/radix_pgtable.c   | 67 +++++++++++++++++++++-
>  arch/powerpc/mm/mem.c                      |  5 ++
>  arch/powerpc/mm/pgtable-frag.c             |  5 +-
>  6 files changed, 87 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 574eca33f893..4320f2790e8d 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -285,6 +285,7 @@ static inline unsigned long radix__get_tree_size(void)
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
>  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index 3192d454a733..e662f9232d35 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -15,6 +15,7 @@
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
>  extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>  
>  #ifdef CONFIG_PPC_BOOK3S_64
>  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 01bc9663360d..7efe9cc16b39 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>  
>  	return hash__remove_section_mapping(start, end);
>  }
> +
> +void fixup_pgtable_fragments(void)
> +{
> +	if (radix_enabled())
> +		radix__fixup_pgtable_fragments();
> +}
> +
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  void __init mmu_partition_table_init(void)
> @@ -320,7 +327,10 @@ void pmd_fragment_free(unsigned long *pmd)
>  	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
>  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
>  		pgtable_pmd_page_dtor(page);
> -		__free_page(page);
> +		if (PageReserved(page))
> +			free_reserved_page(page);

Hmm. Rather than adding this special case here, I wonder if you can
just go along in your fixup walk and convert all these pages to
non-reserved pages?

ClearPageReserved ; init_page_count ; adjust_managed_page_count ; 
should do the trick, right?
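Something like this during the walk, say (an untested sketch;
fixup_pgtable_page() is a made-up helper name, not existing code):

	/*
	 * Turn a memblock-allocated (reserved) page table page into a
	 * normally refcounted page, so the regular __free_page() path
	 * can handle it later without any PageReserved special casing.
	 */
	static void fixup_pgtable_page(struct page *page)
	{
		if (!PageReserved(page))
			return;
		ClearPageReserved(page);
		init_page_count(page);
		adjust_managed_page_count(page, 1);
	}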


> +		else
> +			__free_page(page);

Thanks,
Nick


* Re: [RFC PATCH v0] powerpc: Fix BUG_ON during memory unplug on radix
  2019-06-19 10:17 ` Nicholas Piggin
@ 2019-06-19 14:36   ` Bharata B Rao
  0 siblings, 0 replies; 5+ messages in thread
From: Bharata B Rao @ 2019-06-19 14:36 UTC
  To: Nicholas Piggin; +Cc: linuxppc-dev, aneesh.kumar

On Wed, Jun 19, 2019 at 08:17:01PM +1000, Nicholas Piggin wrote:
> Bharata B Rao's message of June 19, 2019 5:45 pm:
> > We hit the following BUG_ON when memory hotplugged before reboot
> > is unplugged after reboot:
> > 
> > kernel BUG at arch/powerpc/mm/pgtable-frag.c:113!
> > 
> >  remove_pagetable+0x594/0x6a0
> >  (unreliable)
> >  remove_pagetable+0x94/0x6a0
> >  vmemmap_free+0x394/0x410
> >  sparse_remove_one_section+0x26c/0x2e8
> >  __remove_pages+0x428/0x540
> >  arch_remove_memory+0xd0/0x170
> >  __remove_memory+0xd4/0x1a0
> >  dlpar_remove_lmb+0xbc/0x110
> >  dlpar_memory+0xa80/0xd20
> >  handle_dlpar_errorlog+0xa8/0x160
> >  pseries_hp_work_fn+0x2c/0x60
> >  process_one_work+0x46c/0x860
> >  worker_thread+0x364/0x5e0
> >  kthread+0x1b0/0x1c0
> >  ret_from_kernel_thread+0x5c/0x68
> > 
> > This occurs because, during reboot-after-hotplug, the hotplugged
> > memory range gets initialized as regular memory and its page
> > tables are set up using the memblock allocator. This means that
> > the PMD and PTE fragment counts for those PMD and PTE pages
> > were never initialized.
> > 
> > Fixing this involves three aspects:
> > 
> > - Walk the init_mm page tables from mem_init() and initialize
> >   the PMD and PTE fragment counts appropriately.
> > - When doing early allocation of PMD (and PUD) pages, allocate
> >   at PAGE_SIZE granularity so that the complete page is
> >   available for us to set the fragment count, which is part of
> >   struct page.
> > - When a PMD or PTE page is freed, check whether it came from
> >   the memblock allocator and free it appropriately.
> > 
> > Reported-by: Srikanth Aithal <sraithal@linux.vnet.ibm.com>
> > Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> > ---
> >  arch/powerpc/include/asm/book3s/64/radix.h |  1 +
> >  arch/powerpc/include/asm/sparsemem.h       |  1 +
> >  arch/powerpc/mm/book3s64/pgtable.c         | 12 +++-
> >  arch/powerpc/mm/book3s64/radix_pgtable.c   | 67 +++++++++++++++++++++-
> >  arch/powerpc/mm/mem.c                      |  5 ++
> >  arch/powerpc/mm/pgtable-frag.c             |  5 +-
> >  6 files changed, 87 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> > index 574eca33f893..4320f2790e8d 100644
> > --- a/arch/powerpc/include/asm/book3s/64/radix.h
> > +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> > @@ -285,6 +285,7 @@ static inline unsigned long radix__get_tree_size(void)
> >  #ifdef CONFIG_MEMORY_HOTPLUG
> >  int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
> >  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> > +void radix__fixup_pgtable_fragments(void);
> >  #endif /* CONFIG_MEMORY_HOTPLUG */
> >  #endif /* __ASSEMBLY__ */
> >  #endif
> > diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> > index 3192d454a733..e662f9232d35 100644
> > --- a/arch/powerpc/include/asm/sparsemem.h
> > +++ b/arch/powerpc/include/asm/sparsemem.h
> > @@ -15,6 +15,7 @@
> >  #ifdef CONFIG_MEMORY_HOTPLUG
> >  extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
> >  extern int remove_section_mapping(unsigned long start, unsigned long end);
> > +void fixup_pgtable_fragments(void);
> >  
> >  #ifdef CONFIG_PPC_BOOK3S_64
> >  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> > diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> > index 01bc9663360d..7efe9cc16b39 100644
> > --- a/arch/powerpc/mm/book3s64/pgtable.c
> > +++ b/arch/powerpc/mm/book3s64/pgtable.c
> > @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
> >  
> >  	return hash__remove_section_mapping(start, end);
> >  }
> > +
> > +void fixup_pgtable_fragments(void)
> > +{
> > +	if (radix_enabled())
> > +		radix__fixup_pgtable_fragments();
> > +}
> > +
> >  #endif /* CONFIG_MEMORY_HOTPLUG */
> >  
> >  void __init mmu_partition_table_init(void)
> > @@ -320,7 +327,10 @@ void pmd_fragment_free(unsigned long *pmd)
> >  	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
> >  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> >  		pgtable_pmd_page_dtor(page);
> > -		__free_page(page);
> > +		if (PageReserved(page))
> > +			free_reserved_page(page);
> 
> Hmm. Rather than adding this special case here, I wonder if you can
> just go along in your fixup walk and convert all these pages to
> non-reserved pages?
> 
> ClearPageReserved ; init_page_count ; adjust_managed_page_count ; 
> should do the trick, right?

Yes, that should work. We are anyway fixing the frag count during the
walk; we might as well do all of the above too and avoid the special
case in the free path.
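I.e., in fixup_pmd_fragments(), something like this (a rough, untested
sketch of the plan):

	pte = pte_offset_kernel(pmd, 0);
	if (!pte_none(*pte)) {
		page = virt_to_page(pte);
		if (PageReserved(page)) {
			/* convert the memblock page into a normal page */
			ClearPageReserved(page);
			init_page_count(page);
			adjust_managed_page_count(page, 1);
		}
		atomic_inc(&page->pt_frag_refcount);
	}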

Regards,
Bharata.



* Re: [RFC PATCH v0] powerpc: Fix BUG_ON during memory unplug on radix
  2019-06-19  9:06 ` Aneesh Kumar K.V
@ 2019-06-19 14:40   ` Bharata B Rao
  0 siblings, 0 replies; 5+ messages in thread
From: Bharata B Rao @ 2019-06-19 14:40 UTC
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, aneesh.kumar, npiggin

On Wed, Jun 19, 2019 at 02:36:54PM +0530, Aneesh Kumar K.V wrote:
> Bharata B Rao <bharata@linux.ibm.com> writes:
> 
> > We hit the following BUG_ON when memory hotplugged before reboot
> > is unplugged after reboot:
> >
> > kernel BUG at arch/powerpc/mm/pgtable-frag.c:113!
> >
> >  remove_pagetable+0x594/0x6a0
> >  (unreliable)
> >  remove_pagetable+0x94/0x6a0
> >  vmemmap_free+0x394/0x410
> >  sparse_remove_one_section+0x26c/0x2e8
> >  __remove_pages+0x428/0x540
> >  arch_remove_memory+0xd0/0x170
> >  __remove_memory+0xd4/0x1a0
> >  dlpar_remove_lmb+0xbc/0x110
> >  dlpar_memory+0xa80/0xd20
> >  handle_dlpar_errorlog+0xa8/0x160
> >  pseries_hp_work_fn+0x2c/0x60
> >  process_one_work+0x46c/0x860
> >  worker_thread+0x364/0x5e0
> >  kthread+0x1b0/0x1c0
> >  ret_from_kernel_thread+0x5c/0x68
> >
> > This occurs because, during reboot-after-hotplug, the hotplugged
> > memory range gets initialized as regular memory and its page
> > tables are set up using the memblock allocator. This means that
> > the PMD and PTE fragment counts for those PMD and PTE pages
> > were never initialized.
> >
> > Fixing this involves three aspects:
> >
> > - Walk the init_mm page tables from mem_init() and initialize
> >   the PMD and PTE fragment counts appropriately.
> > - When doing early allocation of PMD (and PUD) pages, allocate
> >   at PAGE_SIZE granularity so that the complete page is
> >   available for us to set the fragment count, which is part of
> >   struct page.
> 
> 
> That is an important change now. For early page tables we now allocate
> PAGE_SIZE tables and hence we consider them as pages with fragment
> count 1. You may also want to explain why here.

Sure, will make this clear in my next version.

> I guess the challenge is that we can't clearly control how the rest
> of the page will get used, and we are not sure it will all be
> allocated for backing page table pages.
> 
> > - When a PMD or PTE page is freed, check whether it came from
> >   the memblock allocator and free it appropriately.
> >
> > Reported-by: Srikanth Aithal <sraithal@linux.vnet.ibm.com>
> > Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> > ---
> >  arch/powerpc/include/asm/book3s/64/radix.h |  1 +
> >  arch/powerpc/include/asm/sparsemem.h       |  1 +
> >  arch/powerpc/mm/book3s64/pgtable.c         | 12 +++-
> >  arch/powerpc/mm/book3s64/radix_pgtable.c   | 67 +++++++++++++++++++++-
> >  arch/powerpc/mm/mem.c                      |  5 ++
> >  arch/powerpc/mm/pgtable-frag.c             |  5 +-
> >  6 files changed, 87 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> > index 574eca33f893..4320f2790e8d 100644
> > --- a/arch/powerpc/include/asm/book3s/64/radix.h
> > +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> > @@ -285,6 +285,7 @@ static inline unsigned long radix__get_tree_size(void)
> >  #ifdef CONFIG_MEMORY_HOTPLUG
> >  int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
> >  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> > +void radix__fixup_pgtable_fragments(void);
> >  #endif /* CONFIG_MEMORY_HOTPLUG */
> >  #endif /* __ASSEMBLY__ */
> >  #endif
> > diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> > index 3192d454a733..e662f9232d35 100644
> > --- a/arch/powerpc/include/asm/sparsemem.h
> > +++ b/arch/powerpc/include/asm/sparsemem.h
> > @@ -15,6 +15,7 @@
> >  #ifdef CONFIG_MEMORY_HOTPLUG
> >  extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
> >  extern int remove_section_mapping(unsigned long start, unsigned long end);
> > +void fixup_pgtable_fragments(void);
> >
> >  #ifdef CONFIG_PPC_BOOK3S_64
> >  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> > diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> > index 01bc9663360d..7efe9cc16b39 100644
> > --- a/arch/powerpc/mm/book3s64/pgtable.c
> > +++ b/arch/powerpc/mm/book3s64/pgtable.c
> > @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
> >
> >  	return hash__remove_section_mapping(start, end);
> >  }
> > +
> > +void fixup_pgtable_fragments(void)
> > +{
> > +	if (radix_enabled())
> > +		radix__fixup_pgtable_fragments();
> > +}
> > +
> >  #endif /* CONFIG_MEMORY_HOTPLUG */
> >
> >  void __init mmu_partition_table_init(void)
> > @@ -320,7 +327,10 @@ void pmd_fragment_free(unsigned long *pmd)
> >  	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
> >  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> >  		pgtable_pmd_page_dtor(page);
> > -		__free_page(page);
> > +		if (PageReserved(page))
> > +			free_reserved_page(page);
> > +		else
> > +			__free_page(page);
> >  	}
> >  }
> >
> > diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> > index 273ae66a9a45..402e8da28cab 100644
> > --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> > +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> > @@ -32,6 +32,69 @@
> >  unsigned int mmu_pid_bits;
> >  unsigned int mmu_base_pid;
> >
> > +static void fixup_pmd_fragments(pmd_t *pmd)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> > +		pte_t *pte;
> > +		struct page *page;
> > +
> > +		if (pmd_none(*pmd))
> > +			continue;
> > +		if (pmd_huge(*pmd))
> > +			continue;
> > +
> > +		pte = pte_offset_kernel(pmd, 0);
> > +		if (!pte_none(*pte)) {
> > +			page = virt_to_page(pte);
> > +			atomic_inc(&page->pt_frag_refcount);
> > +		}
> > +	}
> > +}
> 
> 
> That naming is confusing. You are fixing up fragments used for the
> level-4 table here. I would call them pte fragments?

Sure, will rename.

> 
> 
> > +
> > +static void fixup_pud_fragments(pud_t *pud)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> > +		pmd_t *pmd;
> > +		struct page *page;
> > +
> > +		if (pud_none(*pud))
> > +			continue;
> > +		if (pud_huge(*pud))
> > +			continue;
> > +
> > +		pmd = pmd_offset(pud, 0);
> > +		if (!pmd_none(*pmd)) {
> > +			page = virt_to_page(pmd);
> > +			atomic_inc(&page->pt_frag_refcount);
> > +		}
> > +		fixup_pmd_fragments(pmd);
> > +	}
> > +}
> > +
> 
> 
> How do we handle pud table free? That is going to be tricky: in the
> general allocation case they are allocated out of slab and we free them
> back to slab. With pages allocated out of memblock, we need to
> special-case them?

Yes. Without addressing the freeing of pud tables, we can't say that
memory unplug on radix is working fully. However, with this fixing of
the frag counts, it does work for a few cases.

Maybe we handle the pud case separately?

Regards,
Bharata.


