From: Anshuman Khandual <anshuman.khandual@arm.com>
To: David Hildenbrand <david@redhat.com>,
	linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, akpm@linux-foundation.org,
	catalin.marinas@arm.com, will.deacon@arm.com
Cc: mark.rutland@arm.com, mhocko@suse.com, ira.weiny@intel.com,
	cai@lca.pw, logang@deltatee.com, james.morse@arm.com,
	cpandya@codeaurora.org, arunks@codeaurora.org,
	dan.j.williams@intel.com, mgorman@techsingularity.net,
	osalvador@suse.de, ard.biesheuvel@arm.com
Subject: Re: [PATCH V4 4/4] arm64/mm: Enable memory hot remove
Date: Mon, 27 May 2019 13:39:11 +0530
Message-ID: <b226746e-fed1-ea18-f100-0a2c6c561d1f@arm.com>
In-Reply-To: <a660313e-f96c-91ed-1e15-5f3ec6463596@redhat.com>



On 05/21/2019 03:50 PM, David Hildenbrand wrote:
> On 20.05.19 07:18, Anshuman Khandual wrote:
>> The arch code for hot-remove must tear down portions of the linear map and
>> vmemmap corresponding to memory being removed. In both cases the page
>> tables mapping these regions must be freed, and when sparse vmemmap is in
>> use the memory backing the vmemmap must also be freed.
>>
>> This patch adds a new remove_pagetable() helper which can be used to tear
>> down either region, and calls it from vmemmap_free() and
>> __remove_pgd_mapping(). The sparse_vmap argument determines whether the
>> backing memory will be freed.
>>
>> While freeing intermediate-level page table pages, bail out if any of its
>> entries are still valid. This can happen for a partially filled kernel page
>> table, either from a previously attempted failed memory hot add or when
>> removing an address range which does not span the entire page table page
>> range.
>>
>> The vmemmap region may share levels of table with the vmalloc region. Take
>> the kernel ptl so that we can safely free potentially-shared tables.
>>
>> While here, update arch_add_memory() to handle __add_pages() failures by
>> unmapping the recently added kernel linear mapping. Also enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is broadly inspired by the kernel page table tear down
>> procedure on the x86 architecture.
>>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>  arch/arm64/Kconfig  |   3 +
>>  arch/arm64/mm/mmu.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 213 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 4780eb7..ce24427 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -267,6 +267,9 @@ config HAVE_GENERIC_GUP
>>  config ARCH_ENABLE_MEMORY_HOTPLUG
>>  	def_bool y
>>  
>> +config ARCH_ENABLE_MEMORY_HOTREMOVE
>> +	def_bool y
>> +
>>  config SMP
>>  	def_bool y
>>  
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index a1bfc44..0cf0d41 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -733,6 +733,187 @@ int kern_addr_valid(unsigned long addr)
>>  
>>  	return pfn_valid(pte_pfn(pte));
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void free_hotplug_page_range(struct page *page, ssize_t size)
>> +{
>> +	WARN_ON(PageReserved(page));
>> +	free_pages((unsigned long)page_address(page), get_order(size));
>> +}
>> +
>> +static void free_hotplug_pgtable_page(struct page *page)
>> +{
>> +	free_hotplug_page_range(page, PAGE_SIZE);
>> +}
>> +
>> +static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
>> +{
>> +	struct page *page;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PTE; i++) {
>> +		if (!pte_none(ptep[i]))
>> +			return;
>> +	}
>> +
>> +	page = pmd_page(READ_ONCE(*pmdp));
>> +	pmd_clear(pmdp);
>> +	__flush_tlb_kernel_pgtable(addr);
>> +	free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
>> +{
>> +	struct page *page;
>> +	int i;
>> +
>> +	if (CONFIG_PGTABLE_LEVELS <= 2)
>> +		return;
>> +
>> +	for (i = 0; i < PTRS_PER_PMD; i++) {
>> +		if (!pmd_none(pmdp[i]))
>> +			return;
>> +	}
>> +
>> +	page = pud_page(READ_ONCE(*pudp));
>> +	pud_clear(pudp);
>> +	__flush_tlb_kernel_pgtable(addr);
>> +	free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
>> +{
>> +	struct page *page;
>> +	int i;
>> +
>> +	if (CONFIG_PGTABLE_LEVELS <= 3)
>> +		return;
>> +
>> +	for (i = 0; i < PTRS_PER_PUD; i++) {
>> +		if (!pud_none(pudp[i]))
>> +			return;
>> +	}
>> +
>> +	page = pgd_page(READ_ONCE(*pgdp));
>> +	pgd_clear(pgdp);
>> +	__flush_tlb_kernel_pgtable(addr);
>> +	free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void
>> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
>> +			unsigned long end, bool sparse_vmap)
>> +{
>> +	struct page *page;
>> +	pte_t *ptep, pte;
>> +	unsigned long start = addr;
>> +
>> +	for (; addr < end; addr += PAGE_SIZE) {
>> +		ptep = pte_offset_kernel(pmdp, addr);
>> +		pte = READ_ONCE(*ptep);
>> +
>> +		if (pte_none(pte))
>> +			continue;
>> +
>> +		WARN_ON(!pte_present(pte));
>> +		if (sparse_vmap) {
>> +			page = pte_page(pte);
>> +			free_hotplug_page_range(page, PAGE_SIZE);
>> +		}
>> +		pte_clear(&init_mm, addr, ptep);
>> +	}
>> +	flush_tlb_kernel_range(start, end);
>> +}
>> +
>> +static void
>> +remove_pmd_table(pud_t *pudp, unsigned long addr,
>> +			unsigned long end, bool sparse_vmap)
>> +{
>> +	unsigned long next;
>> +	struct page *page;
>> +	pte_t *ptep_base;
>> +	pmd_t *pmdp, pmd;
>> +
>> +	for (; addr < end; addr = next) {
>> +		next = pmd_addr_end(addr, end);
>> +		pmdp = pmd_offset(pudp, addr);
>> +		pmd = READ_ONCE(*pmdp);
>> +
>> +		if (pmd_none(pmd))
>> +			continue;
>> +
>> +		WARN_ON(!pmd_present(pmd));
>> +		if (pmd_sect(pmd)) {
>> +			if (sparse_vmap) {
>> +				page = pmd_page(pmd);
>> +				free_hotplug_page_range(page, PMD_SIZE);
>> +			}
>> +			pmd_clear(pmdp);
>> +			continue;
>> +		}
>> +		ptep_base = pte_offset_kernel(pmdp, 0UL);
>> +		remove_pte_table(pmdp, addr, next, sparse_vmap);
>> +		free_pte_table(ptep_base, pmdp, addr);
>> +	}
>> +}
>> +
>> +static void
>> +remove_pud_table(pgd_t *pgdp, unsigned long addr,
>> +			unsigned long end, bool sparse_vmap)
>> +{
>> +	unsigned long next;
>> +	struct page *page;
>> +	pmd_t *pmdp_base;
>> +	pud_t *pudp, pud;
>> +
>> +	for (; addr < end; addr = next) {
>> +		next = pud_addr_end(addr, end);
>> +		pudp = pud_offset(pgdp, addr);
>> +		pud = READ_ONCE(*pudp);
>> +
>> +		if (pud_none(pud))
>> +			continue;
>> +
>> +		WARN_ON(!pud_present(pud));
>> +		if (pud_sect(pud)) {
>> +			if (sparse_vmap) {
>> +				page = pud_page(pud);
>> +				free_hotplug_page_range(page, PUD_SIZE);
>> +			}
>> +			pud_clear(pudp);
>> +			continue;
>> +		}
>> +		pmdp_base = pmd_offset(pudp, 0UL);
>> +		remove_pmd_table(pudp, addr, next, sparse_vmap);
>> +		free_pmd_table(pmdp_base, pudp, addr);
>> +	}
>> +}
>> +
>> +static void
>> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
>> +{
>> +	unsigned long addr, next;
>> +	pud_t *pudp_base;
>> +	pgd_t *pgdp, pgd;
>> +
>> +	spin_lock(&init_mm.page_table_lock);
>> +	for (addr = start; addr < end; addr = next) {
>> +		next = pgd_addr_end(addr, end);
>> +		pgdp = pgd_offset_k(addr);
>> +		pgd = READ_ONCE(*pgdp);
>> +
>> +		if (pgd_none(pgd))
>> +			continue;
>> +
>> +		WARN_ON(!pgd_present(pgd));
>> +		pudp_base = pud_offset(pgdp, 0UL);
>> +		remove_pud_table(pgdp, addr, next, sparse_vmap);
>> +		free_pud_table(pudp_base, pgdp, addr);
>> +	}
>> +	spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#endif
>> +
>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>  #if !ARM64_SWAPPER_USES_SECTION_MAPS
>>  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>> @@ -780,6 +961,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>>  void vmemmap_free(unsigned long start, unsigned long end,
>>  		struct vmem_altmap *altmap)
>>  {
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +	remove_pagetable(start, end, true);
>> +#endif
>>  }
>>  #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
>>  
>> @@ -1070,10 +1254,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
>> +{
>> +	WARN_ON(pgdir != init_mm.pgd);
>> +	remove_pagetable(start, start + size, false);
>> +}
>> +
>>  int arch_add_memory(int nid, u64 start, u64 size,
>>  			struct mhp_restrictions *restrictions)
>>  {
>> -	int flags = 0;
>> +	int ret, flags = 0;
>>  
>>  	if (rodata_full || debug_pagealloc_enabled())
>>  		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>> @@ -1081,7 +1271,25 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>  	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>>  			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>>  
>> -	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>> +	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>>  			   restrictions);
>> +	if (ret)
>> +		__remove_pgd_mapping(swapper_pg_dir,
>> +					__phys_to_virt(start), size);
> 
> Nit: Indentation of the parameters looks really weird.
> 
>> +	return ret;
>> +}
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +void arch_remove_memory(int nid, u64 start, u64 size,
>> +				struct vmem_altmap *altmap)
>> +{
>> +	unsigned long start_pfn = start >> PAGE_SHIFT;
>> +	unsigned long nr_pages = size >> PAGE_SHIFT;
>> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
>> +
>> +	__remove_pages(zone, start_pfn, nr_pages, altmap);
>> +	__remove_pgd_mapping(swapper_pg_dir,
>> +					__phys_to_virt(start), size);
> 
> Dito, indentation of the parameters.
> 
> For these two changes (arch_*_memory)
> 
> Acked-by: David Hildenbrand <david@redhat.com>

Thanks David. The following change to this patch should fix the indentation
problem. If there are no other comments, I will incorporate this and re-spin
the series once more.

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0cf0d41..a87ba18 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1273,9 +1273,10 @@ int arch_add_memory(int nid, u64 start, u64 size,
 
 	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   restrictions);
-	if (ret)
-		__remove_pgd_mapping(swapper_pg_dir,
-					__phys_to_virt(start), size);
+	if (!ret)
+		return ret;
+
+	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
 	return ret;
 }
 
@@ -1288,8 +1289,7 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	struct zone *zone = page_zone(pfn_to_page(start_pfn));
 
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
-	__remove_pgd_mapping(swapper_pg_dir,
-					__phys_to_virt(start), size);
+	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
 }
 #endif
 #endif
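
For reference, the two hotplug entry points with the above fixup applied
would read roughly as follows. This is a sketch reconstructed from the
quoted patch plus the fixup diff in this reply (all names and the #ifdef
structure come from those diffs), not code copied from any tree:

int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_restrictions *restrictions)
{
	int ret, flags = 0;

	if (rodata_full || debug_pagealloc_enabled())
		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	/* Map the new range in the kernel linear map first. */
	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);

	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
			  restrictions);
	if (!ret)
		return ret;

	/* __add_pages() failed; tear down the linear mapping added above. */
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
void arch_remove_memory(int nid, u64 start, u64 size,
			struct vmem_altmap *altmap)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(start_pfn));

	__remove_pages(zone, start_pfn, nr_pages, altmap);
	/*
	 * sparse_vmap is false here: only the page table pages are freed,
	 * not the linear map's backing memory.
	 */
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
#endif

The early return on success keeps the error path unindented, which is what
resolves the parameter indentation David pointed out.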


Thread overview: 11+ messages
2019-05-20  5:18 [PATCH V4 0/4] arm64/mm: Enable memory hot remove Anshuman Khandual
2019-05-20  5:18 ` [PATCH V4 1/4] mm/hotplug: Reorder arch_remove_memory() call in __remove_memory() Anshuman Khandual
2019-05-20  5:18 ` [PATCH V4 2/4] arm64/mm: Inhibit huge-vmap with ptdump Anshuman Khandual
2019-05-20  5:18 ` [PATCH V4 3/4] arm64/mm: Hold memory hotplug lock while walking for kernel page table dump Anshuman Khandual
2019-05-21 10:14   ` Michal Hocko
2019-05-24  4:52     ` Anshuman Khandual
2019-05-20  5:18 ` [PATCH V4 4/4] arm64/mm: Enable memory hot remove Anshuman Khandual
2019-05-21 10:20   ` David Hildenbrand
2019-05-27  8:09     ` Anshuman Khandual [this message]
2019-05-27 10:21       ` David Hildenbrand
