On Fri, Mar 12, 2021 at 04:29:41AM -0300, Leonardo Bras wrote:
> During memory hotunplug, after each LMB is removed, the HPT may be
> resized-down if it would map a max of 4 times the current amount of memory.
> (2 shifts, due to introduced histeresis)
>
> It usually is not an issue, but it can take a lot of time if HPT
> resizing-down fails. This happens because resize-down failures
> usually repeat at each LMB removal, until there are no more bolted entries
> conflict, which can take a while to happen.
>
> This can be solved by doing a single HPT resize at the end of memory
> hotunplug, after all requested entries are removed.
>
> To make this happen, it's necessary to temporarily disable all HPT
> resize-downs before hotunplug, re-enable them after hotunplug ends,
> and then resize-down HPT to the current memory size.
>
> As an example, hotunplugging 256GB from a 385GB guest took 621s without
> this patch, and 100s after applied.
>
> Signed-off-by: Leonardo Bras
> ---
>  arch/powerpc/include/asm/book3s/64/hash.h  |  2 ++
>  arch/powerpc/include/asm/sparsemem.h       |  2 ++
>  arch/powerpc/mm/book3s64/hash_utils.c      | 28 +++++++++++++++++++
>  arch/powerpc/mm/book3s64/pgtable.c         | 12 ++++++++
>  .../platforms/pseries/hotplug-memory.c     | 16 +++++++++++
>  5 files changed, 60 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> index 843b0a178590..f92697c107f7 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -256,6 +256,8 @@ int hash__create_section_mapping(unsigned long start, unsigned long end,
>  int hash__remove_section_mapping(unsigned long start, unsigned long end);
>  
>  void hash_memory_batch_expand_prepare(unsigned long newsize);
> +void hash_memory_batch_shrink_begin(void);
> +void hash_memory_batch_shrink_end(void);
>  
>  #endif /* !__ASSEMBLY__ */
>  #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index 16b5f5300c84..a7a8a0d070fc 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -18,6 +18,8 @@ extern int memory_add_physaddr_to_nid(u64 start);
>  #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
>  
>  void memory_batch_expand_prepare(unsigned long newsize);
> +void memory_batch_shrink_begin(void);
> +void memory_batch_shrink_end(void);
>  
>  #ifdef CONFIG_NUMA
>  extern int hot_add_scn_to_nid(unsigned long scn_addr);
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
> index 1f6aa0bf27e7..e16f207de8e4 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -794,6 +794,9 @@ static unsigned long __init htab_get_table_size(void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> +
> +atomic_t hpt_resize_disable = ATOMIC_INIT(0);
> +
>  static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
>  {
>  	unsigned target_hpt_shift;
> @@ -805,6 +808,10 @@ static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
>  
>  	if (shrinking) {
>  
> +		/* When batch removing entries, only resizes HPT at the end. */
> +		if (atomic_read_acquire(&hpt_resize_disable))
> +			return 0;
> +

I'm not quite convinced by this locking.  Couldn't hpt_resize_disable
be set after this point, but while you're still inside
resize_hpt_for_hotplug()?  Probably better to use an explicit mutex
(and mutex_trylock()) to make the critical sections clearer.
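Something along these lines is roughly what I mean (completely
untested, and hpt_resize_down_lock is just a name I'm inventing for
the sketch):

static DEFINE_MUTEX(hpt_resize_down_lock);

/* in resize_hpt_for_hotplug(), on the shrinking path: */
	if (shrinking) {
		int rc = 0;

		/* Batch removal in progress: the single resize-down
		 * happens in hash_memory_batch_shrink_end() instead. */
		if (!mutex_trylock(&hpt_resize_down_lock))
			return 0;

		/* existing hysteresis check and resize-down go here
		 * (setting rc), so the whole resize-down sits inside
		 * the critical section */

		mutex_unlock(&hpt_resize_down_lock);
		return rc;
	}

void hash_memory_batch_shrink_begin(void)
{
	/* also waits for any resize-down already in flight */
	mutex_lock(&hpt_resize_down_lock);
}

void hash_memory_batch_shrink_end(void)
{
	mutex_unlock(&hpt_resize_down_lock);
	/* then do the one resize-down to the final size */
}

That way shrink_begin() also waits out a resize-down that has already
started, which is the window the atomic doesn't cover.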
Except...  do we even need the fancy mechanics to suppress the
resizes in one place in order to do them elsewhere?  Couldn't we just
replace the existing resize calls with the batched ones?

>  		/*
>  		 * To avoid lots of HPT resizes if memory size is fluctuating
>  		 * across a boundary, we deliberately have some hysterisis
> @@ -872,6 +879,27 @@ void hash_memory_batch_expand_prepare(unsigned long newsize)
>  		pr_warn("Hash collision while resizing HPT\n");
>  	}
>  }
> +
> +void hash_memory_batch_shrink_begin(void)
> +{
> +	/* Disable HPT resize-down during hot-unplug */
> +	atomic_set_release(&hpt_resize_disable, 1);
> +}
> +
> +void hash_memory_batch_shrink_end(void)
> +{
> +	unsigned long newsize;
> +
> +	/* Re-enables HPT resize-down after hot-unplug */
> +	atomic_set_release(&hpt_resize_disable, 0);
> +
> +	newsize = memblock_phys_mem_size();
> +	/* Resize to smallest SHIFT possible */
> +	while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
> +		newsize *= 2;

As noted earlier, doing this without an explicit cap on the new HPT
size (i.e. the size it already has) makes me nervous.  Less so, but
doing the calculations on memory size, rather than explicitly on HPT
size / HPT order, also seems kinda clunky.
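Roughly what I'd expect instead - just a sketch, not even compile
tested, and assuming ppc64_pft_size still reflects the current HPT
order at this point - is something that works on the shift directly
and stops at the size we already have:

	unsigned int target_shift, current_shift;

	target_shift = htab_shift_for_mem_size(memblock_phys_mem_size());
	current_shift = ppc64_pft_size;

	/* On a collision, fall back towards the current order, but
	 * never grow past the HPT we already have. */
	while (target_shift < current_shift) {
		if (mmu_hash_ops.resize_hpt(target_shift) != -ENOSPC)
			break;
		pr_warn("Hash collision while resizing HPT\n");
		target_shift++;
	}

That keeps both the cap and the step size in terms of HPT order
rather than memory size.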
> +		pr_warn("Hash collision while resizing HPT\n");
> +	}
> +}
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  static void __init hash_init_partition_table(phys_addr_t hash_table,
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index f1cd8af0f67f..e01681e22e00 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -199,6 +199,18 @@ void memory_batch_expand_prepare(unsigned long newsize)
>  	if (!radix_enabled())
>  		hash_memory_batch_expand_prepare(newsize);
>  }
> +
> +void memory_batch_shrink_begin(void)
> +{
> +	if (!radix_enabled())
> +		hash_memory_batch_shrink_begin();
> +}
> +
> +void memory_batch_shrink_end(void)
> +{
> +	if (!radix_enabled())
> +		hash_memory_batch_shrink_end();
> +}

Again, these wrappers don't seem particularly useful to me.

>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  void __init mmu_partition_table_init(void)
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 353c71249214..9182fb5b5c01 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -425,6 +425,8 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
>  		return -EINVAL;
>  	}
>  
> +	memory_batch_shrink_begin();
> +
>  	for_each_drmem_lmb(lmb) {
>  		rc = dlpar_remove_lmb(lmb);
>  		if (rc)
> @@ -470,6 +472,8 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
>  		rc = 0;
>  	}
>  
> +	memory_batch_shrink_end();
> +
>  	return rc;
>  }
>  
> @@ -481,6 +485,8 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
>  
>  	pr_debug("Attempting to hot-remove LMB, drc index %x\n", drc_index);
>  
> +	memory_batch_shrink_begin();
> +
>  	lmb_found = 0;
>  	for_each_drmem_lmb(lmb) {
>  		if (lmb->drc_index == drc_index) {
> @@ -502,6 +508,8 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
>  	else
>  		pr_debug("Memory at %llx was hot-removed\n", lmb->base_addr);
>  
> +	memory_batch_shrink_end();

remove_by_index only removes a single LMB, so there's no real point
to batching here.

>  	return rc;
>  }
>  
> @@ -532,6 +540,8 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
>  	if (lmbs_available < lmbs_to_remove)
>  		return -EINVAL;
>  
> +	memory_batch_shrink_begin();
> +
>  	for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>  		if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
>  			continue;
> @@ -572,6 +582,8 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
>  		}
>  	}
>  
> +	memory_batch_shrink_end();
> +
>  	return rc;
>  }
>  
> @@ -700,6 +712,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>  	if (lmbs_added != lmbs_to_add) {
>  		pr_err("Memory hot-add failed, removing any added LMBs\n");
>  
> +		memory_batch_shrink_begin();

The effect of these on the memory grow path is far from clear.

>  		for_each_drmem_lmb(lmb) {
>  			if (!drmem_lmb_reserved(lmb))
>  				continue;
> @@ -713,6 +726,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>  
>  			drmem_remove_lmb_reservation(lmb);
>  		}
> +		memory_batch_shrink_end();
>  		rc = -EINVAL;
>  	} else {
>  		for_each_drmem_lmb(lmb) {
> @@ -814,6 +828,7 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index)
>  	if (rc) {
>  		pr_err("Memory indexed-count-add failed, removing any added LMBs\n");
>  
> +		memory_batch_shrink_begin();
>  		for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>  			if (!drmem_lmb_reserved(lmb))
>  				continue;
> @@ -827,6 +842,7 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index)
>  
>  			drmem_remove_lmb_reservation(lmb);
>  		}
> +		memory_batch_shrink_end();
>  		rc = -EINVAL;
>  	} else {
>  		for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson