All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing
@ 2023-06-09  6:08 Aneesh Kumar K.V
  2023-06-09  6:08 ` [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter Aneesh Kumar K.V
  2023-06-13 19:53 ` [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing Reza Arbab
  0 siblings, 2 replies; 9+ messages in thread
From: Aneesh Kumar K.V @ 2023-06-09  6:08 UTC (permalink / raw)
  To: linuxppc-dev, mpe, npiggin, christophe.leroy; +Cc: Aneesh Kumar K.V, foraker1

Parse the device tree in early init to find the memory block size to be
used by the kernel. Consolidate the memory block size device tree parsing
to one helper and use that on both powernv and pseries. We still want to
use machine-specific callback because on all machine types other than
powernv and pseries we continue to return MIN_MEMORY_BLOCK_SIZE.

pseries_memory_block_size used to look for the second memory
block (memory@x) to determine the memory_block_size value. This patch
changed that to look at all memory blocks and make sure we can map them all
correctly using the computed memory block size value.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes from v1:
* Add a helper to parse device tree for memory block size
* Drop pseries and powernv functions used. 

 arch/powerpc/include/asm/book3s/64/mmu.h      |  5 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c      | 65 +------------
 arch/powerpc/mm/init_64.c                     | 95 +++++++++++++++++++
 arch/powerpc/platforms/powernv/setup.c        | 10 +-
 .../platforms/pseries/hotplug-memory.c        | 60 +-----------
 arch/powerpc/platforms/pseries/pseries.h      |  2 -
 arch/powerpc/platforms/pseries/setup.c        |  7 ++
 7 files changed, 109 insertions(+), 135 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 570a4960cf17..28033fd5403c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -71,10 +71,7 @@ extern unsigned int mmu_pid_bits;
 /* Base PID to allocate from */
 extern unsigned int mmu_base_pid;
 
-/*
- * memory block size used with radix translation.
- */
-extern unsigned long __ro_after_init radix_mem_block_size;
+extern unsigned long __ro_after_init memory_block_size;
 
 #define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
 #define PRTB_ENTRIES	(1ul << mmu_pid_bits)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 2297aa764ecd..06c6666e3a39 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -37,7 +37,6 @@
 #include <mm/mmu_decl.h>
 
 unsigned int mmu_base_pid;
-unsigned long radix_mem_block_size __ro_after_init;
 
 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
 			unsigned long region_start, unsigned long region_end)
@@ -300,7 +299,7 @@ static int __meminit create_physical_mapping(unsigned long start,
 	bool prev_exec, exec = false;
 	pgprot_t prot;
 	int psize;
-	unsigned long max_mapping_size = radix_mem_block_size;
+	unsigned long max_mapping_size = memory_block_size;
 
 	if (debug_pagealloc_enabled_or_kfence())
 		max_mapping_size = PAGE_SIZE;
@@ -502,58 +501,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
 	return 1;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __init probe_memory_block_size(unsigned long node, const char *uname, int
-					  depth, void *data)
-{
-	unsigned long *mem_block_size = (unsigned long *)data;
-	const __be32 *prop;
-	int len;
-
-	if (depth != 1)
-		return 0;
-
-	if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
-		return 0;
-
-	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
-
-	if (!prop || len < dt_root_size_cells * sizeof(__be32))
-		/*
-		 * Nothing in the device tree
-		 */
-		*mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-	else
-		*mem_block_size = of_read_number(prop, dt_root_size_cells);
-	return 1;
-}
-
-static unsigned long __init radix_memory_block_size(void)
-{
-	unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-
-	/*
-	 * OPAL firmware feature is set by now. Hence we are ok
-	 * to test OPAL feature.
-	 */
-	if (firmware_has_feature(FW_FEATURE_OPAL))
-		mem_block_size = 1UL * 1024 * 1024 * 1024;
-	else
-		of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
-
-	return mem_block_size;
-}
-
-#else   /* CONFIG_MEMORY_HOTPLUG */
-
-static unsigned long __init radix_memory_block_size(void)
-{
-	return 1UL * 1024 * 1024 * 1024;
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-
 void __init radix__early_init_devtree(void)
 {
 	int rc;
@@ -577,16 +524,6 @@ void __init radix__early_init_devtree(void)
 		mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
 			psize_to_rpti_pgsize(MMU_PAGE_64K);
 	}
-
-	/*
-	 * Max mapping size used when mapping pages. We don't use
-	 * ppc_md.memory_block_size() here because this get called
-	 * early and we don't have machine probe called yet. Also
-	 * the pseries implementation only check for ibm,lmb-size.
-	 * All hypervisor supporting radix do expose that device
-	 * tree node.
-	 */
-	radix_mem_block_size = radix_memory_block_size();
 	return;
 }
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 05b0d584e50b..97a9163f1280 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -40,6 +40,7 @@
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
 #include <linux/memremap.h>
+#include <linux/memory.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -471,6 +472,98 @@ static int __init dt_scan_mmu_pid_width(unsigned long node,
 	return 1;
 }
 
+static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size)
+{
+	unsigned long section_size = 1UL << SECTION_SIZE_BITS;
+
+	for (; *block_size > section_size; *block_size >>= 2) {
+
+		if ((mem_size & *block_size) == 0)
+			break;
+	}
+}
+
+static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+					  depth, void *data)
+{
+	const char *type;
+	unsigned long *block_size = (unsigned long *)data;
+	const __be32 *reg, *endp;
+	int l;
+
+	if (depth != 1)
+		return 0;
+	/*
+	 * If we have dynamic-reconfiguration-memory node, use the
+	 * lmb value.
+	 */
+	if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
+
+		const __be32 *prop;
+
+		prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l);
+
+		if (!prop || l < dt_root_size_cells * sizeof(__be32))
+			/*
+			 * Nothing in the device tree
+			 */
+			*block_size = MIN_MEMORY_BLOCK_SIZE;
+		else
+			*block_size = of_read_number(prop, dt_root_size_cells);
+		/*
+		 * We have found the final value. Don't probe further.
+		 */
+		return 1;
+	}
+	/*
+	 * Find all the device tree nodes of memory type and make sure
+	 * the area can be mapped using the memory block size value
+	 * we end up using. We start with 1G value and keep reducing
+	 * it such that we can map the entire area using memory_block_size.
+	 * This will be used on powernv and older pseries that don't
+	 * have ibm,lmb-size node.
+	 * For ex: with P5 we can end up with
+	 * memory@0 -> 128MB
+	 * memory@128M -> 64M
+	 * This will end up using 64MB  memory block size value.
+	 */
+	type = of_get_flat_dt_prop(node, "device_type", NULL);
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	reg = of_get_flat_dt_prop(node, "reg", &l);
+	endp = reg + (l / sizeof(__be32));
+
+	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+		u64 base, size;
+
+		base = dt_mem_next_cell(dt_root_addr_cells, &reg);
+		size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+		if (size == 0)
+			continue;
+
+		update_memory_block_size(block_size, size);
+	}
+	/* continue looking for other memory device types */
+	return 0;
+}
+
+/*
+ * start with 1G memory block size. Early init will
+ * fix this with correct value.
+ */
+unsigned long memory_block_size __ro_after_init = 1UL << 30;
+static void __init early_init_memory_block_size(void)
+{
+	/*
+	 * We need to do memory_block_size probe early so that
+	 * radix__early_init_mmu() can use this as limit for
+	 * mapping page size.
+	 */
+	of_scan_flat_dt(probe_memory_block_size, &memory_block_size);
+}
+
 void __init mmu_early_init_devtree(void)
 {
 	bool hvmode = !!(mfmsr() & MSR_HV);
@@ -504,6 +597,8 @@ void __init mmu_early_init_devtree(void)
 	if (!hvmode)
 		early_check_vec5();
 
+	early_init_memory_block_size();
+
 	if (early_radix_enabled()) {
 		radix__early_init_devtree();
 
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 5e9c6b55809f..4dbb47ddbdcc 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -482,15 +482,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 #ifdef CONFIG_MEMORY_HOTPLUG
 static unsigned long pnv_memory_block_size(void)
 {
-	/*
-	 * We map the kernel linear region with 1GB large pages on radix. For
-	 * memory hot unplug to work our memory block size must be at least
-	 * this size.
-	 */
-	if (radix_enabled())
-		return radix_mem_block_size;
-	else
-		return 256UL * 1024 * 1024;
+	return memory_block_size;
 }
 #endif
 
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..1333d9ab7621 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -21,54 +21,6 @@
 #include <asm/drmem.h>
 #include "pseries.h"
 
-unsigned long pseries_memory_block_size(void)
-{
-	struct device_node *np;
-	u64 memblock_size = MIN_MEMORY_BLOCK_SIZE;
-	struct resource r;
-
-	np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-	if (np) {
-		int len;
-		int size_cells;
-		const __be32 *prop;
-
-		size_cells = of_n_size_cells(np);
-
-		prop = of_get_property(np, "ibm,lmb-size", &len);
-		if (prop && len >= size_cells * sizeof(__be32))
-			memblock_size = of_read_number(prop, size_cells);
-		of_node_put(np);
-
-	} else  if (machine_is(pseries)) {
-		/* This fallback really only applies to pseries */
-		unsigned int memzero_size = 0;
-
-		np = of_find_node_by_path("/memory@0");
-		if (np) {
-			if (!of_address_to_resource(np, 0, &r))
-				memzero_size = resource_size(&r);
-			of_node_put(np);
-		}
-
-		if (memzero_size) {
-			/* We now know the size of memory@0, use this to find
-			 * the first memoryblock and get its size.
-			 */
-			char buf[64];
-
-			sprintf(buf, "/memory@%x", memzero_size);
-			np = of_find_node_by_path(buf);
-			if (np) {
-				if (!of_address_to_resource(np, 0, &r))
-					memblock_size = resource_size(&r);
-				of_node_put(np);
-			}
-		}
-	}
-	return memblock_size;
-}
-
 static void dlpar_free_property(struct property *prop)
 {
 	kfree(prop->name);
@@ -283,7 +235,7 @@ static int dlpar_offline_lmb(struct drmem_lmb *lmb)
 
 static int pseries_remove_memblock(unsigned long base, unsigned long memblock_size)
 {
-	unsigned long block_sz, start_pfn;
+	unsigned long start_pfn;
 	int sections_per_block;
 	int i;
 
@@ -294,8 +246,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si
 	if (!pfn_valid(start_pfn))
 		goto out;
 
-	block_sz = pseries_memory_block_size();
-	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+	sections_per_block = memory_block_size / MIN_MEMORY_BLOCK_SIZE;
 
 	for (i = 0; i < sections_per_block; i++) {
 		__remove_memory(base, MIN_MEMORY_BLOCK_SIZE);
@@ -354,7 +305,6 @@ static int dlpar_add_lmb(struct drmem_lmb *);
 static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 {
 	struct memory_block *mem_block;
-	unsigned long block_sz;
 	int rc;
 
 	if (!lmb_is_removable(lmb))
@@ -370,13 +320,11 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 		return rc;
 	}
 
-	block_sz = pseries_memory_block_size();
-
-	__remove_memory(lmb->base_addr, block_sz);
+	__remove_memory(lmb->base_addr, memory_block_size);
 	put_device(&mem_block->dev);
 
 	/* Update memory regions for memory remove */
-	memblock_remove(lmb->base_addr, block_sz);
+	memblock_remove(lmb->base_addr, memory_block_size);
 
 	invalidate_lmb_associativity_index(lmb);
 	lmb->flags &= ~DRCONF_MEM_ASSIGNED;
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index f8bce40ebd0c..02c012976b19 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -90,8 +90,6 @@ extern struct pci_controller_ops pseries_pci_controller_ops;
 int pseries_msi_allocate_domains(struct pci_controller *phb);
 void pseries_msi_free_domains(struct pci_controller *phb);
 
-unsigned long pseries_memory_block_size(void);
-
 extern int CMO_PrPSP;
 extern int CMO_SecPSP;
 extern unsigned long CMO_PageSize;
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index e2a57cfa6c83..6683debe283e 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -1116,6 +1116,13 @@ static int pSeries_pci_probe_mode(struct pci_bus *bus)
 	return PCI_PROBE_NORMAL;
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static unsigned long pseries_memory_block_size(void)
+{
+	return memory_block_size;
+}
+#endif
+
 struct pci_controller_ops pseries_pci_controller_ops = {
 	.probe_mode		= pSeries_pci_probe_mode,
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter
  2023-06-09  6:08 [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing Aneesh Kumar K.V
@ 2023-06-09  6:08 ` Aneesh Kumar K.V
  2023-06-13 20:06   ` Reza Arbab
  2023-06-19 10:35   ` David Hildenbrand
  2023-06-13 19:53 ` [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing Reza Arbab
  1 sibling, 2 replies; 9+ messages in thread
From: Aneesh Kumar K.V @ 2023-06-09  6:08 UTC (permalink / raw)
  To: linuxppc-dev, mpe, npiggin, christophe.leroy; +Cc: Aneesh Kumar K.V, foraker1

Certain devices can possess non-standard memory capacities, not constrained
to multiples of 1GB. Provide a kernel parameter so that we can map the
device memory completely on memory hotplug.

Restrict memory_block_size value to a power of 2 value similar to LMB size.
The memory block size should also be more than the section size.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../admin-guide/kernel-parameters.txt         |  3 +++
 arch/powerpc/kernel/setup_64.c                | 23 +++++++++++++++++++
 arch/powerpc/mm/init_64.c                     | 17 ++++++++++----
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9e5bab29685f..833b8c5b4b4c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3190,6 +3190,9 @@
 			Note that even when enabled, there are a few cases where
 			the feature is not effective.
 
+	memory_block_size=size [PPC]
+			 Use this parameter to configure the memory block size value.
+
 	memtest=	[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
 			Format: <integer>
 			default : 0 <disable>
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 246201d0d879..cbdb924462c7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -892,6 +892,29 @@ unsigned long memory_block_size_bytes(void)
 
 	return MIN_MEMORY_BLOCK_SIZE;
 }
+
+/*
+ * Restrict to a power of 2 value for memblock which is larger than
+ * section size
+ */
+static int __init parse_mem_block_size(char *ptr)
+{
+	unsigned int order;
+	unsigned long size = memparse(ptr, NULL);
+
+	order = fls64(size);
+	if (!order)
+		return 0;
+
+	order--;
+	if (order < SECTION_SIZE_BITS)
+		return 0;
+
+	memory_block_size = 1UL << order;
+
+	return 0;
+}
+early_param("memory_block_size", parse_mem_block_size);
 #endif
 
 #if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 97a9163f1280..5e6dde593ea3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -549,13 +549,20 @@ static int __init probe_memory_block_size(unsigned long node, const char *uname,
 	return 0;
 }
 
-/*
- * start with 1G memory block size. Early init will
- * fix this with correct value.
- */
-unsigned long memory_block_size __ro_after_init = 1UL << 30;
+unsigned long memory_block_size __ro_after_init;
 static void __init early_init_memory_block_size(void)
 {
+	/*
+	 * if it is set via early param just return.
+	 */
+	if (memory_block_size)
+		return;
+
+	/*
+	 * start with 1G memory block size. update_memory_block_size()
+	 * will derive the right value based on device tree details.
+	 */
+	memory_block_size = 1UL << 30;
 	/*
 	 * We need to do memory_block_size probe early so that
 	 * radix__early_init_mmu() can use this as limit for
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing
  2023-06-09  6:08 [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing Aneesh Kumar K.V
  2023-06-09  6:08 ` [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter Aneesh Kumar K.V
@ 2023-06-13 19:53 ` Reza Arbab
  1 sibling, 0 replies; 9+ messages in thread
From: Reza Arbab @ 2023-06-13 19:53 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: foraker1, linuxppc-dev, npiggin

On Fri, Jun 09, 2023 at 11:38:50AM +0530, Aneesh Kumar K.V wrote:
>Parse the device tree in early init to find the memory block size to be
>used by the kernel. Consolidate the memory block size device tree parsing
>to one helper and use that on both powernv and pseries. We still want to
>use machine-specific callback because on all machine types other than
>powernv and pseries we continue to return MIN_MEMORY_BLOCK_SIZE.
>
>pseries_memory_block_size used to look for the second memory
>block (memory@x) to determine the memory_block_size value. This patch
>changed that to look at all memory blocks and make sure we can map them all
>correctly using the computed memory block size value.

Reviewed-by: Reza Arbab <arbab@linux.ibm.com>

-- 
Reza Arbab

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter
  2023-06-09  6:08 ` [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter Aneesh Kumar K.V
@ 2023-06-13 20:06   ` Reza Arbab
  2023-06-19 10:35   ` David Hildenbrand
  1 sibling, 0 replies; 9+ messages in thread
From: Reza Arbab @ 2023-06-13 20:06 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: foraker1, linuxppc-dev, npiggin

On Fri, Jun 09, 2023 at 11:38:51AM +0530, Aneesh Kumar K.V wrote:
>Certain devices can possess non-standard memory capacities, not constrained
>to multiples of 1GB. Provide a kernel parameter so that we can map the
>device memory completely on memory hotplug.

Case in point; the memory block size determined at boot is 1GB, but we 
know that 15.75GB of device memory will be hotplugged during runtime.

Reviewed-by: Reza Arbab <arbab@linux.ibm.com>

-- 
Reza Arbab

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter
  2023-06-09  6:08 ` [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter Aneesh Kumar K.V
  2023-06-13 20:06   ` Reza Arbab
@ 2023-06-19 10:35   ` David Hildenbrand
  2023-06-19 16:17     ` Aneesh Kumar K.V
  2023-06-20 12:35     ` Michael Ellerman
  1 sibling, 2 replies; 9+ messages in thread
From: David Hildenbrand @ 2023-06-19 10:35 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe, npiggin, christophe.leroy; +Cc: foraker1

On 09.06.23 08:08, Aneesh Kumar K.V wrote:
> Certain devices can possess non-standard memory capacities, not constrained
> to multiples of 1GB. Provide a kernel parameter so that we can map the
> device memory completely on memory hotplug.

So, the unfortunate thing is that these devices would have worked out of 
the box before the memory block size was increased from 256 MiB to 1 GiB 
in these setups. Now, one has to fine-tune the memory block size. The 
only other arch that I know, which supports setting the memory block 
size, is x86 for special (large) UV systems -- and at least in the past 
128 MiB vs. 2 GiB memory blocks made a performance difference during 
boot (maybe no longer today, who knows).


Obviously, less tunable and getting stuff simply working out of the box 
is preferable.

Two questions:

1) Isn't there a way to improve auto-detection to fallback to 256 MiB in 
these setups, to avoid specifying these parameters?

2) Is the 256 MiB -> 1 GiB memory block size switch really worth it? On 
x86-64, experiments (with direct map fragmentation) showed that the 
effective performance boost is pretty insignificant, so I wonder how big 
the 1 GiB direct map performance improvement is.


I guess the only real issue with 256 MiB memory blocks and 1 GiB direct 
mapping is memory unplug of boot memory: when unplugging a 256 MiB 
block, one would have to remap the 1 GiB range using 2 MiB ranges.

... I was wondering what would happen if you simply leave the direct 
mapping in this corner case in place instead of doing this remapping. 
IOW, remove the memory but keep the direct map pointing at the removed 
memory. Nobody should be touching it, or are there any cases where that 
could hurt?


Or is there any other reason why we really want 1 GiB memory blocks 
instead of to defaulting to 256 MiB the way it used to be?

Thanks!

> 
> Restrict memory_block_size value to a power of 2 value similar to LMB size.
> The memory block size should also be more than the section size.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   .../admin-guide/kernel-parameters.txt         |  3 +++
>   arch/powerpc/kernel/setup_64.c                | 23 +++++++++++++++++++
>   arch/powerpc/mm/init_64.c                     | 17 ++++++++++----
>   3 files changed, 38 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 9e5bab29685f..833b8c5b4b4c 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3190,6 +3190,9 @@
>   			Note that even when enabled, there are a few cases where
>   			the feature is not effective.
>   
> +	memory_block_size=size [PPC]
> +			 Use this parameter to configure the memory block size value.
> +
>   	memtest=	[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
>   			Format: <integer>
>   			default : 0 <disable>
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 246201d0d879..cbdb924462c7 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -892,6 +892,29 @@ unsigned long memory_block_size_bytes(void)
>   
>   	return MIN_MEMORY_BLOCK_SIZE;
>   }
> +
> +/*
> + * Restrict to a power of 2 value for memblock which is larger than
> + * section size
> + */
> +static int __init parse_mem_block_size(char *ptr)
> +{
> +	unsigned int order;
> +	unsigned long size = memparse(ptr, NULL);
> +
> +	order = fls64(size);
> +	if (!order)
> +		return 0;
> +
> +	order--;
> +	if (order < SECTION_SIZE_BITS)
> +		return 0;
> +
> +	memory_block_size = 1UL << order;
> +
> +	return 0;
> +}
> +early_param("memory_block_size", parse_mem_block_size);
>   #endif
>   
>   #if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
> index 97a9163f1280..5e6dde593ea3 100644
> --- a/arch/powerpc/mm/init_64.c
> +++ b/arch/powerpc/mm/init_64.c
> @@ -549,13 +549,20 @@ static int __init probe_memory_block_size(unsigned long node, const char *uname,
>   	return 0;
>   }
>   
> -/*
> - * start with 1G memory block size. Early init will
> - * fix this with correct value.
> - */
> -unsigned long memory_block_size __ro_after_init = 1UL << 30;
> +unsigned long memory_block_size __ro_after_init;
>   static void __init early_init_memory_block_size(void)
>   {
> +	/*
> +	 * if it is set via early param just return.
> +	 */
> +	if (memory_block_size)
> +		return;
> +
> +	/*
> +	 * start with 1G memory block size. update_memory_block_size()
> +	 * will derive the right value based on device tree details.
> +	 */
> +	memory_block_size = 1UL << 30;
>   	/*
>   	 * We need to do memory_block_size probe early so that
>   	 * radix__early_init_mmu() can use this as limit for

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter
  2023-06-19 10:35   ` David Hildenbrand
@ 2023-06-19 16:17     ` Aneesh Kumar K.V
  2023-06-19 16:28       ` David Hildenbrand
  2023-06-20 12:35     ` Michael Ellerman
  1 sibling, 1 reply; 9+ messages in thread
From: Aneesh Kumar K.V @ 2023-06-19 16:17 UTC (permalink / raw)
  To: David Hildenbrand, linuxppc-dev, mpe, npiggin, christophe.leroy,
	Tarun Sahu
  Cc: foraker1

David Hildenbrand <david@redhat.com> writes:

> On 09.06.23 08:08, Aneesh Kumar K.V wrote:
>> Certain devices can possess non-standard memory capacities, not constrained
>> to multiples of 1GB. Provide a kernel parameter so that we can map the
>> device memory completely on memory hotplug.
>
> So, the unfortunate thing is that these devices would have worked out of 
> the box before the memory block size was increased from 256 MiB to 1 GiB 
> in these setups. Now, one has to fine-tune the memory block size. The 
> only other arch that I know, which supports setting the memory block 
> size, is x86 for special (large) UV systems -- and at least in the past 
> 128 MiB vs. 2 GiB memory blocks made a performance difference during 
> boot (maybe no longer today, who knows).
>
>
> Obviously, less tunable and getting stuff simply working out of the box 
> is preferable.
>
> Two questions:
>
> 1) Isn't there a way to improve auto-detection to fallback to 256 MiB in 
> these setups, to avoid specifying these parameters?

The patch does try to detect as much as possible by looking at device tree
nodes and aperture window size. But there are still cases where we find
a memory aperture of size X GB and device driver hotplug X.YGB memory.

>
> 2) Is the 256 MiB -> 1 GiB memory block size switch really worth it? On 
> x86-64, experiments (with direct map fragmentation) showed that the 
> effective performance boost is pretty insignificant, so I wonder how big 
> the 1 GiB direct map performance improvement is.


Tarun is running some tests to evaluate the impact. We used to use 1GiB
mapping always. This was later switched to use memory block size to fix
issues with memory unplug
commit af9d00e93a4f ("powerpc/mm/radix: Create separate mappings for hot-plugged memory")
explains some details related to that change.


>
>
> I guess the only real issue with 256 MiB memory blocks and 1 GiB direct 
> mapping is memory unplug of boot memory: when unplugging a 256 MiB 
> block, one would have to remap the 1 GiB range using 2 MiB ranges.

>
> ... I was wondering what would happen if you simply leave the direct 
> mapping in this corner case in place instead of doing this remapping. 
> IOW, remove the memory but keep the direct map pointing at the removed 
> memory. Nobody should be touching it, or are there any cases where that 
> could hurt?
>
>
> Or is there any other reason why we really want 1 GiB memory blocks 
> instead of to defaulting to 256 MiB the way it used to be?
>

The idea we are working towards is to keep the memory block size small
but map the boot memory using 1G. An unplug request can split that 1G
mapping later. We could look at the possibility of leaving that mapping
without splitting. But not sure why we would want to do that if we can
correctly split things. Right now there is no splitting support in powerpc.

-aneesh

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter
  2023-06-19 16:17     ` Aneesh Kumar K.V
@ 2023-06-19 16:28       ` David Hildenbrand
  0 siblings, 0 replies; 9+ messages in thread
From: David Hildenbrand @ 2023-06-19 16:28 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe, npiggin, christophe.leroy,
	Tarun Sahu
  Cc: foraker1

On 19.06.23 18:17, Aneesh Kumar K.V wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 09.06.23 08:08, Aneesh Kumar K.V wrote:
>>> Certain devices can possess non-standard memory capacities, not constrained
>>> to multiples of 1GB. Provide a kernel parameter so that we can map the
>>> device memory completely on memory hotplug.
>>
>> So, the unfortunate thing is that these devices would have worked out of
>> the box before the memory block size was increased from 256 MiB to 1 GiB
>> in these setups. Now, one has to fine-tune the memory block size. The
>> only other arch that I know, which supports setting the memory block
>> size, is x86 for special (large) UV systems -- and at least in the past
>> 128 MiB vs. 2 GiB memory blocks made a performance difference during
>> boot (maybe no longer today, who knows).
>>
>>
>> Obviously, less tunable and getting stuff simply working out of the box
>> is preferable.
>>
>> Two questions:
>>
>> 1) Isn't there a way to improve auto-detection to fallback to 256 MiB in
>> these setups, to avoid specifying these parameters?
> 
> The patch does try to detect as much as possible by looking at device tree
> nodes and aperture window size. But there are still cases where we find
> a memory aperture of size X GB and device driver hotplug X.YGB memory.
> 

Okay, and I assume we can't detect that case easily.

Which interface is that device driver using to hotplug memory? It's 
quite surprising I have to say ...

>>
>> 2) Is the 256 MiB -> 1 GiB memory block size switch really worth it? On
>> x86-64, experiments (with direct map fragmentation) showed that the
>> effective performance boost is pretty insignificant, so I wonder how big
>> the 1 GiB direct map performance improvement is.
> 
> 
> Tarun is running some tests to evaluate the impact. We used to use 1GiB
> mapping always. This was later switched to use memory block size to fix
> issues with memory unplug
> commit af9d00e93a4f ("powerpc/mm/radix: Create separate mappings for hot-plugged memory")
> explains some details related to that change.
> 

IIUC, that commit (conditionally) increased the memory block size to 
avoid the splitting, correct? By that, it broke the device driver use case.

> 
>>
>>
>> I guess the only real issue with 256 MiB memory blocks and 1 GiB direct
>> mapping is memory unplug of boot memory: when unplugging a 256 MiB
>> block, one would have to remap the 1 GiB range using 2 MiB ranges.
> 
>>
>> ... I was wondering what would happen if you simply leave the direct
>> mapping in this corner case in place instead of doing this remapping.
>> IOW, remove the memory but keep the direct map pointing at the removed
>> memory. Nobody should be touching it, or are there any cases where that
>> could hurt?
>>
>>
>> Or is there any other reason why we really want 1 GiB memory blocks
>> instead of to defaulting to 256 MiB the way it used to be?
>>
> 
> The idea we are working towards is to keep the memory block size small

That would be preferable, yes ...

> but map the boot memory using 1G. An unplug request can split that 1G
> mapping later. We could look at the possibility of leaving that mapping
> without splitting. But not sure why we would want to do that if we can
> correctly split things. Right now there is no splitting support in powerpc.

If splitting over-complicates the matter (and well, it will even consume 
more memory), it might at least be worth looking into that. Yes, it's 
cleaner.

I think there is also the option to fail memory offlining (and therefore 
unplug) if we have a 1 GiB mapping and don't want to split. For 
hotplugged memory it would always work to unplug again. aarch64 blocks 
any boot memory from getting unplugged.

But I guess that might break existing use cases (unplug boot memory) on 
ppc64 that rely on ZONE_MOVABLE to have it working with guarantees, 
right? Could be optimized but not sure if that's the best approach.


-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter
  2023-06-19 10:35   ` David Hildenbrand
  2023-06-19 16:17     ` Aneesh Kumar K.V
@ 2023-06-20 12:35     ` Michael Ellerman
  2023-06-20 12:53       ` David Hildenbrand
  1 sibling, 1 reply; 9+ messages in thread
From: Michael Ellerman @ 2023-06-20 12:35 UTC (permalink / raw)
  To: David Hildenbrand, Aneesh Kumar K.V, linuxppc-dev, npiggin,
	christophe.leroy
  Cc: foraker1

David Hildenbrand <david@redhat.com> writes:
> On 09.06.23 08:08, Aneesh Kumar K.V wrote:
>> Certain devices can possess non-standard memory capacities, not constrained
>> to multiples of 1GB. Provide a kernel parameter so that we can map the
>> device memory completely on memory hotplug.
>
> So, the unfortunate thing is that these devices would have worked out of 
> the box before the memory block size was increased from 256 MiB to 1 GiB 
> in these setups. Now, one has to fine-tune the memory block size. The 
> only other arch that I know, which supports setting the memory block 
> size, is x86 for special (large) UV systems -- and at least in the past 
> 128 MiB vs. 2 GiB memory blocks made a performance difference during 
> boot (maybe no longer today, who knows).
>
>
> Obviously, less tunable and getting stuff simply working out of the box 
> is preferable.
>
> Two questions:
>
> 1) Isn't there a way to improve auto-detection to fallback to 256 MiB in 
> these setups, to avoid specifying these parameters?
>
> 2) Is the 256 MiB -> 1 GiB memory block size switch really worth it? On 
> x86-64, experiments (with direct map fragmentation) showed that the 
> effective performance boost is pretty insignificant, so I wonder how big 
> the 1 GiB direct map performance improvement is.

The other issue is simply the number of sysfs entries.

With 64TB of memory and a 256MB block size you end up with ~250,000
directories in /sys/devices/system/memory.

cheers

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter
  2023-06-20 12:35     ` Michael Ellerman
@ 2023-06-20 12:53       ` David Hildenbrand
  0 siblings, 0 replies; 9+ messages in thread
From: David Hildenbrand @ 2023-06-20 12:53 UTC (permalink / raw)
  To: Michael Ellerman, Aneesh Kumar K.V, linuxppc-dev, npiggin,
	christophe.leroy
  Cc: foraker1

On 20.06.23 14:35, Michael Ellerman wrote:
> David Hildenbrand <david@redhat.com> writes:
>> On 09.06.23 08:08, Aneesh Kumar K.V wrote:
>>> Certain devices can possess non-standard memory capacities, not constrained
>>> to multiples of 1GB. Provide a kernel parameter so that we can map the
>>> device memory completely on memory hotplug.
>>
>> So, the unfortunate thing is that these devices would have worked out of
>> the box before the memory block size was increased from 256 MiB to 1 GiB
>> in these setups. Now, one has to fine-tune the memory block size. The
>> only other arch that I know, which supports setting the memory block
>> size, is x86 for special (large) UV systems -- and at least in the past
>> 128 MiB vs. 2 GiB memory blocks made a performance difference during
>> boot (maybe no longer today, who knows).
>>
>>
>> Obviously, less tunable and getting stuff simply working out of the box
>> is preferable.
>>
>> Two questions:
>>
>> 1) Isn't there a way to improve auto-detection to fallback to 256 MiB in
>> these setups, to avoid specifying these parameters?
>>
>> 2) Is the 256 MiB -> 1 GiB memory block size switch really worth it? On
>> x86-64, experiments (with direct map fragmentation) showed that the
>> effective performance boost is pretty insignificant, so I wonder how big
>> the 1 GiB direct map performance improvement is.
> 
> The other issue is simply the number of sysfs entries.
> 
> With 64TB of memory and a 256MB block size you end up with ~250,000
> directories in /sys/devices/system/memory.

Yes, and so far on other archs we only optimize for that for on UV x86 
systems (with a default of 2 GiB). And that was added before we started 
to speed up memory device lookups significantly using a radix tree IIRC.

It's worth noting that there was a discussion on:

(a) not creating these device sysfs entries (when configured on the 
cmdline); often, nobody really ends up using them to online/offline 
memory blocks. Then, the only primary users is lsmem.

(b) exposing logical devices (e.g., a DIMM) taht can only be 
offlined/removed as a whole, instead of their individual memblocks (when 
configured on the cmdline). But for PPC64 that won't help.


But (a) gets more tricky if device drivers (and things like dax/kmem) 
rely on user-space memory onlining/offlining.

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-06-20 12:55 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-09  6:08 [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing Aneesh Kumar K.V
2023-06-09  6:08 ` [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter Aneesh Kumar K.V
2023-06-13 20:06   ` Reza Arbab
2023-06-19 10:35   ` David Hildenbrand
2023-06-19 16:17     ` Aneesh Kumar K.V
2023-06-19 16:28       ` David Hildenbrand
2023-06-20 12:35     ` Michael Ellerman
2023-06-20 12:53       ` David Hildenbrand
2023-06-13 19:53 ` [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing Reza Arbab

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.