linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: linuxppc-dev@lists.ozlabs.org, mpe@ellerman.id.au
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>,
	Bharata B Rao <bharata@linux.ibm.com>
Subject: [PATCH v3 4/4] powerpc/mm/radix: Create separate mappings for hot-plugged memory
Date: Thu,  9 Jul 2020 18:49:25 +0530	[thread overview]
Message-ID: <20200709131925.922266-5-aneesh.kumar@linux.ibm.com> (raw)
In-Reply-To: <20200709131925.922266-1-aneesh.kumar@linux.ibm.com>

To enable memory unplug without splitting kernel page table
mapping, we force the max mapping size to the LMB size. LMB
size is the unit in which hypervisor will do memory add/remove
operation.

Pseries systems supports max LMB size of 256MB. Hence on pseries,
we now end up mapping memory with 2M page size instead of 1G. To improve
that we want hypervisor to hint the kernel about the hotplug
memory range. That was added that as part of

commit b6eca183e23e ("powerpc/kernel: Enables memory
hot-remove after reboot on pseries guests")

But PowerVM doesn't provide that hint yet. Once we get PowerVM
updated, we can then force the 2M mapping only to hot-pluggable
memory region using memblock_is_hotpluggable(). Till then
let's depend on LMB size for finding the mapping page size
for linear range.

With this change KVM guest will also be doing linear mapping with
2M page size.

The actual TLB benefit of mapping guest page table entries with
hugepage size can only be materialized if the partition scoped
entries are also using the same or higher page size. A guest using
1G hugetlbfs backing guest memory can have a performance impact with
the above change.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  5 ++
 arch/powerpc/mm/book3s64/radix_pgtable.c | 81 ++++++++++++++++++++----
 arch/powerpc/platforms/powernv/setup.c   | 10 ++-
 3 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 5393a535240c..15aae924f41c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -82,6 +82,11 @@ extern unsigned int mmu_pid_bits;
 /* Base PID to allocate from */
 extern unsigned int mmu_base_pid;
 
+/*
+ * memory block size used with radix translation.
+ */
+extern unsigned int __ro_after_init radix_mem_block_size;
+
 #define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
 #define PRTB_ENTRIES	(1ul << mmu_pid_bits)
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index d5a01b9aadc9..bba45fc0b7b2 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/string_helpers.h>
+#include <linux/memory.h>
 
 #include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
@@ -33,6 +34,7 @@
 
 unsigned int mmu_pid_bits;
 unsigned int mmu_base_pid;
+unsigned int radix_mem_block_size __ro_after_init;
 
 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
 			unsigned long region_start, unsigned long region_end)
@@ -265,6 +267,7 @@ static unsigned long next_boundary(unsigned long addr, unsigned long end)
 
 static int __meminit create_physical_mapping(unsigned long start,
 					     unsigned long end,
+					     unsigned long max_mapping_size,
 					     int nid, pgprot_t _prot)
 {
 	unsigned long vaddr, addr, mapping_size = 0;
@@ -278,6 +281,8 @@ static int __meminit create_physical_mapping(unsigned long start,
 		int rc;
 
 		gap = next_boundary(addr, end) - addr;
+		if (gap > max_mapping_size)
+			gap = max_mapping_size;
 		previous_size = mapping_size;
 		prev_exec = exec;
 
@@ -328,8 +333,9 @@ static void __init radix_init_pgtable(void)
 
 	/* We don't support slb for radix */
 	mmu_slb_size = 0;
+
 	/*
-	 * Create the linear mapping, using standard page size for now
+	 * Create the linear mapping
 	 */
 	for_each_memblock(memory, reg) {
 		/*
@@ -345,6 +351,7 @@ static void __init radix_init_pgtable(void)
 
 		WARN_ON(create_physical_mapping(reg->base,
 						reg->base + reg->size,
+						radix_mem_block_size,
 						-1, PAGE_KERNEL));
 	}
 
@@ -485,6 +492,47 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
 	return 1;
 }
 
+static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+					  depth, void *data)
+{
+	unsigned long *mem_block_size = (unsigned long *)data;
+	const __be64 *prop;
+	int len;
+
+	if (depth != 1)
+		return 0;
+
+	if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
+		return 0;
+
+	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
+	if (!prop || len < sizeof(__be64))
+		/*
+		 * Nothing in the device tree
+		 */
+		*mem_block_size = MIN_MEMORY_BLOCK_SIZE;
+	else
+		*mem_block_size = be64_to_cpup(prop);
+	return 1;
+}
+
+static unsigned long radix_memory_block_size(void)
+{
+	unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
+
+	/*
+	 * OPAL firmware feature is set by now. Hence we are ok
+	 * to test OPAL feature.
+	 */
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		mem_block_size = 1UL * 1024 * 1024 * 1024;
+	else
+		of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
+
+	return mem_block_size;
+}
+
+
 void __init radix__early_init_devtree(void)
 {
 	int rc;
@@ -493,17 +541,27 @@ void __init radix__early_init_devtree(void)
 	 * Try to find the available page sizes in the device-tree
 	 */
 	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
-	if (rc != 0)  /* Found */
-		goto found;
+	if (!rc) {
+		/*
+		 * No page size details found in device tree.
+		 * Let's assume we have page 4k and 64k support
+		 */
+		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+
+		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+	}
+
 	/*
-	 * let's assume we have page 4k and 64k support
+	 * Max mapping size used when mapping pages. We don't use
+	 * ppc_md.memory_block_size() here because this get called
+	 * early and we don't have machine probe called yet. Also
+	 * the pseries implementation only check for ibm,lmb-size.
+	 * All hypervisor supporting radix do expose that device
+	 * tree node.
 	 */
-	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
-	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
-
-	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
-	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
-found:
+	radix_mem_block_size = radix_memory_block_size();
 	return;
 }
 
@@ -855,7 +913,8 @@ int __meminit radix__create_section_mapping(unsigned long start,
 		return -1;
 	}
 
-	return create_physical_mapping(__pa(start), __pa(end), nid, prot);
+	return create_physical_mapping(__pa(start), __pa(end),
+				       radix_mem_block_size, nid, prot);
 }
 
 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 3bc188da82ba..7fcb88623081 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -399,7 +399,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static unsigned long pnv_memory_block_size(void)
 {
-	return 256UL * 1024 * 1024;
+	/*
+	 * We map the kernel linear region with 1GB large pages on radix. For
+	 * memory hot unplug to work our memory block size must be at least
+	 * this size.
+	 */
+	if (radix_enabled())
+		return radix_mem_block_size;
+	else
+		return 256UL * 1024 * 1024;
 }
 #endif
 
-- 
2.26.2


  parent reply	other threads:[~2020-07-09 13:32 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-09 13:19 [PATCH v3 0/4] powerpc/mm/radix: Memory unplug fixes Aneesh Kumar K.V
2020-07-09 13:19 ` [PATCH v3 1/4] powerpc/mm/radix: Fix PTE/PMD fragment count for early page table mappings Aneesh Kumar K.V
2020-07-09 13:19 ` [PATCH v3 2/4] powerpc/mm/radix: Free PUD table when freeing pagetable Aneesh Kumar K.V
2020-07-09 13:19 ` [PATCH v3 3/4] powerpc/mm/radix: Remove split_kernel_mapping() Aneesh Kumar K.V
2020-07-09 13:19 ` Aneesh Kumar K.V [this message]
2020-07-16 14:00 ` [PATCH v3 0/4] powerpc/mm/radix: Memory unplug fixes Nathan Lynch
2020-07-21  1:45   ` Michael Ellerman
2020-07-21  3:29     ` Bharata B Rao
2020-07-21 12:25       ` Michael Ellerman
2020-07-22  6:05         ` Bharata B Rao
2020-07-22  7:51           ` David Gibson
2020-07-24 11:52           ` Michael Ellerman
2020-07-24 12:17             ` Bharata B Rao
2020-07-25  7:37             ` David Gibson
2020-07-21  4:42     ` Aneesh Kumar K.V
2020-07-24 13:24 ` Michael Ellerman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200709131925.922266-5-aneesh.kumar@linux.ibm.com \
    --to=aneesh.kumar@linux.ibm.com \
    --cc=bharata@linux.ibm.com \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=mpe@ellerman.id.au \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).