* [PATCH v3 1/6] ACPI: NUMA: Up-level "map to online node" functionality
2020-01-20 19:02 [PATCH v3 0/6] Memory Hierarchy: Enable target node lookups for reserved memory Dan Williams
@ 2020-01-20 19:02 ` Dan Williams
2020-01-20 19:02 ` [PATCH v3 2/6] mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node() Dan Williams
` (4 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2020-01-20 19:02 UTC (permalink / raw)
To: tglx, mingo
Cc: Michal Hocko, Rafael J. Wysocki, peterz, dave.hansen, hch,
linux-kernel, linux-nvdimm, x86
The acpi_map_pxm_to_online_node() helper is used to find the closest
online node to a given proximity domain. This is used to map devices in
a proximity domain with no online memory or cpus to the closest online
node and populate a device's 'numa_node' property. The numa_node
property allows applications to be migrated "close" to a resource.
In preparation for providing a generic facility to optionally map an
address range to its closest online node, or the node the range would
represent were it to be onlined (target_node), up-level the core of
acpi_map_pxm_to_online_node() to a generic mm/numa helper.
Cc: Michal Hocko <mhocko@suse.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
drivers/acpi/numa/srat.c | 41 -----------------------------------------
include/linux/acpi.h | 23 ++++++++++++++++++++++-
include/linux/numa.h | 9 +++++++++
mm/mempolicy.c | 30 ++++++++++++++++++++++++++++++
4 files changed, 61 insertions(+), 42 deletions(-)
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index eadbf90e65d1..47b4969d9b93 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -72,47 +72,6 @@ int acpi_map_pxm_to_node(int pxm)
}
EXPORT_SYMBOL(acpi_map_pxm_to_node);
-/**
- * acpi_map_pxm_to_online_node - Map proximity ID to online node
- * @pxm: ACPI proximity ID
- *
- * This is similar to acpi_map_pxm_to_node(), but always returns an online
- * node. When the mapped node from a given proximity ID is offline, it
- * looks up the node distance table and returns the nearest online node.
- *
- * ACPI device drivers, which are called after the NUMA initialization has
- * completed in the kernel, can call this interface to obtain their device
- * NUMA topology from ACPI tables. Such drivers do not have to deal with
- * offline nodes. A node may be offline when a device proximity ID is
- * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
- * "numa=off" on x86.
- */
-int acpi_map_pxm_to_online_node(int pxm)
-{
- int node, min_node;
-
- node = acpi_map_pxm_to_node(pxm);
-
- if (node == NUMA_NO_NODE)
- node = 0;
-
- min_node = node;
- if (!node_online(node)) {
- int min_dist = INT_MAX, dist, n;
-
- for_each_online_node(n) {
- dist = node_distance(node, n);
- if (dist < min_dist) {
- min_dist = dist;
- min_node = n;
- }
- }
- }
-
- return min_node;
-}
-EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
-
static void __init
acpi_table_print_srat_entry(struct acpi_subtable_header *header)
{
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 0f37a7d5fa77..69b73ecfbee4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -401,9 +401,30 @@ extern void acpi_osi_setup(char *str);
extern bool acpi_osi_is_win8(void);
#ifdef CONFIG_ACPI_NUMA
-int acpi_map_pxm_to_online_node(int pxm);
int acpi_map_pxm_to_node(int pxm);
int acpi_get_node(acpi_handle handle);
+
+/**
+ * acpi_map_pxm_to_online_node - Map proximity ID to online node
+ * @pxm: ACPI proximity ID
+ *
+ * This is similar to acpi_map_pxm_to_node(), but always returns an online
+ * node. When the mapped node from a given proximity ID is offline, it
+ * looks up the node distance table and returns the nearest online node.
+ *
+ * ACPI device drivers, which are called after the NUMA initialization has
+ * completed in the kernel, can call this interface to obtain their device
+ * NUMA topology from ACPI tables. Such drivers do not have to deal with
+ * offline nodes. A node may be offline when a device proximity ID is
+ * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
+ * "numa=off" on x86.
+ */
+static inline int acpi_map_pxm_to_online_node(int pxm)
+{
+ int node = acpi_map_pxm_to_node(pxm);
+
+ return numa_map_to_online_node(node);
+}
#else
static inline int acpi_map_pxm_to_online_node(int pxm)
{
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 110b0e5d0fb0..20f4e44b186c 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -13,4 +13,13 @@
#define NUMA_NO_NODE (-1)
+#ifdef CONFIG_NUMA
+int numa_map_to_online_node(int node);
+#else
+static inline int numa_map_to_online_node(int node)
+{
+ return NUMA_NO_NODE;
+}
+#endif
+
#endif /* _LINUX_NUMA_H */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 067cf7d3daf5..4cff069279f6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -127,6 +127,36 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @node is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+ int min_node;
+
+ if (node == NUMA_NO_NODE)
+ node = 0;
+
+ min_node = node;
+ if (!node_online(node)) {
+ int min_dist = INT_MAX, dist, n;
+
+ for_each_online_node(n) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v3 2/6] mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node()
2020-01-20 19:02 [PATCH v3 0/6] Memory Hierarchy: Enable target node lookups for reserved memory Dan Williams
2020-01-20 19:02 ` [PATCH v3 1/6] ACPI: NUMA: Up-level "map to online node" functionality Dan Williams
@ 2020-01-20 19:02 ` Dan Williams
2020-01-21 1:36 ` Aneesh Kumar K.V
2020-01-20 19:03 ` [PATCH v3 3/6] powerpc/papr_scm: Switch to numa_map_to_online_node() Dan Williams
` (3 subsequent siblings)
5 siblings, 1 reply; 9+ messages in thread
From: Dan Williams @ 2020-01-20 19:02 UTC (permalink / raw)
To: tglx, mingo
Cc: Aneesh Kumar K.V, peterz, dave.hansen, hch, linux-kernel,
linux-nvdimm, x86
Update numa_map_to_online_node() to stop falling back to numa node 0
when the input is NUMA_NO_NODE. Also, skip the lookup if @node is
online. This makes the routine compatible with other arch node mapping
routines.
Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Link: https://lore.kernel.org/r/157401275716.43284.13185549705765009174.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
mm/mempolicy.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4cff069279f6..30d76db718bf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -137,8 +137,8 @@ int numa_map_to_online_node(int node)
{
int min_node;
- if (node == NUMA_NO_NODE)
- node = 0;
+ if (node == NUMA_NO_NODE || node_online(node))
+ return node;
min_node = node;
if (!node_online(node)) {
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH v3 2/6] mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node()
2020-01-20 19:02 ` [PATCH v3 2/6] mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node() Dan Williams
@ 2020-01-21 1:36 ` Aneesh Kumar K.V
2020-01-21 3:09 ` Dan Williams
0 siblings, 1 reply; 9+ messages in thread
From: Aneesh Kumar K.V @ 2020-01-21 1:36 UTC (permalink / raw)
To: Dan Williams, tglx, mingo
Cc: peterz, dave.hansen, hch, linux-kernel, linux-nvdimm, x86
Dan Williams <dan.j.williams@intel.com> writes:
> Update numa_map_to_online_node() to stop falling back to numa node 0
> when the input is NUMA_NO_NODE. Also, skip the lookup if @node is
> online. This makes the routine compatible with other arch node mapping
> routines.
>
> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Link: https://lore.kernel.org/r/157401275716.43284.13185549705765009174.stgit@dwillia2-desk3.amr.corp.intel.com
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
> mm/mempolicy.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 4cff069279f6..30d76db718bf 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -137,8 +137,8 @@ int numa_map_to_online_node(int node)
> {
> int min_node;
>
> - if (node == NUMA_NO_NODE)
> - node = 0;
> + if (node == NUMA_NO_NODE || node_online(node))
> + return node;
>
> min_node = node;
> if (!node_online(node)) {
The above if condition will always be true?
-aneesh
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3 2/6] mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node()
2020-01-21 1:36 ` Aneesh Kumar K.V
@ 2020-01-21 3:09 ` Dan Williams
0 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2020-01-21 3:09 UTC (permalink / raw)
To: Aneesh Kumar K.V
Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Dave Hansen,
Christoph Hellwig, Linux Kernel Mailing List, linux-nvdimm,
X86 ML
On Mon, Jan 20, 2020 at 5:36 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Dan Williams <dan.j.williams@intel.com> writes:
>
> > Update numa_map_to_online_node() to stop falling back to numa node 0
> > when the input is NUMA_NO_NODE. Also, skip the lookup if @node is
> > online. This makes the routine compatible with other arch node mapping
> > routines.
> >
> > Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > Link: https://lore.kernel.org/r/157401275716.43284.13185549705765009174.stgit@dwillia2-desk3.amr.corp.intel.com
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > ---
> > mm/mempolicy.c | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index 4cff069279f6..30d76db718bf 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -137,8 +137,8 @@ int numa_map_to_online_node(int node)
> > {
> > int min_node;
> >
> > - if (node == NUMA_NO_NODE)
> > - node = 0;
> > + if (node == NUMA_NO_NODE || node_online(node))
> > + return node;
> >
> > min_node = node;
> > if (!node_online(node)) {
>
>
> The above if condition will always be true?
No, not for the node_offline case, and that's typically what callers
are passing.
>
> -aneesh
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v3 3/6] powerpc/papr_scm: Switch to numa_map_to_online_node()
2020-01-20 19:02 [PATCH v3 0/6] Memory Hierarchy: Enable target node lookups for reserved memory Dan Williams
2020-01-20 19:02 ` [PATCH v3 1/6] ACPI: NUMA: Up-level "map to online node" functionality Dan Williams
2020-01-20 19:02 ` [PATCH v3 2/6] mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node() Dan Williams
@ 2020-01-20 19:03 ` Dan Williams
2020-01-20 19:03 ` [PATCH v3 4/6] x86/mm: Introduce CONFIG_KEEP_NUMA Dan Williams
` (2 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2020-01-20 19:03 UTC (permalink / raw)
To: tglx, mingo
Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Aneesh Kumar K.V, peterz, dave.hansen, hch, linux-kernel,
linux-nvdimm, x86
Now that the core exports numa_map_to_online_node(), switch to that
instead of the locally coded duplicate.
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Reported-by: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Link: https://lore.kernel.org/r/157401276263.43284.12616818803654229788.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
arch/powerpc/platforms/pseries/papr_scm.c | 21 +--------------------
1 file changed, 1 insertion(+), 20 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index c2ef320ba1bf..057ed703e882 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -284,25 +284,6 @@ int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
return 0;
}
-static inline int papr_scm_node(int node)
-{
- int min_dist = INT_MAX, dist;
- int nid, min_node;
-
- if ((node == NUMA_NO_NODE) || node_online(node))
- return node;
-
- min_node = first_online_node;
- for_each_online_node(nid) {
- dist = node_distance(node, nid);
- if (dist < min_dist) {
- min_dist = dist;
- min_node = nid;
- }
- }
- return min_node;
-}
-
static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
{
struct device *dev = &p->pdev->dev;
@@ -347,7 +328,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
memset(&ndr_desc, 0, sizeof(ndr_desc));
target_nid = dev_to_node(&p->pdev->dev);
- online_nid = papr_scm_node(target_nid);
+ online_nid = numa_map_to_online_node(target_nid);
ndr_desc.numa_node = online_nid;
ndr_desc.target_node = target_nid;
ndr_desc.res = &p->res;
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v3 4/6] x86/mm: Introduce CONFIG_KEEP_NUMA
2020-01-20 19:02 [PATCH v3 0/6] Memory Hierarchy: Enable target node lookups for reserved memory Dan Williams
` (2 preceding siblings ...)
2020-01-20 19:03 ` [PATCH v3 3/6] powerpc/papr_scm: Switch to numa_map_to_online_node() Dan Williams
@ 2020-01-20 19:03 ` Dan Williams
2020-01-20 19:03 ` [PATCH v3 5/6] x86/numa: Provide a range-to-target_node lookup facility Dan Williams
2020-01-20 19:03 ` [PATCH v3 6/6] libnvdimm/e820: Retrieve and populate correct 'target_node' info Dan Williams
5 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2020-01-20 19:03 UTC (permalink / raw)
To: tglx, mingo
Cc: Dave Hansen, Andy Lutomirski, Peter Zijlstra, Borislav Petkov,
H. Peter Anvin, x86, Andrew Morton, David Hildenbrand,
Michal Hocko, hch, linux-kernel, linux-nvdimm
Currently x86 numa_meminfo is marked __initdata in the
CONFIG_MEMORY_HOTPLUG=n case. In support of a new facility to allow
drivers to map reserved memory to a 'target_node'
(phys_to_target_node()), add support for removing the __initdata
designation for those users. Both memory hotplug and
phys_to_target_node() users select CONFIG_KEEP_NUMA to tell the arch to
maintain its physical address to numa mapping infrastructure post init.
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
arch/x86/mm/numa.c | 6 +-----
include/linux/numa.h | 6 ++++++
mm/Kconfig | 5 +++++
3 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 99f7a68738f0..5289d9d6799a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -25,11 +25,7 @@ nodemask_t numa_nodes_parsed __initdata;
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
-static struct numa_meminfo numa_meminfo
-#ifndef CONFIG_MEMORY_HOTPLUG
-__initdata
-#endif
-;
+static struct numa_meminfo numa_meminfo __initdata_numa;
static int numa_distance_cnt;
static u8 *numa_distance;
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 20f4e44b186c..c005ed6b807b 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -13,6 +13,12 @@
#define NUMA_NO_NODE (-1)
+#ifdef CONFIG_KEEP_NUMA
+#define __initdata_numa
+#else
+#define __initdata_numa __initdata
+#endif
+
#ifdef CONFIG_NUMA
int numa_map_to_online_node(int node);
#else
diff --git a/mm/Kconfig b/mm/Kconfig
index ab80933be65f..001f1185eadf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -139,6 +139,10 @@ config HAVE_FAST_GUP
config ARCH_KEEP_MEMBLOCK
bool
+# Keep arch numa mapping infrastructure post-init.
+config KEEP_NUMA
+ bool
+
config MEMORY_ISOLATION
bool
@@ -154,6 +158,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ select KEEP_NUMA if NUMA
config MEMORY_HOTPLUG_SPARSE
def_bool y
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v3 5/6] x86/numa: Provide a range-to-target_node lookup facility
2020-01-20 19:02 [PATCH v3 0/6] Memory Hierarchy: Enable target node lookups for reserved memory Dan Williams
` (3 preceding siblings ...)
2020-01-20 19:03 ` [PATCH v3 4/6] x86/mm: Introduce CONFIG_KEEP_NUMA Dan Williams
@ 2020-01-20 19:03 ` Dan Williams
2020-01-20 19:03 ` [PATCH v3 6/6] libnvdimm/e820: Retrieve and populate correct 'target_node' info Dan Williams
5 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2020-01-20 19:03 UTC (permalink / raw)
To: tglx, mingo
Cc: Dave Hansen, Andy Lutomirski, Peter Zijlstra, Borislav Petkov,
H. Peter Anvin, x86, Andrew Morton, David Hildenbrand,
Michal Hocko, kbuild test robot, hch, linux-kernel, linux-nvdimm
The DEV_DAX_KMEM facility is a generic mechanism to allow device-dax
instances, fronting performance-differentiated-memory like pmem, to be
added to the System RAM pool. The numa node for that hot-added memory is
derived from the device-dax instance's 'target_node' attribute.
Recall that the 'target_node' is the ACPI-PXM-to-node translation for
memory when it comes online whereas the 'numa_node' attribute of the
device represents the closest online cpu node.
Presently useful target_node information from the ACPI SRAT is discarded
with the expectation that "Reserved" memory will never be onlined. Now
that DEV_DAX_KMEM violates that assumption, there is a need to retain the
translation. Move, rather than discard, numa_memblk data to a secondary
array that phys_to_target_node() may consider at a later point in time.
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
arch/x86/mm/numa.c | 68 +++++++++++++++++++++++++++++++++++++++++++-------
include/linux/numa.h | 8 +++++-
mm/mempolicy.c | 5 ++++
3 files changed, 70 insertions(+), 11 deletions(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 5289d9d6799a..f2c8fca36f28 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,6 +26,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
static struct numa_meminfo numa_meminfo __initdata_numa;
+static struct numa_meminfo numa_reserved_meminfo __initdata_numa;
static int numa_distance_cnt;
static u8 *numa_distance;
@@ -164,6 +165,26 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}
+/**
+ * numa_move_memblk - Move one numa_memblk from one numa_meminfo to another
+ * @dst: numa_meminfo to move block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ *
+ * If @dst is non-NULL add it at the @dst->nr_blks index and increment
+ * @dst->nr_blks, then remove it from @src.
+ */
+static void __init numa_move_memblk(struct numa_meminfo *dst, int idx,
+ struct numa_meminfo *src)
+{
+ if (dst) {
+ memcpy(&dst->blk[dst->nr_blks], &src->blk[idx],
+ sizeof(struct numa_memblk));
+ dst->nr_blks++;
+ }
+ numa_remove_memblk_from(idx, src);
+}
+
/**
* numa_add_memblk - Add one numa_memblk to numa_meminfo
* @nid: NUMA node ID of the new memblk
@@ -233,14 +254,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
- /* make sure all blocks are inside the limits */
+ /* move / save reserved memory ranges */
+ if (!memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start)) {
+ numa_move_memblk(&numa_reserved_meminfo, i--, mi);
+ continue;
+ }
+
+ /* make sure all non-reserved blocks are inside the limits */
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
- /* and there's no empty or non-exist block */
- if (bi->start >= bi->end ||
- !memblock_overlaps_region(&memblock.memory,
- bi->start, bi->end - bi->start))
+ /* and there's no empty block */
+ if (bi->start >= bi->end)
numa_remove_memblk_from(i--, mi);
}
@@ -877,16 +903,38 @@ EXPORT_SYMBOL(cpumask_of_node);
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 start)
+#ifdef CONFIG_KEEP_NUMA
+static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
- struct numa_meminfo *mi = &numa_meminfo;
- int nid = mi->blk[0].nid;
int i;
for (i = 0; i < mi->nr_blks; i++)
if (mi->blk[i].start <= start && mi->blk[i].end > start)
- nid = mi->blk[i].nid;
+ return mi->blk[i].nid;
+ return NUMA_NO_NODE;
+}
+
+int phys_to_target_node(phys_addr_t start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ /*
+ * Prefer online nodes, but if reserved memory might be
+ * hot-added continue the search with reserved ranges.
+ */
+ if (nid != NUMA_NO_NODE)
+ return nid;
+
+ return meminfo_to_nid(&numa_reserved_meminfo, start);
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+
+int memory_add_physaddr_to_nid(u64 start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_meminfo.blk[0].nid;
return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
diff --git a/include/linux/numa.h b/include/linux/numa.h
index c005ed6b807b..cad0ab165619 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NUMA_H
#define _LINUX_NUMA_H
-
+#include <linux/types.h>
#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT CONFIG_NODES_SHIFT
@@ -21,11 +21,17 @@
#ifdef CONFIG_NUMA
int numa_map_to_online_node(int node);
+int phys_to_target_node(phys_addr_t addr);
#else
static inline int numa_map_to_online_node(int node)
{
return NUMA_NO_NODE;
}
+
+static inline int phys_to_target_node(phys_addr_t addr)
+{
+ return NUMA_NO_NODE;
+}
#endif
#endif /* _LINUX_NUMA_H */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 30d76db718bf..c564b77decf5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3015,3 +3015,8 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+__weak int phys_to_target_node(phys_addr_t addr)
+{
+ return NUMA_NO_NODE;
+}
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v3 6/6] libnvdimm/e820: Retrieve and populate correct 'target_node' info
2020-01-20 19:02 [PATCH v3 0/6] Memory Hierarchy: Enable target node lookups for reserved memory Dan Williams
` (4 preceding siblings ...)
2020-01-20 19:03 ` [PATCH v3 5/6] x86/numa: Provide a range-to-target_node lookup facility Dan Williams
@ 2020-01-20 19:03 ` Dan Williams
5 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2020-01-20 19:03 UTC (permalink / raw)
To: tglx, mingo
Cc: Dave Hansen, Andy Lutomirski, Peter Zijlstra, Andrew Morton,
David Hildenbrand, Michal Hocko, Christoph Hellwig, linux-kernel,
linux-nvdimm, x86
Use the new phys_to_target_node() and numa_map_to_online_node() helpers
to retrieve the correct id for the 'numa_node' ("local" / online
initiator node) and 'target_node' (offline target memory node) sysfs
attributes.
Below is an example from a 4 numa node system where all the memory on
node2 is pmem / reserved. It should be noted that with the arrival of
the ACPI HMAT table and EFI Specific Purpose Memory the kernel will
start to see more platforms with reserved / performance differentiated
memory in its own numa node. Hence all the stakeholders on the Cc for
what is ostensibly a libnvdimm local patch.
=== Before ===
/* Notice no online memory on node2 at start */
# numactl --hardware
available: 3 nodes (0-1,3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
node 0 size: 3958 MB
node 0 free: 3708 MB
node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
node 1 size: 4027 MB
node 1 free: 3871 MB
node 3 cpus:
node 3 size: 3994 MB
node 3 free: 3971 MB
node distances:
node 0 1 3
0: 10 21 21
1: 21 10 21
3: 21 21 10
/*
* Put the pmem namespace into devdax mode so it can be assigned to the
* kmem driver
*/
# ndctl create-namespace -e namespace0.0 -m devdax -f
{
"dev":"namespace0.0",
"mode":"devdax",
"map":"dev",
"size":"3.94 GiB (4.23 GB)",
"uuid":"1650af9b-9ba3-4704-acd6-10178399d9a3",
[..]
}
/* Online Persistent Memory as System RAM */
# daxctl reconfigure-device --mode=system-ram dax0.0
libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success
libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success
libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success
libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success
[
{
"chardev":"dax0.0",
"size":4225761280,
"target_node":0,
"mode":"system-ram"
}
]
reconfigured 1 device
/* Note that the memory is onlined by default to the wrong node, node0 */
# numactl --hardware
available: 3 nodes (0-1,3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
node 0 size: 7926 MB
node 0 free: 7655 MB
node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
node 1 size: 4027 MB
node 1 free: 3871 MB
node 3 cpus:
node 3 size: 3994 MB
node 3 free: 3971 MB
node distances:
node 0 1 3
0: 10 21 21
1: 21 10 21
3: 21 21 10
=== After ===
/* Notice that the "phys_index" error messages are gone */
# daxctl reconfigure-device --mode=system-ram dax0.0
[
{
"chardev":"dax0.0",
"size":4225761280,
"target_node":2,
"mode":"system-ram"
}
]
reconfigured 1 device
/* Notice that node2 is now correctly populated */
# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
node 0 size: 3958 MB
node 0 free: 3793 MB
node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
node 1 size: 4027 MB
node 1 free: 3851 MB
node 2 cpus:
node 2 size: 3968 MB
node 2 free: 3968 MB
node 3 cpus:
node 3 size: 3994 MB
node 3 free: 3908 MB
node distances:
node 0 1 2 3
0: 10 21 21 21
1: 21 10 21 21
2: 21 21 10 21
3: 21 21 21 10
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
arch/x86/Kconfig | 1 +
drivers/nvdimm/e820.c | 18 ++++--------------
2 files changed, 5 insertions(+), 14 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e8949953660..3a827fe7afd6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1660,6 +1660,7 @@ config X86_PMEM_LEGACY
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
select X86_PMEM_LEGACY_DEVICE
+ select KEEP_NUMA if NUMA
select LIBNVDIMM
help
Treat memory marked using the non-standard e820 type of 12 as used
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index e02f60ad6c99..4cd18be9d0e9 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -7,6 +7,7 @@
#include <linux/memory_hotplug.h>
#include <linux/libnvdimm.h>
#include <linux/module.h>
+#include <linux/numa.h>
static int e820_pmem_remove(struct platform_device *pdev)
{
@@ -16,27 +17,16 @@ static int e820_pmem_remove(struct platform_device *pdev)
return 0;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int e820_range_to_nid(resource_size_t addr)
-{
- return memory_add_physaddr_to_nid(addr);
-}
-#else
-static int e820_range_to_nid(resource_size_t addr)
-{
- return NUMA_NO_NODE;
-}
-#endif
-
static int e820_register_one(struct resource *res, void *data)
{
struct nd_region_desc ndr_desc;
struct nvdimm_bus *nvdimm_bus = data;
+ int nid = phys_to_target_node(res->start);
memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.res = res;
- ndr_desc.numa_node = e820_range_to_nid(res->start);
- ndr_desc.target_node = ndr_desc.numa_node;
+ ndr_desc.numa_node = numa_map_to_online_node(nid);
+ ndr_desc.target_node = nid;
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
return -ENXIO;
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
^ permalink raw reply related [flat|nested] 9+ messages in thread