dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
From: Weixi Zhu <weixi.zhu@huawei.com>
To: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>,
	<akpm@linux-foundation.org>
Cc: dri-devel@lists.freedesktop.org, leonro@nvidia.com,
	apopple@nvidia.com, amd-gfx@lists.freedesktop.org,
	mgorman@suse.de, ziy@nvidia.com, zhi.a.wang@intel.com,
	rcampbell@nvidia.com, jgg@nvidia.com, weixi.zhu@openeuler.sh,
	jhubbard@nvidia.com, intel-gfx@lists.freedesktop.org,
	mhairgrove@nvidia.com, jglisse@redhat.com,
	rodrigo.vivi@intel.com, intel-gvt-dev@lists.freedesktop.org,
	tvrtko.ursulin@linux.intel.com, Felix.Kuehling@amd.com,
	Xinhui.Pan@amd.com, christian.koenig@amd.com,
	alexander.deucher@amd.com, ogabbay@kernel.org,
	Weixi Zhu <weixi.zhu@huawei.com>
Subject: [RFC PATCH 1/6] mm/gmem: add heterogeneous NUMA node
Date: Tue, 28 Nov 2023 20:50:20 +0800	[thread overview]
Message-ID: <20231128125025.4449-2-weixi.zhu@huawei.com> (raw)
In-Reply-To: <20231128125025.4449-1-weixi.zhu@huawei.com>

This patch adds a new NUMA node state, named N_HETEROGENEOUS. It is used
to identify heterogeneous NUMA (hNUMA) nodes. Note that an hNUMA node
may not be directly accessible by the CPU.

Each hNUMA node can be identified with a NUMA ID. This can be extended to
provide NUMA topology including device-local DRAM, where a cache-coherent
bus does not need to exist between the CPU and device-local DRAM.
Furthermore, this allows an application user to issue memory hints that
bind to specific hNUMA nodes.

Signed-off-by: Weixi Zhu <weixi.zhu@huawei.com>
---
 drivers/base/node.c      |  6 ++++
 include/linux/gmem.h     | 19 ++++++++++
 include/linux/nodemask.h | 10 ++++++
 init/main.c              |  2 ++
 mm/Kconfig               | 14 ++++++++
 mm/Makefile              |  1 +
 mm/gmem.c                | 78 ++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c          |  3 ++
 8 files changed, 133 insertions(+)
 create mode 100644 include/linux/gmem.h
 create mode 100644 mm/gmem.c

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 493d533f8375..aa4d2ca266aa 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -928,6 +928,9 @@ static struct node_attr node_state_attr[] = {
 	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 	[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
 					   N_GENERIC_INITIATOR),
+#ifdef CONFIG_GMEM
+	[N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS),
+#endif
 };
 
 static struct attribute *node_state_attrs[] = {
@@ -940,6 +943,9 @@ static struct attribute *node_state_attrs[] = {
 	&node_state_attr[N_MEMORY].attr.attr,
 	&node_state_attr[N_CPU].attr.attr,
 	&node_state_attr[N_GENERIC_INITIATOR].attr.attr,
+#ifdef CONFIG_GMEM
+	&node_state_attr[N_HETEROGENEOUS].attr.attr,
+#endif
 	NULL
 };
 
diff --git a/include/linux/gmem.h b/include/linux/gmem.h
new file mode 100644
index 000000000000..fff877873557
--- /dev/null
+++ b/include/linux/gmem.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generalized Memory Management.
+ *
+ * Copyright (C) 2023- Huawei, Inc.
+ * Author: Weixi Zhu
+ *
+ */
+#ifndef _GMEM_H
+#define _GMEM_H
+
+#ifdef CONFIG_GMEM
+/* h-NUMA topology */
+void __init hnuma_init(void);
+#else
+static inline void hnuma_init(void) {}
+#endif
+
+#endif /* _GMEM_H */
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 8d07116caaf1..66e4640a52ba 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -407,6 +407,9 @@ enum node_states {
 	N_MEMORY,		/* The node has memory(regular, high, movable) */
 	N_CPU,		/* The node has one or more cpus */
 	N_GENERIC_INITIATOR,	/* The node has one or more Generic Initiators */
+#ifdef CONFIG_GMEM
+	N_HETEROGENEOUS,	/* The node has heterogeneous memory */
+#endif
 	NR_NODE_STATES
 };
 
@@ -536,6 +539,13 @@ static inline int node_random(const nodemask_t *maskp)
 #define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)
 #define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
 
+#ifdef CONFIG_GMEM
+/* For h-NUMA topology */
+#define hnode_map		node_states[N_HETEROGENEOUS]
+#define num_hnodes()		num_node_state(N_HETEROGENEOUS)
+#define for_each_hnode(node)	for_each_node_state(node, N_HETEROGENEOUS)
+#endif
+
 /*
  * For nodemask scratch area.
  * NODEMASK_ALLOC(type, name) allocates an object with a specified type and
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..12dfb5b63d51 100644
--- a/init/main.c
+++ b/init/main.c
@@ -100,6 +100,7 @@
 #include <linux/stackdepot.h>
 #include <linux/randomize_kstack.h>
 #include <net/net_namespace.h>
+#include <linux/gmem.h>
 
 #include <asm/io.h>
 #include <asm/setup.h>
@@ -901,6 +902,7 @@ void start_kernel(void)
 	setup_per_cpu_areas();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 	boot_cpu_hotplug_init();
+	hnuma_init();
 
 	pr_notice("Kernel command line: %s\n", saved_command_line);
 	/* parameters may set static keys */
diff --git a/mm/Kconfig b/mm/Kconfig
index 89971a894b60..1a7d8194513c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1270,6 +1270,20 @@ config LOCK_MM_AND_FIND_VMA
 	bool
 	depends on !STACK_GROWSUP
 
+config GMEM
+	bool "generalized memory management for external memory devices"
+	depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE
+	select ARCH_USES_HIGH_VMA_FLAGS
+	default y
+	help
+	  Support GMEM (generalized memory management) for external memory
+	  devices.
+
+	  GMEM extends Linux MM to share its machine-independent MM code. Only
+	  a high-level interface is provided for device drivers. This prevents
+	  accelerator drivers from reinventing the wheel, but relies on drivers to
+	  implement the hardware-dependent functions declared by GMEM.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 33873c8aedb3..f48ea2eb4a44 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_GMEM) += gmem.o
diff --git a/mm/gmem.c b/mm/gmem.c
new file mode 100644
index 000000000000..767eb070b22e
--- /dev/null
+++ b/mm/gmem.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generalized Memory Management.
+ *
+ * Copyright (C) 2023- Huawei, Inc.
+ * Author: Weixi Zhu
+ *
+ */
+#include <linux/mm.h>
+#include <linux/gmem.h>
+
+DEFINE_SPINLOCK(hnode_lock);
+
+struct hnode {
+	unsigned int id;
+	struct gm_dev *dev;
+	struct xarray pages;
+};
+
+struct hnode *hnodes[MAX_NUMNODES];
+
+static bool is_hnode(int node)
+{
+	return !node_isset(node, node_possible_map) &&
+	       node_isset(node, hnode_map);
+}
+
+static bool is_hnode_allowed(int node)
+{
+	return is_hnode(node) && node_isset(node, current->mems_allowed);
+}
+
+static struct hnode *get_hnode(unsigned int hnid)
+{
+	return hnodes[hnid];
+}
+
+void __init hnuma_init(void)
+{
+	unsigned int node;
+
+	for_each_node(node)
+		node_set(node, hnode_map);
+}
+
+static unsigned int alloc_hnode_id(void)
+{
+	unsigned int node;
+
+	spin_lock(&hnode_lock);
+	node = first_unset_node(hnode_map);
+	node_set(node, hnode_map);
+	spin_unlock(&hnode_lock);
+
+	return node;
+}
+
+static void free_hnode_id(unsigned int nid)
+{
+	node_clear(nid, hnode_map);
+}
+
+static void hnode_init(struct hnode *hnode, unsigned int hnid,
+		       struct gm_dev *dev)
+{
+	hnodes[hnid] = hnode;
+	hnodes[hnid]->id = hnid;
+	hnodes[hnid]->dev = dev;
+	xa_init(&hnodes[hnid]->pages);
+}
+
+static void hnode_deinit(unsigned int hnid, struct gm_dev *dev)
+{
+	hnodes[hnid]->id = 0;
+	hnodes[hnid]->dev = NULL;
+	xa_destroy(&hnodes[hnid]->pages);
+	hnodes[hnid] = NULL;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 733732e7e0ba..a785b62a1542 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -192,6 +192,9 @@ EXPORT_SYMBOL(latent_entropy);
 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 	[N_POSSIBLE] = NODE_MASK_ALL,
 	[N_ONLINE] = { { [0] = 1UL } },
+#ifdef CONFIG_GMEM
+	[N_HETEROGENEOUS] = NODE_MASK_NONE,
+#endif
 #ifndef CONFIG_NUMA
 	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
 #ifdef CONFIG_HIGHMEM
-- 
2.25.1


  reply	other threads:[~2023-11-28 15:55 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-28 12:50 [RFC PATCH 0/6] Supporting GMEM (generalized memory management) for external memory devices Weixi Zhu
2023-11-28 12:50 ` Weixi Zhu [this message]
2023-11-28 12:50 ` [RFC PATCH 2/6] mm/gmem: add arch-independent abstraction to track address mapping status Weixi Zhu
2023-11-29  8:25   ` linear cannon
2023-12-01  9:23   ` David Hildenbrand
2023-12-02 14:50     ` Pedro Falcato
2023-12-04 10:21       ` David Hildenbrand
2023-11-28 12:50 ` [RFC PATCH 3/6] mm/gmem: add GMEM (Generalized Memory Management) interface for external accelerators Weixi Zhu
2023-11-28 12:50 ` [RFC PATCH 4/6] mm/gmem: add new syscall hmadvise() to issue memory hints for heterogeneous NUMA nodes Weixi Zhu
2023-11-28 12:50 ` [RFC PATCH 5/6] mm/gmem: resolve VMA conflicts for attached peer devices Weixi Zhu
2023-11-28 12:50 ` [RFC PATCH 6/6] mm/gmem: extending Linux core MM to support unified virtual address space Weixi Zhu
2023-11-28 13:06 ` [RFC PATCH 0/6] Supporting GMEM (generalized memory management) for external memory devices Christian König
2023-11-29  5:14   ` Dave Airlie
2023-11-29  8:27     ` zhuweixi
2023-11-29 15:22       ` Christian König
2023-11-30  7:22         ` zhuweixi
2023-11-30 13:05           ` Christian König
2023-12-01  2:37             ` zhuweixi
2023-12-01 21:28               ` Philipp Stanner
2023-11-30 14:55       ` David Hildenbrand
2023-12-01  2:44         ` zhuweixi
2023-12-01  9:29           ` David Hildenbrand
2023-11-28 13:09 ` Christian König
2023-11-29 22:23   ` Zeng, Oak
2023-11-30  8:27     ` Christian König
2023-11-30 10:48       ` zhuweixi
2023-12-01  5:48         ` Zeng, Oak
2023-12-01  6:11           ` Alistair Popple
2023-12-01 13:16           ` Christian König
2023-12-03 23:32             ` Alistair Popple
2023-12-04  9:35               ` Christian König
2023-12-01  6:01         ` Alistair Popple

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231128125025.4449-2-weixi.zhu@huawei.com \
    --to=weixi.zhu@huawei.com \
    --cc=Felix.Kuehling@amd.com \
    --cc=Xinhui.Pan@amd.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexander.deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=apopple@nvidia.com \
    --cc=christian.koenig@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=intel-gvt-dev@lists.freedesktop.org \
    --cc=jgg@nvidia.com \
    --cc=jglisse@redhat.com \
    --cc=jhubbard@nvidia.com \
    --cc=leonro@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mhairgrove@nvidia.com \
    --cc=ogabbay@kernel.org \
    --cc=rcampbell@nvidia.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=tvrtko.ursulin@linux.intel.com \
    --cc=weixi.zhu@openeuler.sh \
    --cc=zhi.a.wang@intel.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).