All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: linux-mm@kvack.org, akpm@linux-foundation.org
Cc: Wei Xu <weixugc@google.com>, Huang Ying <ying.huang@intel.com>,
	Greg Thelen <gthelen@google.com>, Yang Shi <shy828301@gmail.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Tim C Chen <tim.c.chen@intel.com>,
	Brice Goglin <brice.goglin@gmail.com>,
	Michal Hocko <mhocko@kernel.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Hesham Almatary <hesham.almatary@huawei.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Jonathan Cameron <Jonathan.Cameron@huawei.com>,
	Alistair Popple <apopple@nvidia.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Feng Tang <feng.tang@intel.com>,
	Jagdish Gediya <jvgediya@linux.ibm.com>,
	Baolin Wang <baolin.wang@linux.alibaba.com>,
	David Rientjes <rientjes@google.com>,
	"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Subject: [PATCH v6 09/13] mm/demotion: Add pg_data_t member to track node memory tier details
Date: Fri, 10 Jun 2022 19:22:25 +0530	[thread overview]
Message-ID: <20220610135229.182859-10-aneesh.kumar@linux.ibm.com> (raw)
In-Reply-To: <20220610135229.182859-1-aneesh.kumar@linux.ibm.com>

Also update different helpes to use NODE_DATA()->memtier. Since
node specific memtier can change based on the reassignment of
NUMA node to a different memory tiers, accessing NODE_DATA()->memtier
needs to under an rcu read lock of memory_tier_lock.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/memory-tiers.h |  14 +++++
 include/linux/mmzone.h       |   3 ++
 mm/memory-tiers.c            | 102 ++++++++++++++++++++++++++---------
 3 files changed, 94 insertions(+), 25 deletions(-)

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 52896f5970b7..53f3e4c7cba8 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -6,6 +6,9 @@
 
 #ifdef CONFIG_TIERED_MEMORY
 
+#include <linux/device.h>
+#include <linux/nodemask.h>
+
 #define MEMORY_TIER_HBM_GPU	0
 #define MEMORY_TIER_DRAM	1
 #define MEMORY_TIER_PMEM	2
@@ -18,13 +21,24 @@
 #define MAX_STATIC_MEMORY_TIERS  3
 #define MAX_MEMORY_TIERS  (MAX_STATIC_MEMORY_TIERS + 2)
 
+struct memory_tier {
+	struct list_head list;
+	struct device dev;
+	nodemask_t nodelist;
+	int rank;
+};
+
 extern bool numa_demotion_enabled;
 int node_create_and_set_memory_tier(int node, int tier);
 int next_demotion_node(int node);
 int node_set_memory_tier(int node, int tier);
 int node_get_memory_tier_id(int node);
 int node_reset_memory_tier(int node, int tier);
+struct memory_tier *node_get_memory_tier(int node);
+void node_put_memory_tier(struct memory_tier *memtier);
+
 #else
+
 #define numa_demotion_enabled	false
 static inline int next_demotion_node(int node)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..c4fcfd2b9980 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -928,6 +928,9 @@ typedef struct pglist_data {
 	/* Per-node vmstats */
 	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
 	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
+#ifdef CONFIG_TIERED_MEMORY
+	struct memory_tier *memtier;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index de3b7403ae6f..429aa864edb0 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -1,22 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/types.h>
-#include <linux/device.h>
-#include <linux/nodemask.h>
 #include <linux/slab.h>
 #include <linux/memory-tiers.h>
 #include <linux/random.h>
 #include <linux/memory.h>
 #include <linux/idr.h>
+#include <linux/rcupdate.h>
 
 #include "internal.h"
 
-struct memory_tier {
-	struct list_head list;
-	struct device dev;
-	nodemask_t nodelist;
-	int rank;
-};
-
 struct demotion_nodes {
 	nodemask_t preferred;
 };
@@ -212,13 +204,17 @@ static void unregister_memory_tier(struct memory_tier *memtier)
 
 static struct memory_tier *__node_get_memory_tier(int node)
 {
-	struct memory_tier *memtier;
+	pg_data_t *pgdat;
 
-	list_for_each_entry(memtier, &memory_tiers, list) {
-		if (node_isset(node, memtier->nodelist))
-			return memtier;
-	}
-	return NULL;
+	pgdat = NODE_DATA(node);
+	if (!pgdat)
+		return NULL;
+	/*
+	 * Since we hold memory_tier_lock, we can avoid
+	 * RCU read locks when accessing the details. No
+	 * parallel updates are possible here.
+	 */
+	return pgdat->memtier;
 }
 
 static struct memory_tier *__get_memory_tier_from_id(int id)
@@ -232,6 +228,32 @@ static struct memory_tier *__get_memory_tier_from_id(int id)
 	return NULL;
 }
 
+/*
+ * Called with memory_tier_lock. Hence the device references cannot
+ * be dropped during this function.
+ */
+static void memtier_node_set(int node, struct memory_tier *memtier)
+{
+	pg_data_t *pgdat;
+	struct memory_tier *current_memtier;
+
+	pgdat = NODE_DATA(node);
+	if (!pgdat)
+		return;
+	/*
+	 * Make sure we mark the memtier NULL before we assign the new memory tier
+	 * to the NUMA node. This make sure that anybody looking at NODE_DATA
+	 * finds a NULL memtier or the one which is still valid.
+	 */
+	current_memtier = rcu_dereference(pgdat->memtier);
+	rcu_assign_pointer(pgdat->memtier, NULL);
+	if (current_memtier)
+		node_clear(node, current_memtier->nodelist);
+	synchronize_rcu();
+	node_set(node, memtier->nodelist);
+	rcu_assign_pointer(pgdat->memtier, memtier);
+}
+
 static int __node_create_and_set_memory_tier(int node, int tier)
 {
 	int ret = 0;
@@ -252,7 +274,7 @@ static int __node_create_and_set_memory_tier(int node, int tier)
 			goto out;
 		}
 	}
-	node_set(node, memtier->nodelist);
+	memtier_node_set(node, memtier);
 out:
 	return ret;
 }
@@ -274,12 +296,10 @@ int node_create_and_set_memory_tier(int node, int tier)
 	if (current_tier->dev.id == tier)
 		goto out;
 
-	node_clear(node, current_tier->nodelist);
-
 	ret = __node_create_and_set_memory_tier(node, tier);
 	if (ret) {
 		/* reset it back to older tier */
-		node_set(node, current_tier->nodelist);
+		memtier_node_set(node, current_tier);
 		goto out;
 	}
 
@@ -304,7 +324,7 @@ static int __node_set_memory_tier(int node, int tier)
 		ret = -EINVAL;
 		goto out;
 	}
-	node_set(node, memtier->nodelist);
+	memtier_node_set(node, memtier);
 out:
 	return ret;
 }
@@ -360,12 +380,10 @@ int node_reset_memory_tier(int node, int tier)
 	if (!current_tier || current_tier->dev.id == tier)
 		goto out;
 
-	node_clear(node, current_tier->nodelist);
-
 	ret = __node_set_memory_tier(node, tier);
 	if (ret) {
 		/* reset it back to older tier */
-		node_set(node, current_tier->nodelist);
+		memtier_node_set(node, current_tier);
 		goto out;
 	}
 
@@ -379,6 +397,34 @@ int node_reset_memory_tier(int node, int tier)
 	return ret;
 }
 
+/*
+ * lockless access to memory tier of a NUMA node.
+ */
+struct memory_tier *node_get_memory_tier(int node)
+{
+	pg_data_t *pgdat;
+	struct memory_tier *memtier;
+
+	pgdat = NODE_DATA(node);
+	if (!pgdat)
+		return NULL;
+
+	rcu_read_lock();
+	memtier = rcu_dereference(pgdat->memtier);
+	if (!memtier)
+		goto out;
+
+	get_device(&memtier->dev);
+out:
+	rcu_read_unlock();
+	return memtier;
+}
+
+void node_put_memory_tier(struct memory_tier *memtier)
+{
+	put_device(&memtier->dev);
+}
+
 /**
  * next_demotion_node() - Get the next node in the demotion path
  * @node: The starting node to lookup the next node
@@ -641,7 +687,7 @@ static const struct attribute_group *memory_tier_attr_groups[] = {
 
 static int __init memory_tier_init(void)
 {
-	int ret;
+	int ret, node;
 	struct memory_tier *memtier;
 
 	ret = subsys_system_register(&memory_tier_subsys, memory_tier_attr_groups);
@@ -660,7 +706,13 @@ static int __init memory_tier_init(void)
 		      __func__, PTR_ERR(memtier));
 
 	/* CPU only nodes are not part of memory tiers. */
-	memtier->nodelist = node_states[N_MEMORY];
+	for_each_node_state(node, N_MEMORY) {
+		/*
+		 * Should be safe to do this early in the boot.
+		 */
+		NODE_DATA(node)->memtier = memtier;
+		node_set(node, memtier->nodelist);
+	}
 	migrate_on_reclaim_init();
 
 	return 0;
-- 
2.36.1


  parent reply	other threads:[~2022-06-10 13:55 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-10 13:52 [PATCH v6 00/13] mm/demotion: Memory tiers and demotion Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 01/13] mm/demotion: Add support for explicit memory tiers Aneesh Kumar K.V
2022-06-13  3:22   ` Ying Huang
2022-06-13  3:31     ` Aneesh Kumar K V
2022-06-13  5:30       ` Ying Huang
2022-06-13 13:16         ` Johannes Weiner
2022-06-13 13:28           ` Aneesh Kumar K V
2022-06-14  8:20         ` Aneesh Kumar K.V
2022-06-14 15:13           ` Davidlohr Bueso
2022-06-10 13:52 ` [PATCH v6 02/13] mm/demotion: Move memory demotion related code Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 03/13] mm/demotion: Return error on write to numa_demotion sysfs Aneesh Kumar K.V
2022-06-13  3:26   ` Ying Huang
2022-06-13  3:35     ` Aneesh Kumar K V
2022-06-13  5:33       ` Ying Huang
2022-06-13  5:48         ` Aneesh Kumar K V
2022-06-14  8:40           ` Ying Huang
2022-06-10 13:52 ` [PATCH v6 04/13] mm/demotion/dax/kmem: Set node's memory tier to MEMORY_TIER_PMEM Aneesh Kumar K.V
2022-06-13  6:59   ` Ying Huang
2022-06-13  7:05     ` Aneesh Kumar K V
2022-06-10 13:52 ` [PATCH v6 05/13] mm/demotion: Build demotion targets based on explicit memory tiers Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 06/13] mm/demotion: Expose memory tier details via sysfs Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 07/13] mm/demotion: Add per node memory tier attribute to sysfs Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 08/13] mm/demotion: Add support for memory tier creation from userspace Aneesh Kumar K.V
2022-06-10 13:52 ` Aneesh Kumar K.V [this message]
2022-06-13  7:07   ` [PATCH v6 09/13] mm/demotion: Add pg_data_t member to track node memory tier details Ying Huang
2022-06-10 13:52 ` [PATCH v6 10/13] mm/demotion: Demote pages according to allocation fallback order Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 11/13] mm/demotion: Update node_is_toptier to work with memory tiers Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 12/13] mm/demotion: Add documentation for memory tiering Aneesh Kumar K.V
2022-06-10 13:52 ` [PATCH v6 13/13] mm/demotion: Add sysfs ABI documentation Aneesh Kumar K.V

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220610135229.182859-10-aneesh.kumar@linux.ibm.com \
    --to=aneesh.kumar@linux.ibm.com \
    --cc=Jonathan.Cameron@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=apopple@nvidia.com \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=brice.goglin@gmail.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@intel.com \
    --cc=dave@stgolabs.net \
    --cc=feng.tang@intel.com \
    --cc=gthelen@google.com \
    --cc=hesham.almatary@huawei.com \
    --cc=jvgediya@linux.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=rientjes@google.com \
    --cc=shy828301@gmail.com \
    --cc=tim.c.chen@intel.com \
    --cc=weixugc@google.com \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.