* [PATCH v4] mm/demotion: Expose memory tier details via sysfs
@ 2022-09-22 10:22 Aneesh Kumar K.V
2022-09-23 8:07 ` Huang, Ying
0 siblings, 1 reply; 4+ messages in thread
From: Aneesh Kumar K.V @ 2022-09-22 10:22 UTC (permalink / raw)
To: linux-mm, akpm
Cc: Wei Xu, Huang Ying, Yang Shi, Davidlohr Bueso, Tim C Chen,
Michal Hocko, Linux Kernel Mailing List, Hesham Almatary,
Dave Hansen, Jonathan Cameron, Alistair Popple, Dan Williams,
Johannes Weiner, jvgediya.oss, Bharata B Rao, Aneesh Kumar K.V
This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
related details can be found. All allocated memory tiers will be listed
there as /sys/devices/virtual/memory_tiering/memory_tierN/
The nodes which are part of a specific memory tier can be listed via
/sys/devices/virtual/memory_tiering/memory_tierN/nodes
A directory hierarchy looks like
:/sys/devices/virtual/memory_tiering$ tree memory_tier4/
memory_tier4/
├── nodes
├── subsystem -> ../../../../bus/memory_tiering
└── uevent
:/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
0,2
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes from v3:
* drop toptier_nodes from sysfs
.../ABI/testing/sysfs-kernel-mm-memory-tiers | 25 ++++
mm/memory-tiers.c | 109 ++++++++++++++----
2 files changed, 112 insertions(+), 22 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
new file mode 100644
index 000000000000..45985e411f13
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
@@ -0,0 +1,25 @@
+What: /sys/devices/virtual/memory_tiering/
+Date: August 2022
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: A collection of all the memory tiers allocated.
+
+ Individual memory tier details are contained in subdirectories
+ named memory_tierN, where N is derived from the tier's abstract distance.
+
+ /sys/devices/virtual/memory_tiering/memory_tierN/
+
+
+What: /sys/devices/virtual/memory_tiering/memory_tierN/
+ /sys/devices/virtual/memory_tiering/memory_tierN/nodes
+Date: August 2022
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Directory with details of a specific memory tier
+
+ This is the directory containing information about a particular
+ memory tier, memory_tierN, where N is derived based on abstract distance.
+
+ A smaller value of N implies a higher (faster) memory tier in the
+ hierarchy.
+
+ nodes: NUMA nodes that are part of this memory tier.
+
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index c82eb0111383..f116b7b6333e 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -19,6 +19,7 @@ struct memory_tier {
* adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
*/
int adistance_start;
+ struct device dev;
/* All the nodes that are part of all the lower memory tiers. */
nodemask_t lower_tier_mask;
};
@@ -36,6 +37,12 @@ static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;
+
+static struct bus_type memory_tier_subsys = {
+ .name = "memory_tiering",
+ .dev_name = "memory_tier",
+};
+
#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
@@ -98,8 +105,63 @@ static int top_tier_adistance;
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
+static inline struct memory_tier *to_memory_tier(struct device *device)
+{
+ return container_of(device, struct memory_tier, dev);
+}
+
+static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+{
+ nodemask_t nodes = NODE_MASK_NONE;
+ struct memory_dev_type *memtype;
+
+ list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+ nodes_or(nodes, nodes, memtype->nodes);
+
+ return nodes;
+}
+
+static void memory_tier_device_release(struct device *dev)
+{
+ struct memory_tier *tier = to_memory_tier(dev);
+ /*
+ * synchronize_rcu in clear_node_memory_tier makes sure
+ * we don't have rcu access to this memory tier.
+ */
+ kfree(tier);
+}
+
+static ssize_t nodes_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int ret;
+ nodemask_t nmask;
+
+ mutex_lock(&memory_tier_lock);
+ nmask = get_memtier_nodemask(to_memory_tier(dev));
+ ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
+ mutex_unlock(&memory_tier_lock);
+ return ret;
+}
+static DEVICE_ATTR_RO(nodes);
+
+static struct attribute *memtier_dev_attrs[] = {
+ &dev_attr_nodes.attr,
+ NULL
+};
+
+static const struct attribute_group memtier_dev_group = {
+ .attrs = memtier_dev_attrs,
+};
+
+static const struct attribute_group *memtier_dev_groups[] = {
+ &memtier_dev_group,
+ NULL
+};
+
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
+ int ret;
bool found_slot = false;
struct memory_tier *memtier, *new_memtier;
int adistance = memtype->adistance;
@@ -123,15 +185,14 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
list_for_each_entry(memtier, &memory_tiers, list) {
if (adistance == memtier->adistance_start) {
- list_add(&memtype->tier_sibiling, &memtier->memory_types);
- return memtier;
+ goto link_memtype;
} else if (adistance < memtier->adistance_start) {
found_slot = true;
break;
}
}
- new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
+ new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
if (!new_memtier)
return ERR_PTR(-ENOMEM);
@@ -142,8 +203,23 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
list_add_tail(&new_memtier->list, &memtier->list);
else
list_add_tail(&new_memtier->list, &memory_tiers);
- list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
- return new_memtier;
+
+ new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
+ new_memtier->dev.bus = &memory_tier_subsys;
+ new_memtier->dev.release = memory_tier_device_release;
+ new_memtier->dev.groups = memtier_dev_groups;
+
+ ret = device_register(&new_memtier->dev);
+ if (ret) {
+ list_del(&new_memtier->list);
+ put_device(&new_memtier->dev);
+ return ERR_PTR(ret);
+ }
+ memtier = new_memtier;
+
+link_memtype:
+ list_add(&memtype->tier_sibiling, &memtier->memory_types);
+ return memtier;
}
static struct memory_tier *__node_get_memory_tier(int node)
@@ -275,17 +351,6 @@ static void disable_all_demotion_targets(void)
synchronize_rcu();
}
-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
-{
- nodemask_t nodes = NODE_MASK_NONE;
- struct memory_dev_type *memtype;
-
- list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
- nodes_or(nodes, nodes, memtype->nodes);
-
- return nodes;
-}
-
/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
@@ -433,11 +498,7 @@ static struct memory_tier *set_node_memory_tier(int node)
static void destroy_memory_tier(struct memory_tier *memtier)
{
list_del(&memtier->list);
- /*
- * synchronize_rcu in clear_node_memory_tier makes sure
- * we don't have rcu access to this memory tier.
- */
- kfree(memtier);
+ device_unregister(&memtier->dev);
}
static bool clear_node_memory_tier(int node)
@@ -566,9 +627,13 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
static int __init memory_tier_init(void)
{
- int node;
+ int ret, node;
struct memory_tier *memtier;
+ ret = subsys_virtual_register(&memory_tier_subsys, NULL);
+ if (ret)
+ panic("%s() failed to register memory tier subsystem\n", __func__);
+
#ifdef CONFIG_MIGRATION
node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
GFP_KERNEL);
--
2.37.3
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH v4] mm/demotion: Expose memory tier details via sysfs
2022-09-22 10:22 [PATCH v4] mm/demotion: Expose memory tier details via sysfs Aneesh Kumar K.V
@ 2022-09-23 8:07 ` Huang, Ying
2022-09-23 10:35 ` Aneesh Kumar K V
0 siblings, 1 reply; 4+ messages in thread
From: Huang, Ying @ 2022-09-23 8:07 UTC (permalink / raw)
To: Aneesh Kumar K.V
Cc: linux-mm, akpm, Wei Xu, Yang Shi, Davidlohr Bueso, Tim C Chen,
Michal Hocko, Linux Kernel Mailing List, Hesham Almatary,
Dave Hansen, Jonathan Cameron, Alistair Popple, Dan Williams,
Johannes Weiner, jvgediya.oss, Bharata B Rao
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
> related details can be found. All allocated memory tiers will be listed
> there as /sys/devices/virtual/memory_tiering/memory_tierN/
>
> The nodes which are part of a specific memory tier can be listed via
> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
It appears that XXXs is used for mask while XXXs_list is used for list?
For example,
# cat /sys/devices/system/cpu/cpu2/topology/core_cpus
0,00100004
# cat /sys/devices/system/cpu/cpu2/topology/core_cpus_list
2,20
It's better to follow this convention?
> A directory hierarchy looks like
> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
> memory_tier4/
> ├── nodes
> ├── subsystem -> ../../../../bus/memory_tiering
> └── uevent
>
> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
> 0,2
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Best Regards,
Huang, Ying
[snip]
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v4] mm/demotion: Expose memory tier details via sysfs
2022-09-23 8:07 ` Huang, Ying
@ 2022-09-23 10:35 ` Aneesh Kumar K V
2022-09-26 1:04 ` Huang, Ying
0 siblings, 1 reply; 4+ messages in thread
From: Aneesh Kumar K V @ 2022-09-23 10:35 UTC (permalink / raw)
To: Huang, Ying
Cc: linux-mm, akpm, Wei Xu, Yang Shi, Davidlohr Bueso, Tim C Chen,
Michal Hocko, Linux Kernel Mailing List, Hesham Almatary,
Dave Hansen, Jonathan Cameron, Alistair Popple, Dan Williams,
Johannes Weiner, jvgediya.oss, Bharata B Rao
On 9/23/22 1:37 PM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>
>> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
>> related details can be found. All allocated memory tiers will be listed
>> there as /sys/devices/virtual/memory_tiering/memory_tierN/
>>
>> The nodes which are part of a specific memory tier can be listed via
>> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>
> It appears that XXXs is used for mask while XXXs_list is used for list?
> For example,
>
> # cat /sys/devices/system/cpu/cpu2/topology/core_cpus
> 0,00100004
> # cat /sys/devices/system/cpu/cpu2/topology/core_cpus_list
> 2,20
>
> It's better to follow this convention?
>
That is not followed in other parts of the kernel. I was looking at cpuset
$cat cpuset.cpus.effective
0-7
>> A directory hierarchy looks like
>> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
>> memory_tier4/
>> ├── nodes
>> ├── subsystem -> ../../../../bus/memory_tiering
>> └── uevent
>>
>> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
>> 0,2
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>
> Best Regards,
> Huang, Ying
>
> [snip]
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v4] mm/demotion: Expose memory tier details via sysfs
2022-09-23 10:35 ` Aneesh Kumar K V
@ 2022-09-26 1:04 ` Huang, Ying
0 siblings, 0 replies; 4+ messages in thread
From: Huang, Ying @ 2022-09-26 1:04 UTC (permalink / raw)
To: Aneesh Kumar K V
Cc: linux-mm, akpm, Wei Xu, Yang Shi, Davidlohr Bueso, Tim C Chen,
Michal Hocko, Linux Kernel Mailing List, Hesham Almatary,
Dave Hansen, Jonathan Cameron, Alistair Popple, Dan Williams,
Johannes Weiner, jvgediya.oss, Bharata B Rao
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
> On 9/23/22 1:37 PM, Huang, Ying wrote:
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>
>>> This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
>>> related details can be found. All allocated memory tiers will be listed
>>> there as /sys/devices/virtual/memory_tiering/memory_tierN/
>>>
>>> The nodes which are part of a specific memory tier can be listed via
>>> /sys/devices/virtual/memory_tiering/memory_tierN/nodes
>>
>> It appears that XXXs is used for mask while XXXs_list is used for list?
>> For example,
>>
>> # cat /sys/devices/system/cpu/cpu2/topology/core_cpus
>> 0,00100004
>> # cat /sys/devices/system/cpu/cpu2/topology/core_cpus_list
>> 2,20
>>
>> It's better to follow this convention?
>>
>
> That is not followed in other parts of the kernel. I was looking at cpuset
>
> $cat cpuset.cpus.effective
> 0-7
Per my understanding, cpuset isn't sysfs, but cgroupfs?
I did some research in my system,
$ grep . $(find /sys/devices | grep 'list$')
and
$ grep . $(find /sys/devices | grep 'cpus$')
I found that the cpus/cpus_list convention is used in
- pci
/sys/devices/pci0000:64/0000:64:0d.2/local_cpulist:0-35
/sys/devices/pci0000:64/0000:64:0c.2/local_cpus:f,ffffffff
- system
/sys/devices/system/cpu/cpu7/topology/core_cpus_list:7,25
/sys/devices/system/cpu/cpu7/topology/core_cpus:0,02000080
- block
/sys/devices/virtual/block/loop1/mq/0/cpu_list:0, 1, 2, ...
- net
/sys/devices/virtual/net/lo/queues/rx-0/rps_cpus:0,00000000
And I haven't found any exception in sysfs of my system. Can you find
some?
Best Regards,
Huang, Ying
>>> A directory hierarchy looks like
>>> :/sys/devices/virtual/memory_tiering$ tree memory_tier4/
>>> memory_tier4/
>>> ├── nodes
>>> ├── subsystem -> ../../../../bus/memory_tiering
>>> └── uevent
>>>
>>> :/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
>>> 0,2
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>
>> Best Regards,
>> Huang, Ying
>>
>> [snip]
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-09-26 1:04 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-22 10:22 [PATCH v4] mm/demotion: Expose memory tier details via sysfs Aneesh Kumar K.V
2022-09-23 8:07 ` Huang, Ying
2022-09-23 10:35 ` Aneesh Kumar K V
2022-09-26 1:04 ` Huang, Ying
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).