All of lore.kernel.org
 help / color / mirror / Atom feed
From: Aneesh Kumar K V <aneesh.kumar@linux.ibm.com>
To: "Huang, Ying" <ying.huang@intel.com>
Cc: linux-mm@kvack.org, akpm@linux-foundation.org,
	Wei Xu <weixugc@google.com>, Yang Shi <shy828301@gmail.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Tim C Chen <tim.c.chen@intel.com>,
	Michal Hocko <mhocko@kernel.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Hesham Almatary <hesham.almatary@huawei.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Jonathan Cameron <Jonathan.Cameron@huawei.com>,
	Alistair Popple <apopple@nvidia.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	jvgediya.oss@gmail.com
Subject: Re: [PATCH v10 4/8] mm/demotion/dax/kmem: Set node's performance level to MEMTIER_PERF_LEVEL_PMEM
Date: Mon, 25 Jul 2022 12:18:39 +0530	[thread overview]
Message-ID: <adbf4fc8-80a6-3160-3338-ea4e8739cb64@linux.ibm.com> (raw)
In-Reply-To: <874jz5zoi9.fsf@yhuang6-desk2.ccr.corp.intel.com>

On 7/25/22 12:07 PM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> 
>> By default, all nodes are assigned to the default memory tier which
>> is the memory tier designated for nodes with DRAM
>>
>> Set dax kmem device node's tier to slower memory tier by assigning
>> performance level to MEMTIER_PERF_LEVEL_PMEM. PMEM tier
>> appears below the default memory tier in demotion order.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/platforms/pseries/papr_scm.c | 41 ++++++++++++++++++++---
>>  drivers/acpi/nfit/core.c                  | 41 ++++++++++++++++++++++-
>>  2 files changed, 76 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> index 82cae08976bc..3b6164418d6f 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -14,6 +14,8 @@
>>  #include <linux/delay.h>
>>  #include <linux/seq_buf.h>
>>  #include <linux/nd.h>
>> +#include <linux/memory.h>
>> +#include <linux/memory-tiers.h>
>>  
>>  #include <asm/plpar_wrappers.h>
>>  #include <asm/papr_pdsm.h>
>> @@ -98,6 +100,7 @@ struct papr_scm_priv {
>>  	bool hcall_flush_required;
>>  
>>  	uint64_t bound_addr;
>> +	int target_node;
>>  
>>  	struct nvdimm_bus_descriptor bus_desc;
>>  	struct nvdimm_bus *bus;
>> @@ -1278,6 +1281,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	p->bus_desc.module = THIS_MODULE;
>>  	p->bus_desc.of_node = p->pdev->dev.of_node;
>>  	p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL);
>> +	p->target_node = dev_to_node(&p->pdev->dev);
>>  
>>  	/* Set the dimm command family mask to accept PDSMs */
>>  	set_bit(NVDIMM_FAMILY_PAPR, &p->bus_desc.dimm_family_mask);
>> @@ -1322,7 +1326,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
>>  
>>  	memset(&ndr_desc, 0, sizeof(ndr_desc));
>> -	target_nid = dev_to_node(&p->pdev->dev);
>> +	target_nid = p->target_node;
>>  	online_nid = numa_map_to_online_node(target_nid);
>>  	ndr_desc.numa_node = online_nid;
>>  	ndr_desc.target_node = target_nid;
>> @@ -1582,15 +1586,42 @@ static struct platform_driver papr_scm_driver = {
>>  	},
>>  };
>>  
>> +static int papr_scm_callback(struct notifier_block *self,
>> +			     unsigned long action, void *arg)
>> +{
>> +	struct memory_notify *mnb = arg;
>> +	int nid = mnb->status_change_nid;
>> +	struct papr_scm_priv *p;
>> +
>> +	if (nid == NUMA_NO_NODE || action != MEM_ONLINE)
>> +		return NOTIFY_OK;
>> +
>> +	mutex_lock(&papr_ndr_lock);
>> +	list_for_each_entry(p, &papr_nd_regions, region_list) {
>> +		if (p->target_node == nid) {
>> +			node_devices[nid]->perf_level = MEMTIER_PERF_LEVEL_PMEM;
>> +			break;
>> +		}
>> +	}
>> +
>> +	mutex_unlock(&papr_ndr_lock);
>> +	return NOTIFY_OK;
>> +}
>> +
>>  static int __init papr_scm_init(void)
>>  {
>>  	int ret;
>>  
>>  	ret = platform_driver_register(&papr_scm_driver);
>> -	if (!ret)
>> -		mce_register_notifier(&mce_ue_nb);
>> -
>> -	return ret;
>> +	if (ret)
>> +		return ret;
>> +	mce_register_notifier(&mce_ue_nb);
>> +	/*
>> +	 * register a memory hotplug notifier at prio 2 so that we
>> +	 * can update the perf level for the node.
>> +	 */
>> +	hotplug_memory_notifier(papr_scm_callback, MEMTIER_HOTPLUG_PRIO + 1);
>> +	return 0;
>>  }
>>  module_init(papr_scm_init);
>>  
>> diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
>> index ae5f4acf2675..7ea1017ef790 100644
>> --- a/drivers/acpi/nfit/core.c
>> +++ b/drivers/acpi/nfit/core.c
>> @@ -15,6 +15,8 @@
>>  #include <linux/sort.h>
>>  #include <linux/io.h>
>>  #include <linux/nd.h>
>> +#include <linux/memory.h>
>> +#include <linux/memory-tiers.h>
>>  #include <asm/cacheflush.h>
>>  #include <acpi/nfit.h>
>>  #include "intel.h"
>> @@ -3470,6 +3472,39 @@ static struct acpi_driver acpi_nfit_driver = {
>>  	},
>>  };
>>  
>> +static int nfit_callback(struct notifier_block *self,
>> +			 unsigned long action, void *arg)
>> +{
>> +	bool found = false;
>> +	struct memory_notify *mnb = arg;
>> +	int nid = mnb->status_change_nid;
>> +	struct nfit_spa *nfit_spa;
>> +	struct acpi_nfit_desc *acpi_desc;
>> +
>> +	if (nid == NUMA_NO_NODE || action != MEM_ONLINE)
>> +		return NOTIFY_OK;
>> +
>> +	mutex_lock(&acpi_desc_lock);
>> +	list_for_each_entry(acpi_desc, &acpi_descs, list) {
>> +		mutex_lock(&acpi_desc->init_mutex);
>> +		list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
>> +			struct acpi_nfit_system_address *spa = nfit_spa->spa;
>> +			int target_node = pxm_to_node(spa->proximity_domain);
>> +
>> +			if (target_node == nid) {
>> +				node_devices[nid]->perf_level = MEMTIER_PERF_LEVEL_PMEM;
>> +				found = true;
>> +				break;
>> +			}
>> +		}
>> +		mutex_unlock(&acpi_desc->init_mutex);
>> +		if (found)
>> +			break;
>> +	}
>> +	mutex_unlock(&acpi_desc_lock);
>> +	return NOTIFY_OK;
>> +}
>> +
>>  static __init int nfit_init(void)
>>  {
>>  	int ret;
>> @@ -3509,7 +3544,11 @@ static __init int nfit_init(void)
>>  		nfit_mce_unregister();
>>  		destroy_workqueue(nfit_wq);
>>  	}
>> -
>> +	/*
>> +	 * register a memory hotplug notifier at prio 2 so that we
>> +	 * can update the perf level for the node.
>> +	 */
>> +	hotplug_memory_notifier(nfit_callback, MEMTIER_HOTPLUG_PRIO + 1);
>>  	return ret;
>>  
>>  }
> 
> I don't think that it's a good idea to set perf_level of a memory device
> (node) via NFIT only.

> 
> For example, we may prefer HMAT over NFIT when it's available.  So the
> perf_level should be set in dax/kmem.c based on information provided by
> ACPI or other information sources.  ACPI can provide some functions/data
> structures to let drivers (like dax/kmem.c) to query the properties of
> the memory device (node).
> 

I was trying to make it architecture-specific so that we have a placeholder
to fine-tune this better. For example, ppc64 will look at device tree
details to find the performance level and x86 will look at ACPI data structures.
Wouldn't adding that hotplug callback in dax/kmem prevent that
architecture-specific customization?

I would expect that callback to move to the generic ACPI layer so that even
firmware-managed CXL devices can be added to a lower tier.  I don't understand
ACPI well enough to find the right abstraction for that hotplug callback.


> As the simplest first version, this can be just hard coded.
> 

If you are suggesting not using a hotplug callback, one of the challenges is that
node_devices[nid] gets allocated pretty late, when we try to online the node.

> Best Regards,
> Huang, Ying


  reply	other threads:[~2022-07-25  6:49 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-20  2:59 [PATCH v10 0/8] mm/demotion: Memory tiers and demotion Aneesh Kumar K.V
2022-07-20  2:59 ` [PATCH v10 1/8] mm/demotion: Add support for explicit memory tiers Aneesh Kumar K.V
2022-07-26  3:53   ` Huang, Ying
2022-07-26 11:59     ` Aneesh Kumar K V
2022-07-27  1:16       ` Huang, Ying
2022-07-28 17:23         ` Johannes Weiner
2022-07-20  2:59 ` [PATCH v10 2/8] mm/demotion: Move memory demotion related code Aneesh Kumar K.V
2022-07-20  2:59 ` [PATCH v10 3/8] mm/demotion: Add hotplug callbacks to handle new numa node onlined Aneesh Kumar K.V
2022-07-26  4:03   ` Huang, Ying
2022-07-26 12:03     ` Aneesh Kumar K V
2022-07-27  1:53       ` Huang, Ying
2022-07-27  4:38         ` Aneesh Kumar K.V
2022-07-28  6:42           ` Huang, Ying
2022-07-20  2:59 ` [PATCH v10 4/8] mm/demotion/dax/kmem: Set node's performance level to MEMTIER_PERF_LEVEL_PMEM Aneesh Kumar K.V
2022-07-21  6:07   ` kernel test robot
2022-07-25  6:37   ` Huang, Ying
2022-07-25  6:48     ` Aneesh Kumar K V [this message]
2022-07-25  8:35       ` Huang, Ying
2022-07-25  8:42         ` Aneesh Kumar K V
2022-07-26  2:13           ` Huang, Ying
2022-07-27  4:31             ` Aneesh Kumar K.V
2022-07-28  6:39               ` Huang, Ying
2022-07-20  2:59 ` [PATCH v10 5/8] mm/demotion: Build demotion targets based on explicit memory tiers Aneesh Kumar K.V
2022-07-20  3:38   ` Aneesh Kumar K.V
2022-07-21  0:02   ` kernel test robot
2022-07-26  7:44   ` Huang, Ying
2022-07-26 12:30     ` Aneesh Kumar K V
2022-07-27  1:40       ` Huang, Ying
2022-07-27  4:35         ` Aneesh Kumar K.V
2022-07-28  6:51           ` Huang, Ying
2022-08-03  3:18         ` Aneesh Kumar K.V
2022-08-04  4:19           ` Huang, Ying
2022-07-20  2:59 ` [PATCH v10 6/8] mm/demotion: Add pg_data_t member to track node memory tier details Aneesh Kumar K.V
2022-07-26  8:02   ` Huang, Ying
2022-07-20  2:59 ` [PATCH v10 7/8] mm/demotion: Demote pages according to allocation fallback order Aneesh Kumar K.V
2022-07-26  8:24   ` Huang, Ying
2022-07-20  2:59 ` [PATCH v10 8/8] mm/demotion: Update node_is_toptier to work with memory tiers Aneesh Kumar K.V
2022-07-25  8:54   ` Huang, Ying
2022-07-25  8:56     ` Aneesh Kumar K V

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=adbf4fc8-80a6-3160-3338-ea4e8739cb64@linux.ibm.com \
    --to=aneesh.kumar@linux.ibm.com \
    --cc=Jonathan.Cameron@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=apopple@nvidia.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@intel.com \
    --cc=dave@stgolabs.net \
    --cc=hannes@cmpxchg.org \
    --cc=hesham.almatary@huawei.com \
    --cc=jvgediya.oss@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=shy828301@gmail.com \
    --cc=tim.c.chen@intel.com \
    --cc=weixugc@google.com \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.