* [PATCH v3] mm: add thp_utilization metrics to debugfs
@ 2022-08-18  0:01 alexlzhu
  2022-08-20 22:07 ` Andrew Morton
  0 siblings, 1 reply; 4+ messages in thread
From: alexlzhu @ 2022-08-18  0:01 UTC (permalink / raw)
  To: linux-mm, kernel-team, linux-kernel, akpm; +Cc: Alexander Zhu

From: Alexander Zhu <alexlzhu@fb.com>

THPs have historically been enabled on a per-application basis, since
they can improve or degrade performance depending on how a particular
application uses physical memory. When THPs are heavily utilized,
application performance improves due to fewer TLB misses. It has long
been suspected that the performance regressions seen when THP is
enabled are caused by heavily underutilized anonymous THPs.

Previously there was no way to track how much of a THP is
actually being used. With this change, we seek to gain visibility
into the utilization of THPs in order to make more intelligent
decisions regarding paging.

This change introduces a tool that scans through all of physical
memory for anonymous THPs and groups them into buckets based
on utilization. It also includes an interface under
/sys/kernel/debug/thp_utilization.

The utilization of a THP is defined as the percentage of nonzero 4kB
pages it contains. A kernel worker thread periodically scans all of
physical memory for anonymous THPs, computes the utilization of each
one, groups them into buckets by utilization, and reports the results
through debugfs under /sys/kernel/debug/thp_utilization.
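
For example, with the bucketing arithmetic used in thp_util_scan()
below (assuming x86_64, where HPAGE_PMD_NR is 512 and
THP_UTIL_BUCKET_NR is 10):

	bucket = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR;
	bucket = min(bucket, THP_UTIL_BUCKET_NR - 1);

a THP with 384 nonzero 4kB subpages is 75% utilized and is counted in
bucket 384 * 10 / 512 = 7.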

Signed-off-by: Alexander Zhu <alexlzhu@fb.com>

Changes in v3:
-changed to use folio instead of page in thp_number_utilized_pages
method
-changed thp_number_utilized_pages to work on 32 bit architectures
with HIGHMEM.

Changes in v2:
-moved thp_utilization metrics from /proc to /sys/kernel/debug
-adjusted documentation to reflect that thp_utilization is no longer
in /proc
-changed is_anon_transparent_hugepage method to be static inline to
fix warning.
-removed unused *head pointer based on warning.
-adjusted to use compound_head(page) instead of page in
thp_number_utilized_pages. There are rare instances where they
are not equivalent.
-fixed header to be v2. Previous v1 had v3 in the header by mistake.
---
 Documentation/admin-guide/mm/transhuge.rst |   9 ++
 include/linux/huge_mm.h                    |   2 +
 mm/huge_memory.c                           | 176 +++++++++++++++++++++
 3 files changed, 187 insertions(+)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index c9c37f16eef8..d883ff9fddc7 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -297,6 +297,15 @@ To identify what applications are mapping file transparent huge pages, it
 is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
 for each mapping.
 
+The utilization of transparent hugepages can be viewed by reading
+``/sys/kernel/debug/thp_utilization``. The utilization of a THP is defined
+as the ratio of non-zero-filled 4kB pages to the total number of pages in a
+THP. The buckets are labelled by the range of utilized 4kB pages, with
+one line per utilization bucket. Each line contains the total number of
+THPs in that bucket and the total number of zero-filled 4kB pages summed
+over all THPs in that bucket. The last two lines show the timestamp and
+duration, respectively, of the most recent scan over all of physical memory.
+
 Note that reading the smaps file is expensive and reading it
 frequently will incur overhead.
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 768e5261fdae..c9086239deb7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -179,6 +179,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 
+int thp_number_utilized_pages(struct page *page);
+
 void prep_transhuge_page(struct page *page);
 void free_transhuge_page(struct page *page);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8a7c1b344abe..7f75cd3d553c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -45,6 +45,16 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/thp.h>
 
+/*
+ * The number of utilization buckets THPs will be grouped in
+ * under /sys/kernel/debug/thp_utilization.
+ */
+#define THP_UTIL_BUCKET_NR 10
+/*
+ * The number of addresses to scan through on each periodic
+ * run of the scanner that generates /sys/kernel/debug/thp_utilization.
+ */
+#define THP_UTIL_SCAN_SIZE 256
 /*
  * By default, transparent hugepage support is disabled in order to avoid
  * risking an increased memory footprint for applications that are not
@@ -70,6 +80,25 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
+static void thp_utilization_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(thp_utilization_work, thp_utilization_workfn);
+
+struct thp_scan_info_bucket {
+	int nr_thps;
+	int nr_zero_pages;
+};
+
+struct thp_scan_info {
+	struct thp_scan_info_bucket buckets[THP_UTIL_BUCKET_NR];
+	struct zone *scan_zone;
+	struct timespec64 last_scan_duration;
+	struct timespec64 last_scan_time;
+	unsigned long pfn;
+};
+
+static struct thp_scan_info thp_scan_debugfs;
+static struct thp_scan_info thp_scan;
+
 bool hugepage_vma_check(struct vm_area_struct *vma,
 			unsigned long vm_flags,
 			bool smaps, bool in_pf)
@@ -486,6 +515,7 @@ static int __init hugepage_init(void)
 	if (err)
 		goto err_slab;
 
+	schedule_delayed_work(&thp_utilization_work, HZ);
 	err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
 	if (err)
 		goto err_hzp_shrinker;
@@ -600,6 +630,11 @@ static inline bool is_transparent_hugepage(struct page *page)
 	       page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
 }
 
+static inline bool is_anon_transparent_hugepage(struct page *page)
+{
+	return PageAnon(page) && is_transparent_hugepage(page);
+}
+
 static unsigned long __thp_get_unmapped_area(struct file *filp,
 		unsigned long addr, unsigned long len,
 		loff_t off, unsigned long flags, unsigned long size)
@@ -650,6 +685,38 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
+int thp_number_utilized_pages(struct page *page)
+{
+	struct folio *folio;
+	unsigned long page_offset, value;
+	int thp_nr_utilized_pages = HPAGE_PMD_NR;
+	int step_size = sizeof(unsigned long);
+	bool is_all_zeroes;
+	void *kaddr;
+	int i;
+
+	if (!page || !is_anon_transparent_hugepage(page))
+		return -1;
+
+	folio = page_folio(page);
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		kaddr = kmap_local_folio(folio, i);
+		is_all_zeroes = true;
+		for (page_offset = 0; page_offset < PAGE_SIZE; page_offset += step_size) {
+			value = *(unsigned long *)(kaddr + page_offset);
+			if (value != 0) {
+				is_all_zeroes = false;
+				break;
+			}
+		}
+		if (is_all_zeroes)
+			thp_nr_utilized_pages--;
+
+		kunmap_local(kaddr);
+	}
+	return thp_nr_utilized_pages;
+}
+
 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 			struct page *page, gfp_t gfp)
 {
@@ -3135,6 +3202,42 @@ static int __init split_huge_pages_debugfs(void)
 	return 0;
 }
 late_initcall(split_huge_pages_debugfs);
+
+static int thp_utilization_show(struct seq_file *seqf, void *pos)
+{
+	int i;
+	int start;
+	int end;
+
+	for (i = 0; i < THP_UTIL_BUCKET_NR; i++) {
+		start = i * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR;
+		end = (i + 1 == THP_UTIL_BUCKET_NR)
+			   ? HPAGE_PMD_NR
+			   : ((i + 1) * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR - 1);
+		/* The last bucket must include fully utilized (100%) THPs */
+		seq_printf(seqf, "Utilized[%d-%d]: %d %d\n", start, end,
+			   thp_scan_debugfs.buckets[i].nr_thps,
+			   thp_scan_debugfs.buckets[i].nr_zero_pages);
+	}
+	seq_printf(seqf, "Last Scan Time: %lu.%02lu\n",
+		   (unsigned long)thp_scan_debugfs.last_scan_time.tv_sec,
+		   (thp_scan_debugfs.last_scan_time.tv_nsec / (NSEC_PER_SEC / 100)));
+
+	seq_printf(seqf, "Last Scan Duration: %lu.%02lu\n",
+		   (unsigned long)thp_scan_debugfs.last_scan_duration.tv_sec,
+		   (thp_scan_debugfs.last_scan_duration.tv_nsec / (NSEC_PER_SEC / 100)));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(thp_utilization);
+
+static int __init thp_utilization_debugfs(void)
+{
+	debugfs_create_file("thp_utilization", 0400, NULL, NULL,
+			    &thp_utilization_fops);
+	return 0;
+}
+late_initcall(thp_utilization_debugfs);
 #endif
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -3220,3 +3323,76 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	trace_remove_migration_pmd(address, pmd_val(pmde));
 }
 #endif
+
+static void thp_scan_next_zone(void)
+{
+	struct timespec64 current_time;
+	int i;
+	bool update_debugfs;
+
+	thp_scan.scan_zone = next_zone(thp_scan.scan_zone);
+	update_debugfs = !thp_scan.scan_zone;
+	thp_scan.scan_zone = update_debugfs ? (first_online_pgdat())->node_zones
+			: thp_scan.scan_zone;
+	thp_scan.pfn = (thp_scan.scan_zone->zone_start_pfn + HPAGE_PMD_NR - 1)
+			& ~(HPAGE_PMD_NR - 1);
+	if (!update_debugfs)
+		return;
+
+	ktime_get_ts64(&current_time);
+	thp_scan_debugfs.last_scan_duration = timespec64_sub(current_time,
+							     thp_scan_debugfs.last_scan_time);
+	thp_scan_debugfs.last_scan_time = current_time;
+
+	for (i = 0; i < THP_UTIL_BUCKET_NR; i++) {
+		thp_scan_debugfs.buckets[i].nr_thps = thp_scan.buckets[i].nr_thps;
+		thp_scan_debugfs.buckets[i].nr_zero_pages = thp_scan.buckets[i].nr_zero_pages;
+		thp_scan.buckets[i].nr_thps = 0;
+		thp_scan.buckets[i].nr_zero_pages = 0;
+	}
+}
+
+static void thp_util_scan(unsigned long pfn_end)
+{
+	struct page *page = NULL;
+	unsigned long current_pfn;
+	int bucket, num_utilized_pages, i;
+
+	for (i = 0; i < THP_UTIL_SCAN_SIZE; i++) {
+		current_pfn = thp_scan.pfn;
+		thp_scan.pfn += HPAGE_PMD_NR;
+		if (current_pfn >= pfn_end)
+			return;
+
+		if (!pfn_valid(current_pfn))
+			continue;
+
+		page = pfn_to_page(current_pfn);
+		num_utilized_pages = thp_number_utilized_pages(page);
+		if (num_utilized_pages < 0)
+			continue;
+
+		bucket = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR;
+		bucket = min(bucket, THP_UTIL_BUCKET_NR - 1);
+		thp_scan.buckets[bucket].nr_thps++;
+		thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages);
+	}
+}
+
+static void thp_utilization_workfn(struct work_struct *work)
+{
+	unsigned long pfn_end;
+
+	if (!thp_scan.scan_zone)
+		thp_scan.scan_zone = (first_online_pgdat())->node_zones;
+
+	pfn_end = (thp_scan.scan_zone->zone_start_pfn +
+			thp_scan.scan_zone->spanned_pages + HPAGE_PMD_NR - 1)
+			& ~(HPAGE_PMD_NR - 1);
+	if (!populated_zone(thp_scan.scan_zone) || thp_scan.pfn >= pfn_end)
+		thp_scan_next_zone();
+	else
+		thp_util_scan(pfn_end);
+
+	schedule_delayed_work(&thp_utilization_work, HZ);
+}
-- 
2.30.2


* Re: [PATCH v3] mm: add thp_utilization metrics to debugfs
  2022-08-18  0:01 [PATCH v3] mm: add thp_utilization metrics to debugfs alexlzhu
@ 2022-08-20 22:07 ` Andrew Morton
  2022-08-22 17:24   ` Alex Zhu (Kernel)
  0 siblings, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2022-08-20 22:07 UTC (permalink / raw)
  To: alexlzhu; +Cc: linux-mm, kernel-team, linux-kernel

On Wed, 17 Aug 2022 17:01:12 -0700 <alexlzhu@fb.com> wrote:

> THPs have historically been enabled on a per-application basis, since
> they can improve or degrade performance depending on how a particular
> application uses physical memory. When THPs are heavily utilized,
> application performance improves due to fewer TLB misses. It has long
> been suspected that the performance regressions seen when THP is
> enabled are caused by heavily underutilized anonymous THPs.
> 
> Previously there was no way to track how much of a THP is
> actually being used. With this change, we seek to gain visibility
> into the utilization of THPs in order to make more intelligent
> decisions regarding paging.
> 
> This change introduces a tool that scans through all of physical
> memory for anonymous THPs and groups them into buckets based
> on utilization. It also includes an interface under
> /sys/kernel/debug/thp_utilization.
> 
> The utilization of a THP is defined as the percentage of nonzero 4kB
> pages it contains. A kernel worker thread periodically scans all of
> physical memory for anonymous THPs, computes the utilization of each
> one, groups them into buckets by utilization, and reports the results

I'd like to see sample debugfs output right here in the changelog, for
reviewers to review.  In some detail.

And I'd like to see the code commented!  Especially
thp_utilization_workfn(), thp_util_scan() and thp_scan_next_zone(). 
What are their roles and responsibilities?  How long do they take, by
what means do they scan?

I mean, scanning all of physical memory is a huge task.  How do we
avoid chewing vast amounts of CPU?  What is the chosen approach and
what are the tradeoffs?  Why is it done within a kernel thread at all,
rather than putting the load into the context of the reader of the
stats (which is more appropriate).  etcetera.  There are many traps,
tradeoffs and hidden design decisions here.  Please unhide them.

This comment, which is rather a core part of these tradeoffs:

+/*
+ * The number of addresses to scan through on each periodic
+ * run of the scanner that generates /sys/kernel/debug/thp_utilization.
+ */
+#define THP_UTIL_SCAN_SIZE 256

isn't very helpful.  "number of addresses"?  Does it mean we scan 256
bytes at a time?  256 pages?  256 hugepages?  Something else?

How can any constant make sense when different architectures have
different [huge]page sizes?  Should it be scaled by pagesize?  And if
we're going to do that, we should scale it by CPU speed at the same time.

Or bypass all of that and simply scan for a certain amount of *time*,
rather than scan a certain amount of memory.  After all, chunking up
the scan time is what we're trying to achieve by chunking up the scan
amount.  Why not chunk up the scan time directly?

See where I'm going?  I see many hidden assumptions, design decisions
and tradeoffs here.  Can we please attempt to spell them out and review
them.

Anyway.  There were many review comments on previous versions.  It
would have been better had those reviewers been cc'ed on this version. 
I'll go into hiding and see what people think.


* Re: [PATCH v3] mm: add thp_utilization metrics to debugfs
  2022-08-20 22:07 ` Andrew Morton
@ 2022-08-22 17:24   ` Alex Zhu (Kernel)
  0 siblings, 0 replies; 4+ messages in thread
From: Alex Zhu (Kernel) @ 2022-08-22 17:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, Kernel Team, linux-kernel



> On Aug 20, 2022, at 3:07 PM, Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> On Wed, 17 Aug 2022 17:01:12 -0700 <alexlzhu@fb.com> wrote:
> 
>> THPs have historically been enabled on a per-application basis, since
>> they can improve or degrade performance depending on how a particular
>> application uses physical memory. When THPs are heavily utilized,
>> application performance improves due to fewer TLB misses. It has long
>> been suspected that the performance regressions seen when THP is
>> enabled are caused by heavily underutilized anonymous THPs.
>> 
>> Previously there was no way to track how much of a THP is
>> actually being used. With this change, we seek to gain visibility
>> into the utilization of THPs in order to make more intelligent
>> decisions regarding paging.
>> 
>> This change introduces a tool that scans through all of physical
>> memory for anonymous THPs and groups them into buckets based
>> on utilization. It also includes an interface under
>> /sys/kernel/debug/thp_utilization.
>> 
>> The utilization of a THP is defined as the percentage of nonzero 4kB
>> pages it contains. A kernel worker thread periodically scans all of
>> physical memory for anonymous THPs, computes the utilization of each
>> one, groups them into buckets by utilization, and reports the results
> 
> I'd like to see sample debugfs output right here in the changelog, for
> reviewers to review.  In some detail.

I should have included that in the description, sorry about that. Here is a sample of the output:

Utilized[0-50]: 1331 680884
Utilized[51-101]: 9 3983
Utilized[102-152]: 3 1187
Utilized[153-203]: 0 0
Utilized[204-255]: 2 539
Utilized[256-306]: 5 1135
Utilized[307-357]: 1 192
Utilized[358-408]: 0 0
Utilized[409-459]: 1 57
Utilized[460-512]: 400 13
Last Scan Time: 223.98
Last Scan Duration: 70.65

This indicates that there are 1331 THPs that have between 0 and 50
utilized (nonzero) 4kB pages, and that in total there are 680884 zero
pages in this utilization bucket. THPs in the [0-50] bucket compose 76%
of total THPs and are responsible for 99% of total zero pages across
all THPs. In other words, the least utilized THPs are responsible for
almost all of the memory waste when THP is always enabled. Similar
results have been observed across production workloads.
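
(Working from the numbers above: the buckets contain 1331 + 9 + 3 + 0 +
2 + 5 + 1 + 0 + 1 + 400 = 1752 THPs in total, so 1331/1752 is roughly
76%; the zero-page counts sum to 687990, of which 680884, roughly 99%,
come from the [0-50] bucket.)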

The last two lines indicate the timestamp and duration of the most recent
scan through all of physical memory. Here we see that the last scan took
70.65 seconds. 
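
(On a live system this output is obtained by reading the debugfs file
directly, e.g. "cat /sys/kernel/debug/thp_utilization" as root,
assuming debugfs is mounted at the usual /sys/kernel/debug.)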
> 
> And I'd like to see the code commented!  Especially
> thp_utilization_workfn(), thp_util_scan() and thp_scan_next_zone(). 
> What are their roles and responsibilities?  How long do they take, by
> what means do they scan?
> 
> I mean, scanning all of physical memory is a huge task.  How do we
> avoid chewing vast amounts of CPU?  What is the chosen approach and
> what are the tradeoffs?  Why is it done within a kernel thread at all,
> rather than putting the load into the context of the reader of the
> stats (which is more appropriate).  etcetera.  There are many traps,
> tradeoffs and hidden design decisions here.  Please unhide them.
> 
> This comment, which is rather a core part of these tradeoffs:
> 

> +/*
> + * The number of addresses to scan through on each periodic
> + * run of the scanner that generates /sys/kernel/debug/thp_utilization.
> + */
> +#define THP_UTIL_SCAN_SIZE 256
> 
> isn't very helpful.  "number of addresses"?  Does it mean we scan 256
> bytes at a time?  256 pages?  256 hugepages?  Something else?

256 hugepages. We scan through physical memory one 2MB-aligned huge page
candidate at a time. So far we have observed that scanning 256 such
candidates per second from a kernel worker thread does not produce any
noticeable side effects on x86_64.
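
As a rough estimate (assuming the 2MB THP size on x86_64 and the
one-second rescheduling interval used in this patch), each run covers
256 * 2MB = 512MB of physical address space, so a full pass over N GB
of spanned memory takes on the order of 2*N seconds; the 70.65 second
scan duration in the sample output corresponds to roughly 35GB.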
> 
> How can any constant make sense when different architectures have
> different [huge]page sizes?  Should it be scaled by pagesize?  And if
> we're going to do that, we should scale it by CPU speed at the same time.
> 
This sounds very interesting. I would be happy to hear any suggestions
for how we can scale this value more systematically. We initially chose
a value small enough that it produces no noticeable side effects, with
the intention of tuning it later.

> Or bypass all of that and simply scan for a certain amount of *time*,
> rather than scan a certain amount of memory.  After all, chunking up
> the scan time is what we're trying to achieve by chunking up the scan
> amount.  Why not chunk up the scan time directly?
> 
> See where I'm going?  I see many hidden assumptions, design decisions
> and tradeoffs here.  Can we please attempt to spell them out and review
> them.

I will send out an RFC patchset including this one, split_huge_page, and
the shrinker later this week. I will add more description there and
incorporate any suggestions. A rough sketch of what the time-budget
approach might look like is below. Thanks!
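
One possible shape of the time-budget idea, purely an illustrative
sketch for discussion and not part of this patch (scan_one_pfn() is a
hypothetical helper standing in for the body of the loop in
thp_util_scan(), and pfn_end is the bound computed in
thp_utilization_workfn()):

	/* Bound each work item by a time budget instead of a PFN count. */
	u64 budget_end = ktime_get_ns() + NSEC_PER_MSEC;	/* e.g. 1ms */

	while (ktime_get_ns() < budget_end && thp_scan.pfn < pfn_end) {
		unsigned long pfn = thp_scan.pfn;

		thp_scan.pfn += HPAGE_PMD_NR;
		if (pfn_valid(pfn))
			scan_one_pfn(pfn);	/* hypothetical helper */
	}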
> 
> Anyway.  There were many review comments on previous versions.  It
> would have been better had those reviewers been cc'ed on this version. 
> I'll go into hiding and see what people think.



