All of lore.kernel.org
 help / color / mirror / Atom feed
From: Barry Song <21cnbao@gmail.com>
To: akpm@linux-foundation.org, linux-mm@kvack.org
Cc: willy@infradead.org, david@redhat.com, ryan.roberts@arm.com,
	cerasuolodomenico@gmail.com, kasong@tencent.com,
	surenb@google.com, v-songbaohua@oppo.com, yosryahmed@google.com,
	yuzhao@google.com, chrisl@kernel.org, peterx@redhat.com
Subject: [PATCH v2] mm: add per-order mTHP alloc_success and alloc_fail counters
Date: Thu, 28 Mar 2024 22:51:39 +1300	[thread overview]
Message-ID: <20240328095139.143374-1-21cnbao@gmail.com> (raw)

From: Barry Song <v-songbaohua@oppo.com>

Profiling a system blindly with mTHP has become challenging due
to the lack of visibility into its operations. Presenting the
success rate of mTHP allocations appears to be a pressing need.

Recently, I've been experiencing significant difficulty debugging
performance improvements and regressions without these figures.
It's crucial for us to understand the true effectiveness of
mTHP in real-world scenarios, especially in systems with
fragmented memory.

This patch sets up the framework for per-order mTHP counters,
starting with the introduction of alloc_success and alloc_fail
counters.  Incorporating additional counters should now be
straightforward as well.

The initial two unsigned longs for each event are unused, given
that order-0 and order-1 are not mTHP. Keeping them nonetheless
allows the array to be indexed directly by order, which improves
code clarity.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 -v2:
 * move to sysfs and provide per-order counters; David, Ryan, Willy
 -v1:
 https://lore.kernel.org/linux-mm/20240326030103.50678-1-21cnbao@gmail.com/

 include/linux/huge_mm.h | 17 +++++++++++++
 mm/huge_memory.c        | 54 +++++++++++++++++++++++++++++++++++++++++
 mm/memory.c             |  3 +++
 3 files changed, 74 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e896ca4760f6..27fa26a22a8f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -264,6 +264,30 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 					  enforce_sysfs, orders);
 }
 
+/* mTHP per-order event types; extend this enum when adding new counters. */
+enum thp_event_item {
+	THP_ALLOC_SUCCESS,
+	THP_ALLOC_FAIL,
+	NR_THP_EVENT_ITEMS
+};
+
+/*
+ * Per-CPU counters indexed [order][event].  The order-0 and order-1
+ * rows are never used (those orders are not mTHP) but keeping them
+ * allows direct indexing by order.
+ */
+struct thp_event_state {
+	unsigned long event[PMD_ORDER + 1][NR_THP_EVENT_ITEMS];
+};
+
+DECLARE_PER_CPU(struct thp_event_state, thp_event_states);
+
+/* Account one @item event for an attempted allocation of @order. */
+static inline void count_thp_event(int order, enum thp_event_item item)
+{
+	this_cpu_inc(thp_event_states.event[order][item]);
+}
+
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1683de78c313..addd093d8410 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,6 +526,57 @@ static const struct kobj_type thpsize_ktype = {
 	.sysfs_ops = &kobj_sysfs_ops,
 };
 
+/* Per-CPU mTHP event counters; summed over CPUs on each sysfs read. */
+DEFINE_PER_CPU(struct thp_event_state, thp_event_states) = {{{0}}};
+
+/*
+ * Sum @item for @order across all possible CPUs so that events
+ * recorded on a CPU which is currently offline are not lost.
+ */
+static unsigned long sum_thp_events(int order, enum thp_event_item item)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct thp_event_state *this = &per_cpu(thp_event_states, cpu);
+
+		sum += this->event[order][item];
+	}
+
+	return sum;
+}
+
+static ssize_t alloc_success_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%lu\n", sum_thp_events(order, THP_ALLOC_SUCCESS));
+}
+
+static ssize_t alloc_fail_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%lu\n", sum_thp_events(order, THP_ALLOC_FAIL));
+}
+
+static struct kobj_attribute alloc_success_attr = __ATTR_RO(alloc_success);
+static struct kobj_attribute alloc_fail_attr = __ATTR_RO(alloc_fail);
+
+static struct attribute *stats_attrs[] = {
+	&alloc_success_attr.attr,
+	&alloc_fail_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group stats_attr_group = {
+	.name = "stats",
+	.attrs = stats_attrs,
+};
+
 static struct thpsize *thpsize_create(int order, struct kobject *parent)
 {
 	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
@@ -549,6 +595,13 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
 		return ERR_PTR(ret);
 	}
 
+	/* "stats" subdirectory exposing the per-order event counters */
+	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
+	if (ret) {
+		kobject_put(&thpsize->kobj);
+		return ERR_PTR(ret);
+	}
+
 	thpsize->order = order;
 	return thpsize;
 }
@@ -1050,8 +1102,11 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
 	if (unlikely(!folio)) {
 		count_vm_event(THP_FAULT_FALLBACK);
+		count_thp_event(HPAGE_PMD_ORDER, THP_ALLOC_FAIL);
 		return VM_FAULT_FALLBACK;
 	}
+	/* The PMD-order folio allocation itself succeeded. */
+	count_thp_event(HPAGE_PMD_ORDER, THP_ALLOC_SUCCESS);
 	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
 
 
diff --git a/mm/memory.c b/mm/memory.c
index 984b138f85b4..c9c1031c2ecb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4365,7 +4365,11 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 			}
 			folio_throttle_swaprate(folio, gfp);
 			clear_huge_page(&folio->page, vmf->address, 1 << order);
+			count_thp_event(order, THP_ALLOC_SUCCESS);
 			return folio;
+		} else {
+			/* This order failed; the loop falls back to a lower order. */
+			count_thp_event(order, THP_ALLOC_FAIL);
 		}
 next:
 		order = next_order(&orders, order);
-- 
2.34.1



             reply	other threads:[~2024-03-28  9:51 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-28  9:51 Barry Song [this message]
2024-04-01 14:46 ` [PATCH v2] mm: add per-order mTHP alloc_success and alloc_fail counters Yu Zhao
2024-04-01 20:40   ` Barry Song
2024-04-02  8:58 ` Ryan Roberts
2024-04-02  9:40   ` Barry Song
2024-04-02 10:44     ` Ryan Roberts
2024-04-02 22:14       ` Barry Song
2024-04-03  8:03         ` Ryan Roberts
2024-04-02 18:46 ` David Hildenbrand
2024-04-02 21:29   ` Barry Song
2024-04-03  8:13     ` David Hildenbrand
2024-04-03 20:47       ` Barry Song
2024-04-03  8:18     ` Ryan Roberts
2024-04-03  8:24       ` David Hildenbrand
2024-04-03  8:46         ` Ryan Roberts
2024-04-03  9:05           ` David Hildenbrand
2024-04-03 21:06         ` Barry Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240328095139.143374-1-21cnbao@gmail.com \
    --to=21cnbao@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=cerasuolodomenico@gmail.com \
    --cc=chrisl@kernel.org \
    --cc=david@redhat.com \
    --cc=kasong@tencent.com \
    --cc=linux-mm@kvack.org \
    --cc=peterx@redhat.com \
    --cc=ryan.roberts@arm.com \
    --cc=surenb@google.com \
    --cc=v-songbaohua@oppo.com \
    --cc=willy@infradead.org \
    --cc=yosryahmed@google.com \
    --cc=yuzhao@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.