linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Saravanan D <saravanand@fb.com>
To: <x86@kernel.org>, <dave.hansen@linux.intel.com>,
	<luto@kernel.org>, <peterz@infradead.org>
Cc: <linux-kernel@vger.kernel.org>, <kernel-team@fb.com>,
	Saravanan D <saravanand@fb.com>
Subject: [PATCH V2] x86/mm: Tracking linear mapping split events
Date: Wed, 27 Jan 2021 09:51:24 -0800	[thread overview]
Message-ID: <20210127175124.3289879-1-saravanand@fb.com> (raw)
In-Reply-To: <bd157a11-8e6b-5f44-4d91-d99adb9f8686@intel.com>

A large number of hugepage splits in the linear mapping gives admins
a signal that helps narrow down sluggishness caused by TLB
misses/reloads.

To help with debugging, we introduce monotonic lifetime hugepage
split event counts, accumulated since SYSTEM_RUNNING, to be displayed
as part of /proc/vmstat on x86 servers.

The lifetime split event information will be displayed at the bottom of
/proc/vmstat:
....
swap_ra 0
swap_ra_hit 0
direct_map_2M_splits 139
direct_map_4M_splits 0
direct_map_1G_splits 7
nr_unstable 0
....

Ancillary debugfs split event counts are exported to userspace via
read-write endpoints: /sys/kernel/debug/x86/direct_map_[2M|4M|1G]_split

dmesg log when a user resets a debugfs split event count for
debugging:
....
[  232.470531] debugfs 2M Pages split event count(128) reset to 0
....

One of the many lasting sources of huge page splits (lasting because
split pages are never coalesced back) is tracing: the granular page
attribute/permission changes force the kernel to split code segments
mapped with huge pages into smaller pages, thereby increasing the
probability of TLB misses/reloads even after tracing has been stopped.

Signed-off-by: Saravanan D <saravanand@fb.com>
---
 arch/x86/mm/pat/set_memory.c  | 117 ++++++++++++++++++++++++++++++++++
 include/linux/vm_event_item.h |   8 +++
 mm/vmstat.c                   |   8 +++
 3 files changed, 133 insertions(+)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..97b6ef8dbd12 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/libnvdimm.h>
+#include <linux/vmstat.h>
+#include <linux/kernel.h>
 
 #include <asm/e820/api.h>
 #include <asm/processor.h>
@@ -76,6 +78,104 @@ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
+static unsigned long split_page_event_count[PG_LEVEL_NUM];
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int direct_map_2M_split_set(void *data, u64 val)
+{
+	switch (val) {
+	case 0:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	pr_info("debugfs 2M Pages split event count(%lu) reset to 0",
+		  split_page_event_count[PG_LEVEL_2M]);
+	split_page_event_count[PG_LEVEL_2M] = 0;
+
+	return 0;
+}
+
+static int direct_map_2M_split_get(void *data, u64 *val)
+{
+	*val = split_page_event_count[PG_LEVEL_2M];
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_2M_split, direct_map_2M_split_get,
+			 direct_map_2M_split_set, "%llu\n");
+#else
+static int direct_map_4M_split_set(void *data, u64 val)
+{
+	switch (val) {
+	case 0:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	pr_info("debugfs 4M Pages split event count(%lu) reset to 0",
+		  split_page_event_count[PG_LEVEL_2M]);
+	split_page_event_count[PG_LEVEL_2M] = 0;
+
+	return 0;
+}
+
+static int direct_map_4M_split_get(void *data, u64 *val)
+{
+	*val = split_page_event_count[PG_LEVEL_2M];
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_4M_split, direct_map_4M_split_get,
+			 direct_map_4M_split_set, "%llu\n");
+#endif
+
+static int direct_map_1G_split_set(void *data, u64 val)
+{
+	switch (val) {
+	case 0:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	pr_info("debugfs 1G Pages split event count(%lu) reset to 0",
+		  split_page_event_count[PG_LEVEL_1G]);
+	split_page_event_count[PG_LEVEL_1G] = 0;
+
+	return 0;
+}
+
+static int direct_map_1G_split_get(void *data, u64 *val)
+{
+	*val = split_page_event_count[PG_LEVEL_1G];
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_1G_split, direct_map_1G_split_get,
+			 direct_map_1G_split_set, "%llu\n");
+
+static __init int direct_map_split_debugfs_init(void)
+{
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+	debugfs_create_file("direct_map_2M_split", 0600,
+			    arch_debugfs_dir, NULL,
+			    &fops_direct_map_2M_split);
+#else
+	debugfs_create_file("direct_map_4M_split", 0600,
+			    arch_debugfs_dir, NULL,
+			    &fops_direct_map_4M_split);
+#endif
+	if (direct_gbpages)
+		debugfs_create_file("direct_map_1G_split", 0600,
+				    arch_debugfs_dir, NULL,
+				    &fops_direct_map_1G_split);
+	return 0;
+}
+
+late_initcall(direct_map_split_debugfs_init);
 
 void update_page_count(int level, unsigned long pages)
 {
@@ -85,12 +185,29 @@ void update_page_count(int level, unsigned long pages)
 	spin_unlock(&pgd_lock);
 }
 
+void update_split_page_event_count(int level)
+{
+	if (system_state == SYSTEM_RUNNING) {
+		split_page_event_count[level]++;
+		if (level == PG_LEVEL_2M) {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+			count_vm_event(DIRECT_MAP_2M_SPLIT);
+#else
+			count_vm_event(DIRECT_MAP_4M_SPLIT);
+#endif
+		} else if (level == PG_LEVEL_1G) {
+			count_vm_event(DIRECT_MAP_1G_SPLIT);
+		}
+	}
+}
+
 static void split_page_count(int level)
 {
 	if (direct_pages_count[level] == 0)
 		return;
 
 	direct_pages_count[level]--;
+	update_split_page_event_count(level);
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 18e75974d4e3..439742d2435e 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -120,6 +120,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
 		SWAP_RA,
 		SWAP_RA_HIT,
+#endif
+#if defined(__x86_64__)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+		DIRECT_MAP_2M_SPLIT,
+#else
+		DIRECT_MAP_4M_SPLIT,
+#endif
+		DIRECT_MAP_1G_SPLIT,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..beaa2bb4f9dc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1350,6 +1350,14 @@ const char * const vmstat_text[] = {
 	"swap_ra",
 	"swap_ra_hit",
 #endif
+#if defined(__x86_64__)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+	"direct_map_2M_splits",
+#else
+	"direct_map_4M_splits",
+#endif
+	"direct_map_1G_splits",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
2.24.1


  parent reply	other threads:[~2021-01-27 17:53 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <BYAPR01MB40856478D5BE74CB6A7D5578CFBD9@BYAPR01MB4085.prod.exchangelabs.com>
2021-01-25 20:15 ` [PATCH] x86/mm: Tracking linear mapping split events since boot Dave Hansen
2021-01-25 20:32   ` Tejun Heo
2021-01-26  0:47     ` Dave Hansen
2021-01-26  0:53       ` Tejun Heo
2021-01-26  1:04         ` Dave Hansen
2021-01-26  1:17           ` Tejun Heo
2021-01-27 17:51           ` Saravanan D [this message]
2021-01-27 21:03             ` [PATCH V2] x86/mm: Tracking linear mapping split events Tejun Heo
2021-01-27 21:32               ` Dave Hansen
2021-01-27 21:36                 ` Tejun Heo
2021-01-27 21:42                   ` Saravanan D
2021-01-27 22:50                   ` [PATCH V3] " Saravanan D
2021-01-27 23:00                     ` Randy Dunlap
2021-01-27 23:56                       ` Saravanan D
2021-01-27 23:41                     ` Dave Hansen
2021-01-28  0:15                       ` Saravanan D
2021-01-28  4:35                       ` [PATCH V4] " Saravanan D
2021-01-28  4:51                         ` Matthew Wilcox
     [not found]                           ` <20210128104934.2916679-1-saravanand@fb.com>
2021-01-28 15:04                             ` [PATCH V5] " Matthew Wilcox
2021-01-28 19:49                               ` Saravanan D
2021-01-28 16:33                             ` Zi Yan
2021-01-28 16:41                               ` Dave Hansen
2021-01-28 16:56                                 ` Zi Yan
2021-01-28 16:59                               ` Song Liu
     [not found]                             ` <3aec2d10-f4c3-d07a-356f-6f1001679181@intel.com>
2021-01-28 21:20                               ` Saravanan D
     [not found]                                 ` <20210128233430.1460964-1-saravanand@fb.com>
2021-01-28 23:41                                   ` [PATCH V6] " Tejun Heo
2021-01-29 19:27                                   ` Johannes Weiner
2021-02-08 23:17                                     ` Saravanan D
2021-02-08 23:30                                   ` Dave Hansen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210127175124.3289879-1-saravanand@fb.com \
    --to=saravanand@fb.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=peterz@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).