From: Jiaqi Yan <jiaqiyan@google.com>
To: tony.luck@intel.com, naoya.horiguchi@nec.com
Cc: jiaqiyan@google.com, duenwen@google.com, rientjes@google.com,
	 linux-mm@kvack.org, shy828301@gmail.com,
	akpm@linux-foundation.org,  wangkefeng.wang@huawei.com
Subject: [PATCH v1 1/3] mm: memory-failure: Add memory failure stats to sysfs
Date: Mon, 16 Jan 2023 19:39:00 +0000	[thread overview]
Message-ID: <20230116193902.1315236-2-jiaqiyan@google.com> (raw)
In-Reply-To: <20230116193902.1315236-1-jiaqiyan@google.com>

Today the kernel provides the following memory error info to userspace,
but each has its own disadvantage:
* HardwareCorrupted in /proc/meminfo: the total number of bytes
  poisoned, with no per-NUMA-node breakdown
* ras:memory_failure_event: only available after being explicitly
  enabled
* /dev/mcelog: provides a lot of useful info about MCEs, but doesn't
  capture how memory_failure recovered from the memory MCEs
* kernel logs: userspace needs to parse the log text

Expose per-NUMA-node memory error stats as the following sysfs entries:

  /sys/devices/system/node/node${X}/memory_failure/pages_poisoned
  /sys/devices/system/node/node${X}/memory_failure/pages_recovered
  /sys/devices/system/node/node${X}/memory_failure/pages_ignored
  /sys/devices/system/node/node${X}/memory_failure/pages_failed
  /sys/devices/system/node/node${X}/memory_failure/pages_delayed

These counters describe how many raw pages are poisoned and, after the
kernel's attempted recoveries, what their resolutions are: how many were
recovered, ignored, failed, or delayed, respectively.

The following math holds for the statistics (an illustrative userspace
snippet follows below):
* pages_poisoned = pages_recovered + pages_ignored + pages_failed +
                   pages_delayed
* pages_poisoned (summed over all nodes) * PAGE_SIZE = HardwareCorrupted
  in /proc/meminfo
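
For illustration only (not part of this patch): a minimal userspace C
sketch that reads the new per-node counters and checks the first
invariant above. It assumes node0 exists and the entries listed above
are present; the helper name read_mf_counter is made up for this
example.

  #include <stdio.h>

  /* Read one memory_failure counter for the given NUMA node; -1 on error. */
  static long read_mf_counter(int node, const char *name)
  {
          char path[128];
          long val = -1;
          FILE *f;

          snprintf(path, sizeof(path),
                   "/sys/devices/system/node/node%d/memory_failure/%s",
                   node, name);
          f = fopen(path, "r");
          if (!f)
                  return -1;
          if (fscanf(f, "%ld", &val) != 1)
                  val = -1;
          fclose(f);
          return val;
  }

  int main(void)
  {
          /* Check: pages_poisoned == sum of the four resolution counters. */
          long poisoned  = read_mf_counter(0, "pages_poisoned");
          long recovered = read_mf_counter(0, "pages_recovered");
          long ignored   = read_mf_counter(0, "pages_ignored");
          long failed    = read_mf_counter(0, "pages_failed");
          long delayed   = read_mf_counter(0, "pages_delayed");

          printf("node0: poisoned=%ld vs %ld\n",
                 poisoned, recovered + ignored + failed + delayed);
          return 0;
  }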

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 drivers/base/node.c    |  3 +++
 include/linux/mm.h     |  5 +++++
 include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++
 mm/memory-failure.c    | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index faf3597a96da..b46db17124f3 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = {
 	&node_dev_group,
 #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
 	&arch_node_dev_group,
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+	&memory_failure_attr_group,
 #endif
 	NULL
 };
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..888576884eb9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3521,6 +3521,11 @@ enum mf_action_page_type {
 	MF_MSG_UNKNOWN,
 };
 
+/*
+ * Sysfs entries for memory failure handling statistics.
+ */
+extern const struct attribute_group memory_failure_attr_group;
+
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr_hint,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd28a100d9e4..0a14b35a96da 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1110,6 +1110,31 @@ struct deferred_split {
 };
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Per NUMA node memory failure handling statistics.
+ */
+struct memory_failure_stats {
+	/*
+	 * Number of pages poisoned.
+	 * Cases not accounted: memory outside kernel control, offline page,
+	 * arch-specific memory_failure (SGX), and hwpoison_filter()
+	 * filtered error events.
+	 */
+	unsigned long pages_poisoned;
+	/*
+	 * Recovery results of poisoned pages handled by memory_failure,
+	 * in sync with mf_result.
+	 * pages_poisoned = pages_ignored + pages_failed +
+	 *		    pages_delayed + pages_recovered
+	 */
+	unsigned long pages_ignored;
+	unsigned long pages_failed;
+	unsigned long pages_delayed;
+	unsigned long pages_recovered;
+};
+#endif
+
 /*
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout. On UMA machines there is a single pglist_data which
@@ -1253,6 +1278,9 @@ typedef struct pglist_data {
 #ifdef CONFIG_NUMA
 	struct memory_tier __rcu *memtier;
 #endif
+#ifdef CONFIG_MEMORY_FAILURE
+	struct memory_failure_stats mf_stats;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c77a9e37e27e..cb782fa552d5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 		memblk_nr_poison_sub(pfn, i);
 }
 
+/**
+ * MF_ATTR_RO - Create sysfs entry for each memory failure statistic.
+ * @_name: name of the file in the per NUMA sysfs directory.
+ */
+#define MF_ATTR_RO(_name)					\
+static ssize_t _name##_show(struct device *dev,		\
+			    struct device_attribute *attr,	\
+			    char *buf)				\
+{								\
+	struct memory_failure_stats *mf_stats =			\
+		&NODE_DATA(dev->id)->mf_stats;			\
+	return sprintf(buf, "%lu\n", mf_stats->_name);		\
+}								\
+static DEVICE_ATTR_RO(_name)
+
+MF_ATTR_RO(pages_poisoned);
+MF_ATTR_RO(pages_ignored);
+MF_ATTR_RO(pages_failed);
+MF_ATTR_RO(pages_delayed);
+MF_ATTR_RO(pages_recovered);
+
+static struct attribute *memory_failure_attr[] = {
+	&dev_attr_pages_poisoned.attr,
+	&dev_attr_pages_ignored.attr,
+	&dev_attr_pages_failed.attr,
+	&dev_attr_pages_delayed.attr,
+	&dev_attr_pages_recovered.attr,
+	NULL,
+};
+
+const struct attribute_group memory_failure_attr_group = {
+	.name = "memory_failure",
+	.attrs = memory_failure_attr,
+};
+
 /*
  * Return values:
  *   1:   the page is dissolved (if needed) and taken off from buddy,
-- 
2.39.0.314.g84b9a713c41-goog


