* [PATCH v2 1/8] rasdaemon: add support for memory_failure events
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
2021-01-25 17:14 ` [PATCH v2 2/8] rasdaemon: ras-mc-ctl: Modify ARM processor error summary log Shiju Jose
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
Add support to log the memory_failure kernel trace
events.
Example rasdaemon log and SQLite DB output for the
memory_failure event,
=================================================
rasdaemon: memory_failure_event store: 0x126ce8f8
rasdaemon: register inserted at db
<...>-785 [000] 0.000024: memory_failure_event: 2020-10-02 13:27:13 -0400 pfn=0x204000000 page_type=free buddy page action_result=Delayed
CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp TEXT, pfn TEXT, page_type TEXT, action_result TEXT);
INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13 -0400','0x204000000','free buddy page','Delayed');
==================================================
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
.travis.yml | 2 +-
Makefile.am | 7 +-
configure.ac | 11 +++
ras-events.c | 15 +++
ras-events.h | 1 +
ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++
ras-memory-failure-handler.h | 25 +++++
ras-record.c | 70 ++++++++++++++
ras-record.h | 13 +++
ras-report.c | 68 +++++++++++++
ras-report.h | 2 +
11 files changed, 390 insertions(+), 3 deletions(-)
create mode 100644 ras-memory-failure-handler.c
create mode 100644 ras-memory-failure-handler.h
diff --git a/.travis.yml b/.travis.yml
index 41d716d..7855b8e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,7 +26,7 @@ before_install:
- sudo apt-get install -y sqlite3
install:
- autoreconf -vfi
-- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa
+- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-memory-failure --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa
script:
- make && sudo make install
diff --git a/Makefile.am b/Makefile.am
index de01098..7c1c027 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -48,6 +48,9 @@ endif
if WITH_DISKERROR
rasdaemon_SOURCES += ras-diskerror-handler.c
endif
+if WITH_MEMORY_FAILURE
+ rasdaemon_SOURCES += ras-memory-failure-handler.c
+endif
if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
@@ -62,8 +65,8 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
- ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
- non-standard-hisilicon.h
+ ras-devlink-handler.h ras-diskerror-handler.h ras-memory-failure-handler.h \
+ rbtree.h ras-page-isolation.h non-standard-hisilicon.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index e276c84..a6251d4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -111,6 +111,16 @@ AS_IF([test "x$enable_diskerror" = "xyes" || test "x$enable_all" == "xyes"], [
AM_CONDITIONAL([WITH_DISKERROR], [test x$enable_diskerror = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_DISKERROR], [USE_DISKERROR="yes"], [USE_DISKERROR="no"])
+AC_ARG_ENABLE([memory_failure],
+ AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)]))
+
+AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [
+ AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect")
+ AC_SUBST([WITH_MEMORY_FAILURE])
+])
+AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all == xyes])
+AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"])
+
AC_ARG_ENABLE([abrt_report],
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
@@ -178,5 +188,6 @@ compile time options summary
ARM events : $USE_ARM
DEVLINK : $USE_DEVLINK
Disk I/O errors : $USE_DISKERROR
+ Memory Failure : $USE_MEMORY_FAILURE
Memory CE PFA : $USE_MEMORY_CE_PFA
EOF
diff --git a/ras-events.c b/ras-events.c
index c797b20..1e275cc 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -37,6 +37,7 @@
#include "ras-extlog-handler.h"
#include "ras-devlink-handler.h"
#include "ras-diskerror-handler.h"
+#include "ras-memory-failure-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
@@ -231,6 +232,10 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable);
+#endif
+
free_ras:
free(ras);
return rc;
@@ -908,6 +913,16 @@ int handle_ras_events(int record_events)
}
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event",
+ ras_memory_failure_event_handler, NULL, MF_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "memory_failure_event");
+#endif
+
if (!num_events) {
log(ALL, LOG_INFO,
"Failed to trace all supported RAS events. Aborting.\n");
diff --git a/ras-events.h b/ras-events.h
index f028741..dfd690c 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -38,6 +38,7 @@ enum {
EXTLOG_EVENT,
DEVLINK_EVENT,
DISKERROR_EVENT,
+ MF_EVENT,
NR_EVENTS
};
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
new file mode 100644
index 0000000..9941e68
--- /dev/null
+++ b/ras-memory-failure-handler.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libtrace/kbuffer.h"
+#include "ras-memory-failure-handler.h"
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+
+/*
+ * Memory failure - various types of pages.
+ *
+ * The numeric value of each enumerator is what get_page_type() compares
+ * with the "type" field read from the memory_failure_event trace record,
+ * so the order of the entries below determines how events are decoded.
+ * NOTE(review): presumably mirrors the kernel's enum of the same name --
+ * confirm against the kernel headers before reordering or extending.
+ */
+enum mf_action_page_type {
+ MF_MSG_KERNEL,
+ MF_MSG_KERNEL_HIGH_ORDER,
+ MF_MSG_SLAB,
+ MF_MSG_DIFFERENT_COMPOUND,
+ MF_MSG_POISONED_HUGE,
+ MF_MSG_HUGE,
+ MF_MSG_FREE_HUGE,
+ MF_MSG_NON_PMD_HUGE,
+ MF_MSG_UNMAP_FAILED,
+ MF_MSG_DIRTY_SWAPCACHE,
+ MF_MSG_CLEAN_SWAPCACHE,
+ MF_MSG_DIRTY_MLOCKED_LRU,
+ MF_MSG_CLEAN_MLOCKED_LRU,
+ MF_MSG_DIRTY_UNEVICTABLE_LRU,
+ MF_MSG_CLEAN_UNEVICTABLE_LRU,
+ MF_MSG_DIRTY_LRU,
+ MF_MSG_CLEAN_LRU,
+ MF_MSG_TRUNCATED_LRU,
+ MF_MSG_BUDDY,
+ MF_MSG_BUDDY_2ND,
+ MF_MSG_DAX,
+ MF_MSG_UNSPLIT_THP,
+ MF_MSG_UNKNOWN,
+};
+
+/*
+ * Action results for various types of pages.
+ *
+ * The numeric value of each enumerator is what get_action_result()
+ * compares with the "result" field read from the trace record.
+ * NOTE(review): presumably mirrors the kernel's enum of the same name --
+ * confirm against the kernel headers before reordering or extending.
+ */
+enum mf_action_result {
+ MF_IGNORED, /* Error: cannot be handled */
+ MF_FAILED, /* Error: handling failed */
+ MF_DELAYED, /* Will be handled later */
+ MF_RECOVERED, /* Successfully recovered */
+};
+
+/*
+ * Memory failure page types: maps each enum mf_action_page_type value to
+ * the description string that is printed in the trace output and stored
+ * in the "page_type" column. Searched linearly by get_page_type().
+ */
+static const struct {
+ int type;
+ const char *page_type;
+} mf_page_type[] = {
+ { MF_MSG_KERNEL, "reserved kernel page" },
+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"},
+ { MF_MSG_SLAB, "kernel slab page"},
+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"},
+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"},
+ { MF_MSG_HUGE, "huge page"},
+ { MF_MSG_FREE_HUGE, "free huge page"},
+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"},
+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"},
+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"},
+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"},
+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"},
+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"},
+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"},
+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page"},
+ { MF_MSG_DIRTY_LRU, "dirty LRU page"},
+ { MF_MSG_CLEAN_LRU, "clean LRU page"},
+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"},
+ { MF_MSG_BUDDY, "free buddy page"},
+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"},
+ { MF_MSG_DAX, "dax page"},
+ { MF_MSG_UNSPLIT_THP, "unsplit thp"},
+ { MF_MSG_UNKNOWN, "unknown page"},
+};
+
+/*
+ * Memory failure action results: maps each enum mf_action_result value to
+ * the string that is printed in the trace output and stored in the
+ * "action_result" column. Searched linearly by get_action_result().
+ */
+static const struct {
+ int result;
+ const char *action_result;
+} mf_action_result[] = {
+ { MF_IGNORED, "Ignored" },
+ { MF_FAILED, "Failed" },
+ { MF_DELAYED, "Delayed" },
+ { MF_RECOVERED, "Recovered" },
+};
+
+/*
+ * Translate a numeric page type into its description string.
+ *
+ * Returns the string of the first mf_page_type[] entry whose type equals
+ * @page_type, or "unknown page" when no entry matches.
+ */
+static const char *get_page_type(int page_type)
+{
+	const char *desc = "unknown page";
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(mf_page_type)) {
+		if (mf_page_type[idx].type == page_type) {
+			desc = mf_page_type[idx].page_type;
+			break;
+		}
+		idx++;
+	}
+
+	return desc;
+}
+
+/*
+ * Translate a numeric action result into its name.
+ *
+ * Returns the string of the first mf_action_result[] entry whose result
+ * equals @result, or "unknown" when no entry matches.
+ */
+static const char *get_action_result(int result)
+{
+	int n;
+
+	for (n = 0; n < ARRAY_SIZE(mf_action_result); n++) {
+		if (mf_action_result[n].result != result)
+			continue;
+		return mf_action_result[n].action_result;
+	}
+
+	return "unknown";
+}
+
+
+/*
+ * Handle one ras:memory_failure_event trace record.
+ *
+ * Formats the event (timestamp, pfn, page type, action result) into the
+ * trace sequence, then hands it to the SQLite store and to the ABRT
+ * reporter when those features are compiled in. The return values of the
+ * store and report calls are not checked.
+ *
+ * @s:       trace sequence that receives the human-readable event text
+ * @record:  raw trace record; its ts field feeds the timestamp when the
+ *           uptime clock is in use
+ * @event:   event format used to look up the record's fields
+ * @context: the struct ras_events of the running daemon
+ *
+ * Returns 0 on success, or -1 if the "pfn", "type" or "result" field
+ * cannot be read from the record.
+ */
+int ras_memory_failure_event_handler(struct trace_seq *s,
+				     struct pevent_record *record,
+				     struct event_format *event, void *context)
+{
+	unsigned long long val;
+	struct ras_events *ras = context;
+	time_t now;
+	struct tm *tm;
+	struct ras_mf_event ev;
+
+	/*
+	 * Newer kernels (3.10-rc1 or later) provide an uptime clock.
+	 * On previous kernels, the way to properly generate an event would
+	 * be to inject a fake one, measure its timestamp and diff it against
+	 * gettimeofday. We won't do it here. Instead, let's use uptime,
+	 * falling back to the event report's time if the "uptime" clock is
+	 * not available (legacy kernels).
+	 */
+
+	if (ras->use_uptime)
+		now = record->ts/user_hz + ras->uptime_diff;
+	else
+		now = time(NULL);
+
+	tm = localtime(&now);
+	if (tm) {
+		strftime(ev.timestamp, sizeof(ev.timestamp),
+			 "%Y-%m-%d %H:%M:%S %z", tm);
+	} else {
+		/*
+		 * localtime() can fail; never leave the buffer uninitialized,
+		 * because it is printed, stored and reported below.
+		 */
+		snprintf(ev.timestamp, sizeof(ev.timestamp), "%s",
+			 "1970-01-01 00:00:00 +0000");
+	}
+	trace_seq_printf(s, "%s ", ev.timestamp);
+
+	if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)
+		return -1;
+	/* Bounded formatting keeps the write inside ev.pfn. */
+	snprintf(ev.pfn, sizeof(ev.pfn), "0x%llx", val);
+	trace_seq_printf(s, "pfn=0x%llx ", val);
+
+	if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0)
+		return -1;
+	ev.page_type = get_page_type(val);
+	trace_seq_printf(s, "page_type=%s ", ev.page_type);
+
+	if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0)
+		return -1;
+	ev.action_result = get_action_result(val);
+	trace_seq_printf(s, "action_result=%s ", ev.action_result);
+
+	/* Store data into the SQLite DB */
+#ifdef HAVE_SQLITE3
+	ras_store_mf_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_mf_event(ras, &ev);
+#endif
+
+	return 0;
+}
diff --git a/ras-memory-failure-handler.h b/ras-memory-failure-handler.h
new file mode 100644
index 0000000..b9e9971
--- /dev/null
+++ b/ras-memory-failure-handler.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H
+#define __RAS_MEMORY_FAILURE_HANDLER_H
+
+#include "ras-events.h"
+#include "libtrace/event-parse.h"
+
+/*
+ * Trace event handler for ras:memory_failure_event.
+ * Returns 0 on success and -1 when a required field cannot be read
+ * from the record.
+ */
+int ras_memory_failure_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context);
+
+#endif
diff --git a/ras-record.c b/ras-record.c
index 549c494..1a2ea06 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -498,6 +498,56 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
}
#endif
+/*
+ * Table and functions to handle ras:memory_failure
+ */
+
+#ifdef HAVE_MEMORY_FAILURE
+/*
+ * Columns of the memory_failure_event table. ras_store_mf_event() binds
+ * statement parameters 1..4 to timestamp, pfn, page_type and
+ * action_result, in this order; "id" is not bound by that function.
+ */
+static const struct db_fields mf_event_fields[] = {
+ { .name="id", .type="INTEGER PRIMARY KEY" },
+ { .name="timestamp", .type="TEXT" },
+ { .name="pfn", .type="TEXT" },
+ { .name="page_type", .type="TEXT" },
+ { .name="action_result", .type="TEXT" },
+};
+
+/* Descriptor tying the "memory_failure_event" table name to its columns. */
+static const struct db_table_descriptor mf_event_tab = {
+ .name = "memory_failure_event",
+ .fields = mf_event_fields,
+ .num_fields = ARRAY_SIZE(mf_event_fields),
+};
+
+/*
+ * Insert one memory_failure event into the memory_failure_event table.
+ *
+ * Returns 0 without touching the database when either the DB private
+ * data or the prepared statement is missing; otherwise returns the
+ * result of sqlite3_reset() on the statement. Step and reset failures
+ * are logged.
+ */
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int ret;
+
+	if (!priv)
+		return 0;
+
+	stmt = priv->stmt_mf_event;
+	if (!stmt)
+		return 0;
+
+	log(TERM, LOG_INFO, "memory_failure_event store: %p\n", stmt);
+
+	/* Parameters follow the column order after "id". */
+	sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL);
+	sqlite3_bind_text(stmt, 2, ev->pfn, -1, NULL);
+	sqlite3_bind_text(stmt, 3, ev->page_type, -1, NULL);
+	sqlite3_bind_text(stmt, 4, ev->action_result, -1, NULL);
+
+	ret = sqlite3_step(stmt);
+	if (!(ret == SQLITE_OK || ret == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed to do memory_failure_event step on sqlite: error = %d\n", ret);
+
+	/* Reset so the prepared statement can be reused for the next event. */
+	ret = sqlite3_reset(stmt);
+	if (!(ret == SQLITE_OK || ret == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed reset memory_failure_event on sqlite: error = %d\n",
+		    ret);
+
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return ret;
+}
+#endif
+
/*
* Generic code
*/
@@ -810,6 +860,16 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
}
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = ras_mc_create_table(priv, &mf_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event,
+ &mf_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
+#endif
+
ras->db_priv = priv;
return 0;
@@ -912,6 +972,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
}
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ if (priv->stmt_mf_event) {
+ rc = sqlite3_finalize(priv->stmt_mf_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize mf_event sqlite: error = %d\n",
+ cpu, rc);
+ }
+#endif
+
rc = sqlite3_close_v2(db);
if (rc != SQLITE_OK)
log(TERM, LOG_ERR,
diff --git a/ras-record.h b/ras-record.h
index cc217a9..4bbeb0c 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -98,6 +98,13 @@ struct diskerror_event {
const char *cmd;
};
+/*
+ * One decoded memory_failure trace event, as filled in by
+ * ras_memory_failure_event_handler().
+ */
+struct ras_mf_event {
+ char timestamp[64]; /* formatted with "%Y-%m-%d %H:%M:%S %z" */
+ char pfn[30]; /* page frame number, formatted with "0x%llx" */
+ const char *page_type; /* description string returned by get_page_type() */
+ const char *action_result; /* name string returned by get_action_result() */
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -106,6 +113,7 @@ struct ras_arm_event;
struct mce_event;
struct devlink_event;
struct diskerror_event;
+struct ras_mf_event;
#ifdef HAVE_SQLITE3
@@ -135,6 +143,9 @@ struct sqlite3_priv {
#ifdef HAVE_DISKERROR
sqlite3_stmt *stmt_diskerror_event;
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ sqlite3_stmt *stmt_mf_event;
+#endif
};
struct db_fields {
@@ -161,6 +172,7 @@ int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standar
int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -173,6 +185,7 @@ static inline int ras_store_non_standard_record(struct ras_events *ras, struct r
static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 2710eac..ea3a9b6 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -309,6 +309,28 @@ static int set_diskerror_event_backtrace(char *buf, struct diskerror_event *ev)
return 0;
}
+/*
+ * Append the "BACKTRACE=" item describing a memory_failure event to @buf.
+ *
+ * @buf: NUL-terminated destination string that the item is appended to.
+ *       NOTE(review): the capacity of @buf is not known here; the caller
+ *       is assumed to provide enough room -- confirm against
+ *       commit_report_backtrace().
+ * @ev:  event to describe.
+ *
+ * Returns 0 on success, -1 when @buf or @ev is NULL.
+ */
+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/* Bounded formatting: never write past the end of bt_buf. */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "pfn=%s\n"
+		 "page_type=%s\n"
+		 "action_result=%s\n",
+		 ev->timestamp,
+		 ev->pfn,
+		 ev->page_type,
+		 ev->action_result);
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -343,6 +365,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case DISKERROR_EVENT:
rc = set_diskerror_event_backtrace(buf, (struct diskerror_event *)ev);
break;
+ case MF_EVENT:
+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev);
+ break;
default:
return -1;
}
@@ -708,3 +733,46 @@ diskerror_fail:
return -1;
}
}
+
+/*
+ * Report a memory_failure event to ABRT over the report socket.
+ *
+ * Sends the basic report items, the event backtrace, and then the
+ * ANALYZER and REASON items, each written including its terminating NUL.
+ *
+ * @ras: daemon state (not used by this function)
+ * @ev:  event to report
+ *
+ * Returns 0 when every item was sent completely, -1 otherwise. The
+ * socket is closed on every path once it has been opened.
+ */
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	size_t len;
+	int sockfd;
+	int rc;
+	int ret = -1;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto mf_fail;
+
+	rc = commit_report_backtrace(sockfd, MF_EVENT, ev);
+	if (rc < 0)
+		goto mf_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-memory_failure");
+	len = strlen(buf) + 1;
+	rc = write(sockfd, buf, len);
+	/*
+	 * Test for a negative result first: comparing a negative int with a
+	 * size_t would convert it to a huge unsigned value and hide the
+	 * failure.
+	 */
+	if (rc < 0 || (size_t)rc < len)
+		goto mf_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "memory failure problem");
+	len = strlen(buf) + 1;
+	rc = write(sockfd, buf, len);
+	if (rc < 0 || (size_t)rc < len)
+		goto mf_fail;
+
+	ret = 0;
+
+mf_fail:
+	/* sockfd is always >= 0 here, and 0 is a valid descriptor too. */
+	close(sockfd);
+
+	return ret;
+}
diff --git a/ras-report.h b/ras-report.h
index 1d911de..e605eb1 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -38,6 +38,7 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar
int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
@@ -48,6 +49,7 @@ static inline int ras_report_non_standard_event(struct ras_events *ras, struct r
static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
static inline int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v2 2/8] rasdaemon: ras-mc-ctl: Modify ARM processor error summary log
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
2021-01-25 17:14 ` [PATCH v2 1/8] rasdaemon: add support for memory_failure events Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
2021-01-25 17:14 ` [PATCH v2 3/8] rasdaemon: ras-mc-ctl: Add memory failure events Shiju Jose
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
Add CPU's mpidr information to the ARM processor error
summary log.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
util/ras-mc-ctl.in | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index dd7d56f..d8abdbd 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1123,7 +1123,7 @@ sub summary
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
my ($etype, $severity, $etype_string, $severity_string);
my ($dev_name, $dev);
- my ($affinity, $mpidr);
+ my ($mpidr);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1160,13 +1160,13 @@ sub summary
$query_handle->finish;
# ARM processor arm_event errors
- $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
+ $query = "select mpidr, count(*) from arm_event group by mpidr";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($affinity, $mpidr, $count));
+ $query_handle->bind_columns(\($mpidr, $count));
$out = "";
while($query_handle->fetch()) {
- $out .= "\t$count errors\n";
+ $out .= sprintf "\tCPU(mpidr=0x%x) has %d errors\n", $mpidr, $count;
}
if ($out ne "") {
print "ARM processor events summary:\n$out\n";
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v2 3/8] rasdaemon: ras-mc-ctl: Add memory failure events
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
2021-01-25 17:14 ` [PATCH v2 1/8] rasdaemon: add support for memory_failure events Shiju Jose
2021-01-25 17:14 ` [PATCH v2 2/8] rasdaemon: ras-mc-ctl: Modify ARM processor error summary log Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
2021-01-25 17:14 ` [PATCH v2 4/8] rasdaemon: ras-mc-ctl: Fix for exception when an event is not enabled Shiju Jose
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
Add support for memory failure errors (memory_failure_event)
to the ras-mc-ctl tool.
Sample Log,
ras-mc-ctl --summary
...
Memory failure events summary:
Delayed errors: 4
Failed errors: 1
...
ras-mc-ctl --errors
...
Memory failure events:
1 2020-10-28 23:20:41 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
2 2020-10-28 23:31:38 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
3 2020-10-28 23:54:54 -0800 error: pfn=0x205000000, page_type=free buddy page, action_result=Delayed
4 2020-10-29 00:12:25 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
5 2020-10-29 00:26:36 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Failed
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
util/ras-mc-ctl.in | 36 +++++++++++++++++++++++++++++++++++-
1 file changed, 35 insertions(+), 1 deletion(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index d8abdbd..eebcc4e 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1120,7 +1120,7 @@ sub summary
{
require DBI;
my ($query, $query_handle, $out);
- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
+ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result);
my ($etype, $severity, $etype_string, $severity_string);
my ($dev_name, $dev);
my ($mpidr);
@@ -1225,6 +1225,22 @@ sub summary
}
$query_handle->finish;
+ # Memory failure errors
+ $query = "select action_result, count(*) from memory_failure_event group by action_result";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($action_result, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$action_result errors: $count\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events summary:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
+ }
+ $query_handle->finish;
+
# MCE mce_record errors
$query = "select error_msg, count(*) from mce_record group by error_msg";
$query_handle = $dbh->prepare($query);
@@ -1253,6 +1269,7 @@ sub errors
my ($bus_name, $dev_name, $driver_name, $reporter_name);
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
+ my ($pfn, $page_type, $action_result);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1384,6 +1401,23 @@ sub errors
}
$query_handle->finish;
+ # Memory failure errors
+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
+ }
+ $query_handle->finish;
+
# MCE mce_record errors
$query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v2 4/8] rasdaemon: ras-mc-ctl: Fix for exception when an event is not enabled
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
` (2 preceding siblings ...)
2021-01-25 17:14 ` [PATCH v2 3/8] rasdaemon: ras-mc-ctl: Add memory failure events Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
2021-01-25 17:14 ` [PATCH v2 5/8] rasdaemon: ras-mc-ctl: Add support for the vendor-specific errors Shiju Jose
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
When an event is not enabled in the build, and thus the event's table
is not present in the SQLite DB, DBI raises an exception and ras-mc-ctl
exits without reading and logging the remaining events' information.
Following is the error log when the devlink_event is not enabled,
"DBD::SQLite::db prepare failed: no such table: devlink_event at ./ras-mc-ctl line 1198.
Can't call method "execute" on an undefined value at ./ras-mc-ctl line 1199"
Add an extra check for whether an event is enabled in the build
before trying to read its table.
Reported-by: Xiaofei Tan <tanxiaofei@huawei.com>
Suggested-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
configure.ac | 7 +
util/ras-mc-ctl.in | 506 ++++++++++++++++++++++++---------------------
2 files changed, 278 insertions(+), 235 deletions(-)
diff --git a/configure.ac b/configure.ac
index a6251d4..9893bb4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -42,6 +42,7 @@ AC_SUBST([SQLITE3_LIBS])
AC_ARG_ENABLE([aer],
AS_HELP_STRING([--enable-aer], [enable PCIe AER events (currently experimental)]))
+AC_SUBST([enable_aer])
AS_IF([test "x$enable_aer" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_AER,1,"have PCIe AER events collect")
@@ -63,6 +64,7 @@ AM_COND_IF([WITH_NON_STANDARD], [USE_NON_STANDARD="yes"], [USE_NON_STANDARD="no"
AC_ARG_ENABLE([arm],
AS_HELP_STRING([--enable-arm], [enable ARM events (currently experimental)]))
+AC_SUBST([enable_arm])
AS_IF([test "x$enable_arm" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_ARM,1,"have ARM events collect")
@@ -73,6 +75,7 @@ AM_COND_IF([WITH_ARM], [USE_ARM="yes"], [USE_ARM="no"])
AC_ARG_ENABLE([mce],
AS_HELP_STRING([--enable-mce], [enable MCE events (currently experimental)]))
+AC_SUBST([enable_mce])
AS_IF([test "x$enable_mce" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_MCE,1,"have PCIe MCE events collect")
@@ -83,6 +86,7 @@ AM_COND_IF([WITH_MCE], [USE_MCE="yes"], [USE_MCE="no"])
AC_ARG_ENABLE([extlog],
AS_HELP_STRING([--enable-extlog], [enable EXTLOG events (currently experimental)]))
+AC_SUBST([enable_extlog])
AS_IF([test "x$enable_extlog" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_EXTLOG,1,"have EXTLOG events collect")
@@ -93,6 +97,7 @@ AM_COND_IF([WITH_EXTLOG], [USE_EXTLOG="yes"], [USE_EXTLOG="no"])
AC_ARG_ENABLE([devlink],
AS_HELP_STRING([--enable-devlink], [enable devlink health events (currently experimental)]))
+AC_SUBST([enable_devlink])
AS_IF([test "x$enable_devlink" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_DEVLINK,1,"have devlink health events collect")
@@ -103,6 +108,7 @@ AM_COND_IF([WITH_DEVLINK], [USE_DEVLINK="yes"], [USE_DEVLINK="no"])
AC_ARG_ENABLE([diskerror],
AS_HELP_STRING([--enable-diskerror], [enable disk I/O error events (currently experimental)]))
+AC_SUBST([enable_diskerror])
AS_IF([test "x$enable_diskerror" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_DISKERROR,1,"have disk I/O errors collect")
@@ -113,6 +119,7 @@ AM_COND_IF([WITH_DISKERROR], [USE_DISKERROR="yes"], [USE_DISKERROR="no"])
AC_ARG_ENABLE([memory_failure],
AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)]))
+AC_SUBST([enable_memory_failure])
AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect")
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index eebcc4e..97b1fa4 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -65,6 +65,14 @@ $conf{mbconfig} = "$sysconfdir/ras/mainboard";
my $status = 0;
+my $enable_aer = "@enable_aer@";
+my $enable_arm = "@enable_arm@";
+my $enable_mce = "@enable_mce@";
+my $enable_extlog = "@enable_extlog@";
+my $enable_devlink = "@enable_devlink@";
+my $enable_diskerror = "@enable_diskerror@";
+my $enable_mem_failure = "@enable_memory_failure@";
+
my $usage = <<EOF;
Usage: $prog [OPTIONS...]
--quiet Quiet operation.
@@ -1144,118 +1152,132 @@ sub summary
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($err_type, $msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $err_type errors: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events summary:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($enable_aer eq "yes") {
+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_type, $msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $err_type errors: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events summary:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select mpidr, count(*) from arm_event group by mpidr";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($mpidr, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= sprintf "\tCPU(mpidr=0x%x) has %d errors\n", $mpidr, $count;
- }
- if ($out ne "") {
- print "ARM processor events summary:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($enable_arm eq "yes") {
+ $query = "select mpidr, count(*) from arm_event group by mpidr";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($mpidr, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= sprintf "\tCPU(mpidr=0x%x) has %d errors\n", $mpidr, $count;
+ }
+ if ($out ne "") {
+ print "ARM processor events summary:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# extlog errors
- $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($etype, $severity, $count));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "\t$count $etype_string $severity_string errors\n";
- }
- if ($out ne "") {
- print "Extlog records summary:\n$out";
- } else {
- print "No Extlog errors.\n\n";
+ if ($enable_extlog eq "yes") {
+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($etype, $severity, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "\t$count $etype_string $severity_string errors\n";
+ }
+ if ($out ne "") {
+ print "Extlog records summary:\n$out";
+ } else {
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# devlink errors
- $query = "select dev_name, count(*) from devlink_event group by dev_name";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($dev_name, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$dev_name has $count errors\n";
- }
- if ($out ne "") {
- print "Devlink records summary:\n$out";
- } else {
- print "No devlink errors.\n";
+ if ($enable_devlink eq "yes") {
+ $query = "select dev_name, count(*) from devlink_event group by dev_name";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($dev_name, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$dev_name has $count errors\n";
+ }
+ if ($out ne "") {
+ print "Devlink records summary:\n$out";
+ } else {
+ print "No devlink errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Disk errors
- $query = "select dev, count(*) from disk_errors group by dev";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($dev, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$dev has $count errors\n";
- }
- if ($out ne "") {
- print "Disk errors summary:\n$out";
- } else {
- print "No disk errors.\n";
+ if ($enable_diskerror eq "yes") {
+ $query = "select dev, count(*) from disk_errors group by dev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($dev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$dev has $count errors\n";
+ }
+ if ($out ne "") {
+ print "Disk errors summary:\n$out";
+ } else {
+ print "No disk errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Memory failure errors
- $query = "select action_result, count(*) from memory_failure_event group by action_result";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($action_result, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$action_result errors: $count\n";
- }
- if ($out ne "") {
- print "Memory failure events summary:\n$out\n";
- } else {
- print "No Memory failure errors.\n\n";
+ if ($enable_mem_failure eq "yes") {
+ $query = "select action_result, count(*) from memory_failure_event group by action_result";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($action_result, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$action_result errors: $count\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events summary:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select error_msg, count(*) from mce_record group by error_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $msg errors\n";
- }
- if ($out ne "") {
- print "MCE records summary:\n$out";
- } else {
- print "No MCE errors.\n";
+ if ($enable_mce eq "yes") {
+ $query = "select error_msg, count(*) from mce_record group by error_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $msg errors\n";
+ }
+ if ($out ne "") {
+ print "MCE records summary:\n$out";
+ } else {
+ print "No MCE errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}
@@ -1294,167 +1316,181 @@ sub errors
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $devname, $type, $msg));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time $devname $type error: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($enable_aer eq "yes") {
+ $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $devname, $type, $msg));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time $devname $type error: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $timestamp error: ";
- $out .= "error_count=$error_count, " if ($error_count);
- $out .= "affinity_level=$affinity, ";
- $out .= sprintf "mpidr=0x%x, ", $mpidr;
- $out .= sprintf "running_state=0x%x, ", $r_state;
- $out .= sprintf "psci_state=0x%x", $psci_state;
- $out .= "\n";
- }
- if ($out ne "") {
- print "ARM processor events:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($enable_arm eq "yes") {
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "error_count=$error_count, " if ($error_count);
+ $out .= "affinity_level=$affinity, ";
+ $out .= sprintf "mpidr=0x%x, ", $mpidr;
+ $out .= sprintf "running_state=0x%x, ", $r_state;
+ $out .= sprintf "psci_state=0x%x", $psci_state;
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Extlog errors
- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "$id $timestamp error: ";
- $out .= "type=$etype_string, ";
- $out .= "severity=$severity_string, ";
- $out .= sprintf "address=0x%08x, ", $addr;
- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
- $out .= "fru_text='$fru_text', ";
- $out .= get_cper_data_text($cper_data) if ($cper_data);
- $out .= "\n";
- }
- if ($out ne "") {
- print "Extlog events:\n$out\n";
- } else {
- print "No Extlog errors.\n\n";
+ if ($enable_extlog eq "yes") {
+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "$id $timestamp error: ";
+ $out .= "type=$etype_string, ";
+ $out .= "severity=$severity_string, ";
+ $out .= sprintf "address=0x%08x, ", $addr;
+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
+ $out .= "fru_text='$fru_text', ";
+ $out .= get_cper_data_text($cper_data) if ($cper_data);
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "Extlog events:\n$out\n";
+ } else {
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# devlink errors
- $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, $reporter_name, $msg));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $timestamp error: ";
- $out .= "bus_name=$bus_name, ";
- $out .= "dev_name=$dev_name, ";
- $out .= "driver_name=$driver_name, ";
- $out .= "reporter_name=$reporter_name, ";
- $out .= "message='$msg', ";
- $out .= "\n";
- }
- if ($out ne "") {
- print "Devlink events:\n$out\n";
- } else {
- print "No devlink errors.\n\n";
+ if ($enable_devlink eq "yes") {
+ $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, $reporter_name, $msg));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "bus_name=$bus_name, ";
+ $out .= "dev_name=$dev_name, ";
+ $out .= "driver_name=$driver_name, ";
+ $out .= "reporter_name=$reporter_name, ";
+ $out .= "message='$msg', ";
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "Devlink events:\n$out\n";
+ } else {
+ print "No devlink errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Disk errors
- $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $timestamp error: ";
- $out .= "dev=$dev, ";
- $out .= "sector=$sector, ";
- $out .= "nr_sector=$nr_sector, ";
- $out .= "error='$error', ";
- $out .= "rwbs='$rwbs', ";
- $out .= "cmd='$cmd', ";
- $out .= "\n";
- }
- if ($out ne "") {
- print "Disk errors\n$out\n";
- } else {
- print "No disk errors.\n\n";
+ if ($enable_diskerror eq "yes") {
+ $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "dev=$dev, ";
+ $out .= "sector=$sector, ";
+ $out .= "nr_sector=$nr_sector, ";
+ $out .= "error='$error', ";
+ $out .= "rwbs='$rwbs', ";
+ $out .= "cmd='$cmd', ";
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "Disk errors\n$out\n";
+ } else {
+ print "No disk errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Memory failure errors
- $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $timestamp error: ";
- $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n";
- }
- if ($out ne "") {
- print "Memory failure events:\n$out\n";
- } else {
- print "No Memory failure errors.\n\n";
+ if ($enable_mem_failure eq "yes") {
+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time error: $msg";
- $out .= ", CPU $cpuvendor" if ($cpuvendor);
- $out .= ", bank $bank_name" if ($bank_name);
- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
- $out .= ", $mc_location" if ($mc_location);
- $out .= ", $user_action" if ($user_action);
- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
- $out .= sprintf ", status=0x%08x", $status if ($status);
- $out .= sprintf ", addr=0x%08x", $addr if ($addr);
- $out .= sprintf ", misc=0x%08x", $misc if ($misc);
- $out .= sprintf ", ip=0x%08x", $ip if ($ip);
- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
- $out .= sprintf ", cs=0x%08x", $cs if ($cs);
- $out .= sprintf ", bank=0x%08x", $bank if ($bank);
-
- $out .= "\n";
- }
- if ($out ne "") {
- print "MCE events:\n$out\n";
- } else {
- print "No MCE errors.\n\n";
+ if ($enable_mce eq "yes") {
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time error: $msg";
+ $out .= ", CPU $cpuvendor" if ($cpuvendor);
+ $out .= ", bank $bank_name" if ($bank_name);
+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
+ $out .= ", $mc_location" if ($mc_location);
+ $out .= ", $user_action" if ($user_action);
+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
+ $out .= sprintf ", status=0x%08x", $status if ($status);
+ $out .= sprintf ", addr=0x%08x", $addr if ($addr);
+ $out .= sprintf ", misc=0x%08x", $misc if ($misc);
+ $out .= sprintf ", ip=0x%08x", $ip if ($ip);
+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
+ $out .= sprintf ", cs=0x%08x", $cs if ($cs);
+ $out .= sprintf ", bank=0x%08x", $bank if ($bank);
+
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "MCE events:\n$out\n";
+ } else {
+ print "No MCE errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v2 5/8] rasdaemon: ras-mc-ctl: Add support for the vendor-specific errors
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
` (3 preceding siblings ...)
2021-01-25 17:14 ` [PATCH v2 4/8] rasdaemon: ras-mc-ctl: Fix for exception when an event is not enabled Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
2021-01-25 17:14 ` [PATCH v2 6/8] rasdaemon: ras-mc-ctl: Add support for HiSilicon Kunpeng920 errors Shiju Jose
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
Add commands to ras-mc-ctl to support reporting the
vendor-specific error info.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Xiaofei Tan <tanxiaofei@huawei.com>
---
util/ras-mc-ctl.in | 64 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 63 insertions(+), 1 deletion(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 97b1fa4..6820823 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -87,6 +87,9 @@ Usage: $prog [OPTIONS...]
--summary Presents a summary of the logged errors.
--errors Shows the errors stored at the error database.
--error-count Shows the corrected and uncorrected error counts using sysfs.
+ --vendor-errors-summary <platform-id> Presents a summary of the vendor-specific logged errors.
+ --vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
+ --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors.
--help This help message.
EOF
@@ -134,6 +137,18 @@ if ($conf{opt}{errors}) {
errors ();
}
+if ($conf{opt}{vendor_errors_summary}) {
+ vendor_errors_summary ();
+}
+
+if ($conf{opt}{vendor_errors}) {
+ vendor_errors ();
+}
+
+if ($conf{opt}{vendor_platforms}) {
+ vendor_platforms ();
+}
+
exit (0);
sub parse_cmdline
@@ -149,6 +164,9 @@ sub parse_cmdline
$conf{opt}{summary} = 0;
$conf{opt}{errors} = 0;
$conf{opt}{error_count} = 0;
+ $conf{opt}{vendor_errors_summary} = 0;
+ $conf{opt}{vendor_errors} = 0;
+ $conf{opt}{vendor_platforms} = 0;
my $rref = \$conf{opt}{report};
my $mref = \$conf{opt}{mainboard};
@@ -166,7 +184,10 @@ sub parse_cmdline
"layout" => \$conf{opt}{display_memory_layout},
"summary" => \$conf{opt}{summary},
"errors" => \$conf{opt}{errors},
- "error-count" => \$conf{opt}{error_count}
+ "error-count" => \$conf{opt}{error_count},
+ "vendor-errors-summary" => \$conf{opt}{vendor_errors_summary},
+ "vendor-errors" => \$conf{opt}{vendor_errors},
+ "vendor-platforms" => \$conf{opt}{vendor_platforms},
);
usage(1) if !$rc;
@@ -1495,6 +1516,47 @@ sub errors
undef($dbh);
}
+sub vendor_errors_summary
+{
+ require DBI;
+ my ($num_args, $platform_id);
+
+ $num_args = $#ARGV + 1;
+ $platform_id = 0;
+ if ($num_args ne 0) {
+ $platform_id = $ARGV[0];
+ } else {
+ return;
+ }
+
+ my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
+
+ undef($dbh);
+}
+
+sub vendor_errors
+{
+ require DBI;
+ my ($num_args, $platform_id);
+
+ $num_args = $#ARGV + 1;
+ $platform_id = 0;
+ if ($num_args ne 0) {
+ $platform_id = $ARGV[0];
+ } else {
+ return;
+ }
+
+ my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
+
+ undef($dbh);
+}
+
+sub vendor_platforms
+{
+ print "\nSupported platforms for the vendor-specific errors:\n";
+}
+
sub log_msg { print STDERR "$prog: ", @_ unless $conf{opt}{quiet}; }
sub log_error { log_msg ("Error: @_"); }
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v2 6/8] rasdaemon: ras-mc-ctl: Add support for HiSilicon Kunpeng920 errors
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
` (4 preceding siblings ...)
2021-01-25 17:14 ` [PATCH v2 5/8] rasdaemon: ras-mc-ctl: Add support for the vendor-specific errors Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
2021-01-25 17:14 ` [PATCH v2 7/8] rasdaemon: ras-mc-ctl: Add support for HiSilicon Kunpeng9xx common errors Shiju Jose
2021-01-25 17:14 ` [PATCH v2 8/8] rasdaemon: Modify confiure.ac for Hisilicon Kunpeng errors Shiju Jose
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
Add support for the HiSilicon Kunpeng920 errors.
Supported error formats: OEM type 1, OEM type 2 and PCIe controller
error formats.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Xiaofei Tan <tanxiaofei@huawei.com>
---
util/ras-mc-ctl.in | 149 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 149 insertions(+)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 6820823..8befc5d 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1516,10 +1516,17 @@ sub errors
undef($dbh);
}
+# Definitions of the vendor platform IDs.
+use constant {
+ HISILICON_KUNPENG_920 => "Kunpeng920",
+};
+
sub vendor_errors_summary
{
require DBI;
my ($num_args, $platform_id);
+ my ($query, $query_handle, $count, $out);
+ my ($module_id, $sub_module_id, $err_severity, $err_sev);
$num_args = $#ARGV + 1;
$platform_id = 0;
@@ -1531,6 +1538,69 @@ sub vendor_errors_summary
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
+ # HiSilicon Kunpeng920 errors
+ if ($platform_id eq HISILICON_KUNPENG_920) {
+ $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_severity, $module_id, $count));
+ $out = "";
+ $err_sev = "";
+ while($query_handle->fetch()) {
+ if ($err_severity ne $err_sev) {
+ $out .= "$err_severity errors:\n";
+ $err_sev = $err_severity;
+ }
+ $out .= "\t$module_id: $count\n";
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n";
+ }
+ $query_handle->finish;
+
+ $query = "select err_severity, module_id, count(*) from hip08_oem_type2_event_v2 group by err_severity, module_id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_severity, $module_id, $count));
+ $out = "";
+ $err_sev = "";
+ while($query_handle->fetch()) {
+ if ($err_severity ne $err_sev) {
+ $out .= "$err_severity errors:\n";
+ $err_sev = $err_severity;
+ }
+ $out .= "\t$module_id: $count\n";
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n";
+ }
+ $query_handle->finish;
+
+ $query = "select err_severity, sub_module_id, count(*) from hip08_pcie_local_event_v2 group by err_severity, sub_module_id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_severity, $sub_module_id, $count));
+ $out = "";
+ $err_sev = "";
+ while($query_handle->fetch()) {
+ if ($err_severity ne $err_sev) {
+ $out .= "$err_severity errors:\n";
+ $err_sev = $err_severity;
+ }
+ $out .= "\t$sub_module_id: $count\n";
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n";
+ }
+ $query_handle->finish;
+ }
+
undef($dbh);
}
@@ -1538,6 +1608,9 @@ sub vendor_errors
{
require DBI;
my ($num_args, $platform_id);
+ my ($query, $query_handle, $id, $timestamp, $out);
+ my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id);
+ my ($module_id, $sub_module_id, $err_severity, $err_type, $regs);
$num_args = $#ARGV + 1;
$platform_id = 0;
@@ -1549,12 +1622,88 @@ sub vendor_errors
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
+ # HiSilicon Kunpeng920 errors
+ if ($platform_id eq HISILICON_KUNPENG_920) {
+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "module_id=$module_id, " if ($module_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "err_severity=$err_severity, \n" if ($err_severity);
+ $out .= "Error Registers: $regs\n\n" if ($regs);
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng920 OEM type1 errors.\n";
+ }
+ $query_handle->finish;
+
+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type2_event_v2 order by id, module_id, err_severity";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "module_id=$module_id, " if ($module_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "err_severity=$err_severity, \n" if ($err_severity);
+ $out .= "Error Registers: $regs\n\n" if ($regs);
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng920 OEM type2 errors.\n";
+ }
+ $query_handle->finish;
+
+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, sub_module_id, core_id, port_id, err_severity, err_type, regs_dump from hip08_pcie_local_event_v2 order by id, sub_module_id, err_severity";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "core_id=$core_id, " if ($core_id);
+ $out .= "port_id=$port_id, " if ($port_id);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "err_type=$err_type, \n" if ($err_type);
+ $out .= "Error Registers: $regs\n\n" if ($regs);
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng920 PCIe controller errors.\n";
+ }
+ $query_handle->finish;
+ }
+
undef($dbh);
}
sub vendor_platforms
{
print "\nSupported platforms for the vendor-specific errors:\n";
+ print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n";
+ print "\n";
}
sub log_msg { print STDERR "$prog: ", @_ unless $conf{opt}{quiet}; }
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v2 7/8] rasdaemon: ras-mc-ctl: Add support for HiSilicon Kunpeng9xx common errors
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
` (5 preceding siblings ...)
2021-01-25 17:14 ` [PATCH v2 6/8] rasdaemon: ras-mc-ctl: Add support for HiSilicon Kunpeng920 errors Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
2021-01-25 17:14 ` [PATCH v2 8/8] rasdaemon: Modify confiure.ac for Hisilicon Kunpeng errors Shiju Jose
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
Add support for the HiSilicon Kunpeng9xx platforms common errors.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Xiaofei Tan <tanxiaofei@huawei.com>
---
util/ras-mc-ctl.in | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 8befc5d..37a5042 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1519,6 +1519,7 @@ sub errors
# Definitions of the vendor platform IDs.
use constant {
HISILICON_KUNPENG_920 => "Kunpeng920",
+ HISILICON_KUNPENG_9XX => "Kunpeng9xx",
};
sub vendor_errors_summary
@@ -1526,7 +1527,7 @@ sub vendor_errors_summary
require DBI;
my ($num_args, $platform_id);
my ($query, $query_handle, $count, $out);
- my ($module_id, $sub_module_id, $err_severity, $err_sev);
+ my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info);
$num_args = $#ARGV + 1;
$platform_id = 0;
@@ -1601,6 +1602,24 @@ sub vendor_errors_summary
$query_handle->finish;
}
+ # HiSilicon Kunpeng9xx common errors
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
+ $query = "select err_info, count(*) from hisi_common_section";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_info, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\terrors: $count\n";
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng9xx common errors.\n\n";
+ }
+ $query_handle->finish;
+ }
+
undef($dbh);
}
@@ -1610,7 +1629,7 @@ sub vendor_errors
my ($num_args, $platform_id);
my ($query, $query_handle, $id, $timestamp, $out);
my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id);
- my ($module_id, $sub_module_id, $err_severity, $err_type, $regs);
+ my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs);
$num_args = $#ARGV + 1;
$platform_id = 0;
@@ -1696,6 +1715,26 @@ sub vendor_errors
$query_handle->finish;
}
+ # HiSilicon Kunpeng9xx common errors
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
+ $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id. $timestamp ";
+ $out .= "Error Info:$err_info \n" if ($err_info);
+ $out .= "Error Registers: $regs\n\n" if ($regs);
+ }
+ if ($out ne "") {
+ print "HiSilicon Kunpeng9xx common error events:\n$out\n";
+ } else {
+ print "No HiSilicon Kunpeng9xx common errors.\n";
+ }
+ $query_handle->finish;
+ }
+
undef($dbh);
}
@@ -1703,6 +1742,7 @@ sub vendor_platforms
{
print "\nSupported platforms for the vendor-specific errors:\n";
print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n";
+ print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n";
print "\n";
}
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v2 8/8] rasdaemon: Modify configure.ac for Hisilicon Kunpeng errors
2021-01-25 17:14 [PATCH v2 0/8] rasdaemon: add support for memory_failure events, Shiju Jose
` (6 preceding siblings ...)
2021-01-25 17:14 ` [PATCH v2 7/8] rasdaemon: ras-mc-ctl: Add support for HiSilicon Kunpeng9xx common errors Shiju Jose
@ 2021-01-25 17:14 ` Shiju Jose
7 siblings, 0 replies; 9+ messages in thread
From: Shiju Jose @ 2021-01-25 17:14 UTC (permalink / raw)
To: linux-edac, mchehab+huawei; +Cc: linuxarm, tanxiaofei, jonathan.cameron
In the compile time options summary, rename the label
"HIP07 SAS HW errors : $USE_HISI_NS_DECODE" to
"HISI Kunpeng errors : $USE_HISI_NS_DECODE".
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
configure.ac | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/configure.ac b/configure.ac
index 9893bb4..3a8b0c7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -191,7 +191,7 @@ compile time options summary
EXTLOG : $USE_EXTLOG
CPER non-standard : $USE_NON_STANDARD
ABRT report : $USE_ABRT_REPORT
- HIP07 SAS HW errors : $USE_HISI_NS_DECODE
+ HISI Kunpeng errors : $USE_HISI_NS_DECODE
ARM events : $USE_ARM
DEVLINK : $USE_DEVLINK
Disk I/O errors : $USE_DISKERROR
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread