linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mauro Carvalho Chehab <mchehab@redhat.com>
To: unlisted-recipients:; (no To-header on input)
Cc: Mauro Carvalho Chehab <mchehab@redhat.com>,
	Linux Edac Mailing List <linux-edac@vger.kernel.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: [PATCH RFCv2 03/16] hw_event: Consolidate uncorrected/corrected error msgs into one
Date: Sat, 28 Jan 2012 13:32:38 -0200	[thread overview]
Message-ID: <1327764771-28649-4-git-send-email-mchehab@redhat.com> (raw)
In-Reply-To: <1327764771-28649-1-git-send-email-mchehab@redhat.com>

This is an RFC patch, consolidating the separate corrected and
uncorrected error trace events (including their FBD and "no info"
variants) into a single mc_error event.
Not sure if this is the best thing to do, but it simplifies
the error tracepoint, while still keeping the technical details
that may be needed by someone debugging the driver, or by
vendors who want to double-check what's happening inside the system.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/edac_mc.c          |   51 +++++++--
 include/linux/edac.h            |    6 +
 include/trace/events/hw_event.h |  231 ++++-----------------------------------
 3 files changed, 68 insertions(+), 220 deletions(-)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 2b8382e..5038239 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -685,6 +685,7 @@ void edac_mc_handle_ce(struct mem_ctl_info *mci,
 		int row, int channel, const char *msg)
 {
 	unsigned long remapped_page;
+	char detail[80];
 
 	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
@@ -711,8 +712,15 @@ void edac_mc_handle_ce(struct mem_ctl_info *mci,
 		return;
 	}
 
-	trace_mc_corrected_error(mci, page_frame_number, offset_in_page,
-				syndrome, row, channel, msg);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 " (page 0x%lx, offset 0x%lx, grain %d, "
+		 "syndrome 0x%lx, row %d, channel %d)\n",
+		 page_frame_number, offset_in_page,
+		 mci->csrows[row].grain, syndrome, row, channel);
+	trace_mc_error(HW_EVENT_ERR_CORRECTED, mci->mc_idx,
+		       mci->csrows[row].channels[channel].label,
+		       msg, detail);
 
 	if (edac_mc_get_log_ce())
 		/* FIXME - put in DIMM location */
@@ -749,7 +757,8 @@ EXPORT_SYMBOL_GPL(edac_mc_handle_ce);
 
 void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg)
 {
-	trace_mc_corrected_error_no_info(mci, msg);
+	trace_mc_error(HW_EVENT_ERR_CORRECTED, mci->mc_idx,
+		       "unknown", msg, "");
 	if (edac_mc_get_log_ce())
 		edac_mc_printk(mci, KERN_WARNING,
 			"CE - no information available: %s\n", msg);
@@ -768,6 +777,7 @@ void edac_mc_handle_ue(struct mem_ctl_info *mci,
 	char *pos = labels;
 	int chan;
 	int chars;
+	char detail[80];
 
 	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
@@ -796,8 +806,15 @@ void edac_mc_handle_ue(struct mem_ctl_info *mci,
 		pos += chars;
 	}
 
-	trace_mc_uncorrected_error(mci, page_frame_number, offset_in_page,
-				row, msg, labels);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 "page 0x%lx, offset 0x%lx, grain %d, row %d ",
+		 page_frame_number, offset_in_page,
+	         mci->csrows[row].grain, row);
+	trace_mc_error(HW_EVENT_ERR_UNCORRECTED, mci->mc_idx,
+		       labels,
+		       msg, detail);
+
 	if (edac_mc_get_log_ue())
 		edac_mc_printk(mci, KERN_EMERG,
 			"UE page 0x%lx, offset 0x%lx, grain %d, row %d, "
@@ -818,7 +835,8 @@ EXPORT_SYMBOL_GPL(edac_mc_handle_ue);
 
 void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
 {
-	trace_mc_uncorrected_error_no_info(mci, msg);
+	trace_mc_error(HW_EVENT_ERR_UNCORRECTED, mci->mc_idx,
+		       "unknown", msg, "");
 	if (edac_mc_get_panic_on_ue())
 		panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);
 
@@ -843,6 +861,7 @@ void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
 	char labels[len + 1];
 	char *pos = labels;
 	int chars;
+	char detail[80];
 
 	if (csrow >= mci->nr_csrows) {
 		/* something is wrong */
@@ -891,8 +910,13 @@ void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
 	chars = snprintf(pos, len + 1, "-%s",
 			 mci->csrows[csrow].channels[channelb].label);
 
-	trace_mc_uncorrected_error_fbd(mci, csrow, channela, channelb,
-				       msg, labels);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 "row %d, channel-a= %d channel-b= %d ",
+		 csrow, channela, channelb);
+	trace_mc_error(HW_EVENT_ERR_UNCORRECTED, mci->mc_idx,
+		       labels,
+		       msg, detail);
 	if (edac_mc_get_log_ue())
 		edac_mc_printk(mci, KERN_EMERG,
 			"UE row %d, channel-a= %d channel-b= %d "
@@ -913,7 +937,7 @@ EXPORT_SYMBOL(edac_mc_handle_fbd_ue);
 void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
 			unsigned int csrow, unsigned int channel, char *msg)
 {
-
+	char detail[80];
 	/* Ensure boundary values */
 	if (csrow >= mci->nr_csrows) {
 		/* something is wrong */
@@ -936,7 +960,14 @@ void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
 		return;
 	}
 
-	trace_mc_corrected_error_fbd(mci, csrow, channel, msg);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 "(row %d, channel %d)\n",
+		 csrow, channel);
+	trace_mc_error(HW_EVENT_ERR_CORRECTED, mci->mc_idx,
+		       mci->csrows[csrow].channels[channel].label,
+		       msg, detail);
+
 	if (edac_mc_get_log_ce())
 		/* FIXME - put in DIMM location */
 		edac_mc_printk(mci, KERN_WARNING,
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 055b248..3ba99d7 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -66,6 +66,12 @@ enum dev_type {
 #define DEV_FLAG_X32		BIT(DEV_X32)
 #define DEV_FLAG_X64		BIT(DEV_X64)
 
+enum hw_event_mc_err_type {
+	HW_EVENT_ERR_CORRECTED,
+	HW_EVENT_ERR_UNCORRECTED,
+	HW_EVENT_ERR_FATAL,
+};
+
 /* memory types */
 enum mem_type {
 	MEM_EMPTY = 0,		/* Empty csrow */
diff --git a/include/trace/events/hw_event.h b/include/trace/events/hw_event.h
index 85fca0d..fee7ed2 100644
--- a/include/trace/events/hw_event.h
+++ b/include/trace/events/hw_event.h
@@ -52,183 +52,42 @@ DEFINE_EVENT(hw_event_class, hw_event_init,
 /*
  * Default error mechanisms for Memory Controller errors (CE and UE)
  */
-TRACE_EVENT(mc_corrected_error,
+TRACE_EVENT(mc_error,
 
-	TP_PROTO(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page, unsigned long syndrome,
-		int row, int channel, const char *msg),
+	TP_PROTO(unsigned int err_type,
+		 unsigned int mc_index,
+		 const char *label,
+		 const char *msg,
+		 const char *detail),
 
-	TP_ARGS(mci, page_frame_number, offset_in_page, syndrome, row,
-		channel, msg),
+	TP_ARGS(err_type, mc_index, label, msg, detail),
 
 	TP_STRUCT__entry(
+		__field(	unsigned int,	err_type		)
 		__field(	unsigned int,	mc_index		)
-		__field(	unsigned long,	page_frame_number	)
-		__field(	unsigned long,	offset_in_page		)
-		__field(	u32,		grain			)
-		__field(	unsigned long,	syndrome		)
-		__field(	int,		row			)
-		__field(	int,		channel			)
-		__string(	label,		mci->csrows[row].channels[channel].label)
-		__string(	msg,		msg			)
-	),
-
-	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->page_frame_number	= page_frame_number;
-		__entry->offset_in_page		= offset_in_page;
-		__entry->grain			= mci->csrows[row].grain;
-		__entry->syndrome		= syndrome;
-		__entry->row			= row;
-		__entry->channel		= channel;
-		__assign_str(label, mci->csrows[row].channels[channel].label);
-		__assign_str(msg, msg);
-	),
-
-	TP_printk(HW_ERR "mce#%d: Corrected error %s on label \"%s\" "
-			 "(page 0x%lux, offset 0x%lux, grain %ud, "
-			 "syndrome 0x%lux, row %d, channel %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->page_frame_number,
-		__entry->offset_in_page,
-		__entry->grain,
-		__entry->syndrome,
-		__entry->row,
-		__entry->channel)
-);
-
-TRACE_EVENT(mc_uncorrected_error,
-
-	TP_PROTO(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page,
-		int row, const char *msg, const char *label),
-
-	TP_ARGS(mci, page_frame_number, offset_in_page,
-		row, msg, label),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	mc_index		)
-		__field(	unsigned long,	page_frame_number	)
-		__field(	unsigned long,	offset_in_page		)
-		__field(	u32,		grain			)
-		__field(	int,		row			)
-		__string(	msg,		msg			)
 		__string(	label,		label			)
-	),
-
-	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->page_frame_number	= page_frame_number;
-		__entry->offset_in_page		= offset_in_page;
-		__entry->grain			= mci->csrows[row].grain;
-		__entry->row			= row;
-		__assign_str(msg, msg);
-		__assign_str(label, label);
-	),
-
-	TP_printk(HW_ERR "mce#%d: Uncorrected error %s on label \"%s\""
-			 "(page 0x%lux, offset 0x%lux, grain %ud, row %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->page_frame_number,
-		__entry->offset_in_page,
-		__entry->grain,
-		__entry->row)
-);
-
-
-/*
- * Fully-Buffered memory hardware in general don't provide syndrome/grain/row
- * information for all types of errors. So, we need to either have another
- * trace event or add a bitmapped field to indicate that some info are not
- * provided and use the previously-declared event. It seemed easier and less
- * confusing to create a different event for such cases
- */
-TRACE_EVENT(mc_corrected_error_fbd,
-
-	TP_PROTO(struct mem_ctl_info *mci,
-		int row, int channel, const char *msg),
-
-	TP_ARGS(mci, row, channel, msg),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	mc_index		)
-		__field(	int,		row			)
-		__field(	int,		channel	        	)
-		__string(	label,		mci->csrows[row].channels[channel].label)
 		__string(	msg,		msg			)
+		__string(	detail,		detail			)
 	),
 
 	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->row			= row;
-		__entry->channel		= channel;
-		__assign_str(label, mci->csrows[row].channels[channel].label);
-		__assign_str(msg, msg);
-	),
-
-	TP_printk(HW_ERR "mce#%d: Corrected Error %s on label \"%s\" "
-			 "(row %d, channel %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->row,
-		__entry->channel)
-);
-
-TRACE_EVENT(mc_uncorrected_error_fbd,
-
-	TP_PROTO(struct mem_ctl_info *mci,
-		int row, int channela, int channelb,
-		const char *msg, const char *label),
-
-	TP_ARGS(mci, row, channela, channelb, msg, label),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	mc_index		)
-		__field(	int,		row			)
-		__field(	int,		channela		)
-		__field(	int,		channelb		)
-		__string(	msg,		msg			)
-		__string(	label,		label			)
-	),
-
-	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->row			= row;
-		__entry->channela		= channela;
-		__entry->channelb		= channelb;
-		__assign_str(msg, msg);
+		__entry->err_type		= err_type;
+		__entry->mc_index		= mc_index;
 		__assign_str(label, label);
+		__assign_str(msg, msg);
+		__assign_str(detail, detail);
 	),
 
-	TP_printk(HW_ERR "mce#%d: Uncorrected Error %s on label \"%s\" "
-			 "(row %d, channels: %d, %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->row,
-		__entry->channela,
-		__entry->channelb)
+	TP_printk(HW_ERR "mce#%d: %s error %s on label \"%s\" %s\n",
+		  __entry->mc_index,
+		  (__entry->err_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" :
+			((__entry->err_type == HW_EVENT_ERR_FATAL) ?
+			"Fatal" : "Uncorrected"),
+		  __get_str(msg),
+		  __get_str(label),
+		  __get_str(detail))
 );
 
-/*
- * The Memory controller driver needs to discover the memory topology, in
- * order to associate a hardware error with the memory label. If, for any
- * reason, it receives an error for a channel or row that are not supposed
- * to be there, an error event needs to be generated to indicate:
- *	- that a Corrected or Uncorrected error was received;
- *	- that the driver has a bug and, for that particular hardware, was
- *	  not capable of detecting the hardware architecture
- * If one of such errors is ever received, a bug to the kernel driver must
- * be filled.
- */
-
 TRACE_EVENT(mc_out_of_range,
 	TP_PROTO(struct mem_ctl_info *mci, const char *type, const char *field,
 		int invalid_val, int min, int max),
@@ -263,54 +122,6 @@ TRACE_EVENT(mc_out_of_range,
 );
 
 /*
- * On some cases, a corrected or uncorrected error was detected, but it
- * couldn't be properly handled, or because another error overrided the
- * error registers that details the error or because of some internal problem
- * on the driver. Those events bellow are meant for those error types.
- */
-TRACE_EVENT(mc_corrected_error_no_info,
-	TP_PROTO(struct mem_ctl_info *mci, const char *msg),
-
-	TP_ARGS(mci, msg),
-
-	TP_STRUCT__entry(
-	__string(	msg,			msg			)
-		__field(	unsigned int,	mc_index		)
-	),
-
-	TP_fast_assign(
-		__assign_str(msg, msg);
-		__entry->mc_index		= mci->mc_idx;
-	),
-
-	TP_printk(HW_ERR "mce#%d: Corrected Error: %s\n",
-		__entry->mc_index,
-		__get_str(msg))
-);
-
-TRACE_EVENT(mc_uncorrected_error_no_info,
-	TP_PROTO(struct mem_ctl_info *mci, const char *msg),
-
-	TP_ARGS(mci, msg),
-
-	TP_STRUCT__entry(
-		__string(	msg,		msg			)
-		__field(	unsigned int,	mc_index		)
-	),
-
-	TP_fast_assign(
-		__assign_str(msg, msg);
-		__entry->mc_index		= mci->mc_idx;
-	),
-
-	TP_printk(HW_ERR "mce#%d: Uncorrected Error: %s\n",
-		__entry->mc_index,
-		__get_str(msg))
-);
-
-
-
-/*
  * MCE Events placeholder. Please add non-memory events that come from the
  * MCE driver here
  */
-- 
1.7.8


  parent reply	other threads:[~2012-01-28 15:33 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-01-28 15:32 [PATCH RFCv2 00/16] This is the version 2 of the HERM patches Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 01/16] events/hw_event: Create a Hardware Events Report Mecanism (HERM) Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 02/16] events/hw_event: use __string() trace macros for events Mauro Carvalho Chehab
2012-01-28 15:32 ` Mauro Carvalho Chehab [this message]
2012-01-28 15:32 ` [PATCH RFCv2 04/16] drivers/edac: rename channel_info to csrow_channel_info Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 05/16] edac: Create a dimm struct and move the labels into it Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 06/16] edac_mc_sysfs: Fix error handling Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 07/16] edac: Add per dimm's sysfs nodes Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 08/16] edac: Prepare to push down to drivers the filling of the dimm_info Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 09/16] i5400_edac: Convert it to report memory with the new location Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 10/16] i7300_edac: " Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 11/16] edac: move dimm properties to struct dimm_info Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 12/16] edac: Don't initialize csrow's first_page & friends when not needed Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 13/16] edac: move nr_pages to dimm struct Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 14/16] edac: Add per-dimm sysfs show nodes Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 15/16] edac: DIMM location cleanup Mauro Carvalho Chehab
2012-01-28 15:32 ` [PATCH RFCv2 16/16] edac: Add an error scope logic Mauro Carvalho Chehab

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1327764771-28649-4-git-send-email-mchehab@redhat.com \
    --to=mchehab@redhat.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).