linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mauro Carvalho Chehab <mchehab@kernel.org>
To: Robert Richter <rrichter@marvell.com>
Cc: Borislav Petkov <bp@alien8.de>, Tony Luck <tony.luck@intel.com>,
	James Morse <james.morse@arm.com>,
	"linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH 06/19] EDAC, mc: Remove per layer counters
Date: Fri, 11 Oct 2019 07:40:31 -0300	[thread overview]
Message-ID: <20191011074031.699396df@coco.lan> (raw)
In-Reply-To: <20191010202418.25098-7-rrichter@marvell.com>

Em Thu, 10 Oct 2019 20:25:16 +0000
Robert Richter <rrichter@marvell.com> escreveu:

> Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> turns out that only the leaves in the memory hierarchy are consumed
> (in sysfs), but not the intermediate layers, e.g.:
> 
>  count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> 
> These unused counters only add complexity, remove them. The error
> counter values are directly stored in struct dimm_info now.

Hmm... not sure if this patch is correct. I remember that there are some
border cases on some drivers (maybe the 3-layer drivers used together
with RDIMM memory controllers?) where some errors are not associated
to an specific dimm, but, instead, are related to a problem at the memory
bus.

Also, depending on how the memory controllers are organized[1], the ECC
logic groups memory on DIMM pairs. So, when an error occur, it may be
either at DIMM1 or DIMM2.

[1] On Intel, this happens with pre-Nehalem memory controllers.

Due to that, storing errors at the dimm struct sounds wrong, as the
error may affect multiple DIMMs or even the entire layer.

> 
> Signed-off-by: Robert Richter <rrichter@marvell.com>
> ---
>  drivers/edac/edac_mc.c       | 104 ++++++++++++-----------------------
>  drivers/edac/edac_mc_sysfs.c |  20 +++----
>  drivers/edac/ghes_edac.c     |   5 +-
>  include/linux/edac.h         |   7 +--
>  4 files changed, 46 insertions(+), 90 deletions(-)
> 
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 45b02bb31964..c1e142643006 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -315,10 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
>  	struct csrow_info *csr;
>  	struct rank_info *chan;
>  	struct dimm_info *dimm;
> -	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>  	unsigned int pos[EDAC_MAX_LAYERS];
> -	unsigned int idx, size, tot_dimms = 1, count = 1;
> -	unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
> +	unsigned int idx, size, tot_dimms = 1;
> +	unsigned int tot_csrows = 1, tot_channels = 1;
>  	void *pvt, *p, *ptr = NULL;
>  	int i, j, row, chn, n, len;
>  	bool per_rank = false;
> @@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
>  	 * stringent as what the compiler would provide if we could simply
>  	 * hardcode everything into a single struct.
>  	 */
> -	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> -	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> -	for (i = 0; i < n_layers; i++) {
> -		count *= layers[i].size;
> -		edac_dbg(4, "errcount layer %d size %d\n", i, count);
> -		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> -		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> -		tot_errcount += 2 * count;
> -	}
> -
> -	edac_dbg(4, "allocating %d error counters\n", tot_errcount);
> -	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> -	size = ((unsigned long)pvt) + sz_pvt;
> +	mci	= edac_align_ptr(&ptr, sizeof(*mci), 1);
> +	layer	= edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> +	pvt	= edac_align_ptr(&ptr, sz_pvt, 1);
> +	size	= ((unsigned long)pvt) + sz_pvt;
>  
>  	edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
>  		 size,
> @@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
>  	 * rather than an imaginary chunk of memory located at address 0.
>  	 */
>  	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
> -	for (i = 0; i < n_layers; i++) {
> -		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
> -		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
> -	}
>  	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
>  
>  	/* setup index and various internal pointers */
> @@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
>  EXPORT_SYMBOL_GPL(edac_layer_name);
>  
>  static void edac_inc_ce_error(struct mem_ctl_info *mci,
> -			      bool enable_per_layer_report,
>  			      const int pos[EDAC_MAX_LAYERS],
>  			      const u16 count)
>  {
> -	int i, index = 0;
> +	struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>  
>  	mci->ce_mc += count;
>  
> -	if (!enable_per_layer_report) {
> +	if (dimm)
> +		dimm->ce_count += count;
> +	else
>  		mci->ce_noinfo_count += count;
> -		return;
> -	}
> -
> -	for (i = 0; i < mci->n_layers; i++) {
> -		if (pos[i] < 0)
> -			break;
> -		index += pos[i];
> -		mci->ce_per_layer[i][index] += count;
> -
> -		if (i < mci->n_layers - 1)
> -			index *= mci->layers[i + 1].size;
> -	}
>  }
>  
>  static void edac_inc_ue_error(struct mem_ctl_info *mci,
> -				    bool enable_per_layer_report,
>  				    const int pos[EDAC_MAX_LAYERS],
>  				    const u16 count)
>  {
> -	int i, index = 0;
> +	struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>  
>  	mci->ue_mc += count;
>  
> -	if (!enable_per_layer_report) {
> +	if (dimm)
> +		dimm->ue_count += count;
> +	else
>  		mci->ue_noinfo_count += count;
> -		return;
> -	}
> -
> -	for (i = 0; i < mci->n_layers; i++) {
> -		if (pos[i] < 0)
> -			break;
> -		index += pos[i];
> -		mci->ue_per_layer[i][index] += count;
> -
> -		if (i < mci->n_layers - 1)
> -			index *= mci->layers[i + 1].size;
> -	}
>  }
>  
>  static void edac_ce_error(struct mem_ctl_info *mci,
> @@ -965,7 +929,6 @@ static void edac_ce_error(struct mem_ctl_info *mci,
>  			  const char *label,
>  			  const char *detail,
>  			  const char *other_detail,
> -			  const bool enable_per_layer_report,
>  			  const unsigned long page_frame_number,
>  			  const unsigned long offset_in_page,
>  			  long grain)
> @@ -988,7 +951,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
>  				       error_count, msg, msg_aux, label,
>  				       location, detail);
>  	}
> -	edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
> +	edac_inc_ce_error(mci, pos, error_count);
>  
>  	if (mci->scrub_mode == SCRUB_SW_SRC) {
>  		/*
> @@ -1018,8 +981,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
>  			  const char *location,
>  			  const char *label,
>  			  const char *detail,
> -			  const char *other_detail,
> -			  const bool enable_per_layer_report)
> +			  const char *other_detail)
>  {
>  	char *msg_aux = "";
>  
> @@ -1048,7 +1010,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
>  			      msg, msg_aux, label, location, detail);
>  	}
>  
> -	edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
> +	edac_inc_ue_error(mci, pos, error_count);
>  }
>  
>  void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> @@ -1064,16 +1026,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
>  			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
>  			e->page_frame_number, e->offset_in_page,
>  			e->grain, e->syndrome);
> -		edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> -			      detail, e->other_detail, e->enable_per_layer_report,
> +		edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
> +			      e->label, detail, e->other_detail,
>  			      e->page_frame_number, e->offset_in_page, e->grain);
>  	} else {
>  		snprintf(detail, sizeof(detail),
>  			"page:0x%lx offset:0x%lx grain:%ld",
>  			e->page_frame_number, e->offset_in_page, e->grain);
>  
> -		edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> -			      detail, e->other_detail, e->enable_per_layer_report);
> +		edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
> +			      e->label, detail, e->other_detail);
>  	}
>  
>  
> @@ -1099,6 +1061,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>  	int i, n_labels = 0;
>  	u8 grain_bits;
>  	struct edac_raw_error_desc *e = &mci->error_desc;
> +	bool any_memory = true;
>  
>  	edac_dbg(3, "MC%d\n", mci->mc_idx);
>  
> @@ -1116,9 +1079,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>  
>  	/*
>  	 * Check if the event report is consistent and if the memory
> -	 * location is known. If it is known, enable_per_layer_report will be
> -	 * true, the DIMM(s) label info will be filled and the per-layer
> -	 * error counters will be incremented.
> +	 * location is known. If it is known, the DIMM(s) label info
> +	 * will be filled and the DIMM's error counters will be
> +	 * incremented.
>  	 */
>  	for (i = 0; i < mci->n_layers; i++) {
>  		if (pos[i] >= (int)mci->layers[i].size) {
> @@ -1136,7 +1099,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>  			pos[i] = -1;
>  		}
>  		if (pos[i] >= 0)
> -			e->enable_per_layer_report = true;
> +			any_memory = false;
>  	}
>  
>  	/*
> @@ -1166,16 +1129,17 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>  			e->grain = dimm->grain;
>  
>  		/*
> -		 * If the error is memory-controller wide, there's no need to
> -		 * seek for the affected DIMMs because the whole
> -		 * channel/memory controller/...  may be affected.
> -		 * Also, don't show errors for empty DIMM slots.
> +		 * If the error is memory-controller wide, there's no
> +		 * need to seek for the affected DIMMs because the
> +		 * whole channel/memory controller/... may be
> +		 * affected. Also, don't show errors for empty DIMM
> +		 * slots.
>  		 */
> -		if (!e->enable_per_layer_report || !dimm->nr_pages)
> +		if (any_memory || !dimm->nr_pages)
>  			continue;
>  
>  		if (n_labels >= EDAC_MAX_LABELS) {
> -			e->enable_per_layer_report = false;
> +			any_memory = true;
>  			break;
>  		}
>  		n_labels++;
> @@ -1205,7 +1169,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>  			chan = -2;
>  	}
>  
> -	if (!e->enable_per_layer_report) {
> +	if (any_memory) {
>  		strcpy(e->label, "any memory");
>  	} else {
>  		edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
> index 0367554e7437..8682df2f7f4f 100644
> --- a/drivers/edac/edac_mc_sysfs.c
> +++ b/drivers/edac/edac_mc_sysfs.c
> @@ -556,10 +556,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
>  				      char *data)
>  {
>  	struct dimm_info *dimm = to_dimm(dev);
> -	u32 count;
>  
> -	count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> -	return sprintf(data, "%u\n", count);
> +	return sprintf(data, "%u\n", dimm->ce_count);
>  }
>  
>  static ssize_t dimmdev_ue_count_show(struct device *dev,
> @@ -567,10 +565,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
>  				      char *data)
>  {
>  	struct dimm_info *dimm = to_dimm(dev);
> -	u32 count;
>  
> -	count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
> -	return sprintf(data, "%u\n", count);
> +	return sprintf(data, "%u\n", dimm->ue_count);
>  }
>  
>  /* dimm/rank attribute files */
> @@ -666,7 +662,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
>  					const char *data, size_t count)
>  {
>  	struct mem_ctl_info *mci = to_mci(dev);
> -	int cnt, row, chan, i;
> +	struct dimm_info *dimm;
> +	int row, chan;
> +
>  	mci->ue_mc = 0;
>  	mci->ce_mc = 0;
>  	mci->ue_noinfo_count = 0;
> @@ -682,11 +680,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
>  			ri->channels[chan]->ce_count = 0;
>  	}
>  
> -	cnt = 1;
> -	for (i = 0; i < mci->n_layers; i++) {
> -		cnt *= mci->layers[i].size;
> -		memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
> -		memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
> +	mci_for_each_dimm(mci, dimm) {
> +		dimm->ue_count = 0;
> +		dimm->ce_count = 0;
>  	}
>  
>  	mci->start_time = jiffies;
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 842080d7b33a..e0b90c6d7d63 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -347,11 +347,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>  				     mem_err->mem_dev_handle);
>  
>  		index = get_dimm_smbios_index(mem_err->mem_dev_handle);
> -		if (index >= 0) {
> +		if (index >= 0)
>  			e->top_layer = index;
> -			e->enable_per_layer_report = true;
> -		}
> -
>  	}
>  	if (p > e->location)
>  		*(p - 1) = '\0';
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 8beb6e834be9..8e72222e50b0 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -383,6 +383,9 @@ struct dimm_info {
>  	unsigned int csrow, cschannel;	/* Points to the old API data */
>  
>  	u16 smbios_handle;              /* Handle for SMBIOS type 17 */
> +
> +	u32 ce_count;
> +	u32 ue_count;
>  };
>  
>  /**
> @@ -453,8 +456,6 @@ struct errcount_attribute_data {
>   * @location:			location of the error
>   * @label:			label of the affected DIMM(s)
>   * @other_detail:		other driver-specific detail about the error
> - * @enable_per_layer_report:	if false, the error affects all layers
> - *				(typically, a memory controller error)
>   */
>  struct edac_raw_error_desc {
>  	/*
> @@ -475,7 +476,6 @@ struct edac_raw_error_desc {
>  	unsigned long syndrome;
>  	const char *msg;
>  	const char *other_detail;
> -	bool enable_per_layer_report;
>  };
>  
>  /* MEMORY controller information structure
> @@ -565,7 +565,6 @@ struct mem_ctl_info {
>  	 */
>  	u32 ce_noinfo_count, ue_noinfo_count;
>  	u32 ue_mc, ce_mc;
> -	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>  
>  	struct completion complete;
>  



Thanks,
Mauro

  reply	other threads:[~2019-10-11 10:40 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-10 20:25 [PATCH 00/19] EDAC: Rework edac_mc and ghes drivers Robert Richter
2019-10-10 20:25 ` [PATCH 01/19] EDAC: Replace EDAC_DIMM_PTR() macro with edac_get_dimm() function Robert Richter
2019-10-11  9:58   ` Mauro Carvalho Chehab
2019-10-11 11:38     ` Robert Richter
2019-10-10 20:25 ` [PATCH 02/19] EDAC: Remove EDAC_DIMM_OFF() macro Robert Richter
2019-10-11 10:09   ` Mauro Carvalho Chehab
2019-10-11 11:36     ` Robert Richter
2019-10-10 20:25 ` [PATCH 03/19] EDAC: Introduce mci_for_each_dimm() iterator Robert Richter
2019-10-11 10:14   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 04/19] EDAC, mc: Do not BUG_ON() in edac_mc_alloc() Robert Richter
2019-10-11 10:15   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 05/19] EDAC, mc: Reduce indentation level in edac_mc_handle_error() Robert Richter
2019-10-10 22:10   ` Joe Perches
2019-10-11  6:50     ` Robert Richter
2019-10-11 10:20     ` Mauro Carvalho Chehab
2019-10-11 10:50       ` Joe Perches
2019-10-11 12:08         ` Robert Richter
2019-10-11 14:49           ` Joe Perches
2019-10-11 10:17   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 06/19] EDAC, mc: Remove per layer counters Robert Richter
2019-10-11 10:40   ` Mauro Carvalho Chehab [this message]
2019-10-14 11:12     ` Robert Richter
2019-10-10 20:25 ` [PATCH 07/19] EDAC, mc: Rename iterator variable to idx Robert Richter
2019-10-11 10:41   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 08/19] EDAC, mc: Split edac_mc_alloc() into smaller functions Robert Richter
2019-10-11 10:43   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 09/19] EDAC, mc: Reorder functions edac_mc_alloc*() Robert Richter
2019-10-11 10:45   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 10/19] EDAC, mc: Rework edac_raw_mc_handle_error() to use struct dimm_info Robert Richter
2019-10-11 10:48   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 11/19] EDAC: Remove misleading comment in struct edac_raw_error_desc Robert Richter
2019-10-11 10:49   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 12/19] EDAC: Store error type " Robert Richter
2019-10-11 10:54   ` Mauro Carvalho Chehab
2019-10-14 11:47     ` Robert Richter
2019-10-10 20:25 ` [PATCH 13/19] EDAC, mc: Determine mci pointer from the error descriptor Robert Richter
2019-10-11 10:56   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 14/19] EDAC, mc: Create new function edac_inc_csrow() Robert Richter
2019-10-11 11:08   ` Mauro Carvalho Chehab
2019-10-14 11:58     ` Robert Richter
2019-10-10 20:25 ` [PATCH 15/19] EDAC, ghes: Use standard kernel macros for page calculations Robert Richter
2019-10-11 11:10   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 16/19] EDAC, ghes: Fix grain calculation Robert Richter
2019-10-11 11:22   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 17/19] EDAC, ghes: Remove intermediate buffer pvt->detail_location Robert Richter
2019-10-11 11:20   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 18/19] EDAC, ghes: Unify trace_mc_event() code with edac_mc driver Robert Richter
2019-10-11 11:23   ` Mauro Carvalho Chehab
2019-10-10 20:25 ` [PATCH 19/19] EDAC, Documentation: Describe CPER module definition and DIMM ranks Robert Richter
2019-10-11 11:29   ` Mauro Carvalho Chehab
2019-10-10 20:36 ` [PATCH 00/19] EDAC: Rework edac_mc and ghes drivers Robert Richter
2019-10-14 12:00 ` Robert Richter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191011074031.699396df@coco.lan \
    --to=mchehab@kernel.org \
    --cc=bp@alien8.de \
    --cc=james.morse@arm.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=rrichter@marvell.com \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).