EDAC, ghes: use CPER module handles to locate DIMMs
diff mbox series

Message ID 1535567632-18089-1-git-send-email-wufan@codeaurora.org
State New, archived
Headers show
Series
  • EDAC, ghes: use CPER module handles to locate DIMMs
Related show

Commit Message

Fan Wu Aug. 29, 2018, 6:33 p.m. UTC
The current ghes_edac driver does not update per-dimm error
counters when reporting memory errors, because there is no
platform-independent way to find DIMMs based on the error
information provided by firmware. This patch offers a solution
for platforms whose firmwares provide valid module handles
(SMBIOS type 17) in error records. In this case ghes_edac will
use the module handles to locate DIMMs and thus makes per-dimm
error reporting possible.

Signed-off-by: Fan Wu <wufan@codeaurora.org>
---
 drivers/edac/ghes_edac.c | 36 +++++++++++++++++++++++++++++++++---
 include/linux/edac.h     |  2 ++
 2 files changed, 35 insertions(+), 3 deletions(-)

Comments

Borislav Petkov Aug. 30, 2018, 10:43 a.m. UTC | #1
On Wed, Aug 29, 2018 at 06:33:52PM +0000, Fan Wu wrote:
> The current ghes_edac driver does not update per-dimm error
> counters when reporting memory errors, because there is no
> platform-independent way to find DIMMs based on the error
> information provided by firmware. This patch offers a solution
> for platforms whose firmwares provide valid module handles
> (SMBIOS type 17) in error records. In this case ghes_edac will
> use the module handles to locate DIMMs and thus makes per-dimm
> error reporting possible.
> 
> Signed-off-by: Fan Wu <wufan@codeaurora.org>
> ---
>  drivers/edac/ghes_edac.c | 36 +++++++++++++++++++++++++++++++++---
>  include/linux/edac.h     |  2 ++
>  2 files changed, 35 insertions(+), 3 deletions(-)

If we're going to do this, it needs to be tested on an x86 box which loads
ghes_edac. Adding Toshi to Cc.

Otherwise it must remain ARM-specific.

> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 473aeec..db527f0 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
>  		(*num_dimm)++;
>  }
>  
> +static int ghes_edac_dimm_index(u16 handle)

get_dimm_smbios_handle()

> +{
> +	struct mem_ctl_info *mci;
> +	int i;
> +
> +	if (!ghes_pvt)
> +		return -1;

You don't need that test.

> +
> +	mci = ghes_pvt->mci;
> +
> +	if (!mci)
> +		return -1;

Ditto.

> +
> +	for (i = 0; i < mci->tot_dimms; i++) {
> +		if (mci->dimms[i]->smbios_handle == handle)
> +			return i;
> +	}
> +	return -1;
> +}
> +
>  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>  {
>  	struct ghes_edac_dimm_fill *dimm_fill = arg;
James Morse Aug. 30, 2018, 10:48 a.m. UTC | #2
Hi Fan,

On 29/08/18 19:33, Fan Wu wrote:
> The current ghes_edac driver does not update per-dimm error
> counters when reporting memory errors, because there is no
> platform-independent way to find DIMMs based on the error
> information provided by firmware.

I'd argue there is: its in the CPER records, we just didn't do anything useful
with the information in the past!


> This patch offers a solution
> for platforms whose firmwares provide valid module handles
> (SMBIOS type 17) in error records. In this case ghes_edac will
> use the module handles to locate DIMMs and thus makes per-dimm
> error reporting possible.


> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 473aeec..db527f0 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
>  		(*num_dimm)++;
>  }
>  
> +static int ghes_edac_dimm_index(u16 handle)
> +{
> +	struct mem_ctl_info *mci;
> +	int i;
> +
> +	if (!ghes_pvt)
> +		return -1;

ghes_edac_report_mem_error() already checked this, as its the only caller there
is no need to check it again.


> +	mci = ghes_pvt->mci;
> +
> +	if (!mci)
> +		return -1;

Can this happen? ghes_edac_report_mem_error() would have dereferenced this already!

If you need the struct mem_ctl_info, you may as well pass it in as the only
caller has it to hand.


> +
> +	for (i = 0; i < mci->tot_dimms; i++) {
> +		if (mci->dimms[i]->smbios_handle == handle)
> +			return i;
> +	}
> +	return -1;
> +}
> +
>  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>  {
>  	struct ghes_edac_dimm_fill *dimm_fill = arg;
> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>  				entry->total_width, entry->data_width);
>  		}
>  
> +		dimm->smbios_handle = entry->handle;

We aren't checking for duplicate handles, (e.g. they're all zero). I think this
is fine as chances are firmware on those systems won't set
CPER_MEM_VALID_MODULE_HANDLE. If it does, the handle it gives us is ambiguous,
and we pick a dimm, instead of whine-ing about broken firmware tables.

(I'm just drawing attention to it in case someone disagrees)


>  		dimm_fill->count++;
>  	}
>  }
> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>  		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>  	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
>  		const char *bank = NULL, *device = NULL;
> +		int index = -1;
> +
>  		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);

> +		p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> +			     mem_err->mem_dev_handle);
>  		if (bank != NULL && device != NULL)
>  			p += sprintf(p, "DIMM location:%s %s ", bank, device);
> -		else
> -			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> -				     mem_err->mem_dev_handle);

Why do we now print the handle every time? The handle is pretty meaningless, it
can only be used to find the location-strings, if we get those we print them
instead.


> +		index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
> +		if (index >= 0) {
> +			e->top_layer = index;
> +			e->enable_per_layer_report = true;
> +		}
> +
>  	}
>  	if (p > e->location)
>  		*(p - 1) = '\0';

Looks good to me!


Thanks,

James
Fan Wu Aug. 30, 2018, 2:20 p.m. UTC | #3
Hi Boris, 
 
> If we're going to do this, it needs to be tested on an x86 box which loads
> ghes_edac. Adding Toshi to Cc.
> 
> Otherwise it must remain ARM-specific.

Toshi it would be great if you can help! I'll also test the change in x86 but
not sure if the firmware updates module_handle.

> > +static int ghes_edac_dimm_index(u16 handle)
> 
> get_dimm_smbios_handle()

This function returns an index. So how about get_dimm_smbios_index()?

> > +{
> > +	struct mem_ctl_info *mci;
> > +	int i;
> > +
> > +	if (!ghes_pvt)
> > +		return -1;
> 
> You don't need that test.

Will remove.
 
> > +
> > +	mci = ghes_pvt->mci;
> > +
> > +	if (!mci)
> > +		return -1;
> 
> Ditto.

Will remove

Thanks,
Fan
Fan Wu Aug. 30, 2018, 2:40 p.m. UTC | #4
Hi James,

> > The current ghes_edac driver does not update per-dimm error counters
> > when reporting memory errors, because there is no platform-independent
> > way to find DIMMs based on the error information provided by firmware.
> 
> I'd argue there is: its in the CPER records, we just didn't do anything useful
> with the information in the past!

Agreed. Will update the wording. 
 
> > +static int ghes_edac_dimm_index(u16 handle) {
> > +	struct mem_ctl_info *mci;
> > +	int i;
> > +
> > +	if (!ghes_pvt)
> > +		return -1;
> 
> ghes_edac_report_mem_error() already checked this, as its the only caller
> there is no need to check it again.

Will remove.
 
> 
> > +	mci = ghes_pvt->mci;
> > +
> > +	if (!mci)
> > +		return -1;
> 
> Can this happen? ghes_edac_report_mem_error() would have
> dereferenced this already!
> 
> If you need the struct mem_ctl_info, you may as well pass it in as the only
> caller has it to hand.

Will remove.
> 
> > +
> > +	for (i = 0; i < mci->tot_dimms; i++) {
> > +		if (mci->dimms[i]->smbios_handle == handle)
> > +			return i;
> > +	}
> > +	return -1;
> > +}
> > +
> >  static void ghes_edac_dmidecode(const struct dmi_header *dh, void
> > *arg)  {
> >  	struct ghes_edac_dimm_fill *dimm_fill = arg; @@ -177,6 +197,8 @@
> > static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> >  				entry->total_width, entry->data_width);
> >  		}
> >
> > +		dimm->smbios_handle = entry->handle;
> 
> We aren't checking for duplicate handles, (e.g. they're all zero). I think this is
> fine as chances are firmware on those systems won't set
> CPER_MEM_VALID_MODULE_HANDLE. If it does, the handle it gives us is
> ambiguous, and we pick a dimm, instead of whine-ing about broken
> firmware tables.
> 
> (I'm just drawing attention to it in case someone disagrees)

SBMIOS tables are typically automatically generated so chances for duplicate
handles are small. 

> 
> >  		dimm_fill->count++;
> >  	}
> >  }
> > @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev,
> struct cper_sec_mem_err *mem_err)
> >  		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
> >  	if (mem_err->validation_bits &
> CPER_MEM_VALID_MODULE_HANDLE) {
> >  		const char *bank = NULL, *device = NULL;
> > +		int index = -1;
> > +
> >  		dmi_memdev_name(mem_err->mem_dev_handle, &bank,
> &device);
> 
> > +		p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> > +			     mem_err->mem_dev_handle);
> >  		if (bank != NULL && device != NULL)
> >  			p += sprintf(p, "DIMM location:%s %s ", bank, device);
> > -		else
> > -			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> > -				     mem_err->mem_dev_handle);
> 
> Why do we now print the handle every time? The handle is pretty
> meaningless, it can only be used to find the location-strings, if we get those
> we print them instead.

For ghes_edac the bank/device is informational, and nothing would go wrong
if the bank/device numbers are the same as another entry. But the handle
is now critical for DIMM lookup, thus pull it out.

Thanks!
Fan
Borislav Petkov Aug. 30, 2018, 3:12 p.m. UTC | #5
On August 30, 2018 5:20:32 PM GMT+03:00, wufan <wufan@codeaurora.org> wrote:
>> > +static int ghes_edac_dimm_index(u16 handle)
>> 
>> get_dimm_smbios_handle()
>
>This function returns an index. So how about get_dimm_smbios_index()?

Sure.
James Morse Aug. 30, 2018, 4:32 p.m. UTC | #6
Hi Fan,

On 30/08/18 15:40, wufan wrote:
>>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev,
>> struct cper_sec_mem_err *mem_err)
>>>  		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>>>  	if (mem_err->validation_bits &
>> CPER_MEM_VALID_MODULE_HANDLE) {
>>>  		const char *bank = NULL, *device = NULL;
>>> +		int index = -1;
>>> +
>>>  		dmi_memdev_name(mem_err->mem_dev_handle, &bank,
>> &device);
>>
>>> +		p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> +			     mem_err->mem_dev_handle);
>>>  		if (bank != NULL && device != NULL)
>>>  			p += sprintf(p, "DIMM location:%s %s ", bank, device);
>>> -		else
>>> -			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> -				     mem_err->mem_dev_handle);
>>
>> Why do we now print the handle every time? The handle is pretty
>> meaningless, it can only be used to find the location-strings, if we get those
>> we print them instead.
> 
> For ghes_edac the bank/device is informational, and nothing would go wrong
> if the bank/device numbers are the same as another entry. But the handle
> is now critical for DIMM lookup, thus pull it out.

Is printing the handle to the kernel log critical?

I'd expect something collecting errors to read from sysfs, not dmesg. I thought
the whole point here was to update the per-dimm counters in sysfs.


Thanks,

James
James Morse Aug. 30, 2018, 4:34 p.m. UTC | #7
Hi Zhengqiang,

On 29/08/18 19:33, Fan Wu wrote:
> The current ghes_edac driver does not update per-dimm error
> counters when reporting memory errors, because there is no
> platform-independent way to find DIMMs based on the error
> information provided by firmware. This patch offers a solution
> for platforms whose firmwares provide valid module handles
> (SMBIOS type 17) in error records. In this case ghes_edac will
> use the module handles to locate DIMMs and thus makes per-dimm
> error reporting possible.

Does your platform set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors? If
so, any chance you could test this patch on your platform? [0]
(original patch: https://lore.kernel.org/patchwork/patch/978928/)

Thanks,

James

[0] https://marc.info/?l=linux-edac&m=152603960002324


> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 473aeec..db527f0 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
>  		(*num_dimm)++;
>  }
>  
> +static int ghes_edac_dimm_index(u16 handle)
> +{
> +	struct mem_ctl_info *mci;
> +	int i;
> +
> +	if (!ghes_pvt)
> +		return -1;
> +
> +	mci = ghes_pvt->mci;
> +
> +	if (!mci)
> +		return -1;
> +
> +	for (i = 0; i < mci->tot_dimms; i++) {
> +		if (mci->dimms[i]->smbios_handle == handle)
> +			return i;
> +	}
> +	return -1;
> +}
> +
>  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>  {
>  	struct ghes_edac_dimm_fill *dimm_fill = arg;
> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>  				entry->total_width, entry->data_width);
>  		}
>  
> +		dimm->smbios_handle = entry->handle;
> +
>  		dimm_fill->count++;
>  	}
>  }
> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>  		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>  	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
>  		const char *bank = NULL, *device = NULL;
> +		int index = -1;
> +
>  		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
> +		p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> +			     mem_err->mem_dev_handle);
>  		if (bank != NULL && device != NULL)
>  			p += sprintf(p, "DIMM location:%s %s ", bank, device);
> -		else
> -			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> -				     mem_err->mem_dev_handle);
> +
> +		index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
> +		if (index >= 0) {
> +			e->top_layer = index;
> +			e->enable_per_layer_report = true;
> +		}
> +
>  	}
>  	if (p > e->location)
>  		*(p - 1) = '\0';
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index bffb978..a45ce1f 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -451,6 +451,8 @@ struct dimm_info {
>  	u32 nr_pages;			/* number of pages on this dimm */
>  
>  	unsigned csrow, cschannel;	/* Points to the old API data */
> +
> +	u16 smbios_handle;              /* Handle for SMBIOS type 17 */
>  };
>  
>  /**
>
James Morse Aug. 30, 2018, 4:34 p.m. UTC | #8
Hi Boris,

On 30/08/18 11:43, Borislav Petkov wrote:
> On Wed, Aug 29, 2018 at 06:33:52PM +0000, Fan Wu wrote:
>> The current ghes_edac driver does not update per-dimm error
>> counters when reporting memory errors, because there is no
>> platform-independent way to find DIMMs based on the error
>> information provided by firmware. This patch offers a solution
>> for platforms whose firmwares provide valid module handles
>> (SMBIOS type 17) in error records. In this case ghes_edac will
>> use the module handles to locate DIMMs and thus makes per-dimm
>> error reporting possible.

> If we're going to do this, it needs to be tested on an x86 box which loads
> ghes_edac. Adding Toshi to Cc.

Good point, thanks.


> Otherwise it must remain ARM-specific.

Hmmm, that would be a shame.

This should only be a problem if HPE Servers set CPER_MEM_VALID_MODULE_HANDLE,
but don't actually provide module handles, or if firmware has a different idea
of what they are.

If firmware never sets CPER_MEM_VALID_MODULE_HANDLE, this patch shouldn't change
anything.

(Someone must have an x86 that sets CPER_MEM_VALID_MODULE_HANDLE, otherwise the
code wouldn't be there right?!)


Thanks,

James
Fan Wu Aug. 30, 2018, 4:45 p.m. UTC | #9
Hi James,

> > For ghes_edac the bank/device is informational, and nothing would go
> > wrong if the bank/device numbers are the same as another entry. But
> > the handle is now critical for DIMM lookup, thus pull it out.
> 
> Is printing the handle to the kernel log critical?
> 
> I'd expect something collecting errors to read from sysfs, not dmesg. I
> thought the whole point here was to update the per-dimm counters in sysfs.

No, printing out the handle is not critical. What I meant is because the 
information is critical it would be nice to have it available for debugging 
purpose. Otherwise it would be hard to find the handle if bank/device
is not accurate. 

Thanks,
Fan
Tyler Baicar Aug. 30, 2018, 4:46 p.m. UTC | #10
On Thu, Aug 30, 2018 at 12:32 PM, James Morse <james.morse@arm.com> wrote:
> Hi Fan,
>
> On 30/08/18 15:40, wufan wrote:
>>>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev,
>>> struct cper_sec_mem_err *mem_err)
>>>>             p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>>>>     if (mem_err->validation_bits &
>>> CPER_MEM_VALID_MODULE_HANDLE) {
>>>>             const char *bank = NULL, *device = NULL;
>>>> +           int index = -1;
>>>> +
>>>>             dmi_memdev_name(mem_err->mem_dev_handle, &bank,
>>> &device);
>>>
>>>> +           p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>>> +                        mem_err->mem_dev_handle);
>>>>             if (bank != NULL && device != NULL)
>>>>                     p += sprintf(p, "DIMM location:%s %s ", bank, device);
>>>> -           else
>>>> -                   p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>>> -                                mem_err->mem_dev_handle);
>>>
>>> Why do we now print the handle every time? The handle is pretty
>>> meaningless, it can only be used to find the location-strings, if we get those
>>> we print them instead.
>>
>> For ghes_edac the bank/device is informational, and nothing would go wrong
>> if the bank/device numbers are the same as another entry. But the handle
>> is now critical for DIMM lookup, thus pull it out.
>
> Is printing the handle to the kernel log critical?
>

I don't see why we would need this print. The bank/device
print is enough to map what is shown in dmesg to an SMBIOS
entry if that's really needed.

Thanks,
Tyler
John Garry Aug. 30, 2018, 4:50 p.m. UTC | #11
On 30/08/2018 17:34, James Morse wrote:

Hi James,

Zhengqiang no longer works on this topic, so I have cc'ed some more guys 
who should be able to help.

John

> Hi Zhengqiang,
>
> On 29/08/18 19:33, Fan Wu wrote:
>> The current ghes_edac driver does not update per-dimm error
>> counters when reporting memory errors, because there is no
>> platform-independent way to find DIMMs based on the error
>> information provided by firmware. This patch offers a solution
>> for platforms whose firmwares provide valid module handles
>> (SMBIOS type 17) in error records. In this case ghes_edac will
>> use the module handles to locate DIMMs and thus makes per-dimm
>> error reporting possible.
>
> Does your platform set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors? If
> so, any chance you could test this patch on your platform? [0]
> (original patch: https://lore.kernel.org/patchwork/patch/978928/)
>
> Thanks,
>
> James
>
> [0] https://marc.info/?l=linux-edac&m=152603960002324
>
>
>> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
>> index 473aeec..db527f0 100644
>> --- a/drivers/edac/ghes_edac.c
>> +++ b/drivers/edac/ghes_edac.c
>> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
>>  		(*num_dimm)++;
>>  }
>>
>> +static int ghes_edac_dimm_index(u16 handle)
>> +{
>> +	struct mem_ctl_info *mci;
>> +	int i;
>> +
>> +	if (!ghes_pvt)
>> +		return -1;
>> +
>> +	mci = ghes_pvt->mci;
>> +
>> +	if (!mci)
>> +		return -1;
>> +
>> +	for (i = 0; i < mci->tot_dimms; i++) {
>> +		if (mci->dimms[i]->smbios_handle == handle)
>> +			return i;
>> +	}
>> +	return -1;
>> +}
>> +
>>  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>>  {
>>  	struct ghes_edac_dimm_fill *dimm_fill = arg;
>> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>>  				entry->total_width, entry->data_width);
>>  		}
>>
>> +		dimm->smbios_handle = entry->handle;
>> +
>>  		dimm_fill->count++;
>>  	}
>>  }
>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>>  		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>>  	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
>>  		const char *bank = NULL, *device = NULL;
>> +		int index = -1;
>> +
>>  		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
>> +		p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>> +			     mem_err->mem_dev_handle);
>>  		if (bank != NULL && device != NULL)
>>  			p += sprintf(p, "DIMM location:%s %s ", bank, device);
>> -		else
>> -			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>> -				     mem_err->mem_dev_handle);
>> +
>> +		index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
>> +		if (index >= 0) {
>> +			e->top_layer = index;
>> +			e->enable_per_layer_report = true;
>> +		}
>> +
>>  	}
>>  	if (p > e->location)
>>  		*(p - 1) = '\0';
>> diff --git a/include/linux/edac.h b/include/linux/edac.h
>> index bffb978..a45ce1f 100644
>> --- a/include/linux/edac.h
>> +++ b/include/linux/edac.h
>> @@ -451,6 +451,8 @@ struct dimm_info {
>>  	u32 nr_pages;			/* number of pages on this dimm */
>>
>>  	unsigned csrow, cschannel;	/* Points to the old API data */
>> +
>> +	u16 smbios_handle;              /* Handle for SMBIOS type 17 */
>>  };
>>
>>  /**
>>
>
>
> .
>
Fan Wu Aug. 30, 2018, 5:11 p.m. UTC | #12
Hi Tyler, 

> > Is printing the handle to the kernel log critical?
> >
> 
> I don't see why we would need this print. The bank/device print is enough to
> map what is shown in dmesg to an SMBIOS entry if that's really needed.

This change is mostly for convenience. I'll revert it since we have two votes
against it now. 

Thanks,
Fan
Xiaofei Tan Aug. 31, 2018, 10:06 a.m. UTC | #13
On 2018/8/31 0:50, John Garry wrote:
> On 30/08/2018 17:34, James Morse wrote:
> 
> Hi James,
> 
> Zhengqiang no longer works on this topic, so I have cc'ed some more guys who should be able to help.
> 
> John
> 
>> Hi Zhengqiang,
>>
>> On 29/08/18 19:33, Fan Wu wrote:
>>> The current ghes_edac driver does not update per-dimm error
>>> counters when reporting memory errors, because there is no
>>> platform-independent way to find DIMMs based on the error
>>> information provided by firmware. This patch offers a solution
>>> for platforms whose firmwares provide valid module handles
>>> (SMBIOS type 17) in error records. In this case ghes_edac will
>>> use the module handles to locate DIMMs and thus makes per-dimm
>>> error reporting possible.
>>
>> Does your platform set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors? If
>> so, any chance you could test this patch on your platform? [0]
>> (original patch: https://lore.kernel.org/patchwork/patch/978928/)
>>

Hi James,

Our platform do not set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors.

Thanks,
tanxiaofei

>> Thanks,
>>
>> James
>>
>> [0] https://marc.info/?l=linux-edac&m=152603960002324
>>
>>
>>> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
>>> index 473aeec..db527f0 100644
>>> --- a/drivers/edac/ghes_edac.c
>>> +++ b/drivers/edac/ghes_edac.c
>>> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
>>>          (*num_dimm)++;
>>>  }
>>>
>>> +static int ghes_edac_dimm_index(u16 handle)
>>> +{
>>> +    struct mem_ctl_info *mci;
>>> +    int i;
>>> +
>>> +    if (!ghes_pvt)
>>> +        return -1;
>>> +
>>> +    mci = ghes_pvt->mci;
>>> +
>>> +    if (!mci)
>>> +        return -1;
>>> +
>>> +    for (i = 0; i < mci->tot_dimms; i++) {
>>> +        if (mci->dimms[i]->smbios_handle == handle)
>>> +            return i;
>>> +    }
>>> +    return -1;
>>> +}
>>> +
>>>  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>>>  {
>>>      struct ghes_edac_dimm_fill *dimm_fill = arg;
>>> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>>>                  entry->total_width, entry->data_width);
>>>          }
>>>
>>> +        dimm->smbios_handle = entry->handle;
>>> +
>>>          dimm_fill->count++;
>>>      }
>>>  }
>>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>>>          p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>>>      if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
>>>          const char *bank = NULL, *device = NULL;
>>> +        int index = -1;
>>> +
>>>          dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
>>> +        p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> +                 mem_err->mem_dev_handle);
>>>          if (bank != NULL && device != NULL)
>>>              p += sprintf(p, "DIMM location:%s %s ", bank, device);
>>> -        else
>>> -            p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> -                     mem_err->mem_dev_handle);
>>> +
>>> +        index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
>>> +        if (index >= 0) {
>>> +            e->top_layer = index;
>>> +            e->enable_per_layer_report = true;
>>> +        }
>>> +
>>>      }
>>>      if (p > e->location)
>>>          *(p - 1) = '\0';
>>> diff --git a/include/linux/edac.h b/include/linux/edac.h
>>> index bffb978..a45ce1f 100644
>>> --- a/include/linux/edac.h
>>> +++ b/include/linux/edac.h
>>> @@ -451,6 +451,8 @@ struct dimm_info {
>>>      u32 nr_pages;            /* number of pages on this dimm */
>>>
>>>      unsigned csrow, cschannel;    /* Points to the old API data */
>>> +
>>> +    u16 smbios_handle;              /* Handle for SMBIOS type 17 */
>>>  };
>>>
>>>  /**
>>>
>>
>>
>> .
>>
> 
> 
> 
> .
>
Fan Wu Sept. 3, 2018, 3:05 p.m. UTC | #14
Thanks tanxiaofei!

Boris/James, are you OK to sign off, or you want to see more tests on 
this patch? 

Thanks,
Fan

> -----Original Message-----
> From: tanxiaofei <tanxiaofei@huawei.com>
> Sent: Friday, August 31, 2018 4:06 AM
> 
> Hi James,
> 
> Our platform do not set CPER_MEM_VALID_MODULE_HANDLE in GHES
> Memory errors.
> 
> Thanks,
> tanxiaofei
Borislav Petkov Sept. 3, 2018, 7:18 p.m. UTC | #15
On Mon, Sep 03, 2018 at 09:05:29AM -0600, wufan wrote:
> Thanks tanxiaofei!
> 
> Boris/James, are you OK to sign off, or you want to see more tests on 
> this patch? 

Please do not top-post.

Now, I don't have any problems with it - I'm still sceptical as the
firmware is a stinking pile but if we cannot find a broken case, I guess
we can take this one for a spin and revert it if there's trouble down
the road.

Unless James has objections.

Then, you sent a v2 here:

https://lkml.kernel.org/r/1535654266-40053-1-git-send-email-wufan@codeaurora.org

and you've received a bunch of Reviewed-by's and Tested-by's and a
couple of new comments.

Incorporate *all* of those and send me a v3.

Thx.

Patch
diff mbox series

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 473aeec..db527f0 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -81,6 +81,26 @@  static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
 		(*num_dimm)++;
 }
 
+static int ghes_edac_dimm_index(u16 handle)
+{
+	struct mem_ctl_info *mci;
+	int i;
+
+	if (!ghes_pvt)
+		return -1;
+
+	mci = ghes_pvt->mci;
+
+	if (!mci)
+		return -1;
+
+	for (i = 0; i < mci->tot_dimms; i++) {
+		if (mci->dimms[i]->smbios_handle == handle)
+			return i;
+	}
+	return -1;
+}
+
 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 {
 	struct ghes_edac_dimm_fill *dimm_fill = arg;
@@ -177,6 +197,8 @@  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 				entry->total_width, entry->data_width);
 		}
 
+		dimm->smbios_handle = entry->handle;
+
 		dimm_fill->count++;
 	}
 }
@@ -327,12 +349,20 @@  void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
 		const char *bank = NULL, *device = NULL;
+		int index = -1;
+
 		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
+		p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
+			     mem_err->mem_dev_handle);
 		if (bank != NULL && device != NULL)
 			p += sprintf(p, "DIMM location:%s %s ", bank, device);
-		else
-			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
-				     mem_err->mem_dev_handle);
+
+		index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
+		if (index >= 0) {
+			e->top_layer = index;
+			e->enable_per_layer_report = true;
+		}
+
 	}
 	if (p > e->location)
 		*(p - 1) = '\0';
diff --git a/include/linux/edac.h b/include/linux/edac.h
index bffb978..a45ce1f 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -451,6 +451,8 @@  struct dimm_info {
 	u32 nr_pages;			/* number of pages on this dimm */
 
 	unsigned csrow, cschannel;	/* Points to the old API data */
+
+	u16 smbios_handle;              /* Handle for SMBIOS type 17 */
 };
 
 /**