Linux-EDAC Archive on lore.kernel.org
 help / color / Atom feed
* Re: [PATCH v4 2/2] EDAC: add EDAC driver for DMC520
       [not found] <BN6PR04MB11075E9070EE1A263E099A7386E60@BN6PR04MB1107.namprd04.prod.outlook.com>
@ 2019-07-03  9:18 ` James Morse
       [not found]   ` <BN6PR04MB1107E018F29465B68EE708C786FA0@BN6PR04MB1107.namprd04.prod.outlook.com>
  0 siblings, 1 reply; 2+ messages in thread
From: James Morse @ 2019-07-03  9:18 UTC (permalink / raw)
  To: Lei Wang, mark.rutland
  Cc: bp, robh+dt, devicetree, linux-kernel, mchehab, linux-edac,
	sashal, hangl, lewan, ruizhao

Hi!

On 22/06/2019 07:38, Lei Wang wrote:
> New driver supports error detection and correction on the devices with ARM
> DMC-520 memory controller.

> diff --git a/drivers/edac/dmc520_edac.c b/drivers/edac/dmc520_edac.c
> new file mode 100644
> index 000000000000..c23734c13933
> --- /dev/null
> +++ b/drivers/edac/dmc520_edac.c
> @@ -0,0 +1,604 @@

> +#include <linux/module.h>
> +#include <linux/platform_device.h>
> +#include <linux/edac.h>
> +#include <linux/io.h>
> +#include <linux/of.h>
> +#include <linux/interrupt.h>
> +#include <linux/bitfield.h>

#include <linux/spinlock.h> ?

It's best to keep this list sorted, it makes it easier for the maintainer to resolve
conflicts when header files get split/moved-around.

> +#include "edac_mc.h"

[...]

> +#define REG_OFFSET_FEATURE_CONTROL_NEXT		0x1F0

Nothing uses this, do we need it?

[...]

> +#define REG_OFFSET_DECODE_CONTROL_NOW		0x1014

Nothing uses this, do we need it?
[...]

> +/* DMC-520 types, masks and bitfields */
> +#define DRAM_ECC_INT_CE_MASK			BIT(2)
> +#define DRAM_ECC_INT_UE_MASK			BIT(3)

(The 'MASK' suffix isn't really needed for a single bit, you can't confuse it with the value.)

[...]

> +#define SCRUB_CONTROL_MASK			GENMASK(1, 0)

Isn't this field called TRIGGER0_NEXT? It would be good to use the names from the
datasheet[0] as it makes it much easier for someone else to debug.


> +#define DMC520_EDAC_ERR_GRAIN			1

> +#define DMC520_BUS_WIDTH	8  /* Data bus width is 64bits/8Bytes */

Can you point me to where this comes from in the datasheet[0]?
I see it talk in "1.3 Features" of "either a 32-bit wide data SDRAM interface or a 64-bit
wide data SDRAM interface".

If this is a choice that was made on your platform it needs to be described in the DT.

(I may be confused between SDRAM/DDR/DRAM, as 2.3.3. "PHY interface" seems to describe one
connecting to the other.)



> +/* memory type */
> +enum dmc520_mem_type {
> +	mem_type_ddr3 = 1,
> +	mem_type_ddr4 = 2
> +};
> +
> +/* memory device width */
> +enum dmc520_dev_width {
> +	dev_width_x4 = 0,
> +	dev_width_x8 = 1,
> +	dev_width_x16 = 2
> +};

(Nit: the convention for enums members is all-caps. e.g. include/linux/edac.h)

[...]

> +static irqreturn_t
> +dmc520_edac_dram_all_isr(int irq, void *data, u32 interrupt_mask);

(You could avoid this by moving the user after definition of this function)

[...]

> +static bool dmc520_get_dram_ecc_error_info(struct dmc520_edac *edac,
> +					   bool is_ce,
> +					   struct ecc_error_info *info)
> +{
> +	u32 reg_offset_low, reg_offset_high;
> +	u32 reg_val_low, reg_val_high;
> +	bool valid;
> +
> +	reg_offset_low = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 :
> +				 REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00;
> +	reg_offset_high = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 :
> +				  REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32;
> +
> +	reg_val_low = dmc520_read_reg(edac, reg_offset_low);
> +	reg_val_high = dmc520_read_reg(edac, reg_offset_high);
> +
> +	valid = (FIELD_GET(REG_FIELD_ERR_INFO_LOW_VALID, reg_val_low) != 0) &&
> +		(FIELD_GET(REG_FIELD_ERR_INFO_HIGH_VALID, reg_val_high) != 0);

> +	if (valid) {
> +		info->col = FIELD_GET(REG_FIELD_ERR_INFO_LOW_COL, reg_val_low);
> +		info->row = FIELD_GET(REG_FIELD_ERR_INFO_LOW_ROW, reg_val_low);
> +		info->rank = FIELD_GET(REG_FIELD_ERR_INFO_LOW_RANK, reg_val_low);
> +		info->bank = FIELD_GET(REG_FIELD_ERR_INFO_HIGH_BANK, reg_val_high);
> +	} else {
> +		memset(info, 0, sizeof(struct ecc_error_info));
> +	}

> +	return valid;

Nothing checks this return value.

> +}

> +static bool dmc520_get_scrub_type(struct dmc520_edac *edac)

This function returns enum scrub_type, not bool.

> +{
> +	enum scrub_type type = SCRUB_NONE;
> +	u32 reg_val, scrub_cfg;
> +
> +	reg_val = dmc520_read_reg(edac, REG_OFFSET_SCRUB_CONTROL0_NOW);
> +	scrub_cfg = FIELD_GET(SCRUB_CONTROL_MASK, reg_val);
> +
> +	if (DMC520_SCRUB_TRIGGER_ERR_DETECT == scrub_cfg ||
> +		DMC520_SCRUB_TRIGGER_IDLE == scrub_cfg)
> +		type = SCRUB_HW_PROG;
> +
> +	return type;
> +}


> +static void dmc520_handle_dram_ecc_errors(struct mem_ctl_info *mci,
> +					  bool is_ce)
> +{
> +	struct ecc_error_info info;
> +	struct dmc520_edac *edac;
> +	u32 cnt;
> +	char message[EDAC_MSG_BUF_SIZE];
> +	unsigned long flags;
> +
> +	edac = mci->pvt_info;
> +	dmc520_get_dram_ecc_error_info(edac, is_ce, &info);
> +
> +	cnt = dmc520_get_dram_ecc_error_count(edac, is_ce);
> +
> +	if (cnt > 0) {
> +		snprintf(message, ARRAY_SIZE(message),
> +			 "rank:%d bank:%d row:%d col:%d",
> +			 info.rank, info.bank,
> +			 info.row, info.col);
> +
> +		spin_lock_irqsave(&edac->ecc_lock, flags);

irqsave/irqrestore is overkill as this function is only called from an interrupt handler.
There is no way for this to be called with interrupts unmasked.


> +		edac_mc_handle_error((is_ce ? HW_EVENT_ERR_CORRECTED :
> +				     HW_EVENT_ERR_UNCORRECTED),
> +				     mci, cnt, 0, 0, 0, info.rank, -1, -1,
> +				     message, "");
> +		spin_unlock_irqrestore(&edac->ecc_lock, flags);
> +	}
> +}
> +
> +static irqreturn_t dmc520_edac_dram_ecc_isr(int irq, void *data, bool is_ce)

data here could be struct mem_ctl_info *, as it only has one caller.

> +{
> +	u32 i_mask;
> +	struct mem_ctl_info *mci;
> +	struct dmc520_edac *edac;
> +
> +	mci = data;
> +	edac = mci->pvt_info;
> +
> +	i_mask = is_ce ? DRAM_ECC_INT_CE_MASK : DRAM_ECC_INT_UE_MASK;

(The mask/bit here could be passed in directly, its the value you need most often)


> +	dmc520_handle_dram_ecc_errors(mci, is_ce);
> +
> +	dmc520_write_reg(edac, i_mask, REG_OFFSET_INTERRUPT_CLR);
> +
> +	return IRQ_HANDLED;
> +}

[...]


> +static int dmc520_edac_probe(struct platform_device *pdev)
> +{

[...]

> +	if (nintr > ARRAY_SIZE(dmc520_isr_array)) {
> +		edac_printk(KERN_ERR, EDAC_MOD_NAME,
> +			"Invalid device node configuration: # of interrupt config "
> +			"elements (%d) can not exeed %ld.\n",

(Nit: exceed)

> +			nintr, ARRAY_SIZE(dmc520_isr_array));
> +		return -EINVAL;
> +	}

[...]

> +	ret = of_property_read_u32_array(dev->of_node, "interrupt-config",
> +			edac->interrupt_masks, nintr);
> +	if (ret) {
> +		edac_printk(KERN_ERR, EDAC_MOD_NAME,
> +			"Failed to get interrupt-config arrays.\n");
> +		goto err_free_mc;
> +	}

> +	for (intr_index = 0; intr_index < nintr; ++intr_index) {
> +		if (edac->interrupt_mask_all & edac->interrupt_masks[intr_index]) {
> +			edac_printk(KERN_ERR, EDAC_MC,
> +				"interrupt-config error: "
> +				"element %d's interrupt mask %d has overlap.\n",
> +				intr_index, edac->interrupt_masks[intr_index]);
> +			goto err_free_mc;
> +		}
> +
> +		edac->interrupt_mask_all |= edac->interrupt_masks[intr_index];
> +	}

Ah, so the driver doesn't support overlapping masks... but wasn't this the reason for
describing the interrupts with these masks in the first place?
(It looks like the DT-folk want this as named interrupts)

lore.kernel.org/r/BYAPR21MB1319BC4D079B918AB038A4D590010@BYAPR21MB1319.namprd21.prod.outlook.com

Would this driver support the configuration you gave there?


> +	edac->interrupt_mask_all &= ALL_INT_MASK;

This is to removed invalid interrupt fields? Shouldn't we print a warning instead? Either
the DT is invalid, or its some future hardware that has an extra interrupt that this
driver won't enable.


[...]

> +	/* Clear interrupts */
> +	reg_val = dmc520_read_reg(edac, REG_OFFSET_INTERRUPT_CONTROL);
> +	dmc520_write_reg(edac, reg_val & (~(edac->interrupt_mask_all)),
> +			REG_OFFSET_INTERRUPT_CONTROL);
> +	dmc520_write_reg(edac, edac->interrupt_mask_all, REG_OFFSET_INTERRUPT_CLR);

[...]

> +	/* Enable interrupts */
> +	dmc520_write_reg(edac, edac->interrupt_mask_all, REG_OFFSET_INTERRUPT_CONTROL);

Won't this disable any interrupts we weren't told about? You did a read-modify write
above. Can we do the same here?


> +	return 0;
> +
> +err_free_irq:
> +	for (intr_index = 0; intr_index < nintr_registered; ++intr_index) {
> +		int irq_id = platform_get_irq(pdev, intr_index);
> +		devm_free_irq(&pdev->dev, irq_id, mci);
> +	}
> +	edac_mc_del_mc(&pdev->dev);
> +err_free_mc:
> +	edac_mc_free(mci);
> +
> +	return ret;
> +}
> +

[...]

> +static const struct of_device_id dmc520_edac_driver_id[] = {
> +	{ .compatible = "brcm,dmc-520", },
> +	{ .compatible = "arm,dmc-520", },

You should only need the "arm,dmc-520" entry here. The additional compatible values are
for quirking the driver when integration issues are discovered.
The 'brcm' version should be in the DT from day-one, but the kernel only needs to pick it
up when it needs to treat the brcm version differently.


> +	{ /* end of table */ }
> +};


With the bool/enum and interrupt-disabling things fixed:
Reviewed-by: James Morse <james.morse@arm.com>



Thanks,

James

[0] https://static.docs.arm.com/100000/0200/corelink_dmc520_trm_100000_0200_01_en.pdf

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH v4 2/2] EDAC: add EDAC driver for DMC520
       [not found]   ` <BN6PR04MB1107E018F29465B68EE708C786FA0@BN6PR04MB1107.namprd04.prod.outlook.com>
@ 2019-07-04 17:38     ` James Morse
  0 siblings, 0 replies; 2+ messages in thread
From: James Morse @ 2019-07-04 17:38 UTC (permalink / raw)
  To: Lei Wang
  Cc: mark.rutland, bp, robh+dt, devicetree, linux-kernel, mchehab,
	linux-edac, sashal, hangl, lewan, ruizhao

Hi,

On 04/07/2019 01:16, Lei Wang wrote:
>> #include <linux/spinlock.h> ?
>>
>> It's best to keep this list sorted, it makes it easier for the maintainer to resolve
>> conflicts when header files get split/moved-around.
> 
> It builds fine in our distro. The header seems to have been inherited 
> from some other header files.

Heh, you can't rely on this! Someone will disturb the header-soup, and inexplicably your
driver will stop building. If your using a thing, you need to include the headers that
define it.


>>> +#define DMC520_BUS_WIDTH	8  /* Data bus width is 64bits/8Bytes */

>> Can you point me to where this comes from in the datasheet[0]?
>> I see it talk in "1.3 Features" of "either a 32-bit wide data SDRAM interface or a 64-bit
>> wide data SDRAM interface".
>>
>> If this is a choice that was made on your platform it needs to be described in the DT.
>>
>> (I may be confused between SDRAM/DDR/DRAM, as 2.3.3. "PHY interface" seems to describe one
>> connecting to the other.)

( a bit more reading shows these are all terms for pretty much the same thing )


> I didn't find a configuration for bus width from the dmc520 doc. I 
> search online and it seems like more than one places state 64-bit as 
> DDR's data bus width, e.g. this one 
> (https://en.wikipedia.org/wiki/DDR_SDRAM) mentions "DDR memory bus width 
> per channel is 64 bits (72 for ECC memory)." So I took it as a define 
> here for 8 bytes.

Ooer. Your platform might not be the same as wikipedia's!
From the datasheet it looks like this is configurable for DMC520, if so we shouldn't
hard-code the value in the driver...

~ (more datasheet digging) ~

Bingo: 'format_control' can be read in all states and has a 'memory_width' field that
indicates if the PHY is connected to a 'x32 DDR device' or 'x64 DDR device'. (I'm guessing
those double bit values are for address + data)

This probably affects the 'grain' too.


>>> +static void dmc520_handle_dram_ecc_errors(struct mem_ctl_info *mci,
>>> +					  bool is_ce)
>>> +{
>>> +	struct ecc_error_info info;
>>> +	struct dmc520_edac *edac;
>>> +	u32 cnt;
>>> +	char message[EDAC_MSG_BUF_SIZE];
>>> +	unsigned long flags;
>>> +
>>> +	edac = mci->pvt_info;
>>> +	dmc520_get_dram_ecc_error_info(edac, is_ce, &info);
>>> +
>>> +	cnt = dmc520_get_dram_ecc_error_count(edac, is_ce);
>>> +
>>> +	if (cnt > 0) {
>>> +		snprintf(message, ARRAY_SIZE(message),
>>> +			 "rank:%d bank:%d row:%d col:%d",
>>> +			 info.rank, info.bank,
>>> +			 info.row, info.col);
>>> +
>>> +		spin_lock_irqsave(&edac->ecc_lock, flags);
>> irqsave/irqrestore is overkill as this function is only called from an interrupt handler.
>> There is no way for this to be called with interrupts unmasked.

> Still feel spin_lock_irqsave and spin_unlock_irqstore are the safest. If 
> a processor is on isr for interrupt line 1 and calls 
> dmc520_handle_dram_ecc_errors, only line 1 is disabled and and other 
> lines can still interrupt calling dmc520_handle_dram_ecc_errors again. 

But not on the same CPU, (which is the problem the irqsave helpers solve).

When the CPU takes an interrupt the hardware sets a status bit to prevent this CPU taking
any other interrupt. On armv8, this is PSTATE.I, its set automatically by the hardware. If
CPUs didn't do this, you could never guarantee that any interrupt would ever be handled.

You are right a second interrupt may occur, but it can't interrupt this CPU until it
returns from the Interrupt, or clears the status bit. If the second interrupt is taken at
the same time its because it was routed to a different CPU. A regular spin_lock() stops
any problems here, the second CPU has to wait for the first CPU to release the lock.

spin_lock_irqsave() is for a more complicated problem. If you used the spinlock in process
context, (e.g. your probe function), as well as interrupt context, then its possible the
interrupt is taken while the lock is held in process context on the same CPU. This causes
your interrupt handler to wait forever for the lock. The irqsave helpers stop this by
masking interrupts when taking the lock, so that this 'same CPU' sequence can't happen.

Because you don't take the lock in process context, you don't need the irqsave variants.


>>> +	for (intr_index = 0; intr_index < nintr; ++intr_index) {
>>> +		if (edac->interrupt_mask_all & edac->interrupt_masks[intr_index]) {
>>> +			edac_printk(KERN_ERR, EDAC_MC,
>>> +				"interrupt-config error: "
>>> +				"element %d's interrupt mask %d has overlap.\n",
>>> +				intr_index, edac->interrupt_masks[intr_index]);
>>> +			goto err_free_mc;
>>> +		}
>>> +
>>> +		edac->interrupt_mask_all |= edac->interrupt_masks[intr_index];
>>> +	}

>> Ah, so the driver doesn't support overlapping masks... but wasn't this the reason for
>> describing the interrupts with these masks in the first place?
>> (It looks like the DT-folk want this as named interrupts)
>>
>> lore.kernel.org/r/BYAPR21MB1319BC4D079B918AB038A4D590010@BYAPR21MB1319.namprd21.prod.outlook.com
>>
>> Would this driver support the configuration you gave there?

> The interrupt line to mask mapping is to solve how to flexibly adapting 
> to possible hardware implementations. dmc520 supports multiple 
> interrupts (3.3.169 interrupt_control). And these interrupts may have 
> different ways to be wired to physical interrupt lines. As in the above 
> link, in this particular brcm implementation:
> 
> Line 841: source dram_ecc_errc_int
> Line 843: source dram_ecc_errd_int
> Line 839: source dram_ecc_errc_int and dram_ecc_errd_int
> 
> Two straightforward possibilities for implementing ecc counts for ce/ue: 
> 1. We chose to use the single source line. 2. It's possible to implement 
> using the combined-source line too. Our implementation would support 
> either of these cases.
> 
> Of course there might be other possibilities that involve overlapping, 
> such as including all above 3 interrupt lines into the DT. But this 
> unlikely is of any real value of use. Our implementation does not 
> support this case.

Right, so the driver does support this, but not at the same time as independant interrupts.


>> With the bool/enum and interrupt-disabling things fixed:
>> Reviewed-by: James Morse <james.morse@arm.com>
>>
> New to the upstreaming review process. Does this last comment mean we're 
> closer? :)

Heh, yes. This translates as: If you post a subsequent version with those two issues
fixed, please include that Reviewed-by tag next to your Signed-off-by.

{
If you could also summarise the changes you make next to the diffstat, it allows people
who have given tags to only look at the bits you changed, (instead of playing spot the
difference).

As an example:
https://lore.kernel.org/r/20190521172139.21277-3-julien.grall@arm.com

git knows to discard the changes-between-versions and diffstat bits when the patch is
applied, they don't end up in the log.
}

What happens next? Your series gets more review and collects tags. This will include the
maintainers of each tree you're touching either giving tags, or queueing the series. From
there it sits in linux-next until the next merge-window, when the maintainer will send a
pull-request to Linus. Eventually it ends up in the release-candidates, and finally a
released kernel.


(N.B: your mail is still coming base64 encoded, so its very unlikely the maintainer can
pick it up.)


Thanks,

James

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, back to index

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <BN6PR04MB11075E9070EE1A263E099A7386E60@BN6PR04MB1107.namprd04.prod.outlook.com>
2019-07-03  9:18 ` [PATCH v4 2/2] EDAC: add EDAC driver for DMC520 James Morse
     [not found]   ` <BN6PR04MB1107E018F29465B68EE708C786FA0@BN6PR04MB1107.namprd04.prod.outlook.com>
2019-07-04 17:38     ` James Morse

Linux-EDAC Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-edac/0 linux-edac/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-edac linux-edac/ https://lore.kernel.org/linux-edac \
		linux-edac@vger.kernel.org linux-edac@archiver.kernel.org
	public-inbox-index linux-edac


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-edac


AGPL code for this site: git clone https://public-inbox.org/ public-inbox