All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors
@ 2018-05-18  1:34 Benjamin Herrenschmidt
  2018-05-18  1:34 ` [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors Benjamin Herrenschmidt
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-18  1:34 UTC (permalink / raw)
  To: openbmc

This has proven useful in case of problems with the FSI
communication. We retry up to 3 times.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
 1 file changed, 104 insertions(+), 75 deletions(-)

diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
index bdee26096688..f4b2df7a3084 100644
--- a/drivers/fsi/fsi-occ.c
+++ b/drivers/fsi/fsi-occ.c
@@ -35,6 +35,7 @@
 #define OCC_SRAM_BYTES		4096
 #define OCC_CMD_DATA_BYTES	4090
 #define OCC_RESP_DATA_BYTES	4089
+#define OCC_COMMAND_RETRIES	3
 
 /*
  * Assume we don't have FFDC, if we do we'll overflow and
@@ -458,9 +459,9 @@ static int occ_getsram(struct device *sbefifo, u32 address, u8 *data,
 		       ssize_t len)
 {
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
-	size_t resp_len, resp_data_len;
+	size_t saved_resp_len, resp_len, resp_data_len;
 	__be32 *resp, cmd[5];
-	int rc;
+	int rc, retries = OCC_COMMAND_RETRIES;
 
 	/*
 	 * Magic sequence to do SBE getsram command. SBE will fetch data from
@@ -472,28 +473,37 @@ static int occ_getsram(struct device *sbefifo, u32 address, u8 *data,
 	cmd[3] = cpu_to_be32(address);
 	cmd[4] = cpu_to_be32(data_len);
 
-	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
+	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
 	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
 	if (!resp)
-		return -ENOMEM;
-
-	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
-	if (rc)
-		goto free;
-	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
-	if (rc)
-		goto free;
+		rc = -ENOMEM;
+
+	rc = -1;
+	while (rc && retries--) {
+		resp_len = saved_resp_len;
+		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
+		if (rc == 0)
+			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
+		if (rc) {
+			if (rc < 0)
+				pr_err("occ: FSI error %d, retrying sram read\n", rc);
+			else
+				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
+		}
+	}
 
-	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
-	if (resp_data_len != data_len) {
-		pr_err("occ: SRAM read expected %d bytes got %d\n",
-		       data_len, resp_data_len);
-		rc = -EBADMSG;
-	} else {
-		memcpy(data, resp, len);
+	/* Check response lenght and copy data */
+	if (rc == 0) {
+		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
+		if (resp_data_len != data_len) {
+			pr_err("occ: SRAM read expected %d bytes got %d\n",
+			       data_len, resp_data_len);
+			rc = -EBADMSG;
+		} else {
+			memcpy(data, resp, len);
+		}
 	}
 
-free:
 	/* Convert positive SBEI status */
 	if (rc > 0) {
 		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
@@ -508,8 +518,8 @@ static int occ_putsram(struct device *sbefifo, u32 address, u8 *data,
 {
 	size_t cmd_len, buf_len, resp_len, resp_data_len;
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
+	int rc, retries = OCC_COMMAND_RETRIES;
 	__be32 *buf;
-	int rc;
 
 	/*
 	 * We use the same buffer for command and response, make
@@ -522,38 +532,48 @@ static int occ_putsram(struct device *sbefifo, u32 address, u8 *data,
 	if (!buf)
 		return -ENOMEM;
 
-	/*
-	 * Magic sequence to do SBE putsram command. SBE will transfer
-	 * data to specified SRAM address.
-	 */
-	buf[0] = cpu_to_be32(cmd_len);
-	buf[1] = cpu_to_be32(0xa404);
-	buf[2] = cpu_to_be32(1);
-	buf[3] = cpu_to_be32(address);
-	buf[4] = cpu_to_be32(data_len);
-
-	memcpy(&buf[5], data, len);
-
-	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
-	if (rc)
-		goto free;
-	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
-	if (rc)
-		goto free;
+	rc = -1;
+	while (rc && retries--) {
+		/*
+		 * Magic sequence to do SBE putsram command. SBE will transfer
+		 * data to specified SRAM address.
+		 */
+		buf[0] = cpu_to_be32(cmd_len);
+		buf[1] = cpu_to_be32(0xa404);
+		buf[2] = cpu_to_be32(1);
+		buf[3] = cpu_to_be32(address);
+		buf[4] = cpu_to_be32(data_len);
+
+		memcpy(&buf[5], data, len);
+
+		resp_len = OCC_SBE_STATUS_WORDS;
+		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
+		if (rc == 0)
+			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
+		if (rc) {
+			if (rc < 0)
+				pr_err("occ: FSI error %d, retrying sram write\n", rc);
+			else
+				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
+		}
+	}
 
-	if (resp_len != 1) {
-		pr_err("occ: SRAM write response lenght invalid: %d\n",
-		       resp_len);
-		rc = -EBADMSG;
-	} else {
-		resp_data_len = be32_to_cpu(buf[0]);
-		if (resp_data_len != data_len) {
-			pr_err("occ: SRAM write expected %d bytes got %d\n",
-			       data_len, resp_data_len);
+	/* Check response lenght */
+	if (rc == 0) {
+		if (resp_len != 1) {
+			pr_err("occ: SRAM write response lenght invalid: %d\n",
+			       resp_len);
 			rc = -EBADMSG;
+		} else {
+			resp_data_len = be32_to_cpu(buf[0]);
+			if (resp_data_len != data_len) {
+				pr_err("occ: SRAM write expected %d bytes got %d\n",
+				       data_len, resp_data_len);
+				rc = -EBADMSG;
+			}
 		}
 	}
-free:
+
 	/* Convert positive SBEI status */
 	if (rc > 0) {
 		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
@@ -567,46 +587,55 @@ static int occ_trigger_attn(struct device *sbefifo)
 {
 	__be32 buf[OCC_SBE_STATUS_WORDS];
 	size_t resp_len, resp_data_len;
-	int rc;
+	int rc, retries = OCC_COMMAND_RETRIES;
 
 	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
-	resp_len = OCC_SBE_STATUS_WORDS;
 
-	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
-	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
-	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
-	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
-	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
-	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
-	buf[6] = 0;
-
-	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
-	if (rc)
-		goto error;
-	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
-	if (rc)
-		goto error;
+	rc = -1;
+	while (rc && retries--) {
+		resp_len = OCC_SBE_STATUS_WORDS;
+
+		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
+		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
+		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
+		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
+		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
+		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
+		buf[6] = 0;
+
+		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
+		if (rc == 0)
+			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
+		if (rc) {
+			if (rc < 0)
+				pr_err("occ: FSI error %d, retrying attn\n", rc);
+			else
+				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
+		}
+	}
 
-	if (resp_len != 1) {
-		pr_err("occ: SRAM attn response lenght invalid: %d\n",
-		       resp_len);
-		rc = -EBADMSG;
-	} else {
-		resp_data_len = be32_to_cpu(buf[0]);
-		if (resp_data_len != 8) {
-			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
-			       resp_data_len);
+	/* Check response lenght */
+	if (rc == 0) {
+		if (resp_len != 1) {
+			pr_err("occ: SRAM attn response lenght invalid: %d\n",
+			       resp_len);
 			rc = -EBADMSG;
+		} else {
+			resp_data_len = be32_to_cpu(buf[0]);
+			if (resp_data_len != 8) {
+				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
+				       resp_data_len);
+				rc = -EBADMSG;
+			}
 		}
 	}
- error:
+
 	/* Convert positive SBEI status */
 	if (rc > 0) {
 		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
 		rc = -EBADMSG;
 	}
 
-
 	return rc;
 }
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors
  2018-05-18  1:34 [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Benjamin Herrenschmidt
@ 2018-05-18  1:34 ` Benjamin Herrenschmidt
  2018-05-21  5:26   ` Andrew Jeffery
  2018-05-18  1:34 ` [PATCH linux dev-4.13 3/4] fsi/occ: Nicer error messages when talking to a host that isn't ready Benjamin Herrenschmidt
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-18  1:34 UTC (permalink / raw)
  To: openbmc

Similarily to the new retry on SBE fifo errors, this adds
retries if the data we obtain from the OCC has a bad checksum.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/fsi/fsi-occ.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
index f4b2df7a3084..7a5afa78fb6b 100644
--- a/drivers/fsi/fsi-occ.c
+++ b/drivers/fsi/fsi-occ.c
@@ -652,7 +652,7 @@ static void occ_worker(struct work_struct *work)
 	struct occ_client *client;
 	struct occ *occ = container_of(work, struct occ, work);
 	struct device *sbefifo = occ->sbefifo;
-
+	int retries = 0;
 again:
 	if (occ->cancel)
 		return;
@@ -720,7 +720,10 @@ static void occ_worker(struct work_struct *work)
 	xfr->resp_data_length = resp_data_length + 7;
 
 	rc = occ_verify_checksum(resp, resp_data_length);
-
+	if (rc) {
+		if (retries++ < OCC_COMMAND_RETRIES)
+			goto again;
+	}
 done:
 	mutex_unlock(&occ->occ_lock);
 
@@ -732,6 +735,7 @@ static void occ_worker(struct work_struct *work)
 	clear_bit(XFR_IN_PROGRESS, &xfr->flags);
 	list_del(&xfr->link);
 	empty = list_empty(&occ->xfrs);
+	retries = 0;
 
 	spin_unlock_irqrestore(&occ->list_lock, flags);
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH linux dev-4.13 3/4] fsi/occ: Nicer error messages when talking to a host that isn't ready
  2018-05-18  1:34 [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Benjamin Herrenschmidt
  2018-05-18  1:34 ` [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors Benjamin Herrenschmidt
@ 2018-05-18  1:34 ` Benjamin Herrenschmidt
  2018-05-21  5:30   ` Andrew Jeffery
  2018-05-18  1:35 ` [PATCH linux dev-4.13 4/4] fsi/occ: Don't set driver data late Benjamin Herrenschmidt
  2018-05-21  5:14 ` [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Andrew Jeffery
  3 siblings, 1 reply; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-18  1:34 UTC (permalink / raw)
  To: openbmc

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/fsi/fsi-occ.c      | 12 ++++++++++++
 drivers/hwmon/occ/common.c |  9 +++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
index 7a5afa78fb6b..4bda1b435ecb 100644
--- a/drivers/fsi/fsi-occ.c
+++ b/drivers/fsi/fsi-occ.c
@@ -484,6 +484,10 @@ static int occ_getsram(struct device *sbefifo, u32 address, u8 *data,
 		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
 		if (rc == 0)
 			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
+		if (rc == -ESHUTDOWN) {
+			pr_info("occ: Host not ready\n");
+			return rc;
+		}
 		if (rc) {
 			if (rc < 0)
 				pr_err("occ: FSI error %d, retrying sram read\n", rc);
@@ -548,6 +552,10 @@ static int occ_putsram(struct device *sbefifo, u32 address, u8 *data,
 
 		resp_len = OCC_SBE_STATUS_WORDS;
 		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
+		if (rc == -ESHUTDOWN) {
+			pr_info("occ: Host not ready\n");
+			return rc;
+		}
 		if (rc == 0)
 			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
 		if (rc) {
@@ -604,6 +612,10 @@ static int occ_trigger_attn(struct device *sbefifo)
 		buf[6] = 0;
 
 		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
+		if (rc == -ESHUTDOWN) {
+			pr_info("occ: Host not ready\n");
+			return rc;
+		}
 		if (rc == 0)
 			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
 		if (rc) {
diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c
index 88c32915b8bf..10f5c0586d47 100644
--- a/drivers/hwmon/occ/common.c
+++ b/drivers/hwmon/occ/common.c
@@ -1350,8 +1350,13 @@ int occ_setup(struct occ *occ, const char *name)
 	/* no need to lock */
 	rc = occ_poll(occ);
 	if (rc < 0) {
-		dev_err(occ->bus_dev, "failed to get OCC poll response: %d\n",
-			rc);
+		/*
+		 * If the error is -ESHUTDOWN, fail silently, as this happen in
+		 * normal circumstances when the driver is loaded too early
+		 */
+		if (rc != -ESHUTDOWN)
+			dev_err(occ->bus_dev, "failed to get OCC poll response: %d\n",
+				rc);
 		return rc;
 	}
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH linux dev-4.13 4/4] fsi/occ: Don't set driver data late
  2018-05-18  1:34 [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Benjamin Herrenschmidt
  2018-05-18  1:34 ` [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors Benjamin Herrenschmidt
  2018-05-18  1:34 ` [PATCH linux dev-4.13 3/4] fsi/occ: Nicer error messages when talking to a host that isn't ready Benjamin Herrenschmidt
@ 2018-05-18  1:35 ` Benjamin Herrenschmidt
  2018-05-21  5:44   ` Andrew Jeffery
  2018-05-21  5:14 ` [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Andrew Jeffery
  3 siblings, 1 reply; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-18  1:35 UTC (permalink / raw)
  To: openbmc

Until now, the OCC driver was setting the driver data after
registering the character device and the hwmon device.

This might have been intentional, as doing so makes the initial
probe of the OCC by the hwmon device fail while the data is NULL
(provided you are lucky and the hwmon driver doesn't get bound
asynchronously). That failure used to be necessary, otherwise
the driver would try to access the SBE fifo at a time when it's
not ready, causing all sort of problems.

The new SBE fifo driver is much more robust and will return an
appropriate error code, so that (fragile) tweak is no longer
necessary.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/fsi/fsi-occ.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
index 4bda1b435ecb..170fd8020829 100644
--- a/drivers/fsi/fsi-occ.c
+++ b/drivers/fsi/fsi-occ.c
@@ -836,6 +836,8 @@ static int occ_probe(struct platform_device *pdev)
 		occ->idx = ida_simple_get(&occ_ida, 1, INT_MAX, GFP_KERNEL);
 	}
 
+	platform_set_drvdata(pdev, occ);
+
 	snprintf(occ->name, sizeof(occ->name), "occ%d", occ->idx);
 	occ->mdev.fops = &occ_fops;
 	occ->mdev.minor = MISC_DYNAMIC_MINOR;
@@ -854,8 +856,6 @@ static int occ_probe(struct platform_device *pdev)
 	if (!hwmon_dev)
 		dev_warn(dev, "failed to create hwmon device\n");
 
-	platform_set_drvdata(pdev, occ);
-
 	return 0;
 }
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors
  2018-05-18  1:34 [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Benjamin Herrenschmidt
                   ` (2 preceding siblings ...)
  2018-05-18  1:35 ` [PATCH linux dev-4.13 4/4] fsi/occ: Don't set driver data late Benjamin Herrenschmidt
@ 2018-05-21  5:14 ` Andrew Jeffery
  2018-05-21  8:33   ` Benjamin Herrenschmidt
  3 siblings, 1 reply; 17+ messages in thread
From: Andrew Jeffery @ 2018-05-21  5:14 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, openbmc

On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
> This has proven useful in case of problems with the FSI
> communication. We retry up to 3 times.

As a note, there's a separate, OCC-specific error-handling sequence as well, but this is currently done in drivers/hwmon/occ/common.c.

From the OCC documentation[1]:

3.3.1 BMC-OCC Communication Failure Handling

On failures communicating with an OCC the BMC should first verify that the “OCC Active” sensor is TRUE.  If the OCCs are not active the error should be ignored and communication with the OCC should not be retired until the “OCC Active” sensor is TRUE.  If the “OCC Active” sensor is TRUE the command should be retried twice.  If the command still fails after two retries and the “OCC Active” sensor is still “TRUE” and there is no checkstop the error is valid and a request to reset the OCCs should be sent.

[1] https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf

> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>  drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
>  1 file changed, 104 insertions(+), 75 deletions(-)
> 
> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> index bdee26096688..f4b2df7a3084 100644
> --- a/drivers/fsi/fsi-occ.c
> +++ b/drivers/fsi/fsi-occ.c
> @@ -35,6 +35,7 @@
>  #define OCC_SRAM_BYTES		4096
>  #define OCC_CMD_DATA_BYTES	4090
>  #define OCC_RESP_DATA_BYTES	4089
> +#define OCC_COMMAND_RETRIES	3
>  
>  /*
>   * Assume we don't have FFDC, if we do we'll overflow and
> @@ -458,9 +459,9 @@ static int occ_getsram(struct device *sbefifo, u32 
> address, u8 *data,
>  		       ssize_t len)
>  {
>  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> -	size_t resp_len, resp_data_len;
> +	size_t saved_resp_len, resp_len, resp_data_len;
>  	__be32 *resp, cmd[5];
> -	int rc;
> +	int rc, retries = OCC_COMMAND_RETRIES;
>  
>  	/*
>  	 * Magic sequence to do SBE getsram command. SBE will fetch data from
> @@ -472,28 +473,37 @@ static int occ_getsram(struct device *sbefifo, u32 
> address, u8 *data,
>  	cmd[3] = cpu_to_be32(address);
>  	cmd[4] = cpu_to_be32(data_len);
>  
> -	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
> +	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
>  	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
>  	if (!resp)
> -		return -ENOMEM;
> -
> -	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> -	if (rc)
> -		goto free;
> -	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> -	if (rc)
> -		goto free;
> +		rc = -ENOMEM;
> +
> +	rc = -1;
> +	while (rc && retries--) {
> +		resp_len = saved_resp_len;
> +		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> +		if (rc == 0)
> +			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> +		if (rc) {
> +			if (rc < 0)
> +				pr_err("occ: FSI error %d, retrying sram read\n", rc);
> +			else
> +				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
> +		}
> +	}
>  
> -	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> -	if (resp_data_len != data_len) {
> -		pr_err("occ: SRAM read expected %d bytes got %d\n",
> -		       data_len, resp_data_len);
> -		rc = -EBADMSG;
> -	} else {
> -		memcpy(data, resp, len);
> +	/* Check response lenght and copy data */
> +	if (rc == 0) {
> +		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> +		if (resp_data_len != data_len) {
> +			pr_err("occ: SRAM read expected %d bytes got %d\n",
> +			       data_len, resp_data_len);
> +			rc = -EBADMSG;
> +		} else {
> +			memcpy(data, resp, len);
> +		}
>  	}
>  
> -free:
>  	/* Convert positive SBEI status */
>  	if (rc > 0) {
>  		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
> @@ -508,8 +518,8 @@ static int occ_putsram(struct device *sbefifo, u32 
> address, u8 *data,
>  {
>  	size_t cmd_len, buf_len, resp_len, resp_data_len;
>  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> +	int rc, retries = OCC_COMMAND_RETRIES;
>  	__be32 *buf;
> -	int rc;
>  
>  	/*
>  	 * We use the same buffer for command and response, make
> @@ -522,38 +532,48 @@ static int occ_putsram(struct device *sbefifo, u32 
> address, u8 *data,
>  	if (!buf)
>  		return -ENOMEM;
>  
> -	/*
> -	 * Magic sequence to do SBE putsram command. SBE will transfer
> -	 * data to specified SRAM address.
> -	 */
> -	buf[0] = cpu_to_be32(cmd_len);
> -	buf[1] = cpu_to_be32(0xa404);
> -	buf[2] = cpu_to_be32(1);
> -	buf[3] = cpu_to_be32(address);
> -	buf[4] = cpu_to_be32(data_len);
> -
> -	memcpy(&buf[5], data, len);
> -
> -	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> -	if (rc)
> -		goto free;
> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> -	if (rc)
> -		goto free;
> +	rc = -1;
> +	while (rc && retries--) {
> +		/*
> +		 * Magic sequence to do SBE putsram command. SBE will transfer
> +		 * data to specified SRAM address.
> +		 */
> +		buf[0] = cpu_to_be32(cmd_len);
> +		buf[1] = cpu_to_be32(0xa404);
> +		buf[2] = cpu_to_be32(1);
> +		buf[3] = cpu_to_be32(address);
> +		buf[4] = cpu_to_be32(data_len);

Maybe it's worth a comment that we're doing this work inside the loop because we're reusing the buffer for the response, and we're doing that to minimise the memory consumption, and it motivated by the SBEFIFO protocol crazy.

> +
> +		memcpy(&buf[5], data, len);
> +
> +		resp_len = OCC_SBE_STATUS_WORDS;
> +		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> +		if (rc == 0)
> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> +		if (rc) {
> +			if (rc < 0)
> +				pr_err("occ: FSI error %d, retrying sram write\n", rc);
> +			else
> +				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
> +		}
> +	}
>  
> -	if (resp_len != 1) {
> -		pr_err("occ: SRAM write response lenght invalid: %d\n",
> -		       resp_len);
> -		rc = -EBADMSG;
> -	} else {
> -		resp_data_len = be32_to_cpu(buf[0]);
> -		if (resp_data_len != data_len) {
> -			pr_err("occ: SRAM write expected %d bytes got %d\n",
> -			       data_len, resp_data_len);
> +	/* Check response lenght */
> +	if (rc == 0) {
> +		if (resp_len != 1) {
> +			pr_err("occ: SRAM write response lenght invalid: %d\n",

Hmm, not your fault but I just noticed 'length' is misspelled here.

> +			       resp_len);
>  			rc = -EBADMSG;
> +		} else {
> +			resp_data_len = be32_to_cpu(buf[0]);
> +			if (resp_data_len != data_len) {
> +				pr_err("occ: SRAM write expected %d bytes got %d\n",
> +				       data_len, resp_data_len);
> +				rc = -EBADMSG;
> +			}
>  		}
>  	}
> -free:
> +
>  	/* Convert positive SBEI status */
>  	if (rc > 0) {
>  		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
> @@ -567,46 +587,55 @@ static int occ_trigger_attn(struct device *sbefifo)
>  {
>  	__be32 buf[OCC_SBE_STATUS_WORDS];
>  	size_t resp_len, resp_data_len;
> -	int rc;
> +	int rc, retries = OCC_COMMAND_RETRIES;
>  
>  	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
> -	resp_len = OCC_SBE_STATUS_WORDS;
>  
> -	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> -	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> -	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> -	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> -	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> -	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> -	buf[6] = 0;
> -
> -	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> -	if (rc)
> -		goto error;
> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> -	if (rc)
> -		goto error;
> +	rc = -1;
> +	while (rc && retries--) {
> +		resp_len = OCC_SBE_STATUS_WORDS;
> +
> +		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> +		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> +		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> +		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> +		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> +		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> +		buf[6] = 0;
> +
> +		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> +		if (rc == 0)
> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> +		if (rc) {
> +			if (rc < 0)
> +				pr_err("occ: FSI error %d, retrying attn\n", rc);
> +			else
> +				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
> +		}
> +	}
>  
> -	if (resp_len != 1) {
> -		pr_err("occ: SRAM attn response lenght invalid: %d\n",
> -		       resp_len);
> -		rc = -EBADMSG;
> -	} else {
> -		resp_data_len = be32_to_cpu(buf[0]);
> -		if (resp_data_len != 8) {
> -			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> -			       resp_data_len);
> +	/* Check response lenght */

'length'

> +	if (rc == 0) {
> +		if (resp_len != 1) {
> +			pr_err("occ: SRAM attn response lenght invalid: %d\n",

'length'

> +			       resp_len);
>  			rc = -EBADMSG;
> +		} else {
> +			resp_data_len = be32_to_cpu(buf[0]);
> +			if (resp_data_len != 8) {
> +				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> +				       resp_data_len);
> +				rc = -EBADMSG;
> +			}
>  		}
>  	}
> - error:
> +
>  	/* Convert positive SBEI status */
>  	if (rc > 0) {
>  		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
>  		rc = -EBADMSG;
>  	}
>  
> -
>  	return rc;
>  }
>  
> -- 
> 2.17.0
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors
  2018-05-18  1:34 ` [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors Benjamin Herrenschmidt
@ 2018-05-21  5:26   ` Andrew Jeffery
  2018-05-21 14:48     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Jeffery @ 2018-05-21  5:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, openbmc, Eddie James

On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
> Similarily to the new retry on SBE fifo errors, this adds
> retries if the data we obtain from the OCC has a bad checksum.
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>  drivers/fsi/fsi-occ.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> index f4b2df7a3084..7a5afa78fb6b 100644
> --- a/drivers/fsi/fsi-occ.c
> +++ b/drivers/fsi/fsi-occ.c
> @@ -652,7 +652,7 @@ static void occ_worker(struct work_struct *work)
>  	struct occ_client *client;
>  	struct occ *occ = container_of(work, struct occ, work);
>  	struct device *sbefifo = occ->sbefifo;
> -
> +	int retries = 0;
>  again:
>  	if (occ->cancel)
>  		return;
> @@ -720,7 +720,10 @@ static void occ_worker(struct work_struct *work)
>  	xfr->resp_data_length = resp_data_length + 7;
>  
>  	rc = occ_verify_checksum(resp, resp_data_length);
> -
> +	if (rc) {
> +		if (retries++ < OCC_COMMAND_RETRIES)
> +			goto again;
> +	}

How should this interact with the OCC error handling mentioned in my reply on the previous patch? I feel like a checksum error is a bit of a grey area - probably caused by the transport, but possibly due to OCC firmware bugs as well? If it's the former then retrying independent of the OCC error handling protocol is probably okay, but if we're trying to catch the latter then maybe we should let this be handled as part of the OCC error handling code?

Eddie?

Ben: Did you actually hit cases where this path was triggered? There was the corruption issue with simultaneous LPC cycles that turned out to be issues around level-shifters and synchronisers, was that it?

>  done:
>  	mutex_unlock(&occ->occ_lock);
>  
> @@ -732,6 +735,7 @@ static void occ_worker(struct work_struct *work)
>  	clear_bit(XFR_IN_PROGRESS, &xfr->flags);
>  	list_del(&xfr->link);
>  	empty = list_empty(&occ->xfrs);
> +	retries = 0;
>  
>  	spin_unlock_irqrestore(&occ->list_lock, flags);
>  
> -- 
> 2.17.0
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 3/4] fsi/occ: Nicer error messages when talking to a host that isn't ready
  2018-05-18  1:34 ` [PATCH linux dev-4.13 3/4] fsi/occ: Nicer error messages when talking to a host that isn't ready Benjamin Herrenschmidt
@ 2018-05-21  5:30   ` Andrew Jeffery
  0 siblings, 0 replies; 17+ messages in thread
From: Andrew Jeffery @ 2018-05-21  5:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, openbmc

On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Reviewed-by: Andrew Jeffery <andrew@aj.id.au>

> ---
>  drivers/fsi/fsi-occ.c      | 12 ++++++++++++
>  drivers/hwmon/occ/common.c |  9 +++++++--
>  2 files changed, 19 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> index 7a5afa78fb6b..4bda1b435ecb 100644
> --- a/drivers/fsi/fsi-occ.c
> +++ b/drivers/fsi/fsi-occ.c
> @@ -484,6 +484,10 @@ static int occ_getsram(struct device *sbefifo, u32 
> address, u8 *data,
>  		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
>  		if (rc == 0)
>  			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> +		if (rc == -ESHUTDOWN) {
> +			pr_info("occ: Host not ready\n");
> +			return rc;
> +		}
>  		if (rc) {
>  			if (rc < 0)
>  				pr_err("occ: FSI error %d, retrying sram read\n", rc);
> @@ -548,6 +552,10 @@ static int occ_putsram(struct device *sbefifo, u32 
> address, u8 *data,
>  
>  		resp_len = OCC_SBE_STATUS_WORDS;
>  		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> +		if (rc == -ESHUTDOWN) {
> +			pr_info("occ: Host not ready\n");
> +			return rc;
> +		}
>  		if (rc == 0)
>  			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>  		if (rc) {
> @@ -604,6 +612,10 @@ static int occ_trigger_attn(struct device *sbefifo)
>  		buf[6] = 0;
>  
>  		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> +		if (rc == -ESHUTDOWN) {
> +			pr_info("occ: Host not ready\n");
> +			return rc;
> +		}
>  		if (rc == 0)
>  			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>  		if (rc) {
> diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c
> index 88c32915b8bf..10f5c0586d47 100644
> --- a/drivers/hwmon/occ/common.c
> +++ b/drivers/hwmon/occ/common.c
> @@ -1350,8 +1350,13 @@ int occ_setup(struct occ *occ, const char *name)
>  	/* no need to lock */
>  	rc = occ_poll(occ);
>  	if (rc < 0) {
> -		dev_err(occ->bus_dev, "failed to get OCC poll response: %d\n",
> -			rc);
> +		/*
> +		 * If the error is -ESHUTDOWN, fail silently, as this happen in
> +		 * normal circumstances when the driver is loaded too early
> +		 */
> +		if (rc != -ESHUTDOWN)
> +			dev_err(occ->bus_dev, "failed to get OCC poll response: %d\n",
> +				rc);
>  		return rc;
>  	}
>  
> -- 
> 2.17.0
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 4/4] fsi/occ: Don't set driver data late
  2018-05-18  1:35 ` [PATCH linux dev-4.13 4/4] fsi/occ: Don't set driver data late Benjamin Herrenschmidt
@ 2018-05-21  5:44   ` Andrew Jeffery
  0 siblings, 0 replies; 17+ messages in thread
From: Andrew Jeffery @ 2018-05-21  5:44 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, openbmc

On Fri, 18 May 2018, at 11:05, Benjamin Herrenschmidt wrote:
> Until now, the OCC driver was setting the driver data after
> registering the character device and the hwmon device.
> 
> This might have been intentional, as doing so makes the initial
> probe of the OCC by the hwmon device fail while the data is NULL
> (provided you are lucky and the hwmon driver doesn't get bound
> asynchronously). That failure used to be necessary, otherwise
> the driver would try to access the SBE fifo at a time when it's
> not ready, causing all sort of problems.
> 
> The new SBE fifo driver is much more robust and will return an
> appropriate error code, so that (fragile) tweak is no longer
> necessary.
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

+1000

Reviewed-by: Andrew Jeffery <andrew@aj.id.au>

> ---
>  drivers/fsi/fsi-occ.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> index 4bda1b435ecb..170fd8020829 100644
> --- a/drivers/fsi/fsi-occ.c
> +++ b/drivers/fsi/fsi-occ.c
> @@ -836,6 +836,8 @@ static int occ_probe(struct platform_device *pdev)
>  		occ->idx = ida_simple_get(&occ_ida, 1, INT_MAX, GFP_KERNEL);
>  	}
>  
> +	platform_set_drvdata(pdev, occ);
> +
>  	snprintf(occ->name, sizeof(occ->name), "occ%d", occ->idx);
>  	occ->mdev.fops = &occ_fops;
>  	occ->mdev.minor = MISC_DYNAMIC_MINOR;
> @@ -854,8 +856,6 @@ static int occ_probe(struct platform_device *pdev)
>  	if (!hwmon_dev)
>  		dev_warn(dev, "failed to create hwmon device\n");
>  
> -	platform_set_drvdata(pdev, occ);
> -
>  	return 0;
>  }
>  
> -- 
> 2.17.0
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors
  2018-05-21  5:14 ` [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Andrew Jeffery
@ 2018-05-21  8:33   ` Benjamin Herrenschmidt
  2018-05-21 18:48     ` Eddie James
  0 siblings, 1 reply; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-21  8:33 UTC (permalink / raw)
  To: Andrew Jeffery, openbmc

On Mon, 2018-05-21 at 14:44 +0930, Andrew Jeffery wrote:
> On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
> > This has proven useful in case of problems with the FSI
> > communication. We retry up to 3 times.
> 
> As a note, there's a separate, OCC-specific error-handling sequence
> as well, but this is currently done in drivers/hwmon/occ/common.c.

This is more for transport failures. It might be less useful now that
I've added CRC error recovery in the FSI layer and made it more
reliable overall. Experience showed that the upper OCC layer just
didn't deal with it well.

> > From the OCC documentation[1]:
> 
> 3.3.1 BMC-OCC Communication Failure Handling
> 
> On failures communicating with an OCC the BMC should first verify
> that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
> error should be ignored and communication with the OCC should not be
> retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
> sensor is TRUE the command should be retried twice. 

What is the "OCC Active sensor" ?

>  If the command still fails after two retries and the “OCC Active”
> sensor is still “TRUE” and there is no checkstop the error is valid
> and a request to reset the OCCs should be sent.
> 
> [1] https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf
> 
> > 
> > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > ---
> >  drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
> >  1 file changed, 104 insertions(+), 75 deletions(-)
> > 
> > diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> > index bdee26096688..f4b2df7a3084 100644
> > --- a/drivers/fsi/fsi-occ.c
> > +++ b/drivers/fsi/fsi-occ.c
> > @@ -35,6 +35,7 @@
> >  #define OCC_SRAM_BYTES		4096
> >  #define OCC_CMD_DATA_BYTES	4090
> >  #define OCC_RESP_DATA_BYTES	4089
> > +#define OCC_COMMAND_RETRIES	3
> >  
> >  /*
> >   * Assume we don't have FFDC, if we do we'll overflow and
> > @@ -458,9 +459,9 @@ static int occ_getsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  		       ssize_t len)
> >  {
> >  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> > -	size_t resp_len, resp_data_len;
> > +	size_t saved_resp_len, resp_len, resp_data_len;
> >  	__be32 *resp, cmd[5];
> > -	int rc;
> > +	int rc, retries = OCC_COMMAND_RETRIES;
> >  
> >  	/*
> >  	 * Magic sequence to do SBE getsram command. SBE will fetch data from
> > @@ -472,28 +473,37 @@ static int occ_getsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  	cmd[3] = cpu_to_be32(address);
> >  	cmd[4] = cpu_to_be32(data_len);
> >  
> > -	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
> > +	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
> >  	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
> >  	if (!resp)
> > -		return -ENOMEM;
> > -
> > -	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> > -	if (rc)
> > -		goto free;
> > -	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> > -	if (rc)
> > -		goto free;
> > +		rc = -ENOMEM;
> > +
> > +	rc = -1;
> > +	while (rc && retries--) {
> > +		resp_len = saved_resp_len;
> > +		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> > +		if (rc == 0)
> > +			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> > +		if (rc) {
> > +			if (rc < 0)
> > +				pr_err("occ: FSI error %d, retrying sram read\n", rc);
> > +			else
> > +				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
> > +		}
> > +	}
> >  
> > -	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> > -	if (resp_data_len != data_len) {
> > -		pr_err("occ: SRAM read expected %d bytes got %d\n",
> > -		       data_len, resp_data_len);
> > -		rc = -EBADMSG;
> > -	} else {
> > -		memcpy(data, resp, len);
> > +	/* Check response lenght and copy data */
> > +	if (rc == 0) {
> > +		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> > +		if (resp_data_len != data_len) {
> > +			pr_err("occ: SRAM read expected %d bytes got %d\n",
> > +			       data_len, resp_data_len);
> > +			rc = -EBADMSG;
> > +		} else {
> > +			memcpy(data, resp, len);
> > +		}
> >  	}
> >  
> > -free:
> >  	/* Convert positive SBEI status */
> >  	if (rc > 0) {
> >  		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
> > @@ -508,8 +518,8 @@ static int occ_putsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  {
> >  	size_t cmd_len, buf_len, resp_len, resp_data_len;
> >  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> > +	int rc, retries = OCC_COMMAND_RETRIES;
> >  	__be32 *buf;
> > -	int rc;
> >  
> >  	/*
> >  	 * We use the same buffer for command and response, make
> > @@ -522,38 +532,48 @@ static int occ_putsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  	if (!buf)
> >  		return -ENOMEM;
> >  
> > -	/*
> > -	 * Magic sequence to do SBE putsram command. SBE will transfer
> > -	 * data to specified SRAM address.
> > -	 */
> > -	buf[0] = cpu_to_be32(cmd_len);
> > -	buf[1] = cpu_to_be32(0xa404);
> > -	buf[2] = cpu_to_be32(1);
> > -	buf[3] = cpu_to_be32(address);
> > -	buf[4] = cpu_to_be32(data_len);
> > -
> > -	memcpy(&buf[5], data, len);
> > -
> > -	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> > -	if (rc)
> > -		goto free;
> > -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > -	if (rc)
> > -		goto free;
> > +	rc = -1;
> > +	while (rc && retries--) {
> > +		/*
> > +		 * Magic sequence to do SBE putsram command. SBE will transfer
> > +		 * data to specified SRAM address.
> > +		 */
> > +		buf[0] = cpu_to_be32(cmd_len);
> > +		buf[1] = cpu_to_be32(0xa404);
> > +		buf[2] = cpu_to_be32(1);
> > +		buf[3] = cpu_to_be32(address);
> > +		buf[4] = cpu_to_be32(data_len);
> 
> Maybe it's worth a comment that we're doing this work inside the loop because we're reusing the buffer for the response, and we're doing that to minimise the memory consumption, and it motivated by the SBEFIFO protocol crazy.
> 
> > +
> > +		memcpy(&buf[5], data, len);
> > +
> > +		resp_len = OCC_SBE_STATUS_WORDS;
> > +		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> > +		if (rc == 0)
> > +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > +		if (rc) {
> > +			if (rc < 0)
> > +				pr_err("occ: FSI error %d, retrying sram write\n", rc);
> > +			else
> > +				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
> > +		}
> > +	}
> >  
> > -	if (resp_len != 1) {
> > -		pr_err("occ: SRAM write response lenght invalid: %d\n",
> > -		       resp_len);
> > -		rc = -EBADMSG;
> > -	} else {
> > -		resp_data_len = be32_to_cpu(buf[0]);
> > -		if (resp_data_len != data_len) {
> > -			pr_err("occ: SRAM write expected %d bytes got %d\n",
> > -			       data_len, resp_data_len);
> > +	/* Check response lenght */
> > +	if (rc == 0) {
> > +		if (resp_len != 1) {
> > +			pr_err("occ: SRAM write response lenght invalid: %d\n",
> 
> Hmm, not your fault but I just noticed 'length' is misspelled here.
> 
> > +			       resp_len);
> >  			rc = -EBADMSG;
> > +		} else {
> > +			resp_data_len = be32_to_cpu(buf[0]);
> > +			if (resp_data_len != data_len) {
> > +				pr_err("occ: SRAM write expected %d bytes got %d\n",
> > +				       data_len, resp_data_len);
> > +				rc = -EBADMSG;
> > +			}
> >  		}
> >  	}
> > -free:
> > +
> >  	/* Convert positive SBEI status */
> >  	if (rc > 0) {
> >  		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
> > @@ -567,46 +587,55 @@ static int occ_trigger_attn(struct device *sbefifo)
> >  {
> >  	__be32 buf[OCC_SBE_STATUS_WORDS];
> >  	size_t resp_len, resp_data_len;
> > -	int rc;
> > +	int rc, retries = OCC_COMMAND_RETRIES;
> >  
> >  	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
> > -	resp_len = OCC_SBE_STATUS_WORDS;
> >  
> > -	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> > -	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> > -	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> > -	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> > -	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> > -	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> > -	buf[6] = 0;
> > -
> > -	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> > -	if (rc)
> > -		goto error;
> > -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > -	if (rc)
> > -		goto error;
> > +	rc = -1;
> > +	while (rc && retries--) {
> > +		resp_len = OCC_SBE_STATUS_WORDS;
> > +
> > +		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> > +		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> > +		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> > +		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> > +		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> > +		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> > +		buf[6] = 0;
> > +
> > +		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> > +		if (rc == 0)
> > +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > +		if (rc) {
> > +			if (rc < 0)
> > +				pr_err("occ: FSI error %d, retrying attn\n", rc);
> > +			else
> > +				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
> > +		}
> > +	}
> >  
> > -	if (resp_len != 1) {
> > -		pr_err("occ: SRAM attn response lenght invalid: %d\n",
> > -		       resp_len);
> > -		rc = -EBADMSG;
> > -	} else {
> > -		resp_data_len = be32_to_cpu(buf[0]);
> > -		if (resp_data_len != 8) {
> > -			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> > -			       resp_data_len);
> > +	/* Check response lenght */
> 
> 'length'
> 
> > +	if (rc == 0) {
> > +		if (resp_len != 1) {
> > +			pr_err("occ: SRAM attn response lenght invalid: %d\n",
> 
> 'length'
> 
> > +			       resp_len);
> >  			rc = -EBADMSG;
> > +		} else {
> > +			resp_data_len = be32_to_cpu(buf[0]);
> > +			if (resp_data_len != 8) {
> > +				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> > +				       resp_data_len);
> > +				rc = -EBADMSG;
> > +			}
> >  		}
> >  	}
> > - error:
> > +
> >  	/* Convert positive SBEI status */
> >  	if (rc > 0) {
> >  		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
> >  		rc = -EBADMSG;
> >  	}
> >  
> > -
> >  	return rc;
> >  }
> >  
> > -- 
> > 2.17.0
> > 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors
  2018-05-21  5:26   ` Andrew Jeffery
@ 2018-05-21 14:48     ` Benjamin Herrenschmidt
  2018-05-21 18:58       ` Eddie James
  0 siblings, 1 reply; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-21 14:48 UTC (permalink / raw)
  To: Andrew Jeffery, openbmc, Eddie James

On Mon, 2018-05-21 at 14:56 +0930, Andrew Jeffery wrote:
> On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
> > Similarily to the new retry on SBE fifo errors, this adds
> > retries if the data we obtain from the OCC has a bad checksum.
> > 
> > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > ---
> >  drivers/fsi/fsi-occ.c | 8 ++++++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> > index f4b2df7a3084..7a5afa78fb6b 100644
> > --- a/drivers/fsi/fsi-occ.c
> > +++ b/drivers/fsi/fsi-occ.c
> > @@ -652,7 +652,7 @@ static void occ_worker(struct work_struct *work)
> >  	struct occ_client *client;
> >  	struct occ *occ = container_of(work, struct occ, work);
> >  	struct device *sbefifo = occ->sbefifo;
> > -
> > +	int retries = 0;
> >  again:
> >  	if (occ->cancel)
> >  		return;
> > @@ -720,7 +720,10 @@ static void occ_worker(struct work_struct *work)
> >  	xfr->resp_data_length = resp_data_length + 7;
> >  
> >  	rc = occ_verify_checksum(resp, resp_data_length);
> > -
> > +	if (rc) {
> > +		if (retries++ < OCC_COMMAND_RETRIES)
> > +			goto again;
> > +	}
> 
> How should this interact with the OCC error handling mentioned in my
> reply on the previous patch? I feel like a checksum error is a bit of
> a grey area - probably caused by the transport, but possibly due to
> OCC firmware bugs as well?

Would it hurt to retry in any case ?

>  If it's the former then retrying independent of the OCC error
> handling protocol is probably okay, but if we're trying to catch the
> latter then maybe we should let this be handled as part of the OCC
> error handling code?
>
> Eddie?
> 
> Ben: Did you actually hit cases where this path was triggered? There
> was the corruption issue with simultaneous LPC cycles that turned out
> to be issues around level-shifters and synchronisers, was that it?

Yes, and I had cases where the CRC4 didn't "catch" the errors. The
retry fixed it. Now with the FSI layer being much more reliable, it
might be that all that retry stuff I added is no longer necessary, so I
won't be fighting for it, though I did find the upper layer error
handling to be somewhat lacking in efficacy...

I plan to do a deep dive on the rest of the OCC driver this week
regardless. I don't like a few things about it, such as the 2 layers
between fsi-occ and sbe_p9, that should be just one (sadly this change
will break the userspace binding code...).

I'll see if I can figure out how that error hanlding works.

Cheers,
Ben.

> >  done:
> >  	mutex_unlock(&occ->occ_lock);
> >  
> > @@ -732,6 +735,7 @@ static void occ_worker(struct work_struct *work)
> >  	clear_bit(XFR_IN_PROGRESS, &xfr->flags);
> >  	list_del(&xfr->link);
> >  	empty = list_empty(&occ->xfrs);
> > +	retries = 0;
> >  
> >  	spin_unlock_irqrestore(&occ->list_lock, flags);
> >  
> > -- 
> > 2.17.0
> > 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors
  2018-05-21  8:33   ` Benjamin Herrenschmidt
@ 2018-05-21 18:48     ` Eddie James
  2018-05-21 22:53       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 17+ messages in thread
From: Eddie James @ 2018-05-21 18:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Andrew Jeffery, openbmc



On 05/21/2018 03:33 AM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-05-21 at 14:44 +0930, Andrew Jeffery wrote:
>> On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
>>> This has proven useful in case of problems with the FSI
>>> communication. We retry up to 3 times.
>> As a note, there's a separate, OCC-specific error-handling sequence
>> as well, but this is currently done in drivers/hwmon/occ/common.c.
> This is more for transport failures. It might be less useful now that
> I've added CRC error recovery in the FSI layer and made it more
> reliable overall. Experience showed that the upper OCC layer just
> didn't deal with it well.
>
>>>  From the OCC documentation[1]:
>> 3.3.1 BMC-OCC Communication Failure Handling
>>
>> On failures communicating with an OCC the BMC should first verify
>> that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
>> error should be ignored and communication with the OCC should not be
>> retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
>> sensor is TRUE the command should be retried twice.
> What is the "OCC Active sensor" ?

It's a value in the OCC poll response.

>
>>   If the command still fails after two retries and the “OCC Active”
>> sensor is still “TRUE” and there is no checkstop the error is valid
>> and a request to reset the OCCs should be sent.
>>
>> [1] https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf
>>
>>> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>> ---
>>>   drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
>>>   1 file changed, 104 insertions(+), 75 deletions(-)
>>>
>>> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
>>> index bdee26096688..f4b2df7a3084 100644
>>> --- a/drivers/fsi/fsi-occ.c
>>> +++ b/drivers/fsi/fsi-occ.c
>>> @@ -35,6 +35,7 @@
>>>   #define OCC_SRAM_BYTES		4096
>>>   #define OCC_CMD_DATA_BYTES	4090
>>>   #define OCC_RESP_DATA_BYTES	4089
>>> +#define OCC_COMMAND_RETRIES	3
>>>   
>>>   /*
>>>    * Assume we don't have FFDC, if we do we'll overflow and
>>> @@ -458,9 +459,9 @@ static int occ_getsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   		       ssize_t len)
>>>   {
>>>   	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
>>> -	size_t resp_len, resp_data_len;
>>> +	size_t saved_resp_len, resp_len, resp_data_len;
>>>   	__be32 *resp, cmd[5];
>>> -	int rc;
>>> +	int rc, retries = OCC_COMMAND_RETRIES;
>>>   
>>>   	/*
>>>   	 * Magic sequence to do SBE getsram command. SBE will fetch data from
>>> @@ -472,28 +473,37 @@ static int occ_getsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   	cmd[3] = cpu_to_be32(address);
>>>   	cmd[4] = cpu_to_be32(data_len);
>>>   
>>> -	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
>>> +	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
>>>   	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
>>>   	if (!resp)
>>> -		return -ENOMEM;
>>> -
>>> -	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> -	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> +		rc = -ENOMEM;
>>> +
>>> +	rc = -1;
>>> +	while (rc && retries--) {
>>> +		resp_len = saved_resp_len;
>>> +		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
>>> +		if (rc == 0)
>>> +			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
>>> +		if (rc) {
>>> +			if (rc < 0)
>>> +				pr_err("occ: FSI error %d, retrying sram read\n", rc);
>>> +			else
>>> +				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
>>> +		}
>>> +	}
>>>   
>>> -	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
>>> -	if (resp_data_len != data_len) {
>>> -		pr_err("occ: SRAM read expected %d bytes got %d\n",
>>> -		       data_len, resp_data_len);
>>> -		rc = -EBADMSG;
>>> -	} else {
>>> -		memcpy(data, resp, len);
>>> +	/* Check response lenght and copy data */
>>> +	if (rc == 0) {
>>> +		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
>>> +		if (resp_data_len != data_len) {
>>> +			pr_err("occ: SRAM read expected %d bytes got %d\n",
>>> +			       data_len, resp_data_len);
>>> +			rc = -EBADMSG;
>>> +		} else {
>>> +			memcpy(data, resp, len);
>>> +		}
>>>   	}
>>>   
>>> -free:
>>>   	/* Convert positive SBEI status */
>>>   	if (rc > 0) {
>>>   		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
>>> @@ -508,8 +518,8 @@ static int occ_putsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   {
>>>   	size_t cmd_len, buf_len, resp_len, resp_data_len;
>>>   	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
>>> +	int rc, retries = OCC_COMMAND_RETRIES;
>>>   	__be32 *buf;
>>> -	int rc;
>>>   
>>>   	/*
>>>   	 * We use the same buffer for command and response, make
>>> @@ -522,38 +532,48 @@ static int occ_putsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   	if (!buf)
>>>   		return -ENOMEM;
>>>   
>>> -	/*
>>> -	 * Magic sequence to do SBE putsram command. SBE will transfer
>>> -	 * data to specified SRAM address.
>>> -	 */
>>> -	buf[0] = cpu_to_be32(cmd_len);
>>> -	buf[1] = cpu_to_be32(0xa404);
>>> -	buf[2] = cpu_to_be32(1);
>>> -	buf[3] = cpu_to_be32(address);
>>> -	buf[4] = cpu_to_be32(data_len);
>>> -
>>> -	memcpy(&buf[5], data, len);
>>> -
>>> -	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> +	rc = -1;
>>> +	while (rc && retries--) {
>>> +		/*
>>> +		 * Magic sequence to do SBE putsram command. SBE will transfer
>>> +		 * data to specified SRAM address.
>>> +		 */
>>> +		buf[0] = cpu_to_be32(cmd_len);
>>> +		buf[1] = cpu_to_be32(0xa404);
>>> +		buf[2] = cpu_to_be32(1);
>>> +		buf[3] = cpu_to_be32(address);
>>> +		buf[4] = cpu_to_be32(data_len);
>> Maybe it's worth a comment that we're doing this work inside the loop because we're reusing the buffer for the response, and we're doing that to minimise the memory consumption, and it motivated by the SBEFIFO protocol crazy.
>>
>>> +
>>> +		memcpy(&buf[5], data, len);
>>> +
>>> +		resp_len = OCC_SBE_STATUS_WORDS;
>>> +		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
>>> +		if (rc == 0)
>>> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> +		if (rc) {
>>> +			if (rc < 0)
>>> +				pr_err("occ: FSI error %d, retrying sram write\n", rc);
>>> +			else
>>> +				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
>>> +		}
>>> +	}
>>>   
>>> -	if (resp_len != 1) {
>>> -		pr_err("occ: SRAM write response lenght invalid: %d\n",
>>> -		       resp_len);
>>> -		rc = -EBADMSG;
>>> -	} else {
>>> -		resp_data_len = be32_to_cpu(buf[0]);
>>> -		if (resp_data_len != data_len) {
>>> -			pr_err("occ: SRAM write expected %d bytes got %d\n",
>>> -			       data_len, resp_data_len);
>>> +	/* Check response lenght */
>>> +	if (rc == 0) {
>>> +		if (resp_len != 1) {
>>> +			pr_err("occ: SRAM write response lenght invalid: %d\n",
>> Hmm, not your fault but I just noticed 'length' is misspelled here.
>>
>>> +			       resp_len);
>>>   			rc = -EBADMSG;
>>> +		} else {
>>> +			resp_data_len = be32_to_cpu(buf[0]);
>>> +			if (resp_data_len != data_len) {
>>> +				pr_err("occ: SRAM write expected %d bytes got %d\n",
>>> +				       data_len, resp_data_len);
>>> +				rc = -EBADMSG;
>>> +			}
>>>   		}
>>>   	}
>>> -free:
>>> +
>>>   	/* Convert positive SBEI status */
>>>   	if (rc > 0) {
>>>   		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
>>> @@ -567,46 +587,55 @@ static int occ_trigger_attn(struct device *sbefifo)
>>>   {
>>>   	__be32 buf[OCC_SBE_STATUS_WORDS];
>>>   	size_t resp_len, resp_data_len;
>>> -	int rc;
>>> +	int rc, retries = OCC_COMMAND_RETRIES;
>>>   
>>>   	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
>>> -	resp_len = OCC_SBE_STATUS_WORDS;
>>>   
>>> -	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
>>> -	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
>>> -	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
>>> -	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
>>> -	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
>>> -	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
>>> -	buf[6] = 0;
>>> -
>>> -	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
>>> -	if (rc)
>>> -		goto error;
>>> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> -	if (rc)
>>> -		goto error;
>>> +	rc = -1;
>>> +	while (rc && retries--) {
>>> +		resp_len = OCC_SBE_STATUS_WORDS;
>>> +
>>> +		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
>>> +		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
>>> +		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
>>> +		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
>>> +		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
>>> +		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
>>> +		buf[6] = 0;
>>> +
>>> +		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
>>> +		if (rc == 0)
>>> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> +		if (rc) {
>>> +			if (rc < 0)
>>> +				pr_err("occ: FSI error %d, retrying attn\n", rc);
>>> +			else
>>> +				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
>>> +		}
>>> +	}
>>>   
>>> -	if (resp_len != 1) {
>>> -		pr_err("occ: SRAM attn response lenght invalid: %d\n",
>>> -		       resp_len);
>>> -		rc = -EBADMSG;
>>> -	} else {
>>> -		resp_data_len = be32_to_cpu(buf[0]);
>>> -		if (resp_data_len != 8) {
>>> -			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
>>> -			       resp_data_len);
>>> +	/* Check response lenght */
>> 'length'
>>
>>> +	if (rc == 0) {
>>> +		if (resp_len != 1) {
>>> +			pr_err("occ: SRAM attn response lenght invalid: %d\n",
>> 'length'
>>
>>> +			       resp_len);
>>>   			rc = -EBADMSG;
>>> +		} else {
>>> +			resp_data_len = be32_to_cpu(buf[0]);
>>> +			if (resp_data_len != 8) {
>>> +				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
>>> +				       resp_data_len);
>>> +				rc = -EBADMSG;
>>> +			}
>>>   		}
>>>   	}
>>> - error:
>>> +
>>>   	/* Convert positive SBEI status */
>>>   	if (rc > 0) {
>>>   		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
>>>   		rc = -EBADMSG;
>>>   	}
>>>   
>>> -
>>>   	return rc;
>>>   }
>>>   
>>> -- 
>>> 2.17.0
>>>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors
  2018-05-21 14:48     ` Benjamin Herrenschmidt
@ 2018-05-21 18:58       ` Eddie James
  2018-05-21 22:55         ` Benjamin Herrenschmidt
  2018-05-22  0:36         ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 17+ messages in thread
From: Eddie James @ 2018-05-21 18:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Andrew Jeffery, openbmc



On 05/21/2018 09:48 AM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-05-21 at 14:56 +0930, Andrew Jeffery wrote:
>> On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
>>> Similarily to the new retry on SBE fifo errors, this adds
>>> retries if the data we obtain from the OCC has a bad checksum.
>>>
>>> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>> ---
>>>   drivers/fsi/fsi-occ.c | 8 ++++++--
>>>   1 file changed, 6 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
>>> index f4b2df7a3084..7a5afa78fb6b 100644
>>> --- a/drivers/fsi/fsi-occ.c
>>> +++ b/drivers/fsi/fsi-occ.c
>>> @@ -652,7 +652,7 @@ static void occ_worker(struct work_struct *work)
>>>   	struct occ_client *client;
>>>   	struct occ *occ = container_of(work, struct occ, work);
>>>   	struct device *sbefifo = occ->sbefifo;
>>> -
>>> +	int retries = 0;
>>>   again:
>>>   	if (occ->cancel)
>>>   		return;
>>> @@ -720,7 +720,10 @@ static void occ_worker(struct work_struct *work)
>>>   	xfr->resp_data_length = resp_data_length + 7;
>>>   
>>>   	rc = occ_verify_checksum(resp, resp_data_length);
>>> -
>>> +	if (rc) {
>>> +		if (retries++ < OCC_COMMAND_RETRIES)
>>> +			goto again;
>>> +	}
>> How should this interact with the OCC error handling mentioned in my
>> reply on the previous patch? I feel like a checksum error is a bit of
>> a grey area - probably caused by the transport, but possibly due to
>> OCC firmware bugs as well?
> Would it hurt to retry in any case ?
>
>>   If it's the former then retrying independent of the OCC error
>> handling protocol is probably okay, but if we're trying to catch the
>> latter then maybe we should let this be handled as part of the OCC
>> error handling code?
>>
>> Eddie?

The checksum is part of the OCC response, so it's not a transport thing. 
If we've gotten to checking the checksum then we've got a full response 
that looks valid so far (reasonable length, etc).

If we're trying to adhere to the OCC spec, then I'm of the opinion that 
we shouldn't do any retries except for those handled in the occ-hwmon 
driver.

>>
>> Ben: Did you actually hit cases where this path was triggered? There
>> was the corruption issue with simultaneous LPC cycles that turned out
>> to be issues around level-shifters and synchronisers, was that it?
> Yes, and I had cases where the CRC4 didn't "catch" the errors. The
> retry fixed it. Now with the FSI layer being much more reliable, it
> might be that all that retry stuff I added is no longer necessary, so I
> won't be fighting for it, though I did find the upper layer error
> handling to be somewhat lacking in efficacy...
>
> I plan to do a deep dive on the rest of the OCC driver this week
> regardless. I don't like a few things about it, such as the 2 layers
> between fsi-occ and sbe_p9, that should be just one (sadly this change
> will break the userspace binding code...).

What do you mean two layers? fsi-occ and occ-hwmon? I fear that the 
hwmon maintainer won't like having so much transport stuff (and a 
chardev) in the hwmon driver.

Thanks,
Eddie

>
> I'll see if I can figure out how that error hanlding works.
>
> Cheers,
> Ben.
>
>>>   done:
>>>   	mutex_unlock(&occ->occ_lock);
>>>   
>>> @@ -732,6 +735,7 @@ static void occ_worker(struct work_struct *work)
>>>   	clear_bit(XFR_IN_PROGRESS, &xfr->flags);
>>>   	list_del(&xfr->link);
>>>   	empty = list_empty(&occ->xfrs);
>>> +	retries = 0;
>>>   
>>>   	spin_unlock_irqrestore(&occ->list_lock, flags);
>>>   
>>> -- 
>>> 2.17.0
>>>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors
  2018-05-21 18:48     ` Eddie James
@ 2018-05-21 22:53       ` Benjamin Herrenschmidt
  2018-05-22 14:09         ` Eddie James
  0 siblings, 1 reply; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-21 22:53 UTC (permalink / raw)
  To: Eddie James, Andrew Jeffery, openbmc

On Mon, 2018-05-21 at 13:48 -0500, Eddie James wrote:
> 
> > > 3.3.1 BMC-OCC Communication Failure Handling
> > > 
> > > On failures communicating with an OCC the BMC should first verify
> > > that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
> > > error should be ignored and communication with the OCC should not be
> > > retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
> > > sensor is TRUE the command should be retried twice.
> > 
> > What is the "OCC Active sensor" ?
> 
> It's a value in the OCC poll response.

That's only useful if you can get that response then... which you can't
if the communication fails. I'm missing something here.

Ben.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors
  2018-05-21 18:58       ` Eddie James
@ 2018-05-21 22:55         ` Benjamin Herrenschmidt
  2018-05-22  0:36         ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-21 22:55 UTC (permalink / raw)
  To: Eddie James, Andrew Jeffery, openbmc

On Mon, 2018-05-21 at 13:58 -0500, Eddie James wrote:
> 
> The checksum is part of the OCC response, so it's not a transport thing. 
> If we've gotten to checking the checksum then we've got a full response 
> that looks valid so far (reasonable length, etc).

I have had cases of transport errors causing a bad checksum, CRC4 is a
bit weak at the FSI transport level.

> If we're trying to adhere to the OCC spec, then I'm of the opinion that 
> we shouldn't do any retries except for those handled in the occ-hwmon 
> driver.
> 
> > > 
> > > Ben: Did you actually hit cases where this path was triggered? There
> > > was the corruption issue with simultaneous LPC cycles that turned out
> > > to be issues around level-shifters and synchronisers, was that it?
> > 
> > Yes, and I had cases where the CRC4 didn't "catch" the errors. The
> > retry fixed it. Now with the FSI layer being much more reliable, it
> > might be that all that retry stuff I added is no longer necessary, so I
> > won't be fighting for it, though I did find the upper layer error
> > handling to be somewhat lacking in efficacy...
> > 
> > I plan to do a deep dive on the rest of the OCC driver this week
> > regardless. I don't like a few things about it, such as the 2 layers
> > between fsi-occ and sbe_p9, that should be just one (sadly this change
> > will break the userspace binding code...).
> 
> What do you mean two layers? fsi-occ and occ-hwmon? I fear that the 
> hwmon maintainer won't like having so much transport stuff (and a 
> chardev) in the hwmon driver.

The transport stuff in fsi-occ can be brought down to almost nothing,
most of the code in there is ... not particularily useful.

As for the chardev, other drivers in hwmon do that too, but I suppose I
can handle the discussion with Guenter.

Cheers,
Ben.

> > 
> > I'll see if I can figure out how that error hanlding works.
> > 
> > Cheers,
> > Ben.
> > 
> > > >   done:
> > > >   	mutex_unlock(&occ->occ_lock);
> > > >   
> > > > @@ -732,6 +735,7 @@ static void occ_worker(struct work_struct *work)
> > > >   	clear_bit(XFR_IN_PROGRESS, &xfr->flags);
> > > >   	list_del(&xfr->link);
> > > >   	empty = list_empty(&occ->xfrs);
> > > > +	retries = 0;
> > > >   
> > > >   	spin_unlock_irqrestore(&occ->list_lock, flags);
> > > >   
> > > > -- 
> > > > 2.17.0
> > > > 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors
  2018-05-21 18:58       ` Eddie James
  2018-05-21 22:55         ` Benjamin Herrenschmidt
@ 2018-05-22  0:36         ` Benjamin Herrenschmidt
  2018-05-22 14:06           ` Eddie James
  1 sibling, 1 reply; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-22  0:36 UTC (permalink / raw)
  To: Eddie James, Andrew Jeffery, openbmc

On Mon, 2018-05-21 at 13:58 -0500, Eddie James wrote:
> > >    If it's the former then retrying independent of the OCC error
> > > handling protocol is probably okay, but if we're trying to catch the
> > > latter then maybe we should let this be handled as part of the OCC
> > > error handling code?
> > > 
> > > Eddie?
> 
> The checksum is part of the OCC response, so it's not a transport thing. 
> If we've gotten to checking the checksum then we've got a full response 
> that looks valid so far (reasonable length, etc).
> 
> If we're trying to adhere to the OCC spec, then I'm of the opinion that 
> we shouldn't do any retries except for those handled in the occ-hwmon 
> driver.

I don't see any in there ... what am I missing ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors
  2018-05-22  0:36         ` Benjamin Herrenschmidt
@ 2018-05-22 14:06           ` Eddie James
  0 siblings, 0 replies; 17+ messages in thread
From: Eddie James @ 2018-05-22 14:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Andrew Jeffery, openbmc



On 05/21/2018 07:36 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-05-21 at 13:58 -0500, Eddie James wrote:
>>>>     If it's the former then retrying independent of the OCC error
>>>> handling protocol is probably okay, but if we're trying to catch the
>>>> latter then maybe we should let this be handled as part of the OCC
>>>> error handling code?
>>>>
>>>> Eddie?
>> The checksum is part of the OCC response, so it's not a transport thing.
>> If we've gotten to checking the checksum then we've got a full response
>> that looks valid so far (reasonable length, etc).
>>
>> If we're trying to adhere to the OCC spec, then I'm of the opinion that
>> we shouldn't do any retries except for those handled in the occ-hwmon
>> driver.
> I don't see any in there ... what am I missing ?

Oh yea, we moved the retries to userspace, sort of. Basically userspace 
keeps polling every second, even if it fails, until the "error" 
attribute gets set. That error attribute is only set if we've tried and 
failed to communicate a certain number of times 
(OCC_ERROR_COUNT_THRESHOLD in the occ hwmon driver). Once userspace 
picks up the error attribute, it has to take action on that, by 
resetting the OCC over IPMI, etc.

Thanks,
Eddie

>
> Cheers,
> Ben.
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors
  2018-05-21 22:53       ` Benjamin Herrenschmidt
@ 2018-05-22 14:09         ` Eddie James
  0 siblings, 0 replies; 17+ messages in thread
From: Eddie James @ 2018-05-22 14:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Andrew Jeffery, openbmc



On 05/21/2018 05:53 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-05-21 at 13:48 -0500, Eddie James wrote:
>>>> 3.3.1 BMC-OCC Communication Failure Handling
>>>>
>>>> On failures communicating with an OCC the BMC should first verify
>>>> that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
>>>> error should be ignored and communication with the OCC should not be
>>>> retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
>>>> sensor is TRUE the command should be retried twice.
>>> What is the "OCC Active sensor" ?
>> It's a value in the OCC poll response.
> That's only useful if you can get that response then... which you can't
> if the communication fails. I'm missing something here.

Ah. There is also the IPMI OCC active sensor, which is what this must 
mean. We're doing this correctly by unbinding the occ-hwmon driver when 
the OCC active sensor comes in false. So, if driver is bound, OCC active 
must be true, so we "retry" twice by only setting the error attribute 
after two failed poll responses.

Thanks... sorry for the mixup.
Eddie

>
> Ben.
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2018-05-22 14:09 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-18  1:34 [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Benjamin Herrenschmidt
2018-05-18  1:34 ` [PATCH linux dev-4.13 2/4] fsi/occ: Add Retries on checksum errors Benjamin Herrenschmidt
2018-05-21  5:26   ` Andrew Jeffery
2018-05-21 14:48     ` Benjamin Herrenschmidt
2018-05-21 18:58       ` Eddie James
2018-05-21 22:55         ` Benjamin Herrenschmidt
2018-05-22  0:36         ` Benjamin Herrenschmidt
2018-05-22 14:06           ` Eddie James
2018-05-18  1:34 ` [PATCH linux dev-4.13 3/4] fsi/occ: Nicer error messages when talking to a host that isn't ready Benjamin Herrenschmidt
2018-05-21  5:30   ` Andrew Jeffery
2018-05-18  1:35 ` [PATCH linux dev-4.13 4/4] fsi/occ: Don't set driver data late Benjamin Herrenschmidt
2018-05-21  5:44   ` Andrew Jeffery
2018-05-21  5:14 ` [PATCH linux dev-4.13 1/4] fsi/occ: Add retries on SBE errors Andrew Jeffery
2018-05-21  8:33   ` Benjamin Herrenschmidt
2018-05-21 18:48     ` Eddie James
2018-05-21 22:53       ` Benjamin Herrenschmidt
2018-05-22 14:09         ` Eddie James

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.