linux-hwmon.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] occ: fsi and hwmon: Extract and provide the SBEFIFO FFDC
@ 2021-09-14 21:35 Eddie James
  2021-09-14 21:35 ` [PATCH 1/3] fsi: occ: Use a large buffer for responses Eddie James
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Eddie James @ 2021-09-14 21:35 UTC (permalink / raw)
  To: linux-fsi
  Cc: linux-hwmon, linux-kernel, joel, linux, jdelvare, alistair, jk, eajames

Currently, users have no way to obtain the FFDC (First Failure Data
Capture) provided by the SBEFIFO when an operation fails. To remedy this,
add code in the FSI OCC driver to store this FFDC in the user's response
buffer and set the response length accordingly.
On the hwmon side, there is a need at the application level to perform
side-band operations in response to SBE errors. Therefore, add a new
binary sysfs file that provides the FFDC (or lack thereof) when there is
an SBEFIFO error. Now applications can take action when an SBE error is
detected.

Eddie James (3):
  fsi: occ: Use a large buffer for responses
  fsi: occ: Store the SBEFIFO FFDC in the user response buffer
  hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs

 drivers/fsi/fsi-occ.c      | 184 ++++++++++++++++++++-----------------
 drivers/hwmon/occ/p9_sbe.c |  98 +++++++++++++++++++-
 include/linux/fsi-occ.h    |   3 +
 3 files changed, 202 insertions(+), 83 deletions(-)

-- 
2.27.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/3] fsi: occ: Use a large buffer for responses
  2021-09-14 21:35 [PATCH 0/3] occ: fsi and hwmon: Extract and provide the SBEFIFO FFDC Eddie James
@ 2021-09-14 21:35 ` Eddie James
  2021-09-14 21:35 ` [PATCH 2/3] fsi: occ: Store the SBEFIFO FFDC in the user response buffer Eddie James
  2021-09-14 21:35 ` [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs Eddie James
  2 siblings, 0 replies; 11+ messages in thread
From: Eddie James @ 2021-09-14 21:35 UTC (permalink / raw)
  To: linux-fsi
  Cc: linux-hwmon, linux-kernel, joel, linux, jdelvare, alistair, jk, eajames

Allocate a large buffer for each OCC to handle response data. This
removes memory allocation during an operation, and also allows for
the maximum amount of SBE FFDC.

Signed-off-by: Eddie James <eajames@linux.ibm.com>
---
 drivers/fsi/fsi-occ.c   | 109 ++++++++++++++++------------------------
 include/linux/fsi-occ.h |   2 +
 2 files changed, 45 insertions(+), 66 deletions(-)

diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
index b0c9322078a1..ace3ec7767e5 100644
--- a/drivers/fsi/fsi-occ.c
+++ b/drivers/fsi/fsi-occ.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/miscdevice.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/fsi-occ.h>
@@ -42,13 +43,6 @@
 
 #define OCC_P10_SRAM_MODE	0x58	/* Normal mode, OCB channel 2 */
 
-/*
- * Assume we don't have much FFDC, if we do we'll overflow and
- * fail the command. This needs to be big enough for simple
- * commands as well.
- */
-#define OCC_SBE_STATUS_WORDS	32
-
 #define OCC_TIMEOUT_MS		1000
 #define OCC_CMD_IN_PRG_WAIT_MS	50
 
@@ -60,6 +54,7 @@ struct occ {
 	char name[32];
 	int idx;
 	u8 sequence_number;
+	void *buffer;
 	enum versions version;
 	struct miscdevice mdev;
 	struct mutex occ_lock;
@@ -250,8 +245,10 @@ static int occ_verify_checksum(struct occ *occ, struct occ_response *resp,
 static int occ_getsram(struct occ *occ, u32 offset, void *data, ssize_t len)
 {
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
-	size_t cmd_len, resp_len, resp_data_len;
-	__be32 *resp, cmd[6];
+	size_t cmd_len, resp_data_len;
+	size_t resp_len = OCC_MAX_RESP_WORDS;
+	__be32 *resp = occ->buffer;
+	__be32 cmd[6];
 	int idx = 0, rc;
 
 	/*
@@ -278,19 +275,19 @@ static int occ_getsram(struct occ *occ, u32 offset, void *data, ssize_t len)
 	cmd[1] = cpu_to_be32(SBEFIFO_CMD_GET_OCC_SRAM);
 	cmd[4 + idx] = cpu_to_be32(data_len);
 
-	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
-	resp = kzalloc(resp_len << 2, GFP_KERNEL);
-	if (!resp)
-		return -ENOMEM;
-
 	rc = sbefifo_submit(occ->sbefifo, cmd, cmd_len, resp, &resp_len);
 	if (rc)
-		goto free;
+		return rc;
 
 	rc = sbefifo_parse_status(occ->sbefifo, SBEFIFO_CMD_GET_OCC_SRAM,
 				  resp, resp_len, &resp_len);
-	if (rc)
-		goto free;
+	if (rc > 0) {
+		dev_err(occ->dev, "SRAM read returned failure status: %08x\n",
+			rc);
+		return -EBADMSG;
+	} else if (rc) {
+		return rc;
+	}
 
 	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
 	if (resp_data_len != data_len) {
@@ -301,39 +298,21 @@ static int occ_getsram(struct occ *occ, u32 offset, void *data, ssize_t len)
 		memcpy(data, resp, len);
 	}
 
-free:
-	/* Convert positive SBEI status */
-	if (rc > 0) {
-		dev_err(occ->dev, "SRAM read returned failure status: %08x\n",
-			rc);
-		rc = -EBADMSG;
-	}
-
-	kfree(resp);
 	return rc;
 }
 
 static int occ_putsram(struct occ *occ, const void *data, ssize_t len,
 		       u8 seq_no, u16 checksum)
 {
-	size_t cmd_len, buf_len, resp_len, resp_data_len;
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
-	__be32 *buf;
+	size_t cmd_len, resp_data_len;
+	size_t resp_len = OCC_MAX_RESP_WORDS;
+	__be32 *buf = occ->buffer;
 	u8 *byte_buf;
 	int idx = 0, rc;
 
 	cmd_len = (occ->version == occ_p10) ? 6 : 5;
-
-	/*
-	 * We use the same buffer for command and response, make
-	 * sure it's big enough
-	 */
-	resp_len = OCC_SBE_STATUS_WORDS;
 	cmd_len += data_len >> 2;
-	buf_len = max(cmd_len, resp_len);
-	buf = kzalloc(buf_len << 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
 
 	/*
 	 * Magic sequence to do SBE putsram command. SBE will transfer
@@ -384,12 +363,17 @@ static int occ_putsram(struct occ *occ, const void *data, ssize_t len,
 
 	rc = sbefifo_submit(occ->sbefifo, buf, cmd_len, buf, &resp_len);
 	if (rc)
-		goto free;
+		return rc;
 
 	rc = sbefifo_parse_status(occ->sbefifo, SBEFIFO_CMD_PUT_OCC_SRAM,
 				  buf, resp_len, &resp_len);
-	if (rc)
-		goto free;
+	if (rc > 0) {
+		dev_err(occ->dev, "SRAM write returned failure status: %08x\n",
+			rc);
+		return -EBADMSG;
+	} else if (rc) {
+		return rc;
+	}
 
 	if (resp_len != 1) {
 		dev_err(occ->dev, "SRAM write response length invalid: %zd\n",
@@ -405,27 +389,16 @@ static int occ_putsram(struct occ *occ, const void *data, ssize_t len,
 		}
 	}
 
-free:
-	/* Convert positive SBEI status */
-	if (rc > 0) {
-		dev_err(occ->dev, "SRAM write returned failure status: %08x\n",
-			rc);
-		rc = -EBADMSG;
-	}
-
-	kfree(buf);
 	return rc;
 }
 
 static int occ_trigger_attn(struct occ *occ)
 {
-	__be32 buf[OCC_SBE_STATUS_WORDS];
-	size_t cmd_len, resp_len, resp_data_len;
+	__be32 *buf = occ->buffer;
+	size_t cmd_len, resp_data_len;
+	size_t resp_len = OCC_MAX_RESP_WORDS;
 	int idx = 0, rc;
 
-	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 8);
-	resp_len = OCC_SBE_STATUS_WORDS;
-
 	switch (occ->version) {
 	default:
 	case occ_p9:
@@ -450,12 +423,17 @@ static int occ_trigger_attn(struct occ *occ)
 
 	rc = sbefifo_submit(occ->sbefifo, buf, cmd_len, buf, &resp_len);
 	if (rc)
-		goto error;
+		return rc;
 
 	rc = sbefifo_parse_status(occ->sbefifo, SBEFIFO_CMD_PUT_OCC_SRAM,
 				  buf, resp_len, &resp_len);
-	if (rc)
-		goto error;
+	if (rc > 0) {
+		dev_err(occ->dev, "SRAM attn returned failure status: %08x\n",
+			rc);
+		return -EBADMSG;
+	} else if (rc) {
+		return rc;
+	}
 
 	if (resp_len != 1) {
 		dev_err(occ->dev, "SRAM attn response length invalid: %zd\n",
@@ -471,14 +449,6 @@ static int occ_trigger_attn(struct occ *occ)
 		}
 	}
 
- error:
-	/* Convert positive SBEI status */
-	if (rc > 0) {
-		dev_err(occ->dev, "SRAM attn returned failure status: %08x\n",
-			rc);
-		rc = -EBADMSG;
-	}
-
 	return rc;
 }
 
@@ -635,6 +605,10 @@ static int occ_probe(struct platform_device *pdev)
 	if (!occ)
 		return -ENOMEM;
 
+	occ->buffer = kvmalloc(OCC_MAX_RESP_WORDS * 4, GFP_KERNEL);
+	if (!occ->buffer)
+		return -ENOMEM;
+
 	occ->version = (enum versions)md;
 	occ->dev = dev;
 	occ->sbefifo = dev->parent;
@@ -670,6 +644,7 @@ static int occ_probe(struct platform_device *pdev)
 	if (rc) {
 		dev_err(dev, "failed to register miscdevice: %d\n", rc);
 		ida_simple_remove(&occ_ida, occ->idx);
+		kvfree(occ->buffer);
 		return rc;
 	}
 
@@ -685,6 +660,8 @@ static int occ_remove(struct platform_device *pdev)
 {
 	struct occ *occ = platform_get_drvdata(pdev);
 
+	kvfree(occ->buffer);
+
 	misc_deregister(&occ->mdev);
 
 	device_for_each_child(&pdev->dev, NULL, occ_unregister_child);
diff --git a/include/linux/fsi-occ.h b/include/linux/fsi-occ.h
index d4cdc2aa6e33..7ee3dbd7f4b3 100644
--- a/include/linux/fsi-occ.h
+++ b/include/linux/fsi-occ.h
@@ -19,6 +19,8 @@ struct device;
 #define OCC_RESP_CRIT_OCB		0xE3
 #define OCC_RESP_CRIT_HW		0xE4
 
+#define OCC_MAX_RESP_WORDS		2048
+
 int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 		   void *response, size_t *resp_len);
 
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/3] fsi: occ: Store the SBEFIFO FFDC in the user response buffer
  2021-09-14 21:35 [PATCH 0/3] occ: fsi and hwmon: Extract and provide the SBEFIFO FFDC Eddie James
  2021-09-14 21:35 ` [PATCH 1/3] fsi: occ: Use a large buffer for responses Eddie James
@ 2021-09-14 21:35 ` Eddie James
  2021-09-14 21:35 ` [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs Eddie James
  2 siblings, 0 replies; 11+ messages in thread
From: Eddie James @ 2021-09-14 21:35 UTC (permalink / raw)
  To: linux-fsi
  Cc: linux-hwmon, linux-kernel, joel, linux, jdelvare, alistair, jk, eajames

If the SBEFIFO response indicates an error, store the response in the
user buffer and return an error. Previously, the user had no way of
obtaining the SBEFIFO FFDC. In case of an error with the SBE or SBEFIFO
itself, in which case there is no FFDC, set the FFDC data to a magic
value to indicate this kind of error.

Signed-off-by: Eddie James <eajames@linux.ibm.com>
---
 drivers/fsi/fsi-occ.c   | 87 ++++++++++++++++++++++++++++++-----------
 include/linux/fsi-occ.h |  1 +
 2 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
index ace3ec7767e5..e4ef96e41747 100644
--- a/drivers/fsi/fsi-occ.c
+++ b/drivers/fsi/fsi-occ.c
@@ -55,6 +55,9 @@ struct occ {
 	int idx;
 	u8 sequence_number;
 	void *buffer;
+	void *client_buffer;
+	size_t client_buffer_size;
+	size_t client_response_size;
 	enum versions version;
 	struct miscdevice mdev;
 	struct mutex occ_lock;
@@ -217,6 +220,28 @@ static const struct file_operations occ_fops = {
 	.release = occ_release,
 };
 
+static void occ_save_ffdc(struct occ *occ, __be32 *resp, size_t parsed_len,
+			  size_t resp_len)
+{
+	size_t dh = resp_len - parsed_len;
+	size_t ffdc_len = (dh - 1) * 4;
+	__be32 *ffdc = &resp[resp_len - dh];
+
+	if (ffdc_len > occ->client_buffer_size)
+		ffdc_len = occ->client_buffer_size;
+
+	memcpy(occ->client_buffer, ffdc, ffdc_len);
+	occ->client_response_size = ffdc_len;
+}
+
+static void occ_save_sbefifo_error(struct occ *occ)
+{
+	u32 no_ffdc_magic = OCC_NO_FFDC_MAGIC;
+
+	memcpy(occ->client_buffer, &no_ffdc_magic, sizeof(u32));
+	occ->client_response_size = sizeof(u32);
+}
+
 static int occ_verify_checksum(struct occ *occ, struct occ_response *resp,
 			       u16 data_length)
 {
@@ -245,7 +270,7 @@ static int occ_verify_checksum(struct occ *occ, struct occ_response *resp,
 static int occ_getsram(struct occ *occ, u32 offset, void *data, ssize_t len)
 {
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
-	size_t cmd_len, resp_data_len;
+	size_t cmd_len, parsed_len, resp_data_len;
 	size_t resp_len = OCC_MAX_RESP_WORDS;
 	__be32 *resp = occ->buffer;
 	__be32 cmd[6];
@@ -276,20 +301,23 @@ static int occ_getsram(struct occ *occ, u32 offset, void *data, ssize_t len)
 	cmd[4 + idx] = cpu_to_be32(data_len);
 
 	rc = sbefifo_submit(occ->sbefifo, cmd, cmd_len, resp, &resp_len);
-	if (rc)
+	if (rc) {
+		occ_save_sbefifo_error(occ);
 		return rc;
+	}
 
 	rc = sbefifo_parse_status(occ->sbefifo, SBEFIFO_CMD_GET_OCC_SRAM,
-				  resp, resp_len, &resp_len);
+				  resp, resp_len, &parsed_len);
 	if (rc > 0) {
 		dev_err(occ->dev, "SRAM read returned failure status: %08x\n",
 			rc);
-		return -EBADMSG;
+		occ_save_ffdc(occ, resp, parsed_len, resp_len);
+		return -ECOMM;
 	} else if (rc) {
 		return rc;
 	}
 
-	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
+	resp_data_len = be32_to_cpu(resp[parsed_len - 1]);
 	if (resp_data_len != data_len) {
 		dev_err(occ->dev, "SRAM read expected %d bytes got %zd\n",
 			data_len, resp_data_len);
@@ -305,7 +333,7 @@ static int occ_putsram(struct occ *occ, const void *data, ssize_t len,
 		       u8 seq_no, u16 checksum)
 {
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
-	size_t cmd_len, resp_data_len;
+	size_t cmd_len, parsed_len, resp_data_len;
 	size_t resp_len = OCC_MAX_RESP_WORDS;
 	__be32 *buf = occ->buffer;
 	u8 *byte_buf;
@@ -362,22 +390,25 @@ static int occ_putsram(struct occ *occ, const void *data, ssize_t len,
 	}
 
 	rc = sbefifo_submit(occ->sbefifo, buf, cmd_len, buf, &resp_len);
-	if (rc)
+	if (rc) {
+		occ_save_sbefifo_error(occ);
 		return rc;
+	}
 
 	rc = sbefifo_parse_status(occ->sbefifo, SBEFIFO_CMD_PUT_OCC_SRAM,
-				  buf, resp_len, &resp_len);
+				  buf, resp_len, &parsed_len);
 	if (rc > 0) {
 		dev_err(occ->dev, "SRAM write returned failure status: %08x\n",
 			rc);
-		return -EBADMSG;
+		occ_save_ffdc(occ, buf, parsed_len, resp_len);
+		return -ECOMM;
 	} else if (rc) {
 		return rc;
 	}
 
-	if (resp_len != 1) {
+	if (parsed_len != 1) {
 		dev_err(occ->dev, "SRAM write response length invalid: %zd\n",
-			resp_len);
+			parsed_len);
 		rc = -EBADMSG;
 	} else {
 		resp_data_len = be32_to_cpu(buf[0]);
@@ -395,7 +426,7 @@ static int occ_putsram(struct occ *occ, const void *data, ssize_t len,
 static int occ_trigger_attn(struct occ *occ)
 {
 	__be32 *buf = occ->buffer;
-	size_t cmd_len, resp_data_len;
+	size_t cmd_len, parsed_len, resp_data_len;
 	size_t resp_len = OCC_MAX_RESP_WORDS;
 	int idx = 0, rc;
 
@@ -422,22 +453,25 @@ static int occ_trigger_attn(struct occ *occ)
 	buf[6 + idx] = 0;
 
 	rc = sbefifo_submit(occ->sbefifo, buf, cmd_len, buf, &resp_len);
-	if (rc)
+	if (rc) {
+		occ_save_sbefifo_error(occ);
 		return rc;
+	}
 
 	rc = sbefifo_parse_status(occ->sbefifo, SBEFIFO_CMD_PUT_OCC_SRAM,
-				  buf, resp_len, &resp_len);
+				  buf, resp_len, &parsed_len);
 	if (rc > 0) {
 		dev_err(occ->dev, "SRAM attn returned failure status: %08x\n",
 			rc);
-		return -EBADMSG;
+		occ_save_ffdc(occ, buf, parsed_len, resp_len);
+		return -ECOMM;
 	} else if (rc) {
 		return rc;
 	}
 
-	if (resp_len != 1) {
+	if (parsed_len != 1) {
 		dev_err(occ->dev, "SRAM attn response length invalid: %zd\n",
-			resp_len);
+			parsed_len);
 		rc = -EBADMSG;
 	} else {
 		resp_data_len = be32_to_cpu(buf[0]);
@@ -460,6 +494,7 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 		msecs_to_jiffies(OCC_CMD_IN_PRG_WAIT_MS);
 	struct occ *occ = dev_get_drvdata(dev);
 	struct occ_response *resp = response;
+	size_t user_resp_len = *resp_len;
 	u8 seq_no;
 	u16 checksum = 0;
 	u16 resp_data_length;
@@ -468,11 +503,13 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 	int rc;
 	size_t i;
 
+	*resp_len = 0;
+
 	if (!occ)
 		return -ENODEV;
 
-	if (*resp_len < 7) {
-		dev_dbg(dev, "Bad resplen %zd\n", *resp_len);
+	if (user_resp_len < 7) {
+		dev_dbg(dev, "Bad resplen %zd\n", user_resp_len);
 		return -EINVAL;
 	}
 
@@ -482,6 +519,11 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 
 	mutex_lock(&occ->occ_lock);
 
+	occ->client_buffer = response;
+	occ->client_buffer_size = user_resp_len;
+	occ->client_response_size = 0;
+	memset(occ->client_buffer, 0, 8);
+
 	/*
 	 * Get a sequence number and update the counter. Avoid a sequence
 	 * number of 0 which would pass the response check below even if the
@@ -532,7 +574,7 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 	resp_data_length = get_unaligned_be16(&resp->data_length);
 
 	/* Message size is data length + 5 bytes header + 2 bytes checksum */
-	if ((resp_data_length + 7) > *resp_len) {
+	if ((resp_data_length + 7) > user_resp_len) {
 		rc = -EMSGSIZE;
 		goto done;
 	}
@@ -548,7 +590,7 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 			goto done;
 	}
 
-	*resp_len = resp_data_length + 7;
+	occ->client_response_size = resp_data_length + 7;
 
 	{
 		DEFINE_DYNAMIC_DEBUG_METADATA(ddm_occ_rsp,
@@ -560,7 +602,7 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 		    DYNAMIC_DEBUG_BRANCH(ddm_occ_rsp)) {
 			char prefix[64];
 			size_t l = DYNAMIC_DEBUG_BRANCH(ddm_occ_full_rsp) ?
-				*resp_len : 16;
+				occ->client_response_size : 16;
 
 			snprintf(prefix, sizeof(prefix), "%s %s: rsp ",
 				 dev_driver_string(occ->dev),
@@ -573,6 +615,7 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 	rc = occ_verify_checksum(occ, resp, resp_data_length);
 
  done:
+	*resp_len = occ->client_response_size;
 	mutex_unlock(&occ->occ_lock);
 
 	return rc;
diff --git a/include/linux/fsi-occ.h b/include/linux/fsi-occ.h
index 7ee3dbd7f4b3..1d584b2acba5 100644
--- a/include/linux/fsi-occ.h
+++ b/include/linux/fsi-occ.h
@@ -20,6 +20,7 @@ struct device;
 #define OCC_RESP_CRIT_HW		0xE4
 
 #define OCC_MAX_RESP_WORDS		2048
+#define OCC_NO_FFDC_MAGIC		0x4f434346	/* "OCCF" */
 
 int fsi_occ_submit(struct device *dev, const void *request, size_t req_len,
 		   void *response, size_t *resp_len);
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-14 21:35 [PATCH 0/3] occ: fsi and hwmon: Extract and provide the SBEFIFO FFDC Eddie James
  2021-09-14 21:35 ` [PATCH 1/3] fsi: occ: Use a large buffer for responses Eddie James
  2021-09-14 21:35 ` [PATCH 2/3] fsi: occ: Store the SBEFIFO FFDC in the user response buffer Eddie James
@ 2021-09-14 21:35 ` Eddie James
  2021-09-15 16:13   ` Guenter Roeck
                     ` (2 more replies)
  2 siblings, 3 replies; 11+ messages in thread
From: Eddie James @ 2021-09-14 21:35 UTC (permalink / raw)
  To: linux-fsi
  Cc: linux-hwmon, linux-kernel, joel, linux, jdelvare, alistair, jk, eajames

Save any FFDC provided by the OCC driver, and provide it to userspace
through a binary sysfs entry. Do some basic state management to
ensure that userspace can always collect the data if there was an
error. Notify polling userspace when there is an error too.

Signed-off-by: Eddie James <eajames@linux.ibm.com>
---
 drivers/hwmon/occ/p9_sbe.c | 98 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/drivers/hwmon/occ/p9_sbe.c b/drivers/hwmon/occ/p9_sbe.c
index 9709f2b9c052..505f489832a4 100644
--- a/drivers/hwmon/occ/p9_sbe.c
+++ b/drivers/hwmon/occ/p9_sbe.c
@@ -4,18 +4,54 @@
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/fsi-occ.h>
+#include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/platform_device.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
 
 #include "common.h"
 
+enum sbe_error_state {
+	SBE_ERROR_NONE = 0,
+	SBE_ERROR_PENDING,
+	SBE_ERROR_COLLECTED
+};
+
 struct p9_sbe_occ {
 	struct occ occ;
+	int sbe_error;
+	void *ffdc;
+	size_t ffdc_len;
+	size_t ffdc_size;
+	struct mutex sbe_error_lock;	/* lock access to ffdc data */
+	u32 no_ffdc_magic;
 	struct device *sbe;
 };
 
 #define to_p9_sbe_occ(x)	container_of((x), struct p9_sbe_occ, occ)
 
+static ssize_t sbe_error_read(struct file *filp, struct kobject *kobj,
+			      struct bin_attribute *battr, char *buf,
+			      loff_t pos, size_t count)
+{
+	ssize_t rc = 0;
+	struct occ *occ = dev_get_drvdata(kobj_to_dev(kobj));
+	struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
+
+	mutex_lock(&ctx->sbe_error_lock);
+	if (ctx->sbe_error == SBE_ERROR_PENDING) {
+		rc = memory_read_from_buffer(buf, count, &pos, ctx->ffdc,
+					     ctx->ffdc_len);
+		ctx->sbe_error = SBE_ERROR_COLLECTED;
+	}
+	mutex_unlock(&ctx->sbe_error_lock);
+
+	return rc;
+}
+static BIN_ATTR_RO(sbe_error, OCC_MAX_RESP_WORDS * 4);
+
 static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t len)
 {
 	struct occ_response *resp = &occ->resp;
@@ -24,8 +60,47 @@ static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t len)
 	int rc;
 
 	rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
-	if (rc < 0)
+	if (rc < 0) {
+		if (resp_len) {
+			bool notify = false;
+
+			mutex_lock(&ctx->sbe_error_lock);
+			if (ctx->sbe_error != SBE_ERROR_PENDING)
+				notify = true;
+			ctx->sbe_error = SBE_ERROR_PENDING;
+
+			if (resp_len > ctx->ffdc_size) {
+				if (ctx->ffdc_size)
+					kvfree(ctx->ffdc);
+				ctx->ffdc = kvmalloc(resp_len, GFP_KERNEL);
+				if (!ctx->ffdc) {
+					ctx->ffdc_size = 0;
+					ctx->ffdc_len = sizeof(u32);
+					ctx->ffdc = &ctx->no_ffdc_magic;
+					goto unlock;
+				}
+
+				ctx->ffdc_size = resp_len;
+			}
+
+			ctx->ffdc_len = resp_len;
+			memcpy(ctx->ffdc, resp, resp_len);
+
+unlock:
+			mutex_unlock(&ctx->sbe_error_lock);
+
+			if (notify)
+				sysfs_notify(&occ->bus_dev->kobj, NULL,
+					     bin_attr_sbe_error.attr.name);
+		}
+
 		return rc;
+	}
+
+	mutex_lock(&ctx->sbe_error_lock);
+	if (ctx->sbe_error == SBE_ERROR_COLLECTED)
+		ctx->sbe_error = SBE_ERROR_NONE;
+	mutex_unlock(&ctx->sbe_error_lock);
 
 	switch (resp->return_status) {
 	case OCC_RESP_CMD_IN_PRG:
@@ -65,6 +140,13 @@ static int p9_sbe_occ_probe(struct platform_device *pdev)
 	if (!ctx)
 		return -ENOMEM;
 
+	ctx->no_ffdc_magic = OCC_NO_FFDC_MAGIC;
+	ctx->sbe_error = SBE_ERROR_NONE;
+	ctx->ffdc = &ctx->no_ffdc_magic;
+	ctx->ffdc_len = sizeof(u32);
+	ctx->ffdc_size = 0;
+	mutex_init(&ctx->sbe_error_lock);
+
 	ctx->sbe = pdev->dev.parent;
 	occ = &ctx->occ;
 	occ->bus_dev = &pdev->dev;
@@ -78,6 +160,15 @@ static int p9_sbe_occ_probe(struct platform_device *pdev)
 	if (rc == -ESHUTDOWN)
 		rc = -ENODEV;	/* Host is shutdown, don't spew errors */
 
+	if (!rc) {
+		rc = device_create_bin_file(occ->bus_dev, &bin_attr_sbe_error);
+		if (rc) {
+			dev_warn(occ->bus_dev,
+				 "failed to create SBE error ffdc file\n");
+			rc = 0;
+		}
+	}
+
 	return rc;
 }
 
@@ -86,9 +177,14 @@ static int p9_sbe_occ_remove(struct platform_device *pdev)
 	struct occ *occ = platform_get_drvdata(pdev);
 	struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
 
+	device_remove_bin_file(occ->bus_dev, &bin_attr_sbe_error);
+
 	ctx->sbe = NULL;
 	occ_shutdown(occ);
 
+	if (ctx->ffdc_size)
+		kvfree(ctx->ffdc);
+
 	return 0;
 }
 
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-14 21:35 ` [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs Eddie James
@ 2021-09-15 16:13   ` Guenter Roeck
  2021-09-15 21:11     ` Eddie James
  2021-09-16  0:17   ` Jeremy Kerr
  2021-09-21 15:37   ` Greg KH
  2 siblings, 1 reply; 11+ messages in thread
From: Guenter Roeck @ 2021-09-15 16:13 UTC (permalink / raw)
  To: Eddie James
  Cc: linux-fsi, linux-hwmon, linux-kernel, joel, jdelvare, alistair, jk

On Tue, Sep 14, 2021 at 04:35:43PM -0500, Eddie James wrote:
> Save any FFDC provided by the OCC driver, and provide it to userspace
> through a binary sysfs entry. Do some basic state management to
> ensure that userspace can always collect the data if there was an
> error. Notify polling userspace when there is an error too.
> 
> Signed-off-by: Eddie James <eajames@linux.ibm.com>

This is now the 2nd series that we have pending, and the first series
(from July) still didn't make it into the upstream kernel because the fsi code
seems to go nowhere. Any chance to address that ?

Additional comment inline.

Thanks,
Guenter

> ---
>  drivers/hwmon/occ/p9_sbe.c | 98 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 97 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/hwmon/occ/p9_sbe.c b/drivers/hwmon/occ/p9_sbe.c
> index 9709f2b9c052..505f489832a4 100644
> --- a/drivers/hwmon/occ/p9_sbe.c
> +++ b/drivers/hwmon/occ/p9_sbe.c
> @@ -4,18 +4,54 @@
>  #include <linux/device.h>
>  #include <linux/errno.h>
>  #include <linux/fsi-occ.h>
> +#include <linux/mm.h>
>  #include <linux/module.h>
> +#include <linux/mutex.h>
>  #include <linux/platform_device.h>
> +#include <linux/string.h>
> +#include <linux/sysfs.h>
>  
>  #include "common.h"
>  
> +enum sbe_error_state {
> +	SBE_ERROR_NONE = 0,
> +	SBE_ERROR_PENDING,
> +	SBE_ERROR_COLLECTED
> +};
> +
>  struct p9_sbe_occ {
>  	struct occ occ;
> +	int sbe_error;
> +	void *ffdc;
> +	size_t ffdc_len;
> +	size_t ffdc_size;
> +	struct mutex sbe_error_lock;	/* lock access to ffdc data */
> +	u32 no_ffdc_magic;
>  	struct device *sbe;
>  };
>  
>  #define to_p9_sbe_occ(x)	container_of((x), struct p9_sbe_occ, occ)
>  
> +static ssize_t sbe_error_read(struct file *filp, struct kobject *kobj,
> +			      struct bin_attribute *battr, char *buf,
> +			      loff_t pos, size_t count)
> +{
> +	ssize_t rc = 0;
> +	struct occ *occ = dev_get_drvdata(kobj_to_dev(kobj));
> +	struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
> +
> +	mutex_lock(&ctx->sbe_error_lock);
> +	if (ctx->sbe_error == SBE_ERROR_PENDING) {
> +		rc = memory_read_from_buffer(buf, count, &pos, ctx->ffdc,
> +					     ctx->ffdc_len);
> +		ctx->sbe_error = SBE_ERROR_COLLECTED;
> +	}
> +	mutex_unlock(&ctx->sbe_error_lock);
> +
> +	return rc;
> +}
> +static BIN_ATTR_RO(sbe_error, OCC_MAX_RESP_WORDS * 4);
> +
>  static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t len)
>  {
>  	struct occ_response *resp = &occ->resp;
> @@ -24,8 +60,47 @@ static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t len)
>  	int rc;
>  
>  	rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
> -	if (rc < 0)
> +	if (rc < 0) {
> +		if (resp_len) {
> +			bool notify = false;
> +
> +			mutex_lock(&ctx->sbe_error_lock);
> +			if (ctx->sbe_error != SBE_ERROR_PENDING)
> +				notify = true;
> +			ctx->sbe_error = SBE_ERROR_PENDING;
> +
> +			if (resp_len > ctx->ffdc_size) {
> +				if (ctx->ffdc_size)
> +					kvfree(ctx->ffdc);
> +				ctx->ffdc = kvmalloc(resp_len, GFP_KERNEL);
> +				if (!ctx->ffdc) {
> +					ctx->ffdc_size = 0;
> +					ctx->ffdc_len = sizeof(u32);
> +					ctx->ffdc = &ctx->no_ffdc_magic;
> +					goto unlock;
> +				}
> +
> +				ctx->ffdc_size = resp_len;
> +			}
> +
> +			ctx->ffdc_len = resp_len;
> +			memcpy(ctx->ffdc, resp, resp_len);
> +
> +unlock:
> +			mutex_unlock(&ctx->sbe_error_lock);
> +
> +			if (notify)
> +				sysfs_notify(&occ->bus_dev->kobj, NULL,
> +					     bin_attr_sbe_error.attr.name);
> +		}
> +
>  		return rc;
> +	}
> +
> +	mutex_lock(&ctx->sbe_error_lock);
> +	if (ctx->sbe_error == SBE_ERROR_COLLECTED)
> +		ctx->sbe_error = SBE_ERROR_NONE;
> +	mutex_unlock(&ctx->sbe_error_lock);

I am not entirely sure I understand the benefit of SBE_ERROR_COLLECTED.
Can you explain why it is needed, and why the status is not just set
to SBE_ERROR_NONE after the error data was collected ?

>  
>  	switch (resp->return_status) {
>  	case OCC_RESP_CMD_IN_PRG:
> @@ -65,6 +140,13 @@ static int p9_sbe_occ_probe(struct platform_device *pdev)
>  	if (!ctx)
>  		return -ENOMEM;
>  
> +	ctx->no_ffdc_magic = OCC_NO_FFDC_MAGIC;

This is ... odd. Why not just return a file size of 0 if there is no data ?
The binary file is an ABI and needs to be documented, including the use
of this "magic". The use of that magic needs to be explained because it
does add a lot of what sems to be unnecessary complexity to the code.

Besides, most of that complexity seems unnecessary: If the magic is really
needed, the read code could just write it into the buffer if ctx->ffdc
is NULL. There is a lot of complexity just to avoid an if statement in
sbe_error_read().

> +	ctx->sbe_error = SBE_ERROR_NONE;
> +	ctx->ffdc = &ctx->no_ffdc_magic;
> +	ctx->ffdc_len = sizeof(u32);
> +	ctx->ffdc_size = 0;
> +	mutex_init(&ctx->sbe_error_lock);
> +
>  	ctx->sbe = pdev->dev.parent;
>  	occ = &ctx->occ;
>  	occ->bus_dev = &pdev->dev;
> @@ -78,6 +160,15 @@ static int p9_sbe_occ_probe(struct platform_device *pdev)
>  	if (rc == -ESHUTDOWN)
>  		rc = -ENODEV;	/* Host is shutdown, don't spew errors */
>  
> +	if (!rc) {
> +		rc = device_create_bin_file(occ->bus_dev, &bin_attr_sbe_error);
> +		if (rc) {
> +			dev_warn(occ->bus_dev,
> +				 "failed to create SBE error ffdc file\n");
> +			rc = 0;
> +		}
> +	}
> +
>  	return rc;
>  }
>  
> @@ -86,9 +177,14 @@ static int p9_sbe_occ_remove(struct platform_device *pdev)
>  	struct occ *occ = platform_get_drvdata(pdev);
>  	struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
>  
> +	device_remove_bin_file(occ->bus_dev, &bin_attr_sbe_error);
> +
>  	ctx->sbe = NULL;
>  	occ_shutdown(occ);
>  
> +	if (ctx->ffdc_size)
> +		kvfree(ctx->ffdc);
> +
>  	return 0;
>  }
>  
> -- 
> 2.27.0
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-15 16:13   ` Guenter Roeck
@ 2021-09-15 21:11     ` Eddie James
  2021-09-16  7:27       ` Joel Stanley
  0 siblings, 1 reply; 11+ messages in thread
From: Eddie James @ 2021-09-15 21:11 UTC (permalink / raw)
  To: Guenter Roeck, joel
  Cc: linux-fsi, linux-hwmon, linux-kernel, jdelvare, alistair, jk

On Wed, 2021-09-15 at 09:13 -0700, Guenter Roeck wrote:
> On Tue, Sep 14, 2021 at 04:35:43PM -0500, Eddie James wrote:
> > Save any FFDC provided by the OCC driver, and provide it to
> > userspace
> > through a binary sysfs entry. Do some basic state management to
> > ensure that userspace can always collect the data if there was an
> > error. Notify polling userspace when there is an error too.
> > 
> > Signed-off-by: Eddie James <eajames@linux.ibm.com>
> 
> This is now the 2nd series that we have pending, and the first series
> (from July) still didn't make it into the upstream kernel because the
> fsi code
> seems to go nowhere. Any chance to address that ?

Yes... Joel, can we merge that? I don't have any comments to address.

> 
> Additional comment inline.
> 
> Thanks,
> Guenter
> 
> > ---
> >  drivers/hwmon/occ/p9_sbe.c | 98
> > +++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 97 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/hwmon/occ/p9_sbe.c
> > b/drivers/hwmon/occ/p9_sbe.c
> > index 9709f2b9c052..505f489832a4 100644
> > --- a/drivers/hwmon/occ/p9_sbe.c
> > +++ b/drivers/hwmon/occ/p9_sbe.c
> > @@ -4,18 +4,54 @@
> >  #include <linux/device.h>
> >  #include <linux/errno.h>
> >  #include <linux/fsi-occ.h>
> > +#include <linux/mm.h>
> >  #include <linux/module.h>
> > +#include <linux/mutex.h>
> >  #include <linux/platform_device.h>
> > +#include <linux/string.h>
> > +#include <linux/sysfs.h>
> >  
> >  #include "common.h"
> >  
> > +enum sbe_error_state {
> > +	SBE_ERROR_NONE = 0,
> > +	SBE_ERROR_PENDING,
> > +	SBE_ERROR_COLLECTED
> > +};
> > +
> >  struct p9_sbe_occ {
> >  	struct occ occ;
> > +	int sbe_error;
> > +	void *ffdc;
> > +	size_t ffdc_len;
> > +	size_t ffdc_size;
> > +	struct mutex sbe_error_lock;	/* lock access to ffdc data
> > */
> > +	u32 no_ffdc_magic;
> >  	struct device *sbe;
> >  };
> >  
> >  #define to_p9_sbe_occ(x)	container_of((x), struct p9_sbe_occ,
> > occ)
> >  
> > +static ssize_t sbe_error_read(struct file *filp, struct kobject
> > *kobj,
> > +			      struct bin_attribute *battr, char *buf,
> > +			      loff_t pos, size_t count)
> > +{
> > +	ssize_t rc = 0;
> > +	struct occ *occ = dev_get_drvdata(kobj_to_dev(kobj));
> > +	struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
> > +
> > +	mutex_lock(&ctx->sbe_error_lock);
> > +	if (ctx->sbe_error == SBE_ERROR_PENDING) {
> > +		rc = memory_read_from_buffer(buf, count, &pos, ctx-
> > >ffdc,
> > +					     ctx->ffdc_len);
> > +		ctx->sbe_error = SBE_ERROR_COLLECTED;
> > +	}
> > +	mutex_unlock(&ctx->sbe_error_lock);
> > +
> > +	return rc;
> > +}
> > +static BIN_ATTR_RO(sbe_error, OCC_MAX_RESP_WORDS * 4);
> > +
> >  static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t
> > len)
> >  {
> >  	struct occ_response *resp = &occ->resp;
> > @@ -24,8 +60,47 @@ static int p9_sbe_occ_send_cmd(struct occ *occ,
> > u8 *cmd, size_t len)
> >  	int rc;
> >  
> >  	rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
> > -	if (rc < 0)
> > +	if (rc < 0) {
> > +		if (resp_len) {
> > +			bool notify = false;
> > +
> > +			mutex_lock(&ctx->sbe_error_lock);
> > +			if (ctx->sbe_error != SBE_ERROR_PENDING)
> > +				notify = true;
> > +			ctx->sbe_error = SBE_ERROR_PENDING;
> > +
> > +			if (resp_len > ctx->ffdc_size) {
> > +				if (ctx->ffdc_size)
> > +					kvfree(ctx->ffdc);
> > +				ctx->ffdc = kvmalloc(resp_len,
> > GFP_KERNEL);
> > +				if (!ctx->ffdc) {
> > +					ctx->ffdc_size = 0;
> > +					ctx->ffdc_len = sizeof(u32);
> > +					ctx->ffdc = &ctx-
> > >no_ffdc_magic;
> > +					goto unlock;
> > +				}
> > +
> > +				ctx->ffdc_size = resp_len;
> > +			}
> > +
> > +			ctx->ffdc_len = resp_len;
> > +			memcpy(ctx->ffdc, resp, resp_len);
> > +
> > +unlock:
> > +			mutex_unlock(&ctx->sbe_error_lock);
> > +
> > +			if (notify)
> > +				sysfs_notify(&occ->bus_dev->kobj, NULL,
> > +					     bin_attr_sbe_error.attr.na
> > me);
> > +		}
> > +
> >  		return rc;
> > +	}
> > +
> > +	mutex_lock(&ctx->sbe_error_lock);
> > +	if (ctx->sbe_error == SBE_ERROR_COLLECTED)
> > +		ctx->sbe_error = SBE_ERROR_NONE;
> > +	mutex_unlock(&ctx->sbe_error_lock);
> 
> I am not entirely sure I understand the benefit of
> SBE_ERROR_COLLECTED.
> Can you explain why it is needed, and why the status is not just set
> to SBE_ERROR_NONE after the error data was collected ?

The purpose was to make sure the data can be collected even if a
successful transfer (which clears the flag) comes through before the
user comes and reads the file. If the error is just set to NONE, then
the user might never see it, with the current implementation. I think I
will drop the state management though and just return the last error
data.

> 
> >  
> >  	switch (resp->return_status) {
> >  	case OCC_RESP_CMD_IN_PRG:
> > @@ -65,6 +140,13 @@ static int p9_sbe_occ_probe(struct
> > platform_device *pdev)
> >  	if (!ctx)
> >  		return -ENOMEM;
> >  
> > +	ctx->no_ffdc_magic = OCC_NO_FFDC_MAGIC;
> 
> This is ... odd. Why not just return a file size of 0 if there is no
> data ?
> The binary file is an ABI and needs to be documented, including the
> use
> of this "magic". The use of that magic needs to be explained because
> it
> does add a lot of what sems to be unnecessary complexity to the code.
> 
> Besides, most of that complexity seems unnecessary: If the magic is
> really
> needed, the read code could just write it into the buffer if ctx-
> >ffdc
> is NULL. There is a lot of complexity just to avoid an if statement
> in
> sbe_error_read().

Yea, I will admit this is pretty awkward. The reason for all this is
because I was trying to use a single sysfs entry to communicate both
whether or not there is an error at all, and the data from the error.
So returning  file size 0 means "no error" and then we can't capture
the case where there is an error but there is no data.

So on second thought, I should probably use two sysfs entries: one to
indicate if there is an error, and the other to report the data (if
there is any). There is the existing OCC error file of course, but
that's supposed to be for actual OCC response errors, so I will have to
investigate if it can serve both purposes.

Thanks for the review, Guenter!
Eddie

> 
> > +	ctx->sbe_error = SBE_ERROR_NONE;
> > +	ctx->ffdc = &ctx->no_ffdc_magic;
> > +	ctx->ffdc_len = sizeof(u32);
> > +	ctx->ffdc_size = 0;
> > +	mutex_init(&ctx->sbe_error_lock);
> > +
> >  	ctx->sbe = pdev->dev.parent;
> >  	occ = &ctx->occ;
> >  	occ->bus_dev = &pdev->dev;
> > @@ -78,6 +160,15 @@ static int p9_sbe_occ_probe(struct
> > platform_device *pdev)
> >  	if (rc == -ESHUTDOWN)
> >  		rc = -ENODEV;	/* Host is shutdown, don't spew
> > errors */
> >  
> > +	if (!rc) {
> > +		rc = device_create_bin_file(occ->bus_dev,
> > &bin_attr_sbe_error);
> > +		if (rc) {
> > +			dev_warn(occ->bus_dev,
> > +				 "failed to create SBE error ffdc
> > file\n");
> > +			rc = 0;
> > +		}
> > +	}
> > +
> >  	return rc;
> >  }
> >  
> > @@ -86,9 +177,14 @@ static int p9_sbe_occ_remove(struct
> > platform_device *pdev)
> >  	struct occ *occ = platform_get_drvdata(pdev);
> >  	struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
> >  
> > +	device_remove_bin_file(occ->bus_dev, &bin_attr_sbe_error);
> > +
> >  	ctx->sbe = NULL;
> >  	occ_shutdown(occ);
> >  
> > +	if (ctx->ffdc_size)
> > +		kvfree(ctx->ffdc);
> > +
> >  	return 0;
> >  }
> >  
> > -- 
> > 2.27.0
> > 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-14 21:35 ` [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs Eddie James
  2021-09-15 16:13   ` Guenter Roeck
@ 2021-09-16  0:17   ` Jeremy Kerr
  2021-09-17 18:44     ` Eddie James
  2021-09-21 15:37   ` Greg KH
  2 siblings, 1 reply; 11+ messages in thread
From: Jeremy Kerr @ 2021-09-16  0:17 UTC (permalink / raw)
  To: Eddie James, linux-fsi
  Cc: linux-hwmon, linux-kernel, joel, linux, jdelvare, alistair

Hi Eddie,

> Save any FFDC provided by the OCC driver, and provide it to userspace
> through a binary sysfs entry. Do some basic state management to
> ensure that userspace can always collect the data if there was an
> error. Notify polling userspace when there is an error too.

Super! Some comments inline:

> +enum sbe_error_state {
> +       SBE_ERROR_NONE = 0,
> +       SBE_ERROR_PENDING,
> +       SBE_ERROR_COLLECTED
> +};
> +
>  struct p9_sbe_occ {
>         struct occ occ;
> +       int sbe_error;

Use the enum here?

> +       void *ffdc;
> +       size_t ffdc_len;
> +       size_t ffdc_size;
> +       struct mutex sbe_error_lock;    /* lock access to ffdc data */
> +       u32 no_ffdc_magic;
>         struct device *sbe;
>  };
>  
>  #define to_p9_sbe_occ(x)       container_of((x), struct p9_sbe_occ, occ)
>  
> +static ssize_t sbe_error_read(struct file *filp, struct kobject *kobj,
> +                             struct bin_attribute *battr, char *buf,
> +                             loff_t pos, size_t count)
> +{
> +       ssize_t rc = 0;
> +       struct occ *occ = dev_get_drvdata(kobj_to_dev(kobj));
> +       struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
> +
> +       mutex_lock(&ctx->sbe_error_lock);
> +       if (ctx->sbe_error == SBE_ERROR_PENDING) {
> +               rc = memory_read_from_buffer(buf, count, &pos, ctx->ffdc,
> +                                            ctx->ffdc_len);
> +               ctx->sbe_error = SBE_ERROR_COLLECTED;
> +       }
> +       mutex_unlock(&ctx->sbe_error_lock);
> +
> +       return rc;
> +}

So any read from this file will clear out the FFDC data, making partial
reads impossible. As a least-intrusive change, could we set
SBE_ERROR_COLLECTED on write instead?

Or is there a better interface (a pipe?) that allows multiple FFDC
captures, destroyed on full consume, without odd read/write side
effects?

>         rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
> -       if (rc < 0)
> +       if (rc < 0) {
> +               if (resp_len) {
> +                       bool notify = false;
> +
> +                       mutex_lock(&ctx->sbe_error_lock);
> +                       if (ctx->sbe_error != SBE_ERROR_PENDING)
> +                               notify = true;
> +                       ctx->sbe_error = SBE_ERROR_PENDING;

                          [...]

> +                       ctx->ffdc_len = resp_len;
> +                       memcpy(ctx->ffdc, resp, resp_len);

This will clear out the previous error it if hasn't been collected by
userspace. Is that really what you want for *first* fail data capture?
:)

Cheers,


Jeremy


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-15 21:11     ` Eddie James
@ 2021-09-16  7:27       ` Joel Stanley
  0 siblings, 0 replies; 11+ messages in thread
From: Joel Stanley @ 2021-09-16  7:27 UTC (permalink / raw)
  To: Eddie James
  Cc: Guenter Roeck, linux-fsi, linux-hwmon, Linux Kernel Mailing List,
	Jean Delvare, Alistair Popple, Jeremy Kerr

On Wed, 15 Sept 2021 at 21:11, Eddie James <eajames@linux.ibm.com> wrote:
>
> On Wed, 2021-09-15 at 09:13 -0700, Guenter Roeck wrote:
> > On Tue, Sep 14, 2021 at 04:35:43PM -0500, Eddie James wrote:
> > > Save any FFDC provided by the OCC driver, and provide it to
> > > userspace
> > > through a binary sysfs entry. Do some basic state management to
> > > ensure that userspace can always collect the data if there was an
> > > error. Notify polling userspace when there is an error too.
> > >
> > > Signed-off-by: Eddie James <eajames@linux.ibm.com>
> >
> > This is now the 2nd series that we have pending, and the first series
> > (from July) still didn't make it into the upstream kernel because the
> > fsi code
> > seems to go nowhere. Any chance to address that ?
>
> Yes... Joel, can we merge that? I don't have any comments to address.

Thanks for the reminder. We have a queue of FSI patches to send out. I
hope to get that done this coming merge window.

Cheers,

Joel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-16  0:17   ` Jeremy Kerr
@ 2021-09-17 18:44     ` Eddie James
  0 siblings, 0 replies; 11+ messages in thread
From: Eddie James @ 2021-09-17 18:44 UTC (permalink / raw)
  To: Jeremy Kerr, linux-fsi
  Cc: linux-hwmon, linux-kernel, joel, linux, jdelvare, alistair

On Thu, 2021-09-16 at 08:17 +0800, Jeremy Kerr wrote:
> Hi Eddie,
> 
> > Save any FFDC provided by the OCC driver, and provide it to
> > userspace
> > through a binary sysfs entry. Do some basic state management to
> > ensure that userspace can always collect the data if there was an
> > error. Notify polling userspace when there is an error too.
> 
> Super! Some comments inline:
> 
> > +enum sbe_error_state {
> > +       SBE_ERROR_NONE = 0,
> > +       SBE_ERROR_PENDING,
> > +       SBE_ERROR_COLLECTED
> > +};
> > +
> >  struct p9_sbe_occ {
> >         struct occ occ;
> > +       int sbe_error;
> 
> Use the enum here?

Yep :) Though I think I will switch to a bool; as I wrote to Guenter,
the extra "collected" state isn't necessary if I stop trying to report
two things (the FFDC itself and whether or not there is an error at
all) with one interface.

> 
> > +       void *ffdc;
> > +       size_t ffdc_len;
> > +       size_t ffdc_size;
> > +       struct mutex sbe_error_lock;    /* lock access to ffdc data
> > */
> > +       u32 no_ffdc_magic;
> >         struct device *sbe;
> >  };
> >  
> >  #define to_p9_sbe_occ(x)       container_of((x), struct
> > p9_sbe_occ, occ)
> >  
> > +static ssize_t sbe_error_read(struct file *filp, struct kobject
> > *kobj,
> > +                             struct bin_attribute *battr, char
> > *buf,
> > +                             loff_t pos, size_t count)
> > +{
> > +       ssize_t rc = 0;
> > +       struct occ *occ = dev_get_drvdata(kobj_to_dev(kobj));
> > +       struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
> > +
> > +       mutex_lock(&ctx->sbe_error_lock);
> > +       if (ctx->sbe_error == SBE_ERROR_PENDING) {
> > +               rc = memory_read_from_buffer(buf, count, &pos, ctx-
> > >ffdc,
> > +                                            ctx->ffdc_len);
> > +               ctx->sbe_error = SBE_ERROR_COLLECTED;
> > +       }
> > +       mutex_unlock(&ctx->sbe_error_lock);
> > +
> > +       return rc;
> > +}
> 
> So any read from this file will clear out the FFDC data, making
> partial
> reads impossible. As a least-intrusive change, could we set
> SBE_ERROR_COLLECTED on write instead?

Good point. Write would work. How about resetting the error flag once
pos >= size though, for the sake of simplicity?

> 
> Or is there a better interface (a pipe?) that allows multiple FFDC
> captures, destroyed on full consume, without odd read/write side
> effects?

I tried to look into pipes, but I'm pretty unclear on how to set one
up. Doesn't appear as though they are used in kernel much, especially
as an interface to userspace.
I'm just not sure its worth having more than one FFDC packet available.
There would always have to be a maximum number of them anyway to put a
bound on memory usage. We're somewhat helped here by the fact that OCC
operations will go out a maximum of once a second, so that's hopefully
plenty of time for userspace to notice the error and pick up the FFDC.

> 
> >         rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
> > -       if (rc < 0)
> > +       if (rc < 0) {
> > +               if (resp_len) {
> > +                       bool notify = false;
> > +
> > +                       mutex_lock(&ctx->sbe_error_lock);
> > +                       if (ctx->sbe_error != SBE_ERROR_PENDING)
> > +                               notify = true;
> > +                       ctx->sbe_error = SBE_ERROR_PENDING;
> 
>                           [...]
> 
> > +                       ctx->ffdc_len = resp_len;
> > +                       memcpy(ctx->ffdc, resp, resp_len);
> 
> This will clear out the previous error it if hasn't been collected by
> userspace. Is that really what you want for *first* fail data
> capture?
> :)

Ah, good point. I will fix that!

Thanks for the review Jeremy!

Eddie

> 
> Cheers,
> 
> 
> Jeremy
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-14 21:35 ` [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs Eddie James
  2021-09-15 16:13   ` Guenter Roeck
  2021-09-16  0:17   ` Jeremy Kerr
@ 2021-09-21 15:37   ` Greg KH
  2021-09-21 15:57     ` Eddie James
  2 siblings, 1 reply; 11+ messages in thread
From: Greg KH @ 2021-09-21 15:37 UTC (permalink / raw)
  To: Eddie James
  Cc: linux-fsi, linux-hwmon, linux-kernel, joel, linux, jdelvare,
	alistair, jk

On Tue, Sep 14, 2021 at 04:35:43PM -0500, Eddie James wrote:
> Save any FFDC provided by the OCC driver, and provide it to userspace
> through a binary sysfs entry. Do some basic state management to
> ensure that userspace can always collect the data if there was an
> error. Notify polling userspace when there is an error too.
> 
> Signed-off-by: Eddie James <eajames@linux.ibm.com>
> ---
>  drivers/hwmon/occ/p9_sbe.c | 98 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 97 insertions(+), 1 deletion(-)

You forgot a Documentation/ABI/ entry :(

Binary sysfs files are for "pass through to the hardware" only, you
should not be dumping kernel data to userspace through them.  I can't
really determine if this is the case here or not, as there's no
documentation saying what you are trying to represent here...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs
  2021-09-21 15:37   ` Greg KH
@ 2021-09-21 15:57     ` Eddie James
  0 siblings, 0 replies; 11+ messages in thread
From: Eddie James @ 2021-09-21 15:57 UTC (permalink / raw)
  To: Greg KH
  Cc: linux-fsi, linux-hwmon, linux-kernel, joel, linux, jdelvare,
	alistair, jk


On 9/21/21 10:37 AM, Greg KH wrote:
> On Tue, Sep 14, 2021 at 04:35:43PM -0500, Eddie James wrote:
>> Save any FFDC provided by the OCC driver, and provide it to userspace
>> through a binary sysfs entry. Do some basic state management to
>> ensure that userspace can always collect the data if there was an
>> error. Notify polling userspace when there is an error too.
>>
>> Signed-off-by: Eddie James <eajames@linux.ibm.com>
>> ---
>>   drivers/hwmon/occ/p9_sbe.c | 98 +++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 97 insertions(+), 1 deletion(-)
> You forgot a Documentation/ABI/ entry :(
>
> Binary sysfs files are for "pass through to the hardware" only, you
> should not be dumping kernel data to userspace through them.  I can't
> really determine if this is the case here or not, as there's no
> documentation saying what you are trying to represent here...


Ok oops. I will add an entry, thanks.

I believe this qualifies for binary sysfs then, since the data is an 
error response from the hardware directly.


Thanks,

Eddie


>
> thanks,
>
> greg k-h

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-09-21 15:57 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-14 21:35 [PATCH 0/3] occ: fsi and hwmon: Extract and provide the SBEFIFO FFDC Eddie James
2021-09-14 21:35 ` [PATCH 1/3] fsi: occ: Use a large buffer for responses Eddie James
2021-09-14 21:35 ` [PATCH 2/3] fsi: occ: Store the SBEFIFO FFDC in the user response buffer Eddie James
2021-09-14 21:35 ` [PATCH 3/3] hwmon: (occ) Provide the SBEFIFO FFDC in binary sysfs Eddie James
2021-09-15 16:13   ` Guenter Roeck
2021-09-15 21:11     ` Eddie James
2021-09-16  7:27       ` Joel Stanley
2021-09-16  0:17   ` Jeremy Kerr
2021-09-17 18:44     ` Eddie James
2021-09-21 15:37   ` Greg KH
2021-09-21 15:57     ` Eddie James

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).