All of lore.kernel.org
 help / color / mirror / Atom feed
From: Igor Russkikh <irusskikh@marvell.com>
To: <netdev@vger.kernel.org>
Cc: "David S . Miller" <davem@davemloft.net>,
	Ariel Elior <aelior@marvell.com>,
	Michal Kalderon <mkalderon@marvell.com>,
	Denis Bolotin <dbolotin@marvell.com>,
	Jakub Kicinski <kuba@kernel.org>,
	Igor Russkikh <irusskikh@marvell.com>,
	Ariel Elior <ariel.elior@marvell.com>,
	"Michal Kalderon" <michal.kalderon@marvell.com>
Subject: [PATCH v2 net-next 10/11] net: qed: introduce critical hardware error handler
Date: Thu, 14 May 2020 12:57:26 +0300	[thread overview]
Message-ID: <20200514095727.1361-11-irusskikh@marvell.com> (raw)
In-Reply-To: <20200514095727.1361-1-irusskikh@marvell.com>

The MCP may signal the driver about a generic critical failure.
In that case the driver collects the mdump information (get_retain),
pushes it to the logs, and triggers a generic notification for the
"hardware attention" event.

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
Signed-off-by: Igor Russkikh <irusskikh@marvell.com>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h |  28 +++++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 113 ++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h |  13 +++
 3 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index ab042b835797..f00460d00cab 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -12400,6 +12400,13 @@ struct load_rsp_stc {
 #define LOAD_RSP_FLAGS0_DRV_EXISTS      (0x1 << 0)
 };
 
+struct mdump_retain_data_stc {
+	u32 valid;
+	u32 epoch;
+	u32 pf;
+	u32 status;
+};
+
 union drv_union_data {
 	u32 ver_str[MCP_DRV_VER_STR_SIZE_DWORD];
 	struct mcp_mac wol_mac;
@@ -12488,6 +12495,8 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_BIST_TEST			0x001e0000
 #define DRV_MSG_CODE_SET_LED_MODE		0x00200000
 #define DRV_MSG_CODE_RESOURCE_CMD		0x00230000
+/* Send crash dump commands with param[3:0] - opcode */
+#define DRV_MSG_CODE_MDUMP_CMD			0x00250000
 #define DRV_MSG_CODE_GET_TLV_DONE		0x002f0000
 #define DRV_MSG_CODE_GET_ENGINE_CONFIG		0x00370000
 #define DRV_MSG_CODE_GET_PPFID_BITMAP		0x43000000
@@ -12519,6 +12528,21 @@ struct public_drv_mb {
 
 #define RESOURCE_DUMP				0
 
+/* DRV_MSG_CODE_MDUMP_CMD parameters */
+#define MDUMP_DRV_PARAM_OPCODE_MASK             0x0000000f
+#define DRV_MSG_CODE_MDUMP_ACK                  0x01
+#define DRV_MSG_CODE_MDUMP_SET_VALUES           0x02
+#define DRV_MSG_CODE_MDUMP_TRIGGER              0x03
+#define DRV_MSG_CODE_MDUMP_GET_CONFIG           0x04
+#define DRV_MSG_CODE_MDUMP_SET_ENABLE           0x05
+#define DRV_MSG_CODE_MDUMP_CLEAR_LOGS           0x06
+#define DRV_MSG_CODE_MDUMP_GET_RETAIN           0x07
+#define DRV_MSG_CODE_MDUMP_CLR_RETAIN           0x08
+
+#define DRV_MSG_CODE_HW_DUMP_TRIGGER            0x0a
+#define DRV_MSG_CODE_MDUMP_GEN_MDUMP2           0x0b
+#define DRV_MSG_CODE_MDUMP_FREE_MDUMP2          0x0c
+
 #define DRV_MSG_CODE_GET_PF_RDMA_PROTOCOL	0x002b0000
 #define DRV_MSG_CODE_OS_WOL			0x002e0000
 
@@ -12697,6 +12721,8 @@ struct public_drv_mb {
 #define FW_MSG_CODE_DEBUG_NOT_ENABLED		0xb00a0000
 #define FW_MSG_CODE_DEBUG_DATA_SEND_OK		0xb00b0000
 
+#define FW_MSG_CODE_MDUMP_INVALID_CMD		0x00030000
+
 	u32 fw_mb_param;
 #define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_MASK	0xFFFF0000
 #define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT	16
@@ -12763,7 +12789,7 @@ enum MFW_DRV_MSG_TYPE {
 	MFW_DRV_MSG_GET_RDMA_STATS,
 	MFW_DRV_MSG_FAILURE_DETECTED,
 	MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE,
-	MFW_DRV_MSG_BW_UPDATE11,
+	MFW_DRV_MSG_CRITICAL_ERROR_OCCURRED,
 	MFW_DRV_MSG_RESERVED,
 	MFW_DRV_MSG_GET_TLV_REQ,
 	MFW_DRV_MSG_OEM_CFG_UPDATE,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 0058e804efc3..8a0bbc7d4b24 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1717,6 +1717,116 @@ static void qed_mcp_handle_fan_failure(struct qed_hwfn *p_hwfn,
 			  "Fan failure was detected on the network interface card and it's going to be shut down.\n");
 }
 
+struct qed_mdump_cmd_params {
+	u32 cmd;
+	void *p_data_src;
+	u8 data_src_size;
+	void *p_data_dst;
+	u8 data_dst_size;
+	u32 mcp_resp;
+};
+
+static int
+qed_mcp_mdump_cmd(struct qed_hwfn *p_hwfn,
+		  struct qed_ptt *p_ptt,
+		  struct qed_mdump_cmd_params *p_mdump_cmd_params)
+{
+	struct qed_mcp_mb_params mb_params;
+	int rc;
+
+	memset(&mb_params, 0, sizeof(mb_params));
+	mb_params.cmd = DRV_MSG_CODE_MDUMP_CMD;
+	mb_params.param = p_mdump_cmd_params->cmd;
+	mb_params.p_data_src = p_mdump_cmd_params->p_data_src;
+	mb_params.data_src_size = p_mdump_cmd_params->data_src_size;
+	mb_params.p_data_dst = p_mdump_cmd_params->p_data_dst;
+	mb_params.data_dst_size = p_mdump_cmd_params->data_dst_size;
+	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+	if (rc)
+		return rc;
+
+	p_mdump_cmd_params->mcp_resp = mb_params.mcp_resp;
+
+	if (p_mdump_cmd_params->mcp_resp == FW_MSG_CODE_MDUMP_INVALID_CMD) {
+		DP_INFO(p_hwfn,
+			"The mdump sub command is unsupported by the MFW [mdump_cmd 0x%x]\n",
+			p_mdump_cmd_params->cmd);
+		rc = -EOPNOTSUPP;
+	} else if (p_mdump_cmd_params->mcp_resp == FW_MSG_CODE_UNSUPPORTED) {
+		DP_INFO(p_hwfn,
+			"The mdump command is not supported by the MFW\n");
+		rc = -EOPNOTSUPP;
+	}
+
+	return rc;
+}
+
+static int qed_mcp_mdump_ack(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_mdump_cmd_params mdump_cmd_params;
+
+	memset(&mdump_cmd_params, 0, sizeof(mdump_cmd_params));
+	mdump_cmd_params.cmd = DRV_MSG_CODE_MDUMP_ACK;
+
+	return qed_mcp_mdump_cmd(p_hwfn, p_ptt, &mdump_cmd_params);
+}
+
+int
+qed_mcp_mdump_get_retain(struct qed_hwfn *p_hwfn,
+			 struct qed_ptt *p_ptt,
+			 struct mdump_retain_data_stc *p_mdump_retain)
+{
+	struct qed_mdump_cmd_params mdump_cmd_params;
+	int rc;
+
+	memset(&mdump_cmd_params, 0, sizeof(mdump_cmd_params));
+	mdump_cmd_params.cmd = DRV_MSG_CODE_MDUMP_GET_RETAIN;
+	mdump_cmd_params.p_data_dst = p_mdump_retain;
+	mdump_cmd_params.data_dst_size = sizeof(*p_mdump_retain);
+
+	rc = qed_mcp_mdump_cmd(p_hwfn, p_ptt, &mdump_cmd_params);
+	if (rc)
+		return rc;
+
+	if (mdump_cmd_params.mcp_resp != FW_MSG_CODE_OK) {
+		DP_INFO(p_hwfn,
+			"Failed to get the mdump retained data [mcp_resp 0x%x]\n",
+			mdump_cmd_params.mcp_resp);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void qed_mcp_handle_critical_error(struct qed_hwfn *p_hwfn,
+					  struct qed_ptt *p_ptt)
+{
+	struct mdump_retain_data_stc mdump_retain;
+	int rc;
+
+	/* In CMT mode - no need for more than a single acknowledgment to the
+	 * MFW, and no more than a single notification to the upper driver.
+	 */
+	if (p_hwfn != QED_LEADING_HWFN(p_hwfn->cdev))
+		return;
+
+	rc = qed_mcp_mdump_get_retain(p_hwfn, p_ptt, &mdump_retain);
+	if (rc == 0 && mdump_retain.valid)
+		DP_NOTICE(p_hwfn,
+			  "The MFW notified that a critical error occurred in the device [epoch 0x%08x, pf 0x%x, status 0x%08x]\n",
+			  mdump_retain.epoch,
+			  mdump_retain.pf, mdump_retain.status);
+	else
+		DP_NOTICE(p_hwfn,
+			  "The MFW notified that a critical error occurred in the device\n");
+
+	DP_NOTICE(p_hwfn,
+		  "Acknowledging the notification to not allow the MFW crash dump [driver debug data collection is preferable]\n");
+	qed_mcp_mdump_ack(p_hwfn, p_ptt);
+
+	qed_hw_err_notify(p_hwfn, p_ptt, QED_HW_ERR_HW_ATTN, NULL);
+}
+
 void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
 	struct public_func shmem_info;
@@ -1866,6 +1976,9 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 		case MFW_DRV_MSG_FAILURE_DETECTED:
 			qed_mcp_handle_fan_failure(p_hwfn, p_ptt);
 			break;
+		case MFW_DRV_MSG_CRITICAL_ERROR_OCCURRED:
+			qed_mcp_handle_critical_error(p_hwfn, p_ptt);
+			break;
 		case MFW_DRV_MSG_GET_TLV_REQ:
 			qed_mfw_tlv_req(p_hwfn);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index bc248418a5f5..5750b4c5ef63 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -1016,6 +1016,19 @@ int __qed_configure_pf_min_bandwidth(struct qed_hwfn *p_hwfn,
 int qed_mcp_mask_parities(struct qed_hwfn *p_hwfn,
 			  struct qed_ptt *p_ptt, u32 mask_parities);
 
+/* @brief - Gets the mdump retained data from the MFW.
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param p_mdump_retain
+ *
+ * @return 0 upon success.
+ */
+int
+qed_mcp_mdump_get_retain(struct qed_hwfn *p_hwfn,
+			 struct qed_ptt *p_ptt,
+			 struct mdump_retain_data_stc *p_mdump_retain);
+
 /**
  * @brief - Sets the MFW's max value for the given resource
  *
-- 
2.17.1


  parent reply	other threads:[~2020-05-14  9:58 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-14  9:57 [PATCH v2 net-next 00/11] net: qed/qede: critical hw error handling Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 01/11] net: qed: adding hw_err states and handling Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 02/11] net: qede: add hw err scheduled handler Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 03/11] net: qed: invoke err notify on critical areas Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 04/11] net: qed: critical err reporting to management firmware Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 05/11] net: qed: cleanup debug related declarations Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 06/11] net: qed: attention clearing properties Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 07/11] net: qede: optional hw recovery procedure Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 08/11] net: qede: Implement ndo_tx_timeout Igor Russkikh
2020-05-14  9:57 ` [PATCH v2 net-next 09/11] net: qed: introduce critical fan failure handler Igor Russkikh
2020-05-14  9:57 ` Igor Russkikh [this message]
2020-05-14  9:57 ` [PATCH v2 net-next 11/11] net: qed: fix bad formatting Igor Russkikh
2020-05-14 19:06 ` [PATCH v2 net-next 00/11] net: qed/qede: critical hw error handling Jakub Kicinski
2020-05-14 19:40   ` [EXT] " Igor Russkikh
2020-05-14 20:02     ` Jakub Kicinski
2020-05-14 20:09       ` Igor Russkikh
2020-05-14 20:01 ` David Miller
2020-05-14 20:09   ` David Miller
2020-05-14 20:22     ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200514095727.1361-11-irusskikh@marvell.com \
    --to=irusskikh@marvell.com \
    --cc=aelior@marvell.com \
    --cc=ariel.elior@marvell.com \
    --cc=davem@davemloft.net \
    --cc=dbolotin@marvell.com \
    --cc=kuba@kernel.org \
    --cc=michal.kalderon@marvell.com \
    --cc=mkalderon@marvell.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.