linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next 0/5] net: hns3: add RAS compatibility adaptation solution
@ 2021-06-08 13:08 Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 1/5] net: hns3: add support for handling all errors through MSI-X Guangbin Huang
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Guangbin Huang @ 2021-06-08 13:08 UTC (permalink / raw)
  To: davem, kuba; +Cc: netdev, linux-kernel, salil.mehta, lipeng321, huangguangbin2

This patchset adds a RAS compatibility adaptation solution for new devices.


Jiaran Zhang (4):
  net: hns3: add the RAS compatibility adaptation solution
  net: hns3: add support for imp-handle ras capability
  net: hns3: update error recovery module and type
  net: hns3: add error handling compatibility during initialization

Yufeng Mo (1):
  net: hns3: add support for handling all errors through MSI-X

 drivers/net/ethernet/hisilicon/hns3/hnae3.h        |   4 +
 drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c |   3 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c |   5 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   3 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 410 +++++++++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h |  89 +++++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    |  87 +++--
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   1 +
 8 files changed, 546 insertions(+), 56 deletions(-)

-- 
2.8.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH net-next 1/5] net: hns3: add support for handling all errors through MSI-X
  2021-06-08 13:08 [PATCH net-next 0/5] net: hns3: add RAS compatibility adaptation solution Guangbin Huang
@ 2021-06-08 13:08 ` Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 2/5] net: hns3: add the RAS compatibility adaptation solution Guangbin Huang
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Guangbin Huang @ 2021-06-08 13:08 UTC (permalink / raw)
  To: davem, kuba; +Cc: netdev, linux-kernel, salil.mehta, lipeng321, huangguangbin2

From: Yufeng Mo <moyufeng@huawei.com>

Currently, hardware errors can be reported through either AER or MSI-X
mode. However, the AER mode is intended to handle only bus errors, not
hardware errors. In addition, virtual machines cannot handle AER
errors: when an AER error is reported, virtual machines will be
suspended. So add support for handling all these hardware errors
through MSI-X mode, which depends on a newer version of firmware,
and retain the handler of the AER mode for compatibility.

Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: Jiaran Zhang <zhangjiaran@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 16 ++++++++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 47 +++++++++++-----------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  1 +
 3 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index f125aa425872..540dd15d7771 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -1611,11 +1611,27 @@ static const struct hclge_hw_blk hw_blk[] = {
 	{ /* sentinel */ }
 };
 
+static void hclge_config_all_msix_error(struct hclge_dev *hdev, bool enable)
+{
+	u32 reg_val;
+
+	reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG);
+
+	if (enable)
+		reg_val |= BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B);
+	else
+		reg_val &= ~BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B);
+
+	hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val);
+}
+
 int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state)
 {
 	const struct hclge_hw_blk *module = hw_blk;
 	int ret = 0;
 
+	hclge_config_all_msix_error(hdev, state);
+
 	while (module->name) {
 		if (module->config_err_int) {
 			ret = module->config_err_int(hdev, state);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 45102681bd2a..d5be3bc50b5c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3307,11 +3307,13 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf,
 
 static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 {
-	u32 cmdq_src_reg, msix_src_reg;
+	u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg;
 
 	/* fetch the events from their corresponding regs */
 	cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
 	msix_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
+	hw_err_src_reg = hclge_read_dev(&hdev->hw,
+					HCLGE_RAS_PF_OTHER_INT_STS_REG);
 
 	/* Assumption: If by any chance reset and mailbox events are reported
 	 * together then we will only process reset event in this go and will
@@ -3339,11 +3341,11 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 		return HCLGE_VECTOR0_EVENT_RST;
 	}
 
-	/* check for vector0 msix event source */
-	if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK) {
-		*clearval = msix_src_reg;
+	/* check for vector0 msix event and hardware error event source */
+	if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK ||
+	    hw_err_src_reg & HCLGE_RAS_REG_NFE_MASK ||
+	    hw_err_src_reg & HCLGE_RAS_REG_ROCEE_ERR_MASK)
 		return HCLGE_VECTOR0_EVENT_ERR;
-	}
 
 	/* check for vector0 mailbox(=CMDQ RX) event source */
 	if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
@@ -3354,9 +3356,8 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 
 	/* print other vector0 event source */
 	dev_info(&hdev->pdev->dev,
-		 "CMDQ INT status:0x%x, other INT status:0x%x\n",
-		 cmdq_src_reg, msix_src_reg);
-	*clearval = msix_src_reg;
+		 "INT status: CMDQ(%#x) HW errors(%#x) other(%#x)\n",
+		 cmdq_src_reg, hw_err_src_reg, msix_src_reg);
 
 	return HCLGE_VECTOR0_EVENT_OTHER;
 }
@@ -3427,15 +3428,10 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
 
 	hclge_clear_event_cause(hdev, event_cause, clearval);
 
-	/* Enable interrupt if it is not cause by reset. And when
-	 * clearval equal to 0, it means interrupt status may be
-	 * cleared by hardware before driver reads status register.
-	 * For this case, vector0 interrupt also should be enabled.
-	 */
-	if (!clearval ||
-	    event_cause == HCLGE_VECTOR0_EVENT_MBX) {
+	/* Enable interrupt if it is not caused by reset event or error event */
+	if (event_cause == HCLGE_VECTOR0_EVENT_MBX ||
+	    event_cause == HCLGE_VECTOR0_EVENT_OTHER)
 		hclge_enable_vector(&hdev->misc_vector, true);
-	}
 
 	return IRQ_HANDLED;
 }
@@ -4244,22 +4240,27 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev)
 {
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
 	struct device *dev = &hdev->pdev->dev;
+	enum hnae3_reset_type reset_type;
 	u32 msix_sts_reg;
 
 	msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
-
 	if (msix_sts_reg & HCLGE_VECTOR0_REG_MSIX_MASK) {
-		if (hclge_handle_hw_msix_error(hdev,
-					       &hdev->default_reset_request))
+		if (hclge_handle_hw_msix_error
+				(hdev, &hdev->default_reset_request))
 			dev_info(dev, "received msix interrupt 0x%x\n",
 				 msix_sts_reg);
+	}
+	hclge_enable_vector(&hdev->misc_vector, true);
 
-		if (hdev->default_reset_request)
-			if (ae_dev->ops->reset_event)
-				ae_dev->ops->reset_event(hdev->pdev, NULL);
+	hclge_handle_hw_ras_error(ae_dev);
+	if (ae_dev->hw_err_reset_req) {
+		reset_type = hclge_get_reset_level(ae_dev,
+						   &ae_dev->hw_err_reset_req);
+		hclge_set_def_reset_request(ae_dev, reset_type);
 	}
 
-	hclge_enable_vector(&hdev->misc_vector, true);
+	if (hdev->default_reset_request && ae_dev->ops->reset_event)
+		ae_dev->ops->reset_event(hdev->pdev, NULL);
 }
 
 static void hclge_errhand_service_task(struct hclge_dev *hdev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 9b8abb5d7a8e..582972a6f60e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -190,6 +190,7 @@ enum HLCGE_PORT_TYPE {
 #define HCLGE_VECTOR0_IMP_RESET_INT_B	1
 #define HCLGE_VECTOR0_IMP_CMDQ_ERR_B	4U
 #define HCLGE_VECTOR0_IMP_RD_POISON_B	5U
+#define HCLGE_VECTOR0_ALL_MSIX_ERR_B	6U
 
 #define HCLGE_MAC_DEFAULT_FRAME \
 	(ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH net-next 2/5] net: hns3: add the RAS compatibility adaptation solution
  2021-06-08 13:08 [PATCH net-next 0/5] net: hns3: add RAS compatibility adaptation solution Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 1/5] net: hns3: add support for handling all errors through MSI-X Guangbin Huang
@ 2021-06-08 13:08 ` Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 3/5] net: hns3: add support for imp-handle ras capability Guangbin Huang
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Guangbin Huang @ 2021-06-08 13:08 UTC (permalink / raw)
  To: davem, kuba; +Cc: netdev, linux-kernel, salil.mehta, lipeng321, huangguangbin2

From: Jiaran Zhang <zhangjiaran@huawei.com>

To adapt to the hardware modification while keeping the driver
compatible with the original error handling, add the RAS compatibility
adaptation solution.

Add a processing branch to the driver's error handling. In the new
processing branch, NIC fault information is integrated by the IMP. An
interaction command is added between the driver and the IMP to query
and clear the fault source and interrupt source. The IMP integrates
the error information and reports the highest reset level to the driver.

Signed-off-by: Jiaran Zhang <zhangjiaran@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c |   3 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   2 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 320 +++++++++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h |  69 +++++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    |  54 +++-
 5 files changed, 409 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
index 8f6ed8577aea..614763f5e877 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
@@ -178,7 +178,8 @@ static bool hclge_is_special_opcode(u16 opcode)
 			     HCLGE_QUERY_CLEAR_MPF_RAS_INT,
 			     HCLGE_QUERY_CLEAR_PF_RAS_INT,
 			     HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT,
-			     HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT};
+			     HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT,
+			     HCLGE_QUERY_ALL_ERR_INFO};
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(spec_opcode); i++) {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index da78a6477e46..234f0a3beec1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -293,6 +293,8 @@ enum hclge_opcode_type {
 	HCLGE_QUERY_MSIX_INT_STS_BD_NUM	= 0x1513,
 	HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT	= 0x1514,
 	HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT	= 0x1515,
+	HCLGE_QUERY_ALL_ERR_BD_NUM		= 0x1516,
+	HCLGE_QUERY_ALL_ERR_INFO		= 0x1517,
 	HCLGE_CONFIG_ROCEE_RAS_INT_EN	= 0x1580,
 	HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
 	HCLGE_ROCEE_PF_RAS_INT_CMD	= 0x1584,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index 540dd15d7771..36f8055bd859 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -631,6 +631,98 @@ static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = {
 	{ /* sentinel */ }
 };
 
+static const struct hclge_hw_module_id hclge_hw_module_id_st[] = {
+	{
+		.module_id = MODULE_NONE,
+		.msg = "MODULE_NONE"
+	}, {
+		.module_id = MODULE_BIOS_COMMON,
+		.msg = "MODULE_BIOS_COMMON"
+	}, {
+		.module_id = MODULE_GE,
+		.msg = "MODULE_GE"
+	}, {
+		.module_id = MODULE_IGU_EGU,
+		.msg = "MODULE_IGU_EGU"
+	}, {
+		.module_id = MODULE_LGE,
+		.msg = "MODULE_LGE"
+	}, {
+		.module_id = MODULE_NCSI,
+		.msg = "MODULE_NCSI"
+	}, {
+		.module_id = MODULE_PPP,
+		.msg = "MODULE_PPP"
+	}, {
+		.module_id = MODULE_QCN,
+		.msg = "MODULE_QCN"
+	}, {
+		.module_id = MODULE_RCB_RX,
+		.msg = "MODULE_RCB_RX"
+	}, {
+		.module_id = MODULE_RTC,
+		.msg = "MODULE_RTC"
+	}, {
+		.module_id = MODULE_SSU,
+		.msg = "MODULE_SSU"
+	}, {
+		.module_id = MODULE_TM,
+		.msg = "MODULE_TM"
+	}, {
+		.module_id = MODULE_RCB_TX,
+		.msg = "MODULE_RCB_TX"
+	}, {
+		.module_id = MODULE_TXDMA,
+		.msg = "MODULE_TXDMA"
+	}, {
+		.module_id = MODULE_MASTER,
+		.msg = "MODULE_MASTER"
+	}
+};
+
+static const struct hclge_hw_type_id hclge_hw_type_id_st[] = {
+	{
+		.type_id = NONE_ERROR,
+		.msg = "none_error"
+	}, {
+		.type_id = FIFO_ERROR,
+		.msg = "fifo_error"
+	}, {
+		.type_id = MEMORY_ERROR,
+		.msg = "memory_error"
+	}, {
+		.type_id = POISON_ERROR,
+		.msg = "poison_error"
+	}, {
+		.type_id = MSIX_ECC_ERROR,
+		.msg = "msix_ecc_error"
+	}, {
+		.type_id = TQP_INT_ECC_ERROR,
+		.msg = "tqp_int_ecc_error"
+	}, {
+		.type_id = PF_ABNORMAL_INT_ERROR,
+		.msg = "pf_abnormal_int_error"
+	}, {
+		.type_id = MPF_ABNORMAL_INT_ERROR,
+		.msg = "mpf_abnormal_int_error"
+	}, {
+		.type_id = COMMON_ERROR,
+		.msg = "common_error"
+	}, {
+		.type_id = PORT_ERROR,
+		.msg = "port_error"
+	}, {
+		.type_id = ETS_ERROR,
+		.msg = "ets_error"
+	}, {
+		.type_id = NCSI_ERROR,
+		.msg = "ncsi_error"
+	}, {
+		.type_id = GLB_ERROR,
+		.msg = "glb_error"
+	}
+};
+
 static void hclge_log_error(struct device *dev, char *reg,
 			    const struct hclge_hw_error *err,
 			    u32 err_sts, unsigned long *reset_requests)
@@ -1892,11 +1984,8 @@ static int hclge_handle_pf_msix_error(struct hclge_dev *hdev,
 static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
 					  unsigned long *reset_requests)
 {
-	struct hclge_mac_tnl_stats mac_tnl_stats;
-	struct device *dev = &hdev->pdev->dev;
 	u32 mpf_bd_num, pf_bd_num, bd_num;
 	struct hclge_desc *desc;
-	u32 status;
 	int ret;
 
 	/* query the number of bds for the MSIx int status */
@@ -1919,29 +2008,7 @@ static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
 	if (ret)
 		goto msi_error;
 
-	/* query and clear mac tnl interruptions */
-	hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_QUERY_MAC_TNL_INT,
-				   true);
-	ret = hclge_cmd_send(&hdev->hw, &desc[0], 1);
-	if (ret) {
-		dev_err(dev, "query mac tnl int cmd failed (%d)\n", ret);
-		goto msi_error;
-	}
-
-	status = le32_to_cpu(desc->data[0]);
-	if (status) {
-		/* When mac tnl interrupt occurs, we record current time and
-		 * register status here in a fifo, then clear the status. So
-		 * that if link status changes suddenly at some time, we can
-		 * query them by debugfs.
-		 */
-		mac_tnl_stats.time = local_clock();
-		mac_tnl_stats.status = status;
-		kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats);
-		ret = hclge_clear_mac_tnl_int(hdev);
-		if (ret)
-			dev_err(dev, "clear mac tnl int failed (%d)\n", ret);
-	}
+	ret = hclge_handle_mac_tnl(hdev);
 
 msi_error:
 	kfree(desc);
@@ -1963,10 +2030,43 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
 	return hclge_handle_all_hw_msix_error(hdev, reset_requests);
 }
 
-void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
+int hclge_handle_mac_tnl(struct hclge_dev *hdev)
 {
-#define HCLGE_DESC_NO_DATA_LEN 8
+	struct hclge_mac_tnl_stats mac_tnl_stats;
+	struct device *dev = &hdev->pdev->dev;
+	struct hclge_desc desc;
+	u32 status;
+	int ret;
 
+	/* query and clear mac tnl interruptions */
+	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_MAC_TNL_INT, true);
+	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+	if (ret) {
+		dev_err(dev, "failed to query mac tnl int, ret = %d.\n", ret);
+		return ret;
+	}
+
+	status = le32_to_cpu(desc.data[0]);
+	if (status) {
+		/* When mac tnl interrupt occurs, we record current time and
+		 * register status here in a fifo, then clear the status. So
+		 * that if link status changes suddenly at some time, we can
+		 * query them by debugfs.
+		 */
+		mac_tnl_stats.time = local_clock();
+		mac_tnl_stats.status = status;
+		kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats);
+		ret = hclge_clear_mac_tnl_int(hdev);
+		if (ret)
+			dev_err(dev, "failed to clear mac tnl int, ret = %d.\n",
+				ret);
+	}
+
+	return ret;
+}
+
+void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
+{
 	struct hclge_dev *hdev = ae_dev->priv;
 	struct device *dev = &hdev->pdev->dev;
 	u32 mpf_bd_num, pf_bd_num, bd_num;
@@ -2015,3 +2115,167 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
 msi_error:
 	kfree(desc);
 }
+
+static void
+hclge_handle_error_type_reg_log(struct device *dev,
+				struct hclge_mod_err_info *mod_info,
+				struct hclge_type_reg_err_info *type_reg_info)
+{
+#define HCLGE_ERR_TYPE_MASK 0x7F
+#define HCLGE_ERR_TYPE_IS_RAS_OFFSET 7
+
+	u8 mod_id, total_module, type_id, total_type, i, is_ras;
+
+	mod_id = mod_info->mod_id;
+	type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK;
+	is_ras = type_reg_info->type_id >> HCLGE_ERR_TYPE_IS_RAS_OFFSET;
+
+	total_module = ARRAY_SIZE(hclge_hw_module_id_st);
+	total_type = ARRAY_SIZE(hclge_hw_type_id_st);
+
+	if (mod_id < total_module && type_id < total_type)
+		dev_err(dev,
+			"found %s %s, is %s error.\n",
+			hclge_hw_module_id_st[mod_id].msg,
+			hclge_hw_type_id_st[type_id].msg,
+			is_ras ? "ras" : "msix");
+	else
+		dev_err(dev,
+			"unknown module[%u] or type[%u].\n", mod_id, type_id);
+
+	dev_err(dev, "reg_value:\n");
+	for (i = 0; i < type_reg_info->reg_num; i++)
+		dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[i]);
+}
+
+static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
+					  const u32 *buf, u32 buf_size)
+{
+	struct hclge_type_reg_err_info *type_reg_info;
+	struct hclge_dev *hdev = ae_dev->priv;
+	struct device *dev = &hdev->pdev->dev;
+	struct hclge_mod_err_info *mod_info;
+	struct hclge_sum_err_info *sum_info;
+	u8 mod_num, err_num, i;
+	u32 offset = 0;
+
+	sum_info = (struct hclge_sum_err_info *)&buf[offset++];
+	if (sum_info->reset_type &&
+	    sum_info->reset_type != HNAE3_NONE_RESET)
+		set_bit(sum_info->reset_type, &ae_dev->hw_err_reset_req);
+	mod_num = sum_info->mod_num;
+
+	while (mod_num--) {
+		if (offset >= buf_size) {
+			dev_err(dev, "The offset(%u) exceeds buf's size(%u).\n",
+				offset, buf_size);
+			return;
+		}
+		mod_info = (struct hclge_mod_err_info *)&buf[offset++];
+		err_num = mod_info->err_num;
+
+		for (i = 0; i < err_num; i++) {
+			if (offset >= buf_size) {
+				dev_err(dev,
+					"The offset(%u) exceeds buf size(%u).\n",
+					offset, buf_size);
+				return;
+			}
+
+			type_reg_info = (struct hclge_type_reg_err_info *)
+					    &buf[offset++];
+			hclge_handle_error_type_reg_log(dev, mod_info,
+							type_reg_info);
+
+			offset += type_reg_info->reg_num;
+		}
+	}
+}
+
+static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num)
+{
+	struct device *dev = &hdev->pdev->dev;
+	struct hclge_desc desc_bd;
+	int ret;
+
+	hclge_cmd_setup_basic_desc(&desc_bd, HCLGE_QUERY_ALL_ERR_BD_NUM, true);
+	ret = hclge_cmd_send(&hdev->hw, &desc_bd, 1);
+	if (ret) {
+		dev_err(dev, "failed to query error bd_num, ret = %d.\n", ret);
+		return ret;
+	}
+
+	*bd_num = le32_to_cpu(desc_bd.data[0]);
+	if (!(*bd_num)) {
+		dev_err(dev, "The value of bd_num is 0!\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hclge_query_all_err_info(struct hclge_dev *hdev,
+				    struct hclge_desc *desc, u32 bd_num)
+{
+	struct device *dev = &hdev->pdev->dev;
+	int ret;
+
+	hclge_cmd_setup_basic_desc(desc, HCLGE_QUERY_ALL_ERR_INFO, true);
+	ret = hclge_cmd_send(&hdev->hw, desc, bd_num);
+	if (ret)
+		dev_err(dev, "failed to query error info, ret = %d.\n", ret);
+
+	return ret;
+}
+
+int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev)
+{
+	u32 bd_num, desc_len, buf_len, buf_size, i;
+	struct hclge_dev *hdev = ae_dev->priv;
+	struct hclge_desc *desc;
+	__le32 *desc_data;
+	u32 *buf;
+	int ret;
+
+	ret = hclge_query_all_err_bd_num(hdev, &bd_num);
+	if (ret)
+		goto out;
+
+	desc_len = bd_num * sizeof(struct hclge_desc);
+	desc = kzalloc(desc_len, GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = hclge_query_all_err_info(hdev, desc, bd_num);
+	if (ret)
+		goto err_desc;
+
+	buf_len = bd_num * sizeof(struct hclge_desc) - HCLGE_DESC_NO_DATA_LEN;
+	buf_size = buf_len / sizeof(u32);
+
+	desc_data = kzalloc(buf_len, GFP_KERNEL);
+	if (!desc_data)
+		return -ENOMEM;
+
+	buf = kzalloc(buf_len, GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto err_buf_alloc;
+	}
+
+	memcpy(desc_data, &desc[0].data[0], buf_len);
+	for (i = 0; i < buf_size; i++)
+		buf[i] = le32_to_cpu(desc_data[i]);
+
+	hclge_handle_error_module_log(ae_dev, buf, buf_size);
+	kfree(buf);
+
+err_buf_alloc:
+	kfree(desc_data);
+err_desc:
+	kfree(desc);
+out:
+	return ret;
+}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
index d647f3c84134..27ab772c665e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
@@ -107,6 +107,10 @@
 #define HCLGE_ROCEE_OVF_ERR_INT_MASK		0x10000
 #define HCLGE_ROCEE_OVF_ERR_TYPE_MASK		0x3F
 
+#define HCLGE_DESC_DATA_MAX			8
+#define HCLGE_REG_NUM_MAX			256
+#define HCLGE_DESC_NO_DATA_LEN			8
+
 enum hclge_err_int_type {
 	HCLGE_ERR_INT_MSIX = 0,
 	HCLGE_ERR_INT_RAS_CE = 1,
@@ -114,6 +118,40 @@ enum hclge_err_int_type {
 	HCLGE_ERR_INT_RAS_FE = 3,
 };
 
+enum hclge_mod_name_list {
+	MODULE_NONE		= 0,
+	MODULE_BIOS_COMMON	= 1,
+	MODULE_GE		= 2,
+	MODULE_IGU_EGU		= 3,
+	MODULE_LGE		= 4,
+	MODULE_NCSI		= 5,
+	MODULE_PPP		= 6,
+	MODULE_QCN		= 7,
+	MODULE_RCB_RX		= 8,
+	MODULE_RTC		= 9,
+	MODULE_SSU		= 10,
+	MODULE_TM		= 11,
+	MODULE_RCB_TX		= 12,
+	MODULE_TXDMA		= 13,
+	MODULE_MASTER		= 14,
+};
+
+enum hclge_err_type_list {
+	NONE_ERROR		= 0,
+	FIFO_ERROR		= 1,
+	MEMORY_ERROR		= 2,
+	POISON_ERROR		= 3,
+	MSIX_ECC_ERROR		= 4,
+	TQP_INT_ECC_ERROR	= 5,
+	PF_ABNORMAL_INT_ERROR	= 6,
+	MPF_ABNORMAL_INT_ERROR	= 7,
+	COMMON_ERROR		= 8,
+	PORT_ERROR		= 9,
+	ETS_ERROR		= 10,
+	NCSI_ERROR		= 11,
+	GLB_ERROR		= 12,
+};
+
 struct hclge_hw_blk {
 	u32 msk;
 	const char *name;
@@ -126,6 +164,35 @@ struct hclge_hw_error {
 	enum hnae3_reset_type reset_level;
 };
 
+struct hclge_hw_module_id {
+	enum hclge_mod_name_list module_id;
+	const char *msg;
+};
+
+struct hclge_hw_type_id {
+	enum hclge_err_type_list type_id;
+	const char *msg;
+};
+
+struct hclge_sum_err_info {
+	u8 reset_type;
+	u8 mod_num;
+	u8 rsv[2];
+};
+
+struct hclge_mod_err_info {
+	u8 mod_id;
+	u8 err_num;
+	u8 rsv[2];
+};
+
+struct hclge_type_reg_err_info {
+	u8 type_id;
+	u8 reg_num;
+	u8 rsv[2];
+	u32 hclge_reg[HCLGE_REG_NUM_MAX];
+};
+
 int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en);
 int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state);
 int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en);
@@ -133,4 +200,6 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev);
 pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
 int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
 			       unsigned long *reset_requests);
+int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev);
+int hclge_handle_mac_tnl(struct hclge_dev *hdev);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index d5be3bc50b5c..3c08fc71b951 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4236,11 +4236,49 @@ static void hclge_reset_subtask(struct hclge_dev *hdev)
 	hdev->reset_type = HNAE3_NONE_RESET;
 }
 
+static void hclge_handle_err_reset_request(struct hclge_dev *hdev)
+{
+	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
+	enum hnae3_reset_type reset_type;
+
+	if (ae_dev->hw_err_reset_req) {
+		reset_type = hclge_get_reset_level(ae_dev,
+						   &ae_dev->hw_err_reset_req);
+		hclge_set_def_reset_request(ae_dev, reset_type);
+	}
+
+	if (hdev->default_reset_request && ae_dev->ops->reset_event)
+		ae_dev->ops->reset_event(hdev->pdev, NULL);
+
+	/* enable interrupt after error handling complete */
+	hclge_enable_vector(&hdev->misc_vector, true);
+}
+
+static void hclge_handle_err_recovery(struct hclge_dev *hdev)
+{
+	u32 mask_val = HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK;
+	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
+	u32 msix_src_flag, hw_err_src_flag;
+
+	msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) &
+			HCLGE_VECTOR0_REG_MSIX_MASK;
+
+	hw_err_src_flag = hclge_read_dev(&hdev->hw,
+					 HCLGE_RAS_PF_OTHER_INT_STS_REG) &
+			  mask_val;
+
+	if (msix_src_flag || hw_err_src_flag) {
+		hclge_handle_error_info_log(ae_dev);
+		hclge_handle_mac_tnl(hdev);
+	}
+
+	hclge_handle_err_reset_request(hdev);
+}
+
 static void hclge_misc_err_recovery(struct hclge_dev *hdev)
 {
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
 	struct device *dev = &hdev->pdev->dev;
-	enum hnae3_reset_type reset_type;
 	u32 msix_sts_reg;
 
 	msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
@@ -4250,17 +4288,10 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev)
 			dev_info(dev, "received msix interrupt 0x%x\n",
 				 msix_sts_reg);
 	}
-	hclge_enable_vector(&hdev->misc_vector, true);
 
 	hclge_handle_hw_ras_error(ae_dev);
-	if (ae_dev->hw_err_reset_req) {
-		reset_type = hclge_get_reset_level(ae_dev,
-						   &ae_dev->hw_err_reset_req);
-		hclge_set_def_reset_request(ae_dev, reset_type);
-	}
 
-	if (hdev->default_reset_request && ae_dev->ops->reset_event)
-		ae_dev->ops->reset_event(hdev->pdev, NULL);
+	hclge_handle_err_reset_request(hdev);
 }
 
 static void hclge_errhand_service_task(struct hclge_dev *hdev)
@@ -4268,7 +4299,10 @@ static void hclge_errhand_service_task(struct hclge_dev *hdev)
 	if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state))
 		return;
 
-	hclge_misc_err_recovery(hdev);
+	if (hdev->ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3)
+		hclge_handle_err_recovery(hdev);
+	else
+		hclge_misc_err_recovery(hdev);
 }
 
 static void hclge_reset_service_task(struct hclge_dev *hdev)
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH net-next 3/5] net: hns3: add support for imp-handle ras capability
  2021-06-08 13:08 [PATCH net-next 0/5] net: hns3: add RAS compatibility adaptation solution Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 1/5] net: hns3: add support for handling all errors through MSI-X Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 2/5] net: hns3: add the RAS compatibility adaptation solution Guangbin Huang
@ 2021-06-08 13:08 ` Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 4/5] net: hns3: update error recovery module and type Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 5/5] net: hns3: add error handling compatibility during initialization Guangbin Huang
  4 siblings, 0 replies; 6+ messages in thread
From: Guangbin Huang @ 2021-06-08 13:08 UTC (permalink / raw)
  To: davem, kuba; +Cc: netdev, linux-kernel, salil.mehta, lipeng321, huangguangbin2

From: Jiaran Zhang <zhangjiaran@huawei.com>

The IMP (Intelligent Management Processor) firmware adds a new feature
that handles and consolidates RAS information for new devices, so the
NIC driver only needs to query the reported RAS information. Add
support for this feature to the NIC driver.

The driver queries the device capability to check whether the IMP
supports this feature. If it does, the new RAS processing branch is
executed.

To provide a way to check whether the PF supports the imp-handle RAS
feature, dump this capability in debugfs.

Signed-off-by: Jiaran Zhang <zhangjiaran@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h             | 4 ++++
 drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c      | 3 +++
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c  | 2 ++
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h  | 1 +
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
 5 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index dc9b5bc3431b..e564aa32a414 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -91,6 +91,7 @@ enum HNAE3_DEV_CAP_BITS {
 	HNAE3_DEV_SUPPORT_STASH_B,
 	HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B,
 	HNAE3_DEV_SUPPORT_PAUSE_B,
+	HNAE3_DEV_SUPPORT_RAS_IMP_B,
 	HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
 	HNAE3_DEV_SUPPORT_PORT_VLAN_BYPASS_B,
 	HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B,
@@ -129,6 +130,9 @@ enum HNAE3_DEV_CAP_BITS {
 #define hnae3_dev_phy_imp_supported(hdev) \
 	test_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, (hdev)->ae_dev->caps)
 
+#define hnae3_dev_ras_imp_supported(hdev) \
+	test_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, (hdev)->ae_dev->caps)
+
 #define hnae3_dev_tqp_txrx_indep_supported(hdev) \
 	test_bit(HNAE3_DEV_SUPPORT_TQP_TXRX_INDEP_B, (hdev)->ae_dev->caps)
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
index cf1efd2f4a0f..a0edca848392 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
@@ -350,6 +350,9 @@ static struct hns3_dbg_cap_info hns3_dbg_cap[] = {
 		.name = "support imp-controlled PHY",
 		.cap_bit = HNAE3_DEV_SUPPORT_PHY_IMP_B,
 	}, {
+		.name = "support imp-controlled RAS",
+		.cap_bit = HNAE3_DEV_SUPPORT_RAS_IMP_B,
+	}, {
 		.name = "support rxd advanced layout",
 		.cap_bit = HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
 	}, {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
index 614763f5e877..887297e37cf3 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
@@ -387,6 +387,8 @@ static void hclge_parse_capability(struct hclge_dev *hdev,
 		set_bit(HNAE3_DEV_SUPPORT_PAUSE_B, ae_dev->caps);
 	if (hnae3_get_bit(caps, HCLGE_CAP_PHY_IMP_B))
 		set_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, ae_dev->caps);
+	if (hnae3_get_bit(caps, HCLGE_CAP_RAS_IMP_B))
+		set_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, ae_dev->caps);
 	if (hnae3_get_bit(caps, HCLGE_CAP_RXD_ADV_LAYOUT_B))
 		set_bit(HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, ae_dev->caps);
 	if (hnae3_get_bit(caps, HCLGE_CAP_PORT_VLAN_BYPASS_B)) {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index 234f0a3beec1..221811af9473 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -392,6 +392,7 @@ enum HCLGE_CAP_BITS {
 	HCLGE_CAP_HW_PAD_B,
 	HCLGE_CAP_STASH_B,
 	HCLGE_CAP_UDP_TUNNEL_CSUM_B,
+	HCLGE_CAP_RAS_IMP_B = 12,
 	HCLGE_CAP_FEC_B = 13,
 	HCLGE_CAP_PAUSE_B = 14,
 	HCLGE_CAP_RXD_ADV_LAYOUT_B = 15,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 3c08fc71b951..cf34216df171 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4299,7 +4299,7 @@ static void hclge_errhand_service_task(struct hclge_dev *hdev)
 	if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state))
 		return;
 
-	if (hdev->ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3)
+	if (hnae3_dev_ras_imp_supported(hdev))
 		hclge_handle_err_recovery(hdev);
 	else
 		hclge_misc_err_recovery(hdev);
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH net-next 4/5] net: hns3: update error recovery module and type
  2021-06-08 13:08 [PATCH net-next 0/5] net: hns3: add RAS compatibility adaptation solution Guangbin Huang
                   ` (2 preceding siblings ...)
  2021-06-08 13:08 ` [PATCH net-next 3/5] net: hns3: add support for imp-handle ras capability Guangbin Huang
@ 2021-06-08 13:08 ` Guangbin Huang
  2021-06-08 13:08 ` [PATCH net-next 5/5] net: hns3: add error handling compatibility during initialization Guangbin Huang
  4 siblings, 0 replies; 6+ messages in thread
From: Guangbin Huang @ 2021-06-08 13:08 UTC (permalink / raw)
  To: davem, kuba; +Cc: netdev, linux-kernel, salil.mehta, lipeng321, huangguangbin2

From: Jiaran Zhang <zhangjiaran@huawei.com>

Update error recovery module and type for RoCE.

The enumeration values of module names and error types are not
consecutive. With the current printing mode, which uses the ID directly
as an array index, they cannot be printed correctly.

Use the index mode instead: if mod_id and type_id match an enumerated
value, display the corresponding information.

Signed-off-by: Jiaran Zhang <zhangjiaran@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 58 ++++++++++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h | 18 +++++++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    |  3 +-
 3 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index 36f8055bd859..0e942d11dbf3 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -677,6 +677,36 @@ static const struct hclge_hw_module_id hclge_hw_module_id_st[] = {
 	}, {
 		.module_id = MODULE_MASTER,
 		.msg = "MODULE_MASTER"
+	}, {
+		.module_id = MODULE_ROCEE_TOP,
+		.msg = "MODULE_ROCEE_TOP"
+	}, {
+		.module_id = MODULE_ROCEE_TIMER,
+		.msg = "MODULE_ROCEE_TIMER"
+	}, {
+		.module_id = MODULE_ROCEE_MDB,
+		.msg = "MODULE_ROCEE_MDB"
+	}, {
+		.module_id = MODULE_ROCEE_TSP,
+		.msg = "MODULE_ROCEE_TSP"
+	}, {
+		.module_id = MODULE_ROCEE_TRP,
+		.msg = "MODULE_ROCEE_TRP"
+	}, {
+		.module_id = MODULE_ROCEE_SCC,
+		.msg = "MODULE_ROCEE_SCC"
+	}, {
+		.module_id = MODULE_ROCEE_CAEP,
+		.msg = "MODULE_ROCEE_CAEP"
+	}, {
+		.module_id = MODULE_ROCEE_GEN_AC,
+		.msg = "MODULE_ROCEE_GEN_AC"
+	}, {
+		.module_id = MODULE_ROCEE_QMM,
+		.msg = "MODULE_ROCEE_QMM"
+	}, {
+		.module_id = MODULE_ROCEE_LSAN,
+		.msg = "MODULE_ROCEE_LSAN"
 	}
 };
 
@@ -720,6 +750,12 @@ static const struct hclge_hw_type_id hclge_hw_type_id_st[] = {
 	}, {
 		.type_id = GLB_ERROR,
 		.msg = "glb_error"
+	}, {
+		.type_id = ROCEE_NORMAL_ERR,
+		.msg = "rocee_normal_error"
+	}, {
+		.type_id = ROCEE_OVF_ERR,
+		.msg = "rocee_ovf_error"
 	}
 };
 
@@ -2125,6 +2161,8 @@ hclge_handle_error_type_reg_log(struct device *dev,
 #define HCLGE_ERR_TYPE_IS_RAS_OFFSET 7
 
 	u8 mod_id, total_module, type_id, total_type, i, is_ras;
+	u8 index_module = MODULE_NONE;
+	u8 index_type = NONE_ERROR;
 
 	mod_id = mod_info->mod_id;
 	type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK;
@@ -2133,11 +2171,25 @@ hclge_handle_error_type_reg_log(struct device *dev,
 	total_module = ARRAY_SIZE(hclge_hw_module_id_st);
 	total_type = ARRAY_SIZE(hclge_hw_type_id_st);
 
-	if (mod_id < total_module && type_id < total_type)
+	for (i = 0; i < total_module; i++) {
+		if (mod_id == hclge_hw_module_id_st[i].module_id) {
+			index_module = i;
+			break;
+		}
+	}
+
+	for (i = 0; i < total_type; i++) {
+		if (type_id == hclge_hw_type_id_st[i].type_id) {
+			index_type = i;
+			break;
+		}
+	}
+
+	if (index_module != MODULE_NONE && index_type != NONE_ERROR)
 		dev_err(dev,
 			"found %s %s, is %s error.\n",
-			hclge_hw_module_id_st[mod_id].msg,
-			hclge_hw_type_id_st[type_id].msg,
+			hclge_hw_module_id_st[index_module].msg,
+			hclge_hw_type_id_st[index_type].msg,
 			is_ras ? "ras" : "msix");
 	else
 		dev_err(dev,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
index 27ab772c665e..ce4c96bbef8e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
@@ -15,6 +15,8 @@
 #define HCLGE_RAS_PF_OTHER_INT_STS_REG   0x20B00
 #define HCLGE_RAS_REG_NFE_MASK   0xFF00
 #define HCLGE_RAS_REG_ROCEE_ERR_MASK   0x3000000
+#define HCLGE_RAS_REG_ERR_MASK \
+	(HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK)
 
 #define HCLGE_VECTOR0_REG_MSIX_MASK   0x1FF00
 
@@ -134,6 +136,18 @@ enum hclge_mod_name_list {
 	MODULE_RCB_TX		= 12,
 	MODULE_TXDMA		= 13,
 	MODULE_MASTER		= 14,
+	/* add new MODULE NAME for NIC here in order */
+	MODULE_ROCEE_TOP	= 40,
+	MODULE_ROCEE_TIMER	= 41,
+	MODULE_ROCEE_MDB	= 42,
+	MODULE_ROCEE_TSP	= 43,
+	MODULE_ROCEE_TRP	= 44,
+	MODULE_ROCEE_SCC	= 45,
+	MODULE_ROCEE_CAEP	= 46,
+	MODULE_ROCEE_GEN_AC	= 47,
+	MODULE_ROCEE_QMM	= 48,
+	MODULE_ROCEE_LSAN	= 49,
+	/* add new MODULE NAME for RoCEE here in order */
 };
 
 enum hclge_err_type_list {
@@ -150,6 +164,10 @@ enum hclge_err_type_list {
 	ETS_ERROR		= 10,
 	NCSI_ERROR		= 11,
 	GLB_ERROR		= 12,
+	/* add new ERROR TYPE for NIC here in order */
+	ROCEE_NORMAL_ERR	= 40,
+	ROCEE_OVF_ERR		= 41,
+	/* add new ERROR TYPE for ROCEE here in order */
 };
 
 struct hclge_hw_blk {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index cf34216df171..9ff4210f6477 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3343,8 +3343,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 
 	/* check for vector0 msix event and hardware error event source */
 	if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK ||
-	    hw_err_src_reg & HCLGE_RAS_REG_NFE_MASK ||
-	    hw_err_src_reg & HCLGE_RAS_REG_ROCEE_ERR_MASK)
+	    hw_err_src_reg & HCLGE_RAS_REG_ERR_MASK)
 		return HCLGE_VECTOR0_EVENT_ERR;
 
 	/* check for vector0 mailbox(=CMDQ RX) event source */
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH net-next 5/5] net: hns3: add error handling compatibility during initialization
  2021-06-08 13:08 [PATCH net-next 0/5] net: hns3: add RAS compatibility adaptation solution Guangbin Huang
                   ` (3 preceding siblings ...)
  2021-06-08 13:08 ` [PATCH net-next 4/5] net: hns3: update error recovery module and type Guangbin Huang
@ 2021-06-08 13:08 ` Guangbin Huang
  4 siblings, 0 replies; 6+ messages in thread
From: Guangbin Huang @ 2021-06-08 13:08 UTC (permalink / raw)
  To: davem, kuba; +Cc: netdev, linux-kernel, salil.mehta, lipeng321, huangguangbin2

From: Jiaran Zhang <zhangjiaran@huawei.com>

During initialization, the driver logs and clears the hw errors that
have already occurred. A device that supports the imp-handle RAS
capability needs to handle a different error status; otherwise a wrong
reset may be triggered.

So fix it by adding a new processing branch.

Signed-off-by: Jiaran Zhang <zhangjiaran@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 22 ++++++++++++++++++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h |  2 ++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 21 ++++++++++-----------
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index 0e942d11dbf3..bad9fda19398 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -2152,6 +2152,28 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
 	kfree(desc);
 }
 
+bool hclge_find_error_source(struct hclge_dev *hdev)
+{
+	u32 msix_src_flag, hw_err_src_flag;
+
+	msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) &
+			HCLGE_VECTOR0_REG_MSIX_MASK;
+
+	hw_err_src_flag = hclge_read_dev(&hdev->hw,
+					 HCLGE_RAS_PF_OTHER_INT_STS_REG) &
+			  HCLGE_RAS_REG_ERR_MASK;
+
+	return msix_src_flag || hw_err_src_flag;
+}
+
+void hclge_handle_occurred_error(struct hclge_dev *hdev)
+{
+	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
+
+	if (hclge_find_error_source(hdev))
+		hclge_handle_error_info_log(ae_dev);
+}
+
 static void
 hclge_handle_error_type_reg_log(struct device *dev,
 				struct hclge_mod_err_info *mod_info,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
index ce4c96bbef8e..07987fb8332e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
@@ -215,6 +215,8 @@ int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en);
 int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state);
 int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en);
 void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev);
+bool hclge_find_error_source(struct hclge_dev *hdev);
+void hclge_handle_occurred_error(struct hclge_dev *hdev);
 pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
 int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
 			       unsigned long *reset_requests);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 9ff4210f6477..d960e08850ae 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4255,18 +4255,11 @@ static void hclge_handle_err_reset_request(struct hclge_dev *hdev)
 
 static void hclge_handle_err_recovery(struct hclge_dev *hdev)
 {
-	u32 mask_val = HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK;
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
-	u32 msix_src_flag, hw_err_src_flag;
 
-	msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) &
-			HCLGE_VECTOR0_REG_MSIX_MASK;
+	ae_dev->hw_err_reset_req = 0;
 
-	hw_err_src_flag = hclge_read_dev(&hdev->hw,
-					 HCLGE_RAS_PF_OTHER_INT_STS_REG) &
-			  mask_val;
-
-	if (msix_src_flag || hw_err_src_flag) {
+	if (hclge_find_error_source(hdev)) {
 		hclge_handle_error_info_log(ae_dev);
 		hclge_handle_mac_tnl(hdev);
 	}
@@ -11558,7 +11551,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	hclge_clear_resetting_state(hdev);
 
 	/* Log and clear the hw errors those already occurred */
-	hclge_handle_all_hns_hw_errors(ae_dev);
+	if (hnae3_dev_ras_imp_supported(hdev))
+		hclge_handle_occurred_error(hdev);
+	else
+		hclge_handle_all_hns_hw_errors(ae_dev);
 
 	/* request delayed reset for the error recovery because an immediate
 	 * global reset on a PF affecting pending initialization of other PFs
@@ -11911,7 +11907,10 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
 	}
 
 	/* Log and clear the hw errors those already occurred */
-	hclge_handle_all_hns_hw_errors(ae_dev);
+	if (hnae3_dev_ras_imp_supported(hdev))
+		hclge_handle_occurred_error(hdev);
+	else
+		hclge_handle_all_hns_hw_errors(ae_dev);
 
 	/* Re-enable the hw error interrupts because
 	 * the interrupts get disabled on global reset.
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-06-08 13:12 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-08 13:08 [PATCH net-next 0/5] net: hns3: add RAS compatibility adaptation solution Guangbin Huang
2021-06-08 13:08 ` [PATCH net-next 1/5] net: hns3: add support for handling all errors through MSI-X Guangbin Huang
2021-06-08 13:08 ` [PATCH net-next 2/5] net: hns3: add the RAS compatibility adaptation solution Guangbin Huang
2021-06-08 13:08 ` [PATCH net-next 3/5] net: hns3: add support for imp-handle ras capability Guangbin Huang
2021-06-08 13:08 ` [PATCH net-next 4/5] net: hns3: update error recovery module and type Guangbin Huang
2021-06-08 13:08 ` [PATCH net-next 5/5] net: hns3: add error handling compatibility during initialization Guangbin Huang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).