netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Salil Mehta <salil.mehta@huawei.com>
To: <davem@davemloft.net>
Cc: <salil.mehta@huawei.com>, <yisen.zhuang@huawei.com>,
	<lipeng321@huawei.com>, <mehta.salil@opnsrc.net>,
	<netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<linuxarm@huawei.com>, Shiju Jose <shiju.jose@huawei.com>
Subject: [PATCH net-next 09/14] net: hns3: add handling of hw errors reported through MSIX
Date: Fri, 7 Dec 2018 21:08:06 +0000	[thread overview]
Message-ID: <20181207210811.23844-10-salil.mehta@huawei.com> (raw)
In-Reply-To: <20181207210811.23844-1-salil.mehta@huawei.com>

This patch adds handling for HNS3 hardware errors(non-standard)
which are reported through MSIX interrupts and not through
PCIe AER channel.

These MSIX reported hardware errors are handled using common
misc. interrupt handler. Hardware error related registers
cannot be cleared in context to the interrupt received as
they require *heavy* access to hardware using IMP(Integrated
Mangement Processor) commands. Hence, we defer the clearing
of such error events till later time.

Since, we have defered exact identification of errors we
will have to defer the level of receovery/reset which
might be required. Hence, a new reset type UNKNOWN reset
has been introduced which effectively defers the assertion
of the reset till we get hold of kind of errors at later
time.

Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h        |  1 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  3 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 93 ++++++++++++++++++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h |  5 ++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 39 ++++++++-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  1 +
 6 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 9d9f4f9..294e725 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -136,6 +136,7 @@ enum hnae3_reset_type {
 	HNAE3_CORE_RESET,
 	HNAE3_GLOBAL_RESET,
 	HNAE3_IMP_RESET,
+	HNAE3_UNKNOWN_RESET,
 	HNAE3_NONE_RESET,
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index 0a0eb6c..08d02b9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -220,6 +220,9 @@ enum hclge_opcode_type {
 	HCLGE_QUERY_RAS_INT_STS_BD_NUM	= 0x1510,
 	HCLGE_QUERY_CLEAR_MPF_RAS_INT	= 0x1511,
 	HCLGE_QUERY_CLEAR_PF_RAS_INT	= 0x1512,
+	HCLGE_QUERY_MSIX_INT_STS_BD_NUM	= 0x1513,
+	HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT	= 0x1514,
+	HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT	= 0x1515,
 	HCLGE_IGU_EGU_TNL_INT_EN	= 0x1803,
 	HCLGE_IGU_COMMON_INT_EN		= 0x1806,
 	HCLGE_TM_QCN_MEM_INT_CFG	= 0x1A14,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index 7371ae4..0676670 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -727,3 +727,96 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
 
 	return PCI_ERS_RESULT_RECOVERED;
 }
+
+int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
+			       unsigned long *reset_requests)
+{
+	struct device *dev = &hdev->pdev->dev;
+	u32 mpf_bd_num, pf_bd_num, bd_num;
+	struct hclge_desc desc_bd;
+	struct hclge_desc *desc;
+	int ret = 0;
+
+	/* set default handling */
+	set_bit(HNAE3_FUNC_RESET, reset_requests);
+
+	/* query the number of bds for the MSIx int status */
+	hclge_cmd_setup_basic_desc(&desc_bd, HCLGE_QUERY_MSIX_INT_STS_BD_NUM,
+				   true);
+	ret = hclge_cmd_send(&hdev->hw, &desc_bd, 1);
+	if (ret) {
+		dev_err(dev, "fail(%d) to query msix int status bd num\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		return ret;
+	}
+
+	mpf_bd_num = le32_to_cpu(desc_bd.data[0]);
+	pf_bd_num = le32_to_cpu(desc_bd.data[1]);
+	bd_num = max_t(u32, mpf_bd_num, pf_bd_num);
+
+	desc = kcalloc(bd_num, sizeof(struct hclge_desc), GFP_KERNEL);
+	if (!desc)
+		goto out;
+
+	/* query all main PF MSIx errors */
+	hclge_cmd_setup_basic_desc(&desc[0], HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT,
+				   true);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], mpf_bd_num);
+	if (ret) {
+		dev_err(dev, "query all mpf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		goto msi_error;
+	}
+
+	/* clear all main PF MSIx errors */
+	hclge_cmd_reuse_desc(&desc[0], false);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], mpf_bd_num);
+	if (ret) {
+		dev_err(dev, "clear all mpf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		goto msi_error;
+	}
+
+	/* query all PF MSIx errors */
+	memset(desc, 0, bd_num * sizeof(struct hclge_desc));
+	hclge_cmd_setup_basic_desc(&desc[0], HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT,
+				   true);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], pf_bd_num);
+	if (ret) {
+		dev_err(dev, "query all pf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+		goto msi_error;
+	}
+
+	/* clear all PF MSIx errors */
+	hclge_cmd_reuse_desc(&desc[0], false);
+	desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+	ret = hclge_cmd_send(&hdev->hw, &desc[0], pf_bd_num);
+	if (ret) {
+		dev_err(dev, "clear all pf msix int cmd failed (%d)\n",
+			ret);
+		/* reset everything for now */
+		set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+	}
+
+msi_error:
+	kfree(desc);
+out:
+	return ret;
+}
+
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
index 809e698..05adccb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
@@ -9,6 +9,9 @@
 #define HCLGE_RAS_PF_OTHER_INT_STS_REG   0x20B00
 #define HCLGE_RAS_REG_NFE_MASK   0xFF00
 
+#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG   0x20800
+#define HCLGE_VECTOR0_REG_MSIX_MASK   0x1FF00
+
 #define HCLGE_IMP_TCM_ECC_ERR_INT_EN	0xFFFF0000
 #define HCLGE_IMP_TCM_ECC_ERR_INT_EN_MASK	0xFFFF0000
 #define HCLGE_IMP_ITCM4_ECC_ERR_INT_EN	0x300
@@ -69,4 +72,6 @@ struct hclge_hw_error {
 
 int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state);
 pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
+int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
+			       unsigned long *reset_requests);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 354ac5f..2238020 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2200,12 +2200,13 @@ static void hclge_service_complete(struct hclge_dev *hdev)
 
 static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 {
-	u32 rst_src_reg;
-	u32 cmdq_src_reg;
+	u32 rst_src_reg, cmdq_src_reg, msix_src_reg;
 
 	/* fetch the events from their corresponding regs */
 	rst_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
 	cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
+	msix_src_reg = hclge_read_dev(&hdev->hw,
+				      HCLGE_VECTOR0_PF_OTHER_INT_STS_REG);
 
 	/* Assumption: If by any chance reset and mailbox events are reported
 	 * together then we will only process reset event in this go and will
@@ -2239,6 +2240,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 		return HCLGE_VECTOR0_EVENT_RST;
 	}
 
+	/* check for vector0 msix event source */
+	if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK)
+		return HCLGE_VECTOR0_EVENT_ERR;
+
 	/* check for vector0 mailbox(=CMDQ RX) event source */
 	if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
 		cmdq_src_reg &= ~BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B);
@@ -2289,6 +2294,19 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
 
 	/* vector 0 interrupt is shared with reset and mailbox source events.*/
 	switch (event_cause) {
+	case HCLGE_VECTOR0_EVENT_ERR:
+		/* we do not know what type of reset is required now. This could
+		 * only be decided after we fetch the type of errors which
+		 * caused this event. Therefore, we will do below for now:
+		 * 1. Assert HNAE3_UNKNOWN_RESET type of reset. This means we
+		 *    have defered type of reset to be used.
+		 * 2. Schedule the reset serivce task.
+		 * 3. When service task receives  HNAE3_UNKNOWN_RESET type it
+		 *    will fetch the correct type of reset.  This would be done
+		 *    by first decoding the types of errors.
+		 */
+		set_bit(HNAE3_UNKNOWN_RESET, &hdev->reset_request);
+		/* fall through */
 	case HCLGE_VECTOR0_EVENT_RST:
 		hclge_reset_task_schedule(hdev);
 		break;
@@ -2593,6 +2611,23 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hclge_dev *hdev,
 {
 	enum hnae3_reset_type rst_level = HNAE3_NONE_RESET;
 
+	/* first, resolve any unknown reset type to the known type(s) */
+	if (test_bit(HNAE3_UNKNOWN_RESET, addr)) {
+		/* we will intentionally ignore any errors from this function
+		 *  as we will end up in *some* reset request in any case
+		 */
+		hclge_handle_hw_msix_error(hdev, addr);
+		clear_bit(HNAE3_UNKNOWN_RESET, addr);
+		/* We defered the clearing of the error event which caused
+		 * interrupt since it was not posssible to do that in
+		 * interrupt context (and this is the reason we introduced
+		 * new UNKNOWN reset type). Now, the errors have been
+		 * handled and cleared in hardware we can safely enable
+		 * interrupts. This is an exception to the norm.
+		 */
+		hclge_enable_vector(&hdev->misc_vector, true);
+	}
+
 	/* return the highest priority reset level amongst all */
 	if (test_bit(HNAE3_IMP_RESET, addr)) {
 		rst_level = HNAE3_IMP_RESET;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index ee60242..106fce1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -205,6 +205,7 @@ enum HCLGE_DEV_STATE {
 enum hclge_evt_cause {
 	HCLGE_VECTOR0_EVENT_RST,
 	HCLGE_VECTOR0_EVENT_MBX,
+	HCLGE_VECTOR0_EVENT_ERR,
 	HCLGE_VECTOR0_EVENT_OTHER,
 };
 
-- 
2.7.4

  parent reply	other threads:[~2018-12-07 21:08 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-07 21:07 [PATCH net-next 00/14] net: hns3: Additions/optimizations related to HNS3 H/W err handling Salil Mehta
2018-12-07 21:07 ` [PATCH net-next 01/14] net: hns3: remove existing process error functions and reorder hw_blk table Salil Mehta
2018-12-07 21:07 ` [PATCH net-next 02/14] net: hns3: rename enable error interrupt functions Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 03/14] net: hns3: re-enable error interrupts on hw reset Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 04/14] net: hns3: deletes unnecessary settings of the descriptor data Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 05/14] net: hns3: rename process_hw_error function Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 06/14] net: hns3: add optimization in the hclge_hw_error_set_state Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 07/14] net: hns3: add handling of hw ras errors using new set of commands Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 08/14] net: hns3: deleted logging 1 bit errors Salil Mehta
2018-12-07 21:08 ` Salil Mehta [this message]
2018-12-07 21:08 ` [PATCH net-next 10/14] net: hns3: add handling of hw errors of MAC Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 11/14] net: hns3: handle hw errors of PPP PF Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 12/14] net: hns3: handle hw errors of PPU(RCB) Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 13/14] net: hns3: handle hw errors of SSU Salil Mehta
2018-12-07 21:08 ` [PATCH net-next 14/14] net: hns3: add handling of RDMA RAS errors Salil Mehta
2018-12-08  0:03 ` [PATCH net-next 00/14] net: hns3: Additions/optimizations related to HNS3 H/W err handling David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181207210811.23844-10-salil.mehta@huawei.com \
    --to=salil.mehta@huawei.com \
    --cc=davem@davemloft.net \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linuxarm@huawei.com \
    --cc=lipeng321@huawei.com \
    --cc=mehta.salil@opnsrc.net \
    --cc=netdev@vger.kernel.org \
    --cc=shiju.jose@huawei.com \
    --cc=yisen.zhuang@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).