linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Huazhong Tan <tanhuazhong@huawei.com>
To: <davem@davemloft.net>
Cc: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<salil.mehta@huawei.com>, <yisen.zhuang@huawei.com>,
	<linuxarm@huawei.com>
Subject: [Patch net-next 11/12] net: hns3: add error handler for hclge_reset()
Date: Wed, 7 Nov 2018 12:06:17 +0800	[thread overview]
Message-ID: <1541563578-28973-12-git-send-email-tanhuazhong@huawei.com> (raw)
In-Reply-To: <1541563578-28973-1-git-send-email-tanhuazhong@huawei.com>

When hclge_reset() is called, it may fail for several reasons.
For example, an higher-level reset event occurs, memory allocation
failure, hardware reset timeout, etc. Therefore, it is necessary
to add corresponding error handling for these situations.
1. A high-level reset is required due to a high-level reset failure.
2. For memory allocation failure, a high-level reset is initiated by
the timer to recover. The reason for using the timer is to prevent this
new high-level reset to interrupt the reset process of other pf/vf;
3. For the case of hardware reset timeout, reschedule the reset task
to wait for the hardware to complete the reset.
For memory allocation failure and reset timeouts, in order to prevent
an infinite number of scheduled reset tasks, the number of error
recovery needs to be limited.

This patch also add some reset related debug log printing.

Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 139 +++++++++++++++++----
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   3 +
 2 files changed, 120 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 0c0327b..579945b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2145,6 +2145,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 
 	/* check for vector0 reset event sources */
 	if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & rst_src_reg) {
+		dev_info(&hdev->pdev->dev, "global reset interrupt\n");
 		set_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state);
 		set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending);
 		*clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B);
@@ -2152,6 +2153,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 	}
 
 	if (BIT(HCLGE_VECTOR0_CORERESET_INT_B) & rst_src_reg) {
+		dev_info(&hdev->pdev->dev, "core reset interrupt\n");
 		set_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state);
 		set_bit(HNAE3_CORE_RESET, &hdev->reset_pending);
 		*clearval = BIT(HCLGE_VECTOR0_CORERESET_INT_B);
@@ -2159,6 +2161,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 	}
 
 	if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & rst_src_reg) {
+		dev_info(&hdev->pdev->dev, "IMP reset interrupt\n");
 		set_bit(HNAE3_IMP_RESET, &hdev->reset_pending);
 		*clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B);
 		return HCLGE_VECTOR0_EVENT_RST;
@@ -2308,8 +2311,11 @@ static int hclge_notify_client(struct hclge_dev *hdev,
 		int ret;
 
 		ret = client->ops->reset_notify(handle, type);
-		if (ret)
+		if (ret) {
+			dev_err(&hdev->pdev->dev,
+				"notify nic client failed %d(%d)\n", type, ret);
 			return ret;
+		}
 	}
 
 	return 0;
@@ -2518,9 +2524,49 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev)
 	return ret;
 }
 
+static bool hclge_reset_err_handle(struct hclge_dev *hdev, bool is_timeout)
+{
+#define MAX_RESET_FAIL_CNT 5
+#define RESET_UPGRADE_DELAY_SEC 10
+
+	if (hdev->reset_pending) {
+		dev_info(&hdev->pdev->dev, "Reset pending %lu\n",
+			 hdev->reset_pending);
+		return true;
+	} else if ((hdev->reset_type != HNAE3_IMP_RESET) &&
+		   (hclge_read_dev(&hdev->hw, HCLGE_GLOBAL_RESET_REG) &
+		    BIT(HCLGE_IMP_RESET_BIT))) {
+		dev_info(&hdev->pdev->dev,
+			 "reset failed because IMP Reset is pending\n");
+		hclge_clear_reset_cause(hdev);
+		return false;
+	} else if (hdev->reset_fail_cnt < MAX_RESET_FAIL_CNT) {
+		hdev->reset_fail_cnt++;
+		if (is_timeout) {
+			set_bit(hdev->reset_type, &hdev->reset_pending);
+			dev_info(&hdev->pdev->dev,
+				 "re-schedule to wait for hw reset done\n");
+			return true;
+		}
+
+		dev_info(&hdev->pdev->dev, "Upgrade reset level\n");
+		hclge_clear_reset_cause(hdev);
+		mod_timer(&hdev->reset_timer,
+			  jiffies + RESET_UPGRADE_DELAY_SEC * HZ);
+
+		return false;
+	}
+
+	hclge_clear_reset_cause(hdev);
+	dev_err(&hdev->pdev->dev, "Reset fail!\n");
+	return false;
+}
+
 static void hclge_reset(struct hclge_dev *hdev)
 {
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
+	bool is_timeout = false;
+	int ret;
 
 	/* Initialize ae_dev reset status as well, in case enet layer wants to
 	 * know if device is undergoing reset
@@ -2529,34 +2575,66 @@ static void hclge_reset(struct hclge_dev *hdev)
 	hdev->reset_count++;
 	hdev->last_reset_time = jiffies;
 	/* perform reset of the stack & ae device for a client */
-	hclge_notify_roce_client(hdev, HNAE3_DOWN_CLIENT);
+	ret = hclge_notify_roce_client(hdev, HNAE3_DOWN_CLIENT);
+	if (ret)
+		goto err_reset;
+
 	rtnl_lock();
-	hclge_notify_client(hdev, HNAE3_DOWN_CLIENT);
-	rtnl_unlock();
+	ret = hclge_notify_client(hdev, HNAE3_DOWN_CLIENT);
+	if (ret)
+		goto err_reset_lock;
 
-	hclge_reset_prepare_wait(hdev);
+	rtnl_unlock();
 
-	if (!hclge_reset_wait(hdev)) {
-		hclge_notify_roce_client(hdev, HNAE3_UNINIT_CLIENT);
-		rtnl_lock();
-		hclge_notify_client(hdev, HNAE3_UNINIT_CLIENT);
-		hclge_reset_ae_dev(hdev->ae_dev);
-		hclge_notify_client(hdev, HNAE3_INIT_CLIENT);
+	ret = hclge_reset_prepare_wait(hdev);
+	if (ret)
+		goto err_reset;
 
-		hclge_clear_reset_cause(hdev);
-	} else {
-		rtnl_lock();
-		/* schedule again to check pending resets later */
-		set_bit(hdev->reset_type, &hdev->reset_pending);
-		hclge_reset_task_schedule(hdev);
+	if (hclge_reset_wait(hdev)) {
+		is_timeout = true;
+		goto err_reset;
 	}
 
-	hclge_notify_client(hdev, HNAE3_UP_CLIENT);
+	ret = hclge_notify_roce_client(hdev, HNAE3_UNINIT_CLIENT);
+	if (ret)
+		goto err_reset;
+
+	rtnl_lock();
+	ret = hclge_notify_client(hdev, HNAE3_UNINIT_CLIENT);
+	if (ret)
+		goto err_reset_lock;
+
+	ret = hclge_reset_ae_dev(hdev->ae_dev);
+	if (ret)
+		goto err_reset_lock;
+
+	ret = hclge_notify_client(hdev, HNAE3_INIT_CLIENT);
+	if (ret)
+		goto err_reset_lock;
+
+	hclge_clear_reset_cause(hdev);
+
+	ret = hclge_notify_client(hdev, HNAE3_UP_CLIENT);
+	if (ret)
+		goto err_reset_lock;
+
 	rtnl_unlock();
-	ae_dev->reset_type = HNAE3_NONE_RESET;
 
-	hclge_notify_roce_client(hdev, HNAE3_INIT_CLIENT);
-	hclge_notify_roce_client(hdev, HNAE3_UP_CLIENT);
+	ret = hclge_notify_roce_client(hdev, HNAE3_INIT_CLIENT);
+	if (ret)
+		goto err_reset;
+
+	ret = hclge_notify_roce_client(hdev, HNAE3_UP_CLIENT);
+	if (ret)
+		goto err_reset;
+
+	return;
+
+err_reset_lock:
+	rtnl_unlock();
+err_reset:
+	if (hclge_reset_err_handle(hdev, is_timeout))
+		hclge_reset_task_schedule(hdev);
 }
 
 static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle)
@@ -2610,6 +2688,16 @@ static void hclge_set_def_reset_request(struct hnae3_ae_dev *ae_dev,
 	set_bit(rst_type, &hdev->default_reset_request);
 }
 
+static void hclge_reset_timer(struct timer_list *t)
+{
+	struct hclge_dev *hdev = from_timer(hdev, t, reset_timer);
+
+	dev_info(&hdev->pdev->dev,
+		 "triggering global reset in reset timer\n");
+	set_bit(HNAE3_GLOBAL_RESET, &hdev->default_reset_request);
+	hclge_reset_event(hdev->pdev, NULL);
+}
+
 static void hclge_reset_subtask(struct hclge_dev *hdev)
 {
 	/* check if there is any ongoing reset in the hardware. This status can
@@ -4416,8 +4504,12 @@ static int hclge_restore_fd_entries(struct hnae3_handle *handle)
 	struct hlist_node *node;
 	int ret;
 
+	/* Return ok here, because reset error handling will check this
+	 * return value. If error is returned here, the reset process will
+	 * fail.
+	 */
 	if (!hnae3_dev_fd_supported(hdev))
-		return -EOPNOTSUPP;
+		return 0;
 
 	hlist_for_each_entry_safe(rule, node, &hdev->fd_rule_list, rule_node) {
 		ret = hclge_config_action(hdev, HCLGE_FD_STAGE_1, rule);
@@ -6713,6 +6805,8 @@ static void hclge_state_uninit(struct hclge_dev *hdev)
 
 	if (hdev->service_timer.function)
 		del_timer_sync(&hdev->service_timer);
+	if (hdev->reset_timer.function)
+		del_timer_sync(&hdev->reset_timer);
 	if (hdev->service_task.func)
 		cancel_work_sync(&hdev->service_task);
 	if (hdev->rst_service_task.func)
@@ -6871,6 +6965,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	hclge_dcb_ops_set(hdev);
 
 	timer_setup(&hdev->service_timer, hclge_service_timer, 0);
+	timer_setup(&hdev->reset_timer, hclge_reset_timer, 0);
 	INIT_WORK(&hdev->service_task, hclge_service_task);
 	INIT_WORK(&hdev->rst_service_task, hclge_reset_service_task);
 	INIT_WORK(&hdev->mbx_service_task, hclge_mailbox_service_task);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 69bdb7f..36f3413 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -102,6 +102,7 @@ enum HLCGE_PORT_TYPE {
 #define HCLGE_GLOBAL_RESET_REG		0x20A00
 #define HCLGE_GLOBAL_RESET_BIT		0
 #define HCLGE_CORE_RESET_BIT		1
+#define HCLGE_IMP_RESET_BIT		2
 #define HCLGE_FUN_RST_ING		0x20C00
 #define HCLGE_FUN_RST_ING_B		0
 
@@ -601,6 +602,7 @@ struct hclge_dev {
 	unsigned long reset_request;	/* reset has been requested */
 	unsigned long reset_pending;	/* client rst is pending to be served */
 	unsigned long reset_count;	/* the number of reset has been done */
+	u32 reset_fail_cnt;
 	u32 fw_version;
 	u16 num_vmdq_vport;		/* Num vmdq vport this PF has set up */
 	u16 num_tqps;			/* Num task queue pairs of this PF */
@@ -648,6 +650,7 @@ struct hclge_dev {
 	unsigned long service_timer_period;
 	unsigned long service_timer_previous;
 	struct timer_list service_timer;
+	struct timer_list reset_timer;
 	struct work_struct service_task;
 	struct work_struct rst_service_task;
 	struct work_struct mbx_service_task;
-- 
2.7.4


  parent reply	other threads:[~2018-11-07  4:06 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-07  4:06 [Patch net-next 00/12] provide new interfaces & bugfixes & code optimization Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 01/12] net: hns3: use HNS3_NIC_STATE_INITED to indicate the initialization state of enet Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 02/12] net: hns3: add set_default_reset_request in the hnae3_ae_ops Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 03/12] net: hns3: provide some interface & information for the client Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 04/12] net: hns3: adjust the location of clearing the table when doing reset Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 05/12] net: hns3: enable/disable ring in the enet while doing UP/DOWN Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 06/12] net: hns3: use HNS3_NIC_STATE_RESETTING to indicate resetting Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 07/12] net: hns3: ignore new coming low-level reset while doing high-level reset Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 08/12] net: hns3: move some reset information from hnae3_handle into hclge_dev/hclgevf_dev Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 09/12] net: hns3: adjust the process of PF reset Huazhong Tan
2018-11-07  4:06 ` [Patch net-next 10/12] net: hns3: call roce's reset notify callback when resetting Huazhong Tan
2018-11-07  4:06 ` Huazhong Tan [this message]
2018-11-07  4:06 ` [Patch net-next 12/12] net: hns3: fix for cmd queue memory not freed problem during reset Huazhong Tan
2018-11-07 19:42 ` [Patch net-next 00/12] provide new interfaces & bugfixes & code optimization David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1541563578-28973-12-git-send-email-tanhuazhong@huawei.com \
    --to=tanhuazhong@huawei.com \
    --cc=davem@davemloft.net \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linuxarm@huawei.com \
    --cc=netdev@vger.kernel.org \
    --cc=salil.mehta@huawei.com \
    --cc=yisen.zhuang@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).