netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Michael Chan <michael.chan@broadcom.com>
To: davem@davemloft.net
Cc: netdev@vger.kernel.org, vasundhara-v.volam@broadcom.com,
	ray.jui@broadcom.com, Jiri Pirko <jiri@mellanox.com>
Subject: [PATCH net-next v2 22/22] bnxt_en: Add FW fatal devlink_health_reporter.
Date: Thu, 29 Aug 2019 23:55:05 -0400	[thread overview]
Message-ID: <1567137305-5853-23-git-send-email-michael.chan@broadcom.com> (raw)
In-Reply-To: <1567137305-5853-1-git-send-email-michael.chan@broadcom.com>

From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>

Health show command example and output:

$ devlink health show pci/0000:af:00.0 reporter fw_fatal

pci/0000:af:00.0:
  name fw_fatal
    state healthy error 1 recover 1 grace_period 0 auto_recover true

Fatal events from firmware or missing periodic heartbeats will
be reported and recovery will be handled.

We also turn on the support flags when we register with the firmware to
enable this health and recovery feature in the firmware.

Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 80 ++++++++++++++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  7 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 56 ++++++++++++++++
 3 files changed, 141 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5c7379e..f8a834f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1988,7 +1988,9 @@ static int bnxt_async_event_process(struct bnxt *bp,
 			goto async_event_process_exit;
 		set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event);
 		break;
-	case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY:
+	case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: {
+		u32 data1 = le32_to_cpu(cmpl->event_data1);
+
 		bp->fw_reset_timestamp = jiffies;
 		bp->fw_reset_min_dsecs = cmpl->timestamp_lo;
 		if (!bp->fw_reset_min_dsecs)
@@ -1996,8 +1998,16 @@ static int bnxt_async_event_process(struct bnxt *bp,
 		bp->fw_reset_max_dsecs = le16_to_cpu(cmpl->timestamp_hi);
 		if (!bp->fw_reset_max_dsecs)
 			bp->fw_reset_max_dsecs = BNXT_DFLT_FW_RST_MAX_DSECS;
+		if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) {
+			netdev_warn(bp->dev, "Firmware fatal reset event received\n");
+			set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state);
+		} else {
+			netdev_warn(bp->dev, "Firmware non-fatal reset event received, max wait time %d msec\n",
+				    bp->fw_reset_max_dsecs * 100);
+		}
 		set_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event);
 		break;
+	}
 	case ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY: {
 		struct bnxt_fw_health *fw_health = bp->fw_health;
 		u32 data1 = le32_to_cpu(cmpl->event_data1);
@@ -4414,6 +4424,7 @@ static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
 {
 	struct hwrm_func_drv_rgtr_output *resp = bp->hwrm_cmd_resp_addr;
 	struct hwrm_func_drv_rgtr_input req = {0};
+	u32 flags;
 	int rc;
 
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
@@ -4423,7 +4434,11 @@ static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
 			    FUNC_DRV_RGTR_REQ_ENABLES_VER);
 
 	req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX);
-	req.flags = cpu_to_le32(FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE);
+	flags = FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE |
+		FUNC_DRV_RGTR_REQ_FLAGS_HOT_RESET_SUPPORT;
+	if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
+		flags |= FUNC_DRV_RGTR_REQ_FLAGS_ERROR_RECOVERY_SUPPORT;
+	req.flags = cpu_to_le32(flags);
 	req.ver_maj_8b = DRV_VER_MAJ;
 	req.ver_min_8b = DRV_VER_MIN;
 	req.ver_upd_8b = DRV_VER_UPD;
@@ -9926,6 +9941,38 @@ static void bnxt_tx_timeout(struct net_device *dev)
 	bnxt_queue_sp_work(bp);
 }
 
+static void bnxt_fw_health_check(struct bnxt *bp)
+{
+	struct bnxt_fw_health *fw_health = bp->fw_health;
+	u32 val;
+
+	if (!fw_health || !fw_health->enabled ||
+	    test_bit(BNXT_STATE_IN_FW_RESET, &bp->state))
+		return;
+
+	if (fw_health->tmr_counter) {
+		fw_health->tmr_counter--;
+		return;
+	}
+
+	val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
+	if (val == fw_health->last_fw_heartbeat)
+		goto fw_reset;
+
+	fw_health->last_fw_heartbeat = val;
+
+	val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
+	if (val != fw_health->last_fw_reset_cnt)
+		goto fw_reset;
+
+	fw_health->tmr_counter = fw_health->tmr_multiplier;
+	return;
+
+fw_reset:
+	set_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event);
+	bnxt_queue_sp_work(bp);
+}
+
 static void bnxt_timer(struct timer_list *t)
 {
 	struct bnxt *bp = from_timer(bp, t, timer);
@@ -9937,6 +9984,9 @@ static void bnxt_timer(struct timer_list *t)
 	if (atomic_read(&bp->intr_sem) != 0)
 		goto bnxt_restart_timer;
 
+	if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
+		bnxt_fw_health_check(bp);
+
 	if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS) &&
 	    bp->stats_coal_ticks) {
 		set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event);
@@ -10003,6 +10053,26 @@ static void bnxt_fw_reset_close(struct bnxt *bp)
 	bp->ctx = NULL;
 }
 
+static bool is_bnxt_fw_ok(struct bnxt *bp)
+{
+	struct bnxt_fw_health *fw_health = bp->fw_health;
+	bool no_heartbeat = false, has_reset = false;
+	u32 val;
+
+	val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
+	if (val == fw_health->last_fw_heartbeat)
+		no_heartbeat = true;
+
+	val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
+	if (val != fw_health->last_fw_reset_cnt)
+		has_reset = true;
+
+	if (!no_heartbeat && has_reset)
+		return true;
+
+	return false;
+}
+
 /* rtnl_lock is acquired before calling this function */
 static void bnxt_force_fw_reset(struct bnxt *bp)
 {
@@ -10207,6 +10277,12 @@ static void bnxt_sp_task(struct work_struct *work)
 	if (test_and_clear_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event))
 		bnxt_devlink_health_report(bp, BNXT_FW_RESET_NOTIFY_SP_EVENT);
 
+	if (test_and_clear_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event)) {
+		if (!is_bnxt_fw_ok(bp))
+			bnxt_devlink_health_report(bp,
+						   BNXT_FW_EXCEPTION_SP_EVENT);
+	}
+
 	smp_mb__before_atomic();
 	clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
 }
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 3459b2a..333b0a8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -472,6 +472,11 @@ struct rx_tpa_end_cmp_ext {
 	((le32_to_cpu((rx_tpa_end_ext)->rx_tpa_end_cmp_dup_acks) &	\
 	 RX_TPA_END_CMP_AGG_BUFS_P5) >> RX_TPA_END_CMP_AGG_BUFS_SHIFT_P5)
 
+#define EVENT_DATA1_RESET_NOTIFY_FATAL(data1)				\
+	(((data1) &							\
+	  ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MASK) ==\
+	 ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL)
+
 #define EVENT_DATA1_RECOVERY_MASTER_FUNC(data1)				\
 	!!((data1) &							\
 	   ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASTER_FUNC)
@@ -1372,6 +1377,7 @@ struct bnxt_fw_health {
 	u32 fw_reset_seq_delay_msec[16];
 	struct devlink_health_reporter	*fw_reporter;
 	struct devlink_health_reporter *fw_reset_reporter;
+	struct devlink_health_reporter *fw_fatal_reporter;
 };
 
 struct bnxt_fw_reporter_ctx {
@@ -1728,6 +1734,7 @@ struct bnxt {
 #define BNXT_UPDATE_PHY_SP_EVENT	16
 #define BNXT_RING_COAL_NOW_SP_EVENT	17
 #define BNXT_FW_RESET_NOTIFY_SP_EVENT	18
+#define BNXT_FW_EXCEPTION_SP_EVENT	19
 
 	struct delayed_work	fw_reset_task;
 	int			fw_reset_state;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 8512467..e664392 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -83,6 +83,31 @@ struct devlink_health_reporter_ops bnxt_dl_fw_reset_reporter_ops = {
 	.recover = bnxt_fw_reset_recover,
 };
 
+static int bnxt_fw_fatal_recover(struct devlink_health_reporter *reporter,
+				 void *priv_ctx)
+{
+	struct bnxt *bp = devlink_health_reporter_priv(reporter);
+	struct bnxt_fw_reporter_ctx *fw_reporter_ctx = priv_ctx;
+	unsigned long event;
+
+	if (!priv_ctx)
+		return -EOPNOTSUPP;
+
+	event = fw_reporter_ctx->sp_event;
+	if (event == BNXT_FW_RESET_NOTIFY_SP_EVENT)
+		bnxt_fw_reset(bp);
+	else if (event == BNXT_FW_EXCEPTION_SP_EVENT)
+		bnxt_fw_exception(bp);
+
+	return 0;
+}
+
+static const
+struct devlink_health_reporter_ops bnxt_dl_fw_fatal_reporter_ops = {
+	.name = "fw_fatal",
+	.recover = bnxt_fw_fatal_recover,
+};
+
 static void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 {
 	struct bnxt_fw_health *health = bp->fw_health;
@@ -108,6 +133,16 @@ static void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 			    PTR_ERR(health->fw_reset_reporter));
 		health->fw_reset_reporter = NULL;
 	}
+
+	health->fw_fatal_reporter =
+		devlink_health_reporter_create(bp->dl,
+					       &bnxt_dl_fw_fatal_reporter_ops,
+					       0, true, bp);
+	if (IS_ERR(health->fw_fatal_reporter)) {
+		netdev_warn(bp->dev, "Failed to create FW fatal health reporter, rc = %ld\n",
+			    PTR_ERR(health->fw_fatal_reporter));
+		health->fw_fatal_reporter = NULL;
+	}
 }
 
 static void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
@@ -122,6 +157,9 @@ static void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
 
 	if (health->fw_reset_reporter)
 		devlink_health_reporter_destroy(health->fw_reset_reporter);
+
+	if (health->fw_fatal_reporter)
+		devlink_health_reporter_destroy(health->fw_fatal_reporter);
 }
 
 void bnxt_devlink_health_report(struct bnxt *bp, unsigned long event)
@@ -135,6 +173,15 @@ void bnxt_devlink_health_report(struct bnxt *bp, unsigned long event)
 	fw_reporter_ctx.sp_event = event;
 	switch (event) {
 	case BNXT_FW_RESET_NOTIFY_SP_EVENT:
+		if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) {
+			if (!fw_health->fw_fatal_reporter)
+				return;
+
+			devlink_health_report(fw_health->fw_fatal_reporter,
+					      "FW fatal async event received",
+					      &fw_reporter_ctx);
+			return;
+		}
 		if (!fw_health->fw_reset_reporter)
 			return;
 
@@ -142,6 +189,15 @@ void bnxt_devlink_health_report(struct bnxt *bp, unsigned long event)
 				      "FW non-fatal reset event received",
 				      &fw_reporter_ctx);
 		return;
+
+	case BNXT_FW_EXCEPTION_SP_EVENT:
+		if (!fw_health->fw_fatal_reporter)
+			return;
+
+		devlink_health_report(fw_health->fw_fatal_reporter,
+				      "FW fatal error reported",
+				      &fw_reporter_ctx);
+		return;
 	}
 }
 
-- 
2.5.1


  parent reply	other threads:[~2019-08-30  3:56 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-30  3:54 [PATCH net-next v2 00/22] bnxt_en: health and error recovery Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 01/22] bnxt_en: Use a common function to print the same ethtool -f error message Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 02/22] bnxt_en: Remove the -1 error return code from bnxt_hwrm_do_send_msg() Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 03/22] bnxt_en: Convert error code in firmware message response to standard code Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 04/22] bnxt_en: Simplify error checking in the SR-IOV message forwarding functions Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 05/22] bnxt_en: Suppress all error messages in hwrm_do_send_msg() in silent mode Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 06/22] bnxt_en: Prepare bnxt_init_one() to be called multiple times Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 07/22] bnxt_en: Refactor bnxt_sriov_enable() Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 08/22] bnxt_en: Register buffers for VFs before reserving resources Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 09/22] bnxt_en: Handle firmware reset status during IF_UP Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 10/22] bnxt_en: Discover firmware error recovery capabilities Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 11/22] bnxt_en: Pre-map the firmware health monitoring registers Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 12/22] bnxt_en: Enable health monitoring Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 13/22] bnxt_en: Add BNXT_STATE_IN_FW_RESET state Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 14/22] bnxt_en: Add new FW devlink_health_reporter Michael Chan
2019-10-07  9:56   ` Jiri Pirko
2019-10-09  4:55     ` Vasundhara Volam
2019-10-09  7:04       ` Jiri Pirko
2019-08-30  3:54 ` [PATCH net-next v2 15/22] bnxt_en: Handle RESET_NOTIFY async event from firmware Michael Chan
2019-08-30  3:54 ` [PATCH net-next v2 16/22] bnxt_en: Handle firmware reset Michael Chan
2019-08-30 17:56   ` kbuild test robot
2019-08-30  3:55 ` [PATCH net-next v2 17/22] bnxt_en: Add devlink health reset reporter Michael Chan
2019-08-30  3:55 ` [PATCH net-next v2 18/22] bnxt_en: Retain user settings on a VF after RESET_NOTIFY event Michael Chan
2019-08-30  3:55 ` [PATCH net-next v2 19/22] bnxt_en: Do not send firmware messages if firmware is in error state Michael Chan
2019-08-30  3:55 ` [PATCH net-next v2 20/22] bnxt_en: Add RESET_FW state logic to bnxt_fw_reset_task() Michael Chan
2019-08-30  3:55 ` [PATCH net-next v2 21/22] bnxt_en: Add bnxt_fw_exception() to handle fatal firmware errors Michael Chan
2019-08-30  3:55 ` Michael Chan [this message]
2019-08-30 21:02 ` [PATCH net-next v2 00/22] bnxt_en: health and error recovery David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1567137305-5853-23-git-send-email-michael.chan@broadcom.com \
    --to=michael.chan@broadcom.com \
    --cc=davem@davemloft.net \
    --cc=jiri@mellanox.com \
    --cc=netdev@vger.kernel.org \
    --cc=ray.jui@broadcom.com \
    --cc=vasundhara-v.volam@broadcom.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).