netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Saeed Mahameed <saeedm@mellanox.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	Jiri Pirko <jiri@mellanox.com>,
	Moshe Shemesh <moshe@mellanox.com>,
	Eran Ben Elisha <eranbe@mellanox.com>,
	Saeed Mahameed <saeedm@mellanox.com>
Subject: [net-next v2 11/15] net/mlx5: Report devlink health on FW issues
Date: Thu, 13 Jun 2019 20:39:36 +0000	[thread overview]
Message-ID: <20190613203825.31049-12-saeedm@mellanox.com> (raw)
In-Reply-To: <20190613203825.31049-1-saeedm@mellanox.com>

From: Moshe Shemesh <moshe@mellanox.com>

Use devlink_health_report() to report any symptom of a FW issue, such as
a FW counter miss or a new health syndrome.
The FW issues are detected in mlx5 during poll_health(), which is called
in timer atomic context, so the health work queue is used to schedule
the reports.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/health.c  | 33 +++++++++++++++++++
 include/linux/mlx5/driver.h                   |  3 +-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 1c20d3f1d238..5e876f1de114 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
 	return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
 }
 
+static void mlx5_fw_reporter_err_work(struct work_struct *work)
+{
+	struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+	struct mlx5_core_health *health;
+
+	health = container_of(work, struct mlx5_core_health, report_work);
+
+	if (IS_ERR_OR_NULL(health->fw_reporter))
+		return;
+
+	fw_reporter_ctx.err_synd = health->synd;
+	fw_reporter_ctx.miss_counter = health->miss_counter;
+	if (fw_reporter_ctx.err_synd) {
+		devlink_health_report(health->fw_reporter,
+				      "FW syndrom reported", &fw_reporter_ctx);
+		return;
+	}
+	if (fw_reporter_ctx.miss_counter)
+		devlink_health_report(health->fw_reporter,
+				      "FW miss counter reported",
+				      &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.name = "fw",
 		.diagnose = mlx5_fw_reporter_diagnose,
@@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t)
 {
 	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
 	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
 	u32 fatal_error;
+	u8 prev_synd;
 	u32 count;
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
@@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t)
 	if (health->miss_counter == MAX_MISSES) {
 		mlx5_core_err(dev, "device's health compromised - reached miss count\n");
 		print_health_info(dev);
+		queue_work(health->wq, &health->report_work);
 	}
 
+	prev_synd = health->synd;
+	health->synd = ioread8(&h->synd);
+	if (health->synd && health->synd != prev_synd)
+		queue_work(health->wq, &health->report_work);
+
 	fatal_error = check_fatal_sensors(dev);
 
 	if (fatal_error && !health->fatal_error) {
@@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 	spin_lock_irqsave(&health->wq_lock, flags);
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
+	cancel_work_sync(&health->report_work);
 	cancel_work_sync(&health->work);
 }
 
@@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
 	mlx5_fw_reporter_create(dev);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8d5d065d1aa6..1931a4080d78 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,7 +435,7 @@ struct mlx5_core_health {
 	struct timer_list		timer;
 	u32				prev;
 	int				miss_counter;
-	bool				sick;
+	u8				synd;
 	u32				fatal_error;
 	u32				crdump_size;
 	/* wq spinlock to synchronize draining */
@@ -443,6 +443,7 @@ struct mlx5_core_health {
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
 	struct work_struct		work;
+	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
 };
-- 
2.21.0


  parent reply	other threads:[~2019-06-13 20:40 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-13 20:39 [pull request][net-next v2 00/15] Mellanox, mlx5 Firmware devlink health and sw reset Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 01/15] devlink: Hang reporter's dump method on a dumpit cb Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 02/15] Documentation: net: mlx5: Add mlx5 initial documentation Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 03/15] net/mlx5: Move all devlink related functions calls to devlink.c Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 04/15] net/mlx5: Add Vendor Specific Capability access gateway Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 05/15] net/mlx5: Add Crdump support Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 06/15] net/mlx5: Handle SW reset of FW in error flow Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 07/15] net/mlx5: Control CR-space access by different PFs Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 08/15] net/mlx5: Issue SW reset on FW assert Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 09/15] net/mlx5: Create FW devlink_health_reporter Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 10/15] net/mlx5: Add support for FW reporter dump Saeed Mahameed
2019-06-13 20:39 ` Saeed Mahameed [this message]
2019-06-13 20:39 ` [net-next v2 12/15] net/mlx5: Add fw fatal devlink_health_reporter Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 13/15] net/mlx5: Add support for FW fatal reporter dump Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 14/15] net/mlx5: Report devlink health on FW fatal issues Saeed Mahameed
2019-06-13 20:39 ` [net-next v2 15/15] Documentation: net: mlx5: Devlink health documentation Saeed Mahameed
2019-06-15  2:46 ` [pull request][net-next v2 00/15] Mellanox, mlx5 Firmware devlink health and sw reset David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190613203825.31049-12-saeedm@mellanox.com \
    --to=saeedm@mellanox.com \
    --cc=davem@davemloft.net \
    --cc=eranbe@mellanox.com \
    --cc=jiri@mellanox.com \
    --cc=moshe@mellanox.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).