All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eran Ben Elisha <eranbe@mellanox.com>
To: netdev@vger.kernel.org, "David S. Miller" <davem@davemloft.net>,
	Jiri Pirko <jiri@mellanox.com>
Cc: Moshe Shemesh <moshe@mellanox.com>, Aya Levin <ayal@mellanox.com>,
	Eran Ben Elisha <eranbe@mellanox.com>,
	Tal Alon <talal@mellanox.com>, Ariel Almog <ariela@mellanox.com>
Subject: [PATCH RFC net-next 16/19] net/mlx5: Report devlink health on FW issues
Date: Mon, 31 Dec 2018 16:32:10 +0200	[thread overview]
Message-ID: <1546266733-9512-17-git-send-email-eranbe@mellanox.com> (raw)
In-Reply-To: <1546266733-9512-1-git-send-email-eranbe@mellanox.com>

From: Moshe Shemesh <moshe@mellanox.com>

Use devlink_health_report() to report any symptom of FW issue as FW
counter miss or new health syndrom.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/devlink.c | 21 +++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/devlink.h |  1 +
 .../net/ethernet/mellanox/mlx5/core/health.c  | 10 +++++++++
 include/linux/mlx5/driver.h                   |  2 ++
 4 files changed, 34 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 07bc473a8ebb..04c904214e4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -205,6 +205,27 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
 	return 0;
 }
 
+void mlx5_fw_reporter_err_work(struct work_struct *work)
+{
+	struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+
+	health = container_of(work, struct mlx5_core_health, report_work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	fw_reporter_ctx.err_synd = health->synd;
+	fw_reporter_ctx.miss_counter = health->miss_counter;
+	if (fw_reporter_ctx.err_synd)
+		devlink_health_report(dev->fw_reporter, "FW syndrom reported",
+				      &fw_reporter_ctx);
+	else if (fw_reporter_ctx.miss_counter)
+		devlink_health_report(dev->fw_reporter, "FW miss counter reported",
+				      &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.name = "FW",
 		.objdump_size = SAVED_TRACES_BUFFER_SIZE_BYTE,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 34f6bfed1cfb..082a648a3af3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -16,5 +16,6 @@ int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
 void mlx5_devlink_unregister(struct devlink *devlink);
 int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
 void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+void mlx5_fw_reporter_err_work(struct work_struct *work);
 
 #endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2fa78edde1fe..4d0ad792b226 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -38,6 +38,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
+#include "devlink.h"
 #include "lib/eq.h"
 #include "lib/mlx5.h"
 
@@ -289,7 +290,9 @@ static void poll_health(struct timer_list *t)
 {
 	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
 	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
 	u32 count;
+	u8 synd;
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		goto out;
@@ -304,8 +307,13 @@ static void poll_health(struct timer_list *t)
 	if (health->miss_counter == MAX_MISSES) {
 		dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
 		mlx5_print_health_info(dev);
+		queue_work(health->wq, &health->report_work);
 	}
 
+	synd = ioread8(&h->synd);
+	if (synd && synd != health->synd)
+		queue_work(health->wq, &health->report_work);
+
 	if (in_fatal(dev) && !health->sick) {
 		health->sick = true;
 		mlx5_print_health_info(dev);
@@ -356,6 +364,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
 	cancel_delayed_work_sync(&health->recover_work);
+	cancel_work_sync(&health->report_work);
 	cancel_work_sync(&health->work);
 }
 
@@ -395,6 +404,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 	INIT_DELAYED_WORK(&health->recover_work, health_recover);
 
 	return 0;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bedc9bc08963..b2dc32b553b4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -432,12 +432,14 @@ struct mlx5_core_health {
 	struct timer_list		timer;
 	u32				prev;
 	int				miss_counter;
+	u8				synd;
 	bool				sick;
 	/* wq spinlock to synchronize draining */
 	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
 	struct work_struct		work;
+	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 };
 
-- 
2.17.1

  parent reply	other threads:[~2018-12-31 14:32 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-31 14:31 [PATCH RFC net-next 00/19] Devlink health reporting and recovery system Eran Ben Elisha
2018-12-31 14:31 ` [PATCH RFC net-next 01/19] devlink: Add health buffer support Eran Ben Elisha
2018-12-31 14:31 ` [PATCH RFC net-next 02/19] devlink: Add health reporter create/destroy functionality Eran Ben Elisha
2018-12-31 14:31 ` [PATCH RFC net-next 03/19] devlink: Add health report functionality Eran Ben Elisha
2018-12-31 14:31 ` [PATCH RFC net-next 04/19] devlink: Add health get command Eran Ben Elisha
2018-12-31 14:31 ` [PATCH RFC net-next 05/19] devlink: Add health set command Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 06/19] devlink: Add health recover command Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 07/19] devlink: Add health diagnose command Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 08/19] devlink: Add health objdump {get,clear} commands Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 09/19] net/mlx5e: Add TX reporter support Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 10/19] net/mlx5e: Add TX timeout support for mlx5e TX reporter Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 11/19] net/mlx5: Move all devlink related functions calls to devlink.c Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 12/19] net/mlx5: Refactor print health info Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 13/19] net/mlx5: Create FW devlink_health_reporter Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 14/19] net/mlx5: Add core dump register access functions Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 15/19] net/mlx5: Add support for FW reporter objdump Eran Ben Elisha
2018-12-31 14:32 ` Eran Ben Elisha [this message]
2018-12-31 14:32 ` [PATCH RFC net-next 17/19] net/mlx5: Add FW fatal devlink_health_reporter Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 18/19] net/mlx5: Report devlink health on FW fatal issues Eran Ben Elisha
2018-12-31 14:32 ` [PATCH RFC net-next 19/19] devlink: Add Documentation/networking/devlink-health.txt Eran Ben Elisha
2019-01-01  1:47   ` Jakub Kicinski
2019-01-01 10:01     ` Eran Ben Elisha
2018-12-31 14:41 ` [PATCH RFC iproute2-next] devlink: Add health command support Aya Levin
2019-01-01  1:47 ` [PATCH RFC net-next 00/19] Devlink health reporting and recovery system Jakub Kicinski
2019-01-01  9:58   ` Eran Ben Elisha
2019-01-02 22:46     ` Jakub Kicinski
2019-01-03 13:31       ` Eran Ben Elisha
2019-01-03 22:28         ` Jakub Kicinski
2019-01-04  9:12           ` Jiri Pirko
2019-01-04  9:01         ` Jiri Pirko
2019-01-04  8:57   ` Jiri Pirko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1546266733-9512-17-git-send-email-eranbe@mellanox.com \
    --to=eranbe@mellanox.com \
    --cc=ariela@mellanox.com \
    --cc=ayal@mellanox.com \
    --cc=davem@davemloft.net \
    --cc=jiri@mellanox.com \
    --cc=moshe@mellanox.com \
    --cc=netdev@vger.kernel.org \
    --cc=talal@mellanox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.