linux-rdma.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Yishai Hadas <yishaih@nvidia.com>
To: <linux-rdma@vger.kernel.org>
Cc: <jgg@nvidia.com>, <yishaih@nvidia.com>, <maorg@nvidia.com>,
	<markzhang@nvidia.com>, <edwards@nvidia.com>
Subject: [PATCH rdma-core 13/27] mlx5: VFIO poll_health support
Date: Tue, 20 Jul 2021 11:16:33 +0300	[thread overview]
Message-ID: <20210720081647.1980-14-yishaih@nvidia.com> (raw)
In-Reply-To: <20210720081647.1980-1-yishaih@nvidia.com>

From: Mark Zhang <markzhang@nvidia.com>

Add firmware health polling support to the vfio driver.

A failed health check is not expected; we refer to it as a fatal error in
the firmware that should be avoided/fixed.

The health buffer check is triggered by the application upon its call to
mlx5dv_vfio_process_events().

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 providers/mlx5/mlx5_vfio.c | 168 +++++++++++++++++++++++++++++++++++++++++++++
 providers/mlx5/mlx5_vfio.h |  10 ++-
 2 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/providers/mlx5/mlx5_vfio.c b/providers/mlx5/mlx5_vfio.c
index 85ee25b..c37358c 100644
--- a/providers/mlx5/mlx5_vfio.c
+++ b/providers/mlx5/mlx5_vfio.c
@@ -22,6 +22,8 @@
 #include <poll.h>
 #include <util/mmio.h>
 
+#include <ccan/array_size.h>
+
 #include "mlx5dv.h"
 #include "mlx5_vfio.h"
 #include "mlx5.h"
@@ -1910,6 +1912,7 @@ enum mlx5_cmd_addr_l_sz_offset {
 
 enum {
 	MLX5_NIC_IFC_DISABLED = 1,
+	MLX5_NIC_IFC_SW_RESET = 7,
 };
 
 static uint8_t mlx5_vfio_get_nic_state(struct mlx5_vfio_context *ctx)
@@ -1978,6 +1981,169 @@ static int mlx5_vfio_teardown_hca(struct mlx5_vfio_context *ctx)
 	return mlx5_vfio_teardown_hca_regular(ctx);
 }
 
+static bool sensor_pci_not_working(struct mlx5_init_seg *init_seg)
+{
+	/* Offline PCI reads return 0xffffffff */
+	return (be32toh(mmio_read32_be(&init_seg->health.fw_ver)) == 0xffffffff);
+}
+
+enum mlx5_fatal_assert_bit_offsets {
+	MLX5_RFR_OFFSET = 31,
+};
+
+static bool sensor_fw_synd_rfr(struct mlx5_init_seg *init_seg)
+{
+	uint32_t rfr = be32toh(mmio_read32_be(&init_seg->health.rfr)) >> MLX5_RFR_OFFSET;
+	uint8_t synd = mmio_read8(&init_seg->health.synd);
+
+	return (rfr && synd);
+}
+
+enum  {
+	MLX5_SENSOR_NO_ERR = 0,
+	MLX5_SENSOR_PCI_COMM_ERR = 1,
+	MLX5_SENSOR_NIC_DISABLED = 3,
+	MLX5_SENSOR_NIC_SW_RESET = 4,
+	MLX5_SENSOR_FW_SYND_RFR = 5,
+};
+
+static uint32_t mlx5_health_check_fatal_sensors(struct mlx5_vfio_context *ctx)
+{
+	if (sensor_pci_not_working(ctx->bar_map))
+		return MLX5_SENSOR_PCI_COMM_ERR;
+
+	if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_DISABLED)
+		return MLX5_SENSOR_NIC_DISABLED;
+
+	if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_SW_RESET)
+		return MLX5_SENSOR_NIC_SW_RESET;
+
+	if (sensor_fw_synd_rfr(ctx->bar_map))
+		return MLX5_SENSOR_FW_SYND_RFR;
+
+	return MLX5_SENSOR_NO_ERR;
+}
+
+enum {
+	MLX5_HEALTH_SYNDR_FW_ERR = 0x1,
+	MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7,
+	MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8,
+	MLX5_HEALTH_SYNDR_CRC_ERR = 0x9,
+	MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa,
+	MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb,
+	MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc,
+	MLX5_HEALTH_SYNDR_EQ_ERR = 0xd,
+	MLX5_HEALTH_SYNDR_EQ_INV = 0xe,
+	MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf,
+	MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10,
+};
+
+static const char *hsynd_str(uint8_t synd)
+{
+	switch (synd) {
+	case MLX5_HEALTH_SYNDR_FW_ERR:
+		return "firmware internal error";
+	case MLX5_HEALTH_SYNDR_IRISC_ERR:
+		return "irisc not responding";
+	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
+		return "unrecoverable hardware error";
+	case MLX5_HEALTH_SYNDR_CRC_ERR:
+		return "firmware CRC error";
+	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
+		return "ICM fetch PCI error";
+	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
+		return "HW fatal error"; /* no '\n': the caller's format adds it */
+	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
+		return "async EQ buffer overrun";
+	case MLX5_HEALTH_SYNDR_EQ_ERR:
+		return "EQ error";
+	case MLX5_HEALTH_SYNDR_EQ_INV:
+		return "Invalid EQ referenced";
+	case MLX5_HEALTH_SYNDR_FFSER_ERR:
+		return "FFSER error";
+	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
+		return "High temperature";
+	default:
+		return "unrecognized error";
+	}
+}
+
+static void print_health_info(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_init_seg *iseg = ctx->bar_map;
+	struct health_buffer *h = &iseg->health;
+	char fw_str[18] = {};
+	int i;
+
+	/* If the syndrome is 0, the device is OK and no need to print buffer */
+	if (!mmio_read8(&h->synd))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
+		mlx5_err(ctx->dbg_fp, "assert_var[%d] 0x%08x\n",
+			 i, be32toh(mmio_read32_be(h->assert_var + i)));
+
+	mlx5_err(ctx->dbg_fp, "assert_exit_ptr 0x%08x\n",
+		 be32toh(mmio_read32_be(&h->assert_exit_ptr)));
+	mlx5_err(ctx->dbg_fp, "assert_callra 0x%08x\n",
+		 be32toh(mmio_read32_be(&h->assert_callra)));
+	sprintf(fw_str, "%d.%d.%d",
+		be32toh(mmio_read32_be(&iseg->fw_rev)) & 0xffff,
+		be32toh(mmio_read32_be(&iseg->fw_rev)) >> 16,
+		be32toh(mmio_read32_be(&iseg->cmdif_rev_fw_sub)) & 0xffff);
+	mlx5_err(ctx->dbg_fp, "fw_ver %s\n", fw_str);
+	mlx5_err(ctx->dbg_fp, "hw_id 0x%08x\n", be32toh(mmio_read32_be(&h->hw_id)));
+	mlx5_err(ctx->dbg_fp, "irisc_index %d\n", mmio_read8(&h->irisc_index));
+	mlx5_err(ctx->dbg_fp, "synd 0x%x: %s\n", mmio_read8(&h->synd),
+		 hsynd_str(mmio_read8(&h->synd)));
+	mlx5_err(ctx->dbg_fp, "ext_synd 0x%04x\n",
+		 be16toh(mmio_read16_be(&h->ext_synd)));
+	mlx5_err(ctx->dbg_fp, "raw fw_ver 0x%08x\n",
+		 be32toh(mmio_read32_be(&iseg->fw_rev)));
+}
+
+static void mlx5_vfio_poll_health(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_vfio_health_state *hstate = &ctx->health_state;
+	uint32_t fatal_error, count;
+	struct timeval tv;
+	uint64_t time;
+	int ret;
+
+	ret = gettimeofday(&tv, NULL);
+	if (ret)
+		return;
+
+	time = (uint64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000;
+	if (time - hstate->prev_time < POLL_HEALTH_INTERVAL)
+		return;
+
+	fatal_error = mlx5_health_check_fatal_sensors(ctx);
+	if (fatal_error) {
+		mlx5_err(ctx->dbg_fp, "%s: Fatal error %u detected\n",
+			 __func__, fatal_error);
+		goto err;
+	}
+	count = be32toh(mmio_read32_be(&ctx->bar_map->health_counter)) & 0xffffff;
+	if (count == hstate->prev_count)
+		++hstate->miss_counter;
+	else
+		hstate->miss_counter = 0;
+
+	hstate->prev_time = time;
+	hstate->prev_count = count;
+	if (hstate->miss_counter == MAX_MISSES) {
+		mlx5_err(ctx->dbg_fp,
+			 "device's health compromised - reached miss count\n");
+		goto err;
+	}
+
+	return;
+err:
+	print_health_info(ctx);
+	abort();
+}
+
 static int mlx5_vfio_setup_function(struct mlx5_vfio_context *ctx)
 {
 	int err;
@@ -2232,6 +2398,8 @@ int mlx5dv_vfio_process_events(struct ibv_context *ibctx)
 	uint64_t u;
 	ssize_t s;
 
+	mlx5_vfio_poll_health(ctx);
+
 	/* read to re-arm the FD and process all existing events */
 	s = read(ctx->cmd_comp_fd, &u, sizeof(uint64_t));
 	if (s < 0 && errno != EAGAIN) {
diff --git a/providers/mlx5/mlx5_vfio.h b/providers/mlx5/mlx5_vfio.h
index 8e240c8..296d6d1 100644
--- a/providers/mlx5/mlx5_vfio.h
+++ b/providers/mlx5/mlx5_vfio.h
@@ -240,6 +240,14 @@ struct mlx5_vfio_eqs_uar {
 	uint64_t iova;
 };
 
+#define POLL_HEALTH_INTERVAL 1000 /* ms */
+#define MAX_MISSES 3
+struct mlx5_vfio_health_state {
+	uint64_t prev_time; /* ms */
+	uint32_t prev_count;
+	uint32_t miss_counter;
+};
+
 struct mlx5_vfio_context {
 	struct verbs_context vctx;
 	int container_fd;
@@ -258,7 +266,7 @@ struct mlx5_vfio_context {
 		uint32_t hca_cur[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 		uint32_t hca_max[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 	} caps;
-
+	struct mlx5_vfio_health_state health_state;
 	struct mlx5_eq async_eq;
 	struct mlx5_vfio_eqs_uar eqs_uar;
 	pthread_mutex_t eq_lock;
-- 
1.8.3.1


  parent reply	other threads:[~2021-07-20  8:18 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-20  8:16 [PATCH rdma-core 00/27] Introduce mlx5 user space driver over VFIO Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 01/27] Update kernel headers Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 02/27] mlx5: Introduce mlx5dv_get_vfio_device_list() Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 03/27] mlx5: Enable debug functionality for vfio Yishai Hadas
2021-07-20  8:51   ` Leon Romanovsky
2021-07-20  9:27     ` Yishai Hadas
2021-07-20 12:27       ` Leon Romanovsky
2021-07-20 14:57         ` Yishai Hadas
2021-07-21  7:05           ` Gal Pressman
2021-07-21  7:58             ` Yishai Hadas
2021-07-21  8:51               ` Gal Pressman
2021-07-20  8:16 ` [PATCH rdma-core 04/27] util: Add interval_set support Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 05/27] verbs: Enable verbs_open_device() to work over non sysfs devices Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 06/27] mlx5: Setup mlx5 vfio context Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 07/27] mlx5: Add mlx5_vfio_cmd_exec() support Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 08/27] mlx5: vfio setup function support Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 09/27] mlx5: vfio setup basic caps Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 10/27] mlx5: Support fast teardown over vfio Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 11/27] mlx5: Enable interrupt command mode " Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 12/27] mlx5: Introduce vfio APIs to process events Yishai Hadas
2021-07-20  8:16 ` Yishai Hadas [this message]
2021-07-20  8:16 ` [PATCH rdma-core 14/27] mlx5: Implement basic verbs operation for PD and MR over vfio Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 15/27] mlx5: Set DV context ops Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 16/27] mlx5: Support initial DEVX/DV APIs over vfio Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 17/27] mlx5: Implement mlx5dv devx_obj " Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 18/27] pyverbs: Support DevX UMEM registration Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 19/27] pyverbs/mlx5: Support EQN querying Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 20/27] pyverbs/mlx5: Support more DevX objects Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 21/27] pyverbs: Add auxiliary memory functions Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 22/27] pyverbs/mlx5: Add support to extract mlx5dv objects Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 23/27] pyverbs/mlx5: Wrap mlx5_cqe64 struct and add enums Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 24/27] tests: Add MAC address to the tests' args Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 25/27] tests: Add mlx5 DevX data path test Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 26/27] pyverbs/mlx5: Support mlx5 devices over VFIO Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 27/27] tests: Add a test for mlx5 " Yishai Hadas
2021-08-01  8:00 ` [PATCH rdma-core 00/27] Introduce mlx5 user space driver " Yishai Hadas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210720081647.1980-14-yishaih@nvidia.com \
    --to=yishaih@nvidia.com \
    --cc=edwards@nvidia.com \
    --cc=jgg@nvidia.com \
    --cc=linux-rdma@vger.kernel.org \
    --cc=maorg@nvidia.com \
    --cc=markzhang@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).