netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Saeed Mahameed <saeedm@mellanox.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	Aya Levin <ayal@mellanox.com>, Tariq Toukan <tariqt@mellanox.com>,
	Jiri Pirko <jiri@mellanox.com>,
	Saeed Mahameed <saeedm@mellanox.com>
Subject: [net-next 13/16] net/mlx5e: Report and recover from CQE with error on RQ
Date: Thu, 15 Aug 2019 19:10:11 +0000	[thread overview]
Message-ID: <20190815190911.12050-14-saeedm@mellanox.com> (raw)
In-Reply-To: <20190815190911.12050-1-saeedm@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

Add support for reporting and recovering from a CQE with error on an RQ
by setting the queue back to the ready state. Handle only errors whose
syndrome indicates that the RQ might have entered the error state and
can be recovered.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 +
 .../ethernet/mellanox/mlx5/core/en/health.h   |  9 +++
 .../mellanox/mlx5/core/en/reporter_rx.c       | 66 +++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  9 +++
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 11 ++++
 5 files changed, 98 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 5f2a1d14de68..822f7b620640 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -295,6 +295,7 @@ struct mlx5e_dcbx_dp {
 
 enum {
 	MLX5E_RQ_STATE_ENABLED,
+	MLX5E_RQ_STATE_RECOVERING,
 	MLX5E_RQ_STATE_AM,
 	MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
 	MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */
@@ -667,6 +668,8 @@ struct mlx5e_rq {
 	struct zero_copy_allocator zca;
 	struct xdp_umem       *umem;
 
+	struct work_struct     recover_work;
+
 	/* control */
 	struct mlx5_wq_ctrl    wq_ctrl;
 	__be32                 mkey_be;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
index 52e9ca37cf46..d3693fa547ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
@@ -8,6 +8,14 @@
 
 #define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)
 
+static inline bool cqe_syndrome_needs_recover(u8 syndrome)
+{
+	return syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR ||
+	       syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR ||
+	       syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR ||
+	       syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+}
+
 int mlx5e_reporter_tx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq);
@@ -21,6 +29,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg);
 int mlx5e_reporter_rx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq);
 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq);
 
 #define MLX5E_REPORTER_PER_Q_MAX_LEN 256
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index 4f5547ac4bee..b4f7e535dbc7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -111,6 +111,72 @@ void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
 	mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
 }
 
+static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
+{
+	struct net_device *dev = rq->netdev;
+	int err;
+
+	err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
+	if (err) {
+		netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
+		return err;
+	}
+	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err) {
+		netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
+		return err;
+	}
+
+	return 0;
+}
+
+static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx)
+{
+	struct mlx5e_rq *rq = ctx;
+	struct mlx5_core_dev *mdev = rq->mdev;
+	struct net_device *dev = rq->netdev;
+	u8 state;
+	int err;
+
+	err = mlx5e_query_rq_state(mdev, rq->rqn, &state);
+	if (err) {
+		netdev_err(dev, "Failed to query RQ 0x%x state. err = %d\n",
+			   rq->rqn, err);
+		goto out;
+	}
+
+	if (state != MLX5_RQC_STATE_ERR)
+		goto out;
+
+	mlx5e_deactivate_rq(rq);
+	mlx5e_free_rx_descs(rq);
+
+	err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR);
+	if (err)
+		goto out;
+
+	clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+	mlx5e_activate_rq(rq);
+	rq->stats->recover++;
+	return 0;
+out:
+	clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+	return err;
+}
+
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq)
+{
+	struct mlx5e_priv *priv = rq->channel->priv;
+	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
+	struct mlx5e_err_ctx err_ctx = {};
+
+	err_ctx.ctx = rq;
+	err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover;
+	sprintf(err_str, "ERR CQE on RQ: 0x%x", rq->rqn);
+
+	mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
+}
+
 static int mlx5e_rx_reporter_timeout_recover(void *ctx)
 {
 	struct mlx5e_rq *rq = ctx;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 76845bafd708..77f0c8fad9df 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -362,6 +362,13 @@ static void mlx5e_free_di_list(struct mlx5e_rq *rq)
 	kvfree(rq->wqe.di);
 }
 
+static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work)
+{
+	struct mlx5e_rq *rq = container_of(recover_work, struct mlx5e_rq, recover_work);
+
+	mlx5e_reporter_rq_cqe_err(rq);
+}
+
 static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_params *params,
 			  struct mlx5e_xsk_param *xsk,
@@ -398,6 +405,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->stats = &c->priv->channel_stats[c->ix].xskrq;
 	else
 		rq->stats = &c->priv->channel_stats[c->ix].rq;
+	INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);
 
 	rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
 	if (IS_ERR(rq->xdp_prog)) {
@@ -907,6 +915,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq)
 {
 	cancel_work_sync(&rq->dim.work);
 	cancel_work_sync(&rq->channel->icosq.recover_work);
+	cancel_work_sync(&rq->recover_work);
 	mlx5e_destroy_rq(rq);
 	mlx5e_free_rx_descs(rq);
 	mlx5e_free_rq(rq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 43d790b7d4ec..2fd2760d0bb7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1130,6 +1130,15 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 	return skb;
 }
 
+static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
+
+	if (cqe_syndrome_needs_recover(err_cqe->syndrome) &&
+	    !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state))
+		queue_work(rq->channel->priv->wq, &rq->recover_work);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1143,6 +1152,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+		trigger_report(rq, cqe);
 		rq->stats->wqe_err++;
 		goto free_wqe;
 	}
@@ -1328,6 +1338,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi->consumed_strides += cstrides;
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+		trigger_report(rq, cqe);
 		rq->stats->wqe_err++;
 		goto mpwrq_cqe_out;
 	}
-- 
2.21.0


  parent reply	other threads:[~2019-08-15 19:10 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-15 19:09 [pull request][net-next 00/16] Mellanox, mlx5 devlink RX health reporters Saeed Mahameed
2019-08-15 19:09 ` [net-next 01/16] net/mlx5e: Rename reporter header file Saeed Mahameed
2019-08-15 19:09 ` [net-next 02/16] net/mlx5e: Change naming convention for reporter's functions Saeed Mahameed
2019-08-15 19:09 ` [net-next 03/16] net/mlx5e: Generalize tx reporter's functionality Saeed Mahameed
2019-08-15 19:09 ` [net-next 04/16] net/mlx5e: Extend tx diagnose function Saeed Mahameed
2019-08-15 19:09 ` [net-next 05/16] net/mlx5e: Extend tx reporter diagnostics output Saeed Mahameed
2019-08-15 19:09 ` [net-next 06/16] net/mlx5e: Add cq info to tx reporter diagnose Saeed Mahameed
2019-08-15 19:10 ` [net-next 07/16] net/mlx5e: Add helper functions for reporter's basics Saeed Mahameed
2019-08-15 19:10 ` [net-next 08/16] net/mlx5e: Add support to rx reporter diagnose Saeed Mahameed
2019-08-15 19:10 ` [net-next 09/16] net/mlx5e: Split open/close ICOSQ into stages Saeed Mahameed
2019-08-15 19:10 ` [net-next 10/16] net/mlx5e: Report and recover from CQE error on ICOSQ Saeed Mahameed
2019-08-15 19:10 ` [net-next 11/16] net/mlx5e: Report and recover from rx timeout Saeed Mahameed
2019-08-17 19:48   ` David Miller
2019-08-15 19:10 ` [net-next 12/16] net/mlx5e: RX, Handle CQE with error at the earliest stage Saeed Mahameed
2019-08-15 19:10 ` Saeed Mahameed [this message]
2019-08-15 19:10 ` [net-next 14/16] Documentation: net: mlx5: Devlink health documentation updates Saeed Mahameed
2019-08-15 19:10 ` [net-next 15/16] net/mlx5e: Fix deallocation of non-fully init encap entries Saeed Mahameed
2019-08-15 19:10 ` [net-next 16/16] net/mlx5: Fix the order of fc_stats cleanup Saeed Mahameed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190815190911.12050-14-saeedm@mellanox.com \
    --to=saeedm@mellanox.com \
    --cc=ayal@mellanox.com \
    --cc=davem@davemloft.net \
    --cc=jiri@mellanox.com \
    --cc=netdev@vger.kernel.org \
    --cc=tariqt@mellanox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).