All of lore.kernel.org
 help / color / mirror / Atom feed
From: Leon Romanovsky <leon@kernel.org>
To: Jason Gunthorpe <jgg@nvidia.com>
Cc: Patrisious Haddad <phaddad@nvidia.com>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>,
	linux-rdma@vger.kernel.org, Mark Zhang <markzhang@nvidia.com>,
	netdev@vger.kernel.org, Paolo Abeni <pabeni@redhat.com>,
	Saeed Mahameed <saeedm@nvidia.com>,
	Tariq Toukan <tariqt@nvidia.com>,
	Yishai Hadas <yishaih@nvidia.com>
Subject: [PATCH rdma-next v1 3/3] RDMA/mlx5: Print error syndrome in case of fatal QP errors
Date: Wed,  4 Jan 2023 11:43:36 +0200	[thread overview]
Message-ID: <edc794f622a33e4ee12d7f5d218d1a59aa7c6af5.1672821186.git.leonro@nvidia.com> (raw)
In-Reply-To: <cover.1672821186.git.leonro@nvidia.com>

From: Patrisious Haddad <phaddad@nvidia.com>

Print syndromes in case of fatal QP events. This is helpful for upper
level debugging, as there maybe no CQEs.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/qp.c  | 45 +++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx5/qp.h  |  2 +-
 drivers/infiniband/hw/mlx5/qpc.c |  4 ++-
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index b22d95b448ac..1bf15040d35a 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -310,6 +310,44 @@ int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
 	return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
 }
 
+static void mlx5_ib_qp_err_syndrome(struct ib_qp *ibqp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	int outlen = MLX5_ST_SZ_BYTES(query_qp_out);
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	void *pas_ext_union, *err_syn;
+	u32 *outb;
+	int err;
+
+	if (!MLX5_CAP_GEN(dev->mdev, qpc_extension) ||
+	    !MLX5_CAP_GEN(dev->mdev, qp_error_syndrome))
+		return;
+
+	outb = kzalloc(outlen, GFP_KERNEL);
+	if (!outb)
+		return;
+
+	err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen,
+				 true);
+	if (err)
+		goto out;
+
+	pas_ext_union =
+		MLX5_ADDR_OF(query_qp_out, outb, qp_pas_or_qpc_ext_and_pas);
+	err_syn = MLX5_ADDR_OF(qpc_extension_and_pas_list_in, pas_ext_union,
+			       qpc_data_extension.error_syndrome);
+
+	pr_err("%s/%d: QP %d error: %s (0x%x 0x%x 0x%x)\n",
+	       ibqp->device->name, ibqp->port, ibqp->qp_num,
+	       ib_wc_status_msg(
+		       MLX5_GET(cqe_error_syndrome, err_syn, syndrome)),
+	       MLX5_GET(cqe_error_syndrome, err_syn, vendor_error_syndrome),
+	       MLX5_GET(cqe_error_syndrome, err_syn, hw_syndrome_type),
+	       MLX5_GET(cqe_error_syndrome, err_syn, hw_error_syndrome));
+out:
+	kfree(outb);
+}
+
 static void mlx5_ib_handle_qp_event(struct work_struct *_work)
 {
 	struct mlx5_ib_qp_event_work *qpe_work =
@@ -350,6 +388,10 @@ static void mlx5_ib_handle_qp_event(struct work_struct *_work)
 		goto out;
 	}
 
+	if ((event.event == IB_EVENT_QP_FATAL) ||
+	    (event.event == IB_EVENT_QP_ACCESS_ERR))
+		mlx5_ib_qp_err_syndrome(ibqp);
+
 	ibqp->event_handler(&event, ibqp->qp_context);
 
 out:
@@ -4843,7 +4885,8 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	if (!outb)
 		return -ENOMEM;
 
-	err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen);
+	err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen,
+				 false);
 	if (err)
 		goto out;
 
diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
index fb2f4e030bb8..77f9b4a54816 100644
--- a/drivers/infiniband/hw/mlx5/qp.h
+++ b/drivers/infiniband/hw/mlx5/qp.h
@@ -20,7 +20,7 @@ int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask,
 int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp);
 int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct);
 int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
-		       u32 *out, int outlen);
+		       u32 *out, int outlen, bool qpc_ext);
 int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
 			u32 *out, int outlen);
 
diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index 0d51b4c06c06..eeee18af36ed 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -507,12 +507,14 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev)
 }
 
 int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
-		       u32 *out, int outlen)
+		       u32 *out, int outlen, bool qpc_ext)
 {
 	u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {};
 
 	MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP);
 	MLX5_SET(query_qp_in, in, qpn, qp->qpn);
+	MLX5_SET(query_qp_in, in, qpc_ext, qpc_ext);
+
 	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, outlen);
 }
 
-- 
2.38.1


  parent reply	other threads:[~2023-01-04  9:44 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-04  9:43 [PATCH rdma-next v1 0/3] Provide more error details when a QP moves to error state Leon Romanovsky
2023-01-04  9:43 ` [PATCH rdma-next v1 1/3] RDMA/mlx: Calling qp event handler in workqueue context Leon Romanovsky
2023-01-04  9:43 ` [PATCH mlx5-next v1 2/3] net/mlx5: Introduce CQE error syndrome Leon Romanovsky
2023-01-06  0:31   ` Saeed Mahameed
2023-01-04  9:43 ` Leon Romanovsky [this message]
2023-01-06  0:31   ` [PATCH rdma-next v1 3/3] RDMA/mlx5: Print error syndrome in case of fatal QP errors Saeed Mahameed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=edc794f622a33e4ee12d7f5d218d1a59aa7c6af5.1672821186.git.leonro@nvidia.com \
    --to=leon@kernel.org \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=jgg@nvidia.com \
    --cc=kuba@kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=markzhang@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=tariqt@nvidia.com \
    --cc=yishaih@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.