netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jiri Pirko <jiri@resnulli.us>
To: Aya Levin <ayal@mellanox.com>
Cc: netdev@vger.kernel.org, Eran Ben Elisha <eranbe@mellanox.com>,
	Saeed Mahameed <saeedm@mellanox.com>,
	Jiri Pirko <jiri@mellanox.com>
Subject: Re: [PATCH net-next RFC] Dump SW SQ context as part of tx reporter
Date: Tue, 7 May 2019 14:41:29 +0200	[thread overview]
Message-ID: <20190507124129.GC2157@nanopsycho> (raw)
In-Reply-To: <1556547459-7756-1-git-send-email-ayal@mellanox.com>

Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>TX reporter reports an error on two scenarios:
>- TX timeout on a specific tx queue
>- TX completion error on a specific send queue
>Prior to this patch, no dump data was supported by the tx reporter. This
>patch adds support for SW data dump of the related SQ context. The dump
>is simply the SQ's raw memory snapshot taken right after the error was
>reported, before any recovery procedure was launched. With this
>approach, no maintenance is needed, as the driver fetches the actual data
>according to the layout with which the SQ was compiled. By providing
>a SW context, one can easily debug an error on a given SQ.
>
>In order to translate the raw memory offline into a human-readable
>format, the user can use an out-of-kernel script which receives the
>following as input:
>- Object raw memory
>- Driver object compiled with debug info (can be taken/generated at any time from the machine)
>- Object name
>
>An example of such a script's output can be seen below.
>Note: the script is not offered as part of this patch, as it does not
>belong in the kernel; I only describe it here to convey the general
>idea of how/what can be fetched from the SW dump via devlink health.
>
>The output of the SW dump can be extracted by devlink health command:
>$ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
> mlx5e_txqsq: sqn: 6336
> memory:
>   00 00 00 00 00 00 00 00
>   01 00 00 00 00 00 00 00
>   00 00 00 00 00 00 00 00
>   45 f4 88 cb 09 00 00 00
>   00 00 00 00 00 00 00 00
>   00 00 00 00 00 00 00 00
>   c0 ff ff ff 1f 00 00 00
>   f8 18 1e 89 81 88 ff ff
>   ...
>
>script output below, with struct members names and actual values:
>
>struct  mlx5e_txqsq {
>	short unsigned int         cc 	 0x5 ;
>	unsigned int               dma_fifo_cc 	 0x5 ;
>	struct  net_dim {
>		unsigned char      state 	 0x1 ;
>		struct  net_dim_stats {
>			int        ppms 	 0x0 ;
>			int        bpms 	 0x0 ;
>			int        epms 	 0x0 ;
>		} prev_stats;
>		struct  net_dim_sample {
>			long long int time 	 0x90766ef9d ;
>			unsigned int pkt_ctr 	 0x0 ;
>			unsigned int byte_ctr 	 0x0 ;
>			short unsigned int event_ctr 	 0x0 ;
>		} start_sample;
>		struct  work_struct {
>			struct   {
>				long int counter 	 0x1fffffffc0 ;
>			} data;
>			struct  list_head {
>				struct list_head * next 	 0xffff8881b08998f8 ;
>				struct list_head * prev 	 0xffff8881b08998f8 ;
>			} entry;
>			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>		} work;
>		unsigned char      profile_ix 	 0x60 ;
>		unsigned char      mode 	 0x72 ;
>		unsigned char      tune_state 	 0x35 ;
>		unsigned char      steps_right 	 0xa0 ;
>		unsigned char      steps_left 	 0xff ;
>		unsigned char      tired 	 0xff ;
>	} dim;
>	short unsigned int         pc 	 0x0 ;
>	unsigned int               dma_fifo_pc 	 0x0 ;
>	struct  mlx5e_cq {
>		struct  mlx5_cqwq {
>			struct  mlx5_frag_buf_ctrl {
>				struct mlx5_buf_list * frags 	 0x500000005 ;
>				unsigned int sz_m1 	 0x0 ;
>				short unsigned int frag_sz_m1 	 0x0 ;
>				short unsigned int strides_offset 	 0x0 ;
>				unsigned char log_sz 	 0x0 ;
>				unsigned char log_stride 	 0x0 ;
>				unsigned char log_frag_strides 	 0x0 ;
>			} fbc;
>			__be32 *   db 	 0x0 ;
>			unsigned int cc 	 0x0 ;
>		} wq;
>		short unsigned int event_ctr 	 0x0 ;
>		struct napi_struct * napi 	 0x0 ;
>		struct  mlx5_core_cq {
>			unsigned int cqn 	 0x0 ;
>			int        cqe_sz 	 0x0 ;
>			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>			__be32 *   arm_db 	 0x3f000003ff ;
>			struct mlx5_uars_page * uar 	 0x6060a ;
>			struct  refcount_struct {
>				struct   {
>					int    counter 	 0xa1814500 ;
>				} refs;
>			} refcount;
>			struct  completion {
>				unsigned int done 	 0x5 ;
>				struct  wait_queue_head {
>					struct  spinlock {
>						union   {
>							struct  raw_spinlock {
>								struct  qspinlock {
>									union   {
>										struct   {
>											int                                                    counter 	 0x5 ;
>										} val;
>										struct   {
>											unsigned char                                          locked 	 0x5 ;
>											unsigned char                                          pending 	 0x0 ;
>										} ;
>										struct   {
>											short unsigned int                                     locked_pending 	 0x5 ;
>											short unsigned int                                     tail 	 0x0 ;
>										} ;
>									} ;
>								} raw_lock;
>							} rlock;
>						} ;
>					} lock;
>					struct  list_head {
>						struct list_head * next 	 0xffff8881b089bb88 ;
>						struct list_head * prev 	 0x4000000c0a ;
>					} head;
>				} wait;
>			} free;
>			unsigned int vector 	 0xa1814500 ;
>			unsigned int irqn 	 0xffff8881 ;
>			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>			unsigned int cons_index 	 0x1 ;
>			unsigned int arm_sn 	 0x0 ;
>			struct mlx5_rsc_debug * dbg 	 0x0 ;
>			int        pid 	 0x0 ;
>			struct   {
>				struct  list_head {
>					struct list_head * next 	 0xffffffff ;
>					struct list_head * prev 	 0xffffffffffffffff ;
>				} list;
>				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>				void * priv 	 0x0 ;
>			} tasklet_ctx;
>			int        reset_notify_added 	 0x0 ;
>			struct  list_head {
>				struct list_head * next 	 0xffffffffa0300700 ;
>				struct list_head * prev 	 0xd ;
>			} reset_notify;
>			struct mlx5_eq_comp * eq 	 0x0 ;
>			short unsigned int uid 	 0x9a70 ;
>		} mcq;
>		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>		struct  mlx5_wq_ctrl {
>			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>			struct  mlx5_frag_buf {
>				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>				int npages 	 0x0 ;
>				int size 	 0x5 ;
>				unsigned char page_shift 	 0x8 ;
>			} buf;
>			struct  mlx5_db {
>				__be32 * db 	 0x1c6 ;
>				union   {
>					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>				} u;
>				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>				int index 	 0x0 ;
>			} db;
>		} wq_ctrl;
>	} cq;
>	struct  mlx5_wq_cyc {
>		struct  mlx5_frag_buf_ctrl {
>			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>			unsigned int sz_m1 	 0xa7600160 ;
>			short unsigned int frag_sz_m1 	 0x8881 ;
>			short unsigned int strides_offset 	 0xffff ;
>			unsigned char log_sz 	 0x88 ;
>			unsigned char log_stride 	 0x49 ;
>			unsigned char log_frag_strides 	 0xaa ;
>		} fbc;
>		__be32 *           db 	 0x1000000000010 ;
>		short unsigned int sz 	 0xc ;
>		short unsigned int wqe_ctr 	 0x0 ;
>		short unsigned int cur_sz 	 0x0 ;
>	} wq;
>	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>	struct   {
>		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>	} db;
>	void *                     uar_map 	 0x0 ;
>	struct netdev_queue *      txq 	 0x0 ;
>	unsigned int               sqn 	 0x18c0 ;
>	unsigned char              min_inline_mode 	 0x0 ;
>	struct device *            pdev 	 0x0 ;
>	unsigned int               mkey_be 	 0x0 ;
>	long unsigned int          state 	 0x0 ;
>	struct hwtstamp_config *   tstamp 	 0x0 ;
>	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>	struct  mlx5_wq_ctrl {
>		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>		struct  mlx5_frag_buf {
>			struct mlx5_buf_list * frags 	 0x6060a ;
>			int        npages 	 0xa1814604 ;
>			int        size 	 0xffff8881 ;
>			unsigned char page_shift 	 0x0 ;
>		} buf;
>		struct  mlx5_db {
>			__be32 *   db 	 0xfff ;
>			union   {
>				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>			} u;
>			long long unsigned int dma 	 0xffff888188440000 ;
>			int        index 	 0x8b074000 ;
>		} db;
>	} wq_ctrl;
>	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>	int                        txq_ix 	 0xa0020180 ;
>	unsigned int               rate_limit 	 0xffff8881 ;
>	struct  work_struct {
>		struct   {
>			long int   counter 	 0x1000018c0 ;
>		} data;
>		struct  list_head {
>			struct list_head * next 	 0xffff8881c32b68e8 ;
>			struct list_head * prev 	 0x800 ;
>		} entry;
>		void               (*func)(struct work_struct *) 	 0x9 ;
>	} recover_work;
>} ;

I don't get it. You are dumping live kernel memory? There are already
facilities in place to do that. Why replicate it?


>
>Signed-off-by: Aya Levin <ayal@mellanox.com>
>---
> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
> 1 file changed, 100 insertions(+)
>
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>index 476dd97f7f2f..8a39f5525e57 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>@@ -9,6 +9,7 @@
> 
> struct mlx5e_tx_err_ctx {
> 	int (*recover)(struct mlx5e_txqsq *sq);
>+	int (*dump)(struct mlx5e_txqsq *sq);
> 	struct mlx5e_txqsq *sq;
> };
> 
>@@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
> 	return err;
> }
> 
>+static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>+					      struct mlx5e_txqsq *sq,
>+					      struct devlink_fmsg *fmsg)
>+{
>+	u64 *ptr = (u64 *)sq;
>+	int copy, err;
>+	int i = 0;
>+
>+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>+		return 0;
>+
>+	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_obj_nest_start(fmsg);
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>+	if (err)
>+		return err;
>+
>+	while (i < sizeof(struct mlx5e_txqsq)) {
>+		copy = sizeof(u64);
>+
>+		if (i + copy > sizeof(struct mlx5e_txqsq))
>+			copy = sizeof(struct mlx5e_txqsq) - i;
>+
>+		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>+		if (err)
>+			return err;
>+		ptr++;
>+		i += copy;
>+	}
>+
>+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_obj_nest_end(fmsg);
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_pair_nest_end(fmsg);
>+
>+	return err;
>+}
>+
>+static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>+					 struct devlink_fmsg *fmsg)
>+{
>+	int i, err = 0;
>+
>+	mutex_lock(&priv->state_lock);
>+
>+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>+		goto unlock;
>+
>+	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>+	if (err)
>+		goto unlock;
>+
>+	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>+	     i++) {
>+		err = devlink_fmsg_obj_nest_start(fmsg);
>+		if (err)
>+			goto unlock;
>+
>+		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>+							 fmsg);
>+		if (err)
>+			goto unlock;
>+
>+		err = devlink_fmsg_pair_nest_end(fmsg);
>+		if (err)
>+			goto unlock;
>+	}
>+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>+	if (err)
>+		goto unlock;
>+
>+unlock:
>+	mutex_unlock(&priv->state_lock);
>+	return err;
>+}
>+
>+static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>+				     struct devlink_fmsg *fmsg, void *context)
>+{
>+	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>+	struct mlx5e_tx_err_ctx *err_ctx = context;
>+
>+	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>+							    fmsg) :
>+			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>+}
>+
> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
> 		.name = "tx",
> 		.recover = mlx5e_tx_reporter_recover,
> 		.diagnose = mlx5e_tx_reporter_diagnose,
>+		.dump = mlx5e_tx_reporter_sw_dump,
> };
> 
> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>-- 
>2.14.1
>

  parent reply	other threads:[~2019-05-07 12:41 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-29 14:17 [PATCH net-next RFC] Dump SW SQ context as part of tx reporter Aya Levin
2019-04-29 18:32 ` Saeed Mahameed
2019-04-30 11:13   ` Aya Levin
2019-04-30  0:54 ` Jakub Kicinski
2019-04-30 11:26   ` Aya Levin
2019-05-07 12:41 ` Jiri Pirko [this message]
2019-05-07 12:58   ` Aya Levin
2019-05-09  8:23     ` Jiri Pirko
2019-05-12  8:37       ` Aya Levin
2019-05-14 12:07         ` Jiri Pirko
2019-05-16  8:49           ` Aya Levin
2019-05-16 11:53             ` Jiri Pirko
2019-05-16 12:02               ` Aya Levin
2019-05-16 22:06 Jakub Kicinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190507124129.GC2157@nanopsycho \
    --to=jiri@resnulli.us \
    --cc=ayal@mellanox.com \
    --cc=eranbe@mellanox.com \
    --cc=jiri@mellanox.com \
    --cc=netdev@vger.kernel.org \
    --cc=saeedm@mellanox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).