All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
@ 2018-02-15 13:54 Rahul Lakkireddy
  2018-02-16 20:41 ` David Miller
  0 siblings, 1 reply; 8+ messages in thread
From: Rahul Lakkireddy @ 2018-02-15 13:54 UTC (permalink / raw)
  To: netdev; +Cc: davem, ganeshgr, nirranjan, indranil, Rahul Lakkireddy

Register callback to panic_notifier_list.  Invoke dump collect routine
to append dump to vmcore.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h       |  5 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 93 +++++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h |  4 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  | 13 ++++
 4 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index d3fa53db61ee..3d578fa1d640 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -568,6 +568,7 @@ enum {                                 /* adapter flags */
 	FW_OFLD_CONN       = (1 << 9),
 	ROOT_NO_RELAXED_ORDERING = (1 << 10),
 	SHUTTING_DOWN	   = (1 << 11),
+	K_CRASH            = (1 << 12),
 };
 
 enum {
@@ -946,6 +947,10 @@ struct adapter {
 
 	/* Ethtool Dump */
 	struct ethtool_dump eth_dump;
+
+	void *dump_buf; /* Dump buffer for collecting logs in panic */
+	u32 dump_buf_size; /* Dump buffer size */
+	struct notifier_block panic_nb; /* Panic notifier info */
 };
 
 /* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
index 30485f9a598f..579a019a246f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
@@ -383,13 +383,25 @@ static void cxgb4_cudbg_collect_entity(struct cudbg_init *pdbg_init,
 
 static int cudbg_alloc_compress_buff(struct cudbg_init *pdbg_init)
 {
+	struct adapter *adap = pdbg_init->adap;
 	u32 workspace_size;
 
 	workspace_size = cudbg_get_workspace_size();
-	pdbg_init->compress_buff = vzalloc(CUDBG_COMPRESS_BUFF_SIZE +
-					   workspace_size);
-	if (!pdbg_init->compress_buff)
-		return -ENOMEM;
+
+	if (adap->flags & K_CRASH) {
+		/* In panic scenario, the compression buffer is already
+		 * allocated. So, just update accordingly.
+		 */
+		pdbg_init->compress_buff = (u8 *)adap->dump_buf +
+					   adap->dump_buf_size -
+					   workspace_size -
+					   CUDBG_COMPRESS_BUFF_SIZE;
+	} else {
+		pdbg_init->compress_buff = vzalloc(CUDBG_COMPRESS_BUFF_SIZE +
+						   workspace_size);
+		if (!pdbg_init->compress_buff)
+			return -ENOMEM;
+	}
 
 	pdbg_init->compress_buff_size = CUDBG_COMPRESS_BUFF_SIZE;
 	pdbg_init->workspace = (u8 *)pdbg_init->compress_buff +
@@ -399,6 +411,14 @@ static int cudbg_alloc_compress_buff(struct cudbg_init *pdbg_init)
 
 static void cudbg_free_compress_buff(struct cudbg_init *pdbg_init)
 {
+	struct adapter *adap = pdbg_init->adap;
+
+	/* Don't free in panic scenario.  We need the buffer to be present
+	 * in vmcore so that we can extract the dump.
+	 */
+	if (adap->flags & K_CRASH)
+		return;
+
 	if (pdbg_init->compress_buff)
 		vfree(pdbg_init->compress_buff);
 }
@@ -488,3 +508,68 @@ void cxgb4_init_ethtool_dump(struct adapter *adapter)
 	adapter->eth_dump.version = adapter->params.fw_vers;
 	adapter->eth_dump.len = 0;
 }
+
+static int cxgb4_panic_notify(struct notifier_block *this, unsigned long event,
+			      void *ptr)
+{
+	struct adapter *adap = container_of(this, struct adapter, panic_nb);
+	bool use_bd;
+	u32 len;
+
+	/* Save original value and restore after collection */
+	use_bd = adap->use_bd;
+
+	dev_info(adap->pdev_dev, "Initialized cxgb4 crash handler");
+	adap->flags |= K_CRASH;
+
+	/* Don't contact firmware.  Directly access registers */
+	adap->use_bd = true;
+
+	len = adap->dump_buf_size;
+	cxgb4_cudbg_collect(adap, adap->dump_buf, &len, CXGB4_ETH_DUMP_ALL);
+	dev_info(adap->pdev_dev, "cxgb4 debug collection done...");
+
+	/* Restore original value */
+	adap->use_bd = use_bd;
+	return NOTIFY_DONE;
+}
+
+int cxgb4_cudbg_register_notifier(struct adapter *adap)
+{
+	u32 wsize, len;
+
+	len = sizeof(struct cudbg_hdr) +
+	      sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY;
+	len += cxgb4_get_dump_length(adap, CXGB4_ETH_DUMP_ALL);
+
+	/* If compression is enabled, allocate extra memory needed for
+	 * compression too.
+	 */
+	wsize = cudbg_get_workspace_size();
+	if (wsize)
+		wsize += CUDBG_COMPRESS_BUFF_SIZE;
+
+	adap->dump_buf_size = len + wsize;
+	adap->dump_buf = vzalloc(adap->dump_buf_size);
+	if (!adap->dump_buf)
+		return -ENOMEM;
+
+	/* Print info so that we can extract firmware dump from vmcore */
+	dev_info(adap->pdev_dev,
+		 "Registering cxgb4 panic handler.., Buffer start address = %p, size: %u\n",
+		 adap->dump_buf, len);
+
+	adap->panic_nb.notifier_call = cxgb4_panic_notify;
+	adap->panic_nb.priority = INT_MAX;
+	atomic_notifier_chain_register(&panic_notifier_list, &adap->panic_nb);
+	return 0;
+}
+
+void cxgb4_cudbg_unregister_notifier(struct adapter *adap)
+{
+	if (adap->dump_buf) {
+		atomic_notifier_chain_unregister(&panic_notifier_list,
+						 &adap->panic_nb);
+		vfree(adap->dump_buf);
+	}
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
index ce1ac9a1c878..66d4252f5032 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
@@ -41,8 +41,12 @@ enum CXGB4_ETHTOOL_DUMP_FLAGS {
 	CXGB4_ETH_DUMP_HW = (1 << 1), /* various FW and HW dumps */
 };
 
+#define CXGB4_ETH_DUMP_ALL (CXGB4_ETH_DUMP_MEM | CXGB4_ETH_DUMP_HW)
+
 u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag);
 int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size,
 			u32 flag);
 void cxgb4_init_ethtool_dump(struct adapter *adapter);
+int cxgb4_cudbg_register_notifier(struct adapter *adap);
+void cxgb4_cudbg_unregister_notifier(struct adapter *adap);
 #endif /* __CXGB4_CUDBG_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 56bc626ef006..f61d7a552dda 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5290,6 +5290,16 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	setup_memwin(adapter);
+
+	/* Register panic notifier */
+	err = cxgb4_cudbg_register_notifier(adapter);
+	if (err) {
+		dev_warn(adapter->pdev_dev,
+			 "Fail registering panic notifier, err: %d. Continuing\n",
+			 err);
+		err = 0;
+	}
+
 	err = adap_init0(adapter);
 #ifdef CONFIG_DEBUG_FS
 	bitmap_zero(adapter->sge.blocked_fl, adapter->sge.egr_sz);
@@ -5537,6 +5547,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		destroy_workqueue(adapter->workq);
 
 	kfree(adapter->mbox_log);
+	cxgb4_cudbg_unregister_notifier(adapter);
 	kfree(adapter);
  out_unmap_bar0:
 	iounmap(regs);
@@ -5610,6 +5621,8 @@ static void remove_one(struct pci_dev *pdev)
 		pci_release_regions(pdev);
 		kfree(adapter->mbox_log);
 		synchronize_rcu();
+		/* Unregister panic notifier */
+		cxgb4_cudbg_unregister_notifier(adapter);
 		kfree(adapter);
 	}
 #ifdef CONFIG_PCI_IOV
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
  2018-02-15 13:54 [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic Rahul Lakkireddy
@ 2018-02-16 20:41 ` David Miller
  2018-02-19 12:34   ` Rahul Lakkireddy
  0 siblings, 1 reply; 8+ messages in thread
From: David Miller @ 2018-02-16 20:41 UTC (permalink / raw)
  To: rahul.lakkireddy; +Cc: netdev, ganeshgr, nirranjan, indranil

From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Thu, 15 Feb 2018 19:24:42 +0530

> Register callback to panic_notifier_list.  Invoke dump collect routine
> to append dump to vmcore.
> 
> Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>

There is absolutely no precedence for a networking driver dumping
things into the vmcore image on a panic.

And I don't think this is a good idea.

Really, this commit message should have explained why this is desired
and in what context it is legitimate for this driver in particular
to do it.

A very detailed, long, complete commit message is especially important
when you are deciding to blaze you own trail and do something no other
networking driver has done before.

I get really upset when I see changes like this, because you give me
no preparation for what I'm about to read in the patch and therefore
I have to go into this routine asking you to explain things properly.

But as-is, I see this panic notifier as a really bad idea.

Thanks.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
  2018-02-16 20:41 ` David Miller
@ 2018-02-19 12:34   ` Rahul Lakkireddy
  2018-02-19 15:01     ` David Miller
  2018-02-21  0:43     ` Jakub Kicinski
  0 siblings, 2 replies; 8+ messages in thread
From: Rahul Lakkireddy @ 2018-02-19 12:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Ganesh GR, Nirranjan Kirubaharan, Indranil Choudhury

On Saturday, February 02/17/18, 2018 at 02:11:01 +0530, David Miller wrote:
> From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> Date: Thu, 15 Feb 2018 19:24:42 +0530
> 
> > Register callback to panic_notifier_list.  Invoke dump collect routine
> > to append dump to vmcore.
> > 
> > Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> > Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
> 
> There is absolutely no precedence for a networking driver dumping
> things into the vmcore image on a panic.
> 
> And I don't think this is a good idea.
> 
> Really, this commit message should have explained why this is desired
> and in what context it is legitimate for this driver in particular
> to do it.
> 
> A very detailed, long, complete commit message is especially important
> when you are deciding to blaze you own trail and do something no other
> networking driver has done before.
> 

My mistake.  Will add more info in the commit message in v2.

> I get really upset when I see changes like this, because you give me
> no preparation for what I'm about to read in the patch and therefore
> I have to go into this routine asking you to explain things properly.
> 
> But as-is, I see this panic notifier as a really bad idea.
> 

Our requirement is to analyze the state of firmware/hardware at the
time of kernel panic.  The dump will be written to pre allocated
buffer during kernel panic, which can be extracted later from the
vmcore, for post-analysis.

Panic notifier seemed to meet our requirement, as we are able to
collect dump during kernel panic and then extract it from vmcore.
Please suggest, if this can be done in a better way.

Thanks,
Rahul

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
  2018-02-19 12:34   ` Rahul Lakkireddy
@ 2018-02-19 15:01     ` David Miller
  2018-02-21  0:43     ` Jakub Kicinski
  1 sibling, 0 replies; 8+ messages in thread
From: David Miller @ 2018-02-19 15:01 UTC (permalink / raw)
  To: rahul.lakkireddy; +Cc: netdev, ganeshgr, nirranjan, indranil

From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Mon, 19 Feb 2018 18:04:17 +0530

> Panic notifier seemed to meet our requirement, as we are able to
> collect dump during kernel panic and then extract it from vmcore.

Who else does this?  I do not think the panic notifier was created
to serve this purpose.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
  2018-02-19 12:34   ` Rahul Lakkireddy
  2018-02-19 15:01     ` David Miller
@ 2018-02-21  0:43     ` Jakub Kicinski
  2018-02-21  0:51       ` Florian Fainelli
  1 sibling, 1 reply; 8+ messages in thread
From: Jakub Kicinski @ 2018-02-21  0:43 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: David Miller, netdev, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Mon, 19 Feb 2018 18:04:17 +0530, Rahul Lakkireddy wrote:
> Our requirement is to analyze the state of firmware/hardware at the
> time of kernel panic. 

I was wondering about this since you posted the patch and I can't come
up with any specific scenario where kernel crash would correlate
clearly with device state in non-trivial way.

Perhaps there is something about cxgb4 HW/FW that makes this useful.
Could you explain?  Could you give a real life example of a bug?  
Is it related to the TOE-looking TLS offload Atul is posting?

Is the panic you're targeting here real or manually triggered from user
space to get a full dump of kernel and FW?

That's me trying to guess what you're doing.. :)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
  2018-02-21  0:43     ` Jakub Kicinski
@ 2018-02-21  0:51       ` Florian Fainelli
  2018-02-21  1:04         ` Jakub Kicinski
  0 siblings, 1 reply; 8+ messages in thread
From: Florian Fainelli @ 2018-02-21  0:51 UTC (permalink / raw)
  To: Jakub Kicinski, Rahul Lakkireddy
  Cc: David Miller, netdev, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On 02/20/2018 04:43 PM, Jakub Kicinski wrote:
> On Mon, 19 Feb 2018 18:04:17 +0530, Rahul Lakkireddy wrote:
>> Our requirement is to analyze the state of firmware/hardware at the
>> time of kernel panic. 
> 
> I was wondering about this since you posted the patch and I can't come
> up with any specific scenario where kernel crash would correlate
> clearly with device state in non-trivial way.
> 
> Perhaps there is something about cxgb4 HW/FW that makes this useful.
> Could you explain?  Could you give a real life example of a bug?  
> Is it related to the TOE-looking TLS offload Atul is posting?
> 
> Is the panic you're targeting here real or manually triggered from user
> space to get a full dump of kernel and FW?
> 
> That's me trying to guess what you're doing.. :)
> 

One case where this might be helpful is if you are chasing down DMA
corruption and you would like to get a nearly instant capture of both
the kernel's memory and the adapter which may be responsible for that.
This is not probably 100% proof because there is a timing window during
which the dumps of both contexts are going to happen, and that alone
might be influencing the captured memory view. Just guessing of course.
-- 
Florian

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
  2018-02-21  0:51       ` Florian Fainelli
@ 2018-02-21  1:04         ` Jakub Kicinski
  2018-02-21 15:25           ` Rahul Lakkireddy
  0 siblings, 1 reply; 8+ messages in thread
From: Jakub Kicinski @ 2018-02-21  1:04 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Rahul Lakkireddy, David Miller, netdev, Ganesh GR,
	Nirranjan Kirubaharan, Indranil Choudhury

On Tue, 20 Feb 2018 16:51:03 -0800, Florian Fainelli wrote:
> On 02/20/2018 04:43 PM, Jakub Kicinski wrote:
> > On Mon, 19 Feb 2018 18:04:17 +0530, Rahul Lakkireddy wrote:  
> >> Our requirement is to analyze the state of firmware/hardware at the
> >> time of kernel panic.   
> > 
> > I was wondering about this since you posted the patch and I can't come
> > up with any specific scenario where kernel crash would correlate
> > clearly with device state in non-trivial way.
> > 
> > Perhaps there is something about cxgb4 HW/FW that makes this useful.
> > Could you explain?  Could you give a real life example of a bug?  
> > Is it related to the TOE-looking TLS offload Atul is posting?
> > 
> > Is the panic you're targeting here real or manually triggered from user
> > space to get a full dump of kernel and FW?
> > 
> > That's me trying to guess what you're doing.. :)
> 
> One case where this might be helpful is if you are chasing down DMA
> corruption and you would like to get a nearly instant capture of both
> the kernel's memory and the adapter which may be responsible for that.
> This is not probably 100% proof because there is a timing window during
> which the dumps of both contexts are going to happen, and that alone
> might be influencing the captured memory view. Just guessing of course.

Perhaps this is what you mean with the timing window - but with random
corruptions by the time kernel hits the corrupted memory 40/100Gb
adapter has likely forgotten all about those DMAs..  And IOMMUs are
pretty good at catching corruptions on big iron CPUs (i.e. it's easy to
catch them in testing, even if production environment runs iommu=pt).
At least that's my gut feeling/experience ;)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
  2018-02-21  1:04         ` Jakub Kicinski
@ 2018-02-21 15:25           ` Rahul Lakkireddy
  0 siblings, 0 replies; 8+ messages in thread
From: Rahul Lakkireddy @ 2018-02-21 15:25 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Florian Fainelli, David Miller, netdev, Ganesh GR,
	Nirranjan Kirubaharan, Indranil Choudhury

On Tuesday, February 02/20/18, 2018 at 17:04:20 -0800, Jakub Kicinski wrote:
> On Tue, 20 Feb 2018 16:51:03 -0800, Florian Fainelli wrote:
> > On 02/20/2018 04:43 PM, Jakub Kicinski wrote:
> > > On Mon, 19 Feb 2018 18:04:17 +0530, Rahul Lakkireddy wrote:  
> > >> Our requirement is to analyze the state of firmware/hardware at the
> > >> time of kernel panic.   
> > > 
> > > I was wondering about this since you posted the patch and I can't come
> > > up with any specific scenario where kernel crash would correlate
> > > clearly with device state in non-trivial way.
> > > 
> > > Perhaps there is something about cxgb4 HW/FW that makes this useful.
> > > Could you explain?  Could you give a real life example of a bug?  
> > > Is it related to the TOE-looking TLS offload Atul is posting?
> > > 
> > > Is the panic you're targeting here real or manually triggered from user
> > > space to get a full dump of kernel and FW?
> > > 
> > > That's me trying to guess what you're doing.. :)
> > 

This is not related to TLS that Atul posted.  This is related to
general Field Diagnostics.

When a kernel panic happens on critical production servers, they
may not be reproducible again or may not have downtime for debugging.

Currently vmcore generated after panic, has only snapshot of driver
state and not hardware/firmware state at the time of kernel panic. If
complete state and logs of underlying NIC hardware/firmware (in fact,
all hardware components) is collected, it will be very helpful for
post analysis. 

For example, hardware memory gets incorrectly programmed by driver
due to a race condition which causes a kernel panic indirectly. 
A dump of hardware memory collected during kernel panic, will
definitely help to root cause and fix the issue.

> > One case where this might be helpful is if you are chasing down DMA
> > corruption and you would like to get a nearly instant capture of both
> > the kernel's memory and the adapter which may be responsible for that.
> > This is not probably 100% proof because there is a timing window during
> > which the dumps of both contexts are going to happen, and that alone
> > might be influencing the captured memory view. Just guessing of course.
> 
> Perhaps this is what you mean with the timing window - but with random
> corruptions by the time kernel hits the corrupted memory 40/100Gb
> adapter has likely forgotten all about those DMAs..  And IOMMUs are
> pretty good at catching corruptions on big iron CPUs (i.e. it's easy to
> catch them in testing, even if production environment runs iommu=pt).
> At least that's my gut feeling/experience ;)

Thanks,
Rahul

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-02-21 15:25 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-15 13:54 [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic Rahul Lakkireddy
2018-02-16 20:41 ` David Miller
2018-02-19 12:34   ` Rahul Lakkireddy
2018-02-19 15:01     ` David Miller
2018-02-21  0:43     ` Jakub Kicinski
2018-02-21  0:51       ` Florian Fainelli
2018-02-21  1:04         ` Jakub Kicinski
2018-02-21 15:25           ` Rahul Lakkireddy

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.