All of lore.kernel.org
 help / color / mirror / Atom feed
* BQL support in gianfar causes network hickup
@ 2012-11-23 15:58 Keitel, Tino (ALC NetworX GmbH)
  2012-11-23 16:34 ` Paul Gortmaker
  0 siblings, 1 reply; 21+ messages in thread
From: Keitel, Tino (ALC NetworX GmbH) @ 2012-11-23 15:58 UTC (permalink / raw)
  To: Paul Gortmaker, David Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 2155 bytes --]

Hi,

commit d8a0f1b0af67679bba886784de10d8c21acc4e0e causes the following
trace on a Freescale RDB8313 board:

NETDEV WATCHDOG: eth1 (fsl-gianfar): transmit queue 0 timed out
------------[ cut here ]------------
WARNING:
at /home/keitelt1/src/git/linux-stable/net/sched/sch_generic.c:255
Modules linked in:
NIP: c02448b0 LR: c02448b0 CTR: c01c19b8
REGS: c7ffbe40 TRAP: 0700   Not tainted  (3.7.0-rc6-rt18)
MSR: 00029032 <EE,ME,IR,DR,RI>  CR: 24002044  XER: 20000000
TASK = c03dd370[0] 'swapper' THREAD: c03fe000
GPR00: c02448b0 c7ffbef0 c03dd370 0000003f 00000001 c001aea8 00000000
00000001
GPR08: 00000001 c03e0000 00000000 0000009d 24002084 1008eb5c 07ffb000
ffffffff
GPR16: 00000004 c0362c7c c03dfbf8 00200000 c0411ed0 c0411cd0 c0411ad0
ffffffff
GPR24: 00000000 c749e1d8 00000004 c783d1b0 c0400000 c03e0000 c749e000
00000000
NIP [c02448b0] dev_watchdog+0x288/0x298
LR [c02448b0] dev_watchdog+0x288/0x298
Call Trace:
[c7ffbef0] [c02448b0] dev_watchdog+0x288/0x298 (unreliable)
[c7ffbf20] [c00267f8] call_timer_fn+0x6c/0xd8
[c7ffbf50] [c00269e4] run_timer_softirq+0x180/0x1f8
[c7ffbfa0] [c0021144] __do_softirq+0xc4/0x160
[c7ffbff0] [c000d0b8] call_do_softirq+0x14/0x24
[c03ffe90] [c00058e8] do_softirq+0x8c/0xb8
[c03ffeb0] [c0021358] irq_exit+0x98/0xb4
[c03ffec0] [c0009fb0] timer_interrupt+0x158/0x170
[c03ffee0] [c000f02c] ret_from_except+0x0/0x14
--- Exception: 901 at cpu_idle+0x94/0x100
    LR = cpu_idle+0x94/0x100
[c03fffa0] [c00088ec] cpu_idle+0x5c/0x100 (unreliable)
[c03fffc0] [c03b37b0] start_kernel+0x2dc/0x2f0
[c03ffff0] [00003438] 0x3438
Instruction dump:
7d2903a6 4e800421 80fe01fc 4bffff74 7fc3f378 4bfecb7d 7fc4f378 7fe6fb78
7c651b78 3c60c038 38637280 48090e69 <0fe00000> 39200001 993cc7c9
4bffffb8
---[ end trace 32125455035c2f70 ]---

With this commit reverted, it works fine. v3.3 is ok, v3.4 contains the
bad commit. The commit doesn't revert in a clean way in 3.7-rc6. I
attached diff without the tqi changes.

The above trace happens while a ptp client (for IEEE1588) is running, so
there is some locally generated network traffic. The client stops to
work after this, but maybe this is due to bad error handling.

Regards,
Tino


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: gianfar_fix_ptp.diff --]
[-- Type: text/x-patch; name="gianfar_fix_ptp.diff", Size: 2551 bytes --]

diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 19ac096..e314c6f 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -1748,13 +1748,9 @@ static void free_skb_resources(struct gfar_private *priv)
 
 	/* Go through all the buffer descriptors and free their data buffers */
 	for (i = 0; i < priv->num_tx_queues; i++) {
-		struct netdev_queue *txq;
-
 		tx_queue = priv->tx_queue[i];
-		txq = netdev_get_tx_queue(tx_queue->dev, tx_queue->qindex);
 		if (tx_queue->tx_skbuff)
 			free_skb_tx_queue(tx_queue);
-		netdev_tx_reset_queue(txq);
 	}
 
 	for (i = 0; i < priv->num_rx_queues; i++) {
@@ -2210,8 +2206,6 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
 	}
 
-	netdev_tx_sent_queue(txq, skb->len);
-
 	/* We can work in parallel with gfar_clean_tx_ring(), except
 	 * when modifying num_txbdfree. Note that we didn't grab the lock
 	 * when we were reading the num_txbdfree and checking for available
@@ -2456,7 +2450,6 @@ static void gfar_align_skb(struct sk_buff *skb)
 static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 {
 	struct net_device *dev = tx_queue->dev;
-	struct netdev_queue *txq;
 	struct gfar_private *priv = netdev_priv(dev);
 	struct gfar_priv_rx_q *rx_queue = NULL;
 	struct txbd8 *bdp, *next = NULL;
@@ -2469,12 +2462,10 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 	int i;
 	int howmany = 0;
 	int tqi = tx_queue->qindex;
-	unsigned int bytes_sent = 0;
 	u32 lstatus;
 	size_t buflen;
 
 	rx_queue = priv->rx_queue[tqi];
-	txq = netdev_get_tx_queue(dev, tqi);
 	bdp = tx_queue->dirty_tx;
 	skb_dirtytx = tx_queue->skb_dirtytx;
 
@@ -2531,8 +2522,6 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 			bdp = next_txbd(bdp, base, tx_ring_size);
 		}
 
-		bytes_sent += skb->len;
-
 		dev_kfree_skb_any(skb);
 
 		tx_queue->tx_skbuff[skb_dirtytx] = NULL;
@@ -2547,15 +2536,13 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 	}
 
 	/* If we freed a buffer, we can restart transmission, if necessary */
-	if (netif_tx_queue_stopped(txq) && tx_queue->num_txbdfree)
+	if (__netif_subqueue_stopped(dev, tx_queue->qindex) && tx_queue->num_txbdfree)
 		netif_wake_subqueue(dev, tqi);
 
 	/* Update dirty indicators */
 	tx_queue->skb_dirtytx = skb_dirtytx;
 	tx_queue->dirty_tx = bdp;
 
-	netdev_tx_completed_queue(txq, howmany, bytes_sent);
-
 	return howmany;
 }
 

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-23 15:58 BQL support in gianfar causes network hickup Keitel, Tino (ALC NetworX GmbH)
@ 2012-11-23 16:34 ` Paul Gortmaker
  2012-11-23 19:42   ` Francois Romieu
  2012-11-24 20:42   ` Tino Keitel
  0 siblings, 2 replies; 21+ messages in thread
From: Paul Gortmaker @ 2012-11-23 16:34 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH); +Cc: David Miller, netdev, Eric Dumazet

On 12-11-23 10:58 AM, Keitel, Tino (ALC NetworX GmbH) wrote:
> Hi,
> 
> commit d8a0f1b0af67679bba886784de10d8c21acc4e0e causes the following
> trace on a Freescale RDB8313 board:

Thanks for the report.

> 
> NETDEV WATCHDOG: eth1 (fsl-gianfar): transmit queue 0 timed out
> ------------[ cut here ]------------
> WARNING:
> at /home/keitelt1/src/git/linux-stable/net/sched/sch_generic.c:255
> Modules linked in:
> NIP: c02448b0 LR: c02448b0 CTR: c01c19b8
> REGS: c7ffbe40 TRAP: 0700   Not tainted  (3.7.0-rc6-rt18)
                                            ^^^^^^^^^^^^^^^
I almost overlooked the above.  It would have been nice to
see more explicit information on what kernel you are running.
I say that because the above concerns me.  For several reasons.

1) it looks to be not mainline, but preempt_rt
2) There is no RT on 3.7 yet, so I'm assuming this is a custom
   forward port of the 250 odd RT patches.  (The RT is 3.6.7-rt18,
   i.e. based on the 3.6 gregKH stable tree.)

> MSR: 00029032 <EE,ME,IR,DR,RI>  CR: 24002044  XER: 20000000
> TASK = c03dd370[0] 'swapper' THREAD: c03fe000
> GPR00: c02448b0 c7ffbef0 c03dd370 0000003f 00000001 c001aea8 00000000
> 00000001
> GPR08: 00000001 c03e0000 00000000 0000009d 24002084 1008eb5c 07ffb000
> ffffffff
> GPR16: 00000004 c0362c7c c03dfbf8 00200000 c0411ed0 c0411cd0 c0411ad0
> ffffffff
> GPR24: 00000000 c749e1d8 00000004 c783d1b0 c0400000 c03e0000 c749e000
> 00000000
> NIP [c02448b0] dev_watchdog+0x288/0x298
> LR [c02448b0] dev_watchdog+0x288/0x298
> Call Trace:
> [c7ffbef0] [c02448b0] dev_watchdog+0x288/0x298 (unreliable)
> [c7ffbf20] [c00267f8] call_timer_fn+0x6c/0xd8
> [c7ffbf50] [c00269e4] run_timer_softirq+0x180/0x1f8
> [c7ffbfa0] [c0021144] __do_softirq+0xc4/0x160
> [c7ffbff0] [c000d0b8] call_do_softirq+0x14/0x24
> [c03ffe90] [c00058e8] do_softirq+0x8c/0xb8
> [c03ffeb0] [c0021358] irq_exit+0x98/0xb4
> [c03ffec0] [c0009fb0] timer_interrupt+0x158/0x170
> [c03ffee0] [c000f02c] ret_from_except+0x0/0x14
> --- Exception: 901 at cpu_idle+0x94/0x100
>     LR = cpu_idle+0x94/0x100
> [c03fffa0] [c00088ec] cpu_idle+0x5c/0x100 (unreliable)
> [c03fffc0] [c03b37b0] start_kernel+0x2dc/0x2f0
> [c03ffff0] [00003438] 0x3438
> Instruction dump:
> 7d2903a6 4e800421 80fe01fc 4bffff74 7fc3f378 4bfecb7d 7fc4f378 7fe6fb78
> 7c651b78 3c60c038 38637280 48090e69 <0fe00000> 39200001 993cc7c9
> 4bffffb8
> ---[ end trace 32125455035c2f70 ]---
> 
> With this commit reverted, it works fine. v3.3 is ok, v3.4 contains the
> bad commit. The commit doesn't revert in a clean way in 3.7-rc6. I
> attached diff without the tqi changes.

Have you reproduced this on a mainline kernel, i.e. vanilla 3.4
or vanilla 3.7-rc6?  And then done a revert on that baseline?

The patch was relatively straightforward and reviewed by Eric
who knows this stuff inside out; it isn't immediately clear
to me why it would cause problems for you.

Paul.
--

> 
> The above trace happens while a ptp client (for IEEE1588) is running, so
> there is some locally generated network traffic. The client stops to
> work after this, but maybe this is due to bad error handling.
> 
> Regards,
> Tino
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-23 16:34 ` Paul Gortmaker
@ 2012-11-23 19:42   ` Francois Romieu
  2012-11-24 20:42   ` Tino Keitel
  1 sibling, 0 replies; 21+ messages in thread
From: Francois Romieu @ 2012-11-23 19:42 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH)
  Cc: Paul Gortmaker, David Miller, netdev, Eric Dumazet

Paul Gortmaker <paul.gortmaker@windriver.com> :
> On 12-11-23 10:58 AM, Keitel, Tino (ALC NetworX GmbH) wrote:
[...]
> > With this commit reverted, it works fine. v3.3 is ok, v3.4 contains the
> > bad commit. The commit doesn't revert in a clean way in 3.7-rc6. I
> > attached diff without the tqi changes.

Pre v3.4.5 stable kernel will miss some bql fixes. Namely:
4f4bdaeb40df95499c1ee7ea3fbca9d76174a59e (upstream 914bec1011a25f65cdc)
1414a53d956340ca8b1b27e05ab94ba63e82ed97 (upstream 25426b794efdc70dde7)

-- 
Ueimor

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-23 16:34 ` Paul Gortmaker
  2012-11-23 19:42   ` Francois Romieu
@ 2012-11-24 20:42   ` Tino Keitel
  2012-11-24 23:43     ` Eric Dumazet
  1 sibling, 1 reply; 21+ messages in thread
From: Tino Keitel @ 2012-11-24 20:42 UTC (permalink / raw)
  To: netdev

Paul Gortmaker <paul.gortmaker <at> windriver.com> writes:

> 
> On 12-11-23 10:58 AM, Keitel, Tino (ALC NetworX GmbH) wrote:
> > Hi,
> > 
> > commit d8a0f1b0af67679bba886784de10d8c21acc4e0e causes the following
> > trace on a Freescale RDB8313 board:
> 
> Thanks for the report.
> 
> > 
> > NETDEV WATCHDOG: eth1 (fsl-gianfar): transmit queue 0 timed out
> > ------------[ cut here ]------------
> > WARNING:
> > at /home/keitelt1/src/git/linux-stable/net/sched/sch_generic.c:255
> > Modules linked in:
> > NIP: c02448b0 LR: c02448b0 CTR: c01c19b8
> > REGS: c7ffbe40 TRAP: 0700   Not tainted  (3.7.0-rc6-rt18)
>                                             ^^^^^^^^^^^^^^^
> I almost overlooked the above.  It would have been nice to
> see more explicit information on what kernel you are running.
> I say that because the above concerns me.  For several reasons.
> 
> 1) it looks to be not mainline, but preempt_rt
> 2) There is no RT on 3.7 yet, so I'm assuming this is a custom
>    forward port of the 250 odd RT patches.  (The RT is 3.6.7-rt18,
>    i.e. based on the 3.6 gregKH stable tree.)

Sorry for the confusion. This was a 3.7.0-rc6 tree, and I forgot git clean after
trying the rt-patches and git reset --hard v3.7.0-rc6, so the localversion file
for -rt was still present, and the kernel was named 3.7.0-rc6-rt18. If I got
this right, this should be a normal kernel with just the version file modified.

I tried kernel 3.3, which doesnt have the issue. I tried 3.4, 3.6.7 and 3.7-rc6,
which all show the kernel trace and ptp client misbehaviour. I tried 3.4, 3.6.7,
3.7-rc6 and 3.6.5-rt18 with the patch I posted, and they were ok.

The patch I posted is for 3.7-rc6.

Regards,
Tino

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-24 20:42   ` Tino Keitel
@ 2012-11-24 23:43     ` Eric Dumazet
  2012-11-26 10:01       ` Tino Keitel
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2012-11-24 23:43 UTC (permalink / raw)
  To: Tino Keitel; +Cc: netdev

On Sat, 2012-11-24 at 20:42 +0000, Tino Keitel wrote:
> Paul Gortmaker <paul.gortmaker <at> windriver.com> writes:
> 
> > 
> > On 12-11-23 10:58 AM, Keitel, Tino (ALC NetworX GmbH) wrote:
> > > Hi,
> > > 
> > > commit d8a0f1b0af67679bba886784de10d8c21acc4e0e causes the following
> > > trace on a Freescale RDB8313 board:
> > 
> > Thanks for the report.
> > 
> > > 
> > > NETDEV WATCHDOG: eth1 (fsl-gianfar): transmit queue 0 timed out
> > > ------------[ cut here ]------------
> > > WARNING:
> > > at /home/keitelt1/src/git/linux-stable/net/sched/sch_generic.c:255
> > > Modules linked in:
> > > NIP: c02448b0 LR: c02448b0 CTR: c01c19b8
> > > REGS: c7ffbe40 TRAP: 0700   Not tainted  (3.7.0-rc6-rt18)
> >                                             ^^^^^^^^^^^^^^^
> > I almost overlooked the above.  It would have been nice to
> > see more explicit information on what kernel you are running.
> > I say that because the above concerns me.  For several reasons.
> > 
> > 1) it looks to be not mainline, but preempt_rt
> > 2) There is no RT on 3.7 yet, so I'm assuming this is a custom
> >    forward port of the 250 odd RT patches.  (The RT is 3.6.7-rt18,
> >    i.e. based on the 3.6 gregKH stable tree.)
> 
> Sorry for the confusion. This was a 3.7.0-rc6 tree, and I forgot git clean after
> trying the rt-patches and git reset --hard v3.7.0-rc6, so the localversion file
> for -rt was still present, and the kernel was named 3.7.0-rc6-rt18. If I got
> this right, this should be a normal kernel with just the version file modified.
> 
> I tried kernel 3.3, which doesnt have the issue. I tried 3.4, 3.6.7 and 3.7-rc6,
> which all show the kernel trace and ptp client misbehaviour. I tried 3.4, 3.6.7,
> 3.7-rc6 and 3.6.5-rt18 with the patch I posted, and they were ok.
> 
> The patch I posted is for 3.7-rc6.
> 

Hmm, I wonder if BQL makes a particular bug showing more often.

I see gianfar uses a very small watchdog_timeo of 1 second, while many
drivers use 5 seconds.

What happens if you change this to 5 seconds ?

diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 19ac096..3a994f9 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -101,7 +101,7 @@
 
 #include "gianfar.h"
 
-#define TX_TIMEOUT      (1*HZ)
+#define TX_TIMEOUT      (5*HZ)
 
 const char gfar_driver_version[] = "1.3";
 

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-24 23:43     ` Eric Dumazet
@ 2012-11-26 10:01       ` Tino Keitel
  2012-11-26 16:34         ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Tino Keitel @ 2012-11-26 10:01 UTC (permalink / raw)
  To: Eric Dumazet, Paul Gortmaker; +Cc: netdev, Keitel, Tino (ALC NetworX GmbH)

On Sat, Nov 24, 2012 at 15:43:36 -0800, Eric Dumazet wrote:

[...]

> Hmm, I wonder if BQL makes a particular bug showing more often.
> 
> I see gianfar uses a very small watchdog_timeo of 1 second, while many
> drivers use 5 seconds.
> 
> What happens if you change this to 5 seconds ?

I still got the trace and a failing ptp client.

Regards,
Tino

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-26 10:01       ` Tino Keitel
@ 2012-11-26 16:34         ` Eric Dumazet
  2012-11-26 17:08           ` Keitel, Tino (ALC NetworX GmbH)
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2012-11-26 16:34 UTC (permalink / raw)
  To: Tino Keitel; +Cc: Paul Gortmaker, netdev, Keitel, Tino (ALC NetworX GmbH)

On Mon, 2012-11-26 at 11:01 +0100, Tino Keitel wrote:
> On Sat, Nov 24, 2012 at 15:43:36 -0800, Eric Dumazet wrote:
> 
> [...]
> 
> > Hmm, I wonder if BQL makes a particular bug showing more often.
> > 
> > I see gianfar uses a very small watchdog_timeo of 1 second, while many
> > drivers use 5 seconds.
> > 
> > What happens if you change this to 5 seconds ?
> 
> I still got the trace and a failing ptp client.
> 

Thanks. Is this bug easy to trigger ?

I suspect a core issue and a race, likely to happen on your (non x86)
hardware

Could you add the following debugging patch ?

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index aefc150..a8859ec 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -117,7 +117,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	int ret = NETDEV_TX_BUSY;
 
 	/* And release qdisc */
-	spin_unlock(root_lock);
+//	spin_unlock(root_lock);
 
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
 	if (!netif_xmit_frozen_or_stopped(txq))
@@ -125,7 +125,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	HARD_TX_UNLOCK(dev, txq);
 
-	spin_lock(root_lock);
+//	spin_lock(root_lock);
 
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-26 16:34         ` Eric Dumazet
@ 2012-11-26 17:08           ` Keitel, Tino (ALC NetworX GmbH)
  2012-11-26 17:17             ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Keitel, Tino (ALC NetworX GmbH) @ 2012-11-26 17:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tino Keitel, Paul Gortmaker, netdev

On Mo, 2012-11-26 at 08:34 -0800, Eric Dumazet wrote:
> On Mon, 2012-11-26 at 11:01 +0100, Tino Keitel wrote:
> > On Sat, Nov 24, 2012 at 15:43:36 -0800, Eric Dumazet wrote:
> > 
> > [...]
> > 
> > > Hmm, I wonder if BQL makes a particular bug showing more often.
> > > 
> > > I see gianfar uses a very small watchdog_timeo of 1 second, while many
> > > drivers use 5 seconds.
> > > 
> > > What happens if you change this to 5 seconds ?
> > 
> > I still got the trace and a failing ptp client.
> > 
> 
> Thanks. Is this bug easy to trigger ?
> 
> I suspect a core issue and a race, likely to happen on your (non x86)
> hardware
> 
> Could you add the following debugging patch ?

No visible difference:

NETDEV WATCHDOG: eth1 (fsl-gianfar): transmit queue 0 timed out
------------[ cut here ]------------
WARNING:
at /home/keitelt1/src/git/linux-stable/net/sched/sch_generic.c:255
Modules linked in:
NIP: c02448b0 LR: c02448b0 CTR: c01c19b8
REGS: c7ffbe40 TRAP: 0700   Not tainted  (3.7.0-rc6-dirty)
MSR: 00029032 <EE,ME,IR,DR,RI>  CR: 22002044  XER: 20000000
TASK = c03dd370[0] 'swapper' THREAD: c03fe000
GPR00: c02448b0 c7ffbef0 c03dd370 0000003f 00000001 c001aea8 00000000
00000001
GPR08: 00000001 c03e0000 00000000 0000009d 22002084 1008eb5c 07ffb000
ffffffff
GPR16: 00000004 c0362c7c c03dfbf8 00200000 c0411ed0 c0411cd0 c0411ad0
ffffffff
GPR24: 00000000 c749e1d8 00000004 c783c1b0 c0400000 c03e0000 c749e000
00000000
NIP [c02448b0] dev_watchdog+0x288/0x298
LR [c02448b0] dev_watchdog+0x288/0x298
Call Trace:
[c7ffbef0] [c02448b0] dev_watchdog+0x288/0x298 (unreliable)
[c7ffbf20] [c00267f8] call_timer_fn+0x6c/0xd8
[c7ffbf50] [c00269e4] run_timer_softirq+0x180/0x1f8
[c7ffbfa0] [c0021144] __do_softirq+0xc4/0x160
[c7ffbff0] [c000d0b8] call_do_softirq+0x14/0x24
[c03ffe00] [c00058e8] do_softirq+0x8c/0xb8
[c03ffe20] [c0021358] irq_exit+0x98/0xb4
[c03ffe30] [c0009fb0] timer_interrupt+0x158/0x170
[c03ffe50] [c000f02c] ret_from_except+0x0/0x14
--- Exception: 901 at _raw_spin_unlock_irq+0x3c/0x78
    LR = _raw_spin_unlock_irq+0x2c/0x78
[c03fff20] [c00434c8] finish_task_switch.constprop.69+0x5c/0xdc
[c03fff40] [c02d354c] __schedule+0x1e0/0x410
[c03fff90] [c02d3a78] schedule_preempt_disabled+0x18/0x30
[c03fffa0] [c000898c] cpu_idle+0xfc/0x100
[c03fffc0] [c03b37b0] start_kernel+0x2dc/0x2f0
[c03ffff0] [00003438] 0x3438
Instruction dump:
7d2903a6 4e800421 80fe01fc 4bffff74 7fc3f378 4bfecb7d 7fc4f378 7fe6fb78 
7c651b78 3c60c038 38637280 48090e51 <0fe00000> 39200001 993cc7c9
4bffffb8 
---[ end trace c170f56a0503cdd2 ]---

Regards,
Tino


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-26 17:08           ` Keitel, Tino (ALC NetworX GmbH)
@ 2012-11-26 17:17             ` Eric Dumazet
  2012-11-27  9:36               ` Keitel, Tino (ALC NetworX GmbH)
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2012-11-26 17:17 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH); +Cc: Tino Keitel, Paul Gortmaker, netdev

On Mon, 2012-11-26 at 18:08 +0100, Keitel, Tino (ALC NetworX GmbH)
wrote:
> On Mo, 2012-11-26 at 08:34 -0800, Eric Dumazet wrote:
> > On Mon, 2012-11-26 at 11:01 +0100, Tino Keitel wrote:
> > > On Sat, Nov 24, 2012 at 15:43:36 -0800, Eric Dumazet wrote:
> > > 
> > > [...]
> > > 
> > > > Hmm, I wonder if BQL makes a particular bug showing more often.
> > > > 
> > > > I see gianfar uses a very small watchdog_timeo of 1 second, while many
> > > > drivers use 5 seconds.
> > > > 
> > > > What happens if you change this to 5 seconds ?
> > > 
> > > I still got the trace and a failing ptp client.
> > > 
> > 
> > Thanks. Is this bug easy to trigger ?
> > 
> > I suspect a core issue and a race, likely to happen on your (non x86)
> > hardware
> > 
> > Could you add the following debugging patch ?
> 
> No visible difference:

OK it seems you trigger the problem fast !

Please try the following as well :

diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 19ac096..77190bf 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -101,7 +101,7 @@
 
 #include "gianfar.h"
 
-#define TX_TIMEOUT      (1*HZ)
+#define TX_TIMEOUT      (5*HZ)
 
 const char gfar_driver_version[] = "1.3";
 
@@ -2465,9 +2465,9 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 	struct sk_buff *skb;
 	int skb_dirtytx;
 	int tx_ring_size = tx_queue->tx_ring_size;
-	int frags = 0, nr_txbds = 0;
+	int frags, nr_txbds;
 	int i;
-	int howmany = 0;
+	int howmany = 0, total_txbds = 0;
 	int tqi = tx_queue->qindex;
 	unsigned int bytes_sent = 0;
 	u32 lstatus;
@@ -2479,7 +2479,6 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 	skb_dirtytx = tx_queue->skb_dirtytx;
 
 	while ((skb = tx_queue->tx_skbuff[skb_dirtytx])) {
-		unsigned long flags;
 
 		frags = skb_shinfo(skb)->nr_frags;
 
@@ -2541,20 +2540,24 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 			      TX_RING_MOD_MASK(tx_ring_size);
 
 		howmany++;
-		spin_lock_irqsave(&tx_queue->txlock, flags);
-		tx_queue->num_txbdfree += nr_txbds;
-		spin_unlock_irqrestore(&tx_queue->txlock, flags);
+		total_txbds += nr_txbds;
 	}
 
-	/* If we freed a buffer, we can restart transmission, if necessary */
-	if (netif_tx_queue_stopped(txq) && tx_queue->num_txbdfree)
-		netif_wake_subqueue(dev, tqi);
+	if (howmany) {
+		unsigned long flags;
 
+		spin_lock_irqsave(&tx_queue->txlock, flags);
+		tx_queue->num_txbdfree += total_txbds;
+		/* If we freed a buffer, we can restart transmission, if necessary */
+		if (netif_tx_queue_stopped(txq))
+			netif_tx_wake_queue(txq);
+		netdev_tx_completed_queue(txq, howmany, bytes_sent);
+		spin_unlock_irqrestore(&tx_queue->txlock, flags);
+	}
 	/* Update dirty indicators */
 	tx_queue->skb_dirtytx = skb_dirtytx;
 	tx_queue->dirty_tx = bdp;
 
-	netdev_tx_completed_queue(txq, howmany, bytes_sent);
 
 	return howmany;
 }

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-26 17:17             ` Eric Dumazet
@ 2012-11-27  9:36               ` Keitel, Tino (ALC NetworX GmbH)
  2012-11-27 12:36                 ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Keitel, Tino (ALC NetworX GmbH) @ 2012-11-27  9:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tino Keitel, Paul Gortmaker, netdev

On Mo, 2012-11-26 at 09:17 -0800, Eric Dumazet wrote:
> On Mon, 2012-11-26 at 18:08 +0100, Keitel, Tino (ALC NetworX GmbH)
> wrote:
> > On Mo, 2012-11-26 at 08:34 -0800, Eric Dumazet wrote:
> > > On Mon, 2012-11-26 at 11:01 +0100, Tino Keitel wrote:
> > > > On Sat, Nov 24, 2012 at 15:43:36 -0800, Eric Dumazet wrote:
> > > > 
> > > > [...]
> > > > 
> > > > > Hmm, I wonder if BQL makes a particular bug showing more often.
> > > > > 
> > > > > I see gianfar uses a very small watchdog_timeo of 1 second, while many
> > > > > drivers use 5 seconds.
> > > > > 
> > > > > What happens if you change this to 5 seconds ?
> > > > 
> > > > I still got the trace and a failing ptp client.
> > > > 
> > > 
> > > Thanks. Is this bug easy to trigger ?
> > > 
> > > I suspect a core issue and a race, likely to happen on your (non x86)
> > > hardware
> > > 
> > > Could you add the following debugging patch ?
> > 
> > No visible difference:
> 
> OK it seems you trigger the problem fast !
> 
> Please try the following as well :

Hi,

yes, it can be triggered within 2 minutes.

The patch makes no difference:

NETDEV WATCHDOG: eth1 (fsl-gianfar): transmit queue 0 timed out
------------[ cut here ]------------
WARNING:
at /home/keitelt1/src/git/linux-stable/net/sched/sch_generic.c:255
Modules linked in:
NIP: c02448e0 LR: c02448e0 CTR: c01c19b8
REGS: c7ffbe40 TRAP: 0700   Not tainted  (3.7.0-rc6-dirty)
MSR: 00029032 <EE,ME,IR,DR,RI>  CR: 22002044  XER: 20000000
TASK = c03dd370[0] 'swapper' THREAD: c03fe000
GPR00: c02448e0 c7ffbef0 c03dd370 0000003f 00000001 c001aea8 00000000
00000001
GPR08: 00000001 c03e0000 00000000 0000009d 22002084 1008eb5c 07ffb000
ffffffff
GPR16: 00000004 c0362c7c c03dfbf8 00200000 c0411ed0 c0411cd0 c0411ad0
ffffffff
GPR24: 00000000 c749e1d8 00000004 c783c330 c0400000 c03e0000 c749e000
00000000
NIP [c02448e0] dev_watchdog+0x288/0x298
LR [c02448e0] dev_watchdog+0x288/0x298
Call Trace:
[c7ffbef0] [c02448e0] dev_watchdog+0x288/0x298 (unreliable)
[c7ffbf20] [c00267f8] call_timer_fn+0x6c/0xd8
[c7ffbf50] [c00269e4] run_timer_softirq+0x180/0x1f8
[c7ffbfa0] [c0021144] __do_softirq+0xc4/0x160
[c7ffbff0] [c000d0b8] call_do_softirq+0x14/0x24
[c03ffe90] [c00058e8] do_softirq+0x8c/0xb8
[c03ffeb0] [c0021358] irq_exit+0x98/0xb4
[c03ffec0] [c0009fb0] timer_interrupt+0x158/0x170
[c03ffee0] [c000f02c] ret_from_except+0x0/0x14
--- Exception: 901 at cpu_idle+0x94/0x100
    LR = cpu_idle+0x94/0x100
[c03fffa0] [c00088ec] cpu_idle+0x5c/0x100 (unreliable)
[c03fffc0] [c03b37b0] start_kernel+0x2dc/0x2f0
[c03ffff0] [00003438] 0x3438
Instruction dump:
7d2903a6 4e800421 80fe01fc 4bffff74 7fc3f378 4bfecb7d 7fc4f378 7fe6fb78
7c651b78 3c60c038 38637280 48090e69 <0fe00000> 39200001 993cc7c9
4bffffb8
---[ end trace 26a9da9c2717d65b ]---

Regards,
Tino



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-27  9:36               ` Keitel, Tino (ALC NetworX GmbH)
@ 2012-11-27 12:36                 ` Eric Dumazet
  2012-11-27 12:42                   ` Keitel, Tino (ALC NetworX GmbH)
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2012-11-27 12:36 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH); +Cc: Tino Keitel, Paul Gortmaker, netdev

On Tue, 2012-11-27 at 10:36 +0100, Keitel, Tino (ALC NetworX GmbH)
wrote:
> On Mo, 2012-11-26 at 09:17 -0800, Eric Dumazet wrote:
> > On Mon, 2012-11-26 at 18:08 +0100, Keitel, Tino (ALC NetworX GmbH)
> > wrote:
> > > On Mo, 2012-11-26 at 08:34 -0800, Eric Dumazet wrote:
> > > > On Mon, 2012-11-26 at 11:01 +0100, Tino Keitel wrote:
> > > > > On Sat, Nov 24, 2012 at 15:43:36 -0800, Eric Dumazet wrote:
> > > > > 
> > > > > [...]
> > > > > 
> > > > > > Hmm, I wonder if BQL makes a particular bug showing more often.
> > > > > > 
> > > > > > I see gianfar uses a very small watchdog_timeo of 1 second, while many
> > > > > > drivers use 5 seconds.
> > > > > > 
> > > > > > What happens if you change this to 5 seconds ?
> > > > > 
> > > > > I still got the trace and a failing ptp client.
> > > > > 
> > > > 
> > > > Thanks. Is this bug easy to trigger ?
> > > > 
> > > > I suspect a core issue and a race, likely to happen on your (non x86)
> > > > hardware
> > > > 
> > > > Could you add the following debugging patch ?
> > > 
> > > No visible difference:
> > 
> > OK it seems you trigger the problem fast !
> > 
> > Please try the following as well :
> 
> Hi,
> 
> yes, it can be triggered within 2 minutes.
> 
> The patch makes no difference:

Can you reproduce the problem using a single cpu ?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-27 12:36                 ` Eric Dumazet
@ 2012-11-27 12:42                   ` Keitel, Tino (ALC NetworX GmbH)
  2012-11-27 13:32                     ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Keitel, Tino (ALC NetworX GmbH) @ 2012-11-27 12:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tino Keitel, Paul Gortmaker, netdev

On Di, 2012-11-27 at 04:36 -0800, Eric Dumazet wrote:
> 
> Can you reproduce the problem using a single cpu ?

Yes, it is a single-CPU system.

Regards,
Tino


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-27 12:42                   ` Keitel, Tino (ALC NetworX GmbH)
@ 2012-11-27 13:32                     ` Eric Dumazet
  2012-11-27 13:49                       ` Eric Dumazet
  2013-02-05 13:00                       ` Keitel, Tino (ALC NetworX GmbH)
  0 siblings, 2 replies; 21+ messages in thread
From: Eric Dumazet @ 2012-11-27 13:32 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH); +Cc: Tino Keitel, Paul Gortmaker, netdev

On Tue, 2012-11-27 at 13:42 +0100, Keitel, Tino (ALC NetworX GmbH)
wrote:
> On Di, 2012-11-27 at 04:36 -0800, Eric Dumazet wrote:
> > 
> > Can you reproduce the problem using a single cpu ?
> 
> Yes, it is a single-CPU system.

Can you reproduce the problem without PTP running, or disabled in the
driver ?

(comment the "priv->hwts_tx_en = 1;" line)


This looks like we miss an interrupt ( or TXBD_INTERRUPT not correctly
set)

And it could be a bug occurring if we try to send one skb with fragments
and skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-27 13:32                     ` Eric Dumazet
@ 2012-11-27 13:49                       ` Eric Dumazet
  2013-02-05 13:00                       ` Keitel, Tino (ALC NetworX GmbH)
  1 sibling, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2012-11-27 13:49 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH); +Cc: Tino Keitel, Paul Gortmaker, netdev

On Tue, 2012-11-27 at 05:32 -0800, Eric Dumazet wrote:

> Can you reproduce the problem without PTP running, or disabled in the
> driver ?
> 
> (comment the "priv->hwts_tx_en = 1;" line)
> 
> 
> This looks like we miss an interrupt ( or TXBD_INTERRUPT not correctly
> set)
> 
> And it could be a bug occurring if we try to send one skb with fragments
> and skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP 
> 
> 

By the way are any errata flagged in gfar_detect_errata() ?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2012-11-27 13:32                     ` Eric Dumazet
  2012-11-27 13:49                       ` Eric Dumazet
@ 2013-02-05 13:00                       ` Keitel, Tino (ALC NetworX GmbH)
  2013-02-06  1:55                         ` Paul Gortmaker
  2013-02-07 21:05                         ` Paul Gortmaker
  1 sibling, 2 replies; 21+ messages in thread
From: Keitel, Tino (ALC NetworX GmbH) @ 2013-02-05 13:00 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tino Keitel, Paul Gortmaker, netdev

On Di, 2012-11-27 at 05:32 -0800, Eric Dumazet wrote:
> On Tue, 2012-11-27 at 13:42 +0100, Keitel, Tino (ALC NetworX GmbH)
> wrote:
> > On Di, 2012-11-27 at 04:36 -0800, Eric Dumazet wrote:
> > > 
> > > Can you reproduce the problem using a single cpu ?
> > 
> > Yes, it is a single-CPU system.
> 
> Can you reproduce the problem without PTP running, or disabled in the
> driver ?
> 
> (comment the "priv->hwts_tx_en = 1;" line)

I can't reproduce it with that line commented. However, so far I was
only able to reproduce it when starting the ptp2 client, so maybe this
is connected.

> By the way are any errata flagged in gfar_detect_errata() ?

This is from dmesg:

fsl-gianfar e0024000.ethernet: enabled errata workarounds, flags: 0x7

0x7 would be GFAR_ERRATA_74, GFAR_ERRATA_76 and GFAR_ERRATA_A002
according to drivers/net/ethernet/freescale/gianfar.h.

Regards,
Tino


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2013-02-05 13:00                       ` Keitel, Tino (ALC NetworX GmbH)
@ 2013-02-06  1:55                         ` Paul Gortmaker
  2013-02-06 15:20                           ` Keitel, Tino (ALC NetworX GmbH)
  2013-02-07 21:05                         ` Paul Gortmaker
  1 sibling, 1 reply; 21+ messages in thread
From: Paul Gortmaker @ 2013-02-06  1:55 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH); +Cc: Eric Dumazet, Tino Keitel, netdev

On Tue, Feb 5, 2013 at 8:00 AM, Keitel, Tino (ALC NetworX GmbH)
<Tino.Keitel@alcnetworx.de> wrote:
> On Di, 2012-11-27 at 05:32 -0800, Eric Dumazet wrote:
>> On Tue, 2012-11-27 at 13:42 +0100, Keitel, Tino (ALC NetworX GmbH)
>> wrote:
>> > On Di, 2012-11-27 at 04:36 -0800, Eric Dumazet wrote:
>> > >
>> > > Can you reproduce the problem using a single cpu ?
>> >
>> > Yes, it is a single-CPU system.
>>
>> Can you reproduce the problem without PTP running, or disabled in the
>> driver ?
>>
>> (comment the "priv->hwts_tx_en = 1;" line)
>
> I can't reproduce it with that line commented. However, so far I was
> only able to reproduce it when starting the ptp2 client, so maybe this
> is connected.

How critical is ptp2 for this?  And/or the platform details?  I can
try and reproduce it on an mpc8349 system and/or an mpc8548
system (and even an mpc8641D system) but I'd rather know that
was a meaningful chase and not a snipe hunt before going there.

So, in that respect, a "If you run this, you will get this" type of
error message would be good.   I understand that may be too
idealistic, though.

Thanks,
Paul.
--

>
>> By the way are any errata flagged in gfar_detect_errata() ?
>
> This is from dmesg:
>
> fsl-gianfar e0024000.ethernet: enabled errata workarounds, flags: 0x7
>
> 0x7 would be GFAR_ERRATA_74, GFAR_ERRATA_76 and GFAR_ERRATA_A002
> according to drivers/net/ethernet/freescale/gianfar.h.
>
> Regards,
> Tino
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2013-02-06  1:55                         ` Paul Gortmaker
@ 2013-02-06 15:20                           ` Keitel, Tino (ALC NetworX GmbH)
  2013-04-29 13:14                             ` Claudiu Manoil
  0 siblings, 1 reply; 21+ messages in thread
From: Keitel, Tino (ALC NetworX GmbH) @ 2013-02-06 15:20 UTC (permalink / raw)
  To: Paul Gortmaker; +Cc: Eric Dumazet, Tino Keitel, netdev

On Di, 2013-02-05 at 20:55 -0500, Paul Gortmaker wrote:
> How critical is ptp2 for this?  And/or the platform details?  I can
> try and reproduce it on an mpc8349 system and/or an mpc8548
> system (and even an mpc8641D system) but I'd rather know that
> was a meaningful chase and not a snipe hunt before going there.
> 
> So, in that respect, a "If you run this, you will get this" type of
> error message would be good.   I understand that may be too
> idealistic, though.

Hi,

I'm using an MPC8313ERDB board and was able to reproduce it with a
vanilla 3.6.7 kernel. There are some modifications to the device tree,
though. I'll try to reproduce this with as few changes to the device
tree as possible.

Ptp2 is critical for the desired use. So far I also only was able to
reproduce it with this ptp2 client.

If it is a possible option for you, I could provide an affected device
to make the issue easy to reproduce.

Regards,
Tino


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2013-02-05 13:00                       ` Keitel, Tino (ALC NetworX GmbH)
  2013-02-06  1:55                         ` Paul Gortmaker
@ 2013-02-07 21:05                         ` Paul Gortmaker
  1 sibling, 0 replies; 21+ messages in thread
From: Paul Gortmaker @ 2013-02-07 21:05 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH); +Cc: Eric Dumazet, Tino Keitel, netdev

On 13-02-05 08:00 AM, Keitel, Tino (ALC NetworX GmbH) wrote:
> On Di, 2012-11-27 at 05:32 -0800, Eric Dumazet wrote:
>> On Tue, 2012-11-27 at 13:42 +0100, Keitel, Tino (ALC NetworX GmbH)
>> wrote:
>>> On Di, 2012-11-27 at 04:36 -0800, Eric Dumazet wrote:
>>>>
>>>> Can you reproduce the problem using a single cpu ?
>>>
>>> Yes, it is a single-CPU system.
>>
>> Can you reproduce the problem without PTP running, or disabled in the
>> driver ?
>>
>> (comment the "priv->hwts_tx_en = 1;" line)
> 
> I can't reproduce it with that line commented. However, so far I was
> only able to reproduce it when starting the ptp2 client, so maybe this
> is connected.

I found an mpc8315erdb, and built the default yocto build (v3.4.20,
which should have the issue based on your earlier reports.)

> 
>> By the way are any errata flagged in gfar_detect_errata() ?
> 
> This is from dmesg:
> 
> fsl-gianfar e0024000.ethernet: enabled errata workarounds, flags: 0x7
> 
> 0x7 would be GFAR_ERRATA_74, GFAR_ERRATA_76 and GFAR_ERRATA_A002
> according to drivers/net/ethernet/freescale/gianfar.h.

The MPC8315ECE.pdf lists the 8315 as having 76 and A002 (amongst
a lot of others.)  However the driver only does errata checks
for the 8313 it seems.  I also then manually forced the driver
to enable the errata and confirmed I saw 0x7 flag in dmesg.

Finally I added the meta-networking layer to the yocto build to
get a ptpd2 (2.2.0-r1)

On another board, I ran a server as:
    ptpd2 -G -C -c -b eth0

On the 8315, I ran the client as:
    ptpd2 -g -C -c -b eth0

In neither case (errata off, and errata manually enabled) I did not
manage to get the tx timeout that you got.

There must be something more specific to your environment, your
ptpd client (and args) and perhaps the board itself.

Paul.
--

> 
> Regards,
> Tino
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2013-02-06 15:20                           ` Keitel, Tino (ALC NetworX GmbH)
@ 2013-04-29 13:14                             ` Claudiu Manoil
  2013-04-29 13:20                               ` Tino Keitel
  0 siblings, 1 reply; 21+ messages in thread
From: Claudiu Manoil @ 2013-04-29 13:14 UTC (permalink / raw)
  To: Keitel, Tino (ALC NetworX GmbH)
  Cc: Paul Gortmaker, Eric Dumazet, Tino Keitel, netdev

Hi,

I think I found a way to reproduce this easily, by forcing
timestamping on Tx with the following code change:

"""
drivers/net/ethernet/freescale/gianfar.c:
@@ -1210,6 +1210,8 @@ static int gfar_probe(struct platform_device *ofdev)
         /* Create all the sysfs files */
         gfar_init_sysfs(dev);

+       priv->hwts_tx_en = 1;
+
         /* Print out the device info */
         netdev_info(dev, "mac: %pM\n", dev->dev_addr);

@@ -2084,6 +2087,7 @@ static int gfar_start_xmit(struct sk_buff *skb, 
struct net_device *dev)
         txq = netdev_get_tx_queue(dev, rq);
         base = tx_queue->tx_bd_base;
         regs = tx_queue->grp->regs;
+       skb_shinfo(skb)->tx_flags |= SKBTX_HW_TSTAMP;

         /* check if time stamp should be generated */
         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
"""

With this change, tx timeout triggers with the first packet sent.
I tested it on p1020/ p1010 boards, so it's not related to a specific
board.

The proposed fix (passes this test):
http://patchwork.ozlabs.org/patch/240365/

Does it work for you?
I'm a bit concerned about BQL support being added to Gianfar. Seems that
it only adds an overhead to this driver and possible failure points.
I'm not sure whether Gianfar has the issue of excessive H/W queuing.
I'm also wondering why only 5 or 6 eth drivers integrated BQL to date.

Regards,
Claudiu

On 2/6/2013 5:20 PM, Keitel, Tino (ALC NetworX GmbH) wrote:
> On Di, 2013-02-05 at 20:55 -0500, Paul Gortmaker wrote:
>> How critical is ptp2 for this?  And/or the platform details?  I can
>> try and reproduce it on an mpc8349 system and/or an mpc8548
>> system (and even an mpc8641D system) but I'd rather know that
>> was a meaningful chase and not a snipe hunt before going there.
>>
>> So, in that respect, a "If you run this, you will get this" type of
>> error message would be good.   I understand that may be too
>> idealistic, though.
>
> Hi,
>
> I'm using an MPC8313ERDB board and was able to reproduce it with a
> vanilla 3.6.7 kernel. There are some modifications to the device tree,
> though. I'll try to reproduce this with as few changes to the device
> tree as possible.
>
> Ptp2 is critical for the desired use. So far I also only was able to
> reproduce it with this ptp2 client.
>
> If it is a possible option for you, I could provide an affected device
> to make the issue easy to reproduce.
>
> Regards,
> Tino

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2013-04-29 13:14                             ` Claudiu Manoil
@ 2013-04-29 13:20                               ` Tino Keitel
  2013-05-27 12:47                                 ` Tino Keitel
  0 siblings, 1 reply; 21+ messages in thread
From: Tino Keitel @ 2013-04-29 13:20 UTC (permalink / raw)
  To: Claudiu Manoil; +Cc: Paul Gortmaker, Eric Dumazet, Tino Keitel, netdev

On Mo, 2013-04-29 at 15:14 +0200, Claudiu Manoil wrote:

> The proposed fix (passes this test):
> http://patchwork.ozlabs.org/patch/240365/
> 
> Does it work for you?

Hi,

thanks a lot. I'll try the patch and report back.

Regards,
Tino

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: BQL support in gianfar causes network hickup
  2013-04-29 13:20                               ` Tino Keitel
@ 2013-05-27 12:47                                 ` Tino Keitel
  0 siblings, 0 replies; 21+ messages in thread
From: Tino Keitel @ 2013-05-27 12:47 UTC (permalink / raw)
  To: Claudiu Manoil; +Cc: Paul Gortmaker, Eric Dumazet, Tino Keitel, netdev

On Mo, 2013-04-29 at 15:20 +0200, Tino Keitel wrote:
> On Mo, 2013-04-29 at 15:14 +0200, Claudiu Manoil wrote:
> 
> > The proposed fix (passes this test):
> > http://patchwork.ozlabs.org/patch/240365/
> > 
> > Does it work for you?

Hi,

I tested the patch over the weekend and it looks fine. I then re-checked
without the patch and got the bug immediately. So the patch seems to
work fine.

Thanks a lot and regards,
Tino Keitel

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2013-05-27 12:47 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-11-23 15:58 BQL support in gianfar causes network hickup Keitel, Tino (ALC NetworX GmbH)
2012-11-23 16:34 ` Paul Gortmaker
2012-11-23 19:42   ` Francois Romieu
2012-11-24 20:42   ` Tino Keitel
2012-11-24 23:43     ` Eric Dumazet
2012-11-26 10:01       ` Tino Keitel
2012-11-26 16:34         ` Eric Dumazet
2012-11-26 17:08           ` Keitel, Tino (ALC NetworX GmbH)
2012-11-26 17:17             ` Eric Dumazet
2012-11-27  9:36               ` Keitel, Tino (ALC NetworX GmbH)
2012-11-27 12:36                 ` Eric Dumazet
2012-11-27 12:42                   ` Keitel, Tino (ALC NetworX GmbH)
2012-11-27 13:32                     ` Eric Dumazet
2012-11-27 13:49                       ` Eric Dumazet
2013-02-05 13:00                       ` Keitel, Tino (ALC NetworX GmbH)
2013-02-06  1:55                         ` Paul Gortmaker
2013-02-06 15:20                           ` Keitel, Tino (ALC NetworX GmbH)
2013-04-29 13:14                             ` Claudiu Manoil
2013-04-29 13:20                               ` Tino Keitel
2013-05-27 12:47                                 ` Tino Keitel
2013-02-07 21:05                         ` Paul Gortmaker

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.