* [PATCH] mlx4: give precise rx/tx bytes/packets counters
@ 2016-11-25 15:46 Eric Dumazet
  2016-11-25 16:03 ` David Laight
                   ` (3 more replies)
  0 siblings, 4 replies; 33+ messages in thread
From: Eric Dumazet @ 2016-11-25 15:46 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Tariq Toukan

From: Eric Dumazet <edumazet@google.com>

mlx4 stats are chaotic because a deferred work queue is responsible
for updating them every 250 ms.

Even sampling stats once per second with "sar -n DEV 1" gives
variations like the following:

lpaa23:~# sar -n DEV 1 10 | grep eth0 | cut -c1-65
07:39:22         eth0 146877.00 3265554.00   9467.15 4828168.50  
07:39:23         eth0 146587.00 3260329.00   9448.15 4820445.98  
07:39:24         eth0 146894.00 3259989.00   9468.55 4819943.26  
07:39:25         eth0 110368.00 2454497.00   7113.95 3629012.17  <<>>
07:39:26         eth0 146563.00 3257502.00   9447.25 4816266.23  
07:39:27         eth0 145678.00 3258292.00   9389.79 4817414.39  
07:39:28         eth0 145268.00 3253171.00   9363.85 4809852.46  
07:39:29         eth0 146439.00 3262185.00   9438.97 4823172.48  
07:39:30         eth0 146758.00 3264175.00   9459.94 4826124.13  
07:39:31         eth0 146843.00 3256903.00   9465.44 4815381.97  
Average:         eth0 142827.50 3179259.70   9206.30 4700578.16  

This patch allows the rx/tx bytes/packets counters to be folded at
the time we need the stats.

We can now fetch stats every 1 ms if we want to check NIC behavior
over a small time window. It also makes it easier to detect anomalies.

lpaa23:~# sar -n DEV 1 10 | grep eth0 | cut -c1-65
07:42:50         eth0 142915.00 3177696.00   9212.06 4698270.42  
07:42:51         eth0 143741.00 3200232.00   9265.15 4731593.02  
07:42:52         eth0 142781.00 3171600.00   9202.92 4689260.16  
07:42:53         eth0 143835.00 3192932.00   9271.80 4720761.39  
07:42:54         eth0 141922.00 3165174.00   9147.64 4679759.21  
07:42:55         eth0 142993.00 3207038.00   9216.78 4741653.05  
07:42:56         eth0 141394.06 3154335.64   9113.85 4663731.73  
07:42:57         eth0 141850.00 3161202.00   9144.48 4673866.07  
07:42:58         eth0 143439.00 3180736.00   9246.05 4702755.35  
07:42:59         eth0 143501.00 3210992.00   9249.99 4747501.84  
Average:         eth0 142835.66 3182165.93   9206.98 4704874.08  

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2 
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |    1 
 drivers/net/ethernet/mellanox/mlx4/en_port.c    |   77 +++++++++-----
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    1 
 4 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 487a58f9c192896852fef271b6cce9bde132deb7..d9c9f86a30df953fa555934c5406057dcaf28960 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -367,6 +367,8 @@ static void mlx4_en_get_ethtool_stats(struct net_device *dev,
 
 	spin_lock_bh(&priv->stats_lock);
 
+	mlx4_en_fold_software_stats(dev);
+
 	for (i = 0; i < NUM_MAIN_STATS; i++, bitmap_iterator_inc(&it))
 		if (bitmap_iterator_test(&it))
 			data[index++] = ((unsigned long *)&dev->stats)[i];
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 9018bb1b2e12142e048281a9d28ddf95e0023a61..d28d841db23ce885d2011877a156bacf23f65afe 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1321,6 +1321,7 @@ mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 
 	spin_lock_bh(&priv->stats_lock);
+	mlx4_en_fold_software_stats(dev);
 	netdev_stats_to_stats64(stats, &dev->stats);
 	spin_unlock_bh(&priv->stats_lock);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
index 1eb4c1e10bad1dad26049876acf107a2073a6ab1..c6c4f1238923e09eced547454b86c68720292859 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -147,6 +147,39 @@ static unsigned long en_stats_adder(__be64 *start, __be64 *next, int num)
 	return ret;
 }
 
+void mlx4_en_fold_software_stats(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	unsigned long packets, bytes;
+	int i;
+
+	if (mlx4_is_master(mdev->dev))
+		return;
+
+	packets = 0;
+	bytes = 0;
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
+
+		packets += READ_ONCE(ring->packets);
+		bytes   += READ_ONCE(ring->bytes);
+	}
+	dev->stats.rx_packets = packets;
+	dev->stats.rx_bytes = bytes;
+
+	packets = 0;
+	bytes = 0;
+	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
+		const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i];
+
+		packets += READ_ONCE(ring->packets);
+		bytes   += READ_ONCE(ring->bytes);
+	}
+	dev->stats.tx_packets = packets;
+	dev->stats.tx_bytes = bytes;
+}
+
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 {
 	struct mlx4_counter tmp_counter_stats;
@@ -159,6 +192,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	u64 in_mod = reset << 8 | port;
 	int err;
 	int i, counter_index;
+	unsigned long sw_tx_dropped = 0;
 	unsigned long sw_rx_dropped = 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
@@ -174,8 +208,8 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 
 	spin_lock_bh(&priv->stats_lock);
 
-	stats->rx_packets = 0;
-	stats->rx_bytes = 0;
+	mlx4_en_fold_software_stats(dev);
+
 	priv->port_stats.rx_chksum_good = 0;
 	priv->port_stats.rx_chksum_none = 0;
 	priv->port_stats.rx_chksum_complete = 0;
@@ -183,19 +217,16 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	priv->xdp_stats.rx_xdp_tx      = 0;
 	priv->xdp_stats.rx_xdp_tx_full = 0;
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		stats->rx_packets += priv->rx_ring[i]->packets;
-		stats->rx_bytes += priv->rx_ring[i]->bytes;
-		sw_rx_dropped += priv->rx_ring[i]->dropped;
-		priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok;
-		priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none;
-		priv->port_stats.rx_chksum_complete += priv->rx_ring[i]->csum_complete;
-		priv->xdp_stats.rx_xdp_drop    += priv->rx_ring[i]->xdp_drop;
-		priv->xdp_stats.rx_xdp_tx      += priv->rx_ring[i]->xdp_tx;
-		priv->xdp_stats.rx_xdp_tx_full += priv->rx_ring[i]->xdp_tx_full;
+		const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
+
+		sw_rx_dropped			+= READ_ONCE(ring->dropped);
+		priv->port_stats.rx_chksum_good += READ_ONCE(ring->csum_ok);
+		priv->port_stats.rx_chksum_none += READ_ONCE(ring->csum_none);
+		priv->port_stats.rx_chksum_complete += READ_ONCE(ring->csum_complete);
+		priv->xdp_stats.rx_xdp_drop	+= READ_ONCE(ring->xdp_drop);
+		priv->xdp_stats.rx_xdp_tx	+= READ_ONCE(ring->xdp_tx);
+		priv->xdp_stats.rx_xdp_tx_full	+= READ_ONCE(ring->xdp_tx_full);
 	}
-	stats->tx_packets = 0;
-	stats->tx_bytes = 0;
-	stats->tx_dropped = 0;
 	priv->port_stats.tx_chksum_offload = 0;
 	priv->port_stats.queue_stopped = 0;
 	priv->port_stats.wake_queue = 0;
@@ -205,15 +236,14 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
 		const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i];
 
-		stats->tx_packets += ring->packets;
-		stats->tx_bytes += ring->bytes;
-		stats->tx_dropped += ring->tx_dropped;
-		priv->port_stats.tx_chksum_offload += ring->tx_csum;
-		priv->port_stats.queue_stopped     += ring->queue_stopped;
-		priv->port_stats.wake_queue        += ring->wake_queue;
-		priv->port_stats.tso_packets       += ring->tso_packets;
-		priv->port_stats.xmit_more         += ring->xmit_more;
+		sw_tx_dropped			   += READ_ONCE(ring->tx_dropped);
+		priv->port_stats.tx_chksum_offload += READ_ONCE(ring->tx_csum);
+		priv->port_stats.queue_stopped     += READ_ONCE(ring->queue_stopped);
+		priv->port_stats.wake_queue        += READ_ONCE(ring->wake_queue);
+		priv->port_stats.tso_packets       += READ_ONCE(ring->tso_packets);
+		priv->port_stats.xmit_more         += READ_ONCE(ring->xmit_more);
 	}
+
 	if (mlx4_is_master(mdev->dev)) {
 		stats->rx_packets = en_stats_adder(&mlx4_en_stats->RTOT_prio_0,
 						   &mlx4_en_stats->RTOT_prio_1,
@@ -251,7 +281,8 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
 	stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
 	stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
-	stats->tx_dropped += be32_to_cpu(mlx4_en_stats->TDROP);
+	stats->tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP) +
+			    sw_tx_dropped;
 
 	/* RX stats */
 	priv->pkstats.rx_multicast_packets = stats->multicast;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 574bcbb1b38fc4758511d8f7bd17a87b0a507a73..20a936428f4a44c8ca0a7161855da310f9166b50 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -755,6 +755,7 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq);
 int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
 int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
 
+void mlx4_en_fold_software_stats(struct net_device *dev);
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
 int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
 


* RE: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 15:46 [PATCH] mlx4: give precise rx/tx bytes/packets counters Eric Dumazet
@ 2016-11-25 16:03 ` David Laight
  2016-11-25 16:16   ` Eric Dumazet
  2016-11-26 22:47 ` Saeed Mahameed
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 33+ messages in thread
From: David Laight @ 2016-11-25 16:03 UTC (permalink / raw)
  To: 'Eric Dumazet', David Miller; +Cc: netdev, Tariq Toukan

From: Of Eric Dumazet
> Sent: 25 November 2016 15:46
> mlx4 stats are chaotic because a deferred work queue is responsible
> to update them every 250 ms.
> 
> Even sampling stats every one second with "sar -n DEV 1" gives
> variations like the following :
...
> This patch allows rx/tx bytes/packets counters being folded at the
> time we need stats.
> 
> We now can fetch stats every 1 ms if we want to check NIC behavior
> on a small time window. It is also easier to detect anomalies.
...
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Tariq Toukan <tariqt@mellanox.com>
...
>  	for (i = 0; i < priv->rx_ring_num; i++) {
> -		stats->rx_packets += priv->rx_ring[i]->packets;
> -		stats->rx_bytes += priv->rx_ring[i]->bytes;
> -		sw_rx_dropped += priv->rx_ring[i]->dropped;
> -		priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok;
> -		priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none;
> -		priv->port_stats.rx_chksum_complete += priv->rx_ring[i]->csum_complete;
> -		priv->xdp_stats.rx_xdp_drop    += priv->rx_ring[i]->xdp_drop;
> -		priv->xdp_stats.rx_xdp_tx      += priv->rx_ring[i]->xdp_tx;
> -		priv->xdp_stats.rx_xdp_tx_full += priv->rx_ring[i]->xdp_tx_full;
> +		const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
> +
> +		sw_rx_dropped			+= READ_ONCE(ring->dropped);
> +		priv->port_stats.rx_chksum_good += READ_ONCE(ring->csum_ok);
> +		priv->port_stats.rx_chksum_none += READ_ONCE(ring->csum_none);
> +		priv->port_stats.rx_chksum_complete += READ_ONCE(ring->csum_complete);
> +		priv->xdp_stats.rx_xdp_drop	+= READ_ONCE(ring->xdp_drop);
> +		priv->xdp_stats.rx_xdp_tx	+= READ_ONCE(ring->xdp_tx);
> +		priv->xdp_stats.rx_xdp_tx_full	+= READ_ONCE(ring->xdp_tx_full);

This chunk (and the one after) seem to be adding READ_ONCE() and don't
seem to be related to the commit message.

	David



* Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 16:03 ` David Laight
@ 2016-11-25 16:16   ` Eric Dumazet
  2016-11-25 16:30     ` Aw: " Lino Sanfilippo
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-11-25 16:16 UTC (permalink / raw)
  To: David Laight; +Cc: David Miller, netdev, Tariq Toukan

On Fri, 2016-11-25 at 16:03 +0000, David Laight wrote:
> From: Of Eric Dumazet
> > Sent: 25 November 2016 15:46
> > mlx4 stats are chaotic because a deferred work queue is responsible
> > to update them every 250 ms.
> > 
> > Even sampling stats every one second with "sar -n DEV 1" gives
> > variations like the following :
> ...
> > This patch allows rx/tx bytes/packets counters being folded at the
> > time we need stats.
> > 
> > We now can fetch stats every 1 ms if we want to check NIC behavior
> > on a small time window. It is also easier to detect anomalies.
> ...
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > Cc: Tariq Toukan <tariqt@mellanox.com>
> ...
> >  	for (i = 0; i < priv->rx_ring_num; i++) {
> > -		stats->rx_packets += priv->rx_ring[i]->packets;
> > -		stats->rx_bytes += priv->rx_ring[i]->bytes;
> > -		sw_rx_dropped += priv->rx_ring[i]->dropped;
> > -		priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok;
> > -		priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none;
> > -		priv->port_stats.rx_chksum_complete += priv->rx_ring[i]->csum_complete;
> > -		priv->xdp_stats.rx_xdp_drop    += priv->rx_ring[i]->xdp_drop;
> > -		priv->xdp_stats.rx_xdp_tx      += priv->rx_ring[i]->xdp_tx;
> > -		priv->xdp_stats.rx_xdp_tx_full += priv->rx_ring[i]->xdp_tx_full;
> > +		const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
> > +
> > +		sw_rx_dropped			+= READ_ONCE(ring->dropped);
> > +		priv->port_stats.rx_chksum_good += READ_ONCE(ring->csum_ok);
> > +		priv->port_stats.rx_chksum_none += READ_ONCE(ring->csum_none);
> > +		priv->port_stats.rx_chksum_complete += READ_ONCE(ring->csum_complete);
> > +		priv->xdp_stats.rx_xdp_drop	+= READ_ONCE(ring->xdp_drop);
> > +		priv->xdp_stats.rx_xdp_tx	+= READ_ONCE(ring->xdp_tx);
> > +		priv->xdp_stats.rx_xdp_tx_full	+= READ_ONCE(ring->xdp_tx_full);
> 
> This chunk (and the one after) seem to be adding READ_ONCE() and don't
> seem to be related to the commit message.

The READ_ONCE() calls document the fact that no lock is taken to fetch
the stats, while other cpus might be changing them.
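
To illustrate, this is just the pattern already used in this patch
(nothing new here, the field names are the ones from the diff above):

	/* reader side (ethtool / ndo_get_stats64):
	 * no lock shared with the producers is held
	 */
	unsigned long packets = 0;
	int i;

	for (i = 0; i < priv->rx_ring_num; i++)
		packets += READ_ONCE(priv->rx_ring[i]->packets);

	/* producer side keeps doing, under its own ring lock: */
	ring->packets++;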

I had no answer yet from https://patchwork.ozlabs.org/patch/698449/

So I thought it was not needed to explain this in the changelog, given
that it apparently is one of the few things that can prevent someone
from understanding one of my changes :/

Apparently nobody really understands the purpose of READ_ONCE(); it is
really a pity we have to explain this over and over.


* Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 16:16   ` Eric Dumazet
@ 2016-11-25 16:30     ` Lino Sanfilippo
  2016-11-25 19:19       ` Eric Dumazet
  0 siblings, 1 reply; 33+ messages in thread
From: Lino Sanfilippo @ 2016-11-25 16:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Laight, David Miller, netdev, Tariq Toukan

Hi,


> 
> The READ_ONCE() are documenting the fact that no lock is taken to fetch
> the stats, while another cpus might being changing them.
> 
> I had no answer yet from https://patchwork.ozlabs.org/patch/698449/
> 
> So I thought it was not needed to explain this in the changelog, given
> that it apparently is one of the few things that can block someone to
> understand one of my changes :/
> 
> Apparently nobody really understands READ_ONCE() purpose, it is really a
> pity we have to explain this over and over.
> 

Even at the risk of showing once more a lack of understanding for READ_ONCE():
Does not a READ_ONCE() have to be paired with some kind of WRITE_ONCE()?
Furthermore: there are quite a few network drivers that ensure visibility of
the descriptor queue indices between the xmit and xmit completion functions
by means of smp barriers. Could all these drivers theoretically be adjusted
to use READ_ONCE()/WRITE_ONCE() for the indices instead?

Regards,
Lino


* Re: Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 16:30     ` Aw: " Lino Sanfilippo
@ 2016-11-25 19:19       ` Eric Dumazet
  2016-11-28 22:02         ` Lino Sanfilippo
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-11-25 19:19 UTC (permalink / raw)
  To: Lino Sanfilippo; +Cc: David Laight, David Miller, netdev, Tariq Toukan

On Fri, 2016-11-25 at 17:30 +0100, Lino Sanfilippo wrote:
> Hi,
> 
> 
> > 
> > The READ_ONCE() are documenting the fact that no lock is taken to fetch
> > the stats, while another cpus might being changing them.
> > 
> > I had no answer yet from https://patchwork.ozlabs.org/patch/698449/
> > 
> > So I thought it was not needed to explain this in the changelog, given
> > that it apparently is one of the few things that can block someone to
> > understand one of my changes :/
> > 
> > Apparently nobody really understands READ_ONCE() purpose, it is really a
> > pity we have to explain this over and over.
> > 
> 
> Even at the risk of showing once more a lack of understanding for
> READ_ONCE():
> Does not a READ_ONCE() have to e paired with some kind of
> WRITE_ONCE()? 

You are right.

Although in this case, the producers are using a lock, and do

ring->packets++;

We hopefully have compilers/cpus that do not put intermediate garbage in
ring->packets while doing the increment.

One problem with :

WRITE_ONCE(ring->packets, ring->packets + 1);

is that gcc no longer uses an INC instruction.

Maybe we need some ADD_ONCE(ptr, val) macro doing the proper thing.
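
Something like this, as a purely hypothetical sketch (ADD_ONCE() does not
exist in the tree; the assumption is that a volatile access is enough to
keep the compiler from using the location as scratch space, while still
letting it pick an add-to-memory instruction):

	#define ADD_ONCE(ptr, val)                                      \
	do {                                                            \
		*(volatile typeof(ptr) *)&(ptr) += (val);               \
	} while (0)

	/* producer fast path, under the ring lock */
	ADD_ONCE(ring->packets, 1);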

> Furthermore: there a quite some network drivers that ensure visibility
> of 
> the descriptor queue indices between xmit and xmit completion function
> by means of
> smp barriers. Could all these drivers theoretically be adjusted to use
> READ_ONCE(),
> WRITE_ONCE() for the indices instead?
> 

Well, for this precise case we do need appropriate smp barriers.

READ_ONCE() can be better than poor barrier(), look at 
https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=b668534c1d9b80f4cda4d761eb11d3a6c9f4ced8
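
The difference is roughly this (purely illustrative; ring->cons is a
made-up field here, and this is not taken from that commit):

	/* barrier() makes the compiler forget *all* cached memory values: */
	while (ring->cons == old_cons) {
		cpu_relax();
		barrier();
	}

	/* READ_ONCE() forces a fresh load of just this location, which
	 * documents the intent and lets other values stay in registers:
	 */
	while (READ_ONCE(ring->cons) == old_cons)
		cpu_relax();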


* Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 15:46 [PATCH] mlx4: give precise rx/tx bytes/packets counters Eric Dumazet
  2016-11-25 16:03 ` David Laight
@ 2016-11-26 22:47 ` Saeed Mahameed
  2016-11-27  2:16   ` Eric Dumazet
  2016-11-29 18:37 ` David Miller
  2016-11-30 14:08 ` Regression: " Jesper Dangaard Brouer
  3 siblings, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2016-11-26 22:47 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Tariq Toukan

On Fri, Nov 25, 2016 at 5:46 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> mlx4 stats are chaotic because a deferred work queue is responsible
> to update them every 250 ms.
>
Hello Eric,

Well, the only historical reason for this deferred work is that we
query the FW for some counters, which might sleep.
And there is one place in the kernel where dev_get_stats(dev, &temp)
is called under a rw lock "read_lock(&dev_base_lock);"
in http://lxr.free-electrons.com/source/net/core/net-sysfs.c#L552; I
am not sure why it is this way. Maybe it is time to fix this and get
rid of the deferred work, which would give you the same precision even
when reading ethtool stats, which this patch didn't take care of.
This would also improve other drivers which might sleep while reading
stats.
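
For context, the call site in question looks roughly like this
(paraphrased from net/core/net-sysfs.c netstat_show() of that era, so
treat the details as approximate): dev_get_stats(), and through it the
driver's ndo_get_stats64(), runs with a rwlock held and therefore must
not sleep, which is what pushes drivers toward deferred/cached stats.

	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *stats;

	read_lock(&dev_base_lock);
	stats = dev_get_stats(dev, &temp); /* -> ndo_get_stats64(), must not sleep */
	/* the requested u64 counter is then copied out of *stats into the
	 * sysfs buffer, still under dev_base_lock
	 */
	read_unlock(&dev_base_lock);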

> Even sampling stats every one second with "sar -n DEV 1" gives
> variations like the following :
>
> lpaa23:~# sar -n DEV 1 10 | grep eth0 | cut -c1-65
> 07:39:22         eth0 146877.00 3265554.00   9467.15 4828168.50
> 07:39:23         eth0 146587.00 3260329.00   9448.15 4820445.98
> 07:39:24         eth0 146894.00 3259989.00   9468.55 4819943.26
> 07:39:25         eth0 110368.00 2454497.00   7113.95 3629012.17  <<>>
> 07:39:26         eth0 146563.00 3257502.00   9447.25 4816266.23
> 07:39:27         eth0 145678.00 3258292.00   9389.79 4817414.39
> 07:39:28         eth0 145268.00 3253171.00   9363.85 4809852.46
> 07:39:29         eth0 146439.00 3262185.00   9438.97 4823172.48
> 07:39:30         eth0 146758.00 3264175.00   9459.94 4826124.13
> 07:39:31         eth0 146843.00 3256903.00   9465.44 4815381.97
> Average:         eth0 142827.50 3179259.70   9206.30 4700578.16
>
> This patch allows rx/tx bytes/packets counters being folded at the
> time we need stats.
>
> We now can fetch stats every 1 ms if we want to check NIC behavior
> on a small time window. It is also easier to detect anomalies.
>
> lpaa23:~# sar -n DEV 1 10 | grep eth0 | cut -c1-65
> 07:42:50         eth0 142915.00 3177696.00   9212.06 4698270.42
> 07:42:51         eth0 143741.00 3200232.00   9265.15 4731593.02
> 07:42:52         eth0 142781.00 3171600.00   9202.92 4689260.16
> 07:42:53         eth0 143835.00 3192932.00   9271.80 4720761.39
> 07:42:54         eth0 141922.00 3165174.00   9147.64 4679759.21
> 07:42:55         eth0 142993.00 3207038.00   9216.78 4741653.05
> 07:42:56         eth0 141394.06 3154335.64   9113.85 4663731.73
> 07:42:57         eth0 141850.00 3161202.00   9144.48 4673866.07
> 07:42:58         eth0 143439.00 3180736.00   9246.05 4702755.35
> 07:42:59         eth0 143501.00 3210992.00   9249.99 4747501.84
> Average:         eth0 142835.66 3182165.93   9206.98 4704874.08
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Tariq Toukan <tariqt@mellanox.com>
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2
>  drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |    1
>  drivers/net/ethernet/mellanox/mlx4/en_port.c    |   77 +++++++++-----
>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    1
>  4 files changed, 58 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> index 487a58f9c192896852fef271b6cce9bde132deb7..d9c9f86a30df953fa555934c5406057dcaf28960 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> @@ -367,6 +367,8 @@ static void mlx4_en_get_ethtool_stats(struct net_device *dev,
>
>         spin_lock_bh(&priv->stats_lock);
>
> +       mlx4_en_fold_software_stats(dev);
> +
>         for (i = 0; i < NUM_MAIN_STATS; i++, bitmap_iterator_inc(&it))
>                 if (bitmap_iterator_test(&it))
>                         data[index++] = ((unsigned long *)&dev->stats)[i];
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index 9018bb1b2e12142e048281a9d28ddf95e0023a61..d28d841db23ce885d2011877a156bacf23f65afe 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -1321,6 +1321,7 @@ mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
>         struct mlx4_en_priv *priv = netdev_priv(dev);
>
>         spin_lock_bh(&priv->stats_lock);
> +       mlx4_en_fold_software_stats(dev);
>         netdev_stats_to_stats64(stats, &dev->stats);
>         spin_unlock_bh(&priv->stats_lock);
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
> index 1eb4c1e10bad1dad26049876acf107a2073a6ab1..c6c4f1238923e09eced547454b86c68720292859 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
> @@ -147,6 +147,39 @@ static unsigned long en_stats_adder(__be64 *start, __be64 *next, int num)
>         return ret;
>  }
>
> +void mlx4_en_fold_software_stats(struct net_device *dev)
> +{
> +       struct mlx4_en_priv *priv = netdev_priv(dev);
> +       struct mlx4_en_dev *mdev = priv->mdev;
> +       unsigned long packets, bytes;
> +       int i;
> +
> +       if (mlx4_is_master(mdev->dev))
> +               return;

hmm, I think here you are just carrying over a questionable decision
made in the mlx4 driver, namely that the PF netdev stats (only in SRIOV
mode) should report the whole port stats from the
MLX4_CMD_DUMP_ETH_STATS FW command.

IMHO mlx4_en_get_stats64 should always report SW stats.
Regardless, this function "mlx4_en_fold_software_stats" should always
fold the SW stats unconditionally, and work around it somewhere else if
SW stats should be reported from FW; otherwise we will keep dragging
this confusion along.

> +
> +       packets = 0;
> +       bytes = 0;
> +       for (i = 0; i < priv->rx_ring_num; i++) {
> +               const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
> +
> +               packets += READ_ONCE(ring->packets);
> +               bytes   += READ_ONCE(ring->bytes);
> +       }
> +       dev->stats.rx_packets = packets;
> +       dev->stats.rx_bytes = bytes;
> +
> +       packets = 0;
> +       bytes = 0;
> +       for (i = 0; i < priv->tx_ring_num[TX]; i++) {
> +               const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i];
> +
> +               packets += READ_ONCE(ring->packets);
> +               bytes   += READ_ONCE(ring->bytes);
> +       }
> +       dev->stats.tx_packets = packets;
> +       dev->stats.tx_bytes = bytes;
> +}
> +
>  int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
>  {
>         struct mlx4_counter tmp_counter_stats;
> @@ -159,6 +192,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
>         u64 in_mod = reset << 8 | port;
>         int err;
>         int i, counter_index;
> +       unsigned long sw_tx_dropped = 0;
>         unsigned long sw_rx_dropped = 0;
>
>         mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
> @@ -174,8 +208,8 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
>
>         spin_lock_bh(&priv->stats_lock);
>
> -       stats->rx_packets = 0;
> -       stats->rx_bytes = 0;
> +       mlx4_en_fold_software_stats(dev);
> +
>         priv->port_stats.rx_chksum_good = 0;
>         priv->port_stats.rx_chksum_none = 0;
>         priv->port_stats.rx_chksum_complete = 0;
> @@ -183,19 +217,16 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
>         priv->xdp_stats.rx_xdp_tx      = 0;
>         priv->xdp_stats.rx_xdp_tx_full = 0;
>         for (i = 0; i < priv->rx_ring_num; i++) {
> -               stats->rx_packets += priv->rx_ring[i]->packets;
> -               stats->rx_bytes += priv->rx_ring[i]->bytes;
> -               sw_rx_dropped += priv->rx_ring[i]->dropped;
> -               priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok;
> -               priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none;
> -               priv->port_stats.rx_chksum_complete += priv->rx_ring[i]->csum_complete;
> -               priv->xdp_stats.rx_xdp_drop    += priv->rx_ring[i]->xdp_drop;
> -               priv->xdp_stats.rx_xdp_tx      += priv->rx_ring[i]->xdp_tx;
> -               priv->xdp_stats.rx_xdp_tx_full += priv->rx_ring[i]->xdp_tx_full;
> +               const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
> +
> +               sw_rx_dropped                   += READ_ONCE(ring->dropped);
> +               priv->port_stats.rx_chksum_good += READ_ONCE(ring->csum_ok);
> +               priv->port_stats.rx_chksum_none += READ_ONCE(ring->csum_none);
> +               priv->port_stats.rx_chksum_complete += READ_ONCE(ring->csum_complete);
> +               priv->xdp_stats.rx_xdp_drop     += READ_ONCE(ring->xdp_drop);
> +               priv->xdp_stats.rx_xdp_tx       += READ_ONCE(ring->xdp_tx);
> +               priv->xdp_stats.rx_xdp_tx_full  += READ_ONCE(ring->xdp_tx_full);
>         }
> -       stats->tx_packets = 0;
> -       stats->tx_bytes = 0;
> -       stats->tx_dropped = 0;
>         priv->port_stats.tx_chksum_offload = 0;
>         priv->port_stats.queue_stopped = 0;
>         priv->port_stats.wake_queue = 0;
> @@ -205,15 +236,14 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
>         for (i = 0; i < priv->tx_ring_num[TX]; i++) {
>                 const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i];
>
> -               stats->tx_packets += ring->packets;
> -               stats->tx_bytes += ring->bytes;
> -               stats->tx_dropped += ring->tx_dropped;
> -               priv->port_stats.tx_chksum_offload += ring->tx_csum;
> -               priv->port_stats.queue_stopped     += ring->queue_stopped;
> -               priv->port_stats.wake_queue        += ring->wake_queue;
> -               priv->port_stats.tso_packets       += ring->tso_packets;
> -               priv->port_stats.xmit_more         += ring->xmit_more;
> +               sw_tx_dropped                      += READ_ONCE(ring->tx_dropped);
> +               priv->port_stats.tx_chksum_offload += READ_ONCE(ring->tx_csum);
> +               priv->port_stats.queue_stopped     += READ_ONCE(ring->queue_stopped);
> +               priv->port_stats.wake_queue        += READ_ONCE(ring->wake_queue);
> +               priv->port_stats.tso_packets       += READ_ONCE(ring->tso_packets);
> +               priv->port_stats.xmit_more         += READ_ONCE(ring->xmit_more);
>         }
> +
>         if (mlx4_is_master(mdev->dev)) {
>                 stats->rx_packets = en_stats_adder(&mlx4_en_stats->RTOT_prio_0,
>                                                    &mlx4_en_stats->RTOT_prio_1,

As you can see here, in SRIOV mode the PF (and only the PF) reads the
sw stats from FW.
Tariq, I think we need to fix this.


> @@ -251,7 +281,8 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
>         stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
>         stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
>         stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
> -       stats->tx_dropped += be32_to_cpu(mlx4_en_stats->TDROP);
> +       stats->tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP) +
> +                           sw_tx_dropped;
>
>         /* RX stats */
>         priv->pkstats.rx_multicast_packets = stats->multicast;
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> index 574bcbb1b38fc4758511d8f7bd17a87b0a507a73..20a936428f4a44c8ca0a7161855da310f9166b50 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> @@ -755,6 +755,7 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq);
>  int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
>  int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
>
> +void mlx4_en_fold_software_stats(struct net_device *dev);
>  int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
>  int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
>
>
>


* Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-26 22:47 ` Saeed Mahameed
@ 2016-11-27  2:16   ` Eric Dumazet
  2016-11-28 20:40     ` David Miller
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-11-27  2:16 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David Miller, netdev, Tariq Toukan

On Sun, 2016-11-27 at 00:47 +0200, Saeed Mahameed wrote:
> On Fri, Nov 25, 2016 at 5:46 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:

> As you see here in SRIOV mode (PF only) reads   sw stats from FW.
> Tariq, I think we need to fix this.

Sure, my patch does not change this at all.

If mlx4_is_master() is false, then we aggregate the software stats and
only the software stats.

My patch makes this aggregation possible at the time ethtool or
ndo_get_stats64() is called, since it absolutely does not depend on the
250 ms timer fetching hardware stats.


* Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-27  2:16   ` Eric Dumazet
@ 2016-11-28 20:40     ` David Miller
  2016-11-28 21:55       ` Saeed Mahameed
  0 siblings, 1 reply; 33+ messages in thread
From: David Miller @ 2016-11-28 20:40 UTC (permalink / raw)
  To: eric.dumazet; +Cc: saeedm, netdev, tariqt

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 26 Nov 2016 18:16:00 -0800

> On Sun, 2016-11-27 at 00:47 +0200, Saeed Mahameed wrote:
>> On Fri, Nov 25, 2016 at 5:46 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
>> As you see here in SRIOV mode (PF only) reads   sw stats from FW.
>> Tariq, I think we need to fix this.
> 
> Sure, my patch does not change this at all.
> 
> If mlx4_is_master() is false, then we aggregate the software states and
> only the software stats.
> 
> My patch makes this aggregation possible at the time ethtool or
> ndo_get_stat64() are called, since this absolutely not depend on the 250
> ms timer fetching hardware stats.

Saeed please provide counter arguments or ACK this patch, thank you.


* Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-28 20:40     ` David Miller
@ 2016-11-28 21:55       ` Saeed Mahameed
  2016-11-28 22:57         ` Eric Dumazet
  0 siblings, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2016-11-28 21:55 UTC (permalink / raw)
  To: David Miller; +Cc: Eric Dumazet, Linux Netdev List, Tariq Toukan

On Mon, Nov 28, 2016 at 10:40 PM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Sat, 26 Nov 2016 18:16:00 -0800
>
>> On Sun, 2016-11-27 at 00:47 +0200, Saeed Mahameed wrote:
>>> On Fri, Nov 25, 2016 at 5:46 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>>> As you see here in SRIOV mode (PF only) reads   sw stats from FW.
>>> Tariq, I think we need to fix this.
>>
>> Sure, my patch does not change this at all.
>>
>> If mlx4_is_master() is false, then we aggregate the software states and
>> only the software stats.
>>
>> My patch makes this aggregation possible at the time ethtool or
>> ndo_get_stat64() are called, since this absolutely not depend on the 250
>> ms timer fetching hardware stats.
>
> Saeed please provide counter arguments or ACK this patch, thank you.

I have nothing against this patch, I just wanted to point out that
this patch is just fixing the symptom.
We keep ignoring the root cause that dev_get_stats is called under a
spin_lock, which really ties our hands as device driver developers
and pushes us towards fragile solutions like deferred work for
caching statistics.
I am not saying that Eric should fix this in his patch, I just wanted
to raise awareness of the root cause.

Regarding the SRIOV PF stats, I will take it up with Tariq internally.

Bottom line, I ACK this patch (I might even do the same myself for
mlx5 :)), but there are some follow-ups that need to be done.


* Re: Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 19:19       ` Eric Dumazet
@ 2016-11-28 22:02         ` Lino Sanfilippo
  2016-11-29  0:19           ` Eric Dumazet
  0 siblings, 1 reply; 33+ messages in thread
From: Lino Sanfilippo @ 2016-11-28 22:02 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Laight, David Miller, netdev, Tariq Toukan

Hi Eric,

On 25.11.2016 20:19, Eric Dumazet wrote:
> On Fri, 2016-11-25 at 17:30 +0100, Lino Sanfilippo wrote:
>> Hi,
>> 
>> 
>> > 
>> > The READ_ONCE() are documenting the fact that no lock is taken to fetch
>> > the stats, while another cpus might being changing them.
>> > 
>> > I had no answer yet from https://patchwork.ozlabs.org/patch/698449/
>> > 
>> > So I thought it was not needed to explain this in the changelog, given
>> > that it apparently is one of the few things that can block someone to
>> > understand one of my changes :/
>> > 
>> > Apparently nobody really understands READ_ONCE() purpose, it is really a
>> > pity we have to explain this over and over.
>> > 
>> 
>> Even at the risk of showing once more a lack of understanding for
>> READ_ONCE():
>> Does not a READ_ONCE() have to e paired with some kind of
>> WRITE_ONCE()? 
> 
> You are right.
> 
> Although in this case, the producers are using a lock, and do
> 
> ring->packets++;
> 
> We hopefully have compilers/cpus that do not put intermediate garbage in
> ring->packets while doing the increment.
> 
> One problem with :
> 
> WRITE_ONCE(ring->packets, ring->packets + 1);
> 
> is that gcc no longer uses an INC instruction.

I see. So we would have to do something like

tmp = ring->packets;
tmp++;
WRITE_ONCE(ring->packets, tmp);

to use WRITE_ONCE in this case? If so, could it be worth doing something like this to 
have a balanced READ_ONCE, WRITE_ONCE usage?

> 
> Maybe we need some ADD_ONCE(ptr, val) macro doing the proper thing.
> 
>> Furthermore: there a quite some network drivers that ensure visibility
>> of 
>> the descriptor queue indices between xmit and xmit completion function
>> by means of
>> smp barriers. Could all these drivers theoretically be adjusted to use
>> READ_ONCE(),
>> WRITE_ONCE() for the indices instead?
>> 
> 
> Well, for this precise case we do need appropriate smp barriers.
> 
> READ_ONCE() can be better than poor barrier(), look at 
> https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=b668534c1d9b80f4cda4d761eb11d3a6c9f4ced8
> 
> 

Regards,
Lino


* Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-28 21:55       ` Saeed Mahameed
@ 2016-11-28 22:57         ` Eric Dumazet
  0 siblings, 0 replies; 33+ messages in thread
From: Eric Dumazet @ 2016-11-28 22:57 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David Miller, Linux Netdev List, Tariq Toukan

On Mon, 2016-11-28 at 23:55 +0200, Saeed Mahameed wrote:

> I have nothing against this patch, I just wanted to point out that
> this patch is just fixing the symptom.


> We keep ignoring the root cause that dev_get_stats is called under a
> spin_lock which really ties our hands "us device drivers developers"
> and push us towards those fragile solutions like deferred work for
> caching statistics.

Well, we do not want to allow a driver to wait several ms to fetch
stats, or to allow the operation to fail if memory for the transaction
cannot be allocated.

Otherwise, some monitoring tools could really have serious problems
under stress.

We really don't care how often the 'not really hot' counters are
fetched from the NIC. I guess even once per second should be good
enough.

But these 4 counters (bytes/packets rx/tx), especially when they are
backed by counters managed in the driver, should reflect the most
current values.


* Re: Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-28 22:02         ` Lino Sanfilippo
@ 2016-11-29  0:19           ` Eric Dumazet
  2016-11-30 15:28             ` David Laight
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-11-29  0:19 UTC (permalink / raw)
  To: Lino Sanfilippo; +Cc: David Laight, David Miller, netdev, Tariq Toukan

On Mon, 2016-11-28 at 23:02 +0100, Lino Sanfilippo wrote:
> Hi Eric,
> 
> On 25.11.2016 20:19, Eric Dumazet wrote:
> > On Fri, 2016-11-25 at 17:30 +0100, Lino Sanfilippo wrote:
> >> Hi,
> >> 
> >> 
> >> > 
> >> > The READ_ONCE() are documenting the fact that no lock is taken to fetch
> >> > the stats, while another cpus might being changing them.
> >> > 
> >> > I had no answer yet from https://patchwork.ozlabs.org/patch/698449/
> >> > 
> >> > So I thought it was not needed to explain this in the changelog, given
> >> > that it apparently is one of the few things that can block someone to
> >> > understand one of my changes :/
> >> > 
> >> > Apparently nobody really understands READ_ONCE() purpose, it is really a
> >> > pity we have to explain this over and over.
> >> > 
> >> 
> >> Even at the risk of showing once more a lack of understanding for
> >> READ_ONCE():
> >> Does not a READ_ONCE() have to e paired with some kind of
> >> WRITE_ONCE()? 
> > 
> > You are right.
> > 
> > Although in this case, the producers are using a lock, and do
> > 
> > ring->packets++;
> > 
> > We hopefully have compilers/cpus that do not put intermediate garbage in
> > ring->packets while doing the increment.
> > 
> > One problem with :
> > 
> > WRITE_ONCE(ring->packets, ring->packets + 1);
> > 
> > is that gcc no longer uses an INC instruction.
> 
> I see. So we would have to do something like
> 
> tmp = ring->packets;
> tmp++;
> WRITE_ONCE(ring->packets, tmp);


Well, gcc will generate code with more instructions than a mere

"inc  offset(%register)"

which is kind of unfortunate, given it is the fast path.

Better to add a comment, like:

/* We should use WRITE_ONCE() to pair with the READ_ONCE() found in xxxx(),
 * but gcc would generate non-optimal code.
 */


* Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 15:46 [PATCH] mlx4: give precise rx/tx bytes/packets counters Eric Dumazet
  2016-11-25 16:03 ` David Laight
  2016-11-26 22:47 ` Saeed Mahameed
@ 2016-11-29 18:37 ` David Miller
  2016-11-30 14:08 ` Regression: " Jesper Dangaard Brouer
  3 siblings, 0 replies; 33+ messages in thread
From: David Miller @ 2016-11-29 18:37 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, tariqt

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 25 Nov 2016 07:46:20 -0800

> From: Eric Dumazet <edumazet@google.com>
> 
> mlx4 stats are chaotic because a deferred work queue is responsible
> to update them every 250 ms.
> 
> Even sampling stats every one second with "sar -n DEV 1" gives
> variations like the following :
 ...
> This patch allows rx/tx bytes/packets counters being folded at the
> time we need stats.
> 
> We now can fetch stats every 1 ms if we want to check NIC behavior
> on a small time window. It is also easier to detect anomalies.
 ...
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Tariq Toukan <tariqt@mellanox.com>

Applied to net-next, thanks Eric.


* Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-25 15:46 [PATCH] mlx4: give precise rx/tx bytes/packets counters Eric Dumazet
                   ` (2 preceding siblings ...)
  2016-11-29 18:37 ` David Miller
@ 2016-11-30 14:08 ` Jesper Dangaard Brouer
  2016-11-30 15:58   ` Eric Dumazet
  3 siblings, 1 reply; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2016-11-30 14:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: brouer, David Miller, netdev, Tariq Toukan


On Fri, 25 Nov 2016 07:46:20 -0800 Eric Dumazet <eric.dumazet@gmail.com> wrote:

> From: Eric Dumazet <edumazet@google.com>

Ended up-in net-next as:

 commit 40931b85113dad7881d49e8759e5ad41d30a5e6c
 Author: Eric Dumazet <edumazet@google.com>
 Date:   Fri Nov 25 07:46:20 2016 -0800

    mlx4: give precise rx/tx bytes/packets counters
    
    mlx4 stats are chaotic because a deferred work queue is responsible
    to update them every 250 ms.

Most likely due to this patch, I get the crash below when rebooting my machine.
It looks like a device removal ordering thing.
Tested with net-next at commit 93ba22225504.

[...]
[ 1967.248453] mlx5_core 0000:02:00.1: Shutdown was called
[ 1967.854556] mlx5_core 0000:02:00.0: Shutdown was called
[ 1968.443015] e1000e: EEE TX LPI TIMER: 00000011
[ 1968.484676] sd 3:0:0:0: [sda] Synchronizing SCSI cache
[ 1968.528354] mlx4_core 0000:01:00.0: mlx4_shutdown was called
[ 1968.534054] mlx4_en: mlx4p1: Close port called
[ 1968.571156] mlx4_en 0000:01:00.0: removed PHC
[ 1968.575677] mlx4_en: mlx4p2: Close port called
[ 1969.506602] BUG: unable to handle kernel NULL pointer dereference at 0000000000000d08
[ 1969.514530] IP: [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
[ 1969.522963] PGD 0 [ 1969.524803] 
[ 1969.526332] Oops: 0000 [#1] PREEMPT SMP
[ 1969.530201] Modules linked in: coretemp kvm_intel kvm irqbypass intel_cstate mxm_wmi i2c_i801 intel_rapl_perf i2c_smbus sg pcspkr i2c_core shpchp nfsd wmi video acpi_pad auth_rpcgss oid_registry nfs_acl lockd grace sunrpc ip_tables x_tables mlx4_en e1000e mlx5_core ptp serio_raw sd_mod mlx4_core pps_core devlink hid_generic
[ 1969.559616] CPU: 3 PID: 3104 Comm: kworker/3:1 Not tainted 4.9.0-rc6-net-next3-01390-g93ba22225504 #12
[ 1969.568984] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z97 Extreme4, BIOS P2.10 05/12/2015
[ 1969.578877] Workqueue: events linkwatch_event
[ 1969.583285] task: ffff8803f42a0000 task.stack: ffff88040b2d0000
[ 1969.589238] RIP: 0010:[<ffffffffa0127ca4>]  [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
[ 1969.600102] RSP: 0018:ffff88040b2d3bd8  EFLAGS: 00010282
[ 1969.605442] RAX: ffff8803f432efc8 RBX: ffff8803f4320000 RCX: 0000000000000000
[ 1969.612604] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8803f4320000
[ 1969.619772] RBP: ffff88040b2d3bd8 R08: 000000000000000c R09: ffff8803f432f000
[ 1969.626938] R10: 0000000000000000 R11: ffff88040d64ac00 R12: ffff8803e5aff8dc
[ 1969.634104] R13: ffff8803f4320a28 R14: ffff8803e5aff800 R15: 0000000000000000
[ 1969.641273] FS:  0000000000000000(0000) GS:ffff88041fac0000(0000) knlGS:0000000000000000
[ 1969.649422] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1969.655197] CR2: 0000000000000d08 CR3: 0000000001c07000 CR4: 00000000001406e0
[ 1969.662366] Stack:
[ 1969.664412]  ffff88040b2d3be8 ffffffffa0127f8e ffff88040b2d3c10 ffffffffa012a23b
[ 1969.671948]  ffff8803e5aff8dc ffff8803f4320000 ffff8803f4320000 ffff88040b2d3c30
[ 1969.679478]  ffffffff8160ae29 ffff8803e5aff8d8 ffff8804088ff300 ffff88040b2d3c58
[ 1969.687001] Call Trace:
[ 1969.689484]  [<ffffffffa0127f8e>] mlx4_en_fold_software_stats+0x1e/0x20 [mlx4_en]
[ 1969.697026]  [<ffffffffa012a23b>] mlx4_en_get_stats64+0x2b/0x50 [mlx4_en]
[ 1969.703844]  [<ffffffff8160ae29>] dev_get_stats+0x39/0xa0
[ 1969.709274]  [<ffffffff81622470>] rtnl_fill_stats+0x40/0x130
[ 1969.714968]  [<ffffffff8162631b>] rtnl_fill_ifinfo+0x55b/0x1010
[ 1969.720921]  [<ffffffff816285d3>] rtmsg_ifinfo_build_skb+0x73/0xd0
[ 1969.727136]  [<ffffffff81628646>] rtmsg_ifinfo.part.25+0x16/0x50
[ 1969.733176]  [<ffffffff81628698>] rtmsg_ifinfo+0x18/0x20
[ 1969.738522]  [<ffffffff8160e947>] netdev_state_change+0x47/0x50
[ 1969.744478]  [<ffffffff81629018>] linkwatch_do_dev+0x38/0x50
[ 1969.750170]  [<ffffffff81629257>] __linkwatch_run_queue+0xe7/0x160
[ 1969.756385]  [<ffffffff816292f5>] linkwatch_event+0x25/0x30
[ 1969.761991]  [<ffffffff8107b3cb>] process_one_work+0x15b/0x460
[ 1969.767857]  [<ffffffff8107b71e>] worker_thread+0x4e/0x480
[ 1969.773378]  [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
[ 1969.779420]  [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
[ 1969.785460]  [<ffffffff810811ea>] kthread+0xca/0xe0
[ 1969.790372]  [<ffffffff81081120>] ? kthread_worker_fn+0x120/0x120
[ 1969.796495]  [<ffffffff817302d2>] ret_from_fork+0x22/0x30
[ 1969.801924] Code: 00 00 55 48 89 e5 85 d2 0f 84 90 00 00 00 83 ea 01 31 c9 31 f6 48 8d 87 c0 ef 00 00 4c 8d 8c d7 c8 ef 00 00 48 8b 10 48 83 c0 08 <4c> 8b 82 08 0d 00 00 48 8b 92 00 0d 00 00 4c 01 c6 48 01 d1 4c 
[ 1969.821969] RIP  [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
[ 1969.830486]  RSP <ffff88040b2d3bd8>
[ 1969.834002] CR2: 0000000000000d08
[ 1969.837440] ---[ end trace 80b9fbc1e7baed9b ]---
[ 1969.842102] Kernel panic - not syncing: Fatal exception in interrupt
[ 1969.848520] Kernel Offset: disabled
[ 1969.852050] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
(END)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


* RE: Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-29  0:19           ` Eric Dumazet
@ 2016-11-30 15:28             ` David Laight
  2016-11-30 15:47               ` Eric Dumazet
  0 siblings, 1 reply; 33+ messages in thread
From: David Laight @ 2016-11-30 15:28 UTC (permalink / raw)
  To: 'Eric Dumazet', Lino Sanfilippo
  Cc: David Miller, netdev, Tariq Toukan

From: Eric Dumazet
> Sent: 29 November 2016 00:19
> On Mon, 2016-11-28 at 23:02 +0100, Lino Sanfilippo wrote:
> > Hi Eric,
> >
> > On 25.11.2016 20:19, Eric Dumazet wrote:
> > > On Fri, 2016-11-25 at 17:30 +0100, Lino Sanfilippo wrote:
> > >> Hi,
> > >>
> > >>
> > >> >
> > >> > The READ_ONCE() are documenting the fact that no lock is taken to fetch
> > >> > the stats, while another cpus might being changing them.
> > >> >
> > >> > I had no answer yet from https://patchwork.ozlabs.org/patch/698449/
> > >> >
> > >> > So I thought it was not needed to explain this in the changelog, given
> > >> > that it apparently is one of the few things that can block someone to
> > >> > understand one of my changes :/
> > >> >
> > >> > Apparently nobody really understands READ_ONCE() purpose, it is really a
> > >> > pity we have to explain this over and over.
> > >> >
> > >>
> > >> Even at the risk of showing once more a lack of understanding for
> > >> READ_ONCE():
> > >> Does not a READ_ONCE() have to e paired with some kind of
> > >> WRITE_ONCE()?
> > >
> > > You are right.
> > >
> > > Although in this case, the producers are using a lock, and do
> > >
> > > ring->packets++;
> > >
> > > We hopefully have compilers/cpus that do not put intermediate garbage in
> > > ring->packets while doing the increment.
> > >
> > > One problem with :
> > >
> > > WRITE_ONCE(ring->packets, ring->packets + 1);
> > >
> > > is that gcc no longer uses an INC instruction.
> >
> > I see. So we would have to do something like
> >
> > tmp = ring->packets;
> > tmp++;
> > WRITE_ONCE(ring->packets, tmp);
> 
> 
> Well, gcc will generate a code with more instructions than a mere
> 
> "inc  offset(%register)"

Are you sure??
Last I looked gcc seemed to convert 'foo++' to 'foo = foo + 1' before
generating any code.
It might then optimise that back to a memory increment, but that would
also happen if you'd coded the latter form.

> Which is kind of unfortunate, given it is the fast path.
> 
> Better add a comment, like :
> 
> /* We should use WRITE_ONCE() to pair with the READ_ONCE() found in xxxx()
>  * But gcc would generate non optimal code.
>  */

Actually, READ_ONCE() is generally useful - it gets a snapshot of a changing value.

WRITE_ONCE(), on the other hand, isn't a pairing - the compiler is highly
unlikely to write a location twice.
You might want an annotation to ensure it doesn't assume it can read the
value back (a write through a volatile pointer). But that has nothing to do
with how readers behave.

	David



* Re: Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 15:28             ` David Laight
@ 2016-11-30 15:47               ` Eric Dumazet
  0 siblings, 0 replies; 33+ messages in thread
From: Eric Dumazet @ 2016-11-30 15:47 UTC (permalink / raw)
  To: David Laight; +Cc: Lino Sanfilippo, David Miller, netdev, Tariq Toukan

On Wed, 2016-11-30 at 15:28 +0000, David Laight wrote:

> Are you sure??

Yes I am

> Last I looked gcc seemed to convert 'foo++' to 'foo = foo + 1' before
> generating any code.

Your gcc might need a refresh then.

> It might then optimise that back to a memory increment, but that would
> also happen if you'd coded the latter form.
> 
> > Which is kind of unfortunate, given it is the fast path.
> > 
> > Better add a comment, like :
> > 
> > /* We should use WRITE_ONCE() to pair with the READ_ONCE() found in xxxx()
> >  * But gcc would generate non optimal code.
> >  */
> 
> Actually while READ_ONCE() is generally useful - to get a snapshot of a changing value.
> 
> WRITE_ONCE() isn't a pairing - the compiler is highly unlikely to write a
> location twice.

WOW. I can not believe what you just said.

We had numerous bugs because the compiler was writing temporary
computations to the location. Just take a look at the git history to
find some gems.
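
The classic shape of those bugs, just as an illustration (this is a
made-up example, not one of the actual gems):

	/* Without WRITE_ONCE(), the compiler may use the shared location
	 * itself as scratch space for "shared = a + b + c;" and emit:
	 *
	 *	shared = a;	<- a concurrent reader can observe this
	 *	shared += b;
	 *	shared += c;
	 *
	 * WRITE_ONCE(shared, a + b + c) forbids such intermediate stores.
	 */
	WRITE_ONCE(shared, a + b + c);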

> You might want an annotation to ensure is doesn't assume it can read the value
> back (write through volatile pointer). But that has nothing to do with how readers behave.

Completely wrong.


* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 14:08 ` Regression: " Jesper Dangaard Brouer
@ 2016-11-30 15:58   ` Eric Dumazet
  2016-11-30 16:46     ` Saeed Mahameed
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-11-30 15:58 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: David Miller, netdev, Tariq Toukan

On Wed, 2016-11-30 at 15:08 +0100, Jesper Dangaard Brouer wrote:
> On Fri, 25 Nov 2016 07:46:20 -0800 Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > From: Eric Dumazet <edumazet@google.com>
> 
> Ended up-in net-next as:
> 
>  commit 40931b85113dad7881d49e8759e5ad41d30a5e6c
>  Author: Eric Dumazet <edumazet@google.com>
>  Date:   Fri Nov 25 07:46:20 2016 -0800
> 
>     mlx4: give precise rx/tx bytes/packets counters
>     
>     mlx4 stats are chaotic because a deferred work queue is responsible
>     to update them every 250 ms.
> 
> Likely after this patch I get this crash (below), when rebooting my machine.
> Looks like a device removal order thing.
> Tested with net-next at commit 93ba22225504.   
> 
> [...]
> [ 1967.248453] mlx5_core 0000:02:00.1: Shutdown was called
> [ 1967.854556] mlx5_core 0000:02:00.0: Shutdown was called
> [ 1968.443015] e1000e: EEE TX LPI TIMER: 00000011
> [ 1968.484676] sd 3:0:0:0: [sda] Synchronizing SCSI cache
> [ 1968.528354] mlx4_core 0000:01:00.0: mlx4_shutdown was called
> [ 1968.534054] mlx4_en: mlx4p1: Close port called
> [ 1968.571156] mlx4_en 0000:01:00.0: removed PHC
> [ 1968.575677] mlx4_en: mlx4p2: Close port called
> [ 1969.506602] BUG: unable to handle kernel NULL pointer dereference at 0000000000000d08
> [ 1969.514530] IP: [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
> [ 1969.522963] PGD 0 [ 1969.524803] 
> [ 1969.526332] Oops: 0000 [#1] PREEMPT SMP
> [ 1969.530201] Modules linked in: coretemp kvm_intel kvm irqbypass intel_cstate mxm_wmi i2c_i801 intel_rapl_perf i2c_smbus sg pcspkr i2c_core shpchp nfsd wmi video acpi_pad auth_rpcgss oid_registry nfs_acl lockd grace sunrpc ip_tables x_tables mlx4_en e1000e mlx5_core ptp serio_raw sd_mod mlx4_core pps_core devlink hid_generic
> [ 1969.559616] CPU: 3 PID: 3104 Comm: kworker/3:1 Not tainted 4.9.0-rc6-net-next3-01390-g93ba22225504 #12
> [ 1969.568984] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z97 Extreme4, BIOS P2.10 05/12/2015
> [ 1969.578877] Workqueue: events linkwatch_event
> [ 1969.583285] task: ffff8803f42a0000 task.stack: ffff88040b2d0000
> [ 1969.589238] RIP: 0010:[<ffffffffa0127ca4>]  [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
> [ 1969.600102] RSP: 0018:ffff88040b2d3bd8  EFLAGS: 00010282
> [ 1969.605442] RAX: ffff8803f432efc8 RBX: ffff8803f4320000 RCX: 0000000000000000
> [ 1969.612604] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8803f4320000
> [ 1969.619772] RBP: ffff88040b2d3bd8 R08: 000000000000000c R09: ffff8803f432f000
> [ 1969.626938] R10: 0000000000000000 R11: ffff88040d64ac00 R12: ffff8803e5aff8dc
> [ 1969.634104] R13: ffff8803f4320a28 R14: ffff8803e5aff800 R15: 0000000000000000
> [ 1969.641273] FS:  0000000000000000(0000) GS:ffff88041fac0000(0000) knlGS:0000000000000000
> [ 1969.649422] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1969.655197] CR2: 0000000000000d08 CR3: 0000000001c07000 CR4: 00000000001406e0
> [ 1969.662366] Stack:
> [ 1969.664412]  ffff88040b2d3be8 ffffffffa0127f8e ffff88040b2d3c10 ffffffffa012a23b
> [ 1969.671948]  ffff8803e5aff8dc ffff8803f4320000 ffff8803f4320000 ffff88040b2d3c30
> [ 1969.679478]  ffffffff8160ae29 ffff8803e5aff8d8 ffff8804088ff300 ffff88040b2d3c58
> [ 1969.687001] Call Trace:
> [ 1969.689484]  [<ffffffffa0127f8e>] mlx4_en_fold_software_stats+0x1e/0x20 [mlx4_en]
> [ 1969.697026]  [<ffffffffa012a23b>] mlx4_en_get_stats64+0x2b/0x50 [mlx4_en]
> [ 1969.703844]  [<ffffffff8160ae29>] dev_get_stats+0x39/0xa0
> [ 1969.709274]  [<ffffffff81622470>] rtnl_fill_stats+0x40/0x130
> [ 1969.714968]  [<ffffffff8162631b>] rtnl_fill_ifinfo+0x55b/0x1010
> [ 1969.720921]  [<ffffffff816285d3>] rtmsg_ifinfo_build_skb+0x73/0xd0
> [ 1969.727136]  [<ffffffff81628646>] rtmsg_ifinfo.part.25+0x16/0x50
> [ 1969.733176]  [<ffffffff81628698>] rtmsg_ifinfo+0x18/0x20
> [ 1969.738522]  [<ffffffff8160e947>] netdev_state_change+0x47/0x50
> [ 1969.744478]  [<ffffffff81629018>] linkwatch_do_dev+0x38/0x50
> [ 1969.750170]  [<ffffffff81629257>] __linkwatch_run_queue+0xe7/0x160
> [ 1969.756385]  [<ffffffff816292f5>] linkwatch_event+0x25/0x30
> [ 1969.761991]  [<ffffffff8107b3cb>] process_one_work+0x15b/0x460
> [ 1969.767857]  [<ffffffff8107b71e>] worker_thread+0x4e/0x480
> [ 1969.773378]  [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
> [ 1969.779420]  [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
> [ 1969.785460]  [<ffffffff810811ea>] kthread+0xca/0xe0
> [ 1969.790372]  [<ffffffff81081120>] ? kthread_worker_fn+0x120/0x120
> [ 1969.796495]  [<ffffffff817302d2>] ret_from_fork+0x22/0x30
> [ 1969.801924] Code: 00 00 55 48 89 e5 85 d2 0f 84 90 00 00 00 83 ea 01 31 c9 31 f6 48 8d 87 c0 ef 00 00 4c 8d 8c d7 c8 ef 00 00 48 8b 10 48 83 c0 08 <4c> 8b 82 08 0d 00 00 48 8b 92 00 0d 00 00 4c 01 c6 48 01 d1 4c 
> [ 1969.821969] RIP  [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
> [ 1969.830486]  RSP <ffff88040b2d3bd8>
> [ 1969.834002] CR2: 0000000000000d08
> [ 1969.837440] ---[ end trace 80b9fbc1e7baed9b ]---
> [ 1969.842102] Kernel panic - not syncing: Fatal exception in interrupt
> [ 1969.848520] Kernel Offset: disabled
> [ 1969.852050] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
> (END)

Hi Jesper.

Thanks for the report.

Then we have a bug in the driver, deleting some memory too soon.

If we depend on having proper stats at device dismantle, we need to keep
the TX/RX rings where the counters live around until these stats are no
longer required.
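
Another direction would be to fold the ring counters one last time into
the persistent dev->stats before the rings are torn down, so that later
readers never have to touch them. Roughly (untested sketch, the helper
name is made up) :

/* Untested sketch (hypothetical helper): snapshot the software stats
 * while the rings still exist, then mark the port down. This assumes
 * mlx4_en_fold_software_stats() bails out when !priv->port_up, so any
 * later reader just gets the values already folded into dev->stats.
 */
static void mlx4_en_snapshot_sw_stats(struct net_device *dev)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);

	spin_lock_bh(&priv->stats_lock);
	mlx4_en_fold_software_stats(dev);
	priv->port_up = false;
	spin_unlock_bh(&priv->stats_lock);
}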

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 15:58   ` Eric Dumazet
@ 2016-11-30 16:46     ` Saeed Mahameed
  2016-11-30 17:35       ` Eric Dumazet
  0 siblings, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2016-11-30 16:46 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Wed, Nov 30, 2016 at 5:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-11-30 at 15:08 +0100, Jesper Dangaard Brouer wrote:
>> On Fri, 25 Nov 2016 07:46:20 -0800 Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>> > From: Eric Dumazet <edumazet@google.com>
>>
>> Ended up-in net-next as:
>>
>>  commit 40931b85113dad7881d49e8759e5ad41d30a5e6c
>>  Author: Eric Dumazet <edumazet@google.com>
>>  Date:   Fri Nov 25 07:46:20 2016 -0800
>>
>>     mlx4: give precise rx/tx bytes/packets counters
>>
>>     mlx4 stats are chaotic because a deferred work queue is responsible
>>     to update them every 250 ms.
>>
>> Likely after this patch I get this crash (below), when rebooting my machine.
>> Looks like a device removal order thing.
>> Tested with net-next at commit 93ba22225504.
>>
>> [...]
>> [ 1967.248453] mlx5_core 0000:02:00.1: Shutdown was called
>> [ 1967.854556] mlx5_core 0000:02:00.0: Shutdown was called
>> [ 1968.443015] e1000e: EEE TX LPI TIMER: 00000011
>> [ 1968.484676] sd 3:0:0:0: [sda] Synchronizing SCSI cache
>> [ 1968.528354] mlx4_core 0000:01:00.0: mlx4_shutdown was called
>> [ 1968.534054] mlx4_en: mlx4p1: Close port called
>> [ 1968.571156] mlx4_en 0000:01:00.0: removed PHC
>> [ 1968.575677] mlx4_en: mlx4p2: Close port called
>> [ 1969.506602] BUG: unable to handle kernel NULL pointer dereference at 0000000000000d08
>> [ 1969.514530] IP: [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
>> [ 1969.522963] PGD 0 [ 1969.524803]
>> [ 1969.526332] Oops: 0000 [#1] PREEMPT SMP
>> [ 1969.530201] Modules linked in: coretemp kvm_intel kvm irqbypass intel_cstate mxm_wmi i2c_i801 intel_rapl_perf i2c_smbus sg pcspkr i2c_core shpchp nfsd wmi video acpi_pad auth_rpcgss oid_registry nfs_acl lockd grace sunrpc ip_tables x_tables mlx4_en e1000e mlx5_core ptp serio_raw sd_mod mlx4_core pps_core devlink hid_generic
>> [ 1969.559616] CPU: 3 PID: 3104 Comm: kworker/3:1 Not tainted 4.9.0-rc6-net-next3-01390-g93ba22225504 #12
>> [ 1969.568984] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z97 Extreme4, BIOS P2.10 05/12/2015
>> [ 1969.578877] Workqueue: events linkwatch_event
>> [ 1969.583285] task: ffff8803f42a0000 task.stack: ffff88040b2d0000
>> [ 1969.589238] RIP: 0010:[<ffffffffa0127ca4>]  [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
>> [ 1969.600102] RSP: 0018:ffff88040b2d3bd8  EFLAGS: 00010282
>> [ 1969.605442] RAX: ffff8803f432efc8 RBX: ffff8803f4320000 RCX: 0000000000000000
>> [ 1969.612604] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8803f4320000
>> [ 1969.619772] RBP: ffff88040b2d3bd8 R08: 000000000000000c R09: ffff8803f432f000
>> [ 1969.626938] R10: 0000000000000000 R11: ffff88040d64ac00 R12: ffff8803e5aff8dc
>> [ 1969.634104] R13: ffff8803f4320a28 R14: ffff8803e5aff800 R15: 0000000000000000
>> [ 1969.641273] FS:  0000000000000000(0000) GS:ffff88041fac0000(0000) knlGS:0000000000000000
>> [ 1969.649422] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [ 1969.655197] CR2: 0000000000000d08 CR3: 0000000001c07000 CR4: 00000000001406e0
>> [ 1969.662366] Stack:
>> [ 1969.664412]  ffff88040b2d3be8 ffffffffa0127f8e ffff88040b2d3c10 ffffffffa012a23b
>> [ 1969.671948]  ffff8803e5aff8dc ffff8803f4320000 ffff8803f4320000 ffff88040b2d3c30
>> [ 1969.679478]  ffffffff8160ae29 ffff8803e5aff8d8 ffff8804088ff300 ffff88040b2d3c58
>> [ 1969.687001] Call Trace:
>> [ 1969.689484]  [<ffffffffa0127f8e>] mlx4_en_fold_software_stats+0x1e/0x20 [mlx4_en]
>> [ 1969.697026]  [<ffffffffa012a23b>] mlx4_en_get_stats64+0x2b/0x50 [mlx4_en]
>> [ 1969.703844]  [<ffffffff8160ae29>] dev_get_stats+0x39/0xa0
>> [ 1969.709274]  [<ffffffff81622470>] rtnl_fill_stats+0x40/0x130
>> [ 1969.714968]  [<ffffffff8162631b>] rtnl_fill_ifinfo+0x55b/0x1010
>> [ 1969.720921]  [<ffffffff816285d3>] rtmsg_ifinfo_build_skb+0x73/0xd0
>> [ 1969.727136]  [<ffffffff81628646>] rtmsg_ifinfo.part.25+0x16/0x50
>> [ 1969.733176]  [<ffffffff81628698>] rtmsg_ifinfo+0x18/0x20
>> [ 1969.738522]  [<ffffffff8160e947>] netdev_state_change+0x47/0x50
>> [ 1969.744478]  [<ffffffff81629018>] linkwatch_do_dev+0x38/0x50
>> [ 1969.750170]  [<ffffffff81629257>] __linkwatch_run_queue+0xe7/0x160
>> [ 1969.756385]  [<ffffffff816292f5>] linkwatch_event+0x25/0x30
>> [ 1969.761991]  [<ffffffff8107b3cb>] process_one_work+0x15b/0x460
>> [ 1969.767857]  [<ffffffff8107b71e>] worker_thread+0x4e/0x480
>> [ 1969.773378]  [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
>> [ 1969.779420]  [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
>> [ 1969.785460]  [<ffffffff810811ea>] kthread+0xca/0xe0
>> [ 1969.790372]  [<ffffffff81081120>] ? kthread_worker_fn+0x120/0x120
>> [ 1969.796495]  [<ffffffff817302d2>] ret_from_fork+0x22/0x30
>> [ 1969.801924] Code: 00 00 55 48 89 e5 85 d2 0f 84 90 00 00 00 83 ea 01 31 c9 31 f6 48 8d 87 c0 ef 00 00 4c 8d 8c d7 c8 ef 00 00 48 8b 10 48 83 c0 08 <4c> 8b 82 08 0d 00 00 48 8b 92 00 0d 00 00 4c 01 c6 48 01 d1 4c
>> [ 1969.821969] RIP  [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
>> [ 1969.830486]  RSP <ffff88040b2d3bd8>
>> [ 1969.834002] CR2: 0000000000000d08
>> [ 1969.837440] ---[ end trace 80b9fbc1e7baed9b ]---
>> [ 1969.842102] Kernel panic - not syncing: Fatal exception in interrupt
>> [ 1969.848520] Kernel Offset: disabled
>> [ 1969.852050] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
>> (END)
>
> Hi Jesper.
>
> Thanks for the report.
>
> Then we have a bug in the driver, deleting some memory too soon.

No! It has always been this way: the cached stats are always there (never deleted).
We just stop caching once the device is down; nothing is deleted too soon.

>
> If we depend on having proper stats at device dismantle, we need to keep

We had/still have the proper stats: they are the ones that
mlx4_en_fold_software_stats() is trying to cache into (they always
exist), but the ones that you are trying to read from (the mlx4 rings)
are gone!

This bug is totally new and, as I warned, it is another symptom of
the real root cause (we can't sleep while reading stats).

Eric, what do you suggest? Keep pre-allocated MAX_RINGS stats and
always iterate over all of them to query stats?
What if you have one ring/none/1K? How would you know how many to query?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 16:46     ` Saeed Mahameed
@ 2016-11-30 17:35       ` Eric Dumazet
  2016-11-30 20:42         ` Saeed Mahameed
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-11-30 17:35 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Wed, 2016-11-30 at 18:46 +0200, Saeed Mahameed wrote:

> we had/still have the proper stats they are the ones that
> mlx4_en_fold_software_stats is trying to cache into  (they always
> exist),
> but the ones that you are trying to read from (the mlx4 rings) are gone !
> 
> This bug is totally new and as i warned, this is another symptom of
> the real root cause (can't sleep while reading stats).
> 
> Eric what do you suggest ? Keep pre-allocated MAX_RINGS stats  and
> always iterate over all of them to query stats ?
> what if you have one ring/none/1K ? how would you know how many to query ?

I am suggesting I will fix the bug I introduced.

Do not panic.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 17:35       ` Eric Dumazet
@ 2016-11-30 20:42         ` Saeed Mahameed
  2016-11-30 21:00           ` Eric Dumazet
  0 siblings, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2016-11-30 20:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Wed, Nov 30, 2016 at 7:35 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-11-30 at 18:46 +0200, Saeed Mahameed wrote:
>
>> we had/still have the proper stats they are the ones that
>> mlx4_en_fold_software_stats is trying to cache into  (they always
>> exist),
>> but the ones that you are trying to read from (the mlx4 rings) are gone !
>>
>> This bug is totally new and as i warned, this is another symptom of
>> the real root cause (can't sleep while reading stats).
>>
>> Eric what do you suggest ? Keep pre-allocated MAX_RINGS stats  and
>> always iterate over all of them to query stats ?
>> what if you have one ring/none/1K ? how would you know how many to query ?
>
> I am suggesting I will fix the bug I introduced.
>
> Do not panic.
>
>

Not at all, I trust you are the only one who is capable of providing
the best solution.
I am just trying to read your mind :-).

As I said, I like the solution and I want to adapt it to mlx5, so I am
a little bit enthusiastic :)

Thanks.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 20:42         ` Saeed Mahameed
@ 2016-11-30 21:00           ` Eric Dumazet
  2016-12-01 12:37             ` Jesper Dangaard Brouer
  2016-12-01 15:38             ` Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters Saeed Mahameed
  0 siblings, 2 replies; 33+ messages in thread
From: Eric Dumazet @ 2016-11-30 21:00 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Wed, 2016-11-30 at 22:42 +0200, Saeed Mahameed wrote:
> On Wed, Nov 30, 2016 at 7:35 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Wed, 2016-11-30 at 18:46 +0200, Saeed Mahameed wrote:
> >
> >> we had/still have the proper stats they are the ones that
> >> mlx4_en_fold_software_stats is trying to cache into  (they always
> >> exist),
> >> but the ones that you are trying to read from (the mlx4 rings) are gone !
> >>
> >> This bug is totally new and as i warned, this is another symptom of
> >> the real root cause (can't sleep while reading stats).
> >>
> >> Eric what do you suggest ? Keep pre-allocated MAX_RINGS stats  and
> >> always iterate over all of them to query stats ?
> >> what if you have one ring/none/1K ? how would you know how many to query ?
> >
> > I am suggesting I will fix the bug I introduced.
> >
> > Do not panic.
> >
> >
> 
> Not at all, I trust you are the only one who is capable of providing
> the best solution.
> I am just trying to read your mind :-).
> 
> As i said i like the solution and i want to adapt it to mlx5, so I am
> a little bit enthusiastic :)

What about the following fix guys ?

As a bonus we update the stats right before they are sent to monitors
via rtnetlink ;)


diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 12ea3405f442717478bf0e8882edaf0de77986cb..091b904262bc7932d3edf99cf850affb23b9ce6e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1809,8 +1809,12 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
 
 	netif_tx_disable(dev);
 
+	spin_lock_bh(&priv->stats_lock);
+	mlx4_en_fold_software_stats(dev);
 	/* Set port as not active */
 	priv->port_up = false;
+	spin_unlock_bh(&priv->stats_lock);
+
 	priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
 
 	/* Promsicuous mode */
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
index c6c4f1238923e09eced547454b86c68720292859..9166d90e732858610b1407fe85cbf6cbe27f5e0b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -154,7 +154,7 @@ void mlx4_en_fold_software_stats(struct net_device *dev)
 	unsigned long packets, bytes;
 	int i;
 
-	if (mlx4_is_master(mdev->dev))
+	if (!priv->port_up || mlx4_is_master(mdev->dev))
 		return;
 
 	packets = 0;

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 21:00           ` Eric Dumazet
@ 2016-12-01 12:37             ` Jesper Dangaard Brouer
  2016-12-01 13:02               ` [PATCH net-next] mlx4: fix use-after-free in mlx4_en_fold_software_stats() Eric Dumazet
  2016-12-01 15:38             ` Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters Saeed Mahameed
  1 sibling, 1 reply; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2016-12-01 12:37 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Saeed Mahameed, David Miller, netdev, Tariq Toukan, brouer


On Wed, 30 Nov 2016 13:00:52 -0800 Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Wed, 2016-11-30 at 22:42 +0200, Saeed Mahameed wrote:
> > On Wed, Nov 30, 2016 at 7:35 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:  
> > > On Wed, 2016-11-30 at 18:46 +0200, Saeed Mahameed wrote:
[...]
> > >
> > > I am suggesting I will fix the bug I introduced.
> > >
> > > Do not panic.
> > >
> > >  
> > 
> > Not at all, I trust you are the only one who is capable of providing
> > the best solution.
> > I am just trying to read your mind :-).
> > 
> > As i said i like the solution and i want to adapt it to mlx5, so I am
> > a little bit enthusiastic :)  
> 
> What about the following fix guys ?

Confirming this fixed the crash on shutdown for me, thanks!

 
> As a bonus we update the stats right before they are sent to monitors
> via rtnetlink ;)
> 
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index 12ea3405f442717478bf0e8882edaf0de77986cb..091b904262bc7932d3edf99cf850affb23b9ce6e 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -1809,8 +1809,12 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
>  
>  	netif_tx_disable(dev);
>  
> +	spin_lock_bh(&priv->stats_lock);
> +	mlx4_en_fold_software_stats(dev);
>  	/* Set port as not active */
>  	priv->port_up = false;
> +	spin_unlock_bh(&priv->stats_lock);
> +
>  	priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
>  
>  	/* Promsicuous mode */
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
> index c6c4f1238923e09eced547454b86c68720292859..9166d90e732858610b1407fe85cbf6cbe27f5e0b 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
> @@ -154,7 +154,7 @@ void mlx4_en_fold_software_stats(struct net_device *dev)
>  	unsigned long packets, bytes;
>  	int i;
>  
> -	if (mlx4_is_master(mdev->dev))
> +	if (!priv->port_up || mlx4_is_master(mdev->dev))
>  		return;
>  
>  	packets = 0;
> 
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH net-next] mlx4: fix use-after-free in mlx4_en_fold_software_stats()
  2016-12-01 12:37             ` Jesper Dangaard Brouer
@ 2016-12-01 13:02               ` Eric Dumazet
  2016-12-01 15:23                 ` Saeed Mahameed
  2016-12-02 18:36                 ` David Miller
  0 siblings, 2 replies; 33+ messages in thread
From: Eric Dumazet @ 2016-12-01 13:02 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Saeed Mahameed, David Miller, netdev, Tariq Toukan

From: Eric Dumazet <edumazet@google.com>

My recent commit to get more precise rx/tx counters in ndo_get_stats64()
can lead to crashes at device dismantle, as Jesper found out.

We must prevent mlx4_en_fold_software_stats() trying to access
tx/rx rings if they are deleted.

Fix this by adding a test against priv->port_up in
mlx4_en_fold_software_stats()

Calling mlx4_en_fold_software_stats() from mlx4_en_stop_port()
allows us to eventually broadcast the latest/current counters to
rtnetlink monitors.

Fixes: 40931b85113d ("mlx4: give precise rx/tx bytes/packets counters")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-and-bisected-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Saeed Mahameed <saeedm@dev.mellanox.co.il>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |    4 ++++
 drivers/net/ethernet/mellanox/mlx4/en_port.c   |    2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 12ea3405f442717478bf0e8882edaf0de77986cb..091b904262bc7932d3edf99cf850affb23b9ce6e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1809,8 +1809,12 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
 
 	netif_tx_disable(dev);
 
+	spin_lock_bh(&priv->stats_lock);
+	mlx4_en_fold_software_stats(dev);
 	/* Set port as not active */
 	priv->port_up = false;
+	spin_unlock_bh(&priv->stats_lock);
+
 	priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
 
 	/* Promsicuous mode */
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
index c6c4f1238923e09eced547454b86c68720292859..9166d90e732858610b1407fe85cbf6cbe27f5e0b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -154,7 +154,7 @@ void mlx4_en_fold_software_stats(struct net_device *dev)
 	unsigned long packets, bytes;
 	int i;
 
-	if (mlx4_is_master(mdev->dev))
+	if (!priv->port_up || mlx4_is_master(mdev->dev))
 		return;
 
 	packets = 0;

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH net-next] mlx4: fix use-after-free in mlx4_en_fold_software_stats()
  2016-12-01 13:02               ` [PATCH net-next] mlx4: fix use-after-free in mlx4_en_fold_software_stats() Eric Dumazet
@ 2016-12-01 15:23                 ` Saeed Mahameed
  2016-12-02 18:36                 ` David Miller
  1 sibling, 0 replies; 33+ messages in thread
From: Saeed Mahameed @ 2016-12-01 15:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, Dec 1, 2016 at 3:02 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> My recent commit to get more precise rx/tx counters in ndo_get_stats64()
> can lead to crashes at device dismantle, as Jesper found out.
>
> We must prevent mlx4_en_fold_software_stats() trying to access
> tx/rx rings if they are deleted.
>
> Fix this by adding a test against priv->port_up in
> mlx4_en_fold_software_stats()
>
> Calling mlx4_en_fold_software_stats() from mlx4_en_stop_port()
> allows us to eventually broadcast the latest/current counters to
> rtnetlink monitors.
>
> Fixes: 40931b85113d ("mlx4: give precise rx/tx bytes/packets counters")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-and-bisected-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Tariq Toukan <tariqt@mellanox.com>
> Cc: Saeed Mahameed <saeedm@dev.mellanox.co.il>

Acked-by: Saeed Mahameed <saeedm@mellanox.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-11-30 21:00           ` Eric Dumazet
  2016-12-01 12:37             ` Jesper Dangaard Brouer
@ 2016-12-01 15:38             ` Saeed Mahameed
  2016-12-01 15:55               ` Eric Dumazet
  1 sibling, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2016-12-01 15:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Wed, Nov 30, 2016 at 11:00 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-11-30 at 22:42 +0200, Saeed Mahameed wrote:
>> On Wed, Nov 30, 2016 at 7:35 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > On Wed, 2016-11-30 at 18:46 +0200, Saeed Mahameed wrote:
>> >
>> >> we had/still have the proper stats they are the ones that
>> >> mlx4_en_fold_software_stats is trying to cache into  (they always
>> >> exist),
>> >> but the ones that you are trying to read from (the mlx4 rings) are gone !
>> >>
>> >> This bug is totally new and as i warned, this is another symptom of
>> >> the real root cause (can't sleep while reading stats).
>> >>
>> >> Eric what do you suggest ? Keep pre-allocated MAX_RINGS stats  and
>> >> always iterate over all of them to query stats ?
>> >> what if you have one ring/none/1K ? how would you know how many to query ?
>> >
>> > I am suggesting I will fix the bug I introduced.
>> >
>> > Do not panic.
>> >
>> >
>>
>> Not at all, I trust you are the only one who is capable of providing
>> the best solution.
>> I am just trying to read your mind :-).
>>
>> As i said i like the solution and i want to adapt it to mlx5, so I am
>> a little bit enthusiastic :)
>
> What about the following fix guys ?
>
> As a bonus we update the stats right before they are sent to monitors
> via rtnetlink ;)

Hi Eric, Thanks for the patch, I already acked it.

I have one educational question (not related to this patch, but
related to stats reading in general).
I was wondering why we need to disable bh ("spin_lock_bh") every time
we read stats? Is it essential?

I checked and in mlx4 we don't hold stats_lock in softirq context
(en_rx.c/en_tx.c), so I don't see any deadlock risk here.

 Thanks
Saeed.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-12-01 15:38             ` Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters Saeed Mahameed
@ 2016-12-01 15:55               ` Eric Dumazet
  2016-12-01 16:08                 ` Eric Dumazet
  2016-12-01 16:33                 ` Saeed Mahameed
  0 siblings, 2 replies; 33+ messages in thread
From: Eric Dumazet @ 2016-12-01 15:55 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, 2016-12-01 at 17:38 +0200, Saeed Mahameed wrote:

> 
> Hi Eric, Thanks for the patch, I already acked it.

Thanks !

> 
> I have one educational question (not related to this patch, but
> related to stats reading in general).
> I was wondering why do we need to disable bh every time we read stats
> "spin_lock_bh" ? is it essential ?
> 
> I checked and in mlx4 we don't hold stats_lock in softirq
> (en_rx.c/en_tx.c), so I don't see any deadlock risk in here..

Excellent question, and I chose to keep the spinlock.

Removing the spinlock would be doable only if we do not overwrite dev->stats.

Current code is :

static struct rtnl_link_stats64 *
mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct mlx4_en_priv *priv = netdev_priv(dev);

        spin_lock_bh(&priv->stats_lock);
        mlx4_en_fold_software_stats(dev);
        netdev_stats_to_stats64(stats, &dev->stats);
        spin_unlock_bh(&priv->stats_lock);

        return stats;
}

If you remove the spin_lock_bh() :

static struct rtnl_link_stats64 *
mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct mlx4_en_priv *priv = netdev_priv(dev);

        mlx4_en_fold_software_stats(dev); // possible races

        netdev_stats_to_stats64(stats, &dev->stats);

        return stats;
}

1) One mlx4_en_fold_software_stats(dev) call could be preempted
on a CONFIG_PREEMPT kernel, or interrupted by long irqs.

2) Another cpu would also call mlx4_en_fold_software_stats(dev) while
   the first cpu is busy.

3) Then, when the first cpu/thread resumes, part of the dev->stats fields
would be updated with 'old' counters,
while another thread might have updated them with newer values.

4) An SNMP reader could then get counters that are not monotonically
increasing, which would be confusing/buggy.

So removing the spinlock is doable, but it requires adding a new parameter
to mlx4_en_fold_software_stats() and calling netdev_stats_to_stats64()
before mlx4_en_fold_software_stats(dev) :

static struct rtnl_link_stats64 *
mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct mlx4_en_priv *priv = netdev_priv(dev);

        netdev_stats_to_stats64(stats, &dev->stats);

	// Passing a non NULL stats asks mlx4_en_fold_software_stats()
	// to not update dev->stats, but stats directly.

        mlx4_en_fold_software_stats(dev, stats)


        return stats;
}

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-12-01 15:55               ` Eric Dumazet
@ 2016-12-01 16:08                 ` Eric Dumazet
  2016-12-01 17:36                   ` Eric Dumazet
  2016-12-01 16:33                 ` Saeed Mahameed
  1 sibling, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-12-01 16:08 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, 2016-12-01 at 07:55 -0800, Eric Dumazet wrote:

> So removing the spinlock is doable, but needs to add a new parameter
> to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
> before mlx4_en_fold_software_stats(dev)

Untested patch would be :

 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2 -
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |   10 +----
 drivers/net/ethernet/mellanox/mlx4/en_port.c    |   24 +++++++++-----
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    3 +
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index d9c9f86a30df953fa555934c5406057dcaf28960..676050e352703cebe7fcaa5202a06496f7a5a0df 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -367,7 +367,7 @@ static void mlx4_en_get_ethtool_stats(struct net_device *dev,
 
 	spin_lock_bh(&priv->stats_lock);
 
-	mlx4_en_fold_software_stats(dev);
+	mlx4_en_fold_software_stats(dev, NULL);
 
 	for (i = 0; i < NUM_MAIN_STATS; i++, bitmap_iterator_inc(&it))
 		if (bitmap_iterator_test(&it))
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 091b904262bc7932d3edf99cf850affb23b9ce6e..6ee9e31e59c392cb88faedf9c541b3bc6d195228 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1321,13 +1321,9 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
 static struct rtnl_link_stats64 *
 mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
-	struct mlx4_en_priv *priv = netdev_priv(dev);
-
-	spin_lock_bh(&priv->stats_lock);
-	mlx4_en_fold_software_stats(dev);
 	netdev_stats_to_stats64(stats, &dev->stats);
-	spin_unlock_bh(&priv->stats_lock);
-
+	/* Must be called after netdev_stats_to_stats64() */
+	mlx4_en_fold_software_stats(dev, stats);
 	return stats;
 }
 
@@ -1810,7 +1806,7 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
 	netif_tx_disable(dev);
 
 	spin_lock_bh(&priv->stats_lock);
-	mlx4_en_fold_software_stats(dev);
+	mlx4_en_fold_software_stats(dev, NULL);
 	/* Set port as not active */
 	priv->port_up = false;
 	spin_unlock_bh(&priv->stats_lock);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
index 9166d90e732858610b1407fe85cbf6cbe27f5e0b..eea042a18e3cfba62745ece4ca673c2db967b9aa 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -147,7 +147,8 @@ static unsigned long en_stats_adder(__be64 *start, __be64 *next, int num)
 	return ret;
 }
 
-void mlx4_en_fold_software_stats(struct net_device *dev)
+void mlx4_en_fold_software_stats(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = priv->mdev;
@@ -165,9 +166,13 @@ void mlx4_en_fold_software_stats(struct net_device *dev)
 		packets += READ_ONCE(ring->packets);
 		bytes   += READ_ONCE(ring->bytes);
 	}
-	dev->stats.rx_packets = packets;
-	dev->stats.rx_bytes = bytes;
-
+	if (stats) {
+		stats->rx_packets = packets;
+		stats->rx_bytes = bytes;
+	} else {
+		dev->stats.rx_packets = packets;
+		dev->stats.rx_bytes = bytes;
+	}
 	packets = 0;
 	bytes = 0;
 	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
@@ -176,8 +181,13 @@ void mlx4_en_fold_software_stats(struct net_device *dev)
 		packets += READ_ONCE(ring->packets);
 		bytes   += READ_ONCE(ring->bytes);
 	}
-	dev->stats.tx_packets = packets;
-	dev->stats.tx_bytes = bytes;
+	if (stats) {
+		stats->tx_packets = packets;
+		stats->tx_bytes = bytes;
+	} else {
+		dev->stats.tx_packets = packets;
+		dev->stats.tx_bytes = bytes;
+	}
 }
 
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
@@ -208,7 +218,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 
 	spin_lock_bh(&priv->stats_lock);
 
-	mlx4_en_fold_software_stats(dev);
+	mlx4_en_fold_software_stats(dev, NULL);
 
 	priv->port_stats.rx_chksum_good = 0;
 	priv->port_stats.rx_chksum_none = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 20a936428f4a44c8ca0a7161855da310f9166b50..92dbb41f425b282e9ab7c8d534f091da0ba661c3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -755,7 +755,8 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq);
 int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
 int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
 
-void mlx4_en_fold_software_stats(struct net_device *dev);
+void mlx4_en_fold_software_stats(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats);
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
 int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
 

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-12-01 15:55               ` Eric Dumazet
  2016-12-01 16:08                 ` Eric Dumazet
@ 2016-12-01 16:33                 ` Saeed Mahameed
  2016-12-01 17:08                   ` Eric Dumazet
  1 sibling, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2016-12-01 16:33 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, Dec 1, 2016 at 5:55 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-12-01 at 17:38 +0200, Saeed Mahameed wrote:
>
>>
>> Hi Eric, Thanks for the patch, I already acked it.
>
> Thanks !
>
>>
>> I have one educational question (not related to this patch, but
>> related to stats reading in general).
>> I was wondering why do we need to disable bh every time we read stats
>> "spin_lock_bh" ? is it essential ?
>>
>> I checked and in mlx4 we don't hold stats_lock in softirq
>> (en_rx.c/en_tx.c), so I don't see any deadlock risk in here..
>
> Excellent question, and I chose to keep the spinlock.
>
> That would be doable, only if we do not overwrite dev->stats.
>
> Current code is :
>
> static struct rtnl_link_stats64 *
> mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
> {
>         struct mlx4_en_priv *priv = netdev_priv(dev);
>
>         spin_lock_bh(&priv->stats_lock);
>         mlx4_en_fold_software_stats(dev);
>         netdev_stats_to_stats64(stats, &dev->stats);
>         spin_unlock_bh(&priv->stats_lock);
>
>         return stats;
> }
>
> If you remove the spin_lock_bh() :
>
>
> static struct rtnl_link_stats64 *
> mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
> {
>         struct mlx4_en_priv *priv = netdev_priv(dev);
>
>         mlx4_en_fold_software_stats(dev); // possible races
>
>         netdev_stats_to_stats64(stats, &dev->stats);
>
>         return stats;
> }
>
> 1) one mlx4_en_fold_software_stats(dev) could be preempted
> on a CONFIG_PREEMPT kernel, or interrupted by long irqs.
>
> 2) Another cpu would also call mlx4_en_fold_software_stats(dev) while
>    first cpu is busy.
>
> 3) Then when resuming first cpu/thread, part of the dev->stats fieds
> would be updated with 'old counters',
> while another thread might have updated them with newer values.
>
> 4) A SNMP reader could then get counters that are not monotonically
> increasing,
> which would be confusing/buggy.
>
> So removing the spinlock is doable, but needs to add a new parameter
> to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
> before mlx4_en_fold_software_stats(dev)
>
> static struct rtnl_link_stats64 *
> mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
> {
>         struct mlx4_en_priv *priv = netdev_priv(dev);
>
>         netdev_stats_to_stats64(stats, &dev->stats);
>
>         // Passing a non NULL stats asks mlx4_en_fold_software_stats()
>         // to not update dev->stats, but stats directly.
>
>         mlx4_en_fold_software_stats(dev, stats)
>
>
>         return stats;
> }
>
>

Thanks for the detailed answer !!

BTW you went 5 steps ahead of my original question :)); so far you
already have a patch without locking at all (really impressive).

What I wanted to ask originally was regarding the "_bh": I didn't
mean to completely remove the "spin_lock_bh".
I meant, what happens if we replace "spin_lock_bh" with "spin_lock",
without disabling bh?
I guess a raw "spin_lock" handles points (2) to (4) above, but it
won't handle long irqs.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-12-01 16:33                 ` Saeed Mahameed
@ 2016-12-01 17:08                   ` Eric Dumazet
  2016-12-04 13:21                     ` Saeed Mahameed
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-12-01 17:08 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, 2016-12-01 at 18:33 +0200, Saeed Mahameed wrote:

> Thanks for the detailed answer !!

You're welcome.

> 
> BTW you went 5 steps ahead of my original question :)), so far you
> already have a patch without locking at all (really impressive).
> 
> What i wanted to ask originally, was regarding the "_bh", i didn't
> mean to completely remove the "spin_lock_bh",
> I meant, what happens if we replace "spin_lock_bh"  with "spin_lock",
> without disabling bh ?
> I gues raw "sping_lock" handles points (2 to 4) from above, but it
> won't handle long irqs.

That's a very good point: the _bh prefix can totally be removed, since
stats_lock is only acquired from process context.
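
In other words, something like this (untested, it is simply the existing
mlx4_en_get_stats64() shown earlier with the _bh variants dropped) :

static struct rtnl_link_stats64 *
mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);

	/* stats_lock is only taken from process context, so a plain
	 * spin_lock() is enough; there is no softirq path to exclude.
	 */
	spin_lock(&priv->stats_lock);
	mlx4_en_fold_software_stats(dev);
	netdev_stats_to_stats64(stats, &dev->stats);
	spin_unlock(&priv->stats_lock);

	return stats;
}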

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-12-01 16:08                 ` Eric Dumazet
@ 2016-12-01 17:36                   ` Eric Dumazet
  2016-12-04 13:23                     ` Saeed Mahameed
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2016-12-01 17:36 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, 2016-12-01 at 08:08 -0800, Eric Dumazet wrote:
> On Thu, 2016-12-01 at 07:55 -0800, Eric Dumazet wrote:
> 
> > So removing the spinlock is doable, but needs to add a new parameter
> > to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
> > before mlx4_en_fold_software_stats(dev)
> 
> Untested patch would be :
> 
>  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2 -
>  drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |   10 +----
>  drivers/net/ethernet/mellanox/mlx4/en_port.c    |   24 +++++++++-----
>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    3 +
>  4 files changed, 23 insertions(+), 16 deletions(-)

The patch is wrong, since priv->port_up could change to false while we
are running and still using the about-to-be-deleted tx/rx rings.

So the only safe thing to do is to remove the _bh suffix.

It is not worth trying to avoid taking a spinlock in this code.
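
The window the lockless version reopens looks roughly like this (sketch) :

/*
 * CPU0: lockless stats read              CPU1: mlx4_en_stop_port()
 *   sees priv->port_up == true
 *                                          spin_lock_bh(&stats_lock)
 *                                          priv->port_up = false
 *                                          spin_unlock_bh(&stats_lock)
 *                                          frees the tx/rx rings
 *   reads ring->packets / ring->bytes  <-- use-after-free
 *
 * Keeping stats_lock around both the port_up test and the fold, as in
 * the applied fix, closes this window.
 */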

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH net-next] mlx4: fix use-after-free in mlx4_en_fold_software_stats()
  2016-12-01 13:02               ` [PATCH net-next] mlx4: fix use-after-free in mlx4_en_fold_software_stats() Eric Dumazet
  2016-12-01 15:23                 ` Saeed Mahameed
@ 2016-12-02 18:36                 ` David Miller
  1 sibling, 0 replies; 33+ messages in thread
From: David Miller @ 2016-12-02 18:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: brouer, saeedm, netdev, tariqt

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 01 Dec 2016 05:02:06 -0800

> From: Eric Dumazet <edumazet@google.com>
> 
> My recent commit to get more precise rx/tx counters in ndo_get_stats64()
> can lead to crashes at device dismantle, as Jesper found out.
> 
> We must prevent mlx4_en_fold_software_stats() trying to access
> tx/rx rings if they are deleted.
> 
> Fix this by adding a test against priv->port_up in
> mlx4_en_fold_software_stats()
> 
> Calling mlx4_en_fold_software_stats() from mlx4_en_stop_port()
> allows us to eventually broadcast the latest/current counters to
> rtnetlink monitors.
> 
> Fixes: 40931b85113d ("mlx4: give precise rx/tx bytes/packets counters")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-and-bisected-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>

Applied.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-12-01 17:08                   ` Eric Dumazet
@ 2016-12-04 13:21                     ` Saeed Mahameed
  0 siblings, 0 replies; 33+ messages in thread
From: Saeed Mahameed @ 2016-12-04 13:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, Dec 1, 2016 at 7:08 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-12-01 at 18:33 +0200, Saeed Mahameed wrote:
>
>> Thanks for the detailed answer !!
>
> You're welcome.
>
>>
>> BTW you went 5 steps ahead of my original question :)), so far you
>> already have a patch without locking at all (really impressive).
>>
>> What i wanted to ask originally, was regarding the "_bh", i didn't
>> mean to completely remove the "spin_lock_bh",
>> I meant, what happens if we replace "spin_lock_bh"  with "spin_lock",
>> without disabling bh ?
>> I gues raw "sping_lock" handles points (2 to 4) from above, but it
>> won't handle long irqs.
>
> Thats a very good point, the _bh prefix can totally be removed, since
> stats_lock is only acquired from process context.
>
>

That was my initial point, thanks for the help.
I will provide a fix patch later, once 4.9 is released.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
  2016-12-01 17:36                   ` Eric Dumazet
@ 2016-12-04 13:23                     ` Saeed Mahameed
  0 siblings, 0 replies; 33+ messages in thread
From: Saeed Mahameed @ 2016-12-04 13:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan

On Thu, Dec 1, 2016 at 7:36 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-12-01 at 08:08 -0800, Eric Dumazet wrote:
>> On Thu, 2016-12-01 at 07:55 -0800, Eric Dumazet wrote:
>>
>> > So removing the spinlock is doable, but needs to add a new parameter
>> > to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
>> > before mlx4_en_fold_software_stats(dev)
>>
>> Untested patch would be :
>>
>>  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2 -
>>  drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |   10 +----
>>  drivers/net/ethernet/mellanox/mlx4/en_port.c    |   24 +++++++++-----
>>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    3 +
>>  4 files changed, 23 insertions(+), 16 deletions(-)
>
> The patch is wrong, since priv->port_up could change to false while we
> are running and using the about to be deleted tx/rx rings.
>

Right, hence the regression Jesper saw ;).

>
> So the only safe thing to do is to remove the _bh suffix.
>
> Not worth trying to avoid taking a spinlock in this code.
>

Ack.

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2016-12-04 13:23 UTC | newest]

Thread overview: 33+ messages
2016-11-25 15:46 [PATCH] mlx4: give precise rx/tx bytes/packets counters Eric Dumazet
2016-11-25 16:03 ` David Laight
2016-11-25 16:16   ` Eric Dumazet
2016-11-25 16:30     ` Aw: " Lino Sanfilippo
2016-11-25 19:19       ` Eric Dumazet
2016-11-28 22:02         ` Lino Sanfilippo
2016-11-29  0:19           ` Eric Dumazet
2016-11-30 15:28             ` David Laight
2016-11-30 15:47               ` Eric Dumazet
2016-11-26 22:47 ` Saeed Mahameed
2016-11-27  2:16   ` Eric Dumazet
2016-11-28 20:40     ` David Miller
2016-11-28 21:55       ` Saeed Mahameed
2016-11-28 22:57         ` Eric Dumazet
2016-11-29 18:37 ` David Miller
2016-11-30 14:08 ` Regression: " Jesper Dangaard Brouer
2016-11-30 15:58   ` Eric Dumazet
2016-11-30 16:46     ` Saeed Mahameed
2016-11-30 17:35       ` Eric Dumazet
2016-11-30 20:42         ` Saeed Mahameed
2016-11-30 21:00           ` Eric Dumazet
2016-12-01 12:37             ` Jesper Dangaard Brouer
2016-12-01 13:02               ` [PATCH net-next] mlx4: fix use-after-free in mlx4_en_fold_software_stats() Eric Dumazet
2016-12-01 15:23                 ` Saeed Mahameed
2016-12-02 18:36                 ` David Miller
2016-12-01 15:38             ` Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters Saeed Mahameed
2016-12-01 15:55               ` Eric Dumazet
2016-12-01 16:08                 ` Eric Dumazet
2016-12-01 17:36                   ` Eric Dumazet
2016-12-04 13:23                     ` Saeed Mahameed
2016-12-01 16:33                 ` Saeed Mahameed
2016-12-01 17:08                   ` Eric Dumazet
2016-12-04 13:21                     ` Saeed Mahameed
