All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing
@ 2018-03-23  1:19 Florian Fainelli
  2018-03-23  1:19 ` [PATCH net-next 1/2] net: systemport: Implement adaptive " Florian Fainelli
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Florian Fainelli @ 2018-03-23  1:19 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, davem, jaedon.shin, pgynther, opendmb,
	michal.chan, gospo, talgi, saeedm

Hi all,

This patch series adds adaptive interrupt coalescing for the Gigabit Ethernet
drivers SYSTEMPORT and GENET.

This really helps lower the interrupt count and system load, as measured by
vmstat for a Gigabit TCP RX session:

SYSTEMPORT:

without:

 1  0      0 192188      0  25472    0    0     0     0 122100 38870  1 42 57  0  0
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-10.0 sec  1.03 GBytes   884 Mbits/sec

with:

 1  0      0 192288      0  25468    0    0     0     0 58806 44401  0 100  0  0  0
[  5]  0.0-10.0 sec  1.04 GBytes   888 Mbits/sec

GENET:

without:

 1  0      0 1170404      0  25420    0    0     0     0 130785 63402  2 85 12  0  0
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-10.0 sec  1.04 GBytes   888 Mbits/sec

with:

 1  0      0 1170560      0  25420    0    0     0     0 50610 48477  0 100  0  0  0
[  5]  0.0-10.0 sec  1.05 GBytes   899 Mbits/sec

Please look at the implementation and let me know if you see any problems; this
was largely inspired by bnxt_en.

Thank you!

Florian Fainelli (2):
  net: systemport: Implement adaptive interrupt coalescing
  net: bcmgenet: Add support for adaptive RX coalescing

 drivers/net/ethernet/broadcom/bcmsysport.c     | 141 ++++++++++++++++++++++---
 drivers/net/ethernet/broadcom/bcmsysport.h     |  14 +++
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 109 +++++++++++++++----
 drivers/net/ethernet/broadcom/genet/bcmgenet.h |  12 +++
 4 files changed, 243 insertions(+), 33 deletions(-)

-- 
2.14.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH net-next 1/2] net: systemport: Implement adaptive interrupt coalescing
  2018-03-23  1:19 [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing Florian Fainelli
@ 2018-03-23  1:19 ` Florian Fainelli
  2018-03-26 21:22   ` Tal Gilboa
  2018-03-23  1:19 ` [PATCH net-next 2/2] net: bcmgenet: Add support for adaptive RX coalescing Florian Fainelli
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Florian Fainelli @ 2018-03-23  1:19 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, davem, jaedon.shin, pgynther, opendmb,
	michal.chan, gospo, talgi, saeedm

Implement support for adaptive RX and TX interrupt coalescing using
net_dim. We have each of our TX ring and our single RX ring implement a
bcm_sysport_net_dim structure which holds an interrupt counter, number
of packets, bytes, and a container for a net_dim instance.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 141 ++++++++++++++++++++++++++---
 drivers/net/ethernet/broadcom/bcmsysport.h |  14 +++
 2 files changed, 140 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index f15a8fc6dfc9..5a5a726bafa4 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/net_dim.h>
 #include <linux/etherdevice.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
@@ -574,21 +575,55 @@ static int bcm_sysport_set_wol(struct net_device *dev,
 	return 0;
 }
 
+static void bcm_sysport_set_rx_coalesce(struct bcm_sysport_priv *priv)
+{
+	u32 reg;
+
+	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
+	reg &= ~(RDMA_INTR_THRESH_MASK |
+		 RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
+	reg |= priv->dim.coal_pkts;
+	reg |= DIV_ROUND_UP(priv->dim.coal_usecs * 1000, 8192) <<
+			    RDMA_TIMEOUT_SHIFT;
+	rdma_writel(priv, reg, RDMA_MBDONE_INTR);
+}
+
+static void bcm_sysport_set_tx_coalesce(struct bcm_sysport_tx_ring *ring)
+{
+	struct bcm_sysport_priv *priv = ring->priv;
+	u32 reg;
+
+	reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(ring->index));
+	reg &= ~(RING_INTR_THRESH_MASK |
+		 RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
+	reg |= ring->dim.coal_pkts;
+	reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192) <<
+			    RING_TIMEOUT_SHIFT;
+	tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(ring->index));
+}
+
 static int bcm_sysport_get_coalesce(struct net_device *dev,
 				    struct ethtool_coalesce *ec)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
+	struct bcm_sysport_tx_ring *ring;
+	unsigned int i;
 	u32 reg;
 
 	reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(0));
 
 	ec->tx_coalesce_usecs = (reg >> RING_TIMEOUT_SHIFT) * 8192 / 1000;
 	ec->tx_max_coalesced_frames = reg & RING_INTR_THRESH_MASK;
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		ring = &priv->tx_rings[i];
+		ec->use_adaptive_tx_coalesce |= ring->dim.use_dim;
+	}
 
 	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
 
 	ec->rx_coalesce_usecs = (reg >> RDMA_TIMEOUT_SHIFT) * 8192 / 1000;
 	ec->rx_max_coalesced_frames = reg & RDMA_INTR_THRESH_MASK;
+	ec->use_adaptive_rx_coalesce = priv->dim.use_dim;
 
 	return 0;
 }
@@ -597,8 +632,8 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
 				    struct ethtool_coalesce *ec)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
+	struct bcm_sysport_tx_ring *ring;
 	unsigned int i;
-	u32 reg;
 
 	/* Base system clock is 125Mhz, DMA timeout is this reference clock
 	 * divided by 1024, which yield roughly 8.192 us, our maximum value has
@@ -615,22 +650,26 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
 		return -EINVAL;
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
-		reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(i));
-		reg &= ~(RING_INTR_THRESH_MASK |
-			 RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
-		reg |= ec->tx_max_coalesced_frames;
-		reg |= DIV_ROUND_UP(ec->tx_coalesce_usecs * 1000, 8192) <<
-			 RING_TIMEOUT_SHIFT;
-		tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(i));
+		ring = &priv->tx_rings[i];
+		ring->dim.coal_pkts = ec->tx_max_coalesced_frames;
+		ring->dim.coal_usecs = ec->tx_coalesce_usecs;
+		if (!ec->use_adaptive_tx_coalesce && ring->dim.use_dim) {
+			ring->dim.coal_pkts = 1;
+			ring->dim.coal_usecs = 0;
+		}
+		ring->dim.use_dim = ec->use_adaptive_tx_coalesce;
+		bcm_sysport_set_tx_coalesce(ring);
 	}
 
-	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
-	reg &= ~(RDMA_INTR_THRESH_MASK |
-		 RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
-	reg |= ec->rx_max_coalesced_frames;
-	reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192) <<
-			    RDMA_TIMEOUT_SHIFT;
-	rdma_writel(priv, reg, RDMA_MBDONE_INTR);
+	priv->dim.coal_usecs = ec->rx_coalesce_usecs;
+	priv->dim.coal_pkts = ec->rx_max_coalesced_frames;
+
+	if (!ec->use_adaptive_rx_coalesce && priv->dim.use_dim) {
+		priv->dim.coal_pkts = 1;
+		priv->dim.coal_usecs = 0;
+	}
+	priv->dim.use_dim = ec->use_adaptive_rx_coalesce;
+	bcm_sysport_set_rx_coalesce(priv);
 
 	return 0;
 }
@@ -709,6 +748,7 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
 	struct bcm_sysport_stats64 *stats64 = &priv->stats64;
 	struct net_device *ndev = priv->netdev;
 	unsigned int processed = 0, to_process;
+	unsigned int processed_bytes = 0;
 	struct bcm_sysport_cb *cb;
 	struct sk_buff *skb;
 	unsigned int p_index;
@@ -800,6 +840,7 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
 		 */
 		skb_pull(skb, sizeof(*rsb) + 2);
 		len -= (sizeof(*rsb) + 2);
+		processed_bytes += len;
 
 		/* UniMAC may forward CRC */
 		if (priv->crc_fwd) {
@@ -824,6 +865,9 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
 			priv->rx_read_ptr = 0;
 	}
 
+	priv->dim.packets = processed;
+	priv->dim.bytes = processed_bytes;
+
 	return processed;
 }
 
@@ -900,6 +944,8 @@ static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
 	ring->packets += pkts_compl;
 	ring->bytes += bytes_compl;
 	u64_stats_update_end(&priv->syncp);
+	ring->dim.packets = pkts_compl;
+	ring->dim.bytes = bytes_compl;
 
 	ring->c_index = c_index;
 
@@ -945,6 +991,7 @@ static int bcm_sysport_tx_poll(struct napi_struct *napi, int budget)
 {
 	struct bcm_sysport_tx_ring *ring =
 		container_of(napi, struct bcm_sysport_tx_ring, napi);
+	struct net_dim_sample dim_sample;
 	unsigned int work_done = 0;
 
 	work_done = bcm_sysport_tx_reclaim(ring->priv, ring);
@@ -961,6 +1008,12 @@ static int bcm_sysport_tx_poll(struct napi_struct *napi, int budget)
 		return 0;
 	}
 
+	if (ring->dim.use_dim) {
+		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
+			       ring->dim.bytes, &dim_sample);
+		net_dim(&ring->dim.dim, dim_sample);
+	}
+
 	return budget;
 }
 
@@ -976,6 +1029,7 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 {
 	struct bcm_sysport_priv *priv =
 		container_of(napi, struct bcm_sysport_priv, napi);
+	struct net_dim_sample dim_sample;
 	unsigned int work_done = 0;
 
 	work_done = bcm_sysport_desc_rx(priv, budget);
@@ -998,6 +1052,12 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 		intrl2_0_mask_clear(priv, INTRL2_0_RDMA_MBDONE);
 	}
 
+	if (priv->dim.use_dim) {
+		net_dim_sample(priv->dim.event_ctr, priv->dim.packets,
+			       priv->dim.bytes, &dim_sample);
+		net_dim(&priv->dim.dim, dim_sample);
+	}
+
 	return work_done;
 }
 
@@ -1016,6 +1076,40 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 	netif_dbg(priv, wol, priv->netdev, "resumed from WOL\n");
 }
 
+static void bcm_sysport_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct bcm_sysport_net_dim *ndim =
+			container_of(dim, struct bcm_sysport_net_dim, dim);
+	struct bcm_sysport_priv *priv =
+			container_of(ndim, struct bcm_sysport_priv, dim);
+	struct net_dim_cq_moder cur_profile =
+				net_dim_get_profile(dim->mode, dim->profile_ix);
+
+	priv->dim.coal_usecs = cur_profile.usec;
+	priv->dim.coal_pkts = cur_profile.pkts;
+
+	bcm_sysport_set_rx_coalesce(priv);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
+static void bcm_sysport_dim_tx_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct bcm_sysport_net_dim *ndim =
+			container_of(dim, struct bcm_sysport_net_dim, dim);
+	struct bcm_sysport_tx_ring *ring =
+			container_of(ndim, struct bcm_sysport_tx_ring, dim);
+	struct net_dim_cq_moder cur_profile =
+				net_dim_get_profile(dim->mode, dim->profile_ix);
+
+	ring->dim.coal_usecs = cur_profile.usec;
+	ring->dim.coal_pkts = cur_profile.pkts;
+
+	bcm_sysport_set_tx_coalesce(ring);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
 /* RX and misc interrupt routine */
 static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
 {
@@ -1034,6 +1128,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
 	}
 
 	if (priv->irq0_stat & INTRL2_0_RDMA_MBDONE) {
+		priv->dim.event_ctr++;
 		if (likely(napi_schedule_prep(&priv->napi))) {
 			/* disable RX interrupts */
 			intrl2_0_mask_set(priv, INTRL2_0_RDMA_MBDONE);
@@ -1061,6 +1156,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
 			continue;
 
 		txr = &priv->tx_rings[ring];
+		txr->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&txr->napi))) {
 			intrl2_0_mask_set(priv, ring_bit);
@@ -1093,6 +1189,7 @@ static irqreturn_t bcm_sysport_tx_isr(int irq, void *dev_id)
 			continue;
 
 		txr = &priv->tx_rings[ring];
+		txr->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&txr->napi))) {
 			intrl2_1_mask_set(priv, BIT(ring));
@@ -1358,6 +1455,16 @@ static void bcm_sysport_adj_link(struct net_device *dev)
 		phy_print_status(phydev);
 }
 
+static void bcm_sysport_init_dim(struct bcm_sysport_net_dim *dim,
+				 void (*cb)(struct work_struct *work))
+{
+	INIT_WORK(&dim->dim.work, cb);
+	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->event_ctr = 0;
+	dim->packets = 0;
+	dim->bytes = 0;
+}
+
 static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv,
 				    unsigned int index)
 {
@@ -1447,6 +1554,7 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv,
 	reg |= (1 << index);
 	tdma_writel(priv, reg, TDMA_TIER1_ARB_0_QUEUE_EN);
 
+	bcm_sysport_init_dim(&ring->dim, bcm_sysport_dim_tx_work);
 	napi_enable(&ring->napi);
 
 	netif_dbg(priv, hw, priv->netdev,
@@ -1477,6 +1585,7 @@ static void bcm_sysport_fini_tx_ring(struct bcm_sysport_priv *priv,
 		return;
 
 	napi_disable(&ring->napi);
+	cancel_work_sync(&ring->dim.dim.work);
 	netif_napi_del(&ring->napi);
 
 	bcm_sysport_tx_clean(priv, ring);
@@ -1766,6 +1875,7 @@ static void bcm_sysport_netif_start(struct net_device *dev)
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
 
 	/* Enable NAPI */
+	bcm_sysport_init_dim(&priv->dim, bcm_sysport_dim_work);
 	napi_enable(&priv->napi);
 
 	/* Enable RX interrupt and TX ring full interrupt */
@@ -1951,6 +2061,7 @@ static void bcm_sysport_netif_stop(struct net_device *dev)
 	/* stop all software from updating hardware */
 	netif_tx_stop_all_queues(dev);
 	napi_disable(&priv->napi);
+	cancel_work_sync(&priv->dim.dim.work);
 	phy_stop(dev->phydev);
 
 	/* mask all interrupts */
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index f5a984c1c986..9f48ad3cc38d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -12,6 +12,7 @@
 #define __BCM_SYSPORT_H
 
 #include <linux/if_vlan.h>
+#include <linux/net_dim.h>
 
 /* Receive/transmit descriptor format */
 #define DESC_ADDR_HI_STATUS_LEN	0x00
@@ -695,6 +696,16 @@ struct bcm_sysport_hw_params {
 	unsigned int	num_rx_desc_words;
 };
 
+struct bcm_sysport_net_dim {
+	u16			use_dim;
+	u16			event_ctr;
+	unsigned long		packets;
+	unsigned long		bytes;
+	u32			coal_usecs;
+	u32			coal_pkts;
+	struct net_dim		dim;
+};
+
 /* Software view of the TX ring */
 struct bcm_sysport_tx_ring {
 	spinlock_t	lock;		/* Ring lock for tx reclaim/xmit */
@@ -712,6 +723,7 @@ struct bcm_sysport_tx_ring {
 	struct bcm_sysport_priv *priv;	/* private context backpointer */
 	unsigned long	packets;	/* packets statistics */
 	unsigned long	bytes;		/* bytes statistics */
+	struct bcm_sysport_net_dim dim;	/* Net DIM context */
 	unsigned int	switch_queue;	/* switch port queue number */
 	unsigned int	switch_port;	/* switch port queue number */
 	bool		inspect;	/* inspect switch port and queue */
@@ -743,6 +755,8 @@ struct bcm_sysport_priv {
 	unsigned int		rx_read_ptr;
 	unsigned int		rx_c_index;
 
+	struct bcm_sysport_net_dim	dim;
+
 	/* PHY device */
 	struct device_node	*phy_dn;
 	phy_interface_t		phy_interface;
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next 2/2] net: bcmgenet: Add support for adaptive RX coalescing
  2018-03-23  1:19 [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing Florian Fainelli
  2018-03-23  1:19 ` [PATCH net-next 1/2] net: systemport: Implement adaptive " Florian Fainelli
@ 2018-03-23  1:19 ` Florian Fainelli
  2018-03-26 21:23   ` Tal Gilboa
  2018-03-26  0:49 ` [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing David Miller
  2018-03-26 21:16 ` Tal Gilboa
  3 siblings, 1 reply; 13+ messages in thread
From: Florian Fainelli @ 2018-03-23  1:19 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, davem, jaedon.shin, pgynther, opendmb,
	michal.chan, gospo, talgi, saeedm

Unlike the more modern SYSTEMPORT hardware, we do not have a
configurable TDMA timeout, which limits us to implement adaptive RX
interrupt coalescing only. We have each of our RX rings implement a
bcmgenet_net_dim structure which holds an interrupt counter, number of
packets, bytes, and a container for a net_dim instance.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 109 +++++++++++++++++++++----
 drivers/net/ethernet/broadcom/genet/bcmgenet.h |  12 +++
 2 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index b1e35a9accf1..7db8edc643ec 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -603,6 +603,8 @@ static int bcmgenet_get_coalesce(struct net_device *dev,
 				 struct ethtool_coalesce *ec)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct bcmgenet_rx_ring *ring;
+	unsigned int i;
 
 	ec->tx_max_coalesced_frames =
 		bcmgenet_tdma_ring_readl(priv, DESC_INDEX,
@@ -613,15 +615,37 @@ static int bcmgenet_get_coalesce(struct net_device *dev,
 	ec->rx_coalesce_usecs =
 		bcmgenet_rdma_readl(priv, DMA_RING16_TIMEOUT) * 8192 / 1000;
 
+	for (i = 0; i < priv->hw_params->rx_queues; i++) {
+		ring = &priv->rx_rings[i];
+		ec->use_adaptive_rx_coalesce |= ring->dim.use_dim;
+	}
+	ring = &priv->rx_rings[DESC_INDEX];
+	ec->use_adaptive_rx_coalesce |= ring->dim.use_dim;
+
 	return 0;
 }
 
+static void bcmgenet_set_rx_coalesce(struct bcmgenet_rx_ring *ring)
+{
+	struct bcmgenet_priv *priv = ring->priv;
+	unsigned int i = ring->index;
+	u32 reg;
+
+	bcmgenet_rdma_ring_writel(priv, i, ring->dim.coal_pkts,
+				  DMA_MBUF_DONE_THRESH);
+
+	reg = bcmgenet_rdma_readl(priv, DMA_RING0_TIMEOUT + i);
+	reg &= ~DMA_TIMEOUT_MASK;
+	reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192);
+	bcmgenet_rdma_writel(priv, reg, DMA_RING0_TIMEOUT + i);
+}
+
 static int bcmgenet_set_coalesce(struct net_device *dev,
 				 struct ethtool_coalesce *ec)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct bcmgenet_rx_ring *ring;
 	unsigned int i;
-	u32 reg;
 
 	/* Base system clock is 125Mhz, DMA timeout is this reference clock
 	 * divided by 1024, which yields roughly 8.192us, our maximum value
@@ -641,7 +665,8 @@ static int bcmgenet_set_coalesce(struct net_device *dev,
 	 * transmitted, or when the ring is empty.
 	 */
 	if (ec->tx_coalesce_usecs || ec->tx_coalesce_usecs_high ||
-	    ec->tx_coalesce_usecs_irq || ec->tx_coalesce_usecs_low)
+	    ec->tx_coalesce_usecs_irq || ec->tx_coalesce_usecs_low ||
+	    ec->use_adaptive_tx_coalesce)
 		return -EOPNOTSUPP;
 
 	/* Program all TX queues with the same values, as there is no
@@ -656,24 +681,26 @@ static int bcmgenet_set_coalesce(struct net_device *dev,
 				  DMA_MBUF_DONE_THRESH);
 
 	for (i = 0; i < priv->hw_params->rx_queues; i++) {
-		bcmgenet_rdma_ring_writel(priv, i,
-					  ec->rx_max_coalesced_frames,
-					  DMA_MBUF_DONE_THRESH);
-
-		reg = bcmgenet_rdma_readl(priv, DMA_RING0_TIMEOUT + i);
-		reg &= ~DMA_TIMEOUT_MASK;
-		reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192);
-		bcmgenet_rdma_writel(priv, reg, DMA_RING0_TIMEOUT + i);
+		ring = &priv->rx_rings[i];
+		ring->dim.coal_usecs = ec->rx_coalesce_usecs;
+		ring->dim.coal_pkts = ec->rx_max_coalesced_frames;
+		if (!ec->use_adaptive_rx_coalesce && ring->dim.use_dim) {
+			ring->dim.coal_pkts = 1;
+			ring->dim.coal_usecs = 0;
+		}
+		ring->dim.use_dim = ec->use_adaptive_rx_coalesce;
+		bcmgenet_set_rx_coalesce(ring);
 	}
 
-	bcmgenet_rdma_ring_writel(priv, DESC_INDEX,
-				  ec->rx_max_coalesced_frames,
-				  DMA_MBUF_DONE_THRESH);
-
-	reg = bcmgenet_rdma_readl(priv, DMA_RING16_TIMEOUT);
-	reg &= ~DMA_TIMEOUT_MASK;
-	reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192);
-	bcmgenet_rdma_writel(priv, reg, DMA_RING16_TIMEOUT);
+	ring = &priv->rx_rings[DESC_INDEX];
+	ring->dim.coal_usecs = ec->rx_coalesce_usecs;
+	ring->dim.coal_pkts = ec->rx_max_coalesced_frames;
+	if (!ec->use_adaptive_rx_coalesce && ring->dim.use_dim) {
+		ring->dim.coal_pkts = 1;
+		ring->dim.coal_usecs = 0;
+	}
+	ring->dim.use_dim = ec->use_adaptive_rx_coalesce;
+	bcmgenet_set_rx_coalesce(ring);
 
 	return 0;
 }
@@ -1713,6 +1740,7 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	unsigned long dma_flag;
 	int len;
 	unsigned int rxpktprocessed = 0, rxpkttoprocess;
+	unsigned int bytes_processed = 0;
 	unsigned int p_index, mask;
 	unsigned int discards;
 	unsigned int chksum_ok = 0;
@@ -1832,6 +1860,8 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 			len -= ETH_FCS_LEN;
 		}
 
+		bytes_processed += len;
+
 		/*Finish setting up the received SKB and send it to the kernel*/
 		skb->protocol = eth_type_trans(skb, priv->dev);
 		ring->packets++;
@@ -1854,6 +1884,9 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 		bcmgenet_rdma_ring_writel(priv, ring->index, ring->c_index, RDMA_CONS_INDEX);
 	}
 
+	ring->dim.bytes = bytes_processed;
+	ring->dim.packets = rxpktprocessed;
+
 	return rxpktprocessed;
 }
 
@@ -1862,6 +1895,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 {
 	struct bcmgenet_rx_ring *ring = container_of(napi,
 			struct bcmgenet_rx_ring, napi);
+	struct net_dim_sample dim_sample;
 	unsigned int work_done;
 
 	work_done = bcmgenet_desc_rx(ring, budget);
@@ -1871,9 +1905,32 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 		ring->int_enable(ring);
 	}
 
+	if (ring->dim.use_dim) {
+		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
+			       ring->dim.bytes, &dim_sample);
+		net_dim(&ring->dim.dim, dim_sample);
+	}
+
 	return work_done;
 }
 
+static void bcmgenet_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct bcmgenet_net_dim *ndim =
+			container_of(dim, struct bcmgenet_net_dim, dim);
+	struct bcmgenet_rx_ring *ring =
+			container_of(ndim, struct bcmgenet_rx_ring, dim);
+	struct net_dim_cq_moder cur_profile =
+			net_dim_get_profile(dim->mode, dim->profile_ix);
+
+	ring->dim.coal_usecs = cur_profile.usec;
+	ring->dim.coal_pkts = cur_profile.pkts;
+
+	bcmgenet_set_rx_coalesce(ring);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
 /* Assign skb to RX DMA descriptor. */
 static int bcmgenet_alloc_rx_buffers(struct bcmgenet_priv *priv,
 				     struct bcmgenet_rx_ring *ring)
@@ -2022,6 +2079,16 @@ static void init_umac(struct bcmgenet_priv *priv)
 	dev_dbg(kdev, "done init umac\n");
 }
 
+static void bcmgenet_init_dim(struct bcmgenet_net_dim *dim,
+			      void (*cb)(struct work_struct *work))
+{
+	INIT_WORK(&dim->dim.work, cb);
+	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->event_ctr = 0;
+	dim->packets = 0;
+	dim->bytes = 0;
+}
+
 /* Initialize a Tx ring along with corresponding hardware registers */
 static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
 				  unsigned int index, unsigned int size,
@@ -2111,6 +2178,8 @@ static int bcmgenet_init_rx_ring(struct bcmgenet_priv *priv,
 	if (ret)
 		return ret;
 
+	bcmgenet_init_dim(&ring->dim, bcmgenet_dim_work);
+
 	/* Initialize Rx NAPI */
 	netif_napi_add(priv->dev, &ring->napi, bcmgenet_rx_poll,
 		       NAPI_POLL_WEIGHT);
@@ -2276,10 +2345,12 @@ static void bcmgenet_disable_rx_napi(struct bcmgenet_priv *priv)
 	for (i = 0; i < priv->hw_params->rx_queues; ++i) {
 		ring = &priv->rx_rings[i];
 		napi_disable(&ring->napi);
+		cancel_work_sync(&ring->dim.dim.work);
 	}
 
 	ring = &priv->rx_rings[DESC_INDEX];
 	napi_disable(&ring->napi);
+	cancel_work_sync(&ring->dim.dim.work);
 }
 
 static void bcmgenet_fini_rx_napi(struct bcmgenet_priv *priv)
@@ -2557,6 +2628,7 @@ static irqreturn_t bcmgenet_isr1(int irq, void *dev_id)
 			continue;
 
 		rx_ring = &priv->rx_rings[index];
+		rx_ring->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&rx_ring->napi))) {
 			rx_ring->int_disable(rx_ring);
@@ -2601,6 +2673,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 
 	if (status & UMAC_IRQ_RXDMA_DONE) {
 		rx_ring = &priv->rx_rings[DESC_INDEX];
+		rx_ring->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&rx_ring->napi))) {
 			rx_ring->int_disable(rx_ring);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 3c50431ccd2a..22c41e0430fb 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -16,6 +16,7 @@
 #include <linux/mii.h>
 #include <linux/if_vlan.h>
 #include <linux/phy.h>
+#include <linux/net_dim.h>
 
 /* total number of Buffer Descriptors, same for Rx/Tx */
 #define TOTAL_DESC				256
@@ -572,6 +573,16 @@ struct bcmgenet_tx_ring {
 	struct bcmgenet_priv *priv;
 };
 
+struct bcmgenet_net_dim {
+	u16		use_dim;
+	u16		event_ctr;
+	unsigned long	packets;
+	unsigned long	bytes;
+	u32		coal_usecs;
+	u32		coal_pkts;
+	struct net_dim	dim;
+};
+
 struct bcmgenet_rx_ring {
 	struct napi_struct napi;	/* Rx NAPI struct */
 	unsigned long	bytes;
@@ -586,6 +597,7 @@ struct bcmgenet_rx_ring {
 	unsigned int	cb_ptr;		/* Rx ring initial CB ptr */
 	unsigned int	end_ptr;	/* Rx ring end CB ptr */
 	unsigned int	old_discards;
+	struct bcmgenet_net_dim dim;
 	void (*int_enable)(struct bcmgenet_rx_ring *);
 	void (*int_disable)(struct bcmgenet_rx_ring *);
 	struct bcmgenet_priv *priv;
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing
  2018-03-23  1:19 [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing Florian Fainelli
  2018-03-23  1:19 ` [PATCH net-next 1/2] net: systemport: Implement adaptive " Florian Fainelli
  2018-03-23  1:19 ` [PATCH net-next 2/2] net: bcmgenet: Add support for adaptive RX coalescing Florian Fainelli
@ 2018-03-26  0:49 ` David Miller
  2018-03-26 21:16 ` Tal Gilboa
  3 siblings, 0 replies; 13+ messages in thread
From: David Miller @ 2018-03-26  0:49 UTC (permalink / raw)
  To: f.fainelli
  Cc: netdev, jaedon.shin, pgynther, opendmb, michal.chan, gospo,
	talgi, saeedm

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 22 Mar 2018 18:19:31 -0700

> This patch series adds adaptive interrupt coalescing for the Gigabit
> Ethernet drivers SYSTEMPORT and GENET.
> 
> This really helps lower the interrupt count and system load, as
> measured by vmstat for a Gigabit TCP RX session:

Looks good to me, series applied, thanks Florian.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing
  2018-03-23  1:19 [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing Florian Fainelli
                   ` (2 preceding siblings ...)
  2018-03-26  0:49 ` [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing David Miller
@ 2018-03-26 21:16 ` Tal Gilboa
  2018-03-26 22:04   ` Florian Fainelli
  3 siblings, 1 reply; 13+ messages in thread
From: Tal Gilboa @ 2018-03-26 21:16 UTC (permalink / raw)
  To: Florian Fainelli, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michal.chan, gospo, saeedm

On 3/23/2018 4:19 AM, Florian Fainelli wrote:
> Hi all,
> 
> This patch series adds adaptive interrupt coalescing for the Gigabit Ethernet
> drivers SYSTEMPORT and GENET.
> 
> This really helps lower the interrupt count and system load, as measured by
> vmstat for a Gigabit TCP RX session:

I don't see an improvement in system load, the opposite - 42% vs. 100% 
for SYSTEMPORT and 85% vs. 100% for GENET. Both with the same bandwidth. 
Am I missing something? Talking about bandwidth, I would expect 941Mb/s 
(assuming this is TCP over IPv4). Do you know why the reduced interrupt 
rate doesn't improve bandwidth? Also, any effect on the client side (you 
mentioned enabling TX moderation for SYSTEMPORT)?

> 
> SYSTEMPORT:
> 
> without:
> 
>   1  0      0 192188      0  25472    0    0     0     0 122100 38870  1 42 57  0  0
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-10.0 sec  1.03 GBytes   884 Mbits/sec
> 
> with:
> 
>   1  0      0 192288      0  25468    0    0     0     0 58806 44401  0 100  0  0  0
> [  5]  0.0-10.0 sec  1.04 GBytes   888 Mbits/sec
> 
> GENET:
> 
> without:
> 
>   1  0      0 1170404      0  25420    0    0     0     0 130785 63402  2 85 12  0  0
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-10.0 sec  1.04 GBytes   888 Mbits/sec
> 
> with:
> 
>   1  0      0 1170560      0  25420    0    0     0     0 50610 48477  0 100  0  0  0
> [  5]  0.0-10.0 sec  1.05 GBytes   899 Mbits/sec
> 
> Please look at the implementation and let me know if you see any problems, this
> was largely inspired by bnxt_en.
> 
> Thank you!
> 
> Florian Fainelli (2):
>    net: systemport: Implement adaptive interrupt coalescing
>    net: bcmgenet: Add support for adaptive RX coalescing
> 
>   drivers/net/ethernet/broadcom/bcmsysport.c     | 141 ++++++++++++++++++++++---
>   drivers/net/ethernet/broadcom/bcmsysport.h     |  14 +++
>   drivers/net/ethernet/broadcom/genet/bcmgenet.c | 109 +++++++++++++++----
>   drivers/net/ethernet/broadcom/genet/bcmgenet.h |  12 +++
>   4 files changed, 243 insertions(+), 33 deletions(-)
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 1/2] net: systemport: Implement adaptive interrupt coalescing
  2018-03-23  1:19 ` [PATCH net-next 1/2] net: systemport: Implement adaptive " Florian Fainelli
@ 2018-03-26 21:22   ` Tal Gilboa
  2018-03-26 21:36     ` Florian Fainelli
  0 siblings, 1 reply; 13+ messages in thread
From: Tal Gilboa @ 2018-03-26 21:22 UTC (permalink / raw)
  To: Florian Fainelli, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michal.chan, gospo, saeedm

On 3/23/2018 4:19 AM, Florian Fainelli wrote:
> Implement support for adaptive RX and TX interrupt coalescing using
> net_dim. We have each of our TX ring and our single RX ring implement a
> bcm_sysport_net_dim structure which holds an interrupt counter, number
> of packets, bytes, and a container for a net_dim instance.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>   drivers/net/ethernet/broadcom/bcmsysport.c | 141 ++++++++++++++++++++++++++---
>   drivers/net/ethernet/broadcom/bcmsysport.h |  14 +++
>   2 files changed, 140 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
> index f15a8fc6dfc9..5a5a726bafa4 100644
> --- a/drivers/net/ethernet/broadcom/bcmsysport.c
> +++ b/drivers/net/ethernet/broadcom/bcmsysport.c
> @@ -15,6 +15,7 @@
>   #include <linux/module.h>
>   #include <linux/kernel.h>
>   #include <linux/netdevice.h>
> +#include <linux/net_dim.h>

I don't think you need this include. You already include net_dim in 
bcmsysport.h and include the bcmsysport.h here.

>   #include <linux/etherdevice.h>
>   #include <linux/platform_device.h>
>   #include <linux/of.h>
> @@ -574,21 +575,55 @@ static int bcm_sysport_set_wol(struct net_device *dev,
>   	return 0;
>   }
>   
> +static void bcm_sysport_set_rx_coalesce(struct bcm_sysport_priv *priv)
> +{
> +	u32 reg;
> +
> +	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
> +	reg &= ~(RDMA_INTR_THRESH_MASK |
> +		 RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
> +	reg |= priv->dim.coal_pkts;
> +	reg |= DIV_ROUND_UP(priv->dim.coal_usecs * 1000, 8192) <<
> +			    RDMA_TIMEOUT_SHIFT;
> +	rdma_writel(priv, reg, RDMA_MBDONE_INTR);
> +}
> +
> +static void bcm_sysport_set_tx_coalesce(struct bcm_sysport_tx_ring *ring)
> +{
> +	struct bcm_sysport_priv *priv = ring->priv;
> +	u32 reg;
> +
> +	reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(ring->index));
> +	reg &= ~(RING_INTR_THRESH_MASK |
> +		 RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
> +	reg |= ring->dim.coal_pkts;
> +	reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192) <<
> +			    RING_TIMEOUT_SHIFT;
> +	tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(ring->index));
> +}
> +

I wouldn't couple these functions with dim. This implies dim is always 
used. IMO, would be more clear to use a generic method which takes usecs 
and packets as an argument.

>   static int bcm_sysport_get_coalesce(struct net_device *dev,
>   				    struct ethtool_coalesce *ec)
>   {
>   	struct bcm_sysport_priv *priv = netdev_priv(dev);
> +	struct bcm_sysport_tx_ring *ring;
> +	unsigned int i;
>   	u32 reg;
>   
>   	reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(0));
>   
>   	ec->tx_coalesce_usecs = (reg >> RING_TIMEOUT_SHIFT) * 8192 / 1000;
>   	ec->tx_max_coalesced_frames = reg & RING_INTR_THRESH_MASK;
> +	for (i = 0; i < dev->num_tx_queues; i++) {
> +		ring = &priv->tx_rings[i];
> +		ec->use_adaptive_tx_coalesce |= ring->dim.use_dim;
> +	}
>   
>   	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
>   
>   	ec->rx_coalesce_usecs = (reg >> RDMA_TIMEOUT_SHIFT) * 8192 / 1000;
>   	ec->rx_max_coalesced_frames = reg & RDMA_INTR_THRESH_MASK;
> +	ec->use_adaptive_rx_coalesce = priv->dim.use_dim;
>   
>   	return 0;
>   }
> @@ -597,8 +632,8 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
>   				    struct ethtool_coalesce *ec)
>   {
>   	struct bcm_sysport_priv *priv = netdev_priv(dev);
> +	struct bcm_sysport_tx_ring *ring;
>   	unsigned int i;
> -	u32 reg;
>   
>   	/* Base system clock is 125Mhz, DMA timeout is this reference clock
>   	 * divided by 1024, which yield roughly 8.192 us, our maximum value has
> @@ -615,22 +650,26 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
>   		return -EINVAL;
>   
>   	for (i = 0; i < dev->num_tx_queues; i++) {
> -		reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(i));
> -		reg &= ~(RING_INTR_THRESH_MASK |
> -			 RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
> -		reg |= ec->tx_max_coalesced_frames;
> -		reg |= DIV_ROUND_UP(ec->tx_coalesce_usecs * 1000, 8192) <<
> -			 RING_TIMEOUT_SHIFT;
> -		tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(i));
> +		ring = &priv->tx_rings[i];
> +		ring->dim.coal_pkts = ec->tx_max_coalesced_frames;
> +		ring->dim.coal_usecs = ec->tx_coalesce_usecs;
> +		if (!ec->use_adaptive_tx_coalesce && ring->dim.use_dim) {
> +			ring->dim.coal_pkts = 1;
> +			ring->dim.coal_usecs = 0;
> +		}
> +		ring->dim.use_dim = ec->use_adaptive_tx_coalesce;
> +		bcm_sysport_set_tx_coalesce(ring);
>   	}

If I understand correctly, if I disable dim, moderation is set to 
{usecs,packets}={0,1} regardless of the input from ethtool right? 
Doesn't this break the wanted behavior? As mentioned above, I would 
decouple dim from the set_tx/rx_coalesce() function. Also, when dim is 
enabled, why change dim.coal_pkts/usecs? They would just be overwritten 
in the next iteration of net_dim.

>   
> -	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
> -	reg &= ~(RDMA_INTR_THRESH_MASK |
> -		 RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
> -	reg |= ec->rx_max_coalesced_frames;
> -	reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192) <<
> -			    RDMA_TIMEOUT_SHIFT;
> -	rdma_writel(priv, reg, RDMA_MBDONE_INTR);
> +	priv->dim.coal_usecs = ec->rx_coalesce_usecs;
> +	priv->dim.coal_pkts = ec->rx_max_coalesced_frames;
> +
> +	if (!ec->use_adaptive_rx_coalesce && priv->dim.use_dim) {
> +		priv->dim.coal_pkts = 1;
> +		priv->dim.coal_usecs = 0;
> +	}
> +	priv->dim.use_dim = ec->use_adaptive_rx_coalesce;
> +	bcm_sysport_set_rx_coalesce(priv);

Same comment as above.

>   
>   	return 0;
>   }
> @@ -709,6 +748,7 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
>   	struct bcm_sysport_stats64 *stats64 = &priv->stats64;
>   	struct net_device *ndev = priv->netdev;
>   	unsigned int processed = 0, to_process;
> +	unsigned int processed_bytes = 0;
>   	struct bcm_sysport_cb *cb;
>   	struct sk_buff *skb;
>   	unsigned int p_index;
> @@ -800,6 +840,7 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
>   		 */
>   		skb_pull(skb, sizeof(*rsb) + 2);
>   		len -= (sizeof(*rsb) + 2);
> +		processed_bytes += len;
>   
>   		/* UniMAC may forward CRC */
>   		if (priv->crc_fwd) {
> @@ -824,6 +865,9 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
>   			priv->rx_read_ptr = 0;
>   	}
>   
> +	priv->dim.packets = processed;
> +	priv->dim.bytes = processed_bytes;
> +
>   	return processed;
>   }
>   
> @@ -900,6 +944,8 @@ static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
>   	ring->packets += pkts_compl;
>   	ring->bytes += bytes_compl;
>   	u64_stats_update_end(&priv->syncp);
> +	ring->dim.packets = pkts_compl;
> +	ring->dim.bytes = bytes_compl;
>   
>   	ring->c_index = c_index;
>   
> @@ -945,6 +991,7 @@ static int bcm_sysport_tx_poll(struct napi_struct *napi, int budget)
>   {
>   	struct bcm_sysport_tx_ring *ring =
>   		container_of(napi, struct bcm_sysport_tx_ring, napi);
> +	struct net_dim_sample dim_sample;
>   	unsigned int work_done = 0;
>   
>   	work_done = bcm_sysport_tx_reclaim(ring->priv, ring);
> @@ -961,6 +1008,12 @@ static int bcm_sysport_tx_poll(struct napi_struct *napi, int budget)
>   		return 0;
>   	}
>   
> +	if (ring->dim.use_dim) {
> +		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
> +			       ring->dim.bytes, &dim_sample);
> +		net_dim(&ring->dim.dim, dim_sample);
> +	}
> +
>   	return budget;
>   }
>   
> @@ -976,6 +1029,7 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
>   {
>   	struct bcm_sysport_priv *priv =
>   		container_of(napi, struct bcm_sysport_priv, napi);
> +	struct net_dim_sample dim_sample;
>   	unsigned int work_done = 0;
>   
>   	work_done = bcm_sysport_desc_rx(priv, budget);
> @@ -998,6 +1052,12 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
>   		intrl2_0_mask_clear(priv, INTRL2_0_RDMA_MBDONE);
>   	}
>   
> +	if (priv->dim.use_dim) {
> +		net_dim_sample(priv->dim.event_ctr, priv->dim.packets,
> +			       priv->dim.bytes, &dim_sample);
> +		net_dim(&priv->dim.dim, dim_sample);
> +	}
> +
>   	return work_done;
>   }
>   
> @@ -1016,6 +1076,40 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
>   	netif_dbg(priv, wol, priv->netdev, "resumed from WOL\n");
>   }
>   
> +static void bcm_sysport_dim_work(struct work_struct *work)
> +{
> +	struct net_dim *dim = container_of(work, struct net_dim, work);
> +	struct bcm_sysport_net_dim *ndim =
> +			container_of(dim, struct bcm_sysport_net_dim, dim);
> +	struct bcm_sysport_priv *priv =
> +			container_of(ndim, struct bcm_sysport_priv, dim);
> +	struct net_dim_cq_moder cur_profile =
> +				net_dim_get_profile(dim->mode, dim->profile_ix);
> +
> +	priv->dim.coal_usecs = cur_profile.usec;
> +	priv->dim.coal_pkts = cur_profile.pkts;
> +
> +	bcm_sysport_set_rx_coalesce(priv);
> +	dim->state = NET_DIM_START_MEASURE;
> +}
> +
> +static void bcm_sysport_dim_tx_work(struct work_struct *work)
> +{
> +	struct net_dim *dim = container_of(work, struct net_dim, work);
> +	struct bcm_sysport_net_dim *ndim =
> +			container_of(dim, struct bcm_sysport_net_dim, dim);
> +	struct bcm_sysport_tx_ring *ring =
> +			container_of(ndim, struct bcm_sysport_tx_ring, dim);
> +	struct net_dim_cq_moder cur_profile =
> +				net_dim_get_profile(dim->mode, dim->profile_ix);
> +
> +	ring->dim.coal_usecs = cur_profile.usec;
> +	ring->dim.coal_pkts = cur_profile.pkts;
> +
> +	bcm_sysport_set_tx_coalesce(ring);
> +	dim->state = NET_DIM_START_MEASURE;
> +}
> +
>   /* RX and misc interrupt routine */
>   static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
>   {
> @@ -1034,6 +1128,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
>   	}
>   
>   	if (priv->irq0_stat & INTRL2_0_RDMA_MBDONE) {
> +		priv->dim.event_ctr++;
>   		if (likely(napi_schedule_prep(&priv->napi))) {
>   			/* disable RX interrupts */
>   			intrl2_0_mask_set(priv, INTRL2_0_RDMA_MBDONE);
> @@ -1061,6 +1156,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
>   			continue;
>   
>   		txr = &priv->tx_rings[ring];
> +		txr->dim.event_ctr++;
>   
>   		if (likely(napi_schedule_prep(&txr->napi))) {
>   			intrl2_0_mask_set(priv, ring_bit);
> @@ -1093,6 +1189,7 @@ static irqreturn_t bcm_sysport_tx_isr(int irq, void *dev_id)
>   			continue;
>   
>   		txr = &priv->tx_rings[ring];
> +		txr->dim.event_ctr++;
>   
>   		if (likely(napi_schedule_prep(&txr->napi))) {
>   			intrl2_1_mask_set(priv, BIT(ring));
> @@ -1358,6 +1455,16 @@ static void bcm_sysport_adj_link(struct net_device *dev)
>   		phy_print_status(phydev);
>   }
>   
> +static void bcm_sysport_init_dim(struct bcm_sysport_net_dim *dim,
> +				 void (*cb)(struct work_struct *work))
> +{
> +	INIT_WORK(&dim->dim.work, cb);
> +	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
> +	dim->event_ctr = 0;
> +	dim->packets = 0;
> +	dim->bytes = 0;
> +}

What about default values for coal_usecs/pkts? dim supports it through 
net_dim_get_def_profile(mode) function.

> +
>   static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv,
>   				    unsigned int index)
>   {
> @@ -1447,6 +1554,7 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv,
>   	reg |= (1 << index);
>   	tdma_writel(priv, reg, TDMA_TIER1_ARB_0_QUEUE_EN);
>   
> +	bcm_sysport_init_dim(&ring->dim, bcm_sysport_dim_tx_work);
>   	napi_enable(&ring->napi);
>   
>   	netif_dbg(priv, hw, priv->netdev,
> @@ -1477,6 +1585,7 @@ static void bcm_sysport_fini_tx_ring(struct bcm_sysport_priv *priv,
>   		return;
>   
>   	napi_disable(&ring->napi);
> +	cancel_work_sync(&ring->dim.dim.work);
>   	netif_napi_del(&ring->napi);
>   
>   	bcm_sysport_tx_clean(priv, ring);
> @@ -1766,6 +1875,7 @@ static void bcm_sysport_netif_start(struct net_device *dev)
>   	struct bcm_sysport_priv *priv = netdev_priv(dev);
>   
>   	/* Enable NAPI */
> +	bcm_sysport_init_dim(&priv->dim, bcm_sysport_dim_work);
>   	napi_enable(&priv->napi);
>   
>   	/* Enable RX interrupt and TX ring full interrupt */
> @@ -1951,6 +2061,7 @@ static void bcm_sysport_netif_stop(struct net_device *dev)
>   	/* stop all software from updating hardware */
>   	netif_tx_stop_all_queues(dev);
>   	napi_disable(&priv->napi);
> +	cancel_work_sync(&priv->dim.dim.work);
>   	phy_stop(dev->phydev);
>   
>   	/* mask all interrupts */
> diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
> index f5a984c1c986..9f48ad3cc38d 100644
> --- a/drivers/net/ethernet/broadcom/bcmsysport.h
> +++ b/drivers/net/ethernet/broadcom/bcmsysport.h
> @@ -12,6 +12,7 @@
>   #define __BCM_SYSPORT_H
>   
>   #include <linux/if_vlan.h>
> +#include <linux/net_dim.h>
>   
>   /* Receive/transmit descriptor format */
>   #define DESC_ADDR_HI_STATUS_LEN	0x00
> @@ -695,6 +696,16 @@ struct bcm_sysport_hw_params {
>   	unsigned int	num_rx_desc_words;
>   };
>   
> +struct bcm_sysport_net_dim {
> +	u16			use_dim;
> +	u16			event_ctr;
> +	unsigned long		packets;
> +	unsigned long		bytes;
> +	u32			coal_usecs;
> +	u32			coal_pkts;
> +	struct net_dim		dim;
> +};
> +
>   /* Software view of the TX ring */
>   struct bcm_sysport_tx_ring {
>   	spinlock_t	lock;		/* Ring lock for tx reclaim/xmit */
> @@ -712,6 +723,7 @@ struct bcm_sysport_tx_ring {
>   	struct bcm_sysport_priv *priv;	/* private context backpointer */
>   	unsigned long	packets;	/* packets statistics */
>   	unsigned long	bytes;		/* bytes statistics */
> +	struct bcm_sysport_net_dim dim;	/* Net DIM context */
>   	unsigned int	switch_queue;	/* switch port queue number */
>   	unsigned int	switch_port;	/* switch port queue number */
>   	bool		inspect;	/* inspect switch port and queue */
> @@ -743,6 +755,8 @@ struct bcm_sysport_priv {
>   	unsigned int		rx_read_ptr;
>   	unsigned int		rx_c_index;
>   
> +	struct bcm_sysport_net_dim	dim;
> +
>   	/* PHY device */
>   	struct device_node	*phy_dn;
>   	phy_interface_t		phy_interface;
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/2] net: bcmgenet: Add support for adaptive RX coalescing
  2018-03-23  1:19 ` [PATCH net-next 2/2] net: bcmgenet: Add support for adaptive RX coalescing Florian Fainelli
@ 2018-03-26 21:23   ` Tal Gilboa
  0 siblings, 0 replies; 13+ messages in thread
From: Tal Gilboa @ 2018-03-26 21:23 UTC (permalink / raw)
  To: Florian Fainelli, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michal.chan, gospo, saeedm

On 3/23/2018 4:19 AM, Florian Fainelli wrote:
> Unlike the more modern SYSTEMPORT hardware, we do not have a
> configurable TDMA timeout, which limits us to implement adaptive RX
> interrupt coalescing only. We have each of our RX rings implement a
> bcmgenet_net_dim structure which holds an interrupt counter, number of
> packets, bytes, and a container for a net_dim instance.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>   drivers/net/ethernet/broadcom/genet/bcmgenet.c | 109 +++++++++++++++++++++----
>   drivers/net/ethernet/broadcom/genet/bcmgenet.h |  12 +++
>   2 files changed, 103 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> index b1e35a9accf1..7db8edc643ec 100644
> --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> @@ -603,6 +603,8 @@ static int bcmgenet_get_coalesce(struct net_device *dev,
>   				 struct ethtool_coalesce *ec)
>   {
>   	struct bcmgenet_priv *priv = netdev_priv(dev);
> +	struct bcmgenet_rx_ring *ring;
> +	unsigned int i;
>   
>   	ec->tx_max_coalesced_frames =
>   		bcmgenet_tdma_ring_readl(priv, DESC_INDEX,
> @@ -613,15 +615,37 @@ static int bcmgenet_get_coalesce(struct net_device *dev,
>   	ec->rx_coalesce_usecs =
>   		bcmgenet_rdma_readl(priv, DMA_RING16_TIMEOUT) * 8192 / 1000;
>   
> +	for (i = 0; i < priv->hw_params->rx_queues; i++) {
> +		ring = &priv->rx_rings[i];
> +		ec->use_adaptive_rx_coalesce |= ring->dim.use_dim;
> +	}
> +	ring = &priv->rx_rings[DESC_INDEX];
> +	ec->use_adaptive_rx_coalesce |= ring->dim.use_dim;
> +
>   	return 0;
>   }
>   
> +static void bcmgenet_set_rx_coalesce(struct bcmgenet_rx_ring *ring)
> +{
> +	struct bcmgenet_priv *priv = ring->priv;
> +	unsigned int i = ring->index;
> +	u32 reg;
> +
> +	bcmgenet_rdma_ring_writel(priv, i, ring->dim.coal_pkts,
> +				  DMA_MBUF_DONE_THRESH);
> +
> +	reg = bcmgenet_rdma_readl(priv, DMA_RING0_TIMEOUT + i);
> +	reg &= ~DMA_TIMEOUT_MASK;
> +	reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192);
> +	bcmgenet_rdma_writel(priv, reg, DMA_RING0_TIMEOUT + i);
> +}
> +

Similar comments from patch 1/2 apply here - I wouldn't couple the generic 
get/set_coalesce functions with dim.

>   static int bcmgenet_set_coalesce(struct net_device *dev,
>   				 struct ethtool_coalesce *ec)
>   {
>   	struct bcmgenet_priv *priv = netdev_priv(dev);
> +	struct bcmgenet_rx_ring *ring;
>   	unsigned int i;
> -	u32 reg;
>   
>   	/* Base system clock is 125Mhz, DMA timeout is this reference clock
>   	 * divided by 1024, which yields roughly 8.192us, our maximum value
> @@ -641,7 +665,8 @@ static int bcmgenet_set_coalesce(struct net_device *dev,
>   	 * transmitted, or when the ring is empty.
>   	 */
>   	if (ec->tx_coalesce_usecs || ec->tx_coalesce_usecs_high ||
> -	    ec->tx_coalesce_usecs_irq || ec->tx_coalesce_usecs_low)
> +	    ec->tx_coalesce_usecs_irq || ec->tx_coalesce_usecs_low ||
> +	    ec->use_adaptive_tx_coalesce)
>   		return -EOPNOTSUPP;
>   
>   	/* Program all TX queues with the same values, as there is no
> @@ -656,24 +681,26 @@ static int bcmgenet_set_coalesce(struct net_device *dev,
>   				  DMA_MBUF_DONE_THRESH);
>   
>   	for (i = 0; i < priv->hw_params->rx_queues; i++) {
> -		bcmgenet_rdma_ring_writel(priv, i,
> -					  ec->rx_max_coalesced_frames,
> -					  DMA_MBUF_DONE_THRESH);
> -
> -		reg = bcmgenet_rdma_readl(priv, DMA_RING0_TIMEOUT + i);
> -		reg &= ~DMA_TIMEOUT_MASK;
> -		reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192);
> -		bcmgenet_rdma_writel(priv, reg, DMA_RING0_TIMEOUT + i);
> +		ring = &priv->rx_rings[i];
> +		ring->dim.coal_usecs = ec->rx_coalesce_usecs;
> +		ring->dim.coal_pkts = ec->rx_max_coalesced_frames;
> +		if (!ec->use_adaptive_rx_coalesce && ring->dim.use_dim) {
> +			ring->dim.coal_pkts = 1;
> +			ring->dim.coal_usecs = 0;
> +		}
> +		ring->dim.use_dim = ec->use_adaptive_rx_coalesce;
> +		bcmgenet_set_rx_coalesce(ring);
>   	}
>   
> -	bcmgenet_rdma_ring_writel(priv, DESC_INDEX,
> -				  ec->rx_max_coalesced_frames,
> -				  DMA_MBUF_DONE_THRESH);
> -
> -	reg = bcmgenet_rdma_readl(priv, DMA_RING16_TIMEOUT);
> -	reg &= ~DMA_TIMEOUT_MASK;
> -	reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192);
> -	bcmgenet_rdma_writel(priv, reg, DMA_RING16_TIMEOUT);
> +	ring = &priv->rx_rings[DESC_INDEX];
> +	ring->dim.coal_usecs = ec->rx_coalesce_usecs;
> +	ring->dim.coal_pkts = ec->rx_max_coalesced_frames;
> +	if (!ec->use_adaptive_rx_coalesce && ring->dim.use_dim) {
> +		ring->dim.coal_pkts = 1;
> +		ring->dim.coal_usecs = 0;
> +	}
> +	ring->dim.use_dim = ec->use_adaptive_rx_coalesce;
> +	bcmgenet_set_rx_coalesce(ring);
>   
>   	return 0;
>   }
> @@ -1713,6 +1740,7 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
>   	unsigned long dma_flag;
>   	int len;
>   	unsigned int rxpktprocessed = 0, rxpkttoprocess;
> +	unsigned int bytes_processed = 0;
>   	unsigned int p_index, mask;
>   	unsigned int discards;
>   	unsigned int chksum_ok = 0;
> @@ -1832,6 +1860,8 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
>   			len -= ETH_FCS_LEN;
>   		}
>   
> +		bytes_processed += len;
> +
>   		/*Finish setting up the received SKB and send it to the kernel*/
>   		skb->protocol = eth_type_trans(skb, priv->dev);
>   		ring->packets++;
> @@ -1854,6 +1884,9 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
>   		bcmgenet_rdma_ring_writel(priv, ring->index, ring->c_index, RDMA_CONS_INDEX);
>   	}
>   
> +	ring->dim.bytes = bytes_processed;
> +	ring->dim.packets = rxpktprocessed;
> +
>   	return rxpktprocessed;
>   }
>   
> @@ -1862,6 +1895,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
>   {
>   	struct bcmgenet_rx_ring *ring = container_of(napi,
>   			struct bcmgenet_rx_ring, napi);
> +	struct net_dim_sample dim_sample;
>   	unsigned int work_done;
>   
>   	work_done = bcmgenet_desc_rx(ring, budget);
> @@ -1871,9 +1905,32 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
>   		ring->int_enable(ring);
>   	}
>   
> +	if (ring->dim.use_dim) {
> +		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
> +			       ring->dim.bytes, &dim_sample);
> +		net_dim(&ring->dim.dim, dim_sample);
> +	}
> +
>   	return work_done;
>   }
>   
> +static void bcmgenet_dim_work(struct work_struct *work)
> +{
> +	struct net_dim *dim = container_of(work, struct net_dim, work);
> +	struct bcmgenet_net_dim *ndim =
> +			container_of(dim, struct bcmgenet_net_dim, dim);
> +	struct bcmgenet_rx_ring *ring =
> +			container_of(ndim, struct bcmgenet_rx_ring, dim);
> +	struct net_dim_cq_moder cur_profile =
> +			net_dim_get_profile(dim->mode, dim->profile_ix);
> +
> +	ring->dim.coal_usecs = cur_profile.usec;
> +	ring->dim.coal_pkts = cur_profile.pkts;
> +
> +	bcmgenet_set_rx_coalesce(ring);
> +	dim->state = NET_DIM_START_MEASURE;
> +}
> +
>   /* Assign skb to RX DMA descriptor. */
>   static int bcmgenet_alloc_rx_buffers(struct bcmgenet_priv *priv,
>   				     struct bcmgenet_rx_ring *ring)
> @@ -2022,6 +2079,16 @@ static void init_umac(struct bcmgenet_priv *priv)
>   	dev_dbg(kdev, "done init umac\n");
>   }
>   
> +static void bcmgenet_init_dim(struct bcmgenet_net_dim *dim,
> +			      void (*cb)(struct work_struct *work))
> +{
> +	INIT_WORK(&dim->dim.work, cb);
> +	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
> +	dim->event_ctr = 0;
> +	dim->packets = 0;
> +	dim->bytes = 0;
> +}
> +

Similar comment from patch 1/2 applies here - default values for 
coal_usecs/pkts.

>   /* Initialize a Tx ring along with corresponding hardware registers */
>   static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
>   				  unsigned int index, unsigned int size,
> @@ -2111,6 +2178,8 @@ static int bcmgenet_init_rx_ring(struct bcmgenet_priv *priv,
>   	if (ret)
>   		return ret;
>   
> +	bcmgenet_init_dim(&ring->dim, bcmgenet_dim_work);
> +
>   	/* Initialize Rx NAPI */
>   	netif_napi_add(priv->dev, &ring->napi, bcmgenet_rx_poll,
>   		       NAPI_POLL_WEIGHT);
> @@ -2276,10 +2345,12 @@ static void bcmgenet_disable_rx_napi(struct bcmgenet_priv *priv)
>   	for (i = 0; i < priv->hw_params->rx_queues; ++i) {
>   		ring = &priv->rx_rings[i];
>   		napi_disable(&ring->napi);
> +		cancel_work_sync(&ring->dim.dim.work);
>   	}
>   
>   	ring = &priv->rx_rings[DESC_INDEX];
>   	napi_disable(&ring->napi);
> +	cancel_work_sync(&ring->dim.dim.work);
>   }
>   
>   static void bcmgenet_fini_rx_napi(struct bcmgenet_priv *priv)
> @@ -2557,6 +2628,7 @@ static irqreturn_t bcmgenet_isr1(int irq, void *dev_id)
>   			continue;
>   
>   		rx_ring = &priv->rx_rings[index];
> +		rx_ring->dim.event_ctr++;
>   
>   		if (likely(napi_schedule_prep(&rx_ring->napi))) {
>   			rx_ring->int_disable(rx_ring);
> @@ -2601,6 +2673,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
>   
>   	if (status & UMAC_IRQ_RXDMA_DONE) {
>   		rx_ring = &priv->rx_rings[DESC_INDEX];
> +		rx_ring->dim.event_ctr++;
>   
>   		if (likely(napi_schedule_prep(&rx_ring->napi))) {
>   			rx_ring->int_disable(rx_ring);
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
> index 3c50431ccd2a..22c41e0430fb 100644
> --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
> +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
> @@ -16,6 +16,7 @@
>   #include <linux/mii.h>
>   #include <linux/if_vlan.h>
>   #include <linux/phy.h>
> +#include <linux/net_dim.h>
>   
>   /* total number of Buffer Descriptors, same for Rx/Tx */
>   #define TOTAL_DESC				256
> @@ -572,6 +573,16 @@ struct bcmgenet_tx_ring {
>   	struct bcmgenet_priv *priv;
>   };
>   
> +struct bcmgenet_net_dim {
> +	u16		use_dim;
> +	u16		event_ctr;
> +	unsigned long	packets;
> +	unsigned long	bytes;
> +	u32		coal_usecs;
> +	u32		coal_pkts;
> +	struct net_dim	dim;
> +};
> +
>   struct bcmgenet_rx_ring {
>   	struct napi_struct napi;	/* Rx NAPI struct */
>   	unsigned long	bytes;
> @@ -586,6 +597,7 @@ struct bcmgenet_rx_ring {
>   	unsigned int	cb_ptr;		/* Rx ring initial CB ptr */
>   	unsigned int	end_ptr;	/* Rx ring end CB ptr */
>   	unsigned int	old_discards;
> +	struct bcmgenet_net_dim dim;
>   	void (*int_enable)(struct bcmgenet_rx_ring *);
>   	void (*int_disable)(struct bcmgenet_rx_ring *);
>   	struct bcmgenet_priv *priv;
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 1/2] net: systemport: Implement adaptive interrupt coalescing
  2018-03-26 21:22   ` Tal Gilboa
@ 2018-03-26 21:36     ` Florian Fainelli
  2018-03-26 23:07       ` Tal Gilboa
  0 siblings, 1 reply; 13+ messages in thread
From: Florian Fainelli @ 2018-03-26 21:36 UTC (permalink / raw)
  To: Tal Gilboa, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michal.chan, gospo, saeedm

On 03/26/2018 02:22 PM, Tal Gilboa wrote:
> On 3/23/2018 4:19 AM, Florian Fainelli wrote:
>> Implement support for adaptive RX and TX interrupt coalescing using
>> net_dim. We have each of our TX ring and our single RX ring implement a
>> bcm_sysport_net_dim structure which holds an interrupt counter, number
>> of packets, bytes, and a container for a net_dim instance.
>>
>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>> ---
>>   drivers/net/ethernet/broadcom/bcmsysport.c | 141
>> ++++++++++++++++++++++++++---
>>   drivers/net/ethernet/broadcom/bcmsysport.h |  14 +++
>>   2 files changed, 140 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c
>> b/drivers/net/ethernet/broadcom/bcmsysport.c
>> index f15a8fc6dfc9..5a5a726bafa4 100644
>> --- a/drivers/net/ethernet/broadcom/bcmsysport.c
>> +++ b/drivers/net/ethernet/broadcom/bcmsysport.c
>> @@ -15,6 +15,7 @@
>>   #include <linux/module.h>
>>   #include <linux/kernel.h>
>>   #include <linux/netdevice.h>
>> +#include <linux/net_dim.h>
> 
> I don't think you need this include. You already include net_dim in
> bcmsysport.h and include the bcmsysport.h here.

Indeed.

> 
>>   #include <linux/etherdevice.h>
>>   #include <linux/platform_device.h>
>>   #include <linux/of.h>
>> @@ -574,21 +575,55 @@ static int bcm_sysport_set_wol(struct net_device
>> *dev,
>>       return 0;
>>   }
>>   +static void bcm_sysport_set_rx_coalesce(struct bcm_sysport_priv *priv)
>> +{
>> +    u32 reg;
>> +
>> +    reg = rdma_readl(priv, RDMA_MBDONE_INTR);
>> +    reg &= ~(RDMA_INTR_THRESH_MASK |
>> +         RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
>> +    reg |= priv->dim.coal_pkts;
>> +    reg |= DIV_ROUND_UP(priv->dim.coal_usecs * 1000, 8192) <<
>> +                RDMA_TIMEOUT_SHIFT;
>> +    rdma_writel(priv, reg, RDMA_MBDONE_INTR);
>> +}
>> +
>> +static void bcm_sysport_set_tx_coalesce(struct bcm_sysport_tx_ring
>> *ring)
>> +{
>> +    struct bcm_sysport_priv *priv = ring->priv;
>> +    u32 reg;
>> +
>> +    reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(ring->index));
>> +    reg &= ~(RING_INTR_THRESH_MASK |
>> +         RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
>> +    reg |= ring->dim.coal_pkts;
>> +    reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192) <<
>> +                RING_TIMEOUT_SHIFT;
>> +    tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(ring->index));
>> +}
>> +
> 
> I wouldn't couple these functions with dim. This implies dim is always
> used. IMO, would be more clear to use a generic method which takes usecs
> and packets as an argument.

I did not want to create an additional structure for storing coalescing
parameters, but if you prefer I make this function take two parameters,
that sounds entirely reasonable.

> 
>>   static int bcm_sysport_get_coalesce(struct net_device *dev,
>>                       struct ethtool_coalesce *ec)
>>   {
>>       struct bcm_sysport_priv *priv = netdev_priv(dev);
>> +    struct bcm_sysport_tx_ring *ring;
>> +    unsigned int i;
>>       u32 reg;
>>         reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(0));
>>         ec->tx_coalesce_usecs = (reg >> RING_TIMEOUT_SHIFT) * 8192 /
>> 1000;
>>       ec->tx_max_coalesced_frames = reg & RING_INTR_THRESH_MASK;
>> +    for (i = 0; i < dev->num_tx_queues; i++) {
>> +        ring = &priv->tx_rings[i];
>> +        ec->use_adaptive_tx_coalesce |= ring->dim.use_dim;
>> +    }
>>         reg = rdma_readl(priv, RDMA_MBDONE_INTR);
>>         ec->rx_coalesce_usecs = (reg >> RDMA_TIMEOUT_SHIFT) * 8192 /
>> 1000;
>>       ec->rx_max_coalesced_frames = reg & RDMA_INTR_THRESH_MASK;
>> +    ec->use_adaptive_rx_coalesce = priv->dim.use_dim;
>>         return 0;
>>   }
>> @@ -597,8 +632,8 @@ static int bcm_sysport_set_coalesce(struct
>> net_device *dev,
>>                       struct ethtool_coalesce *ec)
>>   {
>>       struct bcm_sysport_priv *priv = netdev_priv(dev);
>> +    struct bcm_sysport_tx_ring *ring;
>>       unsigned int i;
>> -    u32 reg;
>>         /* Base system clock is 125Mhz, DMA timeout is this reference
>> clock
>>        * divided by 1024, which yield roughly 8.192 us, our maximum
>> value has
>> @@ -615,22 +650,26 @@ static int bcm_sysport_set_coalesce(struct
>> net_device *dev,
>>           return -EINVAL;
>>         for (i = 0; i < dev->num_tx_queues; i++) {
>> -        reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(i));
>> -        reg &= ~(RING_INTR_THRESH_MASK |
>> -             RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
>> -        reg |= ec->tx_max_coalesced_frames;
>> -        reg |= DIV_ROUND_UP(ec->tx_coalesce_usecs * 1000, 8192) <<
>> -             RING_TIMEOUT_SHIFT;
>> -        tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(i));
>> +        ring = &priv->tx_rings[i];
>> +        ring->dim.coal_pkts = ec->tx_max_coalesced_frames;
>> +        ring->dim.coal_usecs = ec->tx_coalesce_usecs;
>> +        if (!ec->use_adaptive_tx_coalesce && ring->dim.use_dim) {
>> +            ring->dim.coal_pkts = 1;
>> +            ring->dim.coal_usecs = 0;
>> +        }
>> +        ring->dim.use_dim = ec->use_adaptive_tx_coalesce;
>> +        bcm_sysport_set_tx_coalesce(ring);
>>       }
> 
> If I understand correctly, if I disable dim, moderation is set to
> {usecs,packets}={0,1} regardless of the input from ethtool right?

Correct, these are the default coalescing parameters that the driver
sets. As mentioned before, since I am not storing any coalescing
parameters other than these two, there is no copy of what a user might
have previously provided, falling back to the defaults seemed reasonable.

> Doesn't this break the wanted behavior? As mentioned above, I would
> decouple dim from the set_tx/rx_coalesce() function. Also, when dim is
> enabled, why change dim.coal_pkts/usecs? They would just be overwritten
> in the next iteration of net_dim.

Indeed, that is not necessary.

> 
>>   -    reg = rdma_readl(priv, RDMA_MBDONE_INTR);
>> -    reg &= ~(RDMA_INTR_THRESH_MASK |
>> -         RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
>> -    reg |= ec->rx_max_coalesced_frames;
>> -    reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192) <<
>> -                RDMA_TIMEOUT_SHIFT;
>> -    rdma_writel(priv, reg, RDMA_MBDONE_INTR);
>> +    priv->dim.coal_usecs = ec->rx_coalesce_usecs;
>> +    priv->dim.coal_pkts = ec->rx_max_coalesced_frames;
>> +
>> +    if (!ec->use_adaptive_rx_coalesce && priv->dim.use_dim) {
>> +        priv->dim.coal_pkts = 1;
>> +        priv->dim.coal_usecs = 0;
>> +    }
>> +    priv->dim.use_dim = ec->use_adaptive_rx_coalesce;
>> +    bcm_sysport_set_rx_coalesce(priv);
> 
> Same comment as above.
> 
>>         return 0;
>>   }
>> @@ -709,6 +748,7 @@ static unsigned int bcm_sysport_desc_rx(struct
>> bcm_sysport_priv *priv,
>>       struct bcm_sysport_stats64 *stats64 = &priv->stats64;
>>       struct net_device *ndev = priv->netdev;
>>       unsigned int processed = 0, to_process;
>> +    unsigned int processed_bytes = 0;
>>       struct bcm_sysport_cb *cb;
>>       struct sk_buff *skb;
>>       unsigned int p_index;
>> @@ -800,6 +840,7 @@ static unsigned int bcm_sysport_desc_rx(struct
>> bcm_sysport_priv *priv,
>>            */
>>           skb_pull(skb, sizeof(*rsb) + 2);
>>           len -= (sizeof(*rsb) + 2);
>> +        processed_bytes += len;
>>             /* UniMAC may forward CRC */
>>           if (priv->crc_fwd) {
>> @@ -824,6 +865,9 @@ static unsigned int bcm_sysport_desc_rx(struct
>> bcm_sysport_priv *priv,
>>               priv->rx_read_ptr = 0;
>>       }
>>   +    priv->dim.packets = processed;
>> +    priv->dim.bytes = processed_bytes;
>> +
>>       return processed;
>>   }
>>   @@ -900,6 +944,8 @@ static unsigned int
>> __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
>>       ring->packets += pkts_compl;
>>       ring->bytes += bytes_compl;
>>       u64_stats_update_end(&priv->syncp);
>> +    ring->dim.packets = pkts_compl;
>> +    ring->dim.bytes = bytes_compl;
>>         ring->c_index = c_index;
>>   @@ -945,6 +991,7 @@ static int bcm_sysport_tx_poll(struct
>> napi_struct *napi, int budget)
>>   {
>>       struct bcm_sysport_tx_ring *ring =
>>           container_of(napi, struct bcm_sysport_tx_ring, napi);
>> +    struct net_dim_sample dim_sample;
>>       unsigned int work_done = 0;
>>         work_done = bcm_sysport_tx_reclaim(ring->priv, ring);
>> @@ -961,6 +1008,12 @@ static int bcm_sysport_tx_poll(struct
>> napi_struct *napi, int budget)
>>           return 0;
>>       }
>>   +    if (ring->dim.use_dim) {
>> +        net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
>> +                   ring->dim.bytes, &dim_sample);
>> +        net_dim(&ring->dim.dim, dim_sample);
>> +    }
>> +
>>       return budget;
>>   }
>>   @@ -976,6 +1029,7 @@ static int bcm_sysport_poll(struct napi_struct
>> *napi, int budget)
>>   {
>>       struct bcm_sysport_priv *priv =
>>           container_of(napi, struct bcm_sysport_priv, napi);
>> +    struct net_dim_sample dim_sample;
>>       unsigned int work_done = 0;
>>         work_done = bcm_sysport_desc_rx(priv, budget);
>> @@ -998,6 +1052,12 @@ static int bcm_sysport_poll(struct napi_struct
>> *napi, int budget)
>>           intrl2_0_mask_clear(priv, INTRL2_0_RDMA_MBDONE);
>>       }
>>   +    if (priv->dim.use_dim) {
>> +        net_dim_sample(priv->dim.event_ctr, priv->dim.packets,
>> +                   priv->dim.bytes, &dim_sample);
>> +        net_dim(&priv->dim.dim, dim_sample);
>> +    }
>> +
>>       return work_done;
>>   }
>>   @@ -1016,6 +1076,40 @@ static void
>> bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
>>       netif_dbg(priv, wol, priv->netdev, "resumed from WOL\n");
>>   }
>>   +static void bcm_sysport_dim_work(struct work_struct *work)
>> +{
>> +    struct net_dim *dim = container_of(work, struct net_dim, work);
>> +    struct bcm_sysport_net_dim *ndim =
>> +            container_of(dim, struct bcm_sysport_net_dim, dim);
>> +    struct bcm_sysport_priv *priv =
>> +            container_of(ndim, struct bcm_sysport_priv, dim);
>> +    struct net_dim_cq_moder cur_profile =
>> +                net_dim_get_profile(dim->mode, dim->profile_ix);
>> +
>> +    priv->dim.coal_usecs = cur_profile.usec;
>> +    priv->dim.coal_pkts = cur_profile.pkts;
>> +
>> +    bcm_sysport_set_rx_coalesce(priv);
>> +    dim->state = NET_DIM_START_MEASURE;
>> +}
>> +
>> +static void bcm_sysport_dim_tx_work(struct work_struct *work)
>> +{
>> +    struct net_dim *dim = container_of(work, struct net_dim, work);
>> +    struct bcm_sysport_net_dim *ndim =
>> +            container_of(dim, struct bcm_sysport_net_dim, dim);
>> +    struct bcm_sysport_tx_ring *ring =
>> +            container_of(ndim, struct bcm_sysport_tx_ring, dim);
>> +    struct net_dim_cq_moder cur_profile =
>> +                net_dim_get_profile(dim->mode, dim->profile_ix);
>> +
>> +    ring->dim.coal_usecs = cur_profile.usec;
>> +    ring->dim.coal_pkts = cur_profile.pkts;
>> +
>> +    bcm_sysport_set_tx_coalesce(ring);
>> +    dim->state = NET_DIM_START_MEASURE;
>> +}
>> +
>>   /* RX and misc interrupt routine */
>>   static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
>>   {
>> @@ -1034,6 +1128,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq,
>> void *dev_id)
>>       }
>>         if (priv->irq0_stat & INTRL2_0_RDMA_MBDONE) {
>> +        priv->dim.event_ctr++;
>>           if (likely(napi_schedule_prep(&priv->napi))) {
>>               /* disable RX interrupts */
>>               intrl2_0_mask_set(priv, INTRL2_0_RDMA_MBDONE);
>> @@ -1061,6 +1156,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq,
>> void *dev_id)
>>               continue;
>>             txr = &priv->tx_rings[ring];
>> +        txr->dim.event_ctr++;
>>             if (likely(napi_schedule_prep(&txr->napi))) {
>>               intrl2_0_mask_set(priv, ring_bit);
>> @@ -1093,6 +1189,7 @@ static irqreturn_t bcm_sysport_tx_isr(int irq,
>> void *dev_id)
>>               continue;
>>             txr = &priv->tx_rings[ring];
>> +        txr->dim.event_ctr++;
>>             if (likely(napi_schedule_prep(&txr->napi))) {
>>               intrl2_1_mask_set(priv, BIT(ring));
>> @@ -1358,6 +1455,16 @@ static void bcm_sysport_adj_link(struct
>> net_device *dev)
>>           phy_print_status(phydev);
>>   }
>>   +static void bcm_sysport_init_dim(struct bcm_sysport_net_dim *dim,
>> +                 void (*cb)(struct work_struct *work))
>> +{
>> +    INIT_WORK(&dim->dim.work, cb);
>> +    dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
>> +    dim->event_ctr = 0;
>> +    dim->packets = 0;
>> +    dim->bytes = 0;
>> +}
> 
> What about default values for coal_usecs/pkts? dim supports it through
> net_dim_get_def_profile(mode) function.

OK, thanks I did not know that.
-- 
Florian

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing
  2018-03-26 21:16 ` Tal Gilboa
@ 2018-03-26 22:04   ` Florian Fainelli
  2018-03-26 22:29     ` Florian Fainelli
  0 siblings, 1 reply; 13+ messages in thread
From: Florian Fainelli @ 2018-03-26 22:04 UTC (permalink / raw)
  To: Tal Gilboa, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michal.chan, gospo, saeedm

On 03/26/2018 02:16 PM, Tal Gilboa wrote:
> On 3/23/2018 4:19 AM, Florian Fainelli wrote:
>> Hi all,
>>
>> This patch series adds adaptive interrupt coalescing for the Gigabit
>> Ethernet
>> drivers SYSTEMPORT and GENET.
>>
>> This really helps lower the interrupt count and system load, as
>> measured by
>> vmstat for a Gigabit TCP RX session:
> 
> I don't see an improvement in system load, the opposite - 42% vs. 100%
> for SYSTEMPORT and 85% vs. 100% for GENET. Both with the same bandwidth.

Looks like I did not extract the correct data; the load could spike in
both cases (with and without net_dim) up to 100, but averaged over the
transmission I see the following:

GENET without:
 1  0      0 1169568      0  25556    0    0     0     0 130079 62795  2
86 13  0  0

GENET with:
 1  0      0 1169536      0  25556    0    0     0     0 10566 10869  1
21 78  0  0

> Am I missing something? Talking about bandwidth, I would expect 941Mb/s
> (assuming this is TCP over IPv4). Do you know why the reduced interrupt
> rate doesn't improve bandwidth?

I am assuming that this comes down to latency; I am still capturing some
pcap files to analyze the TCP session with wireshark and see if that is
indeed what is going on. The test machine is actually not that great

> Also, any effect on the client side (you
> mentioned enabling TX moderation for SYSTEMPORT)?

Yes, on SYSTEMPORT, being the TCP IPv4 client, I have the following:

SYSTEMPORT without:
 2  0      0 191428      0  25748    0    0     0     0 86254  264  0 41
59  0  0

SYSTEMPORT with:
 3  0      0 190176      0  25748    0    0     0     0 45485 31332  0
100  0  0  0

I don't get top to agree with these load results though but it looks
like we just have the CPU spinning more, does not look like a win.

Thanks a lot for taking a look at this Tal!
-- 
Florian

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing
  2018-03-26 22:04   ` Florian Fainelli
@ 2018-03-26 22:29     ` Florian Fainelli
  2018-03-26 23:21       ` Tal Gilboa
  0 siblings, 1 reply; 13+ messages in thread
From: Florian Fainelli @ 2018-03-26 22:29 UTC (permalink / raw)
  To: Tal Gilboa, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michael.chan, gospo, saeedm

On 03/26/2018 03:04 PM, Florian Fainelli wrote:
> On 03/26/2018 02:16 PM, Tal Gilboa wrote:
>> On 3/23/2018 4:19 AM, Florian Fainelli wrote:
>>> Hi all,
>>>
>>> This patch series adds adaptive interrupt coalescing for the Gigabit
>>> Ethernet
>>> drivers SYSTEMPORT and GENET.
>>>
>>> This really helps lower the interrupt count and system load, as
>>> measured by
>>> vmstat for a Gigabit TCP RX session:
>>
>> I don't see an improvement in system load, the opposite - 42% vs. 100%
>> for SYSTEMPORT and 85% vs. 100% for GENET. Both with the same bandwidth.
> 
> Looks like I did not extract the correct data the load could spike in
> both cases (with and without net_dim) up to 100, but averaged over the
> transmission I see the following:
> 
> GENET without:
>  1  0      0 1169568      0  25556    0    0     0     0 130079 62795  2
> 86 13  0  0
> 
> GENET with:
>  1  0      0 1169536      0  25556    0    0     0     0 10566 10869  1
> 21 78  0  0
> 
>> Am I missing something? Talking about bandwidth, I would expect 941Mb/s
>> (assuming this is TCP over IPv4). Do you know why the reduced interrupt
>> rate doesn't improve bandwidth?
> 
> I am assuming that this comes down to a latency, still capturing some
> pcap files to analyze the TCP session with wireshark and see if that is
> indeed what is going on. The test machine is actually not that great
> 
>> Also, any effect on the client side (you
>> mentioned enabling TX moderation for SYSTEMPORT)?
> 
> Yes, on SYSTEMPORT, being the TCP IPv4 client, I have the following:
> 
> SYSTEMPORT without:
>  2  0      0 191428      0  25748    0    0     0     0 86254  264  0 41
> 59  0  0
> 
> SYSTEMPORT with:
>  3  0      0 190176      0  25748    0    0     0     0 45485 31332  0
> 100  0  0  0
> 
> I don't get top to agree with these load results though but it looks
> like we just have the CPU spinning more, does not look like a win.

The problem appears to be the timeout selection on TX, ignoring it
completely allows us to keep the load average down while maintaining the
bandwidth. Looks like NAPI on TX already does a good job, so interrupt
mitigation on TX is not such a great idea actually...

Also, doing UDP TX tests shows that we can lower the interrupt count by
setting an appropriate tx-frames (as expected), but we won't be lowering
the CPU load since that is inherently a CPU intensive work. Past
tx-frames=64, the bandwidth completely drops because that would be 1/2
of the ring size.
-- 
Florian

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 1/2] net: systemport: Implement adaptive interrupt coalescing
  2018-03-26 21:36     ` Florian Fainelli
@ 2018-03-26 23:07       ` Tal Gilboa
  0 siblings, 0 replies; 13+ messages in thread
From: Tal Gilboa @ 2018-03-26 23:07 UTC (permalink / raw)
  To: Florian Fainelli, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michal.chan, gospo, saeedm

On 3/27/2018 12:36 AM, Florian Fainelli wrote:
> On 03/26/2018 02:22 PM, Tal Gilboa wrote:
>> On 3/23/2018 4:19 AM, Florian Fainelli wrote:
>>> Implement support for adaptive RX and TX interrupt coalescing using
>>> net_dim. We have each of our TX ring and our single RX ring implement a
>>> bcm_sysport_net_dim structure which holds an interrupt counter, number
>>> of packets, bytes, and a container for a net_dim instance.
>>>
>>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>>> ---
>>>    drivers/net/ethernet/broadcom/bcmsysport.c | 141
>>> ++++++++++++++++++++++++++---
>>>    drivers/net/ethernet/broadcom/bcmsysport.h |  14 +++
>>>    2 files changed, 140 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c
>>> b/drivers/net/ethernet/broadcom/bcmsysport.c
>>> index f15a8fc6dfc9..5a5a726bafa4 100644
>>> --- a/drivers/net/ethernet/broadcom/bcmsysport.c
>>> +++ b/drivers/net/ethernet/broadcom/bcmsysport.c
>>> @@ -15,6 +15,7 @@
>>>    #include <linux/module.h>
>>>    #include <linux/kernel.h>
>>>    #include <linux/netdevice.h>
>>> +#include <linux/net_dim.h>
>>
>> I don't think you need this include. You already include net_dim in
>> bcmsysport.h and include the bcmsysport.h here.
> 
> Indeed.
> 
>>
>>>    #include <linux/etherdevice.h>
>>>    #include <linux/platform_device.h>
>>>    #include <linux/of.h>
>>> @@ -574,21 +575,55 @@ static int bcm_sysport_set_wol(struct net_device
>>> *dev,
>>>        return 0;
>>>    }
>>>    +static void bcm_sysport_set_rx_coalesce(struct bcm_sysport_priv *priv)
>>> +{
>>> +    u32 reg;
>>> +
>>> +    reg = rdma_readl(priv, RDMA_MBDONE_INTR);
>>> +    reg &= ~(RDMA_INTR_THRESH_MASK |
>>> +         RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
>>> +    reg |= priv->dim.coal_pkts;
>>> +    reg |= DIV_ROUND_UP(priv->dim.coal_usecs * 1000, 8192) <<
>>> +                RDMA_TIMEOUT_SHIFT;
>>> +    rdma_writel(priv, reg, RDMA_MBDONE_INTR);
>>> +}
>>> +
>>> +static void bcm_sysport_set_tx_coalesce(struct bcm_sysport_tx_ring
>>> *ring)
>>> +{
>>> +    struct bcm_sysport_priv *priv = ring->priv;
>>> +    u32 reg;
>>> +
>>> +    reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(ring->index));
>>> +    reg &= ~(RING_INTR_THRESH_MASK |
>>> +         RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
>>> +    reg |= ring->dim.coal_pkts;
>>> +    reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192) <<
>>> +                RING_TIMEOUT_SHIFT;
>>> +    tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(ring->index));
>>> +}
>>> +
>>
>> I wouldn't couple these functions with dim. This implies dim is always
>> used. IMO, would be more clear to use a generic method which takes usecs
>> and packets as an argument.
> 
> I did not want to create an additional structure for storing coalescing
> parameters, but if you prefer I make this function take two parameters,
> that sounds entirely reasonable.
> 
>>
>>>    static int bcm_sysport_get_coalesce(struct net_device *dev,
>>>                        struct ethtool_coalesce *ec)
>>>    {
>>>        struct bcm_sysport_priv *priv = netdev_priv(dev);
>>> +    struct bcm_sysport_tx_ring *ring;
>>> +    unsigned int i;
>>>        u32 reg;
>>>          reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(0));
>>>          ec->tx_coalesce_usecs = (reg >> RING_TIMEOUT_SHIFT) * 8192 /
>>> 1000;
>>>        ec->tx_max_coalesced_frames = reg & RING_INTR_THRESH_MASK;
>>> +    for (i = 0; i < dev->num_tx_queues; i++) {
>>> +        ring = &priv->tx_rings[i];
>>> +        ec->use_adaptive_tx_coalesce |= ring->dim.use_dim;
>>> +    }
>>>          reg = rdma_readl(priv, RDMA_MBDONE_INTR);
>>>          ec->rx_coalesce_usecs = (reg >> RDMA_TIMEOUT_SHIFT) * 8192 /
>>> 1000;
>>>        ec->rx_max_coalesced_frames = reg & RDMA_INTR_THRESH_MASK;
>>> +    ec->use_adaptive_rx_coalesce = priv->dim.use_dim;
>>>          return 0;
>>>    }
>>> @@ -597,8 +632,8 @@ static int bcm_sysport_set_coalesce(struct
>>> net_device *dev,
>>>                        struct ethtool_coalesce *ec)
>>>    {
>>>        struct bcm_sysport_priv *priv = netdev_priv(dev);
>>> +    struct bcm_sysport_tx_ring *ring;
>>>        unsigned int i;
>>> -    u32 reg;
>>>          /* Base system clock is 125Mhz, DMA timeout is this reference
>>> clock
>>>         * divided by 1024, which yield roughly 8.192 us, our maximum
>>> value has
>>> @@ -615,22 +650,26 @@ static int bcm_sysport_set_coalesce(struct
>>> net_device *dev,
>>>            return -EINVAL;
>>>          for (i = 0; i < dev->num_tx_queues; i++) {
>>> -        reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(i));
>>> -        reg &= ~(RING_INTR_THRESH_MASK |
>>> -             RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
>>> -        reg |= ec->tx_max_coalesced_frames;
>>> -        reg |= DIV_ROUND_UP(ec->tx_coalesce_usecs * 1000, 8192) <<
>>> -             RING_TIMEOUT_SHIFT;
>>> -        tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(i));
>>> +        ring = &priv->tx_rings[i];
>>> +        ring->dim.coal_pkts = ec->tx_max_coalesced_frames;
>>> +        ring->dim.coal_usecs = ec->tx_coalesce_usecs;
>>> +        if (!ec->use_adaptive_tx_coalesce && ring->dim.use_dim) {
>>> +            ring->dim.coal_pkts = 1;
>>> +            ring->dim.coal_usecs = 0;
>>> +        }
>>> +        ring->dim.use_dim = ec->use_adaptive_tx_coalesce;
>>> +        bcm_sysport_set_tx_coalesce(ring);
>>>        }
>>
>> If I understand correctly, if I disable dim, moderation is set to
>> {usecs,packets}={0,1} regardless of the input from ethtool right?
> 
> Correct, these are the default coalescing parameters that the driver
> sets. As mentioned before, since I am not storing any coalescing
> parameters other than these two, there is no copy of what an user might
> have previously provided, falling back to the defaults seemed reasonable.

Consider this example: ethtool -C <intf> adaptive-tx on; ethtool -C 
<intf> adaptive-tx off tx-usecs 8 tx-frames 32;
In this case the actual moderation would be {0,1} instead of the 
requested {8,32}. Setting default values is ok unless requested 
otherwise. I would also use macros for default values.

> 
>> Doesn't this break the wanted behavior? As mentioned above, I would
>> decouple dim from the set_tx/rx_coalesce() function. Also, when dim is
>> enabled, why change dim.coal_pkts/usecs? They would just be overwritten
>> in the next iteration of net_dim.
> 
> Indeed, that is not necessary.
> 
>>
>>>    -    reg = rdma_readl(priv, RDMA_MBDONE_INTR);
>>> -    reg &= ~(RDMA_INTR_THRESH_MASK |
>>> -         RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
>>> -    reg |= ec->rx_max_coalesced_frames;
>>> -    reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192) <<
>>> -                RDMA_TIMEOUT_SHIFT;
>>> -    rdma_writel(priv, reg, RDMA_MBDONE_INTR);
>>> +    priv->dim.coal_usecs = ec->rx_coalesce_usecs;
>>> +    priv->dim.coal_pkts = ec->rx_max_coalesced_frames;
>>> +
>>> +    if (!ec->use_adaptive_rx_coalesce && priv->dim.use_dim) {
>>> +        priv->dim.coal_pkts = 1;
>>> +        priv->dim.coal_usecs = 0;
>>> +    }
>>> +    priv->dim.use_dim = ec->use_adaptive_rx_coalesce;
>>> +    bcm_sysport_set_rx_coalesce(priv);
>>
>> Same comment as above.
>>
>>>          return 0;
>>>    }
>>> @@ -709,6 +748,7 @@ static unsigned int bcm_sysport_desc_rx(struct
>>> bcm_sysport_priv *priv,
>>>        struct bcm_sysport_stats64 *stats64 = &priv->stats64;
>>>        struct net_device *ndev = priv->netdev;
>>>        unsigned int processed = 0, to_process;
>>> +    unsigned int processed_bytes = 0;
>>>        struct bcm_sysport_cb *cb;
>>>        struct sk_buff *skb;
>>>        unsigned int p_index;
>>> @@ -800,6 +840,7 @@ static unsigned int bcm_sysport_desc_rx(struct
>>> bcm_sysport_priv *priv,
>>>             */
>>>            skb_pull(skb, sizeof(*rsb) + 2);
>>>            len -= (sizeof(*rsb) + 2);
>>> +        processed_bytes += len;
>>>              /* UniMAC may forward CRC */
>>>            if (priv->crc_fwd) {
>>> @@ -824,6 +865,9 @@ static unsigned int bcm_sysport_desc_rx(struct
>>> bcm_sysport_priv *priv,
>>>                priv->rx_read_ptr = 0;
>>>        }
>>>    +    priv->dim.packets = processed;
>>> +    priv->dim.bytes = processed_bytes;
>>> +
>>>        return processed;
>>>    }
>>>    @@ -900,6 +944,8 @@ static unsigned int
>>> __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
>>>        ring->packets += pkts_compl;
>>>        ring->bytes += bytes_compl;
>>>        u64_stats_update_end(&priv->syncp);
>>> +    ring->dim.packets = pkts_compl;
>>> +    ring->dim.bytes = bytes_compl;
>>>          ring->c_index = c_index;
>>>    @@ -945,6 +991,7 @@ static int bcm_sysport_tx_poll(struct
>>> napi_struct *napi, int budget)
>>>    {
>>>        struct bcm_sysport_tx_ring *ring =
>>>            container_of(napi, struct bcm_sysport_tx_ring, napi);
>>> +    struct net_dim_sample dim_sample;
>>>        unsigned int work_done = 0;
>>>          work_done = bcm_sysport_tx_reclaim(ring->priv, ring);
>>> @@ -961,6 +1008,12 @@ static int bcm_sysport_tx_poll(struct
>>> napi_struct *napi, int budget)
>>>            return 0;
>>>        }
>>>    +    if (ring->dim.use_dim) {
>>> +        net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
>>> +                   ring->dim.bytes, &dim_sample);
>>> +        net_dim(&ring->dim.dim, dim_sample);
>>> +    }
>>> +
>>>        return budget;
>>>    }
>>>    @@ -976,6 +1029,7 @@ static int bcm_sysport_poll(struct napi_struct
>>> *napi, int budget)
>>>    {
>>>        struct bcm_sysport_priv *priv =
>>>            container_of(napi, struct bcm_sysport_priv, napi);
>>> +    struct net_dim_sample dim_sample;
>>>        unsigned int work_done = 0;
>>>          work_done = bcm_sysport_desc_rx(priv, budget);
>>> @@ -998,6 +1052,12 @@ static int bcm_sysport_poll(struct napi_struct
>>> *napi, int budget)
>>>            intrl2_0_mask_clear(priv, INTRL2_0_RDMA_MBDONE);
>>>        }
>>>    +    if (priv->dim.use_dim) {
>>> +        net_dim_sample(priv->dim.event_ctr, priv->dim.packets,
>>> +                   priv->dim.bytes, &dim_sample);
>>> +        net_dim(&priv->dim.dim, dim_sample);
>>> +    }
>>> +
>>>        return work_done;
>>>    }
>>>    @@ -1016,6 +1076,40 @@ static void
>>> bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
>>>        netif_dbg(priv, wol, priv->netdev, "resumed from WOL\n");
>>>    }
>>>    +static void bcm_sysport_dim_work(struct work_struct *work)
>>> +{
>>> +    struct net_dim *dim = container_of(work, struct net_dim, work);
>>> +    struct bcm_sysport_net_dim *ndim =
>>> +            container_of(dim, struct bcm_sysport_net_dim, dim);
>>> +    struct bcm_sysport_priv *priv =
>>> +            container_of(ndim, struct bcm_sysport_priv, dim);
>>> +    struct net_dim_cq_moder cur_profile =
>>> +                net_dim_get_profile(dim->mode, dim->profile_ix);
>>> +
>>> +    priv->dim.coal_usecs = cur_profile.usec;
>>> +    priv->dim.coal_pkts = cur_profile.pkts;
>>> +
>>> +    bcm_sysport_set_rx_coalesce(priv);
>>> +    dim->state = NET_DIM_START_MEASURE;
>>> +}
>>> +
>>> +static void bcm_sysport_dim_tx_work(struct work_struct *work)
>>> +{
>>> +    struct net_dim *dim = container_of(work, struct net_dim, work);
>>> +    struct bcm_sysport_net_dim *ndim =
>>> +            container_of(dim, struct bcm_sysport_net_dim, dim);
>>> +    struct bcm_sysport_tx_ring *ring =
>>> +            container_of(ndim, struct bcm_sysport_tx_ring, dim);
>>> +    struct net_dim_cq_moder cur_profile =
>>> +                net_dim_get_profile(dim->mode, dim->profile_ix);
>>> +
>>> +    ring->dim.coal_usecs = cur_profile.usec;
>>> +    ring->dim.coal_pkts = cur_profile.pkts;
>>> +
>>> +    bcm_sysport_set_tx_coalesce(ring);
>>> +    dim->state = NET_DIM_START_MEASURE;
>>> +}
>>> +
>>>    /* RX and misc interrupt routine */
>>>    static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
>>>    {
>>> @@ -1034,6 +1128,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq,
>>> void *dev_id)
>>>        }
>>>          if (priv->irq0_stat & INTRL2_0_RDMA_MBDONE) {
>>> +        priv->dim.event_ctr++;
>>>            if (likely(napi_schedule_prep(&priv->napi))) {
>>>                /* disable RX interrupts */
>>>                intrl2_0_mask_set(priv, INTRL2_0_RDMA_MBDONE);
>>> @@ -1061,6 +1156,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq,
>>> void *dev_id)
>>>                continue;
>>>              txr = &priv->tx_rings[ring];
>>> +        txr->dim.event_ctr++;
>>>              if (likely(napi_schedule_prep(&txr->napi))) {
>>>                intrl2_0_mask_set(priv, ring_bit);
>>> @@ -1093,6 +1189,7 @@ static irqreturn_t bcm_sysport_tx_isr(int irq,
>>> void *dev_id)
>>>                continue;
>>>              txr = &priv->tx_rings[ring];
>>> +        txr->dim.event_ctr++;
>>>              if (likely(napi_schedule_prep(&txr->napi))) {
>>>                intrl2_1_mask_set(priv, BIT(ring));
>>> @@ -1358,6 +1455,16 @@ static void bcm_sysport_adj_link(struct
>>> net_device *dev)
>>>            phy_print_status(phydev);
>>>    }
>>>    +static void bcm_sysport_init_dim(struct bcm_sysport_net_dim *dim,
>>> +                 void (*cb)(struct work_struct *work))
>>> +{
>>> +    INIT_WORK(&dim->dim.work, cb);
>>> +    dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
>>> +    dim->event_ctr = 0;
>>> +    dim->packets = 0;
>>> +    dim->bytes = 0;
>>> +}
>>
>> What about default values for coal_usecs/pkts? dim supports it through
>> net_dim_get_def_profile(mode) function.
> 
> OK, thanks I did not know that.
> 

I'll add it to the documentation.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing
  2018-03-26 22:29     ` Florian Fainelli
@ 2018-03-26 23:21       ` Tal Gilboa
  2018-03-26 23:40         ` Florian Fainelli
  0 siblings, 1 reply; 13+ messages in thread
From: Tal Gilboa @ 2018-03-26 23:21 UTC (permalink / raw)
  To: Florian Fainelli, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michael.chan, gospo, saeedm

On 3/27/2018 1:29 AM, Florian Fainelli wrote:
> On 03/26/2018 03:04 PM, Florian Fainelli wrote:
>> On 03/26/2018 02:16 PM, Tal Gilboa wrote:
>>> On 3/23/2018 4:19 AM, Florian Fainelli wrote:
>>>> Hi all,
>>>>
>>>> This patch series adds adaptive interrupt coalescing for the Gigabit
>>>> Ethernet
>>>> drivers SYSTEMPORT and GENET.
>>>>
>>>> This really helps lower the interrupt count and system load, as
>>>> measured by
>>>> vmstat for a Gigabit TCP RX session:
>>>
>>> I don't see an improvement in system load, the opposite - 42% vs. 100%
>>> for SYSTEMPORT and 85% vs. 100% for GENET. Both with the same bandwidth.
>>
>> Looks like I did not extract the correct data; the load could spike in
>> both cases (with and without net_dim) up to 100, but averaged over the
>> transmission I see the following:
>>
>> GENET without:
>>   1  0      0 1169568      0  25556    0    0     0     0 130079 62795  2
>> 86 13  0  0
>>
>> GENET with:
>>   1  0      0 1169536      0  25556    0    0     0     0 10566 10869  1
>> 21 78  0  0
>>
>>> Am I missing something? Talking about bandwidth, I would expect 941Mb/s
>>> (assuming this is TCP over IPv4). Do you know why the reduced interrupt
>>> rate doesn't improve bandwidth?
>>
>> I am assuming that this comes down to latency; I am still capturing some
>> pcap files to analyze the TCP session with wireshark and see if that is
>> indeed what is going on. The test machine is actually not that great

I would expect 1GbE full wire speed on almost any setup. I'll try 
applying your code on my setup and see what I get.

>>
>>> Also, any effect on the client side (you
>>> mentioned enabling TX moderation for SYSTEMPORT)?
>>
>> Yes, on SYSTEMPORT, being the TCP IPv4 client, I have the following:
>>
>> SYSTEMPORT without:
>>   2  0      0 191428      0  25748    0    0     0     0 86254  264  0 41
>> 59  0  0
>>
>> SYSTEMPORT with:
>>   3  0      0 190176      0  25748    0    0     0     0 45485 31332  0
>> 100  0  0  0
>>
>> I don't get top to agree with these load results though but it looks
>> like we just have the CPU spinning more, does not look like a win.
> 
> The problem appears to be the timeout selection on TX, ignoring it
> completely allows us to keep the load average down while maintaining the
> bandwidth. Looks like NAPI on TX already does a good job, so interrupt
> mitigation on TX is not such a great idea actually...

I saw a similar behavior for TX. For me the issue was too many 
outstanding bytes without a completion (defined to be 256KB by sysctl 
net.ipv4.tcp_limit_output_bytes). I tested on a 100GbE connection so 
with reasonable timeout values I already waited too long (4 TSO 
sessions). For the 1GbE case this might have no effect since you need a 
very long timeout. I'm currently working on adding TX support for dim. 
If you don't see a good benefit currently you might want to wait a 
little with TX adaptive interrupt moderation. Maybe only adjust static 
moderation for now?

> 
> Also, doing UDP TX tests shows that we can lower the interrupt count by
> setting an appropriate tx-frames (as expected), but we won't be lowering
> the CPU load since that is inherently a CPU intensive work. Past

Do you see higher TX UDP bandwidth? If you are bounded by CPU on both 
cases I would at least expect higher bandwidth with less interrupts 
since you reduce work from the CPU.

> tx-frames=64, the bandwidth completely drops because that would be 1/2
> of the ring size.
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing
  2018-03-26 23:21       ` Tal Gilboa
@ 2018-03-26 23:40         ` Florian Fainelli
  0 siblings, 0 replies; 13+ messages in thread
From: Florian Fainelli @ 2018-03-26 23:40 UTC (permalink / raw)
  To: Tal Gilboa, netdev
  Cc: davem, jaedon.shin, pgynther, opendmb, michael.chan, gospo, saeedm

On 03/26/2018 04:21 PM, Tal Gilboa wrote:
> On 3/27/2018 1:29 AM, Florian Fainelli wrote:
>> On 03/26/2018 03:04 PM, Florian Fainelli wrote:
>>> On 03/26/2018 02:16 PM, Tal Gilboa wrote:
>>>> On 3/23/2018 4:19 AM, Florian Fainelli wrote:
>>>>> Hi all,
>>>>>
>>>>> This patch series adds adaptive interrupt coalescing for the Gigabit
>>>>> Ethernet
>>>>> drivers SYSTEMPORT and GENET.
>>>>>
>>>>> This really helps lower the interrupt count and system load, as
>>>>> measured by
>>>>> vmstat for a Gigabit TCP RX session:
>>>>
>>>> I don't see an improvement in system load, the opposite - 42% vs. 100%
>>>> for SYSTEMPORT and 85% vs. 100% for GENET. Both with the same
>>>> bandwidth.
>>>
>>> Looks like I did not extract the correct data; the load could spike in
>>> both cases (with and without net_dim) up to 100, but averaged over the
>>> transmission I see the following:
>>>
>>> GENET without:
>>>   1  0      0 1169568      0  25556    0    0     0     0 130079
>>> 62795  2
>>> 86 13  0  0
>>>
>>> GENET with:
>>>   1  0      0 1169536      0  25556    0    0     0     0 10566 10869  1
>>> 21 78  0  0
>>>
>>>> Am I missing something? Talking about bandwidth, I would expect 941Mb/s
>>>> (assuming this is TCP over IPv4). Do you know why the reduced interrupt
>>>> rate doesn't improve bandwidth?
>>>
>>> I am assuming that this comes down to a latency, still capturing some
>>> pcap files to analyze the TCP session with wireshark and see if that is
>>> indeed what is going on. The test machine is actually not that great
> 
> I would expect 1GbE full wire speed on almost any setup. I'll try
> applying your code on my setup and see what I get.

The test machine that I am using appears to be loaded by other
non-networking workload which perturbs the tests I am running; other
than that, I agree, wire speed should be expected.

> 
>>>
>>>> Also, any effect on the client side (you
>>>> mentioned enabling TX moderation for SYSTEMPORT)?
>>>
>>> Yes, on SYSTEMPORT, being the TCP IPv4 client, I have the following:
>>>
>>> SYSTEMPORT without:
>>>   2  0      0 191428      0  25748    0    0     0     0 86254  264 
>>> 0 41
>>> 59  0  0
>>>
>>> SYSTEMPORT with:
>>>   3  0      0 190176      0  25748    0    0     0     0 45485 31332  0
>>> 100  0  0  0
>>>
>>> I don't get top to agree with these load results though but it looks
>>> like we just have the CPU spinning more, does not look like a win.
>>
>> The problem appears to be the timeout selection on TX, ignoring it
>> completely allows us to keep the load average down while maintaining the
>> bandwidth. Looks like NAPI on TX already does a good job, so interrupt
>> mitigation on TX is not such a great idea actually...
> 
> I saw a similar behavior for TX. For me the issue was too many
> outstanding bytes without a completion (defined to be 256KB by sysctl
> net.ipv4.tcp_limit_output_bytes). I tested on a 100GbE connection so
> with reasonable timeout values I already waited too long (4 TSO
> sessions). For the 1GbE case this might have no effect since you need a
> very long timeout. I'm currently working on adding TX support for dim.
> If you don't see a good benefit currently you might want to wait a
> little with TX adaptive interrupt moderation. Maybe only adjust static
> moderation for now?

Yes static moderation appears to be doing just fine.

> 
>>
>> Also, doing UDP TX tests shows that we can lower the interrupt count by
>> setting an appropriate tx-frames (as expected), but we won't be lowering
>> the CPU load since that is inherently a CPU intensive work. Past
> 
> Do you see higher TX UDP bandwidth? If you are bounded by CPU on both
> cases I would at least expect higher bandwidth with less interrupts
> since you reduce work from the CPU.

The UDP bandwidth was intentionally limited to 800Mbits/sec; we are
definitely not CPU bound (18% CPU load), but we can still lower the
interrupt count.
-- 
Florian

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-03-26 23:40 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-23  1:19 [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing Florian Fainelli
2018-03-23  1:19 ` [PATCH net-next 1/2] net: systemport: Implement adaptive " Florian Fainelli
2018-03-26 21:22   ` Tal Gilboa
2018-03-26 21:36     ` Florian Fainelli
2018-03-26 23:07       ` Tal Gilboa
2018-03-23  1:19 ` [PATCH net-next 2/2] net: bcmgenet: Add support for adaptive RX coalescing Florian Fainelli
2018-03-26 21:23   ` Tal Gilboa
2018-03-26  0:49 ` [PATCH net-next 0/2] net: broadcom: Adaptive interrupt coalescing David Miller
2018-03-26 21:16 ` Tal Gilboa
2018-03-26 22:04   ` Florian Fainelli
2018-03-26 22:29     ` Florian Fainelli
2018-03-26 23:21       ` Tal Gilboa
2018-03-26 23:40         ` Florian Fainelli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.