All of lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support
@ 2015-10-13 23:38 Jacob Keller
  2015-10-13 23:38 ` [Intel-wired-lan] [next-queue 02/17] fm10k: set netdev features in one location Jacob Keller
                   ` (16 more replies)
  0 siblings, 17 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:38 UTC (permalink / raw)
  To: intel-wired-lan

Rather than wrapping fm10k_dcbnl.c and fm10k_debugfs.c support with
 #ifdef blocks, just conditionally include the .o files in the Makefile.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/Makefile        | 20 +++++++++++++++-----
 drivers/net/ethernet/intel/fm10k/fm10k.h         |  2 ++
 drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c   |  4 ----
 drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c |  4 ----
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c     |  4 ++++
 5 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/Makefile b/drivers/net/ethernet/intel/fm10k/Makefile
index 08859dd220a8..b006ff66d028 100644
--- a/drivers/net/ethernet/intel/fm10k/Makefile
+++ b/drivers/net/ethernet/intel/fm10k/Makefile
@@ -1,7 +1,7 @@
 ################################################################################
 #
 # Intel Ethernet Switch Host Interface Driver
-# Copyright(c) 2013 - 2014 Intel Corporation.
+# Copyright(c) 2013 - 2015 Intel Corporation.
 #
 # This program is free software; you can redistribute it and/or modify it
 # under the terms and conditions of the GNU General Public License,
@@ -27,7 +27,17 @@
 
 obj-$(CONFIG_FM10K) += fm10k.o
 
-fm10k-objs := fm10k_main.o fm10k_common.o fm10k_pci.o \
-	      fm10k_netdev.o fm10k_ethtool.o fm10k_pf.o fm10k_vf.o \
-	      fm10k_mbx.o fm10k_iov.o fm10k_tlv.o \
-	      fm10k_debugfs.o fm10k_ptp.o fm10k_dcbnl.o
+fm10k-y := fm10k_main.o \
+	   fm10k_common.o \
+	   fm10k_pci.o \
+	   fm10k_ptp.o \
+	   fm10k_netdev.o \
+	   fm10k_ethtool.o \
+	   fm10k_pf.o \
+	   fm10k_vf.o \
+	   fm10k_mbx.o \
+	   fm10k_iov.o \
+	   fm10k_tlv.o
+
+fm10k-$(CONFIG_DEBUG_FS) += fm10k_debugfs.o
+fm10k-$(CONFIG_DCB) += fm10k_dcbnl.o
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index 48809e5d3f79..504d487fa800 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -551,5 +551,7 @@ int fm10k_get_ts_config(struct net_device *netdev, struct ifreq *ifr);
 int fm10k_set_ts_config(struct net_device *netdev, struct ifreq *ifr);
 
 /* DCB */
+#ifdef CONFIG_DCB
 void fm10k_dcbnl_set_ops(struct net_device *dev);
+#endif
 #endif /* _FM10K_H_ */
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c b/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
index 5c7a4d7662d8..2be4361839db 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
@@ -20,7 +20,6 @@
 
 #include "fm10k.h"
 
-#ifdef CONFIG_DCB
 /**
  * fm10k_dcbnl_ieee_getets - get the ETS configuration for the device
  * @dev: netdev interface for the device
@@ -155,7 +154,6 @@ static const struct dcbnl_rtnl_ops fm10k_dcbnl_ops = {
 	.setdcbx	= fm10k_dcbnl_setdcbx,
 };
 
-#endif /* CONFIG_DCB */
 /**
  * fm10k_dcbnl_set_ops - Configures dcbnl ops pointer for netdev
  * @dev: netdev interface for the device
@@ -164,11 +162,9 @@ static const struct dcbnl_rtnl_ops fm10k_dcbnl_ops = {
  **/
 void fm10k_dcbnl_set_ops(struct net_device *dev)
 {
-#ifdef CONFIG_DCB
 	struct fm10k_intfc *interface = netdev_priv(dev);
 	struct fm10k_hw *hw = &interface->hw;
 
 	if (hw->mac.type == fm10k_mac_pf)
 		dev->dcbnl_ops = &fm10k_dcbnl_ops;
-#endif /* CONFIG_DCB */
 }
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
index 5304bc1fbecd..5d6137faf7d1 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
@@ -18,8 +18,6 @@
  * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
  */
 
-#ifdef CONFIG_DEBUG_FS
-
 #include "fm10k.h"
 
 #include <linux/debugfs.h>
@@ -258,5 +256,3 @@ void fm10k_dbg_exit(void)
 	debugfs_remove_recursive(dbg_root);
 	dbg_root = NULL;
 }
-
-#endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index 5fbffbaefe32..0dece3c07727 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -1729,8 +1729,10 @@ static int fm10k_sw_init(struct fm10k_intfc *interface,
 		netdev->hw_features &= ~NETIF_F_GSO_UDP_TUNNEL;
 	}
 
+#ifdef CONFIG_DCB
 	/* initialize DCBNL interface */
 	fm10k_dcbnl_set_ops(netdev);
+#endif
 
 	/* Initialize service timer and service task */
 	set_bit(__FM10K_SERVICE_DISABLE, &interface->state);
@@ -1919,8 +1921,10 @@ static int fm10k_probe(struct pci_dev *pdev,
 	if (err)
 		goto err_sw_init;
 
+#if IS_ENABLED(CONFIG_DEBUG_FS)
 	/* enable debugfs support */
 	fm10k_dbg_intfc_init(interface);
+#endif
 
 	err = fm10k_init_queueing_scheme(interface);
 	if (err)
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 02/17] fm10k: set netdev features in one location
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
@ 2015-10-13 23:38 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 03/17] fm10k: reinitialize queuing scheme after calling init_hw Jacob Keller
                   ` (15 subsequent siblings)
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:38 UTC (permalink / raw)
  To: intel-wired-lan

Don't change netdev hw_features later in fm10k_probe, instead set all
values inside fm10k_alloc_netdev. To do so, we need to know the MAC type
(whether it is PF or VF) in order to determine what to do. This helps
ensure that all logic regarding features is co-located.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k.h        |  2 +-
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 31 +++++++++++++++----------
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c    |  9 +------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index 504d487fa800..4d4cd006fec1 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -484,7 +484,7 @@ void fm10k_netpoll(struct net_device *netdev);
 #endif
 
 /* Netdev */
-struct net_device *fm10k_alloc_netdev(void);
+struct net_device *fm10k_alloc_netdev(const struct fm10k_info *info);
 int fm10k_setup_rx_resources(struct fm10k_ring *);
 int fm10k_setup_tx_resources(struct fm10k_ring *);
 void fm10k_free_rx_resources(struct fm10k_ring *);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 96364c7b7719..38fe2d8c8388 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1391,8 +1391,9 @@ static const struct net_device_ops fm10k_netdev_ops = {
 
 #define DEFAULT_DEBUG_LEVEL_SHIFT 3
 
-struct net_device *fm10k_alloc_netdev(void)
+struct net_device *fm10k_alloc_netdev(const struct fm10k_info *info)
 {
+	netdev_features_t hw_features;
 	struct fm10k_intfc *interface;
 	struct net_device *dev;
 
@@ -1415,27 +1416,31 @@ struct net_device *fm10k_alloc_netdev(void)
 			 NETIF_F_TSO |
 			 NETIF_F_TSO6 |
 			 NETIF_F_TSO_ECN |
-			 NETIF_F_GSO_UDP_TUNNEL |
 			 NETIF_F_RXHASH |
 			 NETIF_F_RXCSUM;
 
+	/* Only the PF can support VXLAN and NVGRE tunnel offloads */
+	if (info->mac == fm10k_mac_pf) {
+		dev->hw_enc_features = NETIF_F_IP_CSUM |
+				       NETIF_F_TSO |
+				       NETIF_F_TSO6 |
+				       NETIF_F_TSO_ECN |
+				       NETIF_F_GSO_UDP_TUNNEL |
+				       NETIF_F_IPV6_CSUM |
+				       NETIF_F_SG;
+
+		dev->features |= NETIF_F_GSO_UDP_TUNNEL;
+	}
+
 	/* all features defined to this point should be changeable */
-	dev->hw_features |= dev->features;
+	hw_features = dev->features;
 
 	/* allow user to enable L2 forwarding acceleration */
-	dev->hw_features |= NETIF_F_HW_L2FW_DOFFLOAD;
+	hw_features |= NETIF_F_HW_L2FW_DOFFLOAD;
 
 	/* configure VLAN features */
 	dev->vlan_features |= dev->features;
 
-	/* configure tunnel offloads */
-	dev->hw_enc_features |= NETIF_F_IP_CSUM |
-				NETIF_F_TSO |
-				NETIF_F_TSO6 |
-				NETIF_F_TSO_ECN |
-				NETIF_F_GSO_UDP_TUNNEL |
-				NETIF_F_IPV6_CSUM;
-
 	/* we want to leave these both on as we cannot disable VLAN tag
 	 * insertion or stripping on the hardware since it is contained
 	 * in the FTAG and not in the frame itself.
@@ -1446,5 +1451,7 @@ struct net_device *fm10k_alloc_netdev(void)
 
 	dev->priv_flags |= IFF_UNICAST_FLT;
 
+	dev->hw_features |= hw_features;
+
 	return dev;
 }
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index 0dece3c07727..a488bbade48b 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -1722,13 +1722,6 @@ static int fm10k_sw_init(struct fm10k_intfc *interface,
 					     pci_resource_len(pdev, 4));
 	hw->sw_addr = interface->sw_addr;
 
-	/* Only the PF can support VXLAN and NVGRE offloads */
-	if (hw->mac.type != fm10k_mac_pf) {
-		netdev->hw_enc_features = 0;
-		netdev->features &= ~NETIF_F_GSO_UDP_TUNNEL;
-		netdev->hw_features &= ~NETIF_F_GSO_UDP_TUNNEL;
-	}
-
 #ifdef CONFIG_DCB
 	/* initialize DCBNL interface */
 	fm10k_dcbnl_set_ops(netdev);
@@ -1896,7 +1889,7 @@ static int fm10k_probe(struct pci_dev *pdev,
 	pci_set_master(pdev);
 	pci_save_state(pdev);
 
-	netdev = fm10k_alloc_netdev();
+	netdev = fm10k_alloc_netdev(fm10k_info_tbl[ent->driver_data]);
 	if (!netdev) {
 		err = -ENOMEM;
 		goto err_alloc_netdev;
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 03/17] fm10k: reinitialize queuing scheme after calling init_hw
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
  2015-10-13 23:38 ` [Intel-wired-lan] [next-queue 02/17] fm10k: set netdev features in one location Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 04/17] fm10k: reset max_queues on init_hw_vf failure Jacob Keller
                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

The init_hw function may fail, and in the case of VFs, it might change
the number of maximum queues available. Thus, for every flow which
checks init_hw, we need to ensure that we clear the queue scheme before,
and initialize it after. The fm10k_io_slot_reset path will end up
triggering a reset so fm10k_reinit needs this change. The
fm10k_io_error_detected and fm10k_io_resume also need to properly clear
and reinitialize the queue scheme.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index a488bbade48b..48f0ef4545a0 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -159,6 +159,9 @@ static void fm10k_reinit(struct fm10k_intfc *interface)
 
 	fm10k_mbx_free_irq(interface);
 
+	/* free interrupts */
+	fm10k_clear_queueing_scheme(interface);
+
 	/* delay any future reset requests */
 	interface->last_reset = jiffies + (10 * HZ);
 
@@ -167,6 +170,12 @@ static void fm10k_reinit(struct fm10k_intfc *interface)
 	if (err)
 		dev_err(&interface->pdev->dev, "init_hw failed: %d\n", err);
 
+	err = fm10k_init_queueing_scheme(interface);
+	if (err) {
+		dev_err(&interface->pdev->dev, "init_queueing_scheme failed: %d\n", err);
+		goto reinit_err;
+	}
+
 	/* reassociate interrupts */
 	fm10k_mbx_request_irq(interface);
 
@@ -2182,6 +2191,9 @@ static pci_ers_result_t fm10k_io_error_detected(struct pci_dev *pdev,
 	if (netif_running(netdev))
 		fm10k_close(netdev);
 
+	/* free interrupts */
+	fm10k_clear_queueing_scheme(interface);
+
 	fm10k_mbx_free_irq(interface);
 
 	pci_disable_device(pdev);
@@ -2250,6 +2262,12 @@ static void fm10k_io_resume(struct pci_dev *pdev)
 	/* reset statistics starting values */
 	hw->mac.ops.rebind_hw_stats(hw, &interface->stats);
 
+	err = fm10k_init_queueing_scheme(interface);
+	if (err) {
+		dev_err(&interface->pdev->dev, "init_queueing_scheme failed: %d\n", err);
+		return;
+	}
+
 	/* reassociate interrupts */
 	fm10k_mbx_request_irq(interface);
 
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 04/17] fm10k: reset max_queues on init_hw_vf failure
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
  2015-10-13 23:38 ` [Intel-wired-lan] [next-queue 02/17] fm10k: set netdev features in one location Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 03/17] fm10k: reinitialize queuing scheme after calling init_hw Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors Jacob Keller
                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

VF drivers must detect how many queues are available. Previously, the
driver assumed that each VF has at minimum 1 queue. This assumption is
incorrect, since it is possible that the PF has not yet assigned the
queues to the VF by the time the VF checks. To resolve this, we added a
check first to ensure that the first queue is infact owned by the VF at
init_hw_vf time. However, the code flow did not reset hw->mac.max_queues
to 0. In some cases, such as during reinit flows, we call init_hw_vf
without clearing the previous value of hw->mac.max_queues. Due to this,
when init_hw_vf errors out, if its error code is not properly handled
the VF driver may still believe it has queues which no longer belong to
it. Fix this by clearing the hw->mac.max_queues on exit due to errors.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_vf.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
index 3a18ef1cc017..d512575c33f3 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
@@ -105,8 +105,10 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
 
 	/* verify we have at least 1 queue */
 	if (!~fm10k_read_reg(hw, FM10K_TXQCTL(0)) ||
-	    !~fm10k_read_reg(hw, FM10K_RXQCTL(0)))
-		return FM10K_ERR_NO_RESOURCES;
+	    !~fm10k_read_reg(hw, FM10K_RXQCTL(0))) {
+		err = FM10K_ERR_NO_RESOURCES;
+		goto reset_max_queues;
+	}
 
 	/* determine how many queues we have */
 	for (i = 1; tqdloc0 && (i < FM10K_MAX_QUEUES_POOL); i++) {
@@ -124,7 +126,7 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
 	/* shut down queues we own and reset DMA configuration */
 	err = fm10k_disable_queues_generic(hw, i);
 	if (err)
-		return err;
+		goto reset_max_queues;
 
 	/* record maximum queue count */
 	hw->mac.max_queues = i;
@@ -134,6 +136,11 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
 			       FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT;
 
 	return 0;
+
+reset_max_queues:
+	hw->mac.max_queues = 0;
+
+	return err;
 }
 
 /* This structure defines the attibutes to be parsed below */
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (2 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 04/17] fm10k: reset max_queues on init_hw_vf failure Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14  0:46   ` Allan, Bruce W
  2015-10-28  0:47   ` Singh, Krishneil K
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 06/17] fm10k: Correct typecast in fm10k_update_xc_addr_pf Jacob Keller
                   ` (12 subsequent siblings)
  16 siblings, 2 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

A recent change to the SHARED code makes it such that in some flows,
init_hw may fail. For example, init_hw_vf will fail if the 1st queue is
not owned by the VF. However, previously some flows called init_hw
without checking the return code, or checked it but only to display a
diagnostic message.

Fix this by (a) always returning and preventing the netdevice from going
up, and (b) printing the diagnostic in every flow for consistency. This
should resolve an issue where VF drivers would attempt to come up
before the PF has finished assigning queues.

In addition, change the dmesg output to explicitly show the actual
function that failed, instead of combining reset_hw and init_hw into a
single check.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 34 ++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index 48f0ef4545a0..9ad9f9164d91 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -166,9 +166,17 @@ static void fm10k_reinit(struct fm10k_intfc *interface)
 	interface->last_reset = jiffies + (10 * HZ);
 
 	/* reset and initialize the hardware so it is in a known state */
-	err = hw->mac.ops.reset_hw(hw) ? : hw->mac.ops.init_hw(hw);
-	if (err)
+	err = hw->mac.ops.reset_hw(hw);
+	if (err) {
+		dev_err(&interface->pdev->dev, "reset_hw failed: %d\n", err);
+		goto reinit_err;
+	}
+
+	err = hw->mac.ops.init_hw(hw);
+	if (err) {
 		dev_err(&interface->pdev->dev, "init_hw failed: %d\n", err);
+		goto reinit_err;
+	}
 
 	err = fm10k_init_queueing_scheme(interface);
 	if (err) {
@@ -202,6 +210,10 @@ static void fm10k_reinit(struct fm10k_intfc *interface)
 
 	fm10k_iov_resume(interface->pdev);
 
+reinit_err:
+	if (err)
+		netif_device_detach(netdev);
+
 	rtnl_unlock();
 
 	clear_bit(__FM10K_RESETTING, &interface->state);
@@ -1693,7 +1705,13 @@ static int fm10k_sw_init(struct fm10k_intfc *interface,
 	interface->last_reset = jiffies + (10 * HZ);
 
 	/* reset and initialize the hardware so it is in a known state */
-	err = hw->mac.ops.reset_hw(hw) ? : hw->mac.ops.init_hw(hw);
+	err = hw->mac.ops.reset_hw(hw);
+	if (err) {
+		dev_err(&pdev->dev, "reset_hw failed: %d\n", err);
+		return err;
+	}
+
+	err = hw->mac.ops.init_hw(hw);
 	if (err) {
 		dev_err(&pdev->dev, "init_hw failed: %d\n", err);
 		return err;
@@ -2077,8 +2095,10 @@ static int fm10k_resume(struct pci_dev *pdev)
 
 	/* reset hardware to known state */
 	err = hw->mac.ops.init_hw(&interface->hw);
-	if (err)
+	if (err) {
+		dev_err(&pdev->dev, "init_hw failed: %d\n", err);
 		return err;
+	}
 
 	/* reset statistics starting values */
 	hw->mac.ops.rebind_hw_stats(hw, &interface->stats);
@@ -2257,7 +2277,11 @@ static void fm10k_io_resume(struct pci_dev *pdev)
 	int err = 0;
 
 	/* reset hardware to known state */
-	hw->mac.ops.init_hw(&interface->hw);
+	err = hw->mac.ops.init_hw(&interface->hw);
+	if (err) {
+		dev_err(&pdev->dev, "init_hw failed: %d\n", err);
+		return;
+	}
 
 	/* reset statistics starting values */
 	hw->mac.ops.rebind_hw_stats(hw, &interface->stats);
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 06/17] fm10k: Correct typecast in fm10k_update_xc_addr_pf
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (3 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14  0:46   ` Allan, Bruce W
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 07/17] fm10k: explicitly typecast vlan values to u16 Jacob Keller
                   ` (11 subsequent siblings)
  16 siblings, 1 reply; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Since the resultant data type of the mac_update.mac_upper field is u16,
it does not make sense to typecast u8 variables to u32 first.

Reported-by: Matthew Vick <matthew.vick@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_pf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index 8c0bdc4e4edd..789ffd9d7fd9 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -334,8 +334,8 @@ static s32 fm10k_update_xc_addr_pf(struct fm10k_hw *hw, u16 glort,
 						 ((u32)mac[3] << 16) |
 						 ((u32)mac[4] << 8) |
 						 ((u32)mac[5]));
-	mac_update.mac_upper = cpu_to_le16(((u32)mac[0] << 8) |
-						 ((u32)mac[1]));
+	mac_update.mac_upper = cpu_to_le16(((u16)mac[0] << 8) |
+					   ((u16)mac[1]));
 	mac_update.vlan = cpu_to_le16(vid);
 	mac_update.glort = cpu_to_le16(glort);
 	mac_update.action = add ? 0 : 1;
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 07/17] fm10k: explicitly typecast vlan values to u16
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (4 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 06/17] fm10k: Correct typecast in fm10k_update_xc_addr_pf Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 08/17] fm10k: add statistics for actual DWORD count of mbmem mailbox Jacob Keller
                   ` (10 subsequent siblings)
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_pf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index 789ffd9d7fd9..52f754b35972 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -1212,7 +1212,7 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results,
 		set = !(vid & FM10K_VLAN_CLEAR);
 		vid &= ~FM10K_VLAN_CLEAR;
 
-		err = fm10k_iov_select_vid(vf_info, vid);
+		err = fm10k_iov_select_vid(vf_info, (u16)vid);
 		if (err < 0)
 			return err;
 		else
@@ -1242,7 +1242,7 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results,
 		if (err < 0)
 			return err;
 		else
-			vlan = err;
+			vlan = (u16)err;
 
 		/* notify switch of request for new unicast address */
 		err = hw->mac.ops.update_uc_addr(hw, vf_info->glort,
@@ -1268,7 +1268,7 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results,
 		if (err < 0)
 			return err;
 		else
-			vlan = err;
+			vlan = (u16)err;
 
 		/* notify switch of request for new multicast address */
 		err = hw->mac.ops.update_mc_addr(hw, vf_info->glort,
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 08/17] fm10k: add statistics for actual DWORD count of mbmem mailbox
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (5 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 07/17] fm10k: explicitly typecast vlan values to u16 Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14  0:47   ` Allan, Bruce W
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 09/17] fm10k: rename mbx_tx_oversized statistic to mbx_tx_dropped Jacob Keller
                   ` (9 subsequent siblings)
  16 siblings, 1 reply; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

A previous bug was uncovered by addition of a debug stat to indicate the
actual number of DWORDS we pulled from the mbmem. It turned out this was
not the same as the tx_dwords counter. While the previous bug fix should
have corrected this in all cases, add some debug stats that count the
number of DWORDs pushed or pulled from the mbmem. A future debugger may
take advantage of this statistic for debugging purposes.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 2 ++
 drivers/net/ethernet/intel/fm10k/fm10k_mbx.c     | 4 ++++
 drivers/net/ethernet/intel/fm10k/fm10k_mbx.h     | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 447a5f8da42f..5db5a97cf28e 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -114,9 +114,11 @@ static const struct fm10k_stats fm10k_gstrings_mbx_stats[] = {
 	FM10K_MBX_STAT("mbx_tx_oversized", tx_dropped),
 	FM10K_MBX_STAT("mbx_tx_messages", tx_messages),
 	FM10K_MBX_STAT("mbx_tx_dwords", tx_dwords),
+	FM10K_MBX_STAT("mbx_tx_mbmem_pulled", tx_mbmem_pulled),
 	FM10K_MBX_STAT("mbx_rx_messages", rx_messages),
 	FM10K_MBX_STAT("mbx_rx_dwords", rx_dwords),
 	FM10K_MBX_STAT("mbx_rx_parse_err", rx_parse_err),
+	FM10K_MBX_STAT("mbx_rx_mbmem_pushed", rx_mbmem_pushed),
 };
 
 #define FM10K_GLOBAL_STATS_LEN ARRAY_SIZE(fm10k_gstrings_global_stats)
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
index af09a1b272e6..2bce47490723 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
@@ -375,6 +375,8 @@ static void fm10k_mbx_write_copy(struct fm10k_hw *hw,
 			if (!tail)
 				tail++;
 
+			mbx->tx_mbmem_pulled++;
+
 			/* write message to hardware FIFO */
 			fm10k_write_reg(hw, mbmem + tail++, *(head++));
 		} while (--len && --end);
@@ -459,6 +461,8 @@ static void fm10k_mbx_read_copy(struct fm10k_hw *hw,
 			if (!head)
 				head++;
 
+			mbx->rx_mbmem_pushed++;
+
 			/* read message from hardware FIFO */
 			*(tail++) = fm10k_read_reg(hw, mbmem + head++);
 		} while (--len && --end);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
index 0419a7f0035e..b8eb6b5be894 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
@@ -291,8 +291,10 @@ struct fm10k_mbx_info {
 	u64 tx_dropped;
 	u64 tx_messages;
 	u64 tx_dwords;
+	u64 tx_mbmem_pulled;
 	u64 rx_messages;
 	u64 rx_dwords;
+	u64 rx_mbmem_pushed;
 	u64 rx_parse_err;
 
 	/* Buffer to store messages */
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 09/17] fm10k: rename mbx_tx_oversized statistic to mbx_tx_dropped
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (6 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 08/17] fm10k: add statistics for actual DWORD count of mbmem mailbox Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 10/17] fm10k: add TEB check to fm10k_gre_is_nvgre Jacob Keller
                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Originally this statistic was renamed because the method of dropping was
called "drop_oversized_messages", but this logic has changed much, and
this counter does actually represent messages which we failed to
transmit for a number of reasons. Rename the counter back to tx_dropped
since this is when it will increment, and it is less confusing.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 5db5a97cf28e..b903c4646cf8 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -111,7 +111,7 @@ static const struct fm10k_stats fm10k_gstrings_pf_stats[] = {
 
 static const struct fm10k_stats fm10k_gstrings_mbx_stats[] = {
 	FM10K_MBX_STAT("mbx_tx_busy", tx_busy),
-	FM10K_MBX_STAT("mbx_tx_oversized", tx_dropped),
+	FM10K_MBX_STAT("mbx_tx_dropped", tx_dropped),
 	FM10K_MBX_STAT("mbx_tx_messages", tx_messages),
 	FM10K_MBX_STAT("mbx_tx_dwords", tx_dwords),
 	FM10K_MBX_STAT("mbx_tx_mbmem_pulled", tx_mbmem_pulled),
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 10/17] fm10k: add TEB check to fm10k_gre_is_nvgre
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (7 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 09/17] fm10k: rename mbx_tx_oversized statistic to mbx_tx_dropped Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14  0:47   ` Allan, Bruce W
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 11/17] fm10k: Add support for ITR scaling based on PCIe link speed Jacob Keller
                   ` (7 subsequent siblings)
  16 siblings, 1 reply; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

The NVGRE protocol should be run over transparent ethernet bridge
protocol, so we should verify this in our check whether the GRE tunnel
is NVGRE.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 746a1986690b..5dfcba72825d 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -708,6 +708,10 @@ static struct ethhdr *fm10k_gre_is_nvgre(struct sk_buff *skb)
 	if (nvgre_hdr->flags & FM10K_NVGRE_RESERVED0_FLAGS)
 		return NULL;
 
+	/* verify protocol is transparent Ethernet bridging */
+	if (nvgre_hdr->proto != htons(ETH_P_TEB))
+		return NULL;
+
 	/* report start of ethernet header */
 	if (nvgre_hdr->flags & NVGRE_TNI)
 		return (struct ethhdr *)(nvgre_hdr + 1);
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 11/17] fm10k: Add support for ITR scaling based on PCIe link speed
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (8 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 10/17] fm10k: add TEB check to fm10k_gre_is_nvgre Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14  0:47   ` Allan, Bruce W
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 12/17] fm10k: introduce ITR_IS_ADAPTIVE macro Jacob Keller
                   ` (6 subsequent siblings)
  16 siblings, 1 reply; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Red Rock Canyon's interrupt throttle timers are based on the PCIe link
speed. Because of this, the value being programmed into the ITR
registers must be scaled.

For the PF, this is as simple as reading the PCIe link speed and storing
the result. However, in the case of SR-IOV, the VF's interrupt throttle
timers are based on the link speed of the PF. However, the VF is unable
to get the link speed information from its configuration space, so the
PF must inform it of what scale to use.

Rather than pass this scale via mailbox message, take advantage of
unused bits in the TDLEN register to pass the scale. It is the
responsibility of the PF to program this for the VF while setting up the
VF queues and the responsibility of the VF to get the information
accordingly. This is preferable because it allows the VF to set up the
interrupts properly during initialization and matches how the MAC
address is passed in the TDBAL/TDBAH registers.

Reported-by: Matthew Vick <matthew.vick@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_pf.c   | 22 +++++++++++++++++++++-
 drivers/net/ethernet/intel/fm10k/fm10k_type.h | 15 +++++++++++++++
 drivers/net/ethernet/intel/fm10k/fm10k_vf.c   | 20 ++++++++++++++++++--
 3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index 52f754b35972..9c3a48f04056 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -150,19 +150,26 @@ static s32 fm10k_init_hw_pf(struct fm10k_hw *hw)
 				FM10K_TPH_RXCTRL_HDR_WROEN);
 	}
 
-	/* set max hold interval to align with 1.024 usec in all modes */
+	/* set max hold interval to align with 1.024 usec in all modes and
+	 * store ITR scale
+	 */
 	switch (hw->bus.speed) {
 	case fm10k_bus_speed_2500:
 		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN1;
+		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN1;
 		break;
 	case fm10k_bus_speed_5000:
 		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN2;
+		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN2;
 		break;
 	case fm10k_bus_speed_8000:
 		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN3;
+		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3;
 		break;
 	default:
 		dma_ctrl = 0;
+		/* just in case, assume Gen3 ITR scale */
+		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3;
 		break;
 	}
 
@@ -903,6 +910,13 @@ static s32 fm10k_iov_assign_default_mac_vlan_pf(struct fm10k_hw *hw,
 	fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx), tdbal);
 	fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx), tdbah);
 
+	/* Provide the VF the ITR scale, using software-defined fields in TDLEN
+	 * to pass the information during VF initialization. See definition of
+	 * FM10K_TDLEN_ITR_SCALE_SHIFT for more details.
+	 */
+	fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx), hw->mac.itr_scale <<
+						   FM10K_TDLEN_ITR_SCALE_SHIFT);
+
 err_out:
 	/* configure Queue control register */
 	txqctl = ((u32)vf_vid << FM10K_TXQCTL_VID_SHIFT) &
@@ -1035,6 +1049,12 @@ static s32 fm10k_iov_reset_resources_pf(struct fm10k_hw *hw,
 	for (i = queues_per_pool; i--;) {
 		fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx + i), tdbal);
 		fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx + i), tdbah);
+		/* See definition of FM10K_TDLEN_ITR_SCALE_SHIFT for an
+		 * explanation of how TDLEN is used.
+		 */
+		fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx + i),
+				hw->mac.itr_scale <<
+				FM10K_TDLEN_ITR_SCALE_SHIFT);
 		fm10k_write_reg(hw, FM10K_TQMAP(qmap_idx + i), vf_q_idx + i);
 		fm10k_write_reg(hw, FM10K_RQMAP(qmap_idx + i), vf_q_idx + i);
 	}
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_type.h b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
index 35afd711d144..02d370fb8bbb 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_type.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
@@ -272,6 +272,20 @@ struct fm10k_hw;
 #define FM10K_TDBAL(_n)		((0x40 * (_n)) + 0x8000)
 #define FM10K_TDBAH(_n)		((0x40 * (_n)) + 0x8001)
 #define FM10K_TDLEN(_n)		((0x40 * (_n)) + 0x8002)
+/* When fist initialized, VFs need to know the Interrupt Throttle Rate (ITR)
+ * scale which is based on the PCIe speed but the speed information in the PCI
+ * configuration space may not be accurate. The PF already knows the ITR scale
+ * but there is no defined method to pass that information from the PF to the
+ * VF. This is accomplished during VF initialization by temporarily co-opting
+ * the yet-to-be-used TDLEN register to have the PF store the ITR shift for
+ * the VF to retrieve before the VF needs to use the TDLEN register for its
+ * intended purpose, i.e. before the Tx resources are allocated.
+ */
+#define FM10K_TDLEN_ITR_SCALE_SHIFT		9
+#define FM10K_TDLEN_ITR_SCALE_MASK		0x00000E00
+#define FM10K_TDLEN_ITR_SCALE_GEN1		2
+#define FM10K_TDLEN_ITR_SCALE_GEN2		1
+#define FM10K_TDLEN_ITR_SCALE_GEN3		0
 #define FM10K_TPH_TXCTRL(_n)	((0x40 * (_n)) + 0x8003)
 #define FM10K_TPH_TXCTRL_DESC_TPHEN		0x00000020
 #define FM10K_TPH_TXCTRL_DESC_RROEN		0x00000200
@@ -560,6 +574,7 @@ struct fm10k_mac_info {
 	bool get_host_state;
 	bool tx_ready;
 	u32 dglort_map;
+	u8 itr_scale;
 };
 
 struct fm10k_swapi_table_info {
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
index d512575c33f3..2af697df5abc 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
@@ -28,7 +28,7 @@
 static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
 {
 	u8 *perm_addr = hw->mac.perm_addr;
-	u32 bal = 0, bah = 0;
+	u32 bal = 0, bah = 0, tdlen;
 	s32 err;
 	u16 i;
 
@@ -48,6 +48,9 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
 		       ((u32)perm_addr[2]);
 	}
 
+	/* restore default itr_scale for next VF initialization */
+	tdlen = hw->mac.itr_scale << FM10K_TDLEN_ITR_SCALE_SHIFT;
+
 	/* The queues have already been disabled so we just need to
 	 * update their base address registers
 	 */
@@ -56,6 +59,12 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
 		fm10k_write_reg(hw, FM10K_TDBAH(i), bah);
 		fm10k_write_reg(hw, FM10K_RDBAL(i), bal);
 		fm10k_write_reg(hw, FM10K_RDBAH(i), bah);
+		/* Restore ITR scale in software-defined mechanism in TDLEN
+		 * for next VF initialization. See definition of
+		 * FM10K_TDLEN_ITR_SCALE_SHIFT for more details on the use of
+		 * TDLEN here.
+		 */
+		fm10k_write_reg(hw, FM10K_TDLEN(i), tdlen);
 	}
 
 	return 0;
@@ -131,9 +140,16 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
 	/* record maximum queue count */
 	hw->mac.max_queues = i;
 
-	/* fetch default VLAN */
+	/* fetch default VLAN and ITR scale */
 	hw->mac.default_vid = (fm10k_read_reg(hw, FM10K_TXQCTL(0)) &
 			       FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT;
+	/* Read the ITR scale from TDLEN. See the definition of
+	 * FM10K_TDLEN_ITR_SCALE_SHIFT for more information about how TDLEN is
+	 * used here.
+	 */
+	hw->mac.itr_scale = (fm10k_read_reg(hw, FM10K_TDLEN(0)) &
+			     FM10K_TDLEN_ITR_SCALE_MASK) >>
+			    FM10K_TDLEN_ITR_SCALE_SHIFT;
 
 	return 0;
 
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 12/17] fm10k: introduce ITR_IS_ADAPTIVE macro
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (9 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 11/17] fm10k: Add support for ITR scaling based on PCIe link speed Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm Jacob Keller
                   ` (5 subsequent siblings)
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Define a macro for identifying when the itr value is dynamic or
adaptive. The concept was taken from i40e. This helps make clear what
the check is, and reduces the line length to something more reasonable
in a few places.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k.h         | 2 ++
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 6 ++----
 drivers/net/ethernet/intel/fm10k/fm10k_main.c    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index 4d4cd006fec1..c40f50737d17 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -172,6 +172,8 @@ struct fm10k_ring_container {
 #define FM10K_ITR_20K		50	/* 50us */
 #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation flag */
 
+#define ITR_IS_ADAPTIVE(itr) (!!(itr & FM10K_ITR_ADAPTIVE))
+
 #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
 
 static inline struct netdev_queue *txring_txq(const struct fm10k_ring *ring)
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index b903c4646cf8..2395877f0544 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -705,12 +705,10 @@ static int fm10k_get_coalesce(struct net_device *dev,
 {
 	struct fm10k_intfc *interface = netdev_priv(dev);
 
-	ec->use_adaptive_tx_coalesce =
-		!!(interface->tx_itr & FM10K_ITR_ADAPTIVE);
+	ec->use_adaptive_tx_coalesce = ITR_IS_ADAPTIVE(interface->tx_itr);
 	ec->tx_coalesce_usecs = interface->tx_itr & ~FM10K_ITR_ADAPTIVE;
 
-	ec->use_adaptive_rx_coalesce =
-		!!(interface->rx_itr & FM10K_ITR_ADAPTIVE);
+	ec->use_adaptive_rx_coalesce = ITR_IS_ADAPTIVE(interface->rx_itr);
 	ec->rx_coalesce_usecs = interface->rx_itr & ~FM10K_ITR_ADAPTIVE;
 
 	return 0;
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 5dfcba72825d..babde3e4b2bb 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -1370,7 +1370,7 @@ static void fm10k_update_itr(struct fm10k_ring_container *ring_container)
 	unsigned int avg_wire_size, packets;
 
 	/* Only update ITR if we are using adaptive setting */
-	if (!(ring_container->itr & FM10K_ITR_ADAPTIVE))
+	if (!ITR_IS_ADAPTIVE(ring_container->itr))
 		goto clear_counts;
 
 	packets = ring_container->total_packets;
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (10 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 12/17] fm10k: introduce ITR_IS_ADAPTIVE macro Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14 18:35   ` Alexander Duyck
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 14/17] fm10k: use macro for default Tx and Rx ITR values Jacob Keller
                   ` (4 subsequent siblings)
  16 siblings, 1 reply; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

The existing adaptive ITR algorithm is overly restrictive. It throttles
incorrectly for various traffic rates, and does not produce good
performance. The algorithm now allows for more interrupts per second,
and does some calculation to help improve for smaller packet loads. In
addition, take into account the new itr_scale from the hardware which
indicates how much to scale due to PCIe link speed.

A single thread of receiving TCP_STREAM in netperf:
- Before: 450 Mbps
- After: 20,000 Mbps

Reported-by: Matthew Vick <matthew.vick@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k.h      |  1 +
 drivers/net/ethernet/intel/fm10k/fm10k_main.c | 29 +++++++++++++++++++++++----
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c  |  6 ++++--
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index c40f50737d17..ceddf39d7cec 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -164,6 +164,7 @@ struct fm10k_ring_container {
 	unsigned int total_packets;	/* total packets processed this int */
 	u16 work_limit;			/* total work allowed per interrupt */
 	u16 itr;			/* interrupt throttle rate value */
+	u8 itr_scale;			/* ITR adjustment scaler based on PCI speed */
 	u8 count;			/* total number of rings in vector */
 };
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index babde3e4b2bb..cae6b4e309a9 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -1386,11 +1386,30 @@ static void fm10k_update_itr(struct fm10k_ring_container *ring_container)
 	if (avg_wire_size > 3000)
 		avg_wire_size = 3000;
 
-	/* Give a little boost to mid-size frames */
-	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
-		avg_wire_size /= 3;
+	/* Throttle rate management based on average wire size, attempting to
+	 * slightly boost small and medium packet loads. Divide the average
+	 * wire size by a small factor to calculate the minimum time until the
+	 * next interrupt in microseconds. Save some cycles by using a
+	 * multiply then a shift, which also accounts for difference due to
+	 * PCIe link speed.
+	 */
+#define FM10K_ITR_SCALE_SMALL	6
+#define FM10K_ITR_SCALE_MEDIUM	5
+#define FM10K_ITR_SCALE_LARGE	4
+
+	if (avg_wire_size < 300)
+		avg_wire_size *= FM10K_ITR_SCALE_SMALL;
+	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
+		avg_wire_size *= FM10K_ITR_SCALE_MEDIUM;
 	else
-		avg_wire_size /= 2;
+		avg_wire_size *= FM10K_ITR_SCALE_LARGE;
+
+	/* Round up average wire size, then perform bit shift, to ensure that
+	 * the calculation will never get below 1. Account for changes in ITR
+	 * value due to PCIe link speed.
+	 */
+	avg_wire_size += (1 << (ring_container->itr_scale + 8)) - 1;
+	avg_wire_size >>= ring_container->itr_scale + 8;
 
 	/* write back value and retain adaptive flag */
 	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;
@@ -1608,6 +1627,7 @@ static int fm10k_alloc_q_vector(struct fm10k_intfc *interface,
 	q_vector->tx.ring = ring;
 	q_vector->tx.work_limit = FM10K_DEFAULT_TX_WORK;
 	q_vector->tx.itr = interface->tx_itr;
+	q_vector->tx.itr_scale = interface->hw.mac.itr_scale;
 	q_vector->tx.count = txr_count;
 
 	while (txr_count) {
@@ -1636,6 +1656,7 @@ static int fm10k_alloc_q_vector(struct fm10k_intfc *interface,
 	/* save Rx ring container info */
 	q_vector->rx.ring = ring;
 	q_vector->rx.itr = interface->rx_itr;
+	q_vector->rx.itr_scale = interface->hw.mac.itr_scale;
 	q_vector->rx.count = rxr_count;
 
 	while (rxr_count) {
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index 9ad9f9164d91..cbf38da0ada7 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -880,7 +880,8 @@ static irqreturn_t fm10k_msix_mbx_vf(int __always_unused irq, void *data)
 
 	/* re-enable mailbox interrupt and indicate 20us delay */
 	fm10k_write_reg(hw, FM10K_VFITR(FM10K_MBX_VECTOR),
-			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
+			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY >>
+					    hw->mac.itr_scale));
 
 	/* service upstream mailbox */
 	if (fm10k_mbx_trylock(interface)) {
@@ -1111,7 +1112,8 @@ static irqreturn_t fm10k_msix_mbx_pf(int __always_unused irq, void *data)
 
 	/* re-enable mailbox interrupt and indicate 20us delay */
 	fm10k_write_reg(hw, FM10K_ITR(FM10K_MBX_VECTOR),
-			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
+			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY >>
+					    hw->mac.itr_scale));
 
 	return IRQ_HANDLED;
 }
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 14/17] fm10k: use macro for default Tx and Rx ITR values
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (11 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec Jacob Keller
                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k.h         | 2 ++
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 4 ++--
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c     | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index ceddf39d7cec..bf2cc6f78418 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -175,6 +175,8 @@ struct fm10k_ring_container {
 
 #define ITR_IS_ADAPTIVE(itr) (!!(itr & FM10K_ITR_ADAPTIVE))
 
+#define FM10K_TX_ITR_DEFAULT	FM10K_ITR_10K
+#define FM10K_RX_ITR_DEFAULT	FM10K_ITR_20K
 #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
 
 static inline struct netdev_queue *txring_txq(const struct fm10k_ring *ring)
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 2395877f0544..2432b6e0528f 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -733,10 +733,10 @@ static int fm10k_set_coalesce(struct net_device *dev,
 
 	/* set initial values for adaptive ITR */
 	if (ec->use_adaptive_tx_coalesce)
-		tx_itr = FM10K_ITR_ADAPTIVE | FM10K_ITR_10K;
+		tx_itr = FM10K_ITR_ADAPTIVE | FM10K_TX_ITR_DEFAULT;
 
 	if (ec->use_adaptive_rx_coalesce)
-		rx_itr = FM10K_ITR_ADAPTIVE | FM10K_ITR_20K;
+		rx_itr = FM10K_ITR_ADAPTIVE | FM10K_RX_ITR_DEFAULT;
 
 	/* update interface */
 	interface->tx_itr = tx_itr;
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index cbf38da0ada7..fec672f66287 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -1773,8 +1773,8 @@ static int fm10k_sw_init(struct fm10k_intfc *interface,
 	interface->rx_ring_count = FM10K_DEFAULT_RXD;
 
 	/* set default interrupt moderation */
-	interface->tx_itr = FM10K_ITR_10K;
-	interface->rx_itr = FM10K_ITR_ADAPTIVE | FM10K_ITR_20K;
+	interface->tx_itr = FM10K_TX_ITR_DEFAULT;
+	interface->rx_itr = FM10K_ITR_ADAPTIVE | FM10K_RX_ITR_DEFAULT;
 
 	/* initialize vxlan_port list */
 	INIT_LIST_HEAD(&interface->vxlan_port);
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (12 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 14/17] fm10k: use macro for default Tx and Rx ITR values Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14 15:15   ` Alexander Duyck
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 16/17] fm10k: TRIVIAL fix typo of hardware Jacob Keller
                   ` (2 subsequent siblings)
  16 siblings, 1 reply; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

The current default ITR for Tx is overly restrictive. Using a simple
netperf TCP_STREAM test, we top out at about 10Gb/s for a single thread
when running using 1500 byte frames. By reducing the ITR value to 25usec
(up to 40K interrupts a second from 10K), we are able to achieve 36Gb/s
for a single thread TCP stream test.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index bf2cc6f78418..5eb42fd21d3f 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -171,11 +171,12 @@ struct fm10k_ring_container {
 #define FM10K_ITR_MAX		0x0FFF	/* maximum value for ITR */
 #define FM10K_ITR_10K		100	/* 100us */
 #define FM10K_ITR_20K		50	/* 50us */
+#define FM10K_ITR_40K		25	/* 25us */
 #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation flag */
 
 #define ITR_IS_ADAPTIVE(itr) (!!(itr & FM10K_ITR_ADAPTIVE))
 
-#define FM10K_TX_ITR_DEFAULT	FM10K_ITR_10K
+#define FM10K_TX_ITR_DEFAULT	FM10K_ITR_40K
 #define FM10K_RX_ITR_DEFAULT	FM10K_ITR_20K
 #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
 
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 16/17] fm10k: TRIVIAL fix typo of hardware
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (13 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 17/17] fm10k: TRIVIAL cleanup order at top of fm10k_xmit_frame Jacob Keller
  2015-10-14  0:46 ` [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Allan, Bruce W
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index fec672f66287..91aa57ca8418 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -584,7 +584,7 @@ static void fm10k_configure_tx_ring(struct fm10k_intfc *interface,
 	/* store tail pointer */
 	ring->tail = &interface->uc_addr[FM10K_TDT(reg_idx)];
 
-	/* reset ntu and ntc to place SW in sync with hardwdare */
+	/* reset ntu and ntc to place SW in sync with hardware */
 	ring->next_to_clean = 0;
 	ring->next_to_use = 0;
 
@@ -690,7 +690,7 @@ static void fm10k_configure_rx_ring(struct fm10k_intfc *interface,
 	/* store tail pointer */
 	ring->tail = &interface->uc_addr[FM10K_RDT(reg_idx)];
 
-	/* reset ntu and ntc to place SW in sync with hardwdare */
+	/* reset ntu and ntc to place SW in sync with hardware */
 	ring->next_to_clean = 0;
 	ring->next_to_use = 0;
 	ring->next_to_alloc = 0;
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 17/17] fm10k: TRIVIAL cleanup order at top of fm10k_xmit_frame
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (14 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 16/17] fm10k: TRIVIAL fix typo of hardware Jacob Keller
@ 2015-10-13 23:39 ` Jacob Keller
  2015-10-14  0:46 ` [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Allan, Bruce W
  16 siblings, 0 replies; 40+ messages in thread
From: Jacob Keller @ 2015-10-13 23:39 UTC (permalink / raw)
  To: intel-wired-lan

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index cae6b4e309a9..9bcb803387d0 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -1098,11 +1098,11 @@ static void fm10k_tx_map(struct fm10k_ring *tx_ring,
 netdev_tx_t fm10k_xmit_frame_ring(struct sk_buff *skb,
 				  struct fm10k_ring *tx_ring)
 {
-	struct fm10k_tx_buffer *first;
-	int tso;
-	u32 tx_flags = 0;
-	unsigned short f;
 	u16 count = TXD_USE_COUNT(skb_headlen(skb));
+	struct fm10k_tx_buffer *first;
+	unsigned short f;
+	u32 tx_flags = 0;
+	int tso;
 
 	/* need: 1 descriptor per page * PAGE_SIZE/FM10K_MAX_DATA_PER_TXD,
 	 *       + 1 desc for skb_headlen/FM10K_MAX_DATA_PER_TXD,
-- 
2.6.1.264.gbab76a9


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support
  2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
                   ` (15 preceding siblings ...)
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 17/17] fm10k: TRIVIAL cleanup order at top of fm10k_xmit_frame Jacob Keller
@ 2015-10-14  0:46 ` Allan, Bruce W
  16 siblings, 0 replies; 40+ messages in thread
From: Allan, Bruce W @ 2015-10-14  0:46 UTC (permalink / raw)
  To: intel-wired-lan

> -----Original Message-----
> From: Intel-wired-lan [mailto:intel-wired-lan-bounces at lists.osuosl.org] On
> Behalf Of Jacob Keller
> Sent: Tuesday, October 13, 2015 4:39 PM
> To: Intel Wired LAN
> Subject: [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile
> DCB and DebugFS support
> 
> Rather than wrapping fm10k_dcbnl.c and fm10k_debugfs.c support with
>  #ifdef blocks, just conditionally include the .o files in the Makefile.
> 
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/Makefile        | 20 +++++++++++++++----
> -
>  drivers/net/ethernet/intel/fm10k/fm10k.h         |  2 ++
>  drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c   |  4 ----
>  drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c |  4 ----
>  drivers/net/ethernet/intel/fm10k/fm10k_pci.c     |  4 ++++
>  5 files changed, 21 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/fm10k/Makefile
> b/drivers/net/ethernet/intel/fm10k/Makefile
> index 08859dd220a8..b006ff66d028 100644
> --- a/drivers/net/ethernet/intel/fm10k/Makefile
> +++ b/drivers/net/ethernet/intel/fm10k/Makefile
> @@ -1,7 +1,7 @@
> 
> ##########################################################
> ######################
>  #
>  # Intel Ethernet Switch Host Interface Driver
> -# Copyright(c) 2013 - 2014 Intel Corporation.
> +# Copyright(c) 2013 - 2015 Intel Corporation.

There's no mention of this change in the patch description.

>  #
>  # This program is free software; you can redistribute it and/or modify it
>  # under the terms and conditions of the GNU General Public License,
> @@ -27,7 +27,17 @@
> 
>  obj-$(CONFIG_FM10K) += fm10k.o
> 
> -fm10k-objs := fm10k_main.o fm10k_common.o fm10k_pci.o \
> -	      fm10k_netdev.o fm10k_ethtool.o fm10k_pf.o fm10k_vf.o \
> -	      fm10k_mbx.o fm10k_iov.o fm10k_tlv.o \
> -	      fm10k_debugfs.o fm10k_ptp.o fm10k_dcbnl.o
> +fm10k-y := fm10k_main.o \
> +	   fm10k_common.o \
> +	   fm10k_pci.o \
> +	   fm10k_ptp.o \
> +	   fm10k_netdev.o \
> +	   fm10k_ethtool.o \
> +	   fm10k_pf.o \
> +	   fm10k_vf.o \
> +	   fm10k_mbx.o \
> +	   fm10k_iov.o \
> +	   fm10k_tlv.o
> +
> +fm10k-$(CONFIG_DEBUG_FS) += fm10k_debugfs.o
> +fm10k-$(CONFIG_DCB) += fm10k_dcbnl.o
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h
> b/drivers/net/ethernet/intel/fm10k/fm10k.h
> index 48809e5d3f79..504d487fa800 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k.h
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
> @@ -551,5 +551,7 @@ int fm10k_get_ts_config(struct net_device *netdev,
> struct ifreq *ifr);
>  int fm10k_set_ts_config(struct net_device *netdev, struct ifreq *ifr);
> 
>  /* DCB */
> +#ifdef CONFIG_DCB
>  void fm10k_dcbnl_set_ops(struct net_device *dev);
> +#endif

To avoid the use of CONFIG_DCB in fm10k_pci.c, this should be:

#ifdef CONFIG_DCB
void fm10k_dcbnl_set_ops(struct net_device *dev);
#else
static inline void fm10k_dcbnl_set_ops(void)	{}
#endif

>  #endif /* _FM10K_H_ */
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
> index 5c7a4d7662d8..2be4361839db 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
> @@ -20,7 +20,6 @@
> 
>  #include "fm10k.h"
> 
> -#ifdef CONFIG_DCB
>  /**
>   * fm10k_dcbnl_ieee_getets - get the ETS configuration for the device
>   * @dev: netdev interface for the device
> @@ -155,7 +154,6 @@ static const struct dcbnl_rtnl_ops fm10k_dcbnl_ops
> = {
>  	.setdcbx	= fm10k_dcbnl_setdcbx,
>  };
> 
> -#endif /* CONFIG_DCB */
>  /**
>   * fm10k_dcbnl_set_ops - Configures dcbnl ops pointer for netdev
>   * @dev: netdev interface for the device
> @@ -164,11 +162,9 @@ static const struct dcbnl_rtnl_ops fm10k_dcbnl_ops
> = {
>   **/
>  void fm10k_dcbnl_set_ops(struct net_device *dev)
>  {
> -#ifdef CONFIG_DCB
>  	struct fm10k_intfc *interface = netdev_priv(dev);
>  	struct fm10k_hw *hw = &interface->hw;
> 
>  	if (hw->mac.type == fm10k_mac_pf)
>  		dev->dcbnl_ops = &fm10k_dcbnl_ops;
> -#endif /* CONFIG_DCB */
>  }
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
> index 5304bc1fbecd..5d6137faf7d1 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
> @@ -18,8 +18,6 @@
>   * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-
> 6497
>   */
> 
> -#ifdef CONFIG_DEBUG_FS
> -
>  #include "fm10k.h"
> 
>  #include <linux/debugfs.h>
> @@ -258,5 +256,3 @@ void fm10k_dbg_exit(void)
>  	debugfs_remove_recursive(dbg_root);
>  	dbg_root = NULL;
>  }
> -
> -#endif /* CONFIG_DEBUG_FS */
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> index 5fbffbaefe32..0dece3c07727 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> @@ -1729,8 +1729,10 @@ static int fm10k_sw_init(struct fm10k_intfc
> *interface,
>  		netdev->hw_features &= ~NETIF_F_GSO_UDP_TUNNEL;
>  	}
> 
> +#ifdef CONFIG_DCB
>  	/* initialize DCBNL interface */
>  	fm10k_dcbnl_set_ops(netdev);
> +#endif

The above does not need to be (and should not be) wrapped with #ifdef
CONFIG_DCB with the above change to fm10k.h

> 
>  	/* Initialize service timer and service task */
>  	set_bit(__FM10K_SERVICE_DISABLE, &interface->state);
> @@ -1919,8 +1921,10 @@ static int fm10k_probe(struct pci_dev *pdev,
>  	if (err)
>  		goto err_sw_init;
> 
> +#if IS_ENABLED(CONFIG_DEBUG_FS)
>  	/* enable debugfs support */
>  	fm10k_dbg_intfc_init(interface);
> +#endif

Does not need to be (should not be) wrapped with #if
IS_ENABLED(CONFIG_DEBUG_FS).

> 
>  	err = fm10k_init_queueing_scheme(interface);
>  	if (err)
> --
> 2.6.1.264.gbab76a9
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan at lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors Jacob Keller
@ 2015-10-14  0:46   ` Allan, Bruce W
  2015-10-14 15:57     ` Keller, Jacob E
  2015-10-28  0:47   ` Singh, Krishneil K
  1 sibling, 1 reply; 40+ messages in thread
From: Allan, Bruce W @ 2015-10-14  0:46 UTC (permalink / raw)
  To: intel-wired-lan

> -----Original Message-----
> From: Intel-wired-lan [mailto:intel-wired-lan-bounces at lists.osuosl.org] On
> Behalf Of Jacob Keller
> Sent: Tuesday, October 13, 2015 4:39 PM
> To: Intel Wired LAN
> Subject: [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw
> for errors
> 
> A recent change to the SHARED code makes it such that in some flows,

"SHARED code" has no meaning in the upstream driver, please change this description.

> init_hw may fail. For example, init_hw_vf will fail if the 1st queue is
> not owned by the VF. However, previously some flows called init_hw
> without checking the return code, or checked it but only to display a
> diagnostic message.
> 
> Fix this by (a) always returning and preventing the netdevice from going
> up, and (b) printing the diagnostic in every flow for consistency. This
> should resolve an issue where VF drivers would attempt to come up
> before the PF has finished assigning queues.
> 
> In addition, change the dmesg output to explicitly show the actual
> function that failed, instead of combining reset_hw and init_hw into a
> single check.
> 
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 34
> ++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> index 48f0ef4545a0..9ad9f9164d91 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> @@ -166,9 +166,17 @@ static void fm10k_reinit(struct fm10k_intfc
> *interface)
>  	interface->last_reset = jiffies + (10 * HZ);
> 
>  	/* reset and initialize the hardware so it is in a known state */
> -	err = hw->mac.ops.reset_hw(hw) ? : hw->mac.ops.init_hw(hw);
> -	if (err)
> +	err = hw->mac.ops.reset_hw(hw);
> +	if (err) {
> +		dev_err(&interface->pdev->dev, "reset_hw failed: %d\n",
> err);
> +		goto reinit_err;
> +	}
> +
> +	err = hw->mac.ops.init_hw(hw);
> +	if (err) {
>  		dev_err(&interface->pdev->dev, "init_hw failed: %d\n",
> err);
> +		goto reinit_err;
> +	}
> 
>  	err = fm10k_init_queueing_scheme(interface);
>  	if (err) {
> @@ -202,6 +210,10 @@ static void fm10k_reinit(struct fm10k_intfc
> *interface)
> 
>  	fm10k_iov_resume(interface->pdev);
> 
> +reinit_err:
> +	if (err)
> +		netif_device_detach(netdev);
> +
>  	rtnl_unlock();
> 
>  	clear_bit(__FM10K_RESETTING, &interface->state);
> @@ -1693,7 +1705,13 @@ static int fm10k_sw_init(struct fm10k_intfc
> *interface,
>  	interface->last_reset = jiffies + (10 * HZ);
> 
>  	/* reset and initialize the hardware so it is in a known state */
> -	err = hw->mac.ops.reset_hw(hw) ? : hw->mac.ops.init_hw(hw);
> +	err = hw->mac.ops.reset_hw(hw);
> +	if (err) {
> +		dev_err(&pdev->dev, "reset_hw failed: %d\n", err);
> +		return err;
> +	}
> +
> +	err = hw->mac.ops.init_hw(hw);
>  	if (err) {
>  		dev_err(&pdev->dev, "init_hw failed: %d\n", err);
>  		return err;
> @@ -2077,8 +2095,10 @@ static int fm10k_resume(struct pci_dev *pdev)
> 
>  	/* reset hardware to known state */
>  	err = hw->mac.ops.init_hw(&interface->hw);
> -	if (err)
> +	if (err) {
> +		dev_err(&pdev->dev, "init_hw failed: %d\n", err);
>  		return err;
> +	}
> 
>  	/* reset statistics starting values */
>  	hw->mac.ops.rebind_hw_stats(hw, &interface->stats);
> @@ -2257,7 +2277,11 @@ static void fm10k_io_resume(struct pci_dev
> *pdev)
>  	int err = 0;
> 
>  	/* reset hardware to known state */
> -	hw->mac.ops.init_hw(&interface->hw);
> +	err = hw->mac.ops.init_hw(&interface->hw);
> +	if (err) {
> +		dev_err(&pdev->dev, "init_hw failed: %d\n", err);
> +		return;
> +	}
> 
>  	/* reset statistics starting values */
>  	hw->mac.ops.rebind_hw_stats(hw, &interface->stats);
> --
> 2.6.1.264.gbab76a9
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan at lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 06/17] fm10k: Correct typecast in fm10k_update_xc_addr_pf
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 06/17] fm10k: Correct typecast in fm10k_update_xc_addr_pf Jacob Keller
@ 2015-10-14  0:46   ` Allan, Bruce W
  0 siblings, 0 replies; 40+ messages in thread
From: Allan, Bruce W @ 2015-10-14  0:46 UTC (permalink / raw)
  To: intel-wired-lan

> -----Original Message-----
> From: Intel-wired-lan [mailto:intel-wired-lan-bounces at lists.osuosl.org] On
> Behalf Of Jacob Keller
> Sent: Tuesday, October 13, 2015 4:39 PM
> To: Intel Wired LAN
> Subject: [Intel-wired-lan] [next-queue 06/17] fm10k: Correct typecast in
> fm10k_update_xc_addr_pf
> 
> Since the resultant data type of the mac_update.mac_upper field is u16,
> it does not make sense to typecast u8 variables to u32 first.

Care to update the copyright date in this file while you're already updating it?

> 
> Reported-by: Matthew Vick <matthew.vick@intel.com>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/fm10k_pf.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> index 8c0bdc4e4edd..789ffd9d7fd9 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> @@ -334,8 +334,8 @@ static s32 fm10k_update_xc_addr_pf(struct
> fm10k_hw *hw, u16 glort,
>  						 ((u32)mac[3] << 16) |
>  						 ((u32)mac[4] << 8) |
>  						 ((u32)mac[5]));
> -	mac_update.mac_upper = cpu_to_le16(((u32)mac[0] << 8) |
> -						 ((u32)mac[1]));
> +	mac_update.mac_upper = cpu_to_le16(((u16)mac[0] << 8) |
> +					   ((u16)mac[1]));
>  	mac_update.vlan = cpu_to_le16(vid);
>  	mac_update.glort = cpu_to_le16(glort);
>  	mac_update.action = add ? 0 : 1;
> --
> 2.6.1.264.gbab76a9
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan at lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 08/17] fm10k: add statistics for actual DWORD count of mbmem mailbox
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 08/17] fm10k: add statistics for actual DWORD count of mbmem mailbox Jacob Keller
@ 2015-10-14  0:47   ` Allan, Bruce W
  0 siblings, 0 replies; 40+ messages in thread
From: Allan, Bruce W @ 2015-10-14  0:47 UTC (permalink / raw)
  To: intel-wired-lan

> -----Original Message-----
> From: Intel-wired-lan [mailto:intel-wired-lan-bounces at lists.osuosl.org] On
> Behalf Of Jacob Keller
> Sent: Tuesday, October 13, 2015 4:39 PM
> To: Intel Wired LAN
> Subject: [Intel-wired-lan] [next-queue 08/17] fm10k: add statistics for actual
> DWORD count of mbmem mailbox
> 
> A previous bug was uncovered by addition of a debug stat to indicate the
> actual number of DWORDS we pulled from the mbmem. It turned out this
> was
> not the same as the tx_dwords counter. While the previous bug fix should
> have corrected this in all cases, add some debug stats that count the
> number of DWORDs pushed or pulled from the mbmem. A future debugger
> may
> take advantage of this statistic for debugging purposes.
> 
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 2 ++
>  drivers/net/ethernet/intel/fm10k/fm10k_mbx.c     | 4 ++++
>  drivers/net/ethernet/intel/fm10k/fm10k_mbx.h     | 2 ++

Care to update the copyright date in fm10k_mbx.h while you're already updating it?

>  3 files changed, 8 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
> index 447a5f8da42f..5db5a97cf28e 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
> @@ -114,9 +114,11 @@ static const struct fm10k_stats
> fm10k_gstrings_mbx_stats[] = {
>  	FM10K_MBX_STAT("mbx_tx_oversized", tx_dropped),
>  	FM10K_MBX_STAT("mbx_tx_messages", tx_messages),
>  	FM10K_MBX_STAT("mbx_tx_dwords", tx_dwords),
> +	FM10K_MBX_STAT("mbx_tx_mbmem_pulled",
> tx_mbmem_pulled),
>  	FM10K_MBX_STAT("mbx_rx_messages", rx_messages),
>  	FM10K_MBX_STAT("mbx_rx_dwords", rx_dwords),
>  	FM10K_MBX_STAT("mbx_rx_parse_err", rx_parse_err),
> +	FM10K_MBX_STAT("mbx_rx_mbmem_pushed",
> rx_mbmem_pushed),
>  };
> 
>  #define FM10K_GLOBAL_STATS_LEN
> ARRAY_SIZE(fm10k_gstrings_global_stats)
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
> index af09a1b272e6..2bce47490723 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
> @@ -375,6 +375,8 @@ static void fm10k_mbx_write_copy(struct fm10k_hw
> *hw,
>  			if (!tail)
>  				tail++;
> 
> +			mbx->tx_mbmem_pulled++;
> +
>  			/* write message to hardware FIFO */
>  			fm10k_write_reg(hw, mbmem + tail++, *(head++));
>  		} while (--len && --end);
> @@ -459,6 +461,8 @@ static void fm10k_mbx_read_copy(struct fm10k_hw
> *hw,
>  			if (!head)
>  				head++;
> 
> +			mbx->rx_mbmem_pushed++;
> +
>  			/* read message from hardware FIFO */
>  			*(tail++) = fm10k_read_reg(hw, mbmem +
> head++);
>  		} while (--len && --end);
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
> b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
> index 0419a7f0035e..b8eb6b5be894 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
> @@ -291,8 +291,10 @@ struct fm10k_mbx_info {
>  	u64 tx_dropped;
>  	u64 tx_messages;
>  	u64 tx_dwords;
> +	u64 tx_mbmem_pulled;
>  	u64 rx_messages;
>  	u64 rx_dwords;
> +	u64 rx_mbmem_pushed;
>  	u64 rx_parse_err;
> 
>  	/* Buffer to store messages */
> --
> 2.6.1.264.gbab76a9
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan at lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 10/17] fm10k: add TEB check to fm10k_gre_is_nvgre
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 10/17] fm10k: add TEB check to fm10k_gre_is_nvgre Jacob Keller
@ 2015-10-14  0:47   ` Allan, Bruce W
  0 siblings, 0 replies; 40+ messages in thread
From: Allan, Bruce W @ 2015-10-14  0:47 UTC (permalink / raw)
  To: intel-wired-lan

> -----Original Message-----
> From: Intel-wired-lan [mailto:intel-wired-lan-bounces at lists.osuosl.org] On
> Behalf Of Jacob Keller
> Sent: Tuesday, October 13, 2015 4:39 PM
> To: Intel Wired LAN
> Subject: [Intel-wired-lan] [next-queue 10/17] fm10k: add TEB check to
> fm10k_gre_is_nvgre
> 
> The NVGRE protocol should be run over transparent ethernet bridge
> protocol, so we should verify this in our check whether the GRE tunnel
> is NVGRE.
> 
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/fm10k_main.c | 4 ++++

Care to update the copyright date in this file while you're already updating it?

>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> index 746a1986690b..5dfcba72825d 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> @@ -708,6 +708,10 @@ static struct ethhdr *fm10k_gre_is_nvgre(struct
> sk_buff *skb)
>  	if (nvgre_hdr->flags & FM10K_NVGRE_RESERVED0_FLAGS)
>  		return NULL;
> 
> +	/* verify protocol is transparent Ethernet bridging */
> +	if (nvgre_hdr->proto != htons(ETH_P_TEB))
> +		return NULL;
> +
>  	/* report start of ethernet header */
>  	if (nvgre_hdr->flags & NVGRE_TNI)
>  		return (struct ethhdr *)(nvgre_hdr + 1);
> --
> 2.6.1.264.gbab76a9
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan at lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 11/17] fm10k: Add support for ITR scaling based on PCIe link speed
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 11/17] fm10k: Add support for ITR scaling based on PCIe link speed Jacob Keller
@ 2015-10-14  0:47   ` Allan, Bruce W
  0 siblings, 0 replies; 40+ messages in thread
From: Allan, Bruce W @ 2015-10-14  0:47 UTC (permalink / raw)
  To: intel-wired-lan

> -----Original Message-----
> From: Intel-wired-lan [mailto:intel-wired-lan-bounces at lists.osuosl.org] On
> Behalf Of Jacob Keller
> Sent: Tuesday, October 13, 2015 4:39 PM
> To: Intel Wired LAN
> Subject: [Intel-wired-lan] [next-queue 11/17] fm10k: Add support for ITR
> scaling based on PCIe link speed
> 
> Red Rock Canyon's interrupt throttle timers are based on the PCIe link
> speed. Because of this, the value being programmed into the ITR
> registers must be scaled.

"Red Rock Canyon" is a codename.  Use "The fm10k device" or
"Intel Ethernet Switch FM10000 Host Interface" instead.

> 
> For the PF, this is as simple as reading the PCIe link speed and storing
> the result. However, in the case of SR-IOV, the VF's interrupt throttle
> timers are based on the link speed of the PF. However, the VF is unable
> to get the link speed information from its configuration space, so the
> PF must inform it of what scale to use.
> 
> Rather than pass this scale via mailbox message, take advantage of
> unused bits in the TDLEN register to pass the scale. It is the
> responsibility of the PF to program this for the VF while setting up the
> VF queues and the responsibility of the VF to get the information
> accordingly. This is preferable because it allows the VF to set up the
> interrupts properly during initialization and matches how the MAC
> address is passed in the TDBAL/TDBAH registers.
> 
> Reported-by: Matthew Vick <matthew.vick@intel.com>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/fm10k_pf.c   | 22
> +++++++++++++++++++++-
>  drivers/net/ethernet/intel/fm10k/fm10k_type.h | 15 +++++++++++++++

Care to update the copyright date in fm10k_type.h while you're already updating it?

>  drivers/net/ethernet/intel/fm10k/fm10k_vf.c   | 20
> ++++++++++++++++++--
>  3 files changed, 54 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> index 52f754b35972..9c3a48f04056 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> @@ -150,19 +150,26 @@ static s32 fm10k_init_hw_pf(struct fm10k_hw
> *hw)
>  				FM10K_TPH_RXCTRL_HDR_WROEN);
>  	}
> 
> -	/* set max hold interval to align with 1.024 usec in all modes */
> +	/* set max hold interval to align with 1.024 usec in all modes and
> +	 * store ITR scale
> +	 */
>  	switch (hw->bus.speed) {
>  	case fm10k_bus_speed_2500:
>  		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN1;
> +		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN1;
>  		break;
>  	case fm10k_bus_speed_5000:
>  		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN2;
> +		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN2;
>  		break;
>  	case fm10k_bus_speed_8000:
>  		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN3;
> +		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3;
>  		break;
>  	default:
>  		dma_ctrl = 0;
> +		/* just in case, assume Gen3 ITR scale */
> +		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3;
>  		break;
>  	}
> 
> @@ -903,6 +910,13 @@ static s32
> fm10k_iov_assign_default_mac_vlan_pf(struct fm10k_hw *hw,
>  	fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx), tdbal);
>  	fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx), tdbah);
> 
> +	/* Provide the VF the ITR scale, using software-defined fields in
> TDLEN
> +	 * to pass the information during VF initialization. See definition of
> +	 * FM10K_TDLEN_ITR_SCALE_SHIFT for more details.
> +	 */
> +	fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx), hw->mac.itr_scale
> <<
> +
> FM10K_TDLEN_ITR_SCALE_SHIFT);
> +
>  err_out:
>  	/* configure Queue control register */
>  	txqctl = ((u32)vf_vid << FM10K_TXQCTL_VID_SHIFT) &
> @@ -1035,6 +1049,12 @@ static s32 fm10k_iov_reset_resources_pf(struct
> fm10k_hw *hw,
>  	for (i = queues_per_pool; i--;) {
>  		fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx + i), tdbal);
>  		fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx + i), tdbah);
> +		/* See definition of FM10K_TDLEN_ITR_SCALE_SHIFT for an
> +		 * explanation of how TDLEN is used.
> +		 */
> +		fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx + i),
> +				hw->mac.itr_scale <<
> +				FM10K_TDLEN_ITR_SCALE_SHIFT);
>  		fm10k_write_reg(hw, FM10K_TQMAP(qmap_idx + i),
> vf_q_idx + i);
>  		fm10k_write_reg(hw, FM10K_RQMAP(qmap_idx + i),
> vf_q_idx + i);
>  	}
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_type.h
> b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
> index 35afd711d144..02d370fb8bbb 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_type.h
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
> @@ -272,6 +272,20 @@ struct fm10k_hw;
>  #define FM10K_TDBAL(_n)		((0x40 * (_n)) + 0x8000)
>  #define FM10K_TDBAH(_n)		((0x40 * (_n)) + 0x8001)
>  #define FM10K_TDLEN(_n)		((0x40 * (_n)) + 0x8002)
> +/* When fist initialized, VFs need to know the Interrupt Throttle Rate (ITR)
> + * scale which is based on the PCIe speed but the speed information in the
> PCI
> + * configuration space may not be accurate. The PF already knows the ITR
> scale
> + * but there is no defined method to pass that information from the PF to
> the
> + * VF. This is accomplished during VF initialization by temporarily co-opting
> + * the yet-to-be-used TDLEN register to have the PF store the ITR shift for
> + * the VF to retrieve before the VF needs to use the TDLEN register for its
> + * intended purpose, i.e. before the Tx resources are allocated.
> + */
> +#define FM10K_TDLEN_ITR_SCALE_SHIFT		9
> +#define FM10K_TDLEN_ITR_SCALE_MASK		0x00000E00
> +#define FM10K_TDLEN_ITR_SCALE_GEN1		2
> +#define FM10K_TDLEN_ITR_SCALE_GEN2		1
> +#define FM10K_TDLEN_ITR_SCALE_GEN3		0
>  #define FM10K_TPH_TXCTRL(_n)	((0x40 * (_n)) + 0x8003)
>  #define FM10K_TPH_TXCTRL_DESC_TPHEN		0x00000020
>  #define FM10K_TPH_TXCTRL_DESC_RROEN		0x00000200
> @@ -560,6 +574,7 @@ struct fm10k_mac_info {
>  	bool get_host_state;
>  	bool tx_ready;
>  	u32 dglort_map;
> +	u8 itr_scale;
>  };
> 
>  struct fm10k_swapi_table_info {
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
> index d512575c33f3..2af697df5abc 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
> @@ -28,7 +28,7 @@
>  static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>  {
>  	u8 *perm_addr = hw->mac.perm_addr;
> -	u32 bal = 0, bah = 0;
> +	u32 bal = 0, bah = 0, tdlen;
>  	s32 err;
>  	u16 i;
> 
> @@ -48,6 +48,9 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>  		       ((u32)perm_addr[2]);
>  	}
> 
> +	/* restore default itr_scale for next VF initialization */
> +	tdlen = hw->mac.itr_scale << FM10K_TDLEN_ITR_SCALE_SHIFT;
> +
>  	/* The queues have already been disabled so we just need to
>  	 * update their base address registers
>  	 */
> @@ -56,6 +59,12 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>  		fm10k_write_reg(hw, FM10K_TDBAH(i), bah);
>  		fm10k_write_reg(hw, FM10K_RDBAL(i), bal);
>  		fm10k_write_reg(hw, FM10K_RDBAH(i), bah);
> +		/* Restore ITR scale in software-defined mechanism in
> TDLEN
> +		 * for next VF initialization. See definition of
> +		 * FM10K_TDLEN_ITR_SCALE_SHIFT for more details on the
> use of
> +		 * TDLEN here.
> +		 */
> +		fm10k_write_reg(hw, FM10K_TDLEN(i), tdlen);
>  	}
> 
>  	return 0;
> @@ -131,9 +140,16 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
>  	/* record maximum queue count */
>  	hw->mac.max_queues = i;
> 
> -	/* fetch default VLAN */
> +	/* fetch default VLAN and ITR scale */
>  	hw->mac.default_vid = (fm10k_read_reg(hw, FM10K_TXQCTL(0)) &
>  			       FM10K_TXQCTL_VID_MASK) >>
> FM10K_TXQCTL_VID_SHIFT;
> +	/* Read the ITR scale from TDLEN. See the definition of
> +	 * FM10K_TDLEN_ITR_SCALE_SHIFT for more information about
> how TDLEN is
> +	 * used here.
> +	 */
> +	hw->mac.itr_scale = (fm10k_read_reg(hw, FM10K_TDLEN(0)) &
> +			     FM10K_TDLEN_ITR_SCALE_MASK) >>
> +			    FM10K_TDLEN_ITR_SCALE_SHIFT;
> 
>  	return 0;
> 
> --
> 2.6.1.264.gbab76a9
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan at lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec Jacob Keller
@ 2015-10-14 15:15   ` Alexander Duyck
  2015-10-14 15:59     ` Keller, Jacob E
  0 siblings, 1 reply; 40+ messages in thread
From: Alexander Duyck @ 2015-10-14 15:15 UTC (permalink / raw)
  To: intel-wired-lan

On 10/13/2015 04:39 PM, Jacob Keller wrote:
> The current default ITR for Tx is overly restrictive. Using a simple
> netperf TCP_STREAM test, we top out at about 10Gb/s for a single thread
> when running using 1500 byte frames. By reducing the ITR value to 25usec
> (up to 40K interrupts a second from 10K), we are able to achieve 36Gb/s
> for a single thread TCP stream test.
>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>   drivers/net/ethernet/intel/fm10k/fm10k.h | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
> index bf2cc6f78418..5eb42fd21d3f 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k.h
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
> @@ -171,11 +171,12 @@ struct fm10k_ring_container {
>   #define FM10K_ITR_MAX		0x0FFF	/* maximum value for ITR */
>   #define FM10K_ITR_10K		100	/* 100us */
>   #define FM10K_ITR_20K		50	/* 50us */
> +#define FM10K_ITR_40K		25	/* 25us */
>   #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation flag */
>   
>   #define ITR_IS_ADAPTIVE(itr) (!!(itr & FM10K_ITR_ADAPTIVE))
>   
> -#define FM10K_TX_ITR_DEFAULT	FM10K_ITR_10K
> +#define FM10K_TX_ITR_DEFAULT	FM10K_ITR_40K
>   #define FM10K_RX_ITR_DEFAULT	FM10K_ITR_20K
>   #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)

Even 40K might be a bit restrictive.  I've found that 10Gb NICs need 
about 12K interrupts per second to avoid depleting the socket buffers in 
the case of a UDP test.  You might want to see if you see performance 
gains by taking it up to 50K maybe even 60K.  As far as test cases to 
use for tuning it would be useful to use a UDP_STREAM test instead of 
TCP_STREAM since UDP is not bound by the link partner throughput.

- Alex

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors
  2015-10-14  0:46   ` Allan, Bruce W
@ 2015-10-14 15:57     ` Keller, Jacob E
  0 siblings, 0 replies; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-14 15:57 UTC (permalink / raw)
  To: intel-wired-lan

On Wed, 2015-10-14 at 00:46 +0000, Allan, Bruce W wrote:
> > -----Original Message-----
> > From: Intel-wired-lan [mailto:
> > intel-wired-lan-bounces at lists.osuosl.org] On
> > Behalf Of Jacob Keller
> > Sent: Tuesday, October 13, 2015 4:39 PM
> > To: Intel Wired LAN
> > Subject: [Intel-wired-lan] [next-queue 05/17] fm10k: always check
> > init_hw
> > for errors
> > 
> > A recent change to the SHARED code makes it such that in some
> > flows,
> 
> "SHARED code" has no meaning in the upstream driver, please change
> this description.
> 

Woops, yep, thought I proofread for all these. Will fix.

Regards,
Jake

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-14 15:15   ` Alexander Duyck
@ 2015-10-14 15:59     ` Keller, Jacob E
  2015-10-14 16:23       ` Alexander Duyck
  0 siblings, 1 reply; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-14 15:59 UTC (permalink / raw)
  To: intel-wired-lan

On Wed, 2015-10-14 at 08:15 -0700, Alexander Duyck wrote:
> Even 40K might be a bit restrictive.  I've found that 10Gb NICs need 
> about 12K interrupts per second to avoid depleting the socket buffers
> in 
> the case of a UDP test.  You might want to see if you see performance
> gains by taking it up to 50K maybe even 60K.  As far as test cases to
> use for tuning it would be useful to use a UDP_STREAM test instead of
> TCP_STREAM since UDP is not bound by the link partner throughput.
> 
> - Alex

Thanks Alex. I tried with Tx set to 0usec up to 25, and for my tests
with both TCP and UDP trials, I topped out at 25 and didn't see
improvement beyond that. I think it's a reasonable default, but I
likely have other areas on my system that need tuning first.

I can give some more tests specifically with UDP again to see if any
other changes made that not true.

Regards,
Jake

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-14 15:59     ` Keller, Jacob E
@ 2015-10-14 16:23       ` Alexander Duyck
  2015-10-14 16:31         ` Keller, Jacob E
  2015-10-14 17:57         ` Keller, Jacob E
  0 siblings, 2 replies; 40+ messages in thread
From: Alexander Duyck @ 2015-10-14 16:23 UTC (permalink / raw)
  To: intel-wired-lan

On 10/14/2015 08:59 AM, Keller, Jacob E wrote:
> On Wed, 2015-10-14 at 08:15 -0700, Alexander Duyck wrote:
>> Even 40K might be a bit restrictive.  I've found that 10Gb NICs need
>> about 12K interrupts per second to avoid depleting the socket buffers
>> in
>> the case of a UDP test.  You might want to see if you see performance
>> gains by taking it up to 50K maybe even 60K.  As far as test cases to
>> use for tuning it would be useful to use a UDP_STREAM test instead of
>> TCP_STREAM since UDP is not bound by the link partner throughput.
>>
>> - Alex
> Thanks Alex. I tried with Tx set to 0usec up to 25, and for my tests
> with both TCP and UDP trials, I topped out at 25 and didn't see
> improvement beyond that. I think it's a reasonable default, but I
> likely have other areas on my system that need tuning first.
>
> I can give some more tests specifically with UDP again to see if any
> other changes made that not true.

The 36Gb/s number is pretty impressive and may be pushing you into other 
limits.  What does the CPU utilization look like for your test?  Do you 
see the thread handling the workload get fully utilized?  Also have you 
tried enabling XPS to see if maybe you could squeeze a bit more out of 
the core that is doing the test by improving locality?

- Alex

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-14 16:23       ` Alexander Duyck
@ 2015-10-14 16:31         ` Keller, Jacob E
  2015-10-14 17:57         ` Keller, Jacob E
  1 sibling, 0 replies; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-14 16:31 UTC (permalink / raw)
  To: intel-wired-lan

On Wed, 2015-10-14 at 09:23 -0700, Alexander Duyck wrote:
> The 36Gb/s number is pretty impressive and may be pushing you into
> other 
> limits.  What does the CPU utilization look like for your test?  Do
> you 
> see the thread handling the workload get fully utilized?  Also have
> you 
> tried enabling XPS to see if maybe you could squeeze a bit more out
> of 
> the core that is doing the test by improving locality?
> 
> - Alex

I don't have those offhand but I can go measure. I have already enabled
XPS in my setup with this test, though currently the default driver
doesn't enable it, so I do it manually. Without XPS, I get
significantly worse performance.

I will try to get CPU utilization numbers soon for this. And I'll also
try UDP_STREAM and TCP_STREAM both.

Regards,
Jake

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-14 16:23       ` Alexander Duyck
  2015-10-14 16:31         ` Keller, Jacob E
@ 2015-10-14 17:57         ` Keller, Jacob E
  2015-10-14 23:27           ` Alexander Duyck
  1 sibling, 1 reply; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-14 17:57 UTC (permalink / raw)
  To: intel-wired-lan

On Wed, 2015-10-14 at 09:23 -0700, Alexander Duyck wrote:
> The 36Gb/s number is pretty impressive and may be pushing you into
> other 
> limits.  What does the CPU utilization look like for your test?  Do
> you 
> see the thread handling the workload get fully utilized?  Also have
> you 
> tried enabling XPS to see if maybe you could squeeze a bit more out
> of 
> the core that is doing the test by improving locality?
> 
> - Alex

Hello,

So what I am getting now with a few things. First, I set rx-usecs and
tx-usecs to both 25, and disabled adaptive mode for now...

TCP_STREAM => 33Gb/s, but sometimes drops to 27Gb/s, I believe this is
a result of no rx_flow_steering (Flow director / ATR) support in the
host interface. So sometimes the queue we pick is more local other
times it is not. If I reduce the number of queues and ensure only CPUs
local to the one I am running netperf, I get 33Gb/s pretty straight. I
have not been able to reproduce the 36Gb/s test above, so I may re-word
the commit message...

I used:

./netperf -T0,5 -t TCP_STREAM -f m -c -C  -H 192.168.21.2 -- -m 64K  

So this is with rather large messages that can be run through TSO.

CPU utilization here tops says: 10% local, 10% remote, but if I look at
top on both ends, it shows ~80% CPU utilization on receiver, and about
50% on the transmit end.

I've got a weird issue right now where sometimes it seems to drop to
half and I haven't determined exactly why yet. But I am pretty sure
it's due to queue assignment

I've been getting pretty inconsistent performance results over the last
few tests.

I tried these tests with interrupt moderation disabled completely and I
generally got less performance.

Interestingly, I just set both rx and tx to 10, and got one test
through to report 39Gb/s... But I am definitely not able to
consistently hit this value.

I generally seem to range pretty wide over tests.

For UDP I used:

./netperf -T0,5 -t UDP_STREAM -f m -c -C  -H 192.168.21.2 -- -m 64k  

For this test, I see 80% CPU utilization on the sender, and 50% on the
receiver, when bound as above.

I seem to get ~16Gb/s send and receive here, with no variance... 

I suspect part of this is due to the fact that TCP can do hardware TSO,
which we don't have in UDP? I'm not sure here..

UDP is significantly more stable than TCP was. but it doesn't seem to
ever go above 16Gb/s for a single stream.

I'm still a bit concerned over the instability produced by TCP_STREAM,
but it should be noted that my test setup is far from ideal:

I currently only have a single host interface, and have used network
namespacing to separate the two devices so that it routes over the
physical hardware. So it's a single system test which impacts irq to
CPU binding, as well as queue to CPU binding, and so on. There are a
lot of issues here that impact, but I'm happy to be able to get much
better than 2-3Gb/s like I was before.

Any further suggestions would be appreciated.

Regards,
Jake


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm Jacob Keller
@ 2015-10-14 18:35   ` Alexander Duyck
  2015-10-14 20:12     ` Keller, Jacob E
  0 siblings, 1 reply; 40+ messages in thread
From: Alexander Duyck @ 2015-10-14 18:35 UTC (permalink / raw)
  To: intel-wired-lan

On 10/13/2015 04:39 PM, Jacob Keller wrote:
> The existing adaptive ITR algorithm is overly restrictive. It throttles
> incorrectly for various traffic rates, and does not produce good
> performance. The algorithm now allows for more interrupts per second,
> and does some calculation to help improve for smaller packet loads. In
> addition, take into account the new itr_scale from the hardware which
> indicates how much to scale due to PCIe link speed.
>
> A single thread of receiving TCP_STREAM in netperf:
> - Before: 450 Mbps
> - After: 20,000 Mbps
>
> Reported-by: Matthew Vick <matthew.vick@intel.com>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>   drivers/net/ethernet/intel/fm10k/fm10k.h      |  1 +
>   drivers/net/ethernet/intel/fm10k/fm10k_main.c | 29 +++++++++++++++++++++++----
>   drivers/net/ethernet/intel/fm10k/fm10k_pci.c  |  6 ++++--
>   3 files changed, 30 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
> index c40f50737d17..ceddf39d7cec 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k.h
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
> @@ -164,6 +164,7 @@ struct fm10k_ring_container {
>   	unsigned int total_packets;	/* total packets processed this int */
>   	u16 work_limit;			/* total work allowed per interrupt */
>   	u16 itr;			/* interrupt throttle rate value */
> +	u8 itr_scale;			/* ITR adjustment scaler based on PCI speed */
>   	u8 count;			/* total number of rings in vector */
>   };
>
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> index babde3e4b2bb..cae6b4e309a9 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> @@ -1386,11 +1386,30 @@ static void fm10k_update_itr(struct fm10k_ring_container *ring_container)
>   	if (avg_wire_size > 3000)
>   		avg_wire_size = 3000;
>
> -	/* Give a little boost to mid-size frames */
> -	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
> -		avg_wire_size /= 3;
> +	/* Throttle rate management based on average wire size, attempting to
> +	 * slightly boost small and medium packet loads. Divide the average
> +	 * wire size by a small factor to calculate the minimum time until the
> +	 * next interrupt in microseconds. Save some cycles by using a
> +	 * multiply then a shift, which also accounts for difference due to
> +	 * PCIe link speed.
> +	 */
> +#define FM10K_ITR_SCALE_SMALL	6
> +#define FM10K_ITR_SCALE_MEDIUM	5
> +#define FM10K_ITR_SCALE_LARGE	4
> +
> +	if (avg_wire_size < 300)
> +		avg_wire_size *= FM10K_ITR_SCALE_SMALL;
> +	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
> +		avg_wire_size *= FM10K_ITR_SCALE_MEDIUM;
>   	else
> -		avg_wire_size /= 2;
> +		avg_wire_size *= FM10K_ITR_SCALE_LARGE;
> +

Where is it these scaling values originated from?  Just looking through 
the values I am not sure having this broken out like it is provides much 
value.

What I am getting at is that the input is a packet size, and the output 
is a value somewhere between 2 and 47.  (I think that 47 is still a bit 
high by the way and probably should be something more like 25 which I 
believe you established as the minimum Tx interrupt rate in a later patch.)

What you may want to do is look at pulling in the upper limit to 
something more reasonable like 1536 for avg_wire_size, and then simplify 
this logic a bit.  Specifically what is it you are trying to accomplish 
by tweaking the scale factor like you are?  I assume you are wanting to 
approximate a curve.  If so you might wan to look at possibly including 
an offset value so that you can smooth out the points where your 
intersections occur.

For example what you may want to consider doing would be to instead use 
a multiplication factor for small, an addition value for medium, and for 
large you simply cap it at a certain value.


> +	/* Round up average wire size, then perform bit shift, to ensure that
> +	 * the calculation will never get below 1. Account for changes in ITR
> +	 * value due to PCIe link speed.
> +	 */
> +	avg_wire_size += (1 << (ring_container->itr_scale + 8)) - 1;
> +	avg_wire_size >>= ring_container->itr_scale + 8;
>

You might want to store off the value for itr_scale + 8 somewhere.  It 
is likely you might save a cycle or two, especially if the compiler 
thinks it has to read itr_scale twice.

>   	/* write back value and retain adaptive flag */
>   	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;
> @@ -1608,6 +1627,7 @@ static int fm10k_alloc_q_vector(struct fm10k_intfc *interface,
>   	q_vector->tx.ring = ring;
>   	q_vector->tx.work_limit = FM10K_DEFAULT_TX_WORK;
>   	q_vector->tx.itr = interface->tx_itr;
> +	q_vector->tx.itr_scale = interface->hw.mac.itr_scale;
>   	q_vector->tx.count = txr_count;
>
>   	while (txr_count) {
> @@ -1636,6 +1656,7 @@ static int fm10k_alloc_q_vector(struct fm10k_intfc *interface,
>   	/* save Rx ring container info */
>   	q_vector->rx.ring = ring;
>   	q_vector->rx.itr = interface->rx_itr;
> +	q_vector->rx.itr_scale = interface->hw.mac.itr_scale;
>   	q_vector->rx.count = rxr_count;
>
>   	while (rxr_count) {
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> index 9ad9f9164d91..cbf38da0ada7 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> @@ -880,7 +880,8 @@ static irqreturn_t fm10k_msix_mbx_vf(int __always_unused irq, void *data)
>
>   	/* re-enable mailbox interrupt and indicate 20us delay */
>   	fm10k_write_reg(hw, FM10K_VFITR(FM10K_MBX_VECTOR),
> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY >>
> +					    hw->mac.itr_scale));
>
>   	/* service upstream mailbox */
>   	if (fm10k_mbx_trylock(interface)) {
> @@ -1111,7 +1112,8 @@ static irqreturn_t fm10k_msix_mbx_pf(int __always_unused irq, void *data)
>
>   	/* re-enable mailbox interrupt and indicate 20us delay */
>   	fm10k_write_reg(hw, FM10K_ITR(FM10K_MBX_VECTOR),
> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY >>
> +					    hw->mac.itr_scale));
>
>   	return IRQ_HANDLED;
>   }
>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm
  2015-10-14 18:35   ` Alexander Duyck
@ 2015-10-14 20:12     ` Keller, Jacob E
  2015-10-14 22:40       ` Alexander Duyck
  0 siblings, 1 reply; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-14 20:12 UTC (permalink / raw)
  To: intel-wired-lan

On Wed, 2015-10-14 at 11:35 -0700, Alexander Duyck wrote:
> On 10/13/2015 04:39 PM, Jacob Keller wrote:
> +	 */+#define FM10K_ITR_SCALE_SMALL	6
> > +#define FM10K_ITR_SCALE_MEDIUM	5
> > +#define FM10K_ITR_SCALE_LARGE	4
> > +
> > +	if (avg_wire_size < 300)
> > +		avg_wire_size *= FM10K_ITR_SCALE_SMALL;
> > +	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
> > +		avg_wire_size *= FM10K_ITR_SCALE_MEDIUM;
> >   	else
> > -		avg_wire_size /= 2;
> > +		avg_wire_size *= FM10K_ITR_SCALE_LARGE;
> > +
> 
> Where is it these scaling values originated from?  Just looking
> through 
> the values I am not sure having this broken out like it is provides
> much 
> value.
> 

I am not really sure what exactly you mean here?

> What I am getting at is that the input is a packet size, and the
> output 
> is a value somewhere between 2 and 47.  (I think that 47 is still a
> bit 
> high by the way and probably should be something more like 25 which I
> believe you established as the minimum Tx interrupt rate in a later
> patch.)
> 

The input is two fold for calculation, packet size, and number of
packets.

> What you may want to do is look at pulling in the upper limit to 
> something more reasonable like 1536 for avg_wire_size, and then
> simplify 
> this logic a bit.  Specifically what is it you are trying to
> accomplish 
> by tweaking the scale factor like you are?  I assume you are wanting
> to 
> approximate a curve.  If so you might wan to look at possibly
> including 
> an offset value so that you can smooth out the points where your 
> intersections occur.
> 

I honestly don't know. I mostly took the original work from 6-7 months
ago, and added your suggestions from that series, but maybe that isn't
valid now?

> For example what you may want to consider doing would be to instead
> use 
> a multiplication factor for small, an addition value for medium, and
> for 
> large you simply cap it at a certain value.
> 

So, how would this look? The capping makes sense, we should probably
cap it at around 30 or something? I'm not sure that 25 is the limit,
since for some workloads I think we did calculate that we could use
that few interrupts.. maybe the CPU savings aren't worth it though if
it messes up too many other flows?

I could also try to take a completely different algorithm say from i40e
instead? This one really has limited testing.

> 
> > +	/* Round up average wire size, then perform bit shift, to
> > ensure that
> > +	 * the calculation will never get below 1. Account for
> > changes in ITR
> > +	 * value due to PCIe link speed.
> > +	 */
> > +	avg_wire_size += (1 << (ring_container->itr_scale + 8)) -
> > 1;
> > +	avg_wire_size >>= ring_container->itr_scale + 8;
> > 
> 
> You might want to store off the value for itr_scale + 8 somewhere. 
>  It 
> is likely you might save a cycle or two, especially if the compiler 
> thinks it has to read itr_scale twice.

Fair enough that seems reasonable change.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm
  2015-10-14 20:12     ` Keller, Jacob E
@ 2015-10-14 22:40       ` Alexander Duyck
  2015-10-14 23:50         ` Keller, Jacob E
  0 siblings, 1 reply; 40+ messages in thread
From: Alexander Duyck @ 2015-10-14 22:40 UTC (permalink / raw)
  To: intel-wired-lan

On 10/14/2015 01:12 PM, Keller, Jacob E wrote:
> On Wed, 2015-10-14 at 11:35 -0700, Alexander Duyck wrote:
>> On 10/13/2015 04:39 PM, Jacob Keller wrote:
>> +	 */+#define FM10K_ITR_SCALE_SMALL	6
>>> +#define FM10K_ITR_SCALE_MEDIUM	5
>>> +#define FM10K_ITR_SCALE_LARGE	4
>>> +
>>> +	if (avg_wire_size < 300)
>>> +		avg_wire_size *= FM10K_ITR_SCALE_SMALL;
>>> +	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
>>> +		avg_wire_size *= FM10K_ITR_SCALE_MEDIUM;
>>>    	else
>>> -		avg_wire_size /= 2;
>>> +		avg_wire_size *= FM10K_ITR_SCALE_LARGE;
>>> +
>> Where is it these scaling values originated from?  Just looking
>> through
>> the values I am not sure having this broken out like it is provides
>> much
>> value.
>>
> I am not really sure what exactly you mean here?

The numbers are kind of all over the place.  The result of the math 
above swings back and forth in a saw tooth between values and doesn't 
seem to do anything really consistent.

For example a packet that is 275 in size will have an interrupt rate of 
125K interrupts per second, while a packet that is 276 bytes in size 
with be closer to 166K.  It is basically creating some odd peaks and 
valleys.

>
>> What I am getting at is that the input is a packet size, and the
>> output
>> is a value somewhere between 2 and 47.  (I think that 47 is still a
>> bit
>> high by the way and probably should be something more like 25 which I
>> believe you established as the minimum Tx interrupt rate in a later
>> patch.)
>>
> The input is two fold for calculation, packet size, and number of
> packets.

Last I knew though the average packet size is the only portion we end up 
using to actually set the interrupt moderation rate.

>> What you may want to do is look at pulling in the upper limit to
>> something more reasonable like 1536 for avg_wire_size, and then
>> simplify
>> this logic a bit.  Specifically what is it you are trying to
>> accomplish
>> by tweaking the scale factor like you are?  I assume you are wanting
>> to
>> approximate a curve.  If so you might wan to look at possibly
>> including
>> an offset value so that you can smooth out the points where your
>> intersections occur.
>>
> I honestly don't know. I mostly took the original work from 6-7 months
> ago, and added your suggestions from that series, but maybe that isn't
> valid now?

I suspect some of my views have changed over the last several months 
after dealing with interrupt moderation issues on ixgbe.

The thing I started realizing is that with full sized Ethernet frames, 
1514 bytes in size, you have a maximum rate of something like 4 million 
packets per second at 50Gbps.  It seems like you should be firing an 
interrupt once every 100 packets at least don't you think?  That is why 
I am now thinking at minimum 40K interrupts per second is necessary.  
However a side effect of that is that 100 packets will overrun a UDP 
buffer in most cases as there is only room for about 70 frames based on 
math I saw when I submitted the patch for ixgbe 
(https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c?id=8ac34f10a5ea4c7b6f57dfd52b0693a2b67d9ac4). 
So if I want to take UDP rmem_default into account we would be looking 
at something more like 60K interrupts per second which is getting fairly 
tiny at 17us

>> For example what you may want to consider doing would be to instead
>> use
>> a multiplication factor for small, an addition value for medium, and
>> for
>> large you simply cap it at a certain value.
>>
> So, how would this look? The capping makes sense, we should probably
> cap it at around 30 or something? I'm not sure that 25 is the limit,
> since for some workloads I think we did calculate that we could use
> that few interrupts.. maybe the CPU savings aren't worth it though if
> it messes up too many other flows?

So I have done some math based on an assumption of a 212992 wmem_default 
and a overhead of 640 bytes per packet (256 skb, 64 headroom, 320 shared 
info).  Breaking it all down goes a little something like this:

     wmem_default / (size + overhead) = desired_pkts_per_int
     rate / bits_per_byte / (size + ethernet_overhead) = pkt_rate
     (desired_pkts_per_int / pkt_rate) * usec_per_sec = ITR value

So then when we start plugging in the numbers:
     212992 / (size + 640) = desired_pkts_per_int

we can simplify the second expression by just doing the division now.
     50,000,000,000 / 8 / (size + 24) = pkt_rate
     6,250,000,000/(size + 24) = pkt_rate

Then we just need to plug in the values and reduce:
     (212,992 / (size + 640)) / (6,250,000,000/(size + 24)) * 1,000,000 
= ITR value
     (212,992/(size + 640)) / (6,250/(size + 24)) = ITR value
     (34.078/(size + 640))/(1/(size+24) = ITR value
     (34 * (size + 24))/(size + 640) = ITR value

So now we have the expression we are looking for in order to determine 
the minimum interrupt rate, but we want to avoid the division.  So in 
the end we have something that looks like this in order to generate an 
approximation of the curve without having to do any unnecessary math 
(dropping the +24, and the 3000 limit from earlier):

     /* the following is a crude approximation for the following expression:
      * (34 * (size + 24))/(size + 640) = ITR value
      * /
     if (avg_wire_size <= 360) {
         /* start 333K ints/sec and gradually drop to 77K ints/sec */
         avg_wire_size *= 8;
         avg_wire_size += 376;
     } else if (avg_wire_size <= 1152) {
         /* 77K ints/sec to 45K ints/sec
         avg_wire_size *= 3;
         avg_wire_size += 2176;
     } else if (avg_wire_size <= 1920) {
         /* 45K ints/sec to 38K ints/sec
         avg_wire_size += 4480;
     } else {
         /* plateau@a limit of 38K ints/sec */
         avg_wire_size = 6656;
     }

Mind you the above is a crude approximation, but it should give decent 
performance and it only strays from the values of the original function 
by 1us or 2us and it stays under the curve.  It could probably use some 
tuning and tweaking as well but you get the general idea.

You may even want to tune this to be a bit more aggressive in terms of 
interrupts per second.  I know some distros such as RHEL are still 
running around with an untuned skb_shared_info and such and as a result 
they take up more space resulting in a larger memory footprint.  What 
this would represent is modifying the 640 value in the original function 
to increase based on the extra overhead.  Then it would be necessary to 
modify the slopes, offsets, and transitions points to get the right 
approximation for the new curve.

> I could also try to take a completely different algorithm say from i40e
> instead? This one really has limited testing.

Yeah, this one was a "good enough" solution at the time and as I recall 
it was a clone of the igb interrupt moderation.  Now that you have real 
ports you probably need something better than 1Gbps.

The i40e one is from ixgbe, which was inherited from e1000.  The problem 
with the interrupt moderation in that design is that for any high 
throughput usage everything becomes bulk (8K ints per second). For 1G 
that works fine.  But I can tell you from what I have seen on 10Gbps 
NICs it doesn't do well under any kind of small packet, or single 
threaded throughput test.



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-14 17:57         ` Keller, Jacob E
@ 2015-10-14 23:27           ` Alexander Duyck
  2015-10-14 23:44             ` Keller, Jacob E
  0 siblings, 1 reply; 40+ messages in thread
From: Alexander Duyck @ 2015-10-14 23:27 UTC (permalink / raw)
  To: intel-wired-lan

On 10/14/2015 10:57 AM, Keller, Jacob E wrote:
> On Wed, 2015-10-14 at 09:23 -0700, Alexander Duyck wrote:
>> The 36Gb/s number is pretty impressive and may be pushing you into
>> other
>> limits.  What does the CPU utilization look like for your test?  Do
>> you
>> see the thread handling the workload get fully utilized?  Also have
>> you
>> tried enabling XPS to see if maybe you could squeeze a bit more out
>> of
>> the core that is doing the test by improving locality?
>>
>> - Alex
>
> Hello,
>
> So what I am getting now with a few things. First, I set rx-usecs and
> tx-usecs to both 25, and disabled adaptive mode for now...
>
> TCP_STREAM => 33Gb/s, but sometimes drops to 27Gb/s, I believe this is
> a result of no rx_flow_steering (Flow director / ATR) support in the
> host interface. So sometimes the queue we pick is more local other
> times it is not. If I reduce the number of queues and ensure only CPUs
> local to the one I am running netperf, I get 33Gb/s pretty straight. I
> have not been able to reproduce the 36Gb/s test above, so I may re-word
> the commit message...
>
> I used:
>
> ./netperf -T0,5 -t TCP_STREAM -f m -c -C  -H 192.168.21.2 -- -m 64K
>
> So this is with rather large messages that can be run through TSO.
>
> CPU utilization here tops says: 10% local, 10% remote, but if I look at
> top on both ends, it shows ~80% CPU utilization on receiver, and about
> 50% on the transmit end.
>
> I've got a weird issue right now where sometimes it seems to drop to
> half and I haven't determined exactly why yet. But I am pretty sure
> it's due to queue assignment

Sounds reasonable.  With TCP loss can also play a huge factor, although 
I would assume you probably have no dropped packets correct?

> I've been getting pretty inconsistent performance results over the last
> few tests.
>
> I tried these tests with interrupt moderation disabled completely and I
> generally got less performance.

Completely disabling it will usually do that.  The problem is the rates 
for 50Gbs are insane.  You are looking at 4Mpps even with 1514 byte packets.

> Interestingly, I just set both rx and tx to 10, and got one test
> through to report 39Gb/s... But I am definitely not able to
> consistently hit this value.

The 10us range should be excessive.  I would expect you would see the 
best performance right around the amount of time it should take to 
almost fill the ring or socket buffers without actually ever filling 
them.  Basically it is a game of get as close as you can without going 
over in order to get the fewest interrupts possible.

> I generally seem to range pretty wide over tests.

CPU affinity along with everything else can always make these kind of 
tests pretty messy.  I'm assuming you have power management also 
disabled?  If not that could also cause some pretty wide swings due to 
processor C states and P states.

> For UDP I used:
>
> ./netperf -T0,5 -t UDP_STREAM -f m -c -C  -H 192.168.21.2 -- -m 64k
>
> For this test, I see 80% CPU utilization on the sender, and 50% on the
> receiver, when bound as above.
>
> I seem to get ~16Gb/s send and receive here, with no variance...

The fact that there is no variance likely means something is 
bottlenecking this somewhere early on in the Tx.

> I suspect part of this is due to the fact that TCP can do hardware TSO,
> which we don't have in UDP? I'm not sure here..

TCP will also allow you to have significantly more data in flight in 
many cases.  UDP is normally confined to a fairly small window.

> UDP is significantly more stable than TCP was. but it doesn't seem to
> ever go above 16Gb/s for a single stream.

I'd be interested in seeing the actual numbers.  I know for some 
UDP_STREAM tests I have run it ends up being that one side is 
transmitting a significant amount, while the receiving side is only 
getting a fraction of it because packets are being dropped due to 
overrunning the socket.

> I'm still a bit concerned over the instability produced by TCP_STREAM,
> but it should be noted that my test setup is far from ideal:

Agreed.

> I currently only have a single host interface, and have used network
> namespacing to separate the two devices so that it routes over the
> physical hardware. So it's a single system test which impacts irq to
> CPU binding, as well as queue to CPU binding, and so on. There are a
> lot of issues here that impact, but I'm happy to be able to get much
> better than 2-3Gb/s like I was before.
>
> Any further suggestions would be appreciated.
>
> Regards,
> Jake


The only other thing I can think of is to check flow control, but as I 
recall that is disabled by default with fm10k.

- Alex





^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-14 23:27           ` Alexander Duyck
@ 2015-10-14 23:44             ` Keller, Jacob E
  2015-10-15  2:23               ` Alexander Duyck
  0 siblings, 1 reply; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-14 23:44 UTC (permalink / raw)
  To: intel-wired-lan

On Wed, 2015-10-14 at 16:27 -0700, Alexander Duyck wrote:
> Sounds reasonable.  With TCP loss can also play a huge factor,
> although 
> I would assume you probably have no dropped packets correct?
> 

No drops for UDP, but with TCP I see drops on the receiving partner...
no more than about 200 total though.

> > I've been getting pretty inconsistent performance results over the
> > last
> > few tests.
> > 
> > I tried these tests with interrupt moderation disabled completely
> > and I
> > generally got less performance.
> 
> Completely disabling it will usually do that.  The problem is the
> rates 
> for 50Gbs are insane.  You are looking at 4Mpps even with 1514 byte
> packets.

Ok that makes sense, yea. Too much wakeup causes us to waste a lot.

> 
> > Interestingly, I just set both rx and tx to 10, and got one test
> > through to report 39Gb/s... But I am definitely not able to
> > consistently hit this value.
> 
> The 10us range should be excessive.  I would expect you would see the
> best performance right around the amount of time it should take to 
> almost fill the ring or socket buffers without actually ever filling 
> them.  Basically it is a game of get as close as you can without
> going 
> over in order to get the fewest interrupts possible.
> 

I am not sure what is best, but 10 so far as been I will try a few
others...

> > I generally seem to range pretty wide over tests.
> 
> CPU affinity along with everything else can always make these kind of
> tests pretty messy.  I'm assuming you have power management also 
> disabled?  If not that could also cause some pretty wide swings due
> to 
> processor C states and P states.
> 
> > For UDP I used:
> > 
> > ./netperf -T0,5 -t UDP_STREAM -f m -c -C  -H 192.168.21.2 -- -m 64k
> > 
> > For this test, I see 80% CPU utilization on the sender, and 50% on
> > the
> > receiver, when bound as above.
> > 
> > I seem to get ~16Gb/s send and receive here, with no variance...
> 
> The fact that there is no variance likely means something is 
> bottlenecking this somewhere early on in the Tx.
> 
> > I suspect part of this is due to the fact that TCP can do hardware
> > TSO,
> > which we don't have in UDP? I'm not sure here..
> 
> TCP will also allow you to have significantly more data in flight in 
> many cases.  UDP is normally confined to a fairly small window.
> 

Makes sense.

> > UDP is significantly more stable than TCP was. but it doesn't seem
> > to
> > ever go above 16Gb/s for a single stream.
> 
> I'd be interested in seeing the actual numbers.  I know for some 
> UDP_STREAM tests I have run it ends up being that one side is 
> transmitting a significant amount, while the receiving side is only 
> getting a fraction of it because packets are being dropped due to 
> overrunning the socket.
> 

According to netperf, it doesn't have any dropped packets doing UDP,
ethtool agrees:

MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.21.2 () port 0 AF_INET : cpu bind
Socket  Message  Elapsed      Messages                   CPU      Service
Size    Size     Time         Okay Errors   Throughput   Util     Demand
bytes   bytes    secs            #      #   10^6bits/sec % SS     us/KB

212992   64000   10.00      319414      0    16349.8     9.57     0.959 
212992           10.00      319407           16349.4     9.57     0.959 



So this looks quite low comapred to TCP, but it has no variance.

> > I'm still a bit concerned over the instability produced by
> > TCP_STREAM,
> > but it should be noted that my test setup is far from ideal:
> 
> Agreed.
> 

I can't really get a better one at present because we don't have
hardware with multiple host interfaces on different boxes that is
available to me for long term usage for test case here..

> > I currently only have a single host interface, and have used
> > network
> > namespacing to separate the two devices so that it routes over the
> > physical hardware. So it's a single system test which impacts irq
> > to
> > CPU binding, as well as queue to CPU binding, and so on. There are
> > a
> > lot of issues here that impact, but I'm happy to be able to get
> > much
> > better than 2-3Gb/s like I was before.
> > 
> > Any further suggestions would be appreciated.
> > 
> > Regards,
> > Jake
> 
> 
> The only other thing I can think of is to check flow control, but as
> I 
> recall that is disabled by default with fm10k.
> 
> - Alex
> 


There is no hardware ethernet flow control at all for the fm10k
interface.

Regards,
Jake

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm
  2015-10-14 22:40       ` Alexander Duyck
@ 2015-10-14 23:50         ` Keller, Jacob E
  2015-10-15  2:17           ` Alexander Duyck
  0 siblings, 1 reply; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-14 23:50 UTC (permalink / raw)
  To: intel-wired-lan

On Wed, 2015-10-14 at 15:40 -0700, Alexander Duyck wrote:
> On 10/14/2015 01:12 PM, Keller, Jacob E wrote:
> > On Wed, 2015-10-14 at 11:35 -0700, Alexander Duyck wrote:
> > > On 10/13/2015 04:39 PM, Jacob Keller wrote:
> > > +	 */+#define FM10K_ITR_SCALE_SMALL	6
> > > > +#define FM10K_ITR_SCALE_MEDIUM	5
> > > > +#define FM10K_ITR_SCALE_LARGE	4
> > > > +
> > > > +	if (avg_wire_size < 300)
> > > > +		avg_wire_size *= FM10K_ITR_SCALE_SMALL;
> > > > +	else if ((avg_wire_size >= 300) && (avg_wire_size <
> > > > 1200))
> > > > +		avg_wire_size *= FM10K_ITR_SCALE_MEDIUM;
> > > >    	else
> > > > -		avg_wire_size /= 2;
> > > > +		avg_wire_size *= FM10K_ITR_SCALE_LARGE;
> > > > +
> > > Where is it these scaling values originated from?  Just looking
> > > through
> > > the values I am not sure having this broken out like it is
> > > provides
> > > much
> > > value.
> > > 
> > I am not really sure what exactly you mean here?
> 
> The numbers are kind of all over the place.  The result of the math 
> above swings back and forth in a saw tooth between values and doesn't
> seem to do anything really consistent.
> 
> For example a packet that is 275 in size will have an interrupt rate
> of 
> 125K interrupts per second, while a packet that is 276 bytes in size 
> with be closer to 166K.  It is basically creating some odd peaks and 
> valleys.
> 

Yea ok this does do a lot of weird things. I can definitely look at
implementing what you suggest below and see how it goes..

> > 
> > > What I am getting at is that the input is a packet size, and the
> > > output
> > > is a value somewhere between 2 and 47.  (I think that 47 is still
> > > a
> > > bit
> > > high by the way and probably should be something more like 25
> > > which I
> > > believe you established as the minimum Tx interrupt rate in a
> > > later
> > > patch.)
> > > 
> > The input is two fold for calculation, packet size, and number of
> > packets.
> 
> Last I knew though the average packet size is the only portion we end
> up 
> using to actually set the interrupt moderation rate.
> 

You're correct right now we calculate average wiresize and only use
that, no mention of total packet as well.

> > > What you may want to do is look at pulling in the upper limit to
> > > something more reasonable like 1536 for avg_wire_size, and then
> > > simplify
> > > this logic a bit.  Specifically what is it you are trying to
> > > accomplish
> > > by tweaking the scale factor like you are?  I assume you are
> > > wanting
> > > to
> > > approximate a curve.  If so you might wan to look at possibly
> > > including
> > > an offset value so that you can smooth out the points where your
> > > intersections occur.
> > > 
> > I honestly don't know. I mostly took the original work from 6-7
> > months
> > ago, and added your suggestions from that series, but maybe that
> > isn't
> > valid now?
> 
> I suspect some of my views have changed over the last several months 
> after dealing with interrupt moderation issues on ixgbe.
> 

Sure. It's a complicated problem.

> The thing I started realizing is that with full sized Ethernet
> frames, 
> 1514 bytes in size, you have a maximum rate of something like 4
> million 
> packets per second at 50Gbps.  It seems like you should be firing an 
> interrupt once every 100 packets at least don't you think?  That is
> why 
> I am now thinking at minimum 40K interrupts per second is necessary. 
> However a side effect of that is that 100 packets will overrun a UDP 
> buffer in most cases as there is only room for about 70 frames based
> on 
> math I saw when I submitted the patch for ixgbe 
> (https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/comm
> it/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c?id=8ac34f10a5ea4c7b6
> f57dfd52b0693a2b67d9ac4). 
> So if I want to take UDP rmem_default into account we would be
> looking 
> at something more like 60K interrupts per second which is getting
> fairly 
> tiny at 17us
> 

Ok.

> > > For example what you may want to consider doing would be to
> > > instead
> > > use
> > > a multiplication factor for small, an addition value for medium,
> > > and
> > > for
> > > large you simply cap it at a certain value.
> > > 
> > So, how would this look? The capping makes sense, we should
> > probably
> > cap it at around 30 or something? I'm not sure that 25 is the
> > limit,
> > since for some workloads I think we did calculate that we could use
> > that few interrupts.. maybe the CPU savings aren't worth it though
> > if
> > it messes up too many other flows?
> 
> So I have done some math based on an assumption of a 212992
> wmem_default 
> and a overhead of 640 bytes per packet (256 skb, 64 headroom, 320
> shared 
> info).  Breaking it all down goes a little something like this:
> 
>      wmem_default / (size + overhead) = desired_pkts_per_int
>      rate / bits_per_byte / (size + ethernet_overhead) = pkt_rate
>      (desired_pkts_per_int / pkt_rate) * usec_per_sec = ITR value
> 
> So then when we start plugging in the numbers:
>      212992 / (size + 640) = desired_pkts_per_int
> 
> we can simplify the second expression by just doing the division now.
>      50,000,000,000 / 8 / (size + 24) = pkt_rate
>      6,250,000,000/(size + 24) = pkt_rate
> 
> Then we just need to plug in the values and reduce:
>      (212,992 / (size + 640)) / (6,250,000,000/(size + 24)) *
> 1,000,000 
> = ITR value
>      (212,992/(size + 640)) / (6,250/(size + 24)) = ITR value
>      (34.078/(size + 640))/(1/(size+24) = ITR value
>      (34 * (size + 24))/(size + 640) = ITR value
> 
> So now we have the expression we are looking for in order to
> determine 
> the minimum interrupt rate, but we want to avoid the division.  So in
> the end we have something that looks like this in order to generate
> an 
> approximation of the curve without having to do any unnecessary math 
> (dropping the +24, and the 3000 limit from earlier):
> 
>      /* the following is a crude approximation for the following
> expression:
>       * (34 * (size + 24))/(size + 640) = ITR value
>       * /
>      if (avg_wire_size <= 360) {
>          /* start 333K ints/sec and gradually drop to 77K ints/sec */
>          avg_wire_size *= 8;
>          avg_wire_size += 376;
>      } else if (avg_wire_size <= 1152) {
>          /* 77K ints/sec to 45K ints/sec
>          avg_wire_size *= 3;
>          avg_wire_size += 2176;
>      } else if (avg_wire_size <= 1920) {
>          /* 45K ints/sec to 38K ints/sec
>          avg_wire_size += 4480;
>      } else {
>          /* plateau at a limit of 38K ints/sec */
>          avg_wire_size = 6656;
>      }
> 

So this is calculating the inverse ints/sec? where do we end up
converting this to microseconds. Hmm.

> Mind you the above is a crude approximation, but it should give
> decent 
> performance and it only strays from the values of the original
> function 
> by 1us or 2us and it stays under the curve.  It could probably use
> some 
> tuning and tweaking as well but you get the general idea.
> 
> You may even want to tune this to be a bit more aggressive in terms
> of 
> interrupts per second.  I know some distros such as RHEL are still 
> running around with an untuned skb_shared_info and such and as a
> result 
> they take up more space resulting in a larger memory footprint.  What
> this would represent is modifying the 640 value in the original
> function 
> to increase based on the extra overhead.  Then it would be necessary
> to 
> modify the slopes, offsets, and transitions points to get the right 
> approximation for the new curve.
> 
> > I could also try to take a completely different algorithm say from
> > i40e
> > instead? This one really has limited testing.
> 
> Yeah, this one was a "good enough" solution at the time and as I
> recall 
> it was a clone of the igb interrupt moderation.  Now that you have
> real 
> ports you probably need something better than 1Gbps.
> 
> The i40e one is from ixgbe, which was inherited from e1000.  The
> problem 
> with the interrupt moderation in that design is that for any high 
> throughput usage everything becomes bulk (8K ints per second). For 1G
> that works fine.  But I can tell you from what I have seen on 10Gbps 
> NICs it doesn't do well under any kind of small packet, or single 
> threaded throughput test.
> 
> 

Agreed ok. I will look into this more.

Regards,
Jake

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm
  2015-10-14 23:50         ` Keller, Jacob E
@ 2015-10-15  2:17           ` Alexander Duyck
  2015-10-15 16:32             ` Keller, Jacob E
  0 siblings, 1 reply; 40+ messages in thread
From: Alexander Duyck @ 2015-10-15  2:17 UTC (permalink / raw)
  To: intel-wired-lan

On 10/14/2015 04:50 PM, Keller, Jacob E wrote:
> On Wed, 2015-10-14 at 15:40 -0700, Alexander Duyck wrote:
>> On 10/14/2015 01:12 PM, Keller, Jacob E wrote:
>>> On Wed, 2015-10-14 at 11:35 -0700, Alexander Duyck wrote:
>>>> On 10/13/2015 04:39 PM, Jacob Keller wrote:
>>>> +	 */+#define FM10K_ITR_SCALE_SMALL	6
>>>>> +#define FM10K_ITR_SCALE_MEDIUM	5
>>>>> +#define FM10K_ITR_SCALE_LARGE	4
>>>>> +
>>>>> +	if (avg_wire_size < 300)
>>>>> +		avg_wire_size *= FM10K_ITR_SCALE_SMALL;
>>>>> +	else if ((avg_wire_size >= 300) && (avg_wire_size <
>>>>> 1200))
>>>>> +		avg_wire_size *= FM10K_ITR_SCALE_MEDIUM;
>>>>>     	else
>>>>> -		avg_wire_size /= 2;
>>>>> +		avg_wire_size *= FM10K_ITR_SCALE_LARGE;
>>>>> +
>>>> Where is it these scaling values originated from?  Just looking
>>>> through
>>>> the values I am not sure having this broken out like it is
>>>> provides
>>>> much
>>>> value.
>>>>
>>> I am not really sure what exactly you mean here?
>>
>> The numbers are kind of all over the place.  The result of the math
>> above swings back and forth in a saw tooth between values and doesn't
>> seem to do anything really consistent.
>>
>> For example a packet that is 275 in size will have an interrupt rate
>> of
>> 125K interrupts per second, while a packet that is 276 bytes in size
>> with be closer to 166K.  It is basically creating some odd peaks and
>> valleys.
>>
>
> Yea ok this does do a lot of weird things. I can definitely look at
> implementing what you suggest below and see how it goes..
>
>>>
>>>> What I am getting at is that the input is a packet size, and the
>>>> output
>>>> is a value somewhere between 2 and 47.  (I think that 47 is still
>>>> a
>>>> bit
>>>> high by the way and probably should be something more like 25
>>>> which I
>>>> believe you established as the minimum Tx interrupt rate in a
>>>> later
>>>> patch.)
>>>>
>>> The input is two fold for calculation, packet size, and number of
>>> packets.
>>
>> Last I knew though the average packet size is the only portion we end
>> up
>> using to actually set the interrupt moderation rate.
>>
>
> You're correct right now we calculate average wiresize and only use
> that, no mention of total packet as well.
>
>>>> What you may want to do is look at pulling in the upper limit to
>>>> something more reasonable like 1536 for avg_wire_size, and then
>>>> simplify
>>>> this logic a bit.  Specifically what is it you are trying to
>>>> accomplish
>>>> by tweaking the scale factor like you are?  I assume you are
>>>> wanting
>>>> to
>>>> approximate a curve.  If so you might wan to look at possibly
>>>> including
>>>> an offset value so that you can smooth out the points where your
>>>> intersections occur.
>>>>
>>> I honestly don't know. I mostly took the original work from 6-7
>>> months
>>> ago, and added your suggestions from that series, but maybe that
>>> isn't
>>> valid now?
>>
>> I suspect some of my views have changed over the last several months
>> after dealing with interrupt moderation issues on ixgbe.
>>
>
> Sure. It's a complicated problem.
>
>> The thing I started realizing is that with full sized Ethernet
>> frames,
>> 1514 bytes in size, you have a maximum rate of something like 4
>> million
>> packets per second at 50Gbps.  It seems like you should be firing an
>> interrupt once every 100 packets at least don't you think?  That is
>> why
>> I am now thinking at minimum 40K interrupts per second is necessary.
>> However a side effect of that is that 100 packets will overrun a UDP
>> buffer in most cases as there is only room for about 70 frames based
>> on
>> math I saw when I submitted the patch for ixgbe
>> (https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/comm
>> it/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c?id=8ac34f10a5ea4c7b6
>> f57dfd52b0693a2b67d9ac4).
>> So if I want to take UDP rmem_default into account we would be
>> looking
>> at something more like 60K interrupts per second which is getting
>> fairly
>> tiny at 17us
>>
>
> Ok.
>
>>>> For example what you may want to consider doing would be to
>>>> instead
>>>> use
>>>> a multiplication factor for small, an addition value for medium,
>>>> and
>>>> for
>>>> large you simply cap it at a certain value.
>>>>
>>> So, how would this look? The capping makes sense, we should
>>> probably
>>> cap it at around 30 or something? I'm not sure that 25 is the
>>> limit,
>>> since for some workloads I think we did calculate that we could use
>>> that few interrupts.. maybe the CPU savings aren't worth it though
>>> if
>>> it messes up too many other flows?
>>
>> So I have done some math based on an assumption of a 212992
>> wmem_default
>> and a overhead of 640 bytes per packet (256 skb, 64 headroom, 320
>> shared
>> info).  Breaking it all down goes a little something like this:
>>
>>       wmem_default / (size + overhead) = desired_pkts_per_int
>>       rate / bits_per_byte / (size + ethernet_overhead) = pkt_rate
>>       (desired_pkts_per_int / pkt_rate) * usec_per_sec = ITR value
>>
>> So then when we start plugging in the numbers:
>>       212992 / (size + 640) = desired_pkts_per_int
>>
>> we can simplify the second expression by just doing the division now.
>>       50,000,000,000 / 8 / (size + 24) = pkt_rate
>>       6,250,000,000/(size + 24) = pkt_rate
>>
>> Then we just need to plug in the values and reduce:
>>       (212,992 / (size + 640)) / (6,250,000,000/(size + 24)) *
>> 1,000,000
>> = ITR value
>>       (212,992/(size + 640)) / (6,250/(size + 24)) = ITR value
>>       (34.078/(size + 640))/(1/(size+24) = ITR value
>>       (34 * (size + 24))/(size + 640) = ITR value
>>
>> So now we have the expression we are looking for in order to
>> determine
>> the minimum interrupt rate, but we want to avoid the division.  So in
>> the end we have something that looks like this in order to generate
>> an
>> approximation of the curve without having to do any unnecessary math
>> (dropping the +24, and the 3000 limit from earlier):
>>
>>       /* the following is a crude approximation for the following
>> expression:
>>        * (34 * (size + 24))/(size + 640) = ITR value
>>        * /
>>       if (avg_wire_size <= 360) {
>>           /* start 333K ints/sec and gradually drop to 77K ints/sec */
>>           avg_wire_size *= 8;
>>           avg_wire_size += 376;
>>       } else if (avg_wire_size <= 1152) {
>>           /* 77K ints/sec to 45K ints/sec
>>           avg_wire_size *= 3;
>>           avg_wire_size += 2176;
>>       } else if (avg_wire_size <= 1920) {
>>           /* 45K ints/sec to 38K ints/sec
>>           avg_wire_size += 4480;
>>       } else {
>>           /* plateau at a limit of 38K ints/sec */
>>           avg_wire_size = 6656;
>>       }
>>
>
> So this is calculating the inverse ints/sec? where do we end up
> converting this to microseconds. Hmm.

The value generated at the end is in microseconds.  Basically this would 
replace all the code that is located before your shift at the end that 
uses the PCIe factor + 8.  So everything from the size > 3000 check down 
to the adjustments for the other sizes.

I kind of always hated the algorithms that try to work everything out in 
terms of interrupts and then have to flip it over and into microseconds. 
  This is just working with the raw microseconds like the fm10k driver 
currently does.

>> Mind you the above is a crude approximation, but it should give
>> decent
>> performance and it only strays from the values of the original
>> function
>> by 1us or 2us and it stays under the curve.  It could probably use
>> some
>> tuning and tweaking as well but you get the general idea.
>>
>> You may even want to tune this to be a bit more aggressive in terms
>> of
>> interrupts per second.  I know some distros such as RHEL are still
>> running around with an untuned skb_shared_info and such and as a
>> result
>> they take up more space resulting in a larger memory footprint.  What
>> this would represent is modifying the 640 value in the original
>> function
>> to increase based on the extra overhead.  Then it would be necessary
>> to
>> modify the slopes, offsets, and transitions points to get the right
>> approximation for the new curve.
>>
>>> I could also try to take a completely different algorithm say from
>>> i40e
>>> instead? This one really has limited testing.
>>
>> Yeah, this one was a "good enough" solution at the time and as I
>> recall
>> it was a clone of the igb interrupt moderation.  Now that you have
>> real
>> ports you probably need something better than 1Gbps.
>>
>> The i40e one is from ixgbe, which was inherited from e1000.  The
>> problem
>> with the interrupt moderation in that design is that for any high
>> throughput usage everything becomes bulk (8K ints per second). For 1G
>> that works fine.  But I can tell you from what I have seen on 10Gbps
>> NICs it doesn't do well under any kind of small packet, or single
>> threaded throughput test.
>>
>>
>
> Agreed ok. I will look into this more.
>
> Regards,
> Jake

I'll probably be submitting some patches for ixgbe at some point to try 
and shape it into something similar.  Really the only issue right now is 
how to deal with the combined Rx/Tx ITR register since ixgbe only has 
one whereas the FM10K can maintain a separate one in each direction.

- Alex


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-14 23:44             ` Keller, Jacob E
@ 2015-10-15  2:23               ` Alexander Duyck
  2015-10-15 16:35                 ` Keller, Jacob E
  0 siblings, 1 reply; 40+ messages in thread
From: Alexander Duyck @ 2015-10-15  2:23 UTC (permalink / raw)
  To: intel-wired-lan

On 10/14/2015 04:44 PM, Keller, Jacob E wrote:
> On Wed, 2015-10-14 at 16:27 -0700, Alexander Duyck wrote:
>> Sounds reasonable.  With TCP loss can also play a huge factor,
>> although
>> I would assume you probably have no dropped packets correct?
>>
>
> No drops for UDP, but with TCP I see drops on the receiving partner...
> no more than about 200 total though.
>
>>> I've been getting pretty inconsistent performance results over the
>>> last
>>> few tests.
>>>
>>> I tried these tests with interrupt moderation disabled completely
>>> and I
>>> generally got less performance.
>>
>> Completely disabling it will usually do that.  The problem is the
>> rates
>> for 50Gbs are insane.  You are looking at 4Mpps even with 1514 byte
>> packets.
>
> Ok that makes sense, yea. Too much wakeup causes us to waste a lot.
>
>>
>>> Interestingly, I just set both rx and tx to 10, and got one test
>>> through to report 39Gb/s... But I am definitely not able to
>>> consistently hit this value.
>>
>> The 10us range should be excessive.  I would expect you would see the
>> best performance right around the amount of time it should take to
>> almost fill the ring or socket buffers without actually ever filling
>> them.  Basically it is a game of get as close as you can without
>> going
>> over in order to get the fewest interrupts possible.
>>
>
> I am not sure what is best, but 10 so far as been I will try a few
> others...
>
>>> I generally seem to range pretty wide over tests.
>>
>> CPU affinity along with everything else can always make these kind of
>> tests pretty messy.  I'm assuming you have power management also
>> disabled?  If not that could also cause some pretty wide swings due
>> to
>> processor C states and P states.
>>
>>> For UDP I used:
>>>
>>> ./netperf -T0,5 -t UDP_STREAM -f m -c -C  -H 192.168.21.2 -- -m 64k
>>>
>>> For this test, I see 80% CPU utilization on the sender, and 50% on
>>> the
>>> receiver, when bound as above.
>>>
>>> I seem to get ~16Gb/s send and receive here, with no variance...
>>
>> The fact that there is no variance likely means something is
>> bottlenecking this somewhere early on in the Tx.
>>
>>> I suspect part of this is due to the fact that TCP can do hardware
>>> TSO,
>>> which we don't have in UDP? I'm not sure here..
>>
>> TCP will also allow you to have significantly more data in flight in
>> many cases.  UDP is normally confined to a fairly small window.
>>
>
> Makes sense.
>
>>> UDP is significantly more stable than TCP was. but it doesn't seem
>>> to
>>> ever go above 16Gb/s for a single stream.
>>
>> I'd be interested in seeing the actual numbers.  I know for some
>> UDP_STREAM tests I have run it ends up being that one side is
>> transmitting a significant amount, while the receiving side is only
>> getting a fraction of it because packets are being dropped due to
>> overrunning the socket.
>>
>
> According to netperf, it doesn't have any dropped packets doing UDP,
> ethtool agrees:
>
> MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.21.2 () port 0 AF_INET : cpu bind
> Socket  Message  Elapsed      Messages                   CPU      Service
> Size    Size     Time         Okay Errors   Throughput   Util     Demand
> bytes   bytes    secs            #      #   10^6bits/sec % SS     us/KB
>
> 212992   64000   10.00      319414      0    16349.8     9.57     0.959
> 212992           10.00      319407           16349.4     9.57     0.959
>
>
>
> So this looks quite low comapred to TCP, but it has no variance.

So one thing you might try doing is modifying the wmem_default sysctl 
value to something larger.  Maybe try doubling it to see if that has any 
effect.  The issue could be that you aren't able to get enough packets 
in flight to actually saturate the link.

>>> I'm still a bit concerned over the instability produced by
>>> TCP_STREAM,
>>> but it should be noted that my test setup is far from ideal:
>>
>> Agreed.
>>
>
> I can't really get a better one at present because we don't have
> hardware with multiple host interfaces on different boxes that is
> available to me for long term usage for test case here..
>
>>> I currently only have a single host interface, and have used
>>> network
>>> namespacing to separate the two devices so that it routes over the
>>> physical hardware. So it's a single system test which impacts irq
>>> to
>>> CPU binding, as well as queue to CPU binding, and so on. There are
>>> a
>>> lot of issues here that impact, but I'm happy to be able to get
>>> much
>>> better than 2-3Gb/s like I was before.
>>>
>>> Any further suggestions would be appreciated.
>>>
>>> Regards,
>>> Jake
>>
>>
>> The only other thing I can think of is to check flow control, but as
>> I
>> recall that is disabled by default with fm10k.
>>
>> - Alex
>>
>
>
> There is no hardware ethernet flow control at all for the fm10k
> interface.

There is a flow control mechanism in the form of DMA back pressure that 
can be exerted by the PEP interfaces.  It is controlled via the ethtool 
flow control interface.  I believe it is defaulted to off though since 
that is what is in the current upstream driver.

- Alex


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm
  2015-10-15  2:17           ` Alexander Duyck
@ 2015-10-15 16:32             ` Keller, Jacob E
  0 siblings, 0 replies; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-15 16:32 UTC (permalink / raw)
  To: intel-wired-lan

> -----Original Message-----
> From: Alexander Duyck [mailto:alexander.duyck at gmail.com]
> Sent: Wednesday, October 14, 2015 7:17 PM
> To: Keller, Jacob E; intel-wired-lan at lists.osuosl.org
> Subject: Re: [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive
> ITR algorithm
> 
> >
> > So this is calculating the inverse ints/sec? where do we end up
> > converting this to microseconds. Hmm.
> 
> The value generated at the end is in microseconds.  Basically this would
> replace all the code that is located before your shift at the end that
> uses the PCIe factor + 8.  So everything from the size > 3000 check down
> to the adjustments for the other sizes.
> 
> I kind of always hated the algorithms that try to work everything out in
> terms of interrupts and then have to flip it over and into microseconds.
>   This is just working with the raw microseconds like the fm10k driver
> currently does.
> 

Ok, that makes sense. I figured that out a few minutes after sending the last email while I was on my way home. I'll try to code this up and see how it goes.

Regards,
Jake


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec
  2015-10-15  2:23               ` Alexander Duyck
@ 2015-10-15 16:35                 ` Keller, Jacob E
  0 siblings, 0 replies; 40+ messages in thread
From: Keller, Jacob E @ 2015-10-15 16:35 UTC (permalink / raw)
  To: intel-wired-lan



> -----Original Message-----
> From: Alexander Duyck [mailto:alexander.duyck at gmail.com]
> Sent: Wednesday, October 14, 2015 7:24 PM
> To: Keller, Jacob E; intel-wired-lan at lists.osuosl.org
> Subject: Re: [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx
> ITR to 25usec
> 
> > There is no hardware ethernet flow control at all for the fm10k
> > interface.
> 
> There is a flow control mechanism in the form of DMA back pressure that
> can be exerted by the PEP interfaces.  It is controlled via the ethtool
> flow control interface.  I believe it is defaulted to off though since
> that is what is in the current upstream driver.
> 
> - Alex

Ah yes that. It is disabled.

Regards,
Jake

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors
  2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors Jacob Keller
  2015-10-14  0:46   ` Allan, Bruce W
@ 2015-10-28  0:47   ` Singh, Krishneil K
  1 sibling, 0 replies; 40+ messages in thread
From: Singh, Krishneil K @ 2015-10-28  0:47 UTC (permalink / raw)
  To: intel-wired-lan



-----Original Message-----
From: Intel-wired-lan [mailto:intel-wired-lan-bounces at lists.osuosl.org] On Behalf Of Jacob Keller
Sent: Tuesday, October 13, 2015 4:39 PM
To: Intel Wired LAN <intel-wired-lan@lists.osuosl.org>
Subject: [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors

A recent change to the SHARED code makes it such that in some flows, init_hw may fail. For example, init_hw_vf will fail if the 1st queue is not owned by the VF. However, previously some flows called init_hw without checking the return code, or checked it but only to display a diagnostic message.

Fix this by (a) always returning and preventing the netdevice from going up, and (b) printing the diagnostic in every flow for consistency. This should resolve an issue where VF drivers would attempt to come up before the PF has finished assigning queues.

In addition, change the dmesg output to explicitly show the actual function that failed, instead of combining reset_hw and init_hw into a single check.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---

Tested-by: Krishneil Singh <Krishneil.k.singh@intel.com>



^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2015-10-28  0:47 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-13 23:38 [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Jacob Keller
2015-10-13 23:38 ` [Intel-wired-lan] [next-queue 02/17] fm10k: set netdev features in one location Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 03/17] fm10k: reinitialize queuing scheme after calling init_hw Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 04/17] fm10k: reset max_queues on init_hw_vf failure Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 05/17] fm10k: always check init_hw for errors Jacob Keller
2015-10-14  0:46   ` Allan, Bruce W
2015-10-14 15:57     ` Keller, Jacob E
2015-10-28  0:47   ` Singh, Krishneil K
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 06/17] fm10k: Correct typecast in fm10k_update_xc_addr_pf Jacob Keller
2015-10-14  0:46   ` Allan, Bruce W
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 07/17] fm10k: explicitly typecast vlan values to u16 Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 08/17] fm10k: add statistics for actual DWORD count of mbmem mailbox Jacob Keller
2015-10-14  0:47   ` Allan, Bruce W
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 09/17] fm10k: rename mbx_tx_oversized statistic to mbx_tx_dropped Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 10/17] fm10k: add TEB check to fm10k_gre_is_nvgre Jacob Keller
2015-10-14  0:47   ` Allan, Bruce W
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 11/17] fm10k: Add support for ITR scaling based on PCIe link speed Jacob Keller
2015-10-14  0:47   ` Allan, Bruce W
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 12/17] fm10k: introduce ITR_IS_ADAPTIVE macro Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 13/17] fm10k: Update adaptive ITR algorithm Jacob Keller
2015-10-14 18:35   ` Alexander Duyck
2015-10-14 20:12     ` Keller, Jacob E
2015-10-14 22:40       ` Alexander Duyck
2015-10-14 23:50         ` Keller, Jacob E
2015-10-15  2:17           ` Alexander Duyck
2015-10-15 16:32             ` Keller, Jacob E
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 14/17] fm10k: use macro for default Tx and Rx ITR values Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 15/17] fm10k: change default Tx ITR to 25usec Jacob Keller
2015-10-14 15:15   ` Alexander Duyck
2015-10-14 15:59     ` Keller, Jacob E
2015-10-14 16:23       ` Alexander Duyck
2015-10-14 16:31         ` Keller, Jacob E
2015-10-14 17:57         ` Keller, Jacob E
2015-10-14 23:27           ` Alexander Duyck
2015-10-14 23:44             ` Keller, Jacob E
2015-10-15  2:23               ` Alexander Duyck
2015-10-15 16:35                 ` Keller, Jacob E
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 16/17] fm10k: TRIVIAL fix typo of hardware Jacob Keller
2015-10-13 23:39 ` [Intel-wired-lan] [next-queue 17/17] fm10k: TRIVIAL cleanup order at top of fm10k_xmit_frame Jacob Keller
2015-10-14  0:46 ` [Intel-wired-lan] [next-queue 01/17] fm10k: conditionally compile DCB and DebugFS support Allan, Bruce W

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.