All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] e1000e: Add delays after writing to registers
@ 2015-10-21 22:07 Jonathan David
  2015-10-22  5:59 ` Henrik Austad
  0 siblings, 1 reply; 11+ messages in thread
From: Jonathan David @ 2015-10-21 22:07 UTC (permalink / raw)
  To: linux-rt-users; +Cc: josh.cartwright, Jonathan David

There is a noticeable impact on determinism when a large number of
writes are flushed. Writes to the hardware registers are sent across
the PCI bus and take a significant amount of time to complete after
a flush, which causes high priority tasks (including interrupts) to
be delayed.

Adding a delay after long series of writes gives them time to
complete, and for higher priority tasks to run unimpeded.

Signed-off-by: Jonathan David <jonathan.david@ni.com>
---
 drivers/net/ethernet/intel/Kconfig            |  9 +++++++++
 drivers/net/ethernet/intel/e1000/e1000.h      |  3 +++
 drivers/net/ethernet/intel/e1000/e1000_main.c |  5 +++++
 drivers/net/ethernet/intel/e1000e/82571.c     |  3 +++
 drivers/net/ethernet/intel/e1000e/e1000.h     |  4 ++++
 drivers/net/ethernet/intel/e1000e/mac.c       |  3 +++
 drivers/net/ethernet/intel/e1000e/netdev.c    | 18 +++++++++++++++++-
 7 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index f4ff465..4c1fba2 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -85,6 +85,15 @@ config E1000E
 	  To compile this driver as a module, choose M here. The module
 	  will be called e1000e.
 
+config E1000_DELAY
+	bool "Add delays to e1000x drivers"
+	default n
+	depends on (E1000E || E1000)
+	---help---
+	  Enable delays after large numbers of MMIO writes to registers.
+	  The delays aid in preventing noticeable impact on real-time
+	  performance when a connection is interrupted.
+
 config IGB
 	tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support"
 	depends on PCI
diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index 6970710..ea405f3 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -223,6 +223,9 @@ struct e1000_rx_ring {
 #define E1000_TX_DESC(R, i)		E1000_GET_DESC(R, i, e1000_tx_desc)
 #define E1000_CONTEXT_DESC(R, i)	E1000_GET_DESC(R, i, e1000_context_desc)
 
+/* Time to wait after writing large amount of data to registers */
+#define E1000_WR_DELAY_RNG(val)         usleep_range(val*2, val*4)
+
 /* board specific private data structure */
 
 struct e1000_adapter {
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 983eb4e..eb57148 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -2331,6 +2331,11 @@ static void e1000_set_rx_mode(struct net_device *netdev)
 		 */
 		E1000_WRITE_REG_ARRAY(hw, MTA, i, mcarray[i]);
 	}
+
+#ifdef CONFIG_E1000_DELAY
+	E1000_WR_DELAY_RNG(mta_reg_count);
+#endif
+
 	E1000_WRITE_FLUSH();
 
 	if (hw->mac_type == e1000_82542_rev2_0)
diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c
index 32e7775..3e381fe 100644
--- a/drivers/net/ethernet/intel/e1000e/82571.c
+++ b/drivers/net/ethernet/intel/e1000e/82571.c
@@ -998,6 +998,9 @@ static s32 e1000_reset_hw_82571(struct e1000_hw *hw)
 
 	e_dbg("Issuing a global reset to MAC\n");
 	ew32(CTRL, ctrl | E1000_CTRL_RST);
+#ifdef CONFIG_E1000_DELAY
+	E1000_WR_DELAY();
+#endif
 
 	/* Must release MDIO ownership and mutex after MAC reset. */
 	switch (hw->mac.type) {
diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index 0abc942..48db3df 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -89,6 +89,10 @@ struct e1000_info;
 /* Time to wait before putting the device into D3 if there's no link (in ms). */
 #define LINK_TIMEOUT		100
 
+/* Time to wait after writing large amount of data to registers */
+#define E1000_WR_DELAY()		usleep_range(100, 150)
+#define E1000_WR_DELAY_RNG(val)		usleep_range(val*2, val*4)
+
 /* Count for polling __E1000_RESET condition every 10-20msec.
  * Experimentation has shown the reset can take approximately 210msec.
  */
diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c
index 30b74d5..d5ec122 100644
--- a/drivers/net/ethernet/intel/e1000e/mac.c
+++ b/drivers/net/ethernet/intel/e1000e/mac.c
@@ -353,6 +353,9 @@ void e1000e_update_mc_addr_list_generic(struct e1000_hw *hw,
 	/* replace the entire MTA table */
 	for (i = hw->mac.mta_reg_count - 1; i >= 0; i--)
 		E1000_WRITE_REG_ARRAY(hw, E1000_MTA, i, hw->mac.mta_shadow[i]);
+#ifdef CONFIG_E1000_DELAY
+	E1000_WR_DELAY_RNG(hw->mac.mta_reg_count);
+#endif
 	e1e_flush();
 }
 
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 68913d1..18112ef 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3360,6 +3360,9 @@ static int e1000e_write_uc_addr_list(struct net_device *netdev)
 	struct e1000_hw *hw = &adapter->hw;
 	unsigned int rar_entries;
 	int count = 0;
+#ifdef CONFIG_E1000_DELAY
+	unsigned int rar_count;
+#endif
 
 	rar_entries = hw->mac.ops.rar_get_count(hw);
 
@@ -3391,12 +3394,22 @@ static int e1000e_write_uc_addr_list(struct net_device *netdev)
 			count++;
 		}
 	}
-
+	
+	/* preserve number of remaining RAR entries for delay
+	 * function in order to prevent latency issues caused by
+	 * MMIO writes */
+#ifdef CONFIG_E1000_DELAY
+	rar_count = rar_entries;
+#endif
+	
 	/* zero out the remaining RAR entries not used above */
 	for (; rar_entries > 0; rar_entries--) {
 		ew32(RAH(rar_entries), 0);
 		ew32(RAL(rar_entries), 0);
 	}
+#ifdef CONFIG_E1000_DELAY
+	E1000_WR_DELAY_RNG(rar_count);
+#endif
 	e1e_flush();
 
 	return count;
@@ -3476,6 +3489,9 @@ static void e1000e_setup_rss_hash(struct e1000_adapter *adapter)
 	/* Direct all traffic to queue 0 */
 	for (i = 0; i < 32; i++)
 		ew32(RETA(i), 0);
+#ifdef CONFIG_E1000_DELAY
+	E1000_WR_DELAY();
+#endif
 
 	/* Disable raw packet checksumming so that RSS hash is placed in
 	 * descriptor on writeback.
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-10-21 22:07 [RFC] e1000e: Add delays after writing to registers Jonathan David
@ 2015-10-22  5:59 ` Henrik Austad
  2015-11-03 17:43   ` Jonathan David
  0 siblings, 1 reply; 11+ messages in thread
From: Henrik Austad @ 2015-10-22  5:59 UTC (permalink / raw)
  To: Jonathan David; +Cc: linux-rt-users, josh.cartwright

[-- Attachment #1: Type: text/plain, Size: 7654 bytes --]

On Wed, Oct 21, 2015 at 05:07:48PM -0500, Jonathan David wrote:
> There is a noticeable impact on determinism when a large number of
> writes are flushed. Writes to the hardware registers are sent across
> the PCI bus and take a significant amount of time to complete after
> a flush, which causes high priority tasks (including interrupts) to
> be delayed.

Do you see this in the entire system, or on the core where the write was 
triggered?

> Adding a delay after long series of writes gives them time to
> complete, and for higher priority tasks to run unimpeded.

Aren't we running with threaded interrupts?

What happens to the thread(s) pushing data to the network?
What about xmit-buffer once it is full? Which thread will block on send or 
have its sk_buff dropped?

I'm not sure if adding random delay and giving an unpredictable impact on 
completely random threads is the best way to solve this..

Are you sure that there are no other ways of avoiding the latency-spikes 
you see?

-H
(some more comments below)

> Signed-off-by: Jonathan David <jonathan.david@ni.com>
> ---
>  drivers/net/ethernet/intel/Kconfig            |  9 +++++++++
>  drivers/net/ethernet/intel/e1000/e1000.h      |  3 +++
>  drivers/net/ethernet/intel/e1000/e1000_main.c |  5 +++++
>  drivers/net/ethernet/intel/e1000e/82571.c     |  3 +++
>  drivers/net/ethernet/intel/e1000e/e1000.h     |  4 ++++
>  drivers/net/ethernet/intel/e1000e/mac.c       |  3 +++
>  drivers/net/ethernet/intel/e1000e/netdev.c    | 18 +++++++++++++++++-
>  7 files changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
> index f4ff465..4c1fba2 100644
> --- a/drivers/net/ethernet/intel/Kconfig
> +++ b/drivers/net/ethernet/intel/Kconfig
> @@ -85,6 +85,15 @@ config E1000E
>  	  To compile this driver as a module, choose M here. The module
>  	  will be called e1000e.
>  
> +config E1000_DELAY
> +	bool "Add delays to e1000x drivers"
> +	default n
> +	depends on (E1000E || E1000)
> +	---help---
> +	  Enable delays after large numbers of MMIO writes to registers.
> +	  The delays aid in preventing noticeable impact on real-time
> +	  performance when a connection is interrupted.
> +
>  config IGB
>  	tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support"
>  	depends on PCI
> diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
> index 6970710..ea405f3 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000/e1000.h
> @@ -223,6 +223,9 @@ struct e1000_rx_ring {
>  #define E1000_TX_DESC(R, i)		E1000_GET_DESC(R, i, e1000_tx_desc)
>  #define E1000_CONTEXT_DESC(R, i)	E1000_GET_DESC(R, i, e1000_context_desc)
>  
> +/* Time to wait after writing large amount of data to registers */
> +#define E1000_WR_DELAY_RNG(val)         usleep_range(val*2, val*4)
> +

wait, so we delay for 'val' but get somewhere above *double* that?

Do we really want to add *random* delay to a network driver?

>  /* board specific private data structure */
>  
>  struct e1000_adapter {
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index 983eb4e..eb57148 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -2331,6 +2331,11 @@ static void e1000_set_rx_mode(struct net_device *netdev)
>  		 */
>  		E1000_WRITE_REG_ARRAY(hw, MTA, i, mcarray[i]);
>  	}
> +
> +#ifdef CONFIG_E1000_DELAY
> +	E1000_WR_DELAY_RNG(mta_reg_count);
> +#endif
> +
>  	E1000_WRITE_FLUSH();
>  
>  	if (hw->mac_type == e1000_82542_rev2_0)
> diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c
> index 32e7775..3e381fe 100644
> --- a/drivers/net/ethernet/intel/e1000e/82571.c
> +++ b/drivers/net/ethernet/intel/e1000e/82571.c
> @@ -998,6 +998,9 @@ static s32 e1000_reset_hw_82571(struct e1000_hw *hw)
>  
>  	e_dbg("Issuing a global reset to MAC\n");
>  	ew32(CTRL, ctrl | E1000_CTRL_RST);
> +#ifdef CONFIG_E1000_DELAY
> +	E1000_WR_DELAY();
> +#endif
>  
>  	/* Must release MDIO ownership and mutex after MAC reset. */
>  	switch (hw->mac.type) {
> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
> index 0abc942..48db3df 100644
> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
> @@ -89,6 +89,10 @@ struct e1000_info;
>  /* Time to wait before putting the device into D3 if there's no link (in ms). */
>  #define LINK_TIMEOUT		100
>  
> +/* Time to wait after writing large amount of data to registers */
> +#define E1000_WR_DELAY()		usleep_range(100, 150)
> +#define E1000_WR_DELAY_RNG(val)		usleep_range(val*2, val*4)
> +

Apart from the fact that 'adding random delay' is bad, why replicate this 
in several headers? If you're going to add this, I'm sure others want part 
of this as well.

>  /* Count for polling __E1000_RESET condition every 10-20msec.
>   * Experimentation has shown the reset can take approximately 210msec.
>   */
> diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c
> index 30b74d5..d5ec122 100644
> --- a/drivers/net/ethernet/intel/e1000e/mac.c
> +++ b/drivers/net/ethernet/intel/e1000e/mac.c
> @@ -353,6 +353,9 @@ void e1000e_update_mc_addr_list_generic(struct e1000_hw *hw,
>  	/* replace the entire MTA table */
>  	for (i = hw->mac.mta_reg_count - 1; i >= 0; i--)
>  		E1000_WRITE_REG_ARRAY(hw, E1000_MTA, i, hw->mac.mta_shadow[i]);
> +#ifdef CONFIG_E1000_DELAY
> +	E1000_WR_DELAY_RNG(hw->mac.mta_reg_count);
> +#endif
>  	e1e_flush();
>  }
>  
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
> index 68913d1..18112ef 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> @@ -3360,6 +3360,9 @@ static int e1000e_write_uc_addr_list(struct net_device *netdev)
>  	struct e1000_hw *hw = &adapter->hw;
>  	unsigned int rar_entries;
>  	int count = 0;
> +#ifdef CONFIG_E1000_DELAY
> +	unsigned int rar_count;
> +#endif
>  
>  	rar_entries = hw->mac.ops.rar_get_count(hw);
>  
> @@ -3391,12 +3394,22 @@ static int e1000e_write_uc_addr_list(struct net_device *netdev)
>  			count++;
>  		}
>  	}
> -
> +	
> +	/* preserve number of remaining RAR entries for delay
> +	 * function in order to prevent latency issues caused by
> +	 * MMIO writes */
> +#ifdef CONFIG_E1000_DELAY
> +	rar_count = rar_entries;
> +#endif
> +	
>  	/* zero out the remaining RAR entries not used above */
>  	for (; rar_entries > 0; rar_entries--) {
>  		ew32(RAH(rar_entries), 0);
>  		ew32(RAL(rar_entries), 0);
>  	}
> +#ifdef CONFIG_E1000_DELAY
> +	E1000_WR_DELAY_RNG(rar_count);
> +#endif
>  	e1e_flush();
>  
>  	return count;
> @@ -3476,6 +3489,9 @@ static void e1000e_setup_rss_hash(struct e1000_adapter *adapter)
>  	/* Direct all traffic to queue 0 */
>  	for (i = 0; i < 32; i++)
>  		ew32(RETA(i), 0);
> +#ifdef CONFIG_E1000_DELAY
> +	E1000_WR_DELAY();
> +#endif
>  
>  	/* Disable raw packet checksumming so that RSS hash is placed in
>  	 * descriptor on writeback.
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Henrik Austad

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-10-22  5:59 ` Henrik Austad
@ 2015-11-03 17:43   ` Jonathan David
  2015-11-03 19:42     ` Henrik Austad
  0 siblings, 1 reply; 11+ messages in thread
From: Jonathan David @ 2015-11-03 17:43 UTC (permalink / raw)
  To: Henrik Austad; +Cc: linux-rt-users, josh.cartwright

On 10/22/2015 12:59 AM, Henrik Austad wrote:
> On Wed, Oct 21, 2015 at 05:07:48PM -0500, Jonathan David wrote:
>> There is a noticeable impact on determinism when a large number of
>> writes are flushed. Writes to the hardware registers are sent across
>> the PCI bus and take a significant amount of time to complete after
>> a flush, which causes high priority tasks (including interrupts) to
>> be delayed.
>
> Do you see this in the entire system, or on the core where the write was
> triggered?

Only on the core where the writes are issued.

>> Adding a delay after long series of writes gives them time to
>> complete, and for higher priority tasks to run unimpeded.
>
> Aren't we running with threaded interrupts?
>
> What happens to the thread(s) pushing data to the network?
> What about xmit-buffer once it is full? Which thread will block on send or
> have its sk_buff dropped?

All of this is totally irrelevant to the problem we are seeing.

The e1000x driver itself is not responsible for the delay here. The 
issue is with PCI where issuing a large number of MMIO writes followed 
by a read (to force said writes to execute) will stall the CPU. When the 
CPU is stalled, no interrupts are serviced, including the local apic 
timer interrupt, which was responsible for waking up cyclictest. This 
behavior was observed within traces gathered from cyclictest with ftrace 
enabled.

> I'm not sure if adding random delay and giving an unpredictable impact on
> completely random threads is the best way to solve this..

Agreed, we know that this is a hack. Do you have any better solutions?

- JD

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-11-03 17:43   ` Jonathan David
@ 2015-11-03 19:42     ` Henrik Austad
  2015-11-03 22:10       ` Jonathan David
  0 siblings, 1 reply; 11+ messages in thread
From: Henrik Austad @ 2015-11-03 19:42 UTC (permalink / raw)
  To: Jonathan David; +Cc: linux-rt-users, josh.cartwright

[-- Attachment #1: Type: text/plain, Size: 2808 bytes --]

On Tue, Nov 03, 2015 at 11:43:21AM -0600, Jonathan David wrote:
> On 10/22/2015 12:59 AM, Henrik Austad wrote:
> >On Wed, Oct 21, 2015 at 05:07:48PM -0500, Jonathan David wrote:
> >>There is a noticeable impact on determinism when a large number of
> >>writes are flushed. Writes to the hardware registers are sent across
> >>the PCI bus and take a significant amount of time to complete after
> >>a flush, which causes high priority tasks (including interrupts) to
> >>be delayed.
> >
> >Do you see this in the entire system, or on the core where the write was
> >triggered?
> 
> Only on the core where the writes are issued.

Ok, not a global freeze then :)

> >>Adding a delay after long series of writes gives them time to
> >>complete, and for higher priority tasks to run unimpeded.
> >
> >Aren't we running with threaded interrupts?
> >
> >What happens to the thread(s) pushing data to the network?
> >What about xmit-buffer once it is full? Which thread will block on send or
> >have its sk_buff dropped?
> 
> All of this is totally irrelevant to the problem we are seeing.

If this is irrelevant, why hack at the network-driver, hmm?

> The e1000x driver itself is not responsible for the delay here. 

... then why hack the network-driver?

> The issue is with PCI where issuing a large number of MMIO writes 
> followed by a read (to force said writes to execute) will stall the CPU. 
> When the CPU is stalled, no interrupts are serviced, including the local 
> apic timer interrupt, which was responsible for waking up cyclictest. 
> This behavior was observed within traces gathered from cyclictest with 
> ftrace enabled.

So you get bogged down with interrupts disabled; so back to my question, 
are we not running with threaded interrupts? What exactly are we triggering 
here?

> >I'm not sure if adding random delay and giving an unpredictable impact on
> >completely random threads is the best way to solve this..
> 
> Agreed, we know that this is a hack. Do you have any better solutions?

- Send less data over the network ;)
- affine network interrupts to a non-rt core and place rt-tasks on cores 
  shielded from interrupts (why are you not doing this already?)
- Look at the PCI driver and add breathers there (e.g. after X MMIO 
  writes, enable interrupts, disable interrupts and continue)

I'm sure adding random delay to the network-driver solves your particular 
problem, and that is fine, but I fear that you'll just end up opening a 
nasty can of worms. Debugging this when (not if) it fails is not going to 
be very pretty methinks.

Anyhow, I'm not about to tell you what you can or cannot do, just that I 
see some rather nasty bumps in the road ahead of you with the current 
solution.

-- 
Henrik Austad

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-11-03 19:42     ` Henrik Austad
@ 2015-11-03 22:10       ` Jonathan David
  2015-11-06  5:53         ` Henrik Austad
  0 siblings, 1 reply; 11+ messages in thread
From: Jonathan David @ 2015-11-03 22:10 UTC (permalink / raw)
  To: Henrik Austad; +Cc: linux-rt-users, josh.cartwright

On 11/03/2015 01:42 PM, Henrik Austad wrote:
> On Tue, Nov 03, 2015 at 11:43:21AM -0600, Jonathan David wrote:
>> On 10/22/2015 12:59 AM, Henrik Austad wrote:

>>>> Adding a delay after long series of writes gives them time to
>>>> complete, and for higher priority tasks to run unimpeded.
>>>
>>> Aren't we running with threaded interrupts?
>>>
>>> What happens to the thread(s) pushing data to the network?
>>> What about xmit-buffer once it is full? Which thread will block on send or
>>> have its sk_buff dropped?
>>
>> All of this is totally irrelevant to the problem we are seeing.
>
> If this is irrelevant, why hack at the network-driver, hmm?

It is relevant to the network driver, as this is where the symptoms were 
discovered; however, it has no relation to the packet delivery path. 
This is related purely to link configuration.

>> The e1000x driver itself is not responsible for the delay here.
>
> ... then why hack the network-driver?

Lack of better known options.

>> The issue is with PCI where issuing a large number of MMIO writes
>> followed by a read (to force said writes to execute) will stall the CPU.
>> When the CPU is stalled, no interrupts are serviced, including the local
>> apic timer interrupt, which was responsible for waking up cyclictest.
>> This behavior was observed within traces gathered from cyclictest with
>> ftrace enabled.
>
> So you get bogged down with interrupts disabled;

No, interrupts are entirely enabled while the PCI MMIO writes/read are 
issued; but the local apic timer still arrives late, presumably because 
the CPU is waiting to complete whatever writes remain in the buffer.

I think this might be the root of our miscommunication. You are asking 
good questions about threaded interrupts, etc, but it isn't clear how 
they are related to the specific problem we are seeing.

Thanks,

- JD

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-11-03 22:10       ` Jonathan David
@ 2015-11-06  5:53         ` Henrik Austad
  2015-11-06 11:08           ` Thomas Gleixner
  0 siblings, 1 reply; 11+ messages in thread
From: Henrik Austad @ 2015-11-06  5:53 UTC (permalink / raw)
  To: Jonathan David; +Cc: linux-rt-users, josh.cartwright

[-- Attachment #1: Type: text/plain, Size: 2458 bytes --]

On Tue, Nov 03, 2015 at 04:10:23PM -0600, Jonathan David wrote:
> On 11/03/2015 01:42 PM, Henrik Austad wrote:
> >On Tue, Nov 03, 2015 at 11:43:21AM -0600, Jonathan David wrote:
> >>On 10/22/2015 12:59 AM, Henrik Austad wrote:
> 
> >>>>Adding a delay after long series of writes gives them time to
> >>>>complete, and for higher priority tasks to run unimpeded.
> >>>
> >>>Aren't we running with threaded interrupts?
> >>>
> >>>What happens to the thread(s) pushing data to the network?
> >>>What about xmit-buffer once it is full? Which thread will block on send or
> >>>have its sk_buff dropped?
> >>
> >>All of this is totally irrelevant to the problem we are seeing.
> >
> >If this is irrelevant, why hack at the network-driver, hmm?
> 
> It is relevant to the network driver, as this is where the symptoms were
> discovered; however, it has no relation to the packet delivery path. This is
> related purely to link configuration.

I was under the impression that a PCI link configuration/training was down 
to speed etc, not how many MMIO read/writes it could do. Then again, a lot 
of this stuff is pure (black) magic.

> >>The e1000x driver itself is not responsible for the delay here.
> >
> >... then why hack the network-driver?
> 
> Lack of better known options.
> 
> >>The issue is with PCI where issuing a large number of MMIO writes
> >>followed by a read (to force said writes to execute) will stall the CPU.
> >>When the CPU is stalled, no interrupts are serviced, including the local
> >>apic timer interrupt, which was responsible for waking up cyclictest.
> >>This behavior was observed within traces gathered from cyclictest with
> >>ftrace enabled.
> >
> >So you get bogged down with interrupts disabled;
> 
> No, interrupts are entirely enabled while the PCI MMIO writes/read are
> issued; but the local apic timer still arrives late, presumably because the
> CPU is waiting to complete whatever writes remain in the buffer.

Heh, strange, is the interrupt signal itself delivered late as well, or 
just the handling of it?

> I think this might be the root of our miscommunication. You are asking good
> questions about threaded interrupts, etc, but it isn't clear how they are
> related to the specific problem we are seeing.

Perhaps a trace of the problem could be shared?

A full function-trace with irq-events and timer-events would be appreciated 
:)

-- 
Henrik Austad

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-11-06  5:53         ` Henrik Austad
@ 2015-11-06 11:08           ` Thomas Gleixner
  2015-11-06 15:46             ` Armin Steinhoff
  2015-11-06 18:38             ` Jonathan David
  0 siblings, 2 replies; 11+ messages in thread
From: Thomas Gleixner @ 2015-11-06 11:08 UTC (permalink / raw)
  To: Henrik Austad; +Cc: Jonathan David, linux-rt-users, josh.cartwright

On Fri, 6 Nov 2015, Henrik Austad wrote:
> On Tue, Nov 03, 2015 at 04:10:23PM -0600, Jonathan David wrote:
> > On 11/03/2015 01:42 PM, Henrik Austad wrote:
> > >On Tue, Nov 03, 2015 at 11:43:21AM -0600, Jonathan David wrote:
> > >>On 10/22/2015 12:59 AM, Henrik Austad wrote:
> > 
> > >>>>Adding a delay after long series of writes gives them time to
> > >>>>complete, and for higher priority tasks to run unimpeded.
> > >>>
> > >>>Aren't we running with threaded interrupts?
> > >>>
> > >>>What happens to the thread(s) pushing data to the network?
> > >>>What about xmit-buffer once it is full? Which thread will block on send or
> > >>>have its sk_buff dropped?
> > >>
> > >>All of this is totally irrelevant to the problem we are seeing.
> > >
> > >If this is irrelevant, why hack at the network-driver, hmm?
> > 
> > It is relevant to the network driver, as this is where the symptoms were
> > discovered; however, it has no relation to the packet delivery path. This is
> > related purely to link configuration.
> 
> I was under the impression that a PCI link configuration/training was down 
> to speed etc, not how many MMIO read/writes it could do. Then again, a lot 
> of this stuff is pure (black) magic.

This is not about PCI link training. Jonathan is talking about the
network link configuration.

> > >>The issue is with PCI where issuing a large number of MMIO writes
> > >>followed by a read (to force said writes to execute) will stall the CPU.
> > >>When the CPU is stalled, no interrupts are serviced, including the local
> > >>apic timer interrupt, which was responsible for waking up cyclictest.
> > >>This behavior was observed within traces gathered from cyclictest with
> > >>ftrace enabled.
> > >
> > >So you get bogged down with interrupts disabled;
> > 
> > No, interrupts are entirely enabled while the PCI MMIO writes/read are
> > issued; but the local apic timer still arrives late, presumably because the
> > CPU is waiting to complete whatever writes remain in the buffer.
> 
> Heh, strange, is the interrupt signal itself delivered late as well, or 
> just the handling of it?

The CPU stalls on the IO read, so the interrupt cannot be handled by
the CPU until that stall is resolved. The timer fires correctly.

The problem here is that even if the I/O memory of the network device
is mapped uncached, the PCI bus itself is allowed to buffer and do
write combining. That's done to overcome the bottleneck of waiting for
each single write transaction to complete.

The PCI bus guarantees that the writes are not reordered versus a
read. That's why drivers use a read after a series of writes to make
sure that the writes have reached the device and are not longer in the
PCI buffer queue.

Now that read after a sequence of writes has the effect that the CPU
has to wait for the writes to be finished before the read can take
place. During that time the CPU just sits there and twiddles
thumbs. It's a full stall.

Now my question is how big is the induced latency.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-11-06 11:08           ` Thomas Gleixner
@ 2015-11-06 15:46             ` Armin Steinhoff
  2015-11-06 17:22               ` Thomas Gleixner
  2015-11-06 18:38             ` Jonathan David
  1 sibling, 1 reply; 11+ messages in thread
From: Armin Steinhoff @ 2015-11-06 15:46 UTC (permalink / raw)
  To: Thomas Gleixner, Henrik Austad
  Cc: Jonathan David, linux-rt-users, josh.cartwright

Thomas Gleixner schrieb:
> On Fri, 6 Nov 2015, Henrik Austad wrote:
>> On Tue, Nov 03, 2015 at 04:10:23PM -0600, Jonathan David wrote:
>>> On 11/03/2015 01:42 PM, Henrik Austad wrote:
>>>> On Tue, Nov 03, 2015 at 11:43:21AM -0600, Jonathan David wrote:
>>>>> On 10/22/2015 12:59 AM, Henrik Austad wrote:
>>>>>>> Adding a delay after long series of writes gives them time to
>>>>>>> complete, and for higher priority tasks to run unimpeded.
>>>>>> Aren't we running with threaded interrupts?
>>>>>>
>>>>>> What happens to the thread(s) pushing data to the network?
>>>>>> What about xmit-buffer once it is full? Which thread will block on send or
>>>>>> have its sk_buff dropped?
>>>>> All of this is totally irrelevant to the problem we are seeing.
>>>> If this is irrelevant, why hack at the network-driver, hmm?
>>> It is relevant to the network driver, as this is where the symptoms were
>>> discovered; however, it has no relation to the packet delivery path. This is
>>> related purely to link configuration.
>> I was under the impression that a PCI link configuration/training was down
>> to speed etc, not how many MMIO read/writes it could do. Then again, a lot
>> of this stuff is pure (black) magic.
> This is not about PCI link training. Jonathan is talking about the
> network link configuration.
>
>>>>> The issue is with PCI where issuing a large number of MMIO writes
>>>>> followed by a read (to force said writes to execute) will stall the CPU.
>>>>> When the CPU is stalled, no interrupts are serviced, including the local
>>>>> apic timer interrupt, which was responsible for waking up cyclictest.
>>>>> This behavior was observed within traces gathered from cyclictest with
>>>>> ftrace enabled.
>>>> So you get bogged down with interrupts disabled;
>>> No, interrupts are entirely enabled while the PCI MMIO writes/read are
>>> issued; but the local apic timer still arrives late, presumably because the
>>> CPU is waiting to complete whatever writes remain in the buffer.
>> Heh, strange, is the interrupt signal itself delivered late as well, or
>> just the handling of it?
> The CPU stalls on the IO read, so the interrupt cannot be handled by
> the CPU until that stall is resolved. The timer fires correctly.
>
> The problem here is that even if the I/O memory of the network device
> is mapped uncached, the PCI bus itself is allowed to buffer and do
> write combining. That's done to overcome the bottleneck of waiting for
> each single write transaction to complete.
>
> The PCI bus guarantees that the writes are not reordered versus a
> read. That's why drivers use a read after a series of writes to make
> sure that the writes have reached the device and are not longer in the
> PCI buffer queue.
>
> Now that read after a sequence of writes has the effect that the CPU
> has to wait for the writes to be finished before the read can take
> place. During that time the CPU just sits there and twiddles
> thumbs. It's a full stall.

What means "the CPU" ? Does it mean ALL CPU cores are stalled ??

> Now my question is how big is the induced latency.
>
> Thanks,
>
> 	tglx
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-11-06 15:46             ` Armin Steinhoff
@ 2015-11-06 17:22               ` Thomas Gleixner
  0 siblings, 0 replies; 11+ messages in thread
From: Thomas Gleixner @ 2015-11-06 17:22 UTC (permalink / raw)
  To: Armin Steinhoff
  Cc: Henrik Austad, Jonathan David, linux-rt-users, josh.cartwright

On Fri, 6 Nov 2015, Armin Steinhoff wrote:
> Thomas Gleixner schrieb:
> > Now that read after a sequence of writes has the effect that the CPU
> > has to wait for the writes to be finished before the read can take
> > place. During that time the CPU just sits there and twiddles
> > thumbs. It's a full stall.
> 
> What means "the CPU" ? Does it mean ALL CPU cores are stalled ??

The CPU means the core on which the write/read sequence happened.

But any other core which is doing a concurrent read over PCI-E even on
an unrelated address space can be affected as well.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] e1000e: Add delays after writing to registers
  2015-11-06 11:08           ` Thomas Gleixner
  2015-11-06 15:46             ` Armin Steinhoff
@ 2015-11-06 18:38             ` Jonathan David
  2016-01-12 15:37               ` AW: " eg Engleder Gerhard
  1 sibling, 1 reply; 11+ messages in thread
From: Jonathan David @ 2015-11-06 18:38 UTC (permalink / raw)
  To: Thomas Gleixner, Henrik Austad; +Cc: linux-rt-users, josh.cartwright

On 11/06/2015 05:08 AM, Thomas Gleixner wrote:
[..]
>
> Now my question is how big is the induced latency.
>

Here is one such trace we've captured.

---
cyclictest-2345  [002]    73.674891: hrtimer_init:         hrtimer=0xffff8800b5b1feb8 clockid=CLOCK_MONOTONIC mode=HRTIMER_MODE_ABS
cyclictest-2345  [002]    73.674891: function:                   do_nanosleep
cyclictest-2345  [002]    73.674891: function:                      __hrtimer_start_range_ns
cyclictest-2345  [002]    73.674891: function:                         lock_hrtimer_base.isra.24
cyclictest-2345  [002]    73.674891: function:                            _raw_spin_lock_irqsave
cyclictest-2345  [002]    73.674891: function:                         enqueue_hrtimer
cyclictest-2345  [002]    73.674891: hrtimer_start:        hrtimer=0xffff8800b5b1feb8 function=hrtimer_wakeup expires=73712055622 softexpires=73712055622
...

...
avahi-daemon-1385  [002]    73.675075: function:                               e1000e_set_rx_mode
avahi-daemon-1385  [002]    73.675078: function:                                  __kmalloc
avahi-daemon-1385  [002]    73.675079: function:                                     kmalloc_slab
avahi-daemon-1385  [002]    73.675079: function:                                  e1000e_update_mc_addr_list_generic
avahi-daemon-1385  [002]    73.675080: bputs:                e1000e_update_mc_addr_list_generic: replace MTA table
avahi-daemon-1385  [002]    73.675159: function:             smp_apic_timer_interrupt
avahi-daemon-1385  [002]    73.675159: function:                irq_enter
avahi-daemon-1385  [002]    73.675159: function:                   rcu_irq_enter
avahi-daemon-1385  [002]    73.675159: function:                exit_idle
avahi-daemon-1385  [002]    73.675159: function:                local_apic_timer_interrupt
avahi-daemon-1385  [002]    73.675159: function:                hrtimer_interrupt
avahi-daemon-1385  [002]    73.675159: function:                   _raw_spin_lock
avahi-daemon-1385  [002]    73.675159: function:                   ktime_get_update_offsets
avahi-daemon-1385  [002]    73.675159: function:                   __run_hrtimer
avahi-daemon-1385  [002]    73.675160: hrtimer_cancel:       hrtimer=0xffff8800b5b1feb8
avahi-daemon-1385  [002]    73.675160: function:                      __remove_hrtimer
avahi-daemon-1385  [002]    73.675160: hrtimer_expire_entry: hrtimer=0xffff8800b5b1feb8 now=73712084997 function=hrtimer_wakeup/0x0
---

After the wake-up timer is armed, the e1000e driver handles the network
link configuration after the NIC was reset. When the function
"e1000e_update_mc_addr_list_generic" is called, the MTA table is
rewritten (the writing of this table consists of >70 MMIO writes
followed by a read).

We see the hrtimer entry expire at the value 73712084997 near the bottom of the
trace.

73712084997ns - 73712055622ns = 29375ns

So, we see a ~29us latency, in comparison to a good case where we see about 1us.
Consistently, when we've seen large cyclictest latencies, we see that the e1000e
driver calls a function which uses e1e_flush after a large series of writes.

- JD

^ permalink raw reply	[flat|nested] 11+ messages in thread

* AW: [RFC] e1000e: Add delays after writing to registers
  2015-11-06 18:38             ` Jonathan David
@ 2016-01-12 15:37               ` eg Engleder Gerhard
  0 siblings, 0 replies; 11+ messages in thread
From: eg Engleder Gerhard @ 2016-01-12 15:37 UTC (permalink / raw)
  To: Jonathan David
  Cc: linux-rt-users, josh.cartwright, Henrik Austad, Thomas Gleixner

> >
> > Now my question is how big is the induced latency.
> >
> After the wake-up timer is armed, the e1000e driver handles the network link
> configuration after the NIC was reset. When the function
> "e1000e_update_mc_addr_list_generic" is called, the MTA table is rewritten (the
> writing of this table consists of >70 MMIO writes followed by a read).
> 
> We see the hrtimer entry expire at the value 73712084997 near the bottom of the
> trace.
> 
> 73712084997ns - 73712055622ns = 29375ns
> 
> So, we see a ~29us latency, in comparison to a good case where we see about
> 1us.

I have a similar problem, but in my case the hardware latency is the problem.
We are executing DMA transfers over PCIe for fieldbus communication over
Ethernet (PCIe target is own fieldbus FPGA, not the e1000e).
Usually the DMA transfers start with a maximum delay of ~25us. But in case of
link down and up on the e1000e controller the worst case delay is 65us. This
violates our real-time deadlines. DMA transfer is triggered by the hardware and
delay is also measured by the hardware, no software is involved. With the following
patch the problem disappears:

 @@ -376,9 +376,10 @@ void e1000e_update_mc_addr_list_generic(struct e1000_hw *hw,
        }

        /* replace the entire MTA table */
-       for (i = hw->mac.mta_reg_count - 1; i >= 0; i--)
+       for (i = hw->mac.mta_reg_count - 1; i >= 0; i--) {
                E1000_WRITE_REG_ARRAY(hw, E1000_MTA, i, hw->mac.mta_shadow[i]);
-       e1e_flush();
+               e1e_flush();
+       }
 }

 /**

What is the state of your patch for the e1000e driver?

regards, gerhard


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2016-01-12 15:47 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-21 22:07 [RFC] e1000e: Add delays after writing to registers Jonathan David
2015-10-22  5:59 ` Henrik Austad
2015-11-03 17:43   ` Jonathan David
2015-11-03 19:42     ` Henrik Austad
2015-11-03 22:10       ` Jonathan David
2015-11-06  5:53         ` Henrik Austad
2015-11-06 11:08           ` Thomas Gleixner
2015-11-06 15:46             ` Armin Steinhoff
2015-11-06 17:22               ` Thomas Gleixner
2015-11-06 18:38             ` Jonathan David
2016-01-12 15:37               ` AW: " eg Engleder Gerhard

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.