* [PATCH] net/mlx5: fix Tx doorbell memory barrier
@ 2017-10-22  8:00 Yongseok Koh
  2017-10-22  9:46 ` Sagi Grimberg
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Yongseok Koh @ 2017-10-22  8:00 UTC (permalink / raw)
  To: adrien.mazarguil, nelio.laranjeiro
  Cc: dev, Yongseok Koh, stable, Sagi Grimberg, Alexander Solganik

Configuring UAR as IO-mapped makes maximum throughput decline by a
noticeable amount. If UAR is configured as a write-combining register, a
write memory barrier is needed when ringing a doorbell. rte_wmb() is mostly
effective when the size of a burst is comparatively small.

Fixes: 9f9bebae5530 ("net/mlx5: don't map doorbell register to write combining")
Cc: stable@dpdk.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Alexander Solganik <solganik@gmail.com>

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
---
 drivers/net/mlx5/mlx5.c               | 2 --
 drivers/net/mlx5/mlx5_rxtx.c          | 8 ++++----
 drivers/net/mlx5/mlx5_rxtx.h          | 6 +++++-
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 4 ++--
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  | 4 ++--
 5 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 89fdc134f..fcdcbc367 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1037,8 +1037,6 @@ rte_mlx5_pmd_init(void)
 	 * using this PMD, which is not supported in forked processes.
 	 */
 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
-	/* Don't map UAR to WC if BlueFlame is not used.*/
-	setenv("MLX5_SHUT_UP_BF", "1", 1);
 	/* Match the size of Rx completion entry to the size of a cacheline. */
 	if (RTE_CACHE_LINE_SIZE == 128)
 		setenv("MLX5_CQE_SIZE", "128", 0);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 961967bf4..f54fee9fb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -732,7 +732,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	txq->stats.opackets += i;
 #endif
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
+	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe, 1);
 	return i;
 }
 
@@ -948,7 +948,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	/* Ring QP doorbell. */
 	if (mpw.state == MLX5_MPW_STATE_OPENED)
 		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
+	mlx5_tx_dbrec(txq, mpw.wqe, 1);
 	txq->elts_head = elts_head;
 	return i;
 }
@@ -1245,7 +1245,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		mlx5_mpw_inline_close(txq, &mpw);
 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
 		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
+	mlx5_tx_dbrec(txq, mpw.wqe, 1);
 	txq->elts_head = elts_head;
 	return i;
 }
@@ -1596,7 +1596,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
 		mlx5_mpw_close(txq, &mpw);
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, mpw.wqe);
+	mlx5_tx_dbrec(txq, mpw.wqe, 1);
 	txq->elts_head = elts_head;
 	return i;
 }
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index ea037427b..58ae3c85b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -584,9 +584,11 @@ mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
  *   Pointer to TX queue structure.
  * @param wqe
  *   Pointer to the last WQE posted in the NIC.
+ * @param mb
+ *   Request for write memory barrier after BF update.
  */
 static __rte_always_inline void
-mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
+mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, int mb)
 {
 	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
@@ -596,6 +598,8 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
 	*dst = *src;
+	if (mb)
+		rte_wmb();
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 4cb7f2889..7a6e397de 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -225,7 +225,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += n;
 #endif
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, 1);
 	return n;
 }
 
@@ -345,7 +345,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
 		       nb_dword_per_wqebb;
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
 	return pkts_n;
 }
 
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index e9819b762..12d5bed55 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -226,7 +226,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += n;
 #endif
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, 1);
 	return n;
 }
 
@@ -344,7 +344,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
 		       nb_dword_per_wqebb;
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
 	return pkts_n;
 }
 
-- 
2.11.0


* Re: [PATCH] net/mlx5: fix Tx doorbell memory barrier
  2017-10-22  8:00 [PATCH] net/mlx5: fix Tx doorbell memory barrier Yongseok Koh
@ 2017-10-22  9:46 ` Sagi Grimberg
  2017-10-22 22:01   ` Yongseok Koh
  2017-10-23  7:00 ` Nélio Laranjeiro
  2017-10-25  0:27 ` [PATCH v2] " Yongseok Koh
  2 siblings, 1 reply; 10+ messages in thread
From: Sagi Grimberg @ 2017-10-22  9:46 UTC (permalink / raw)
  To: Yongseok Koh, adrien.mazarguil, nelio.laranjeiro
  Cc: dev, stable, Alexander Solganik


> Configuring UAR as IO-mapped makes maximum throughput decline by a
> noticeable amount. If UAR is configured as a write-combining register, a
> write memory barrier is needed when ringing a doorbell. rte_wmb() is mostly
> effective when the size of a burst is comparatively small.

Personally I don't think that the flag is really a good interface
choice. But also I'm not convinced that it's dependent on the burst size.

What guarantees that even for larger bursts the mmio write was flushed?
It comes after a set of writes that were flushed prior to the db update,
and it's not guaranteed that the application will immediately have more
data to trigger these writes to flush.


* Re: [PATCH] net/mlx5: fix Tx doorbell memory barrier
  2017-10-22  9:46 ` Sagi Grimberg
@ 2017-10-22 22:01   ` Yongseok Koh
  2017-10-23  7:50     ` Nélio Laranjeiro
  0 siblings, 1 reply; 10+ messages in thread
From: Yongseok Koh @ 2017-10-22 22:01 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: adrien.mazarguil, nelio.laranjeiro, dev, stable, Alexander Solganik

On Sun, Oct 22, 2017 at 12:46:53PM +0300, Sagi Grimberg wrote:
> 
> > Configuring UAR as IO-mapped makes maximum throughput decline by a
> > noticeable amount. If UAR is configured as a write-combining register, a
> > write memory barrier is needed when ringing a doorbell. rte_wmb() is
> > mostly effective when the size of a burst is comparatively small.
> 
> Personally I don't think that the flag is really a good interface
> choice. But also I'm not convinced that it's dependent on the burst size.
> 
> What guarantees that even for larger bursts the mmio write was flushed?
> It comes after a set of writes that were flushed prior to the db update,
> and it's not guaranteed that the application will immediately have more
> data to trigger these writes to flush.

Yes, I already knew the concern. I don't know whether you were aware, but that
can only happen when the burst size is exactly a multiple of 32 in the
vectorized Tx. If you look at mlx5_tx_burst_raw_vec(), every Tx burst having
more than 32 packets will call txq_burst_v() more than once. For example, if
pkts_n is 45, it will first call txq_burst_v(32), and txq_burst_v(13) will
follow, setting the barrier at the end. The only pitfall is when pkts_n is an
exact multiple of 32, e.g. 32, 64, 96 and so on. This is not likely when an
app is forwarding packets and the rate is low (if the packet rate is high, we
are good). So, the only possible case is when an app generates traffic at a
comparatively low rate in a bursty way with the burst size being a multiple of
32. If a user encounters such a rare case and latency is critical in their
app, we will recommend setting MLX5_SHUT_UP_BF=1, either by exporting it in a
shell or by embedding it in their app's initialization. Or, they can use the
other non-vectorized tx_burst() functions, because the barrier is still
enforced in such functions, as you first suggested.
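
To make the splitting concrete, the caller looks roughly like this (a
simplified sketch, not the exact driver code):

	uint16_t
	mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
			      uint16_t pkts_n)
	{
		struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
		uint16_t nb_tx = 0;

		while (pkts_n > nb_tx) {
			uint16_t n;
			uint16_t ret;

			/* Chunks are capped at MLX5_VPMD_TX_MAX_BURST (32). */
			n = RTE_MIN((uint16_t)(pkts_n - nb_tx),
				    MLX5_VPMD_TX_MAX_BURST);
			/* txq_burst_v() rings the doorbell; only a trailing
			 * partial chunk (n < 32) also sets the barrier. */
			ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
			nb_tx += ret;
			if (!ret)
				break;
		}
		return nb_tx;
	}

So a burst whose size is an exact multiple of 32 is the only one that ends
without the trailing barrier.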

It is always true that we can't make everyone satisfied. Some apps prefer
better performance to better latency. As vectorized Tx outperforms all the
other tx_burst() functions, I want to leave it as the only exceptional case.
Actually, we already received a complaint that the 1-core performance of the
vPMD declined by 10% (53Mpps -> 48Mpps) due to the patch (MLX5_SHUT_UP_BF=1).
So, I wanted to give users/apps more versatile options/knobs.

Before sending out this patch, I ran RFC2544 latency tests with Ixia and the
result was as good as before (actually the same). That's why we think it is a
good compromise.


Thanks for your comment,
Yongseok


* Re: [PATCH] net/mlx5: fix Tx doorbell memory barrier
  2017-10-22  8:00 [PATCH] net/mlx5: fix Tx doorbell memory barrier Yongseok Koh
  2017-10-22  9:46 ` Sagi Grimberg
@ 2017-10-23  7:00 ` Nélio Laranjeiro
  2017-10-25  0:27 ` [PATCH v2] " Yongseok Koh
  2 siblings, 0 replies; 10+ messages in thread
From: Nélio Laranjeiro @ 2017-10-23  7:00 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: adrien.mazarguil, nelio.laranjeiro, dev, stable, Sagi Grimberg,
	Alexander Solganik

On Sun, Oct 22, 2017 at 01:00:22AM -0700, Yongseok Koh wrote:
>[...]
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -584,9 +584,11 @@ mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
>   *   Pointer to TX queue structure.
>   * @param wqe
>   *   Pointer to the last WQE posted in the NIC.
> + * @param mb
> + *   Request for write memory barrier after BF update.

BF should be replaced by its real name, i.e. BlueFlame.
If the mb is only related to the BlueFlame, why not name it mbbf or
bfmb?

>   */
>  static __rte_always_inline void
> -mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
> +mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, int mb)
>  {
>  	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
>  	volatile uint64_t *src = ((volatile uint64_t *)wqe);
> @@ -596,6 +598,8 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
>  	/* Ensure ordering between DB record and BF copy. */
>  	rte_wmb();
>  	*dst = *src;
> +	if (mb)
> +		rte_wmb();
>  }

My last point: this modification seems necessary only for the MPS vector
case, so why not add a new function for this specific case instead?

At least I would suggest creating an enum with the possible values; it
helps to read the source:

 enum mlx5_blueflame_mb {
 	MLX5_BLUEFLAME_MB_DIS,
 	MLX5_BLUEFLAME_MB_EN,
 };

You can also create wrappers for the function to avoid this extra
argument in the Tx code:

 mlx5_tx_dbrec_bfmb(txq, wqe);
 mlx5_tx_dbrec_no_bfmb(txq, wqe);

There are multiple ways instead of having 0 or 1 hard-coded.
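
For instance, the wrappers could be as simple as this (just a sketch of the
idea, the names are only suggestions):

 static __rte_always_inline void
 mlx5_tx_dbrec_bfmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
 {
 	mlx5_tx_dbrec(txq, wqe, MLX5_BLUEFLAME_MB_EN);
 }

 static __rte_always_inline void
 mlx5_tx_dbrec_no_bfmb(struct mlx5_txq_data *txq,
 		      volatile struct mlx5_wqe *wqe)
 {
 	mlx5_tx_dbrec(txq, wqe, MLX5_BLUEFLAME_MB_DIS);
 }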

Thanks,

-- 
Nélio Laranjeiro
6WIND


* Re: [PATCH] net/mlx5: fix Tx doorbell memory barrier
  2017-10-22 22:01   ` Yongseok Koh
@ 2017-10-23  7:50     ` Nélio Laranjeiro
  2017-10-23 17:24       ` Yongseok Koh
  0 siblings, 1 reply; 10+ messages in thread
From: Nélio Laranjeiro @ 2017-10-23  7:50 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: Sagi Grimberg, adrien.mazarguil, nelio.laranjeiro, dev, stable,
	Alexander Solganik

Yongseok, Sagi, my small contribution to this discussion,

On Sun, Oct 22, 2017 at 03:01:04PM -0700, Yongseok Koh wrote:
> On Sun, Oct 22, 2017 at 12:46:53PM +0300, Sagi Grimberg wrote:
> > 
> > > Configuring UAR as IO-mapped makes maximum throughput decline by a
> > > noticeable amount. If UAR is configured as a write-combining register, a
> > > write memory barrier is needed when ringing a doorbell. rte_wmb() is
> > > mostly effective when the size of a burst is comparatively small.
> > 
> > Personally I don't think that the flag is really a good interface
> > choice. But also I'm not convinced that it's dependent on the burst size.
> > 
> > What guarantees that even for larger bursts the mmio write was flushed?
> > It comes after a set of writes that were flushed prior to the db update,
> > and it's not guaranteed that the application will immediately have more
> > data to trigger these writes to flush.
> 
> Yes, I already knew the concern. I don't know whether you were aware, but
> that can only happen when the burst size is exactly a multiple of 32 in the
> vectorized Tx. If you look at mlx5_tx_burst_raw_vec(), every Tx burst having
> more than 32 packets will call txq_burst_v() more than once. For example, if
> pkts_n is 45, it will first call txq_burst_v(32), and txq_burst_v(13) will
> follow, setting the barrier at the end. The only pitfall is when pkts_n is
> an exact multiple of 32, e.g. 32, 64, 96 and so on. This is not likely when
> an app is forwarding packets and the rate is low (if the packet rate is
> high, we are good).
>
> So, the only possible case is when an app generates traffic at a
> comparatively low rate in a bursty way with the burst size being a multiple
> of 32.

A routing application will consume more than the 50 cycles the PMD needs
to process such a burst.  It is not a rare case, it is the real one:
looking up the routing table, parsing the packet to find the layers, and
modifying them (decreasing the TTL, changing addresses and updating the
checksum) is not something fast.
The probability of having a full 32-packet burst entering the PMD is
something "normal".

> If a user encounters such a rare case and latency is critical in their app,
> we will recommend setting MLX5_SHUT_UP_BF=1, either by exporting it in a
> shell or by embedding it in their app's initialization. Or, they can use the
> other non-vectorized tx_burst() functions, because the barrier is still
> enforced in such functions, as you first suggested.
> 
> It is always true that we can't make everyone satisfied. Some apps prefer
> better performance to better latency. As vectorized Tx outperforms all the
> other tx_burst() functions, I want to leave it as the only exceptional case.
> Actually, we already received a complaint that the 1-core performance of the
> vPMD declined by 10% (53Mpps -> 48Mpps) due to the patch (MLX5_SHUT_UP_BF=1).
> So, I wanted to give users/apps more versatile options/knobs.

DPDK is written for throughput; that is the reason why Tx/Rx burst
functions are "burst" functions, written on the assumption that they will
process a large number of packets at each call, precisely to minimize the
cost of all the memory barriers and doorbells.

If there are some applications which also need latency, it is maybe time
to introduce a new API for that, i.e. one dedicated to sending/receiving
a single packet, or a little more, in an efficient way.
By the way, I am not sure such applications handle such big burst sizes;
Sagi, you may have more information that you may want to share.
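
Something as simple as the following prototype could start the discussion
(purely hypothetical, nothing like it exists in DPDK today):

 /* Hypothetical latency-oriented send: post the packets, ring the
  * doorbell and flush it immediately, trading throughput for latency. */
 uint16_t rte_eth_tx_now(uint16_t port_id, uint16_t queue_id,
 			 struct rte_mbuf **pkts, uint16_t pkts_n);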

> Before sending out this patch, I ran RFC2544 latency tests with Ixia and the
> result was as good as before (actually the same). That's why we think it is
> a good compromise.

You cannot do that with testpmd; it does not match the real application
behavior, as it receives a burst of packets and sends them back without
touching them.  An application will at least process/route all received
packets to some other destination or port.  The send action will only be
triggered when the whole routing process is finished, to maximize the
burst sizes.  Depending on the traffic, the latency will change.
From what I know, we don't have that kind of example/tool in DPDK.

> Thanks for your comment,
> Yongseok

Thanks,

-- 
Nélio Laranjeiro
6WIND


* Re: [PATCH] net/mlx5: fix Tx doorbell memory barrier
  2017-10-23  7:50     ` Nélio Laranjeiro
@ 2017-10-23 17:24       ` Yongseok Koh
  2017-10-24  6:49         ` Nélio Laranjeiro
  0 siblings, 1 reply; 10+ messages in thread
From: Yongseok Koh @ 2017-10-23 17:24 UTC (permalink / raw)
  To: Nélio Laranjeiro
  Cc: Sagi Grimberg, adrien.mazarguil, dev, stable, Alexander Solganik

On Mon, Oct 23, 2017 at 09:50:14AM +0200, Nélio Laranjeiro wrote:
> Yongseok, Sagi, my small contribution to this discussion,
> 
> On Sun, Oct 22, 2017 at 03:01:04PM -0700, Yongseok Koh wrote:
> > On Sun, Oct 22, 2017 at 12:46:53PM +0300, Sagi Grimberg wrote:
> > > 
> > > > Configuring UAR as IO-mapped makes maximum throughput decline by a
> > > > noticeable amount. If UAR is configured as a write-combining register,
> > > > a write memory barrier is needed when ringing a doorbell. rte_wmb() is
> > > > mostly effective when the size of a burst is comparatively small.
> > > 
> > > Personally I don't think that the flag is really a good interface
> > > choice. But also I'm not convinced that it's dependent on the burst size.
> > >
> > > What guarantees that even for larger bursts the mmio write was flushed?
> > > It comes after a set of writes that were flushed prior to the db update,
> > > and it's not guaranteed that the application will immediately have more
> > > data to trigger these writes to flush.
> > 
> > Yes, I already knew the concern. I don't know whether you were aware, but
> > that can only happen when the burst size is exactly a multiple of 32 in the
> > vectorized Tx. If you look at mlx5_tx_burst_raw_vec(), every Tx burst
> > having more than 32 packets will call txq_burst_v() more than once. For
> > example, if pkts_n is 45, it will first call txq_burst_v(32), and
> > txq_burst_v(13) will follow, setting the barrier at the end. The only
> > pitfall is when pkts_n is an exact multiple of 32, e.g. 32, 64, 96 and so
> > on. This is not likely when an app is forwarding packets and the rate is
> > low (if the packet rate is high, we are good).
> >
> > So, the only possible case is when an app generates traffic at a
> > comparatively low rate in a bursty way with the burst size being a
> > multiple of 32.
> 
> A routing application will consume more than the 50 cycles the PMD needs
> to process such a burst.  It is not a rare case, it is the real one:
> looking up the routing table, parsing the packet to find the layers, and
> modifying them (decreasing the TTL, changing addresses and updating the
> checksum) is not something fast.
> The probability of having a full 32-packet burst entering the PMD is
> something "normal".

Right. There could be more common cases of skipping the barrier, and that's a
good example. But my point here is to give more options to users, not to
defend all the possible singular cases. If an app is processing packets in a
bursty way, it has already made a compromise between latency and throughput.
Like you mentioned below, that's one of the design goals of DPDK. If latency
is so critical, it would be better to use interrupt-driven processing in
kernel context.

Back to the original issue: the MLX5 PMD had unreasonably high max latency,
especially with a low rate of traffic, while min/avg was okay, and this could
be considered a critical bug. This sort of patch is meant to resolve that
issue, not to improve its overall latency results.

[...]
> > Before sending out this patch, I ran RFC2544 latency tests with Ixia and
> > the result was as good as before (actually the same). That's why we think
> > it is a good compromise.
> 
> You cannot do that with testpmd; it does not match the real application
> behavior, as it receives a burst of packets and sends them back without
> touching them.  An application will at least process/route all received
> packets to some other destination or port.  The send action will only be
> triggered when the whole routing process is finished, to maximize the
> burst sizes.  Depending on the traffic, the latency will change.
> From what I know, we don't have that kind of example/tool in DPDK.

I wasn't saying that the test with testpmd represented real-world use-cases
but I just wanted to verify that this patch effectively resolves the original
issue. Again, this isn't a patch for latency enhancement but one to resolve
the issue.

And I think I should also change the documentation to cover the option
(MLX5_SHUT_UP_BF=1) for v2, unless there's objection.
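
For reference, embedding it in an app would look like the sketch below; as
far as I know, libmlx5 reads the variable when the device context is created,
so it must be set before the device is probed:

 /* Request IO-mapped doorbell (better latency, lower max throughput).
  * Must run before rte_eal_init() triggers mlx5 probing. */
 setenv("MLX5_SHUT_UP_BF", "1", 1);
 ret = rte_eal_init(argc, argv);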


Thanks,
Yongseok


* Re: [PATCH] net/mlx5: fix Tx doorbell memory barrier
  2017-10-23 17:24       ` Yongseok Koh
@ 2017-10-24  6:49         ` Nélio Laranjeiro
  0 siblings, 0 replies; 10+ messages in thread
From: Nélio Laranjeiro @ 2017-10-24  6:49 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: Nélio Laranjeiro, Sagi Grimberg, adrien.mazarguil, dev,
	stable, Alexander Solganik

On Mon, Oct 23, 2017 at 10:24:09AM -0700, Yongseok Koh wrote:
> On Mon, Oct 23, 2017 at 09:50:14AM +0200, Nélio Laranjeiro wrote:
> > Yongseok, Sagi, my small contribution to this discussion,
> > 
> > On Sun, Oct 22, 2017 at 03:01:04PM -0700, Yongseok Koh wrote:
> > > On Sun, Oct 22, 2017 at 12:46:53PM +0300, Sagi Grimberg wrote:
> > > > 
> > > > > Configuring UAR as IO-mapped makes maximum throughput decline by a
> > > > > noticeable amount. If UAR is configured as a write-combining
> > > > > register, a write memory barrier is needed when ringing a doorbell.
> > > > > rte_wmb() is mostly effective when the size of a burst is
> > > > > comparatively small.
> > > > 
> > > > Personally I don't think that the flag is really a good interface
> > > > choice. But also I'm not convinced that it's dependent on the burst
> > > > size.
> > > >
> > > > What guarantees that even for larger bursts the mmio write was flushed?
> > > > It comes after a set of writes that were flushed prior to the db
> > > > update, and it's not guaranteed that the application will immediately
> > > > have more data to trigger these writes to flush.
> > > 
> > > Yes, I already knew the concern. I don't know whether you were aware,
> > > but that can only happen when the burst size is exactly a multiple of 32
> > > in the vectorized Tx. If you look at mlx5_tx_burst_raw_vec(), every Tx
> > > burst having more than 32 packets will call txq_burst_v() more than
> > > once. For example, if pkts_n is 45, it will first call txq_burst_v(32),
> > > and txq_burst_v(13) will follow, setting the barrier at the end. The
> > > only pitfall is when pkts_n is an exact multiple of 32, e.g. 32, 64, 96
> > > and so on. This is not likely when an app is forwarding packets and the
> > > rate is low (if the packet rate is high, we are good).
> > >
> > > So, the only possible case is when an app generates traffic at a
> > > comparatively low rate in a bursty way with the burst size being a
> > > multiple of 32.
> > 
> > A routing application will consume more than the 50 cycles the PMD needs
> > to process such a burst.  It is not a rare case, it is the real one:
> > looking up the routing table, parsing the packet to find the layers, and
> > modifying them (decreasing the TTL, changing addresses and updating the
> > checksum) is not something fast.
> > The probability of having a full 32-packet burst entering the PMD is
> > something "normal".
> 
> Right. There could be more common cases of skipping the barrier, and that's
> a good example. But my point here is to give more options to users, not to
> defend all the possible singular cases. If an app is processing packets in a
> bursty way, it has already made a compromise between latency and throughput.
> Like you mentioned below, that's one of the design goals of DPDK. If latency
> is so critical, it would be better to use interrupt-driven processing in
> kernel context.
> 
> Back to the original issue: the MLX5 PMD had unreasonably high max latency,
> especially with a low rate of traffic, while min/avg was okay, and this
> could be considered a critical bug. This sort of patch is meant to resolve
> that issue, not to improve its overall latency results.
> 
> [...]
> > > Before sending out this patch, I ran RFC2544 latency tests with Ixia
> > > and the result was as good as before (actually the same). That's why we
> > > think it is a good compromise.
> > 
> > You cannot do that with testpmd; it does not match the real application
> > behavior, as it receives a burst of packets and sends them back without
> > touching them.  An application will at least process/route all received
> > packets to some other destination or port.  The send action will only be
> > triggered when the whole routing process is finished, to maximize the
> > burst sizes.  Depending on the traffic, the latency will change.
> > From what I know, we don't have that kind of example/tool in DPDK.
> 
> I wasn't saying that the test with testpmd represented real-world use-cases

I am certain of that, but it is mostly because I know how you work; other
DPDK contributors entering this thread may understand this ;).

> but I just wanted to verify that this patch effectively resolves the
> original issue. Again, this isn't a patch for latency enhancement but one
> to resolve the issue.
>
> And I think I should also change the documentation to cover the option
> (MLX5_SHUT_UP_BF=1) for v2, unless there's objection.

If setting it has a huge impact on throughput performance, and that is the
reason why the PMD won't set it in the future, it should be documented so
that users can embed it in their environment. You can also provide some
information about the impact of activating such behavior.

Thanks,

-- 
Nélio Laranjeiro
6WIND


* [PATCH v2] net/mlx5: fix Tx doorbell memory barrier
  2017-10-22  8:00 [PATCH] net/mlx5: fix Tx doorbell memory barrier Yongseok Koh
  2017-10-22  9:46 ` Sagi Grimberg
  2017-10-23  7:00 ` Nélio Laranjeiro
@ 2017-10-25  0:27 ` Yongseok Koh
  2017-10-25  9:19   ` Nélio Laranjeiro
  2 siblings, 1 reply; 10+ messages in thread
From: Yongseok Koh @ 2017-10-25  0:27 UTC (permalink / raw)
  To: adrien.mazarguil, nelio.laranjeiro
  Cc: dev, Yongseok Koh, stable, Sagi Grimberg, Alexander Solganik

Configuring UAR as IO-mapped makes maximum throughput decline by a
noticeable amount. If UAR is configured as a write-combining register, a
write memory barrier is needed when ringing a doorbell. rte_wmb() is mostly
effective when the size of a burst is comparatively small. Revert the
register back to write-combining and enforce a write memory barrier instead,
except for vectorized Tx burst routines. An application can change this by
setting MLX5_SHUT_UP_BF according to its own needs.

Fixes: 9f9bebae5530 ("net/mlx5: don't map doorbell register to write combining")
Cc: stable@dpdk.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Alexander Solganik <solganik@gmail.com>

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
---

v2:
* Add documentation.
* Rename functions to minimize changes.

 doc/guides/nics/mlx5.rst              | 17 +++++++++++++++++
 drivers/net/mlx5/mlx5.c               |  2 --
 drivers/net/mlx5/mlx5_rxtx.h          | 23 +++++++++++++++++++++--
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  2 +-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  2 +-
 5 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index d24941a22..085d3940c 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -171,6 +171,23 @@ Environment variables
   This is disabled by default since this can also decrease performance for
   unaligned packet sizes.
 
+- ``MLX5_SHUT_UP_BF``
+
+  Configures the HW Tx doorbell register as IO-mapped.
+
+  By default, the HW Tx doorbell is configured as a write-combining register.
+  The register is usually flushed to HW when the write-combining buffer
+  becomes full, but this depends on the CPU design.
+
+  Except for vectorized Tx burst routines, a write memory barrier is enforced
+  after updating the register so that the update can be immediately visible to
+  HW.
+
+  When vectorized Tx burst is called, the barrier is set only if the burst size
+  is not aligned to MLX5_VPMD_TX_MAX_BURST. However, setting this environment
+  variable will bring better latency even though the maximum throughput can
+  slightly decline.
+
 Run-time configuration
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 89fdc134f..fcdcbc367 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1037,8 +1037,6 @@ rte_mlx5_pmd_init(void)
 	 * using this PMD, which is not supported in forked processes.
 	 */
 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
-	/* Don't map UAR to WC if BlueFlame is not used.*/
-	setenv("MLX5_SHUT_UP_BF", "1", 1);
 	/* Match the size of Rx completion entry to the size of a cacheline. */
 	if (RTE_CACHE_LINE_SIZE == 128)
 		setenv("MLX5_CQE_SIZE", "128", 0);
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index ea037427b..d34f3cc04 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -578,15 +578,18 @@ mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
 }
 
 /**
- * Ring TX queue doorbell.
+ * Ring TX queue doorbell and flush the update if requested.
  *
  * @param txq
  *   Pointer to TX queue structure.
  * @param wqe
  *   Pointer to the last WQE posted in the NIC.
+ * @param cond
+ *   Request for write memory barrier after BlueFlame update.
  */
 static __rte_always_inline void
-mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
+mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe,
+		       int cond)
 {
 	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
@@ -596,6 +599,22 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
 	*dst = *src;
+	if (cond)
+		rte_wmb();
+}
+
+/**
+ * Ring TX queue doorbell and flush the update by write memory barrier.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the last WQE posted in the NIC.
+ */
+static __rte_always_inline void
+mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
+{
+	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 4cb7f2889..61f5bc45b 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -345,7 +345,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
 		       nb_dword_per_wqebb;
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
 	return pkts_n;
 }
 
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index e9819b762..a53027d84 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -344,7 +344,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
 		       nb_dword_per_wqebb;
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
 	return pkts_n;
 }
 
-- 
2.11.0


* Re: [PATCH v2] net/mlx5: fix Tx doorbell memory barrier
  2017-10-25  0:27 ` [PATCH v2] " Yongseok Koh
@ 2017-10-25  9:19   ` Nélio Laranjeiro
  2017-10-25 21:34     ` [dpdk-stable] " Ferruh Yigit
  0 siblings, 1 reply; 10+ messages in thread
From: Nélio Laranjeiro @ 2017-10-25  9:19 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: adrien.mazarguil, dev, stable, Sagi Grimberg, Alexander Solganik

On Tue, Oct 24, 2017 at 05:27:25PM -0700, Yongseok Koh wrote:
> Configuring UAR as IO-mapped makes maximum throughput decline by a
> noticeable amount. If UAR is configured as a write-combining register, a
> write memory barrier is needed when ringing a doorbell. rte_wmb() is mostly
> effective when the size of a burst is comparatively small. Revert the
> register back to write-combining and enforce a write memory barrier instead,
> except for vectorized Tx burst routines. An application can change this by
> setting MLX5_SHUT_UP_BF according to its own needs.
> 
> Fixes: 9f9bebae5530 ("net/mlx5: don't map doorbell register to write combining")
> Cc: stable@dpdk.org
> Cc: Sagi Grimberg <sagi@grimberg.me>
> Cc: Alexander Solganik <solganik@gmail.com>
> 
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> Acked-by: Shahaf Shuler <shahafs@mellanox.com>

Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Thanks,

-- 
Nélio Laranjeiro
6WIND


* Re: [dpdk-stable] [PATCH v2] net/mlx5: fix Tx doorbell memory barrier
  2017-10-25  9:19   ` Nélio Laranjeiro
@ 2017-10-25 21:34     ` Ferruh Yigit
  0 siblings, 0 replies; 10+ messages in thread
From: Ferruh Yigit @ 2017-10-25 21:34 UTC (permalink / raw)
  To: Nélio Laranjeiro, Yongseok Koh
  Cc: adrien.mazarguil, dev, stable, Sagi Grimberg, Alexander Solganik

On 10/25/2017 2:19 AM, Nélio Laranjeiro wrote:
> On Tue, Oct 24, 2017 at 05:27:25PM -0700, Yongseok Koh wrote:
>> Configuring UAR as IO-mapped makes maximum throughput decline by a
>> noticeable amount. If UAR is configured as a write-combining register, a
>> write memory barrier is needed when ringing a doorbell. rte_wmb() is mostly
>> effective when the size of a burst is comparatively small. Revert the
>> register back to write-combining and enforce a write memory barrier instead,
>> except for vectorized Tx burst routines. An application can change this by
>> setting MLX5_SHUT_UP_BF according to its own needs.
>>
>> Fixes: 9f9bebae5530 ("net/mlx5: don't map doorbell register to write combining")
>> Cc: stable@dpdk.org
>> Cc: Sagi Grimberg <sagi@grimberg.me>
>> Cc: Alexander Solganik <solganik@gmail.com>
>>
>> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
>> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
> 
> Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Applied to dpdk-next-net/master, thanks.

