All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] mlx5 high latency observed on send operations
@ 2017-08-21  7:47 Sagi Grimberg
  2017-08-21  7:47 ` [PATCH 1/2] net/mlx5: replace memory barrier type Sagi Grimberg
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Sagi Grimberg @ 2017-08-21  7:47 UTC (permalink / raw)
  To: dev; +Cc: Nelio Laranjeiro, Adrien Mazarguil

When measuring latency when running a latency critical workload
on mlx5 pmd drivers we noticed high latency can occur due to
delayed doorbell record update flush.

This can be reproduced using the simple program [1]
against testpmd macswap fwd mode. This utility sends
a raw ethernet frame to the dpdk port and measures the
time between send and the received mirrored frame.

This patchset guarantees immediate doorbell updates
visibility by making the doorbell a non-cacheble memory.
In addition, we relax the memory barrier for dma-able
memory.

Without this fix the tsc delta was 3550760-5993019 cycles
(which translates to 2-6 ms on 1.7 GHz processor).

With the fix applied the tsc delta reduced to 17740-29663
(wich translates to 9-17 us).

Shahaf Shuler (2):
  net/mlx5: replace memory barrier type
  net/mlx5: don't map doorbell register to write combining

 drivers/net/mlx5/mlx5.c      | 2 ++
 drivers/net/mlx5/mlx5_rxtx.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

[1]:
/*
 * compiling: gcc test.c -o test
 * run using: ./test <local_iface> <dest_mac> 
 */
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/ether.h>

#define BUF_SIZ		1024

static inline uint64_t rte_rdtsc(void)
{
	union {
		uint64_t tsc_64;
		struct {
			uint32_t lo_32;
			uint32_t hi_32;
		};
	} tsc;

	asm volatile("rdtsc" :
		     "=a" (tsc.lo_32),
		     "=d" (tsc.hi_32));
	return tsc.tsc_64;
}

int main(int argc, char *argv[])
{
	int sockfd;
	struct ifreq if_idx;
	struct ifreq if_mac;
	int tx_len = 0;
	char sendbuf[BUF_SIZ];
	struct ether_header *eh = (struct ether_header *) sendbuf;
	struct sockaddr_ll socket_address;
	char ifname[IFNAMSIZ];
	int values[6];
	struct ether_header expected;
	uint64_t payload = 0xB16B00B5;
	uint8_t buffer[1024];
	int result;
	uint64_t before_rcv;
	uint64_t after_rcv;
	uint64_t delta;
	int numbytes;

	if (argc != 3) {
		fprintf(stderr, "device name and dest mac\n");
		return -1;
	}

	strcpy(ifname, argv[1]);
	result = sscanf(argv[2], "%x:%x:%x:%x:%x:%x",
			&values[0], &values[1], &values[2], &values[3], &values[4], &values[5]);
	if (result != 6) {
		fprintf(stderr, "invalid mac\n");
		return -1;
	}

	/* Open RAW socket to send on */
	if ((sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))) == -1) {
	    perror("socket");
	}

	/* Get the index of the interface to send on */
	memset(&if_idx, 0, sizeof(struct ifreq));
	strncpy(if_idx.ifr_name, ifname, IFNAMSIZ-1);
	if (ioctl(sockfd, SIOCGIFINDEX, &if_idx) < 0)
	    perror("SIOCGIFINDEX");
	/* Get the MAC address of the interface to send on */
	memset(&if_mac, 0, sizeof(struct ifreq));
	strncpy(if_mac.ifr_name, ifname, IFNAMSIZ-1);
	if (ioctl(sockfd, SIOCGIFHWADDR, &if_mac) < 0)
	    perror("SIOCGIFHWADDR");

	/* Construct the Ethernet header */
	memset(sendbuf, 0, BUF_SIZ);
	/* Ethernet header */
	eh->ether_shost[0] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[0];
	eh->ether_shost[1] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[1];
	eh->ether_shost[2] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[2];
	eh->ether_shost[3] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[3];
	eh->ether_shost[4] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[4];
	eh->ether_shost[5] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[5];
	eh->ether_dhost[0] = values[0];
	eh->ether_dhost[1] = values[1];
	eh->ether_dhost[2] = values[2];
	eh->ether_dhost[3] = values[3];
	eh->ether_dhost[4] = values[4];
	eh->ether_dhost[5] = values[5];
	/* Ethertype field */
	eh->ether_type = htons(ETH_P_IP);
	tx_len += sizeof(struct ether_header);

	memcpy(&sendbuf[tx_len], &payload, sizeof(payload));
	tx_len += sizeof(payload);

	/* Index of the network device */
	socket_address.sll_ifindex = if_idx.ifr_ifindex;
	/* Address length*/
	socket_address.sll_halen = ETH_ALEN;
	/* Destination MAC */
	socket_address.sll_addr[0] = values[0];
	socket_address.sll_addr[1] = values[1];
	socket_address.sll_addr[2] = values[2];
	socket_address.sll_addr[3] = values[3];
	socket_address.sll_addr[4] = values[4];
	socket_address.sll_addr[5] = values[5];

	memcpy(&expected.ether_dhost, &eh->ether_shost, ETH_ALEN);
	memcpy(&expected.ether_shost, &eh->ether_dhost, ETH_ALEN);
	expected.ether_type = eh->ether_type;


	/* Send packet */
	if (sendto(sockfd, sendbuf, tx_len, 0, (struct sockaddr*)&socket_address, sizeof(struct sockaddr_ll)) < 0) {
	    printf("Send failed\n");
	    return -2;
	}

	before_rcv = rte_rdtsc();
	while (1) {
		numbytes = recvfrom(sockfd, buffer, BUF_SIZ, 0, NULL, NULL);
		if (numbytes <= 0)
			continue;
		after_rcv = rte_rdtsc();

		if (memcmp(&expected, buffer, sizeof(expected)) != 0)
			continue;

		if (memcmp(&payload, &buffer[sizeof(expected)], sizeof(payload)) == 0) {
			break;
		}

	}

	delta =  after_rcv - before_rcv;
	printf("RTT is %lu tsc \n", delta);
	return 0;
}
-- 
2.7.4

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/2] net/mlx5: replace memory barrier type
  2017-08-21  7:47 [PATCH 0/2] mlx5 high latency observed on send operations Sagi Grimberg
@ 2017-08-21  7:47 ` Sagi Grimberg
  2017-08-23 11:39   ` Nélio Laranjeiro
  2017-08-21  7:47 ` [PATCH 2/2] net/mlx5: don't map doorbell register to write combining Sagi Grimberg
  2017-08-27  6:47 ` [PATCH v2 0/2] mlx5 high latency observed on send operations Shahaf Shuler
  2 siblings, 1 reply; 14+ messages in thread
From: Sagi Grimberg @ 2017-08-21  7:47 UTC (permalink / raw)
  To: dev; +Cc: Nelio Laranjeiro, Adrien Mazarguil, Shahaf Shuler

From: Shahaf Shuler <shahafs@mellanox.com>

The reason for the requirement of a barrier between the txq writes
and the doorbell record writes is to avoid a case where the device
reads the doorbell record's new value before the txq writes are flushed
to memory.

The current use of rte_wmb is not necessary, and can be replaced by
rte_compiler_barrier as it acts as a write memory barrier.
More details on this type of barrier can be found on [1]

Replacing the rte_wmb is also expected to improve the throughput.

[1] https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html

Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Signed-off-by: Alexander Solganik <solganik@gmail.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/net/mlx5/mlx5_rxtx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 7de1d10863e5..59b9ff24fb82 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -602,7 +602,7 @@ mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
 	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
 
-	rte_wmb();
+	rte_compiler_barrier();
 	*txq->qp_db = htonl(txq->wqe_ci);
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/2] net/mlx5: don't map doorbell register to write combining
  2017-08-21  7:47 [PATCH 0/2] mlx5 high latency observed on send operations Sagi Grimberg
  2017-08-21  7:47 ` [PATCH 1/2] net/mlx5: replace memory barrier type Sagi Grimberg
@ 2017-08-21  7:47 ` Sagi Grimberg
  2017-08-23 11:03   ` Ferruh Yigit
  2017-08-27  6:47 ` [PATCH v2 0/2] mlx5 high latency observed on send operations Shahaf Shuler
  2 siblings, 1 reply; 14+ messages in thread
From: Sagi Grimberg @ 2017-08-21  7:47 UTC (permalink / raw)
  To: dev; +Cc: Nelio Laranjeiro, Adrien Mazarguil, Shahaf Shuler

From: Shahaf Shuler <shahafs@mellanox.com>

By default, Verbs maps the doorbell register to write combining.
Working with write combining is useful for drivers which use blue flame
for the doorbell write.

Since mlx5 PMD uses only doorbells and write combining mapping requires
an extra memory barrier to flush the doorbell after its write, setting
the mapping to un-cached by default.

Such change is reduces the max and average round trip
latency significantly.

Reported-by: Alexander Solganik <solganik@gmail.com>
Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Signed-off-by: Alexander Solganik <solganik@gmail.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/net/mlx5/mlx5.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index b7e5046325c0..4c2a0b9652e3 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -920,6 +920,8 @@ rte_mlx5_pmd_init(void)
 	 * using this PMD, which is not supported in forked processes.
 	 */
 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
+	/* Don't map UAR to WC if BlueFlame is not used.*/
+	setenv("MLX5_SHUT_UP_BF", "1", 1);
 	ibv_fork_init();
 	rte_pci_register(&mlx5_driver);
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] net/mlx5: don't map doorbell register to write combining
  2017-08-21  7:47 ` [PATCH 2/2] net/mlx5: don't map doorbell register to write combining Sagi Grimberg
@ 2017-08-23 11:03   ` Ferruh Yigit
  2017-08-23 11:58     ` Shahaf Shuler
  2017-08-23 12:06     ` Nélio Laranjeiro
  0 siblings, 2 replies; 14+ messages in thread
From: Ferruh Yigit @ 2017-08-23 11:03 UTC (permalink / raw)
  To: Sagi Grimberg, dev; +Cc: Nelio Laranjeiro, Adrien Mazarguil, Shahaf Shuler

On 8/21/2017 8:47 AM, Sagi Grimberg wrote:
> From: Shahaf Shuler <shahafs@mellanox.com>
> 
> By default, Verbs maps the doorbell register to write combining.
> Working with write combining is useful for drivers which use blue flame
> for the doorbell write.
> 
> Since mlx5 PMD uses only doorbells and write combining mapping requires
> an extra memory barrier to flush the doorbell after its write, setting
> the mapping to un-cached by default.
> 
> Such change is reduces the max and average round trip
> latency significantly.
> 
> Reported-by: Alexander Solganik <solganik@gmail.com>
> Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> Signed-off-by: Alexander Solganik <solganik@gmail.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  drivers/net/mlx5/mlx5.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index b7e5046325c0..4c2a0b9652e3 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -920,6 +920,8 @@ rte_mlx5_pmd_init(void)
>  	 * using this PMD, which is not supported in forked processes.
>  	 */
>  	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
> +	/* Don't map UAR to WC if BlueFlame is not used.*/
> +	setenv("MLX5_SHUT_UP_BF", "1", 1);

Although technically this is possible, I wonder how good idea it is a
driver communicating with other processes via system environment variable?

>  	ibv_fork_init();
>  	rte_pci_register(&mlx5_driver);
>  }
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net/mlx5: replace memory barrier type
  2017-08-21  7:47 ` [PATCH 1/2] net/mlx5: replace memory barrier type Sagi Grimberg
@ 2017-08-23 11:39   ` Nélio Laranjeiro
  2017-08-23 13:11     ` Bruce Richardson
  0 siblings, 1 reply; 14+ messages in thread
From: Nélio Laranjeiro @ 2017-08-23 11:39 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: dev, Adrien Mazarguil, Shahaf Shuler

On Mon, Aug 21, 2017 at 10:47:01AM +0300, Sagi Grimberg wrote:
> From: Shahaf Shuler <shahafs@mellanox.com>
> 
> The reason for the requirement of a barrier between the txq writes
> and the doorbell record writes is to avoid a case where the device
> reads the doorbell record's new value before the txq writes are flushed
> to memory.
> 
> The current use of rte_wmb is not necessary, and can be replaced by
> rte_compiler_barrier as it acts as a write memory barrier.
> More details on this type of barrier can be found on [1]
> 
> Replacing the rte_wmb is also expected to improve the throughput.
> 
> [1] https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
> 
> Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> Signed-off-by: Alexander Solganik <solganik@gmail.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  drivers/net/mlx5/mlx5_rxtx.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 7de1d10863e5..59b9ff24fb82 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -602,7 +602,7 @@ mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
>  	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
>  	volatile uint64_t *src = ((volatile uint64_t *)wqe);
>  
> -	rte_wmb();
> +	rte_compiler_barrier();
>  	*txq->qp_db = htonl(txq->wqe_ci);
>  	/* Ensure ordering between DB record and BF copy. */
>  	rte_wmb();
> -- 
> 2.7.4
 
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] net/mlx5: don't map doorbell register to write combining
  2017-08-23 11:03   ` Ferruh Yigit
@ 2017-08-23 11:58     ` Shahaf Shuler
  2017-08-23 12:06     ` Nélio Laranjeiro
  1 sibling, 0 replies; 14+ messages in thread
From: Shahaf Shuler @ 2017-08-23 11:58 UTC (permalink / raw)
  To: Ferruh Yigit, Sagi Grimberg, dev; +Cc: Nélio Laranjeiro, Adrien Mazarguil

Wednesday, August 23, 2017 2:04 PM, Ferruh Yigit:
> On 8/21/2017 8:47 AM, Sagi Grimberg wrote:
> > From: Shahaf Shuler <shahafs@mellanox.com>
> >
> > By default, Verbs maps the doorbell register to write combining.
> > Working with write combining is useful for drivers which use blue
> > flame for the doorbell write.
> >
> > Since mlx5 PMD uses only doorbells and write combining mapping
> > requires an extra memory barrier to flush the doorbell after its
> > write, setting the mapping to un-cached by default.
> >
> > Such change is reduces the max and average round trip latency
> > significantly.
> >
> > Reported-by: Alexander Solganik <solganik@gmail.com>
> > Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
> > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > Signed-off-by: Alexander Solganik <solganik@gmail.com>
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > ---
> >  drivers/net/mlx5/mlx5.c | 2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > b7e5046325c0..4c2a0b9652e3 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -920,6 +920,8 @@ rte_mlx5_pmd_init(void)
> >  	 * using this PMD, which is not supported in forked processes.
> >  	 */
> >  	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
> > +	/* Don't map UAR to WC if BlueFlame is not used.*/
> > +	setenv("MLX5_SHUT_UP_BF", "1", 1);

Even if it is not clean, this is the only way to toggle the BF mapping mode on libmlx5/rdma-core.
We will try to propose changes on those libs to expose a proper API, in the meanwhile we have no other way.

> 
> Although technically this is possible, I wonder how good idea it is a driver
> communicating with other processes via system environment variable?
> 
> >  	ibv_fork_init();
> >  	rte_pci_register(&mlx5_driver);
> >  }
> >


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] net/mlx5: don't map doorbell register to write combining
  2017-08-23 11:03   ` Ferruh Yigit
  2017-08-23 11:58     ` Shahaf Shuler
@ 2017-08-23 12:06     ` Nélio Laranjeiro
  1 sibling, 0 replies; 14+ messages in thread
From: Nélio Laranjeiro @ 2017-08-23 12:06 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: Sagi Grimberg, dev, Adrien Mazarguil, Shahaf Shuler

On Wed, Aug 23, 2017 at 12:03:39PM +0100, Ferruh Yigit wrote:
> On 8/21/2017 8:47 AM, Sagi Grimberg wrote:
> > From: Shahaf Shuler <shahafs@mellanox.com>
> > 
> > By default, Verbs maps the doorbell register to write combining.
> > Working with write combining is useful for drivers which use blue flame
> > for the doorbell write.
> > 
> > Since mlx5 PMD uses only doorbells and write combining mapping requires
> > an extra memory barrier to flush the doorbell after its write, setting
> > the mapping to un-cached by default.
> > 
> > Such change is reduces the max and average round trip
> > latency significantly.
> > 
> > Reported-by: Alexander Solganik <solganik@gmail.com>
> > Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
> > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > Signed-off-by: Alexander Solganik <solganik@gmail.com>
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > ---
> >  drivers/net/mlx5/mlx5.c | 2 ++
> >  1 file changed, 2 insertions(+)
> > 
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> > index b7e5046325c0..4c2a0b9652e3 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -920,6 +920,8 @@ rte_mlx5_pmd_init(void)
> >  	 * using this PMD, which is not supported in forked processes.
> >  	 */
> >  	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
> > +	/* Don't map UAR to WC if BlueFlame is not used.*/
> > +	setenv("MLX5_SHUT_UP_BF", "1", 1);
> 
> Although technically this is possible, I wonder how good idea it is a
> driver communicating with other processes via system environment variable?

We don't have such flexibility though verbs, those environment variable are
read by libmlx5 code and the PMD is linked though libibverbs which does not
provide a "clean" way tweak libmlx5.
It is not used to communicate with another process, it is the same process
executing the code which reads those environment variables.
Currently it is the only solution we have.

For the patch:

Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Thanks,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net/mlx5: replace memory barrier type
  2017-08-23 11:39   ` Nélio Laranjeiro
@ 2017-08-23 13:11     ` Bruce Richardson
  2017-08-24  6:56       ` Shahaf Shuler
  0 siblings, 1 reply; 14+ messages in thread
From: Bruce Richardson @ 2017-08-23 13:11 UTC (permalink / raw)
  To: Nélio Laranjeiro; +Cc: Sagi Grimberg, dev, Adrien Mazarguil, Shahaf Shuler

On Wed, Aug 23, 2017 at 01:39:08PM +0200, Nélio Laranjeiro wrote:
> On Mon, Aug 21, 2017 at 10:47:01AM +0300, Sagi Grimberg wrote:
> > From: Shahaf Shuler <shahafs@mellanox.com>
> > 
> > The reason for the requirement of a barrier between the txq writes
> > and the doorbell record writes is to avoid a case where the device
> > reads the doorbell record's new value before the txq writes are flushed
> > to memory.
> > 
> > The current use of rte_wmb is not necessary, and can be replaced by
> > rte_compiler_barrier as it acts as a write memory barrier.
> > More details on this type of barrier can be found on [1]
> > 
> > Replacing the rte_wmb is also expected to improve the throughput.
> > 
> > [1] https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
> > 
> > Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
> > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > Signed-off-by: Alexander Solganik <solganik@gmail.com>
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > ---
> >  drivers/net/mlx5/mlx5_rxtx.h | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > index 7de1d10863e5..59b9ff24fb82 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > @@ -602,7 +602,7 @@ mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
> >  	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
> >  	volatile uint64_t *src = ((volatile uint64_t *)wqe);
> >  
> > -	rte_wmb();
> > +	rte_compiler_barrier();
> >  	*txq->qp_db = htonl(txq->wqe_ci);
> >  	/* Ensure ordering between DB record and BF copy. */
> >  	rte_wmb();
> > -- 
> > 2.7.4
>  
> Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> 
While a compiler barrier may do on platforms with strong ordering, I'm
wondering if the rte_smp_wmb() macro may be needed here to give compiler
barrier or actual memory barrier depending on platform?

/Bruce

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net/mlx5: replace memory barrier type
  2017-08-23 13:11     ` Bruce Richardson
@ 2017-08-24  6:56       ` Shahaf Shuler
  2017-08-24  9:27         ` Bruce Richardson
  0 siblings, 1 reply; 14+ messages in thread
From: Shahaf Shuler @ 2017-08-24  6:56 UTC (permalink / raw)
  To: Bruce Richardson, Nélio Laranjeiro
  Cc: Sagi Grimberg, dev, Adrien Mazarguil

Wednesday, August 23, 2017 4:12 PM, Bruce Richardson:
> On Wed, Aug 23, 2017 at 01:39:08PM +0200, Nélio Laranjeiro wrote:
> > On Mon, Aug 21, 2017 at 10:47:01AM +0300, Sagi Grimberg wrote:
> >
> > Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> >
> While a compiler barrier may do on platforms with strong ordering, I'm
> wondering if the rte_smp_wmb() macro may be needed here to give
> compiler barrier or actual memory barrier depending on platform?

Thanks for the catch!

However, the description of rte_smp_wmb() not seems to fit our case here.
We don't try to sync between different lcores, rather between the device and a single lcore. 

Maybe rte_io_wmb fits better? 

> 
> /Bruce

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net/mlx5: replace memory barrier type
  2017-08-24  6:56       ` Shahaf Shuler
@ 2017-08-24  9:27         ` Bruce Richardson
  0 siblings, 0 replies; 14+ messages in thread
From: Bruce Richardson @ 2017-08-24  9:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: Nélio Laranjeiro, Sagi Grimberg, dev, Adrien Mazarguil

On Thu, Aug 24, 2017 at 06:56:11AM +0000, Shahaf Shuler wrote:
> Wednesday, August 23, 2017 4:12 PM, Bruce Richardson:
> > On Wed, Aug 23, 2017 at 01:39:08PM +0200, Nélio Laranjeiro wrote:
> > > On Mon, Aug 21, 2017 at 10:47:01AM +0300, Sagi Grimberg wrote:
> > >
> > > Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > >
> > While a compiler barrier may do on platforms with strong ordering, I'm
> > wondering if the rte_smp_wmb() macro may be needed here to give
> > compiler barrier or actual memory barrier depending on platform?
> 
> Thanks for the catch!
> 
> However, the description of rte_smp_wmb() not seems to fit our case here.
> We don't try to sync between different lcores, rather between the device and a single lcore. 
> 
> Maybe rte_io_wmb fits better? 
> 
Yep. Looks about right.

/Bruce

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 0/2] mlx5 high latency observed on send operations
  2017-08-21  7:47 [PATCH 0/2] mlx5 high latency observed on send operations Sagi Grimberg
  2017-08-21  7:47 ` [PATCH 1/2] net/mlx5: replace memory barrier type Sagi Grimberg
  2017-08-21  7:47 ` [PATCH 2/2] net/mlx5: don't map doorbell register to write combining Sagi Grimberg
@ 2017-08-27  6:47 ` Shahaf Shuler
  2017-08-27  6:47   ` [PATCH v2 1/2] net/mlx5: replace memory barrier type Shahaf Shuler
                     ` (2 more replies)
  2 siblings, 3 replies; 14+ messages in thread
From: Shahaf Shuler @ 2017-08-27  6:47 UTC (permalink / raw)
  To: nelio.laranjeiro, adrien.mazarguil; +Cc: dev

from sagi@grimberg.me:

When measuring latency when running a latency critical workload on mlx5 pmd drivers we noticed high latency can occur due to delayed doorbell record update flush.

This can be reproduced using the simple program [1] against testpmd macswap fwd mode. This utility sends a raw ethernet frame to the dpdk port and measures the time between send and the received mirrored frame.

This patchset guarantees immediate doorbell updates visibility by making the doorbell a non-cacheble memory.
In addition, we relax the memory barrier for dma-able memory.

Without this fix the tsc delta was 3550760-5993019 cycles (which translates to 2-6 ms on 1.7 GHz processor).

With the fix applied the tsc delta reduced to 17740-29663 (wich translates to 9-17 us).

on v2:
 * replace compiler barrier with rte_io_wmb.

Shahaf Shuler (2):
  net/mlx5: replace memory barrier type
  net/mlx5: don't map doorbell register to write combining

 drivers/net/mlx5/mlx5.c      | 2 ++
 drivers/net/mlx5/mlx5_rxtx.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

[1]:
/*
 * compiling: gcc test.c -o test
 * run using: ./test <local_iface> <dest_mac>  */ #include <arpa/inet.h> #include <linux/if_packet.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include <sys/ioctl.h> #include <sys/socket.h> #include <net/if.h> #include <netinet/ether.h>

#define BUF_SIZ		1024

static inline uint64_t rte_rdtsc(void)
{
	union {
		uint64_t tsc_64;
		struct {
			uint32_t lo_32;
			uint32_t hi_32;
		};
	} tsc;

	asm volatile("rdtsc" :
		     "=a" (tsc.lo_32),
		     "=d" (tsc.hi_32));
	return tsc.tsc_64;
}

int main(int argc, char *argv[])
{
	int sockfd;
	struct ifreq if_idx;
	struct ifreq if_mac;
	int tx_len = 0;
	char sendbuf[BUF_SIZ];
	struct ether_header *eh = (struct ether_header *) sendbuf;
	struct sockaddr_ll socket_address;
	char ifname[IFNAMSIZ];
	int values[6];
	struct ether_header expected;
	uint64_t payload = 0xB16B00B5;
	uint8_t buffer[1024];
	int result;
	uint64_t before_rcv;
	uint64_t after_rcv;
	uint64_t delta;
	int numbytes;

	if (argc != 3) {
		fprintf(stderr, "device name and dest mac\n");
		return -1;
	}

	strcpy(ifname, argv[1]);
	result = sscanf(argv[2], "%x:%x:%x:%x:%x:%x",
			&values[0], &values[1], &values[2], &values[3], &values[4], &values[5]);
	if (result != 6) {
		fprintf(stderr, "invalid mac\n");
		return -1;
	}

	/* Open RAW socket to send on */
	if ((sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))) == -1) {
	    perror("socket");
	}

	/* Get the index of the interface to send on */
	memset(&if_idx, 0, sizeof(struct ifreq));
	strncpy(if_idx.ifr_name, ifname, IFNAMSIZ-1);
	if (ioctl(sockfd, SIOCGIFINDEX, &if_idx) < 0)
	    perror("SIOCGIFINDEX");
	/* Get the MAC address of the interface to send on */
	memset(&if_mac, 0, sizeof(struct ifreq));
	strncpy(if_mac.ifr_name, ifname, IFNAMSIZ-1);
	if (ioctl(sockfd, SIOCGIFHWADDR, &if_mac) < 0)
	    perror("SIOCGIFHWADDR");

	/* Construct the Ethernet header */
	memset(sendbuf, 0, BUF_SIZ);
	/* Ethernet header */
	eh->ether_shost[0] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[0];
	eh->ether_shost[1] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[1];
	eh->ether_shost[2] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[2];
	eh->ether_shost[3] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[3];
	eh->ether_shost[4] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[4];
	eh->ether_shost[5] = ((uint8_t *)&if_mac.ifr_hwaddr.sa_data)[5];
	eh->ether_dhost[0] = values[0];
	eh->ether_dhost[1] = values[1];
	eh->ether_dhost[2] = values[2];
	eh->ether_dhost[3] = values[3];
	eh->ether_dhost[4] = values[4];
	eh->ether_dhost[5] = values[5];
	/* Ethertype field */
	eh->ether_type = htons(ETH_P_IP);
	tx_len += sizeof(struct ether_header);

	memcpy(&sendbuf[tx_len], &payload, sizeof(payload));
	tx_len += sizeof(payload);

	/* Index of the network device */
	socket_address.sll_ifindex = if_idx.ifr_ifindex;
	/* Address length*/
	socket_address.sll_halen = ETH_ALEN;
	/* Destination MAC */
	socket_address.sll_addr[0] = values[0];
	socket_address.sll_addr[1] = values[1];
	socket_address.sll_addr[2] = values[2];
	socket_address.sll_addr[3] = values[3];
	socket_address.sll_addr[4] = values[4];
	socket_address.sll_addr[5] = values[5];

	memcpy(&expected.ether_dhost, &eh->ether_shost, ETH_ALEN);
	memcpy(&expected.ether_shost, &eh->ether_dhost, ETH_ALEN);
	expected.ether_type = eh->ether_type;


	/* Send packet */
	if (sendto(sockfd, sendbuf, tx_len, 0, (struct sockaddr*)&socket_address, sizeof(struct sockaddr_ll)) < 0) {
	    printf("Send failed\n");
	    return -2;
	}

	before_rcv = rte_rdtsc();
	while (1) {
		numbytes = recvfrom(sockfd, buffer, BUF_SIZ, 0, NULL, NULL);
		if (numbytes <= 0)
			continue;
		after_rcv = rte_rdtsc();

		if (memcmp(&expected, buffer, sizeof(expected)) != 0)
			continue;

		if (memcmp(&payload, &buffer[sizeof(expected)], sizeof(payload)) == 0) {
			break;
		}

	}

	delta =  after_rcv - before_rcv;
	printf("RTT is %lu tsc \n", delta);
	return 0;
}


-- 
2.12.0

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 1/2] net/mlx5: replace memory barrier type
  2017-08-27  6:47 ` [PATCH v2 0/2] mlx5 high latency observed on send operations Shahaf Shuler
@ 2017-08-27  6:47   ` Shahaf Shuler
  2017-08-27  6:47   ` [PATCH v2 2/2] net/mlx5: don't map doorbell register to write combining Shahaf Shuler
  2017-08-29 16:53   ` [PATCH v2 0/2] mlx5 high latency observed on send operations Ferruh Yigit
  2 siblings, 0 replies; 14+ messages in thread
From: Shahaf Shuler @ 2017-08-27  6:47 UTC (permalink / raw)
  To: nelio.laranjeiro, adrien.mazarguil
  Cc: dev, Yongseok Koh, Alexander Solganik, Sagi Grimberg

The reason for the requirement of a barrier between the txq writes
and the doorbell record writes is to avoid a case where the device
reads the doorbell record's new value before the txq writes are flushed
to memory.

The current use of rte_wmb is not necessary, and can be replaced by
rte_io_wmb which is more relaxed.

Replacing the rte_wmb is also expected to improve the throughput.

on v2:
 * replace compiler barrier with rte_io_wmb.

Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Signed-off-by: Alexander Solganik <solganik@gmail.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index b3b161da5..e9895a9c0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -591,7 +591,7 @@ mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
 	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
 
-	rte_wmb();
+	rte_io_wmb();
 	*txq->qp_db = htonl(txq->wqe_ci);
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
-- 
2.12.0

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH v2 2/2] net/mlx5: don't map doorbell register to write combining
  2017-08-27  6:47 ` [PATCH v2 0/2] mlx5 high latency observed on send operations Shahaf Shuler
  2017-08-27  6:47   ` [PATCH v2 1/2] net/mlx5: replace memory barrier type Shahaf Shuler
@ 2017-08-27  6:47   ` Shahaf Shuler
  2017-08-29 16:53   ` [PATCH v2 0/2] mlx5 high latency observed on send operations Ferruh Yigit
  2 siblings, 0 replies; 14+ messages in thread
From: Shahaf Shuler @ 2017-08-27  6:47 UTC (permalink / raw)
  To: nelio.laranjeiro, adrien.mazarguil
  Cc: dev, Yongseok Koh, Alexander Solganik, Sagi Grimberg

By default, Verbs maps the doorbell register to write combining.
Working with write combining is useful for drivers which use blue flame
for the doorbell write.

Since mlx5 PMD uses only doorbells and write combining mapping requires
an extra memory barrier to flush the doorbell after its write, setting
the mapping to un-cached by default.

Such change is expected to reduce the max and average round trip latency.

Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Signed-off-by: Alexander Solganik <solganik@gmail.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index bd66a7c77..50f4ba70a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -884,6 +884,8 @@ rte_mlx5_pmd_init(void)
 	 * using this PMD, which is not supported in forked processes.
 	 */
 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
+	/* Don't map UAR to WC if BlueFlame is not used.*/
+	setenv("MLX5_SHUT_UP_BF", "1", 1);
 	ibv_fork_init();
 	rte_pci_register(&mlx5_driver);
 }
-- 
2.12.0

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 0/2] mlx5 high latency observed on send operations
  2017-08-27  6:47 ` [PATCH v2 0/2] mlx5 high latency observed on send operations Shahaf Shuler
  2017-08-27  6:47   ` [PATCH v2 1/2] net/mlx5: replace memory barrier type Shahaf Shuler
  2017-08-27  6:47   ` [PATCH v2 2/2] net/mlx5: don't map doorbell register to write combining Shahaf Shuler
@ 2017-08-29 16:53   ` Ferruh Yigit
  2 siblings, 0 replies; 14+ messages in thread
From: Ferruh Yigit @ 2017-08-29 16:53 UTC (permalink / raw)
  To: Shahaf Shuler, nelio.laranjeiro, adrien.mazarguil; +Cc: dev

On 8/27/2017 7:47 AM, Shahaf Shuler wrote:
> from sagi@grimberg.me:
> 
> When measuring latency when running a latency critical workload on mlx5 pmd drivers we noticed high latency can occur due to delayed doorbell record update flush.
> 
> This can be reproduced using the simple program [1] against testpmd macswap fwd mode. This utility sends a raw ethernet frame to the dpdk port and measures the time between send and the received mirrored frame.
> 
> This patchset guarantees immediate doorbell updates visibility by making the doorbell a non-cacheble memory.
> In addition, we relax the memory barrier for dma-able memory.
> 
> Without this fix the tsc delta was 3550760-5993019 cycles (which translates to 2-6 ms on 1.7 GHz processor).
> 
> With the fix applied the tsc delta reduced to 17740-29663 (wich translates to 9-17 us).
> 
> on v2:
>  * replace compiler barrier with rte_io_wmb.
> 
> Shahaf Shuler (2):
>   net/mlx5: replace memory barrier type
>   net/mlx5: don't map doorbell register to write combining

Series applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-08-29 16:53 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-21  7:47 [PATCH 0/2] mlx5 high latency observed on send operations Sagi Grimberg
2017-08-21  7:47 ` [PATCH 1/2] net/mlx5: replace memory barrier type Sagi Grimberg
2017-08-23 11:39   ` Nélio Laranjeiro
2017-08-23 13:11     ` Bruce Richardson
2017-08-24  6:56       ` Shahaf Shuler
2017-08-24  9:27         ` Bruce Richardson
2017-08-21  7:47 ` [PATCH 2/2] net/mlx5: don't map doorbell register to write combining Sagi Grimberg
2017-08-23 11:03   ` Ferruh Yigit
2017-08-23 11:58     ` Shahaf Shuler
2017-08-23 12:06     ` Nélio Laranjeiro
2017-08-27  6:47 ` [PATCH v2 0/2] mlx5 high latency observed on send operations Shahaf Shuler
2017-08-27  6:47   ` [PATCH v2 1/2] net/mlx5: replace memory barrier type Shahaf Shuler
2017-08-27  6:47   ` [PATCH v2 2/2] net/mlx5: don't map doorbell register to write combining Shahaf Shuler
2017-08-29 16:53   ` [PATCH v2 0/2] mlx5 high latency observed on send operations Ferruh Yigit

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.