All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mbuf: add helpers to prefetch mbuf
@ 2016-05-09 16:18 Olivier Matz
  2016-05-09 17:28 ` Thomas Monjalon
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Olivier Matz @ 2016-05-09 16:18 UTC (permalink / raw)
  To: dev

Some architectures (ex: Power8) have a cache line size of 128 bytes,
so the drivers should not expect that prefetching the second part of
the mbuf is valid.

This commit adds helpers that can be used by drivers to prefetch the
rx or tx part of the mbuf, whatever the cache line size.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 drivers/net/fm10k/fm10k_rxtx_vec.c |  8 ++++----
 drivers/net/i40e/i40e_rxtx_vec.c   |  8 ++++----
 drivers/net/ixgbe/ixgbe_rxtx_vec.c |  8 ++++----
 drivers/net/mlx4/mlx4.c            |  4 ++--
 drivers/net/mlx5/mlx5_rxtx.c       |  4 ++--
 examples/ipsec-secgw/ipsec-secgw.c |  2 +-
 lib/librte_mbuf/rte_mbuf.h         | 38 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index f8efe8f..2f3ccfe 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -487,10 +487,10 @@ fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		rte_compiler_barrier();
 
 		if (split_packet) {
-			rte_prefetch0(&rx_pkts[pos]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
+			rte_mbuf_prefetch_part1(rx_pkts[pos]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 3]);
 		}
 
 		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
index 047aff5..ac528c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec.c
@@ -331,10 +331,10 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
 
 		if (split_packet) {
-			rte_prefetch0(&rx_pkts[pos]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
+			rte_mbuf_prefetch_part1(rx_pkts[pos]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 3]);
 		}
 
 		/*shift the pktlen field*/
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index 5040704..9bc7a5d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -302,10 +302,10 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
 
 		if (split_packet) {
-			rte_prefetch0(&rx_pkts[pos]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
+			rte_mbuf_prefetch_part1(rx_pkts[pos]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part1(rx_pkts[pos + 3]);
 		}
 
 		/* avoid compiler reorder optimization */
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 4f21dbe..e7a426d 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -3235,8 +3235,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * Fetch initial bytes of packet descriptor into a
 		 * cacheline while allocating rep.
 		 */
-		rte_prefetch0(seg);
-		rte_prefetch0(&seg->cacheline1);
+		rte_mbuf_prefetch_part0(seg);
+		rte_mbuf_prefetch_part1(seg);
 		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
 						    &flags);
 		if (unlikely(ret < 0)) {
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9d1380a..3e901be 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1086,8 +1086,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * Fetch initial bytes of packet descriptor into a
 		 * cacheline while allocating rep.
 		 */
-		rte_prefetch0(seg);
-		rte_prefetch0(&seg->cacheline1);
+		rte_mbuf_prefetch_part0(seg);
+		rte_mbuf_prefetch_part1(seg);
 		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 00ab2d8..ecaa197 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -298,7 +298,7 @@ prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint8_t port)
 	const int32_t prefetch_offset = 2;
 
 	for (i = 0; i < (nb_pkts - prefetch_offset); i++) {
-		rte_prefetch0(pkts[i + prefetch_offset]->cacheline1);
+		rte_mbuf_prefetch_part1(pkts[i + prefetch_offset]);
 		prepare_tx_pkt(pkts[i], port);
 	}
 	/* Process left packets */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 529debb..e3ee0b3 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -842,6 +842,44 @@ struct rte_mbuf {
 	uint16_t timesync;
 } __rte_cache_aligned;
 
+/**
+ * Prefetch the first part of the mbuf
+ *
+ * The first 64 bytes of the mbuf corresponds to fields that are used early
+ * in the receive path. If the cache line of the architecture is higher than
+ * 64B, the second part will also be prefetched.
+ *
+ * @param m
+ *   The pointer to the mbuf.
+ */
+static inline void
+rte_mbuf_prefetch_part0(struct rte_mbuf *m)
+{
+	rte_prefetch0(&m->cacheline0);
+}
+
+/**
+ * Prefetch the second part of the mbuf
+ *
+ * The next 64 bytes of the mbuf corresponds to fields that are used in the
+ * transmit path. If the cache line of the architecture is higher than 64B,
+ * this function does nothing as it is expected that the full mbuf is
+ * already in cache.
+ *
+ * @param m
+ *   The pointer to the mbuf.
+ */
+static inline void
+rte_mbuf_prefetch_part1(struct rte_mbuf *m)
+{
+#if RTE_CACHE_LINE_SIZE == 64
+	rte_prefetch0(&m->cacheline1);
+#else
+	RTE_SET_USED(m);
+#endif
+}
+
+
 static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
 
 /**
-- 
2.8.0.rc3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] mbuf: add helpers to prefetch mbuf
  2016-05-09 16:18 [PATCH] mbuf: add helpers to prefetch mbuf Olivier Matz
@ 2016-05-09 17:28 ` Thomas Monjalon
  2016-05-09 22:02 ` Wiles, Keith
  2016-05-18 16:02 ` [PATCH v2] " Olivier Matz
  2 siblings, 0 replies; 7+ messages in thread
From: Thomas Monjalon @ 2016-05-09 17:28 UTC (permalink / raw)
  To: Olivier Matz; +Cc: dev

2016-05-09 18:18, Olivier Matz:
> +/**
> + * Prefetch the second part of the mbuf
> + *
> + * The next 64 bytes of the mbuf corresponds to fields that are used in the
> + * transmit path. If the cache line of the architecture is higher than 64B,
> + * this function does nothing as it is expected that the full mbuf is
> + * already in cache.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + */
> +static inline void
> +rte_mbuf_prefetch_part1(struct rte_mbuf *m)

It is not so natural to call rte_mbuf_prefetch_part1() for the second part.
Why not start mbuf parts numbering at 1?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mbuf: add helpers to prefetch mbuf
  2016-05-09 16:18 [PATCH] mbuf: add helpers to prefetch mbuf Olivier Matz
  2016-05-09 17:28 ` Thomas Monjalon
@ 2016-05-09 22:02 ` Wiles, Keith
  2016-05-10  8:08   ` Olivier MATZ
  2016-05-18 16:02 ` [PATCH v2] " Olivier Matz
  2 siblings, 1 reply; 7+ messages in thread
From: Wiles, Keith @ 2016-05-09 22:02 UTC (permalink / raw)
  To: Olivier Matz, dev

>diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
>index 529debb..e3ee0b3 100644
>--- a/lib/librte_mbuf/rte_mbuf.h
>+++ b/lib/librte_mbuf/rte_mbuf.h
>@@ -842,6 +842,44 @@ struct rte_mbuf {
> 	uint16_t timesync;
> } __rte_cache_aligned;
> 
>+/**
>+ * Prefetch the first part of the mbuf
>+ *
>+ * The first 64 bytes of the mbuf corresponds to fields that are used early
>+ * in the receive path. If the cache line of the architecture is higher than
>+ * 64B, the second part will also be prefetched.
>+ *
>+ * @param m
>+ *   The pointer to the mbuf.
>+ */
>+static inline void
>+rte_mbuf_prefetch_part0(struct rte_mbuf *m)
>+{
>+	rte_prefetch0(&m->cacheline0);
>+}
>+
>+/**
>+ * Prefetch the second part of the mbuf
>+ *
>+ * The next 64 bytes of the mbuf corresponds to fields that are used in the
>+ * transmit path. If the cache line of the architecture is higher than 64B,
>+ * this function does nothing as it is expected that the full mbuf is
>+ * already in cache.
>+ *
>+ * @param m
>+ *   The pointer to the mbuf.
>+ */
>+static inline void
>+rte_mbuf_prefetch_part1(struct rte_mbuf *m)
>+{
>+#if RTE_CACHE_LINE_SIZE == 64
>+	rte_prefetch0(&m->cacheline1);
>+#else
>+	RTE_SET_USED(m);
>+#endif
>+}

I am not super happy with the names here, but I understand that rte_mbuf_prefetch_cacheline0() is a bit long. I could live with them being longer if that makes more sense and adds to readability.

Another idea is to have only one function for both:

enum { MBUF_CACHELINE0 = 0, MBUF_CACHELINE1, MBUF_CACHELINES }; 	// Optional enum if you want

static inline void
rte_mbuf_prefetch(struct rte_mbuf *m, unsigned cacheline)	// Make sure we add a comment about the constant value
{
	if (cacheline == MBUF_CACHELINE0)
		rte_prefetch0(&m->cacheline0);
	else if (cacheline == MBUF_CACHELINE1)
		rte_prefetch0(&m->cacheline1);
	else {
		rte_prefetch0(&m->cacheline0);
		rte_prefetch0(&m->cacheline1);
	}
}

I believe if you use constant value in the call for the cacheline variable then the extra code should be optimized out. If not then what about a macro instead.

#define rte_mbuf_prefetch(m, c)	\
	do { \
		if ((c) == MBUF_CACHELINE0) \
			rte_prefetch0(&(m)->cacheline0); \
		else if ((c) == MBUF_CACHELINE1) \
			rte_prefetch0(&(m)->cacheline1); \
		else { \
			rte_prefetch0(&(m)->cacheline0); \
			rte_prefetch0(&(m)->cacheline1); \
		} \
	} while((0))

Call like this:
	rte_mbuf_prefetch(m, 0);	// For cacheline 0
	rte_mbuf_prefetch(m, 1);		// For cacheline 1
	rte_mbuf_prefetch(m, 2);		// For cacheline 0 and 1

We could have another routine:
	rte_mbuf_prefetch_data(m, 0);	// Prefetch the first cacheline of the packet data.

Just a thought and I did not test the above code, so I hope it works that way. I noticed something like this in the linux spinlock code a few years ago.



>+
>+
> static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
> 
> /**
>-- 
>2.8.0.rc3
>
>


Regards,
Keith





^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mbuf: add helpers to prefetch mbuf
  2016-05-09 22:02 ` Wiles, Keith
@ 2016-05-10  8:08   ` Olivier MATZ
  0 siblings, 0 replies; 7+ messages in thread
From: Olivier MATZ @ 2016-05-10  8:08 UTC (permalink / raw)
  To: Wiles, Keith, dev

Hi,

On 05/10/2016 12:02 AM, Wiles, Keith wrote:
>> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
>> index 529debb..e3ee0b3 100644
>> --- a/lib/librte_mbuf/rte_mbuf.h
>> +++ b/lib/librte_mbuf/rte_mbuf.h
>> @@ -842,6 +842,44 @@ struct rte_mbuf {
>> 	uint16_t timesync;
>> } __rte_cache_aligned;
>>
>> +/**
>> + * Prefetch the first part of the mbuf
>> + *
>> + * The first 64 bytes of the mbuf corresponds to fields that are used early
>> + * in the receive path. If the cache line of the architecture is higher than
>> + * 64B, the second part will also be prefetched.
>> + *
>> + * @param m
>> + *   The pointer to the mbuf.
>> + */
>> +static inline void
>> +rte_mbuf_prefetch_part0(struct rte_mbuf *m)
>> +{
>> +	rte_prefetch0(&m->cacheline0);
>> +}
>> +
>> +/**
>> + * Prefetch the second part of the mbuf
>> + *
>> + * The next 64 bytes of the mbuf corresponds to fields that are used in the
>> + * transmit path. If the cache line of the architecture is higher than 64B,
>> + * this function does nothing as it is expected that the full mbuf is
>> + * already in cache.
>> + *
>> + * @param m
>> + *   The pointer to the mbuf.
>> + */
>> +static inline void
>> +rte_mbuf_prefetch_part1(struct rte_mbuf *m)
>> +{
>> +#if RTE_CACHE_LINE_SIZE == 64
>> +	rte_prefetch0(&m->cacheline1);
>> +#else
>> +	RTE_SET_USED(m);
>> +#endif
>> +}
>
> I am not super happy with the names here, but I understand that rte_mbuf_prefetch_cacheline0() is a bit long. I could live with them being longer if that makes more sense and adds to readability.

Naming these functions rte_mbuf_prefetch_cacheline0() and
rte_mbuf_prefetch_cacheline1() was my first intention, but
as you said, it's long, and it's also not accurate because
here we don't really deal with cache lines, and that's why
I preferred to use "part" instead.

I'm not opposed to name them part1/part2 instead of part0/part1
as Thomas suggested. Another option would be:
   - rte_mbuf_prefetch_rx_part(m)
   - rte_mbuf_prefetch_tx_part(m)

The objective is to avoid the drivers to deal with the two possible
cache line sizes with #ifdefs. So I don't think the function should
be called something_cacheline.

As a side note, I'm not really satisfied by the RTE_CACHE_LINE_MIN_SIZE
and __rte_cache_min_aligned macros and I think it would be clearer
to explicitly align to 64. If other people agree, I can submit a patch
for this too.

Any comment?

> Another idea is to have only one function for both:
>
> enum { MBUF_CACHELINE0 = 0, MBUF_CACHELINE1, MBUF_CACHELINES }; 	// Optional enum if you want
>
> static inline void
> rte_mbuf_prefetch(struct rte_mbuf *m, unsigned cacheline)	// Make sure we add a comment about the constant value
> {
> 	if (cacheline == MBUF_CACHELINE0)
> 		rte_prefetch0(&m->cacheline0);
> 	else if (cacheline == MBUF_CACHELINE1)
> 		rte_prefetch0(&m->cacheline1);
> 	else {
> 		rte_prefetch0(&m->cacheline0);
> 		rte_prefetch0(&m->cacheline1);
> 	}
> }
>
> I believe if you use constant value in the call for the cacheline variable then the extra code should be optimized out. If not then what about a macro instead.
>
> #define rte_mbuf_prefetch(m, c)	\
> 	do { \
> 		if ((c) == MBUF_CACHELINE0) \
> 			rte_prefetch0(&(m)->cacheline0); \
> 		else if ((c) == MBUF_CACHELINE1) \
> 			rte_prefetch0(&(m)->cacheline1); \
> 		else { \
> 			rte_prefetch0(&(m)->cacheline0); \
> 			rte_prefetch0(&(m)->cacheline1); \
> 		} \
> 	} while((0))
>
> Call like this:
> 	rte_mbuf_prefetch(m, 0);	// For cacheline 0
> 	rte_mbuf_prefetch(m, 1);		// For cacheline 1
> 	rte_mbuf_prefetch(m, 2);		// For cacheline 0 and 1

In my opinion, the implementation and usage is simpler with 2
separate functions. What would be the advantage of this?

> We could have another routine:
> 	rte_mbuf_prefetch_data(m, 0);	// Prefetch the first cacheline of the packet data.

Well, here, I think there is no need to replace rte_prefetch0(m->data).

It's useful for mbuf structure prefetch because many drivers want
to prefetch the rx part first, then the tx part. For data, the same
function can be used whatever the cache line size.

Regards,
Olivier

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] mbuf: add helpers to prefetch mbuf
  2016-05-09 16:18 [PATCH] mbuf: add helpers to prefetch mbuf Olivier Matz
  2016-05-09 17:28 ` Thomas Monjalon
  2016-05-09 22:02 ` Wiles, Keith
@ 2016-05-18 16:02 ` Olivier Matz
  2016-05-19  6:46   ` Jerin Jacob
  2 siblings, 1 reply; 7+ messages in thread
From: Olivier Matz @ 2016-05-18 16:02 UTC (permalink / raw)
  To: dev; +Cc: thomas.monjalon, keith.wiles

Some architectures (ex: Power8) have a cache line size of 128 bytes,
so the drivers should not expect that prefetching the second part of
the mbuf with rte_prefetch0(&m->cacheline1) is valid.

This commit adds helpers that can be used by drivers to prefetch the
rx or tx part of the mbuf, whatever the cache line size.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---

v1 -> v2:
- rename part0 as part1 and part1 as part2, as suggested by Thomas


 drivers/net/fm10k/fm10k_rxtx_vec.c |  8 ++++----
 drivers/net/i40e/i40e_rxtx_vec.c   |  8 ++++----
 drivers/net/ixgbe/ixgbe_rxtx_vec.c |  8 ++++----
 drivers/net/mlx4/mlx4.c            |  4 ++--
 drivers/net/mlx5/mlx5_rxtx.c       |  4 ++--
 examples/ipsec-secgw/ipsec-secgw.c |  2 +-
 lib/librte_mbuf/rte_mbuf.h         | 38 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 03e4a5c..ef256a5 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -487,10 +487,10 @@ fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		rte_compiler_barrier();
 
 		if (split_packet) {
-			rte_prefetch0(&rx_pkts[pos]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
 		}
 
 		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
index f7a62a8..eef80d9 100644
--- a/drivers/net/i40e/i40e_rxtx_vec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec.c
@@ -297,10 +297,10 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
 
 		if (split_packet) {
-			rte_prefetch0(&rx_pkts[pos]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
 		}
 
 		/* avoid compiler reorder optimization */
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index c4d709b..e97ea82 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -307,10 +307,10 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
 
 		if (split_packet) {
-			rte_prefetch0(&rx_pkts[pos]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
 		}
 
 		/* avoid compiler reorder optimization */
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index c5d8535..733d192 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -3235,8 +3235,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * Fetch initial bytes of packet descriptor into a
 		 * cacheline while allocating rep.
 		 */
-		rte_prefetch0(seg);
-		rte_prefetch0(&seg->cacheline1);
+		rte_mbuf_prefetch_part1(seg);
+		rte_mbuf_prefetch_part2(seg);
 		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
 						    &flags);
 		if (unlikely(ret < 0)) {
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1832a21..5be8c62 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1086,8 +1086,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * Fetch initial bytes of packet descriptor into a
 		 * cacheline while allocating rep.
 		 */
-		rte_prefetch0(seg);
-		rte_prefetch0(&seg->cacheline1);
+		rte_mbuf_prefetch_part1(seg);
+		rte_mbuf_prefetch_part2(seg);
 		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 1dc505c..ebd7c23 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -298,7 +298,7 @@ prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint8_t port)
 	const int32_t prefetch_offset = 2;
 
 	for (i = 0; i < (nb_pkts - prefetch_offset); i++) {
-		rte_prefetch0(pkts[i + prefetch_offset]->cacheline1);
+		rte_mbuf_prefetch_part2(pkts[i + prefetch_offset]);
 		prepare_tx_pkt(pkts[i], port);
 	}
 	/* Process left packets */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 7b92b88..3ee8d66 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -842,6 +842,44 @@ struct rte_mbuf {
 	uint16_t timesync;
 } __rte_cache_aligned;
 
+/**
+ * Prefetch the first part of the mbuf
+ *
+ * The first 64 bytes of the mbuf corresponds to fields that are used early
+ * in the receive path. If the cache line of the architecture is higher than
+ * 64B, the second part will also be prefetched.
+ *
+ * @param m
+ *   The pointer to the mbuf.
+ */
+static inline void
+rte_mbuf_prefetch_part1(struct rte_mbuf *m)
+{
+	rte_prefetch0(&m->cacheline0);
+}
+
+/**
+ * Prefetch the second part of the mbuf
+ *
+ * The next 64 bytes of the mbuf corresponds to fields that are used in the
+ * transmit path. If the cache line of the architecture is higher than 64B,
+ * this function does nothing as it is expected that the full mbuf is
+ * already in cache.
+ *
+ * @param m
+ *   The pointer to the mbuf.
+ */
+static inline void
+rte_mbuf_prefetch_part2(struct rte_mbuf *m)
+{
+#if RTE_CACHE_LINE_SIZE == 64
+	rte_prefetch0(&m->cacheline1);
+#else
+	RTE_SET_USED(m);
+#endif
+}
+
+
 static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
 
 /**
-- 
2.8.0.rc3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] mbuf: add helpers to prefetch mbuf
  2016-05-18 16:02 ` [PATCH v2] " Olivier Matz
@ 2016-05-19  6:46   ` Jerin Jacob
  2016-05-24  9:20     ` Thomas Monjalon
  0 siblings, 1 reply; 7+ messages in thread
From: Jerin Jacob @ 2016-05-19  6:46 UTC (permalink / raw)
  To: Olivier Matz; +Cc: dev, thomas.monjalon, keith.wiles

On Wed, May 18, 2016 at 06:02:08PM +0200, Olivier Matz wrote:
> Some architectures (ex: Power8) have a cache line size of 128 bytes,
> so the drivers should not expect that prefetching the second part of
> the mbuf with rte_prefetch0(&m->cacheline1) is valid.
> 
> This commit add helpers that can be used by drivers to prefetch the
> rx or tx part of the mbuf, whatever the cache line size.
> 
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

Reviewed-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

> ---
> 
> v1 -> v2:
> - rename part0 as part1 and part1 as part2, as suggested by Thomas
> 
> 
>  drivers/net/fm10k/fm10k_rxtx_vec.c |  8 ++++----
>  drivers/net/i40e/i40e_rxtx_vec.c   |  8 ++++----
>  drivers/net/ixgbe/ixgbe_rxtx_vec.c |  8 ++++----
>  drivers/net/mlx4/mlx4.c            |  4 ++--
>  drivers/net/mlx5/mlx5_rxtx.c       |  4 ++--
>  examples/ipsec-secgw/ipsec-secgw.c |  2 +-
>  lib/librte_mbuf/rte_mbuf.h         | 38 ++++++++++++++++++++++++++++++++++++++
>  7 files changed, 55 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
> index 03e4a5c..ef256a5 100644
> --- a/drivers/net/fm10k/fm10k_rxtx_vec.c
> +++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
> @@ -487,10 +487,10 @@ fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		rte_compiler_barrier();
>  
>  		if (split_packet) {
> -			rte_prefetch0(&rx_pkts[pos]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
>  		}
>  
>  		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
> diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
> index f7a62a8..eef80d9 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec.c
> @@ -297,10 +297,10 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>  		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
>  
>  		if (split_packet) {
> -			rte_prefetch0(&rx_pkts[pos]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
>  		}
>  
>  		/* avoid compiler reorder optimization */
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> index c4d709b..e97ea82 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> @@ -307,10 +307,10 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>  		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
>  
>  		if (split_packet) {
> -			rte_prefetch0(&rx_pkts[pos]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
>  		}
>  
>  		/* avoid compiler reorder optimization */
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index c5d8535..733d192 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -3235,8 +3235,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		 * Fetch initial bytes of packet descriptor into a
>  		 * cacheline while allocating rep.
>  		 */
> -		rte_prefetch0(seg);
> -		rte_prefetch0(&seg->cacheline1);
> +		rte_mbuf_prefetch_part1(seg);
> +		rte_mbuf_prefetch_part2(seg);
>  		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
>  						    &flags);
>  		if (unlikely(ret < 0)) {
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 1832a21..5be8c62 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -1086,8 +1086,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		 * Fetch initial bytes of packet descriptor into a
>  		 * cacheline while allocating rep.
>  		 */
> -		rte_prefetch0(seg);
> -		rte_prefetch0(&seg->cacheline1);
> +		rte_mbuf_prefetch_part1(seg);
> +		rte_mbuf_prefetch_part2(seg);
>  		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
>  		if (unlikely(ret < 0)) {
>  			struct ibv_wc wc;
> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> index 1dc505c..ebd7c23 100644
> --- a/examples/ipsec-secgw/ipsec-secgw.c
> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> @@ -298,7 +298,7 @@ prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint8_t port)
>  	const int32_t prefetch_offset = 2;
>  
>  	for (i = 0; i < (nb_pkts - prefetch_offset); i++) {
> -		rte_prefetch0(pkts[i + prefetch_offset]->cacheline1);
> +		rte_mbuf_prefetch_part2(pkts[i + prefetch_offset]);
>  		prepare_tx_pkt(pkts[i], port);
>  	}
>  	/* Process left packets */
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 7b92b88..3ee8d66 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -842,6 +842,44 @@ struct rte_mbuf {
>  	uint16_t timesync;
>  } __rte_cache_aligned;
>  
> +/**
> + * Prefetch the first part of the mbuf
> + *
> + * The first 64 bytes of the mbuf corresponds to fields that are used early
> + * in the receive path. If the cache line of the architecture is higher than
> + * 64B, the second part will also be prefetched.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + */
> +static inline void
> +rte_mbuf_prefetch_part1(struct rte_mbuf *m)
> +{
> +	rte_prefetch0(&m->cacheline0);
> +}
> +
> +/**
> + * Prefetch the second part of the mbuf
> + *
> + * The next 64 bytes of the mbuf corresponds to fields that are used in the
> + * transmit path. If the cache line of the architecture is higher than 64B,
> + * this function does nothing as it is expected that the full mbuf is
> + * already in cache.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + */
> +static inline void
> +rte_mbuf_prefetch_part2(struct rte_mbuf *m)
> +{
> +#if RTE_CACHE_LINE_SIZE == 64
> +	rte_prefetch0(&m->cacheline1);
> +#else
> +	RTE_SET_USED(m);
> +#endif
> +}
> +
> +
>  static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
>  
>  /**
> -- 
> 2.8.0.rc3
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] mbuf: add helpers to prefetch mbuf
  2016-05-19  6:46   ` Jerin Jacob
@ 2016-05-24  9:20     ` Thomas Monjalon
  0 siblings, 0 replies; 7+ messages in thread
From: Thomas Monjalon @ 2016-05-24  9:20 UTC (permalink / raw)
  To: Olivier Matz; +Cc: Jerin Jacob, dev, keith.wiles

2016-05-19 12:16, Jerin Jacob:
> On Wed, May 18, 2016 at 06:02:08PM +0200, Olivier Matz wrote:
> > Some architectures (ex: Power8) have a cache line size of 128 bytes,
> > so the drivers should not expect that prefetching the second part of
> > the mbuf with rte_prefetch0(&m->cacheline1) is valid.
> > 
> > This commit add helpers that can be used by drivers to prefetch the
> > rx or tx part of the mbuf, whatever the cache line size.
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> 
> Reviewed-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-05-24  9:20 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-09 16:18 [PATCH] mbuf: add helpers to prefetch mbuf Olivier Matz
2016-05-09 17:28 ` Thomas Monjalon
2016-05-09 22:02 ` Wiles, Keith
2016-05-10  8:08   ` Olivier MATZ
2016-05-18 16:02 ` [PATCH v2] " Olivier Matz
2016-05-19  6:46   ` Jerin Jacob
2016-05-24  9:20     ` Thomas Monjalon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.