This random series addresses some of the suboptimal paths used in the main GRO entry point. The main body is patches 3-4, which simplify the code and improve flow distribution. The two others are mostly cosmetic, to make the code more readable. The benefits are not so huge and mostly depend on the NIC RSS hash function and the number of Rx flows per single NAPI instance. I got something like +10-15 Mbps on 4-8 flows NATing. Alexander Lobakin (4): gro: give 'hash' variable in dev_gro_receive() a less confusing name gro: don't dereference napi->gro_hash[x] multiple times in dev_gro_receive() gro: simplify gro_list_prepare() gro: improve flow distribution across GRO buckets in dev_gro_receive() net/core/dev.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) -- 2.30.2
'hash' stores not the flow hash, but the index of the GRO bucket corresponding to it. Change its name to 'bucket' to avoid confusion while reading lines like '__set_bit(hash, &napi->gro_bitmask)'. Signed-off-by: Alexander Lobakin <alobakin@pm.me> --- net/core/dev.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 2bfdd528c7c3..adc42ba7ffd8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5956,7 +5956,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -6024,7 +6024,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); - napi->gro_hash[hash].count--; + napi->gro_hash[bucket].count--; } if (same_flow) @@ -6033,10 +6033,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { + if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) { gro_flush_oldest(napi, gro_head); } else { - napi->gro_hash[hash].count++; + napi->gro_hash[bucket].count++; } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; @@ -6050,11 +6050,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (grow > 0) gro_pull_from_frag0(skb, grow); ok: - if (napi->gro_hash[hash].count) { - if (!test_bit(hash, &napi->gro_bitmask)) - __set_bit(hash, &napi->gro_bitmask); - } else if (test_bit(hash, &napi->gro_bitmask)) { - __clear_bit(hash, &napi->gro_bitmask); + if (napi->gro_hash[bucket].count) { + if (!test_bit(bucket, 
&napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); } return ret; -- 2.30.2
GRO bucket index doesn't change through the entire function. Store a pointer to the corresponding bucket on stack once and use it later instead of dereferencing again and again. Signed-off-by: Alexander Lobakin <alobakin@pm.me> --- net/core/dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index adc42ba7ffd8..ee124aecb8a2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5957,6 +5957,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -6024,7 +6025,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); - napi->gro_hash[bucket].count--; + gro_list->count--; } if (same_flow) @@ -6033,10 +6034,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) { + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) { gro_flush_oldest(napi, gro_head); } else { - napi->gro_hash[bucket].count++; + gro_list->count++; } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; @@ -6050,7 +6051,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (grow > 0) gro_pull_from_frag0(skb, grow); ok: - if (napi->gro_hash[bucket].count) { + if (gro_list->count) { if (!test_bit(bucket, &napi->gro_bitmask)) __set_bit(bucket, &napi->gro_bitmask); } else if (test_bit(bucket, &napi->gro_bitmask)) { -- 2.30.2
gro_list_prepare() always returns &napi->gro_hash[bucket].list, without any variations. Moreover, it uses 'napi' argument only to have access to this list, and calculates the bucket index for the second time (firstly it happens at the beginning of dev_gro_receive()) to do that. Given that dev_gro_receive() already has a pointer to the needed list, just pass it as the first argument to eliminate redundant calculations, and make gro_list_prepare() return void. Also, both arguments of gro_list_prepare() can be constified since this function can only modify the skbs from the bucket list. Signed-off-by: Alexander Lobakin <alobakin@pm.me> --- net/core/dev.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index ee124aecb8a2..65d9e7d9d1e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5858,15 +5858,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); -static struct list_head *gro_list_prepare(struct napi_struct *napi, - struct sk_buff *skb) +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); - struct list_head *head; struct sk_buff *p; - head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; @@ -5892,8 +5890,6 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } - - return head; } static void skb_gro_reset_offset(struct sk_buff *skb) @@ -5958,10 +5954,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; + struct list_head *gro_head = &gro_list->list; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head 
*gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; @@ -5970,7 +5966,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_head = gro_list_prepare(napi, skb); + gro_list_prepare(gro_head, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { -- 2.30.2
Most of the functions that "convert" a hash value into an index (when RPS is configured / XPS is not configured / etc.) apply reciprocal_scale() to it. Its logic is simple, but fair enough, and it accounts for the entire input value. On the opposite side, the 'hash & (GRO_HASH_BUCKETS - 1)' expression uses only the 3 least significant bits of the value, which is far from optimal (especially for XOR RSS hashers, where the hashes of two different flows may differ only by 1 bit somewhere in the middle). Use reciprocal_scale() here too to take the entire hash value into account and improve flow dispersion between GRO hash buckets. Signed-off-by: Alexander Lobakin <alobakin@pm.me> --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 65d9e7d9d1e8..bd7c9ba54623 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5952,7 +5952,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + u32 bucket = reciprocal_scale(skb_get_hash_raw(skb), GRO_HASH_BUCKETS); struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *gro_head = &gro_list->list; struct list_head *head = &offload_base; -- 2.30.2
On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> Most of the functions that "convert" hash value into an index
> (when RPS is configured / XPS is not configured / etc.) set
> reciprocal_scale() on it. Its logic is simple, but fair enough and
> accounts the entire input value.
> On the opposite side, 'hash & (GRO_HASH_BUCKETS - 1)' expression uses
> only 3 least significant bits of the value, which is far from
> optimal (especially for XOR RSS hashers, where the hashes of two
> different flows may differ only by 1 bit somewhere in the middle).
>
> Use reciprocal_scale() here too to take the entire hash value into
> account and improve flow dispersion between GRO hash buckets.
>
> Signed-off-by: Alexander Lobakin <alobakin@pm.me>
> ---
> net/core/dev.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 65d9e7d9d1e8..bd7c9ba54623 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5952,7 +5952,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
>
> static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> {
> - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> + u32 bucket = reciprocal_scale(skb_get_hash_raw(skb), GRO_HASH_BUCKETS);
This is going to use 3 high order bits instead of 3 low-order bits.
Now, had you used hash_32(skb_get_hash_raw(skb), 3), you could have
claimed to use "more bits"
Toeplitz already shuffles stuff.
Adding a multiply here seems not needed.
Please provide experimental results, because this looks unnecessary to me.
On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> GRO bucket index doesn't change through the entire function.
> Store a pointer to the corresponding bucket on stack once and use
> it later instead of dereferencing again and again.
>
> Signed-off-by: Alexander Lobakin <alobakin@pm.me>
> ---
> net/core/dev.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index adc42ba7ffd8..ee124aecb8a2 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5957,6 +5957,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
> static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> {
> u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> + struct gro_list *gro_list = &napi->gro_hash[bucket];
> struct list_head *head = &offload_base;
> struct packet_offload *ptype;
> __be16 type = skb->protocol;
> @@ -6024,7 +6025,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> if (pp) {
> skb_list_del_init(pp);
> napi_gro_complete(napi, pp);
> - napi->gro_hash[bucket].count--;
> + gro_list->count--;
> }
>
> if (same_flow)
> @@ -6033,10 +6034,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> if (NAPI_GRO_CB(skb)->flush)
> goto normal;
>
> - if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) {
> + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) {
> gro_flush_oldest(napi, gro_head);
> } else {
> - napi->gro_hash[bucket].count++;
> + gro_list->count++;
> }
> NAPI_GRO_CB(skb)->count = 1;
> NAPI_GRO_CB(skb)->age = jiffies;
> @@ -6050,7 +6051,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> if (grow > 0)
> gro_pull_from_frag0(skb, grow);
> ok:
> - if (napi->gro_hash[bucket].count) {
> + if (gro_list->count) {
> if (!test_bit(bucket, &napi->gro_bitmask))
> __set_bit(bucket, &napi->gro_bitmask);
> } else if (test_bit(bucket, &napi->gro_bitmask)) {
> --
> 2.30.2
>
>
This adds more register pressure, do you have precise measures to
confirm this change is a win ?
Presumably the compiler should be able to optimize the code just fine,
it can see @bucket does not change.
From: Eric Dumazet <edumazet@google.com> Date: Fri, 12 Mar 2021 17:33:53 +0100 > On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <alobakin@pm.me> wrote: > > > > Most of the functions that "convert" hash value into an index > > (when RPS is configured / XPS is not configured / etc.) set > > reciprocal_scale() on it. Its logics is simple, but fair enough and > > accounts the entire input value. > > On the opposite side, 'hash & (GRO_HASH_BUCKETS - 1)' expression uses > > only 3 least significant bits of the value, which is far from > > optimal (especially for XOR RSS hashers, where the hashes of two > > different flows may differ only by 1 bit somewhere in the middle). > > > > Use reciprocal_scale() here too to take the entire hash value into > > account and improve flow dispersion between GRO hash buckets. > > > > Signed-off-by: Alexander Lobakin <alobakin@pm.me> > > --- > > net/core/dev.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/net/core/dev.c b/net/core/dev.c > > index 65d9e7d9d1e8..bd7c9ba54623 100644 > > --- a/net/core/dev.c > > +++ b/net/core/dev.c > > @@ -5952,7 +5952,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) > > > > static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) > > { > > - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); > > + u32 bucket = reciprocal_scale(skb_get_hash_raw(skb), GRO_HASH_BUCKETS); > > This is going to use 3 high order bits instead of 3 low-order bits. We-e-ell, seems like it. > Now, had you use hash_32(skb_get_hash_raw(skb), 3), you could have > claimed to use "more bits" Nice suggestion, I'll try. If there won't be any visible improvements, I'll just drop this one. > Toeplitz already shuffles stuff. As well as CRC and others, but I feel like we shouldn't rely only on the hardware. > Adding a multiply here seems not needed. > > Please provide experimental results, because this looks unnecessary to me. Thanks, Al
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 12 Mar 2021 17:47:04 +0100
> On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <alobakin@pm.me> wrote:
> >
> > GRO bucket index doesn't change through the entire function.
> > Store a pointer to the corresponding bucket on stack once and use
> > it later instead of dereferencing again and again.
> >
> > Signed-off-by: Alexander Lobakin <alobakin@pm.me>
> > ---
> > net/core/dev.c | 9 +++++----
> > 1 file changed, 5 insertions(+), 4 deletions(-)
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index adc42ba7ffd8..ee124aecb8a2 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -5957,6 +5957,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
> > static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> > {
> > u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> > + struct gro_list *gro_list = &napi->gro_hash[bucket];
> > struct list_head *head = &offload_base;
> > struct packet_offload *ptype;
> > __be16 type = skb->protocol;
> > @@ -6024,7 +6025,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > if (pp) {
> > skb_list_del_init(pp);
> > napi_gro_complete(napi, pp);
> > - napi->gro_hash[bucket].count--;
> > + gro_list->count--;
> > }
> >
> > if (same_flow)
> > @@ -6033,10 +6034,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > if (NAPI_GRO_CB(skb)->flush)
> > goto normal;
> >
> > - if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) {
> > + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) {
> > gro_flush_oldest(napi, gro_head);
> > } else {
> > - napi->gro_hash[bucket].count++;
> > + gro_list->count++;
> > }
> > NAPI_GRO_CB(skb)->count = 1;
> > NAPI_GRO_CB(skb)->age = jiffies;
> > @@ -6050,7 +6051,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > if (grow > 0)
> > gro_pull_from_frag0(skb, grow);
> > ok:
> > - if (napi->gro_hash[bucket].count) {
> > + if (gro_list->count) {
> > if (!test_bit(bucket, &napi->gro_bitmask))
> > __set_bit(bucket, &napi->gro_bitmask);
> > } else if (test_bit(bucket, &napi->gro_bitmask)) {
> > --
> > 2.30.2
> >
> >
>
> This adds more register pressure, do you have precise measures to
> confirm this change is a win ?
>
> Presumably the compiler should be able to optimize the code just fine,
> it can see @bucket does not change.
This is mostly (if not purely) cosmetic, I don't think it changes
anything at all for most sane compilers.
Regarding registers, since @gro_list and @gro_head are pretty much the
same, we could drop @gro_head in favour of @gro_list and just use
@gro_list->list instead.
Al