* [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-09 18:32 [PATCH net-next 0/8] enic updates Govindarajulu Varadarajan
@ 2014-06-09 18:32 ` Govindarajulu Varadarajan
2014-06-10 13:11 ` Daniel Borkmann
` (2 more replies)
2014-06-09 18:32 ` [PATCH net-next 2/8] enic: fix return value in _vnic_dev_cmd Govindarajulu Varadarajan
` (5 subsequent siblings)
6 siblings, 3 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-09 18:32 UTC (permalink / raw)
To: davem, netdev; +Cc: ssujith, gvaradar, benve, _govind
skb_flow_dissect() dissects only transport header type in ip_proto. It dose not
give any information about IPv4 or IPv6.
This patch adds new member, n_proto, to struct flow_keys. Which records the
IP layer type. i.e IPv4 or IPv6.
This can be used in netdev->ndo_rx_flow_steer driver function to dissect flow.
Adding new member to flow_keys increases the struct size by around 4 bytes.
This causes BUILD_BUG_ON(sizeof(qcb->data) < sz); to fail in
qdisc_cb_private_validate()
So increase data size by 4
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
include/net/flow_keys.h | 14 ++++++++++++++
include/net/sch_generic.h | 2 +-
net/core/flow_dissector.c | 1 +
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
index 7e64bd8..fbefdca 100644
--- a/include/net/flow_keys.h
+++ b/include/net/flow_keys.h
@@ -1,6 +1,19 @@
#ifndef _NET_FLOW_KEYS_H
#define _NET_FLOW_KEYS_H
+/* struct flow_keys:
+ * @src: source ip address in case of IPv4
+ * For IPv6 it contains 32bit hash of src address
+ * @dst: destination ip address in case of IPv4
+ * For IPv6 it contains 32bit hash of dst address
+ * @ports: port numbers of Transport header
+ * port16[0]: src port number
+ * port16[1]: dst port number
+ * @thoff: Transport header offset
+ * @n_proto: Network header protocol (eg. IPv4/IPv6)
+ * @ip_proto: Transport header protocol (eg. TCP/UDP)
+ * All the members, except thoff, are in network byte order.
+ */
struct flow_keys {
/* (src,dst) must be grouped, in the same way than in IP header */
__be32 src;
@@ -10,6 +23,7 @@ struct flow_keys {
__be16 port16[2];
};
u16 thoff;
+ u16 n_proto;
u8 ip_proto;
};
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 624f985..a3cfb8e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -231,7 +231,7 @@ struct qdisc_skb_cb {
unsigned int pkt_len;
u16 slave_dev_queue_mapping;
u16 _pad;
- unsigned char data[20];
+ unsigned char data[24];
};
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12..c2b53c1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -175,6 +175,7 @@ ipv6:
break;
}
+ flow->n_proto = proto;
flow->ip_proto = ip_proto;
flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
flow->thoff = (u16) nhoff;
--
2.0.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-09 18:32 ` [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect() Govindarajulu Varadarajan
@ 2014-06-10 13:11 ` Daniel Borkmann
2014-06-10 14:08 ` Govindarajulu Varadarajan
2014-06-19 18:06 ` Chema Gonzalez
2014-06-24 5:29 ` Yinghai Lu
2 siblings, 1 reply; 36+ messages in thread
From: Daniel Borkmann @ 2014-06-10 13:11 UTC (permalink / raw)
To: Govindarajulu Varadarajan
Cc: davem, netdev, ssujith, gvaradar, benve, eric.dumazet
On 06/09/2014 08:32 PM, Govindarajulu Varadarajan wrote:
> skb_flow_dissect() dissects only transport header type in ip_proto. It dose not
> give any information about IPv4 or IPv6.
>
> This patch adds new member, n_proto, to struct flow_keys. Which records the
> IP layer type. i.e IPv4 or IPv6.
>
> This can be used in netdev->ndo_rx_flow_steer driver function to dissect flow.
>
> Adding new member to flow_keys increases the struct size by around 4 bytes.
> This causes BUILD_BUG_ON(sizeof(qcb->data) < sz); to fail in
> qdisc_cb_private_validate()
>
> So increase data size by 4
>
> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
> ---
> include/net/flow_keys.h | 14 ++++++++++++++
> include/net/sch_generic.h | 2 +-
> net/core/flow_dissector.c | 1 +
> 3 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
> index 7e64bd8..fbefdca 100644
> --- a/include/net/flow_keys.h
> +++ b/include/net/flow_keys.h
> @@ -1,6 +1,19 @@
> #ifndef _NET_FLOW_KEYS_H
> #define _NET_FLOW_KEYS_H
>
> +/* struct flow_keys:
> + * @src: source ip address in case of IPv4
> + * For IPv6 it contains 32bit hash of src address
> + * @dst: destination ip address in case of IPv4
> + * For IPv6 it contains 32bit hash of dst address
> + * @ports: port numbers of Transport header
> + * port16[0]: src port number
> + * port16[1]: dst port number
> + * @thoff: Transport header offset
> + * @n_proto: Network header protocol (eg. IPv4/IPv6)
> + * @ip_proto: Transport header protocol (eg. TCP/UDP)
> + * All the members, except thoff, are in network byte order.
> + */
> struct flow_keys {
> /* (src,dst) must be grouped, in the same way than in IP header */
> __be32 src;
> @@ -10,6 +23,7 @@ struct flow_keys {
> __be16 port16[2];
> };
> u16 thoff;
> + u16 n_proto;
> u8 ip_proto;
> };
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 624f985..a3cfb8e 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -231,7 +231,7 @@ struct qdisc_skb_cb {
> unsigned int pkt_len;
> u16 slave_dev_queue_mapping;
> u16 _pad;
> - unsigned char data[20];
> + unsigned char data[24];
I'm wondering if this is actually needed. We add an extra
u16 n_proto into the flow_keys *just* to determine IPv4/v6
while if it finds anything else than this the dissector
returns false anyway w/o filling out the flow keys structure.
Plus, in case of IPv6 you'll have a hashed/folded src/dst
addr anyway.
> };
>
> static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 107ed12..c2b53c1 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -175,6 +175,7 @@ ipv6:
> break;
> }
>
> + flow->n_proto = proto;
> flow->ip_proto = ip_proto;
> flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
> flow->thoff = (u16) nhoff;
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-10 13:11 ` Daniel Borkmann
@ 2014-06-10 14:08 ` Govindarajulu Varadarajan
2014-06-10 14:26 ` Eric Dumazet
0 siblings, 1 reply; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-10 14:08 UTC (permalink / raw)
To: Daniel Borkmann
Cc: Govindarajulu Varadarajan, davem, netdev, ssujith, gvaradar,
benve, eric.dumazet
On Tue, 10 Jun 2014, Daniel Borkmann wrote:
>> diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
>> index 7e64bd8..fbefdca 100644
>> --- a/include/net/flow_keys.h
>> +++ b/include/net/flow_keys.h
>> @@ -1,6 +1,19 @@
>> #ifndef _NET_FLOW_KEYS_H
>> #define _NET_FLOW_KEYS_H
>>
>> +/* struct flow_keys:
>> + * @src: source ip address in case of IPv4
>> + * For IPv6 it contains 32bit hash of src address
>> + * @dst: destination ip address in case of IPv4
>> + * For IPv6 it contains 32bit hash of dst address
>> + * @ports: port numbers of Transport header
>> + * port16[0]: src port number
>> + * port16[1]: dst port number
>> + * @thoff: Transport header offset
>> + * @n_proto: Network header protocol (eg. IPv4/IPv6)
>> + * @ip_proto: Transport header protocol (eg. TCP/UDP)
>> + * All the members, except thoff, are in network byte order.
>> + */
>> struct flow_keys {
>> /* (src,dst) must be grouped, in the same way than in IP header */
>> __be32 src;
>> @@ -10,6 +23,7 @@ struct flow_keys {
>> __be16 port16[2];
>> };
>> u16 thoff;
>> + u16 n_proto;
>> u8 ip_proto;
>> };
>>
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index 624f985..a3cfb8e 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -231,7 +231,7 @@ struct qdisc_skb_cb {
>> unsigned int pkt_len;
>> u16 slave_dev_queue_mapping;
>> u16 _pad;
>> - unsigned char data[20];
>> + unsigned char data[24];
>
> I'm wondering if this is actually needed. We add an extra
> u16 n_proto into the flow_keys *just* to determine IPv4/v6
> while if it finds anything else than this the dissector
> returns false anyway w/o filling out the flow keys structure.
> Plus, in case of IPv6 you'll have a hashed/folded src/dst
> addr anyway.
>
determining IPv4/IPv6 is important because this can be used in dissecting flow
in Accelerated RFS. Adaptor does not support IPv6 filters. Since Accelerated
RFS is supported for IPV6, using skb_flow_dissect will return true with non-zero
values in src/dst for IPv6.
If I am going to write separate function for getting IP address and port
numbers, its definition is going to be somewhat same as skb_flow_dissect.
Why not improve whats already written and reuse it?
Is there any significant downside of adding u16 n_proto and increasing
size of qdisc_skb_cb by 4 bytes?
Thanks
Govind
>> };
>>
>> static inline void qdisc_cb_private_validate(const struct sk_buff *skb,
>> int sz)
>> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>> index 107ed12..c2b53c1 100644
>> --- a/net/core/flow_dissector.c
>> +++ b/net/core/flow_dissector.c
>> @@ -175,6 +175,7 @@ ipv6:
>> break;
>> }
>>
>> + flow->n_proto = proto;
>> flow->ip_proto = ip_proto;
>> flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
>> flow->thoff = (u16) nhoff;
>>
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-10 14:08 ` Govindarajulu Varadarajan
@ 2014-06-10 14:26 ` Eric Dumazet
2014-06-11 22:06 ` David Miller
0 siblings, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2014-06-10 14:26 UTC (permalink / raw)
To: Govindarajulu Varadarajan
Cc: Daniel Borkmann, davem, netdev, ssujith, gvaradar, benve
On Tue, 2014-06-10 at 19:38 +0530, Govindarajulu Varadarajan wrote:
>
> determining IPv4/IPv6 is important because this can be used in dissecting flow
> in Accelerated RFS. Adaptor does not support IPv6 filters. Since Accelerated
> RFS is supported for IPV6, using skb_flow_dissect will return true with non-zero
> values in src/dst for IPv6.
>
> If I am going to write separate function for getting IP address and port
> numbers, its definition is going to be somewhat same as skb_flow_dissect.
> Why not improve whats already written and reuse it?
>
> Is there any significant downside of adding u16 n_proto and increasing
> size of qdisc_skb_cb by 4 bytes?
You can avoid this increase (might be bad for IB, hard to tell), by
changing sch_choke.c to only store a part of the struct flow_keys.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-10 14:26 ` Eric Dumazet
@ 2014-06-11 22:06 ` David Miller
2014-06-11 22:08 ` David Miller
0 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2014-06-11 22:06 UTC (permalink / raw)
To: eric.dumazet; +Cc: _govind, dborkman, netdev, ssujith, gvaradar, benve
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 10 Jun 2014 07:26:30 -0700
> On Tue, 2014-06-10 at 19:38 +0530, Govindarajulu Varadarajan wrote:
>>
>
>> determining IPv4/IPv6 is important because this can be used in dissecting flow
>> in Accelerated RFS. Adaptor does not support IPv6 filters. Since Accelerated
>> RFS is supported for IPV6, using skb_flow_dissect will return true with non-zero
>> values in src/dst for IPv6.
>>
>> If I am going to write separate function for getting IP address and port
>> numbers, its definition is going to be somewhat same as skb_flow_dissect.
>> Why not improve whats already written and reuse it?
>>
>> Is there any significant downside of adding u16 n_proto and increasing
>> size of qdisc_skb_cb by 4 bytes?
>
> You can avoid this increase (might be bad for IB, hard to tell), by
> changing sch_choke.c to only store a part of the struct flow_keys.
I think this is fine, IPOIB's control block will need still just 44
bytes after these changes, so there will still be 4 bytes to spare.
I'm going to apply this series.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-11 22:06 ` David Miller
@ 2014-06-11 22:08 ` David Miller
2014-06-12 5:38 ` Govindarajulu Varadarajan
0 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2014-06-11 22:08 UTC (permalink / raw)
To: eric.dumazet; +Cc: _govind, dborkman, netdev, ssujith, gvaradar, benve
From: David Miller <davem@davemloft.net>
Date: Wed, 11 Jun 2014 15:06:36 -0700 (PDT)
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 10 Jun 2014 07:26:30 -0700
>
>> On Tue, 2014-06-10 at 19:38 +0530, Govindarajulu Varadarajan wrote:
>>>
>>
>>> determining IPv4/IPv6 is important because this can be used in dissecting flow
>>> in Accelerated RFS. Adaptor does not support IPv6 filters. Since Accelerated
>>> RFS is supported for IPV6, using skb_flow_dissect will return true with non-zero
>>> values in src/dst for IPv6.
>>>
>>> If I am going to write separate function for getting IP address and port
>>> numbers, its definition is going to be somewhat same as skb_flow_dissect.
>>> Why not improve whats already written and reuse it?
>>>
>>> Is there any significant downside of adding u16 n_proto and increasing
>>> size of qdisc_skb_cb by 4 bytes?
>>
>> You can avoid this increase (might be bad for IB, hard to tell), by
>> changing sch_choke.c to only store a part of the struct flow_keys.
>
> I think this is fine, IPOIB's control block will need still just 44
> bytes after these changes, so there will still be 4 bytes to spare.
>
> I'm going to apply this series.
Actually, I change my mind, Govindarajulu can you address Sergei's feedback
in your other changes and repost this series?
Thanks.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-11 22:08 ` David Miller
@ 2014-06-12 5:38 ` Govindarajulu Varadarajan
0 siblings, 0 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-12 5:38 UTC (permalink / raw)
To: David Miller
Cc: eric.dumazet, _govind, dborkman, netdev, ssujith, gvaradar, benve
On Wed, 11 Jun 2014, David Miller wrote:
> Actually, I change my mind, Govindarajulu can you address Sergei's feedback
> in your other changes and repost this series?
>
Sure, I will send v2 soon.
Thanks
Govind
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-09 18:32 ` [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect() Govindarajulu Varadarajan
2014-06-10 13:11 ` Daniel Borkmann
@ 2014-06-19 18:06 ` Chema Gonzalez
2014-06-19 18:52 ` Govindarajulu Varadarajan
2014-06-24 5:29 ` Yinghai Lu
2 siblings, 1 reply; 36+ messages in thread
From: Chema Gonzalez @ 2014-06-19 18:06 UTC (permalink / raw)
To: Govindarajulu Varadarajan; +Cc: davem, netdev, ssujith, gvaradar, benve
On Mon, Jun 9, 2014 at 11:32 AM, Govindarajulu Varadarajan
<_govind@gmx.com> wrote:
> skb_flow_dissect() dissects only transport header type in ip_proto. It dose not
> give any information about IPv4 or IPv6.
>
> This patch adds new member, n_proto, to struct flow_keys. Which records the
> IP layer type. i.e IPv4 or IPv6.
>
> This can be used in netdev->ndo_rx_flow_steer driver function to dissect flow.
>
> Adding new member to flow_keys increases the struct size by around 4 bytes.
> This causes BUILD_BUG_ON(sizeof(qcb->data) < sz); to fail in
> qdisc_cb_private_validate()
>
> So increase data size by 4
>
> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
> ---
> include/net/flow_keys.h | 14 ++++++++++++++
> include/net/sch_generic.h | 2 +-
> net/core/flow_dissector.c | 1 +
> 3 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
> index 7e64bd8..fbefdca 100644
> --- a/include/net/flow_keys.h
> +++ b/include/net/flow_keys.h
> @@ -1,6 +1,19 @@
> #ifndef _NET_FLOW_KEYS_H
> #define _NET_FLOW_KEYS_H
>
> +/* struct flow_keys:
> + * @src: source ip address in case of IPv4
> + * For IPv6 it contains 32bit hash of src address
> + * @dst: destination ip address in case of IPv4
> + * For IPv6 it contains 32bit hash of dst address
> + * @ports: port numbers of Transport header
> + * port16[0]: src port number
> + * port16[1]: dst port number
> + * @thoff: Transport header offset
> + * @n_proto: Network header protocol (eg. IPv4/IPv6)
> + * @ip_proto: Transport header protocol (eg. TCP/UDP)
> + * All the members, except thoff, are in network byte order.
> + */
> struct flow_keys {
> /* (src,dst) must be grouped, in the same way than in IP header */
> __be32 src;
> @@ -10,6 +23,7 @@ struct flow_keys {
> __be16 port16[2];
> };
> u16 thoff;
> + u16 n_proto;
> u8 ip_proto;
If you add n_proto, have you considered adding nhoff too? That way the
L3 and L4 headers will be completely pinned by the flow dissector.
-Chema
> };
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 624f985..a3cfb8e 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -231,7 +231,7 @@ struct qdisc_skb_cb {
> unsigned int pkt_len;
> u16 slave_dev_queue_mapping;
> u16 _pad;
> - unsigned char data[20];
> + unsigned char data[24];
> };
>
> static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 107ed12..c2b53c1 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -175,6 +175,7 @@ ipv6:
> break;
> }
>
> + flow->n_proto = proto;
> flow->ip_proto = ip_proto;
> flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
> flow->thoff = (u16) nhoff;
> --
> 2.0.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-19 18:06 ` Chema Gonzalez
@ 2014-06-19 18:52 ` Govindarajulu Varadarajan
0 siblings, 0 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-19 18:52 UTC (permalink / raw)
To: Chema Gonzalez
Cc: Govindarajulu Varadarajan, davem, netdev, ssujith, gvaradar, benve
On Thu, 19 Jun 2014, Chema Gonzalez wrote:
> If you add n_proto, have you considered adding nhoff too? That way the
> L3 and L4 headers will be completely pinned by the flow dissector.
>
Yes, I have actually considered it. But right now there is no one who would need
nhoff. We can add it later when we need it?
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-09 18:32 ` [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect() Govindarajulu Varadarajan
2014-06-10 13:11 ` Daniel Borkmann
2014-06-19 18:06 ` Chema Gonzalez
@ 2014-06-24 5:29 ` Yinghai Lu
2014-06-26 6:34 ` Govindarajulu Varadarajan
2 siblings, 1 reply; 36+ messages in thread
From: Yinghai Lu @ 2014-06-24 5:29 UTC (permalink / raw)
To: Govindarajulu Varadarajan; +Cc: David Miller, NetDev, ssujith, gvaradar, benve
On Mon, Jun 9, 2014 at 11:32 AM, Govindarajulu Varadarajan
<_govind@gmx.com> wrote:
> skb_flow_dissect() dissects only transport header type in ip_proto. It dose not
> give any information about IPv4 or IPv6.
>
> This patch adds new member, n_proto, to struct flow_keys. Which records the
> IP layer type. i.e IPv4 or IPv6.
>
> This can be used in netdev->ndo_rx_flow_steer driver function to dissect flow.
>
> Adding new member to flow_keys increases the struct size by around 4 bytes.
> This causes BUILD_BUG_ON(sizeof(qcb->data) < sz); to fail in
> qdisc_cb_private_validate()
>
> So increase data size by 4
>
> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
> ---
> include/net/flow_keys.h | 14 ++++++++++++++
> include/net/sch_generic.h | 2 +-
> net/core/flow_dissector.c | 1 +
> 3 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
> index 7e64bd8..fbefdca 100644
> --- a/include/net/flow_keys.h
> +++ b/include/net/flow_keys.h
> @@ -1,6 +1,19 @@
> #ifndef _NET_FLOW_KEYS_H
> #define _NET_FLOW_KEYS_H
>
> +/* struct flow_keys:
> + * @src: source ip address in case of IPv4
> + * For IPv6 it contains 32bit hash of src address
> + * @dst: destination ip address in case of IPv4
> + * For IPv6 it contains 32bit hash of dst address
> + * @ports: port numbers of Transport header
> + * port16[0]: src port number
> + * port16[1]: dst port number
> + * @thoff: Transport header offset
> + * @n_proto: Network header protocol (eg. IPv4/IPv6)
> + * @ip_proto: Transport header protocol (eg. TCP/UDP)
> + * All the members, except thoff, are in network byte order.
> + */
> struct flow_keys {
> /* (src,dst) must be grouped, in the same way than in IP header */
> __be32 src;
> @@ -10,6 +23,7 @@ struct flow_keys {
> __be16 port16[2];
> };
> u16 thoff;
> + u16 n_proto;
> u8 ip_proto;
> };
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 624f985..a3cfb8e 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -231,7 +231,7 @@ struct qdisc_skb_cb {
> unsigned int pkt_len;
> u16 slave_dev_queue_mapping;
> u16 _pad;
> - unsigned char data[20];
> + unsigned char data[24];
> };
>
> static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 107ed12..c2b53c1 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -175,6 +175,7 @@ ipv6:
> break;
> }
>
> + flow->n_proto = proto;
> flow->ip_proto = ip_proto;
> flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
> flow->thoff = (u16) nhoff;
> --
> 2.0.0
>
this patch in net-next cause kernel crash.
[ 148.466045] qlge 0000:4a:00.1 eth27: Passed Get Port Configuration.
[ 162.385445] BUG: unable to handle kernel paging request at 000000010000007e
[ 162.385839] IP: [<ffffffff81f18899>] __dev_queue_xmit+0x399/0x630
[ 162.398541] PGD 0
[ 162.398659] Oops: 0002 [#1] SMP
[ 162.398845] Modules linked in:
[ 162.399022] CPU: 5 PID: 1 Comm: swapper/0 Tainted: G W
3.16.0-rc2-yh-00302-g3d5dc41-dirty #22
[ 162.418490] Hardware name: Oracle Corporation unknown /
, BIOS 11016600 05/17/2011
[ 162.438851] task: ffff884027a80000 ti: ffff881027d20000 task.ti:
ffff881027d20000
[ 162.468329] RIP: 0010:[<ffffffff81f18899>] [<ffffffff81f18899>]
__dev_queue_xmit+0x399/0x630
[ 162.488085] RSP: 0000:ffff881027d23d28 EFLAGS: 00010202
[ 162.488345] RAX: 00000000fffffffe RBX: ffff887026041000 RCX: 0000000000000001
[ 162.508245] RDX: 0000000000000000 RSI: ffffffff82dfee78 RDI: ffff884027a80000
[ 162.508590] RBP: ffff881027d23d70 R08: 0000000000000001 R09: 0000000000000000
[ 162.528255] R10: 0000000000000000 R11: ffff885026020800 R12: ffffffff82dfedc0
[ 162.547963] R13: ffff881022b7c000 R14: 0000000000000dac R15: ffff88702434c400
[ 162.548310] FS: 0000000000000000(0000) GS:ffff88103f000000(0000)
knlGS:0000000000000000
[ 162.568186] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 162.568489] CR2: 000000010000007e CR3: 0000000002c19000 CR4: 00000000000007e0
[ 162.588263] Stack:
[ 162.588354] ffffffff81f18505 0000000000000000 00ff881027d23d80
ffffffff82dfee60
[ 162.608269] ffff885023dd1e40 ffff881022b7c000 ffff885026020929
0000000000000dac
[ 162.628043] ffff887026041000 ffff881027d23d80 ffffffff81f18b40
ffff881027d23e18
[ 162.628422] Call Trace:
[ 162.647963] [<ffffffff81f18505>] ? __dev_queue_xmit+0x5/0x630
[ 162.648284] [<ffffffff81f18b40>] dev_queue_xmit+0x10/0x20
[ 162.667987] [<ffffffff83087b64>] ip_auto_config+0x8e6/0xf13
[ 162.668282] [<ffffffff8100031d>] ? do_one_initcall+0xdd/0x1e0
[ 162.688018] [<ffffffff810e36ad>] ? trace_hardirqs_on+0xd/0x10
[ 162.688298] [<ffffffff8110c40f>] ? ktime_get+0xbf/0x140
[ 162.708029] [<ffffffff8308727e>] ? root_nfs_parse_addr+0xbd/0xbd
[ 162.708292] [<ffffffff81000323>] do_one_initcall+0xe3/0x1e0
[ 162.728075] [<ffffffff810ba8bd>] ? parse_args+0x1ed/0x330
[ 162.728340] [<ffffffff8203501a>] ? printk+0x54/0x56
[ 162.748027] [<ffffffff8301f4c5>] kernel_init_freeable+0x237/0x2ce
[ 162.748344] [<ffffffff8301eaf7>] ? do_early_param+0x8a/0x8a
[ 162.768070] [<ffffffff8202abb0>] ? rest_init+0xc0/0xc0
[ 162.768318] [<ffffffff8202abbe>] kernel_init+0xe/0x100
[ 162.788057] [<ffffffff8204c4ac>] ret_from_fork+0x7c/0xb0
[ 162.788314] [<ffffffff8202abb0>] ? rest_init+0xc0/0xc0
[ 162.808008] Code: e8 5d 48 18 ff eb 13 48 c7 c7 60 d9 c5 82 e8 cf
bc 1c ff 85 c0 74 dd 0f 1f 00 48 8b 43 58 48 83 e0 fe 48 85 c0 48 89
43 58 74 07 <f0> ff 80 80 00 00 00 4c 89 e6 48 89 df 41 ff 14 24 41 89
c6 41
[ 162.828797] RIP [<ffffffff81f18899>] __dev_queue_xmit+0x399/0x630
[ 162.848186] RSP <ffff881027d23d28>
[ 162.848341] CR2: 000000010000007e
[ 162.848490] ---[ end trace 26b7736a09036e46 ]---
[ 162.868194] Kernel panic - not syncing: Fatal exception in interrupt
[ 162.872673] Kernel Offset: 0x0 from 0xffffffff81000000 (relocation
range: 0xffffffff80000000-0xffffffff9fffffff)
[ 162.888531] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
[ 162.908306] ------------[ cut here ]------------
After the commit is reverted, the system work again.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-24 5:29 ` Yinghai Lu
@ 2014-06-26 6:34 ` Govindarajulu Varadarajan
2014-09-18 14:18 ` Or Gerlitz
0 siblings, 1 reply; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-26 6:34 UTC (permalink / raw)
To: Yinghai Lu
Cc: Govindarajulu Varadarajan, David Miller, NetDev, ssujith,
gvaradar, benve
On Mon, 23 Jun 2014, Yinghai Lu wrote:
> this patch in net-next cause kernel crash.
>
> [ 148.466045] qlge 0000:4a:00.1 eth27: Passed Get Port Configuration.
> [ 162.385445] BUG: unable to handle kernel paging request at 000000010000007e
> [ 162.385839] IP: [<ffffffff81f18899>] __dev_queue_xmit+0x399/0x630
> [ 162.398541] PGD 0
> [ 162.398659] Oops: 0002 [#1] SMP
> [ 162.398845] Modules linked in:
> [ 162.399022] CPU: 5 PID: 1 Comm: swapper/0 Tainted: G W
> 3.16.0-rc2-yh-00302-g3d5dc41-dirty #22
> [ 162.418490] Hardware name: Oracle Corporation unknown /
> , BIOS 11016600 05/17/2011
> [ 162.438851] task: ffff884027a80000 ti: ffff881027d20000 task.ti:
> ffff881027d20000
> [ 162.468329] RIP: 0010:[<ffffffff81f18899>] [<ffffffff81f18899>]
> __dev_queue_xmit+0x399/0x630
> [ 162.488085] RSP: 0000:ffff881027d23d28 EFLAGS: 00010202
> [ 162.488345] RAX: 00000000fffffffe RBX: ffff887026041000 RCX: 0000000000000001
> [ 162.508245] RDX: 0000000000000000 RSI: ffffffff82dfee78 RDI: ffff884027a80000
> [ 162.508590] RBP: ffff881027d23d70 R08: 0000000000000001 R09: 0000000000000000
> [ 162.528255] R10: 0000000000000000 R11: ffff885026020800 R12: ffffffff82dfedc0
> [ 162.547963] R13: ffff881022b7c000 R14: 0000000000000dac R15: ffff88702434c400
> [ 162.548310] FS: 0000000000000000(0000) GS:ffff88103f000000(0000)
> knlGS:0000000000000000
> [ 162.568186] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 162.568489] CR2: 000000010000007e CR3: 0000000002c19000 CR4: 00000000000007e0
> [ 162.588263] Stack:
> [ 162.588354] ffffffff81f18505 0000000000000000 00ff881027d23d80
> ffffffff82dfee60
> [ 162.608269] ffff885023dd1e40 ffff881022b7c000 ffff885026020929
> 0000000000000dac
> [ 162.628043] ffff887026041000 ffff881027d23d80 ffffffff81f18b40
> ffff881027d23e18
> [ 162.628422] Call Trace:
> [ 162.647963] [<ffffffff81f18505>] ? __dev_queue_xmit+0x5/0x630
> [ 162.648284] [<ffffffff81f18b40>] dev_queue_xmit+0x10/0x20
> [ 162.667987] [<ffffffff83087b64>] ip_auto_config+0x8e6/0xf13
> [ 162.668282] [<ffffffff8100031d>] ? do_one_initcall+0xdd/0x1e0
> [ 162.688018] [<ffffffff810e36ad>] ? trace_hardirqs_on+0xd/0x10
> [ 162.688298] [<ffffffff8110c40f>] ? ktime_get+0xbf/0x140
> [ 162.708029] [<ffffffff8308727e>] ? root_nfs_parse_addr+0xbd/0xbd
> [ 162.708292] [<ffffffff81000323>] do_one_initcall+0xe3/0x1e0
> [ 162.728075] [<ffffffff810ba8bd>] ? parse_args+0x1ed/0x330
> [ 162.728340] [<ffffffff8203501a>] ? printk+0x54/0x56
> [ 162.748027] [<ffffffff8301f4c5>] kernel_init_freeable+0x237/0x2ce
> [ 162.748344] [<ffffffff8301eaf7>] ? do_early_param+0x8a/0x8a
> [ 162.768070] [<ffffffff8202abb0>] ? rest_init+0xc0/0xc0
> [ 162.768318] [<ffffffff8202abbe>] kernel_init+0xe/0x100
> [ 162.788057] [<ffffffff8204c4ac>] ret_from_fork+0x7c/0xb0
> [ 162.788314] [<ffffffff8202abb0>] ? rest_init+0xc0/0xc0
> [ 162.808008] Code: e8 5d 48 18 ff eb 13 48 c7 c7 60 d9 c5 82 e8 cf
> bc 1c ff 85 c0 74 dd 0f 1f 00 48 8b 43 58 48 83 e0 fe 48 85 c0 48 89
> 43 58 74 07 <f0> ff 80 80 00 00 00 4c 89 e6 48 89 df 41 ff 14 24 41 89
> c6 41
> [ 162.828797] RIP [<ffffffff81f18899>] __dev_queue_xmit+0x399/0x630
> [ 162.848186] RSP <ffff881027d23d28>
> [ 162.848341] CR2: 000000010000007e
> [ 162.848490] ---[ end trace 26b7736a09036e46 ]---
> [ 162.868194] Kernel panic - not syncing: Fatal exception in interrupt
> [ 162.872673] Kernel Offset: 0x0 from 0xffffffff81000000 (relocation
> range: 0xffffffff80000000-0xffffffff9fffffff)
> [ 162.888531] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
> [ 162.908306] ------------[ cut here ]------------
>
> After the commit is reverted, the system work again.
I do not see any problem in my system.
Did you try disecting what "__dev_queue_xmit+0x399/0x630" is?
On what interface did the crash occur on? is it bond interface?
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-06-26 6:34 ` Govindarajulu Varadarajan
@ 2014-09-18 14:18 ` Or Gerlitz
2014-09-18 14:38 ` Eric Dumazet
2014-09-18 15:02 ` [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes Eric Dumazet
0 siblings, 2 replies; 36+ messages in thread
From: Or Gerlitz @ 2014-09-18 14:18 UTC (permalink / raw)
To: Govindarajulu Varadarajan
Cc: Yinghai Lu, David Miller, NetDev, ssujith, gvaradar,
Christian Benvenuti (benve)
On Thu, Jun 26, 2014 at 9:34 AM, Govindarajulu Varadarajan
<_govind@gmx.com> wrote:
>
>
> On Mon, 23 Jun 2014, Yinghai Lu wrote:
>>
>> this patch in net-next cause kernel crash.
>>
>> [ 148.466045] qlge 0000:4a:00.1 eth27: Passed Get Port Configuration.
>> [ 162.385445] BUG: unable to handle kernel paging request at 000000010000007e
>> [ 162.385839] IP: [<ffffffff81f18899>] __dev_queue_xmit+0x399/0x630
>> [ 162.398541] PGD 0
>> [ 162.398659] Oops: 0002 [#1] SMP
>> [ 162.398845] Modules linked in:
>> [ 162.399022] CPU: 5 PID: 1 Comm: swapper/0 Tainted: G W
>> 3.16.0-rc2-yh-00302-g3d5dc41-dirty #22
>> [ 162.418490] Hardware name: Oracle Corporation unknown /
>> , BIOS 11016600 05/17/2011
>> [ 162.438851] task: ffff884027a80000 ti: ffff881027d20000 task.ti:
>> ffff881027d20000
>> [ 162.468329] RIP: 0010:[<ffffffff81f18899>] [<ffffffff81f18899>]
>> __dev_queue_xmit+0x399/0x630
>> [ 162.488085] RSP: 0000:ffff881027d23d28 EFLAGS: 00010202
>> [ 162.488345] RAX: 00000000fffffffe RBX: ffff887026041000 RCX: 0000000000000001
>> [ 162.508245] RDX: 0000000000000000 RSI: ffffffff82dfee78 RDI: ffff884027a80000
>> [ 162.508590] RBP: ffff881027d23d70 R08: 0000000000000001 R09: 0000000000000000
>> [ 162.528255] R10: 0000000000000000 R11: ffff885026020800 R12: ffffffff82dfedc0
>> [ 162.547963] R13: ffff881022b7c000 R14: 0000000000000dac R15: ffff88702434c400
>> [ 162.548310] FS: 0000000000000000(0000) GS:ffff88103f000000(0000)
>> knlGS:0000000000000000
>> [ 162.568186] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>> [ 162.568489] CR2: 000000010000007e CR3: 0000000002c19000 CR4: 00000000000007e0
>> [ 162.588263] Stack:
>> [ 162.588354] ffffffff81f18505 0000000000000000 00ff881027d23d80
>> ffffffff82dfee60
>> [ 162.608269] ffff885023dd1e40 ffff881022b7c000 ffff885026020929
>> 0000000000000dac
>> [ 162.628043] ffff887026041000 ffff881027d23d80 ffffffff81f18b40
>> ffff881027d23e18
>> [ 162.628422] Call Trace:
>> [ 162.647963] [<ffffffff81f18505>] ? __dev_queue_xmit+0x5/0x630
>> [ 162.648284] [<ffffffff81f18b40>] dev_queue_xmit+0x10/0x20
>> [ 162.667987] [<ffffffff83087b64>] ip_auto_config+0x8e6/0xf13
>> [ 162.668282] [<ffffffff8100031d>] ? do_one_initcall+0xdd/0x1e0
>> [ 162.688018] [<ffffffff810e36ad>] ? trace_hardirqs_on+0xd/0x10
>> [ 162.688298] [<ffffffff8110c40f>] ? ktime_get+0xbf/0x140
>> [ 162.708029] [<ffffffff8308727e>] ? root_nfs_parse_addr+0xbd/0xbd
>> [ 162.708292] [<ffffffff81000323>] do_one_initcall+0xe3/0x1e0
>> [ 162.728075] [<ffffffff810ba8bd>] ? parse_args+0x1ed/0x330
>> [ 162.728340] [<ffffffff8203501a>] ? printk+0x54/0x56
>> [ 162.748027] [<ffffffff8301f4c5>] kernel_init_freeable+0x237/0x2ce
>> [ 162.748344] [<ffffffff8301eaf7>] ? do_early_param+0x8a/0x8a
>> [ 162.768070] [<ffffffff8202abb0>] ? rest_init+0xc0/0xc0
>> [ 162.768318] [<ffffffff8202abbe>] kernel_init+0xe/0x100
>> [ 162.788057] [<ffffffff8204c4ac>] ret_from_fork+0x7c/0xb0
>> [ 162.788314] [<ffffffff8202abb0>] ? rest_init+0xc0/0xc0
>> [ 162.808008] Code: e8 5d 48 18 ff eb 13 48 c7 c7 60 d9 c5 82 e8 cf
>> bc 1c ff 85 c0 74 dd 0f 1f 00 48 8b 43 58 48 83 e0 fe 48 85 c0 48 89
>> 43 58 74 07 <f0> ff 80 80 00 00 00 4c 89 e6 48 89 df 41 ff 14 24 41 89
>> c6 41
>> [ 162.828797] RIP [<ffffffff81f18899>] __dev_queue_xmit+0x399/0x630
>> [ 162.848186] RSP <ffff881027d23d28>
>> [ 162.848341] CR2: 000000010000007e
>> [ 162.848490] ---[ end trace 26b7736a09036e46 ]---
>> [ 162.868194] Kernel panic - not syncing: Fatal exception in interrupt
>> [ 162.872673] Kernel Offset: 0x0 from 0xffffffff81000000 (relocation
>> range: 0xffffffff80000000-0xffffffff9fffffff)
>> [ 162.888531] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
>> [ 162.908306] ------------[ cut here ]------------
>>
>> After the commit is reverted, the system work again.
>
>
> I do not see any problem in my system.
>
> Did you try disecting what "__dev_queue_xmit+0x399/0x630" is?
>
> On what interface did the crash occur on? is it bond interface?
>
The crash happens 100% on IPoIB (IP-over-Infiniband) [1] interfaces
b/c your upstream commit e0f31d8 "flow_keys: Record IP layer protocol
in skb_flow_dissect()" causes the IPoIB data stashed on skb->cb [2] to
smash other skb fields.
So your 3.17-rc1 commit introduced a regression to how things work
since kernel 3.2
Can please see how to revert this hunk
-- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -231,7 +231,7 @@ struct qdisc_skb_cb {
unsigned int pkt_len;
u16 slave_dev_queue_mapping;
u16 _pad;
- unsigned char data[20];
+ unsigned char data[24];
};
thanks,
Or.
[1] http://marc.info/?l=linux-rdma&m=141029109017035&w=2
[2] see these commits
936d7de3 IPoIB: Stop lying about hard_header_len and use skb->cb to
stash LL addresses
a0417fa3 net: Make qdisc_skb_cb upper size bound explicit
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect()
2014-09-18 14:18 ` Or Gerlitz
@ 2014-09-18 14:38 ` Eric Dumazet
2014-09-18 15:02 ` [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes Eric Dumazet
1 sibling, 0 replies; 36+ messages in thread
From: Eric Dumazet @ 2014-09-18 14:38 UTC (permalink / raw)
To: Or Gerlitz
Cc: Govindarajulu Varadarajan, Yinghai Lu, David Miller, NetDev,
ssujith, gvaradar, Christian Benvenuti (benve)
On Thu, 2014-09-18 at 17:18 +0300, Or Gerlitz wrote:
> The crash happens 100% on IPoIB (IP-over-Infiniband) [1] interfaces
> b/c your upstream commit e0f31d8 "flow_keys: Record IP layer protocol
> in skb_flow_dissect()" causes the IPoIB data stashed on skb->cb [2] to
> smash other skb fields.
>
> So your 3.17-rc1 commit introduced a regression to how things work
> since kernel 3.2
>
> Can please see how to revert this hunk
>
> -- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -231,7 +231,7 @@ struct qdisc_skb_cb {
> unsigned int pkt_len;
> u16 slave_dev_queue_mapping;
> u16 _pad;
> - unsigned char data[20];
> + unsigned char data[24];
> };
>
> thanks,
>
> Or.
>
> [1] http://marc.info/?l=linux-rdma&m=141029109017035&w=2
> [2] see these commits
>
> 936d7de3 IPoIB: Stop lying about hard_header_len and use skb->cb to
> stash LL addresses
> a0417fa3 net: Make qdisc_skb_cb upper size bound explicit
I am taking care of this right now guys.
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 14:18 ` Or Gerlitz
2014-09-18 14:38 ` Eric Dumazet
@ 2014-09-18 15:02 ` Eric Dumazet
2014-09-18 16:26 ` Stephen Hemminger
` (2 more replies)
1 sibling, 3 replies; 36+ messages in thread
From: Eric Dumazet @ 2014-09-18 15:02 UTC (permalink / raw)
To: Or Gerlitz
Cc: Govindarajulu Varadarajan, Yinghai Lu, David Miller, NetDev,
ssujith, gvaradar, Christian Benvenuti (benve)
From: Eric Dumazet <edumazet@google.com>
We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
or increasing skb->cb[] size.
Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
skb_flow_dissect()") broke IPoIB.
Only current offender is sch_choke, and this one do not need an
absolutely precise flow key.
If we store 17 bytes of flow key, its more than enough. (Its the actual
size of flow_keys if it was a packed structure, but we might add new
fields at the end of it later)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: e0f31d849867 ("flow_keys: Record IP layer protocol in skb_flow_dissect()")
---
include/net/sch_generic.h | 3 ++-
net/sched/sch_choke.c | 18 ++++++++++++++----
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a3cfb8ebeb53..620e086c0cbe 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -231,7 +231,8 @@ struct qdisc_skb_cb {
unsigned int pkt_len;
u16 slave_dev_queue_mapping;
u16 _pad;
- unsigned char data[24];
+#define QDISC_CB_PRIV_LEN 20
+ unsigned char data[QDISC_CB_PRIV_LEN];
};
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index ed30e436128b..fb666d1e4de3 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -133,10 +133,16 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
--sch->q.qlen;
}
+/* private part of skb->cb[] that a qdisc is allowed to use
+ * is limited to QDISC_CB_PRIV_LEN bytes.
+ * As a flow key might be too large, we store a part of it only.
+ */
+#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3)
+
struct choke_skb_cb {
u16 classid;
u8 keys_valid;
- struct flow_keys keys;
+ u8 keys[QDISC_CB_PRIV_LEN - 3];
};
static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
@@ -163,22 +169,26 @@ static u16 choke_get_classid(const struct sk_buff *skb)
static bool choke_match_flow(struct sk_buff *skb1,
struct sk_buff *skb2)
{
+ struct flow_keys temp;
+
if (skb1->protocol != skb2->protocol)
return false;
if (!choke_skb_cb(skb1)->keys_valid) {
choke_skb_cb(skb1)->keys_valid = 1;
- skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
+ skb_flow_dissect(skb1, &temp);
+ memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN);
}
if (!choke_skb_cb(skb2)->keys_valid) {
choke_skb_cb(skb2)->keys_valid = 1;
- skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
+ skb_flow_dissect(skb2, &temp);
+ memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN);
}
return !memcmp(&choke_skb_cb(skb1)->keys,
&choke_skb_cb(skb2)->keys,
- sizeof(struct flow_keys));
+ CHOKE_K_LEN);
}
/*
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 15:02 ` [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes Eric Dumazet
@ 2014-09-18 16:26 ` Stephen Hemminger
2014-09-18 16:32 ` Eric Dumazet
2014-09-18 22:29 ` [net] " Doug Ledford
2014-09-22 18:22 ` [PATCH net] " David Miller
2 siblings, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2014-09-18 16:26 UTC (permalink / raw)
To: Eric Dumazet
Cc: Or Gerlitz, Govindarajulu Varadarajan, Yinghai Lu, David Miller,
NetDev, ssujith, gvaradar, Christian Benvenuti (benve)
On Thu, 18 Sep 2014 08:02:05 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
> or increasing skb->cb[] size.
>
> Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
> skb_flow_dissect()") broke IPoIB.
>
> Only current offender is sch_choke, and this one do not need an
> absolutely precise flow key.
>
> If we store 17 bytes of flow key, its more than enough. (Its the actual
> size of flow_keys if it was a packed structure, but we might add new
> fields at the end of it later)
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Can we add BUILD_BUG to stop next time something smacks this.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 16:26 ` Stephen Hemminger
@ 2014-09-18 16:32 ` Eric Dumazet
2014-09-18 18:00 ` Eric Dumazet
2014-09-18 21:28 ` Or Gerlitz
0 siblings, 2 replies; 36+ messages in thread
From: Eric Dumazet @ 2014-09-18 16:32 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Or Gerlitz, Govindarajulu Varadarajan, Yinghai Lu, David Miller,
NetDev, ssujith, gvaradar, Christian Benvenuti (benve)
On Thu, 2014-09-18 at 09:26 -0700, Stephen Hemminger wrote:
> On Thu, 18 Sep 2014 08:02:05 -0700
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> > From: Eric Dumazet <edumazet@google.com>
> >
> > We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
> > or increasing skb->cb[] size.
> >
> > Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
> > skb_flow_dissect()") broke IPoIB.
> >
> > Only current offender is sch_choke, and this one do not need an
> > absolutely precise flow key.
> >
> > If we store 17 bytes of flow key, its more than enough. (Its the actual
> > size of flow_keys if it was a packed structure, but we might add new
> > fields at the end of it later)
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
>
> Can we add BUILD_BUG to stop next time something smacks this.
I though we had.
Maybe IPoIB lacks one.
Or, do you have an idea ?
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 16:32 ` Eric Dumazet
@ 2014-09-18 18:00 ` Eric Dumazet
2014-09-18 18:07 ` Joe Perches
2014-09-18 21:28 ` Or Gerlitz
1 sibling, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2014-09-18 18:00 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Or Gerlitz, Govindarajulu Varadarajan, Yinghai Lu, David Miller,
NetDev, ssujith, gvaradar, Christian Benvenuti (benve)
On Thu, 2014-09-18 at 09:32 -0700, Eric Dumazet wrote:
> On Thu, 2014-09-18 at 09:26 -0700, Stephen Hemminger wrote:
> > On Thu, 18 Sep 2014 08:02:05 -0700
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > > From: Eric Dumazet <edumazet@google.com>
> > >
> > > We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
> > > or increasing skb->cb[] size.
> > >
> > > Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
> > > skb_flow_dissect()") broke IPoIB.
> > >
> > > Only current offender is sch_choke, and this one do not need an
> > > absolutely precise flow key.
> > >
> > > If we store 17 bytes of flow key, its more than enough. (Its the actual
> > > size of flow_keys if it was a packed structure, but we might add new
> > > fields at the end of it later)
> > >
> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> >
> > Can we add BUILD_BUG to stop next time something smacks this.
>
> I though we had.
>
> Maybe IPoIB lacks one.
>
> Or, do you have an idea ?
Seems straightforward ...
Or can you carry this fix for me ?
Thanks
[PATCH] ipoib: validate struct ipoib_cb size
To catch future errors sooner.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 3edce617c31b..d7562beb5423 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -131,6 +131,12 @@ struct ipoib_cb {
u8 hwaddr[INFINIBAND_ALEN];
};
+static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
+ return (struct ipoib_cb *)skb->cb;
+}
+
/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
struct ipoib_mcast {
struct ib_sa_mcmember_rec mcmember;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 1310acf6bf92..13e6e0431592 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -716,7 +716,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_neigh *neigh;
- struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
+ struct ipoib_cb *cb = ipoib_skb_cb(skb);
struct ipoib_header *header;
unsigned long flags;
@@ -813,7 +813,7 @@ static int ipoib_hard_header(struct sk_buff *skb,
const void *daddr, const void *saddr, unsigned len)
{
struct ipoib_header *header;
- struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
+ struct ipoib_cb *cb = ipoib_skb_cb(skb);
header = (struct ipoib_header *) skb_push(skb, sizeof *header);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 18:00 ` Eric Dumazet
@ 2014-09-18 18:07 ` Joe Perches
2014-09-18 19:14 ` Eric Dumazet
0 siblings, 1 reply; 36+ messages in thread
From: Joe Perches @ 2014-09-18 18:07 UTC (permalink / raw)
To: Eric Dumazet
Cc: Stephen Hemminger, Or Gerlitz, Govindarajulu Varadarajan,
Yinghai Lu, David Miller, NetDev, ssujith, gvaradar,
Christian Benvenuti (benve)
On Thu, 2014-09-18 at 11:00 -0700, Eric Dumazet wrote:
> On Thu, 2014-09-18 at 09:32 -0700, Eric Dumazet wrote:
> > On Thu, 2014-09-18 at 09:26 -0700, Stephen Hemminger wrote:
[]
> > Or, do you have an idea ?
>
> Seems straightforward ...
>
> Or can you carry this fix for me ?
>
> Thanks
>
> [PATCH] ipoib: validate struct ipoib_cb size
>
> To catch future errors sooner.
[]
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
[]
> +static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
> +{
> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
> + return (struct ipoib_cb *)skb->cb;
> +}
It seems better not to use const for the struct sk_buff * here.
Neither of the uses take a const struct sk_buff *
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -716,7 +716,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> struct ipoib_dev_priv *priv = netdev_priv(dev);
> struct ipoib_neigh *neigh;
> - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
> + struct ipoib_cb *cb = ipoib_skb_cb(skb);
> struct ipoib_header *header;
> unsigned long flags;
>
> @@ -813,7 +813,7 @@ static int ipoib_hard_header(struct sk_buff *skb,
> const void *daddr, const void *saddr, unsigned len)
> {
> struct ipoib_header *header;
> - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
> + struct ipoib_cb *cb = ipoib_skb_cb(skb);
>
> header = (struct ipoib_header *) skb_push(skb, sizeof *header);
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 18:07 ` Joe Perches
@ 2014-09-18 19:14 ` Eric Dumazet
2014-09-18 19:31 ` Joe Perches
0 siblings, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2014-09-18 19:14 UTC (permalink / raw)
To: Joe Perches
Cc: Stephen Hemminger, Or Gerlitz, Govindarajulu Varadarajan,
Yinghai Lu, David Miller, NetDev, ssujith, gvaradar,
Christian Benvenuti (benve)
On Thu, 2014-09-18 at 11:07 -0700, Joe Perches wrote:
> > +static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
> > +{
> > + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
> > + return (struct ipoib_cb *)skb->cb;
> > +}
>
> It seems better not to use const for the struct sk_buff * here.
>
> Neither of the uses take a const struct sk_buff *
Thats pretty standard, check for other similar constructs like that.
static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
{
return (struct qdisc_skb_cb *)skb->cb;
}
This allows uses of the helper when the skb is only read (has the const qual)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 19:14 ` Eric Dumazet
@ 2014-09-18 19:31 ` Joe Perches
2014-09-18 20:31 ` Eric Dumazet
0 siblings, 1 reply; 36+ messages in thread
From: Joe Perches @ 2014-09-18 19:31 UTC (permalink / raw)
To: Eric Dumazet
Cc: Stephen Hemminger, Or Gerlitz, Govindarajulu Varadarajan,
Yinghai Lu, David Miller, NetDev, ssujith, gvaradar,
Christian Benvenuti (benve)
On Thu, 2014-09-18 at 12:14 -0700, Eric Dumazet wrote:
> On Thu, 2014-09-18 at 11:07 -0700, Joe Perches wrote:
>
> > > +static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
> > > +{
> > > + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
> > > + return (struct ipoib_cb *)skb->cb;
> > > +}
> >
> > It seems better not to use const for the struct sk_buff * here.
> >
> > Neither of the uses take a const struct sk_buff *
>
> Thats pretty standard, check for other similar constructs like that.
>
>
> static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
> {
> return (struct qdisc_skb_cb *)skb->cb;
> }
>
> This allows uses of the helper when the skb is only read (has the const qual)
I don't mind the const argument, but casting
the return to non-const seems like poor form.
btw: it seems like it's 8:5 non-const to const
$ grep -rP --include=*.[ch] -n "cb\s*\*\s*\w+\s*\(\s*(?:const\s+)?struct\s+sk_buff\s*\*\s*\w+\s*\)" *
drivers/net/ethernet/freescale/gianfar.c:2091:static inline struct txfcb *gfar_add_fcb(struct sk_buff *skb)
drivers/net/wireless/ath/ath10k/core.h:83:static inline struct ath10k_skb_cb *ATH10K_SKB_CB(struct sk_buff *skb)
include/net/mrp.h:38:static inline struct mrp_skb_cb *mrp_cb(struct sk_buff *skb)
include/net/ieee802154_netdev.h:235:static inline struct ieee802154_mac_cb *mac_cb(struct sk_buff *skb)
include/net/ieee802154_netdev.h:240:static inline struct ieee802154_mac_cb *mac_cb_init(struct sk_buff *skb)
include/net/sch_generic.h:251:static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
include/net/codel.h:95:static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
include/net/garp.h:36:static inline struct garp_skb_cb *garp_cb(struct sk_buff *skb)
net/core/dev.c:2177:static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
net/sched/sch_choke.c:142:static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
net/sched/sch_sfb.c:95:static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
net/sched/sch_netem.c:171:static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
net/sched/sch_sfq.c:167:static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 19:31 ` Joe Perches
@ 2014-09-18 20:31 ` Eric Dumazet
0 siblings, 0 replies; 36+ messages in thread
From: Eric Dumazet @ 2014-09-18 20:31 UTC (permalink / raw)
To: Joe Perches
Cc: Stephen Hemminger, Or Gerlitz, Govindarajulu Varadarajan,
Yinghai Lu, David Miller, NetDev, ssujith, gvaradar,
Christian Benvenuti (benve)
On Thu, 2014-09-18 at 12:31 -0700, Joe Perches wrote:
> I don't mind the const argument, but casting
> the return to non-const seems like poor form.
This is the current way to do this, even if we do not like it.
C does not allow to have the same function name for different
parameters, there is nothing we can do about it at this moment.
->
static inline const struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
static inline struct ipoib_cb *ipoib_skb_cb(struct sk_buff *skb)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 16:32 ` Eric Dumazet
2014-09-18 18:00 ` Eric Dumazet
@ 2014-09-18 21:28 ` Or Gerlitz
1 sibling, 0 replies; 36+ messages in thread
From: Or Gerlitz @ 2014-09-18 21:28 UTC (permalink / raw)
To: Eric Dumazet
Cc: Stephen Hemminger, Govindarajulu Varadarajan, Yinghai Lu,
David Miller, NetDev, ssujith, gvaradar,
Christian Benvenuti (benve)
On Thu, Sep 18, 2014 at 7:32 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2014-09-18 at 09:26 -0700, Stephen Hemminger wrote:
>> On Thu, 18 Sep 2014 08:02:05 -0700
>> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>> > From: Eric Dumazet <edumazet@google.com>
>> >
>> > We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
>> > or increasing skb->cb[] size.
>> >
>> > Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
>> > skb_flow_dissect()") broke IPoIB.
>> >
>> > Only current offender is sch_choke, and this one do not need an
>> > absolutely precise flow key.
>> >
>> > If we store 17 bytes of flow key, its more than enough. (Its the actual
>> > size of flow_keys if it was a packed structure, but we might add new
>> > fields at the end of it later)
>> >
>> > Signed-off-by: Eric Dumazet <edumazet@google.com>
>>
>> Can we add BUILD_BUG to stop next time something smacks this.
>
> I though we had.
>
> Maybe IPoIB lacks one.
>
> Or, do you have an idea ?
Nope, we currently don't have such build time check, I saw you posted
one and I will be able to test it Sunday.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 15:02 ` [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes Eric Dumazet
2014-09-18 16:26 ` Stephen Hemminger
@ 2014-09-18 22:29 ` Doug Ledford
2014-09-19 12:07 ` Or Gerlitz
2014-09-22 18:38 ` David Miller
2014-09-22 18:22 ` [PATCH net] " David Miller
2 siblings, 2 replies; 36+ messages in thread
From: Doug Ledford @ 2014-09-18 22:29 UTC (permalink / raw)
To: Eric Dumazet, Or Gerlitz
Cc: Govindarajulu Varadarajan, Yinghai Lu, David Miller, NetDev,
ssujith, gvaradar, Christian Benvenuti (benve)
On 09/18/2014 11:02 AM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
> or increasing skb->cb[] size.
>
> Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
> skb_flow_dissect()") broke IPoIB.
>
> Only current offender is sch_choke, and this one do not need an
> absolutely precise flow key.
>
> If we store 17 bytes of flow key, its more than enough. (Its the actual
> size of flow_keys if it was a packed structure, but we might add new
> fields at the end of it later)
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Fixes: e0f31d849867 ("flow_keys: Record IP layer protocol in skb_flow_dissect()")
I've installed this patch on my cluster and it resolves the problem.
Tested-by/Acked-by: Doug Ledford <dledford@redhat.com>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 22:29 ` [net] " Doug Ledford
@ 2014-09-19 12:07 ` Or Gerlitz
2014-09-22 18:38 ` David Miller
1 sibling, 0 replies; 36+ messages in thread
From: Or Gerlitz @ 2014-09-19 12:07 UTC (permalink / raw)
To: Doug Ledford, David Miller
Cc: Eric Dumazet, Govindarajulu Varadarajan, Yinghai Lu, NetDev,
ssujith, gvaradar, Christian Benvenuti (benve)
On Fri, Sep 19, 2014 at 1:29 AM, Doug Ledford <dledford@xsintricity.com> wrote:
> On 09/18/2014 11:02 AM, Eric Dumazet wrote:
>> From: Eric Dumazet <edumazet@google.com>
>> We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
>> or increasing skb->cb[] size.
>>
>> Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
>> skb_flow_dissect()") broke IPoIB.
>>
>> Only current offender is sch_choke, and this one do not need an
>> absolutely precise flow key.
>>
>> If we store 17 bytes of flow key, its more than enough. (Its the actual
>> size of flow_keys if it was a packed structure, but we might add new
>> fields at the end of it later)
>>
>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>> Fixes: e0f31d849867 ("flow_keys: Record IP layer protocol in
>> skb_flow_dissect()")
> I've installed this patch on my cluster and it resolves the problem.
> Tested-by/Acked-by: Doug Ledford <dledford@redhat.com>
Thanks Eric/Doug!
Dave - just to make sure, this is for net, as the regression was
introduced in 3.17-rc1.
Or.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 22:29 ` [net] " Doug Ledford
2014-09-19 12:07 ` Or Gerlitz
@ 2014-09-22 18:38 ` David Miller
1 sibling, 0 replies; 36+ messages in thread
From: David Miller @ 2014-09-22 18:38 UTC (permalink / raw)
To: dledford
Cc: eric.dumazet, or.gerlitz, _govind, yinghai, netdev, ssujith,
gvaradar, benve
From: Doug Ledford <dledford@xsintricity.com>
Date: Thu, 18 Sep 2014 18:29:31 -0400
> Tested-by/Acked-by: Doug Ledford <dledford@redhat.com>
Automated tools do not understand this, please explicitly give
separate Tested-by: and Acked-by: tags in the future.
Thanks.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes
2014-09-18 15:02 ` [PATCH net] net: sched: shrink struct qdisc_skb_cb to 28 bytes Eric Dumazet
2014-09-18 16:26 ` Stephen Hemminger
2014-09-18 22:29 ` [net] " Doug Ledford
@ 2014-09-22 18:22 ` David Miller
2 siblings, 0 replies; 36+ messages in thread
From: David Miller @ 2014-09-22 18:22 UTC (permalink / raw)
To: eric.dumazet
Cc: or.gerlitz, _govind, yinghai, netdev, ssujith, gvaradar, benve
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 18 Sep 2014 08:02:05 -0700
> From: Eric Dumazet <edumazet@google.com>
>
> We cannot make struct qdisc_skb_cb bigger without impacting IPoIB,
> or increasing skb->cb[] size.
>
> Commit e0f31d849867 ("flow_keys: Record IP layer protocol in
> skb_flow_dissect()") broke IPoIB.
>
> Only current offender is sch_choke, and this one do not need an
> absolutely precise flow key.
>
> If we store 17 bytes of flow key, its more than enough. (Its the actual
> size of flow_keys if it was a packed structure, but we might add new
> fields at the end of it later)
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Fixes: e0f31d849867 ("flow_keys: Record IP layer protocol in skb_flow_dissect()")
Applied, thanks Eric.
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net-next 2/8] enic: fix return value in _vnic_dev_cmd
2014-06-09 18:32 [PATCH net-next 0/8] enic updates Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect() Govindarajulu Varadarajan
@ 2014-06-09 18:32 ` Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 3/8] enic: devcmd for adding IP 5 tuple hardware filters Govindarajulu Varadarajan
` (4 subsequent siblings)
6 siblings, 0 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-09 18:32 UTC (permalink / raw)
To: davem, netdev; +Cc: ssujith, gvaradar, benve, _govind
Hardware (in readq(&devcmd->args[0])) returns positive number in case of error.
But _vnic_dev_cmd should return a negative value in case of error.
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
drivers/net/ethernet/cisco/enic/vnic_dev.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.c b/drivers/net/ethernet/cisco/enic/vnic_dev.c
index e86a45c..263081b 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_dev.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_dev.c
@@ -312,12 +312,12 @@ static int _vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
err = (int)readq(&devcmd->args[0]);
if (err == ERR_EINVAL &&
cmd == CMD_CAPABILITY)
- return err;
+ return -err;
if (err != ERR_ECMDUNKNOWN ||
cmd != CMD_CAPABILITY)
pr_err("Error %d devcmd %d\n",
err, _CMD_N(cmd));
- return err;
+ return -err;
}
if (_CMD_DIR(cmd) & _CMD_DIR_READ) {
--
2.0.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 3/8] enic: devcmd for adding IP 5 tuple hardware filters
2014-06-09 18:32 [PATCH net-next 0/8] enic updates Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 1/8] flow_keys: Record IP layer protocol in skb_flow_dissect() Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 2/8] enic: fix return value in _vnic_dev_cmd Govindarajulu Varadarajan
@ 2014-06-09 18:32 ` Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 4/8] enic: alloc/free rx_cpu_rmap Govindarajulu Varadarajan
` (3 subsequent siblings)
6 siblings, 0 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-09 18:32 UTC (permalink / raw)
To: davem, netdev; +Cc: ssujith, gvaradar, benve, _govind
This patch adds interface to add and delete IP 5 tuple filter. This interface
is used by Accelerated RFS code to steer a flow to corresponding receive
queue.
As of now adaptor supports only ipv4 + tcp/udp packet steering.
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
drivers/net/ethernet/cisco/enic/Makefile | 2 +-
drivers/net/ethernet/cisco/enic/enic_clsf.c | 66 +++++++++++++++++++++++++++
drivers/net/ethernet/cisco/enic/enic_clsf.h | 10 ++++
drivers/net/ethernet/cisco/enic/vnic_dev.c | 61 +++++++++++++++++++++++++
drivers/net/ethernet/cisco/enic/vnic_dev.h | 2 +
drivers/net/ethernet/cisco/enic/vnic_devcmd.h | 5 ++
6 files changed, 145 insertions(+), 1 deletion(-)
create mode 100644 drivers/net/ethernet/cisco/enic/enic_clsf.c
create mode 100644 drivers/net/ethernet/cisco/enic/enic_clsf.h
diff --git a/drivers/net/ethernet/cisco/enic/Makefile b/drivers/net/ethernet/cisco/enic/Makefile
index 239e1e4..aadcaf7 100644
--- a/drivers/net/ethernet/cisco/enic/Makefile
+++ b/drivers/net/ethernet/cisco/enic/Makefile
@@ -2,5 +2,5 @@ obj-$(CONFIG_ENIC) := enic.o
enic-y := enic_main.o vnic_cq.o vnic_intr.o vnic_wq.o \
enic_res.o enic_dev.o enic_pp.o vnic_dev.o vnic_rq.o vnic_vic.o \
- enic_ethtool.o enic_api.o
+ enic_ethtool.o enic_api.o enic_clsf.o
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
new file mode 100644
index 0000000..f6703c4
--- /dev/null
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -0,0 +1,66 @@
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <net/flow_keys.h>
+#include "enic_res.h"
+#include "enic_clsf.h"
+
+/* enic_addfltr_5t - Add ipv4 5tuple filter
+ * @enic: enic struct of vnic
+ * @keys: flow_keys of ipv4 5tuple
+ * @rq: rq number to steer to
+ *
+ * This function returns filter_id(hardware_id) of the filter
+ * added. In case of error it returns an negative number.
+ */
+int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq)
+{
+ int res;
+ struct filter data;
+
+ switch (keys->ip_proto) {
+ case IPPROTO_TCP:
+ data.u.ipv4.protocol = PROTO_TCP;
+ break;
+ case IPPROTO_UDP:
+ data.u.ipv4.protocol = PROTO_UDP;
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ };
+ data.type = FILTER_IPV4_5TUPLE;
+ data.u.ipv4.src_addr = ntohl(keys->src);
+ data.u.ipv4.dst_addr = ntohl(keys->dst);
+ data.u.ipv4.src_port = ntohs(keys->port16[0]);
+ data.u.ipv4.dst_port = ntohs(keys->port16[1]);
+ data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE;
+
+ spin_lock_bh(&enic->devcmd_lock);
+ res = vnic_dev_classifier(enic->vdev, CLSF_ADD, &rq, &data);
+ spin_unlock_bh(&enic->devcmd_lock);
+ res = (res == 0) ? rq : res;
+
+ return res;
+}
+
+/* enic_delfltr - Delete clsf filter
+ * @enic: enic struct of vnic
+ * @filter_id: filter_is(hardware_id) of filter to be deleted
+ *
+ * This function returns zero in case of success, negative number incase of
+ * error.
+ */
+int enic_delfltr(struct enic *enic, u16 filter_id)
+{
+ int ret;
+
+ spin_lock_bh(&enic->devcmd_lock);
+ ret = vnic_dev_classifier(enic->vdev, CLSF_DEL, &filter_id, NULL);
+ spin_unlock_bh(&enic->devcmd_lock);
+
+ return ret;
+}
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.h b/drivers/net/ethernet/cisco/enic/enic_clsf.h
new file mode 100644
index 0000000..b6925b3
--- /dev/null
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.h
@@ -0,0 +1,10 @@
+#ifndef _ENIC_CLSF_H_
+#define _ENIC_CLSF_H_
+
+#include "vnic_dev.h"
+#include "enic.h"
+
+int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq);
+int enic_delfltr(struct enic *enic, u16 filter_id);
+
+#endif /* _ENIC_CLSF_H_ */
diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.c b/drivers/net/ethernet/cisco/enic/vnic_dev.c
index 263081b..5abc496 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_dev.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_dev.c
@@ -1048,3 +1048,64 @@ int vnic_dev_set_mac_addr(struct vnic_dev *vdev, u8 *mac_addr)
return vnic_dev_cmd(vdev, CMD_SET_MAC_ADDR, &a0, &a1, wait);
}
+
+/* vnic_dev_classifier: Add/Delete classifier entries
+ * @vdev: vdev of the device
+ * @cmd: CLSF_ADD for Add filter
+ * CLSF_DEL for Delete filter
+ * @entry: In case of ADD filter, the caller passes the RQ number in this
+ * variable.
+ *
+ * This function stores the filter_id returned by the firmware in the
+ * same variable before return;
+ *
+ * In case of DEL filter, the caller passes the RQ number. Return
+ * value is irrelevant.
+ * @data: filter data
+ */
+int vnic_dev_classifier(struct vnic_dev *vdev, u8 cmd, u16 *entry,
+ struct filter *data)
+{
+ u64 a0, a1;
+ int wait = 1000;
+ dma_addr_t tlv_pa;
+ int ret = -EINVAL;
+ struct filter_tlv *tlv, *tlv_va;
+ struct filter_action *action;
+ u64 tlv_size;
+
+ if (cmd == CLSF_ADD) {
+ tlv_size = sizeof(struct filter) +
+ sizeof(struct filter_action) +
+ 2 * sizeof(struct filter_tlv);
+ tlv_va = pci_alloc_consistent(vdev->pdev, tlv_size, &tlv_pa);
+ if (!tlv_va)
+ return -ENOMEM;
+ tlv = tlv_va;
+ a0 = tlv_pa;
+ a1 = tlv_size;
+ memset(tlv, 0, tlv_size);
+ tlv->type = CLSF_TLV_FILTER;
+ tlv->length = sizeof(struct filter);
+ *(struct filter *)&tlv->val = *data;
+
+ tlv = (struct filter_tlv *)((char *)tlv +
+ sizeof(struct filter_tlv) +
+ sizeof(struct filter));
+
+ tlv->type = CLSF_TLV_ACTION;
+ tlv->length = sizeof(struct filter_action);
+ action = (struct filter_action *)&tlv->val;
+ action->type = FILTER_ACTION_RQ_STEERING;
+ action->u.rq_idx = *entry;
+
+ ret = vnic_dev_cmd(vdev, CMD_ADD_FILTER, &a0, &a1, wait);
+ *entry = (u16)a0;
+ pci_free_consistent(vdev->pdev, tlv_size, tlv_va, tlv_pa);
+ } else if (cmd == CLSF_DEL) {
+ a0 = *entry;
+ ret = vnic_dev_cmd(vdev, CMD_DEL_FILTER, &a0, &a1, wait);
+ }
+
+ return ret;
+}
diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.h b/drivers/net/ethernet/cisco/enic/vnic_dev.h
index 1f3b301..1fb214e 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_dev.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_dev.h
@@ -133,5 +133,7 @@ int vnic_dev_enable2(struct vnic_dev *vdev, int active);
int vnic_dev_enable2_done(struct vnic_dev *vdev, int *status);
int vnic_dev_deinit_done(struct vnic_dev *vdev, int *status);
int vnic_dev_set_mac_addr(struct vnic_dev *vdev, u8 *mac_addr);
+int vnic_dev_classifier(struct vnic_dev *vdev, u8 cmd, u16 *entry,
+ struct filter *data);
#endif /* _VNIC_DEV_H_ */
diff --git a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
index b9a0d78..435d0cd 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
@@ -603,6 +603,11 @@ struct filter_tlv {
u_int32_t val[0];
};
+enum {
+ CLSF_ADD = 0,
+ CLSF_DEL = 1,
+};
+
/*
* Writing cmd register causes STAT_BUSY to get set in status register.
* When cmd completes, STAT_BUSY will be cleared.
--
2.0.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 4/8] enic: alloc/free rx_cpu_rmap
2014-06-09 18:32 [PATCH net-next 0/8] enic updates Govindarajulu Varadarajan
` (2 preceding siblings ...)
2014-06-09 18:32 ` [PATCH net-next 3/8] enic: devcmd for adding IP 5 tuple hardware filters Govindarajulu Varadarajan
@ 2014-06-09 18:32 ` Govindarajulu Varadarajan
2014-06-09 18:44 ` Sergei Shtylyov
2014-06-09 18:32 ` [PATCH net-next 5/8] enic: Add Accelerated RFS support Govindarajulu Varadarajan
` (2 subsequent siblings)
6 siblings, 1 reply; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-09 18:32 UTC (permalink / raw)
To: davem, netdev; +Cc: ssujith, gvaradar, benve, _govind
rx_cpu_rmap provides the reverse irq cpu affinity. This patch allocates and
sets drivers netdev->rx_cpu_rmap accordingly.
rx_cpu_rmap is set in enic_request_intr() which is called by enic_open and
rx_cpu_rmap is freed in enic_free_intr() which is called by enic_stop.
This is used by Accelerated RFS.
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
drivers/net/ethernet/cisco/enic/enic_main.c | 36 +++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index f32f828..f4508d9 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -39,6 +39,9 @@
#include <linux/prefetch.h>
#include <net/ip6_checksum.h>
#include <linux/ktime.h>
+#ifdef CONFIG_RFS_ACCEL
+#include <linux/cpu_rmap.h>
+#endif
#include "cq_enet_desc.h"
#include "vnic_dev.h"
@@ -1192,6 +1195,33 @@ static void enic_calc_int_moderation(struct enic *enic, struct vnic_rq *rq)
pkt_size_counter->small_pkt_bytes_cnt = 0;
}
+#ifdef CONFIG_RFS_ACCEL
+static void enic_free_rx_cpu_rmap(struct enic *enic)
+{
+ free_irq_cpu_rmap(enic->netdev->rx_cpu_rmap);
+ enic->netdev->rx_cpu_rmap = NULL;
+}
+
+static inline void enic_set_rx_cpu_rmap(struct enic *enic)
+{
+ int i, res;
+
+ if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) {
+ enic->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(enic->rq_count);
+ if (unlikely(!enic->netdev->rx_cpu_rmap))
+ return;
+ for (i = 0; i < enic->rq_count; i++) {
+ res = irq_cpu_rmap_add(enic->netdev->rx_cpu_rmap,
+ enic->msix_entry[i].vector);
+ if (unlikely(res)) {
+ enic_free_rx_cpu_rmap(enic);
+ return;
+ }
+ }
+ }
+}
+#endif
+
static int enic_poll_msix(struct napi_struct *napi, int budget)
{
struct net_device *netdev = napi->dev;
@@ -1267,6 +1297,9 @@ static void enic_free_intr(struct enic *enic)
struct net_device *netdev = enic->netdev;
unsigned int i;
+#ifdef CONFIG_RFS_ACCEL
+ enic_free_rx_cpu_rmap(enic);
+#endif
switch (vnic_dev_get_intr_mode(enic->vdev)) {
case VNIC_DEV_INTR_MODE_INTX:
free_irq(enic->pdev->irq, netdev);
@@ -1291,6 +1324,9 @@ static int enic_request_intr(struct enic *enic)
unsigned int i, intr;
int err = 0;
+#ifdef CONFIG_RFS_ACCEL
+ enic_set_rx_cpu_rmap(enic);
+#endif
switch (vnic_dev_get_intr_mode(enic->vdev)) {
case VNIC_DEV_INTR_MODE_INTX:
--
2.0.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 4/8] enic: alloc/free rx_cpu_rmap
2014-06-09 18:32 ` [PATCH net-next 4/8] enic: alloc/free rx_cpu_rmap Govindarajulu Varadarajan
@ 2014-06-09 18:44 ` Sergei Shtylyov
2014-06-10 13:52 ` Govindarajulu Varadarajan
0 siblings, 1 reply; 36+ messages in thread
From: Sergei Shtylyov @ 2014-06-09 18:44 UTC (permalink / raw)
To: Govindarajulu Varadarajan, davem, netdev; +Cc: ssujith, gvaradar, benve
Hello.
On 06/09/2014 10:32 PM, Govindarajulu Varadarajan wrote:
> rx_cpu_rmap provides the reverse irq cpu affinity. This patch allocates and
> sets drivers netdev->rx_cpu_rmap accordingly.
> rx_cpu_rmap is set in enic_request_intr() which is called by enic_open and
> rx_cpu_rmap is freed in enic_free_intr() which is called by enic_stop.
> This is used by Accelerated RFS.
> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
> ---
> drivers/net/ethernet/cisco/enic/enic_main.c | 36 +++++++++++++++++++++++++++++
> 1 file changed, 36 insertions(+)
> diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
> index f32f828..f4508d9 100644
> --- a/drivers/net/ethernet/cisco/enic/enic_main.c
> +++ b/drivers/net/ethernet/cisco/enic/enic_main.c
[...]
> @@ -1192,6 +1195,33 @@ static void enic_calc_int_moderation(struct enic *enic, struct vnic_rq *rq)
> pkt_size_counter->small_pkt_bytes_cnt = 0;
> }
>
> +#ifdef CONFIG_RFS_ACCEL
> +static void enic_free_rx_cpu_rmap(struct enic *enic)
> +{
> + free_irq_cpu_rmap(enic->netdev->rx_cpu_rmap);
> + enic->netdev->rx_cpu_rmap = NULL;
> +}
> +
> +static inline void enic_set_rx_cpu_rmap(struct enic *enic)
No need to use *inline* in a .c file, the compiler should figure it out.
> +{
> + int i, res;
> +
> + if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) {
> + enic->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(enic->rq_count);
> + if (unlikely(!enic->netdev->rx_cpu_rmap))
> + return;
> + for (i = 0; i < enic->rq_count; i++) {
> + res = irq_cpu_rmap_add(enic->netdev->rx_cpu_rmap,
> + enic->msix_entry[i].vector);
> + if (unlikely(res)) {
> + enic_free_rx_cpu_rmap(enic);
> + return;
> + }
> + }
> + }
> +}
It's better to do the following here:
#else
static void enic_free_rx_cpu_rmap(struct enic *enic)
{
}
static void enic_set_rx_cpu_rmap(struct enic *enic)
{
}
> +#endif
> +
> static int enic_poll_msix(struct napi_struct *napi, int budget)
> {
> struct net_device *netdev = napi->dev;
> @@ -1267,6 +1297,9 @@ static void enic_free_intr(struct enic *enic)
> struct net_device *netdev = enic->netdev;
> unsigned int i;
>
> +#ifdef CONFIG_RFS_ACCEL
> + enic_free_rx_cpu_rmap(enic);
> +#endif
... so that you can avoid #ifdef's at the call sites.
WBR, Sergei
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 4/8] enic: alloc/free rx_cpu_rmap
2014-06-09 18:44 ` Sergei Shtylyov
@ 2014-06-10 13:52 ` Govindarajulu Varadarajan
2014-06-10 15:00 ` Sergei Shtylyov
0 siblings, 1 reply; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-10 13:52 UTC (permalink / raw)
To: Sergei Shtylyov
Cc: Govindarajulu Varadarajan, davem, netdev, ssujith, gvaradar, benve
On Mon, 9 Jun 2014, Sergei Shtylyov wrote:
> @@ -1192,6 +1195,33 @@ static void enic_calc_int_moderation(struct enic
>> *enic, struct vnic_rq *rq)
>> pkt_size_counter->small_pkt_bytes_cnt = 0;
>> }
>>
>> +#ifdef CONFIG_RFS_ACCEL
>> +static void enic_free_rx_cpu_rmap(struct enic *enic)
>> +{
>> + free_irq_cpu_rmap(enic->netdev->rx_cpu_rmap);
>> + enic->netdev->rx_cpu_rmap = NULL;
>> +}
>> +
>> +static inline void enic_set_rx_cpu_rmap(struct enic *enic)
>
> No need to use *inline* in a .c file, the compiler should figure it out.
>
Yes, I agree.
>> +{
>> + int i, res;
>> +
>> + if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) {
>> + enic->netdev->rx_cpu_rmap =
>> alloc_irq_cpu_rmap(enic->rq_count);
>> + if (unlikely(!enic->netdev->rx_cpu_rmap))
>> + return;
>> + for (i = 0; i < enic->rq_count; i++) {
>> + res = irq_cpu_rmap_add(enic->netdev->rx_cpu_rmap,
>> + enic->msix_entry[i].vector);
>> + if (unlikely(res)) {
>> + enic_free_rx_cpu_rmap(enic);
>> + return;
>> + }
>> + }
>> + }
>> +}
>
> It's better to do the following here:
>
> #else
> static void enic_free_rx_cpu_rmap(struct enic *enic)
> {
> }
> static void enic_set_rx_cpu_rmap(struct enic *enic)
> {
> }
>
How about
static void enic_free_rx_cpu_rmap(struct enic *enic)
{
#ifdef CONFIG_RFS_ACCEL
...
...
#endif
}
I prefer this over yours because, if I use yours tools like cscope finds two
definitions of function enic_free_rx_cpu_rmap. Which makes code walk through
little bit difficult.
Thanks
Govind
>> +#endif
>> +
>> static int enic_poll_msix(struct napi_struct *napi, int budget)
>> {
>> struct net_device *netdev = napi->dev;
>> @@ -1267,6 +1297,9 @@ static void enic_free_intr(struct enic *enic)
>> struct net_device *netdev = enic->netdev;
>> unsigned int i;
>>
>> +#ifdef CONFIG_RFS_ACCEL
>> + enic_free_rx_cpu_rmap(enic);
>> +#endif
>
> ... so that you can avoid #ifdef's at the call sites.
>
> WBR, Sergei
>
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next 4/8] enic: alloc/free rx_cpu_rmap
2014-06-10 13:52 ` Govindarajulu Varadarajan
@ 2014-06-10 15:00 ` Sergei Shtylyov
0 siblings, 0 replies; 36+ messages in thread
From: Sergei Shtylyov @ 2014-06-10 15:00 UTC (permalink / raw)
To: Govindarajulu Varadarajan; +Cc: davem, netdev, ssujith, gvaradar, benve
Hello.
On 06/10/2014 05:52 PM, Govindarajulu Varadarajan wrote:
> >> @@ -1192,6 +1195,33 @@ static void enic_calc_int_moderation(struct enic
>>> *enic, struct vnic_rq *rq)
>>> pkt_size_counter->small_pkt_bytes_cnt = 0;
>>> }
>>>
>>> +#ifdef CONFIG_RFS_ACCEL
>>> +static void enic_free_rx_cpu_rmap(struct enic *enic)
>>> +{
>>> + free_irq_cpu_rmap(enic->netdev->rx_cpu_rmap);
>>> + enic->netdev->rx_cpu_rmap = NULL;
>>> +}
>>> +
>>> +static inline void enic_set_rx_cpu_rmap(struct enic *enic)
>> No need to use *inline* in a .c file, the compiler should figure it out.
> Yes, I agree.
>>> +{
>>> + int i, res;
>>> +
>>> + if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) {
>>> + enic->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(enic->rq_count);
>>> + if (unlikely(!enic->netdev->rx_cpu_rmap))
>>> + return;
>>> + for (i = 0; i < enic->rq_count; i++) {
>>> + res = irq_cpu_rmap_add(enic->netdev->rx_cpu_rmap,
>>> + enic->msix_entry[i].vector);
>>> + if (unlikely(res)) {
>>> + enic_free_rx_cpu_rmap(enic);
>>> + return;
>>> + }
>>> + }
>>> + }
>>> +}
>> It's better to do the following here:
>> #else
>> static void enic_free_rx_cpu_rmap(struct enic *enic)
>> {
>> }
>> static void enic_set_rx_cpu_rmap(struct enic *enic)
>> {
>> }
> How about
> static void enic_free_rx_cpu_rmap(struct enic *enic)
> {
> #ifdef CONFIG_RFS_ACCEL
> ...
> ...
> #endif
> }
> I prefer this over yours because, if I use yours tools like cscope finds two
> definitions of function enic_free_rx_cpu_rmap. Which makes code walk through
> little bit difficult.
#ifdef's in the function bodies are generally frowned upon. See
Documentation/SubmittingPatches section 2 item (2).
> Thanks
> Govind
>>> +#endif
>>> +
>>> static int enic_poll_msix(struct napi_struct *napi, int budget)
>>> {
>>> struct net_device *netdev = napi->dev;
>>> @@ -1267,6 +1297,9 @@ static void enic_free_intr(struct enic *enic)
>>> struct net_device *netdev = enic->netdev;
>>> unsigned int i;
>>>
>>> +#ifdef CONFIG_RFS_ACCEL
>>> + enic_free_rx_cpu_rmap(enic);
>>> +#endif
>> ... so that you can avoid #ifdef's at the call sites.
WBR, Sergei
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH net-next 5/8] enic: Add Accelerated RFS support
2014-06-09 18:32 [PATCH net-next 0/8] enic updates Govindarajulu Varadarajan
` (3 preceding siblings ...)
2014-06-09 18:32 ` [PATCH net-next 4/8] enic: alloc/free rx_cpu_rmap Govindarajulu Varadarajan
@ 2014-06-09 18:32 ` Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 6/8] enic: fix lockdep around devcmd_lock Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 7/8] enic: add low latency socket busy_poll support Govindarajulu Varadarajan
6 siblings, 0 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-09 18:32 UTC (permalink / raw)
To: davem, netdev; +Cc: ssujith, gvaradar, benve, _govind
This patch adds supports for Accelerated Receive Flow Steering.
When the desired rx is different from current rq, for a flow, kernel calls the
driver function enic_rx_flow_steer(). enic_rx_flow_steer adds a IP-TCP/UDP
hardware filter.
Driver registers a timer function enic_flow_may_expire. This function is called
every HZ/4 seconds. In this function we check if the added filter has expired
by calling rps_may_expire_flow(). If the flow has expired, it removes the hw
filter.
As of now adaptor supports only IPv4 - TCP/UDP filters.
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
drivers/net/ethernet/cisco/enic/enic.h | 41 ++++++
drivers/net/ethernet/cisco/enic/enic_clsf.c | 203 ++++++++++++++++++++++++++++
drivers/net/ethernet/cisco/enic/enic_clsf.h | 9 ++
drivers/net/ethernet/cisco/enic/enic_main.c | 17 +++
drivers/net/ethernet/cisco/enic/enic_res.c | 1 +
drivers/net/ethernet/cisco/enic/vnic_enet.h | 2 +
6 files changed, 273 insertions(+)
diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 14f465f..b9b9178 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -99,6 +99,44 @@ struct enic_port_profile {
u8 mac_addr[ETH_ALEN];
};
+#ifdef CONFIG_RFS_ACCEL
+/* enic_rfs_fltr_node - rfs filter node in hash table
+ * @@keys: IPv4 5 tuple
+ * @flow_id: flow_id of clsf filter provided by kernel
+ * @fltr_id: filter id of clsf filter returned by adaptor
+ * @rq_id: desired rq index
+ * @node: hlist_node
+ */
+struct enic_rfs_fltr_node {
+ struct flow_keys keys;
+ u32 flow_id;
+ u16 fltr_id;
+ u16 rq_id;
+ struct hlist_node node;
+};
+
+/* enic_rfs_flw_tbl - rfs flow table
+ * @max: Maximum number of filters vNIC supports
+ * @free: Number of free filters available
+ * @toclean: hash table index to clean next
+ * @ht_head: hash table list head
+ * @lock: spin lock
+ * @rfs_may_expire: timer function for enic_rps_may_expire_flow
+ */
+struct enic_rfs_flw_tbl {
+ u16 max;
+ int free;
+
+#define ENIC_RFS_FLW_BITSHIFT (10)
+#define ENIC_RFS_FLW_MASK ((1 << ENIC_RFS_FLW_BITSHIFT) - 1)
+ u16 toclean:ENIC_RFS_FLW_BITSHIFT;
+ struct hlist_head ht_head[1 << ENIC_RFS_FLW_BITSHIFT];
+ spinlock_t lock;
+ struct timer_list rfs_may_expire;
+};
+
+#endif /* CONFIG_RFS_ACCEL */
+
/* Per-instance private data structure */
struct enic {
struct net_device *netdev;
@@ -150,6 +188,9 @@ struct enic {
/* completion queue cache line section */
____cacheline_aligned struct vnic_cq cq[ENIC_CQ_MAX];
unsigned int cq_count;
+#ifdef CONFIG_RFS_ACCEL
+ struct enic_rfs_flw_tbl rfs_h;
+#endif
};
static inline struct device *enic_get_dev(struct enic *enic)
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index f6703c4..1cabcfc 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -64,3 +64,206 @@ int enic_delfltr(struct enic *enic, u16 filter_id)
return ret;
}
+
+#ifdef CONFIG_RFS_ACCEL
+void enic_flow_may_expire(unsigned long data)
+{
+ struct enic *enic = (struct enic *) data;
+ bool res;
+ int j;
+
+ spin_lock(&enic->rfs_h.lock);
+ for (j = 0; j < ENIC_CLSF_EXPIRE_COUNT; j++) {
+ struct hlist_head *hhead;
+ struct hlist_node *tmp;
+ struct enic_rfs_fltr_node *n;
+
+ hhead = &enic->rfs_h.ht_head[enic->rfs_h.toclean++];
+ hlist_for_each_entry_safe(n, tmp, hhead, node) {
+ res = rps_may_expire_flow(enic->netdev, n->rq_id,
+ n->flow_id, n->fltr_id);
+ if (res) {
+ res = enic_delfltr(enic, n->fltr_id);
+ if (unlikely(res))
+ continue;
+ hlist_del(&n->node);
+ kfree(n);
+ enic->rfs_h.free++;
+ }
+ }
+ }
+ spin_unlock(&enic->rfs_h.lock);
+ mod_timer(&enic->rfs_h.rfs_may_expire, jiffies + HZ/4);
+}
+
+/* enic_rfs_flw_tbl_init - initialize enic->rfs_h members
+ * @enic: enic data
+ */
+void enic_rfs_flw_tbl_init(struct enic *enic)
+{
+ int i;
+
+ spin_lock_init(&enic->rfs_h.lock);
+ for (i = 0; i <= ENIC_RFS_FLW_MASK; i++)
+ INIT_HLIST_HEAD(&enic->rfs_h.ht_head[i]);
+ enic->rfs_h.max = enic->config.num_arfs;
+ enic->rfs_h.free = enic->rfs_h.max;
+ enic->rfs_h.toclean = 0;
+ init_timer(&enic->rfs_h.rfs_may_expire);
+ enic->rfs_h.rfs_may_expire.function = enic_flow_may_expire;
+ enic->rfs_h.rfs_may_expire.data = (unsigned long)enic;
+ mod_timer(&enic->rfs_h.rfs_may_expire, jiffies + HZ/4);
+}
+
+void enic_rfs_flw_tbl_free(struct enic *enic)
+{
+ int i, res;
+
+ del_timer_sync(&enic->rfs_h.rfs_may_expire);
+ spin_lock(&enic->rfs_h.lock);
+ enic->rfs_h.free = 0;
+ for (i = 0; i < (1 << ENIC_RFS_FLW_BITSHIFT); i++) {
+ struct hlist_head *hhead;
+ struct hlist_node *tmp;
+ struct enic_rfs_fltr_node *n;
+
+ hhead = &enic->rfs_h.ht_head[i];
+ hlist_for_each_entry_safe(n, tmp, hhead, node) {
+ enic_delfltr(enic, n->fltr_id);
+ hlist_del(&n->node);
+ kfree(n);
+ }
+ }
+ spin_unlock(&enic->rfs_h.lock);
+}
+
+static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h,
+ struct flow_keys *k)
+{
+ struct enic_rfs_fltr_node *tpos;
+
+ hlist_for_each_entry(tpos, h, node)
+ if (tpos->keys.src == k->src &&
+ tpos->keys.dst == k->dst &&
+ tpos->keys.ports == k->ports &&
+ tpos->keys.ip_proto == k->ip_proto &&
+ tpos->keys.n_proto == k->n_proto)
+ return tpos;
+ return NULL;
+}
+
+int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+ u16 rxq_index, u32 flow_id)
+{
+ struct flow_keys keys;
+ struct enic_rfs_fltr_node *n;
+ struct enic *enic;
+ u16 tbl_idx;
+ int res, i;
+
+ enic = netdev_priv(dev);
+ res = skb_flow_dissect(skb, &keys);
+ if (!res || keys.n_proto != htons(ETH_P_IP) ||
+ (keys.ip_proto != IPPROTO_TCP && keys.ip_proto != IPPROTO_UDP))
+ return -EPROTONOSUPPORT;
+
+ tbl_idx = skb_get_hash_raw(skb) & ENIC_RFS_FLW_MASK;
+ spin_lock(&enic->rfs_h.lock);
+ n = htbl_key_search(&enic->rfs_h.ht_head[tbl_idx], &keys);
+
+ if (n) { /* entry already present */
+ if (rxq_index == n->rq_id) {
+ res = -EEXIST;
+ goto ret_unlock;
+ }
+
+ /* desired rq changed for the flow, we need to delete
+ * old fltr and add new one
+ *
+ * The moment we delete the fltr, the upcoming pkts
+ * are put it default rq based on rss. When we add
+ * new filter, upcoming pkts are put in desired queue.
+ * This could cause ooo pkts.
+ *
+ * Lets 1st try adding new fltr and then del old one.
+ */
+ i = --enic->rfs_h.free;
+ /* clsf tbl is full, we have to del old fltr first*/
+ if (unlikely(i < 0)) {
+ enic->rfs_h.free++;
+ res = enic_delfltr(enic, n->fltr_id);
+ if (unlikely(res < 0))
+ goto ret_unlock;
+ res = enic_addfltr_5t(enic, &keys, rxq_index);
+ if (res < 0) {
+ hlist_del(&n->node);
+ enic->rfs_h.free++;
+ goto ret_unlock;
+ }
+ /* add new fltr 1st then del old fltr */
+ } else {
+ int ret;
+
+ res = enic_addfltr_5t(enic, &keys, rxq_index);
+ if (res < 0) {
+ enic->rfs_h.free++;
+ goto ret_unlock;
+ }
+ ret = enic_delfltr(enic, n->fltr_id);
+ /* deleting old fltr failed. Add old fltr to list.
+ * enic_flow_may_expire() will try to delete it later.
+ */
+ if (unlikely(ret < 0)) {
+ struct enic_rfs_fltr_node *d;
+ struct hlist_head *head;
+
+ head = &enic->rfs_h.ht_head[tbl_idx];
+ d = kmalloc(sizeof(*d), GFP_ATOMIC);
+ if (d) {
+ d->fltr_id = n->fltr_id;
+ INIT_HLIST_NODE(&d->node);
+ hlist_add_head(&d->node, head);
+ }
+ } else {
+ enic->rfs_h.free++;
+ }
+ }
+ n->rq_id = rxq_index;
+ n->fltr_id = res;
+ n->flow_id = flow_id;
+ /* entry not present */
+ } else {
+ i = --enic->rfs_h.free;
+ if (i <= 0) {
+ enic->rfs_h.free++;
+ res = -EBUSY;
+ goto ret_unlock;
+ }
+
+ n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ if (!n) {
+ res = -ENOMEM;
+ enic->rfs_h.free++;
+ goto ret_unlock;
+ }
+
+ res = enic_addfltr_5t(enic, &keys, rxq_index);
+ if (res < 0) {
+ kfree(n);
+ enic->rfs_h.free++;
+ goto ret_unlock;
+ }
+ n->rq_id = rxq_index;
+ n->fltr_id = res;
+ n->flow_id = flow_id;
+ n->keys = keys;
+ INIT_HLIST_NODE(&n->node);
+ hlist_add_head(&n->node, &enic->rfs_h.ht_head[tbl_idx]);
+ }
+
+ret_unlock:
+ spin_unlock(&enic->rfs_h.lock);
+ return res;
+}
+
+#endif /* CONFIG_RFS_ACCEL */
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.h b/drivers/net/ethernet/cisco/enic/enic_clsf.h
index b6925b3..76a85bb 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.h
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.h
@@ -4,7 +4,16 @@
#include "vnic_dev.h"
#include "enic.h"
+#define ENIC_CLSF_EXPIRE_COUNT 128
+
int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq);
int enic_delfltr(struct enic *enic, u16 filter_id);
+#ifdef CONFIG_RFS_ACCEL
+void enic_rfs_flw_tbl_init(struct enic *enic);
+void enic_rfs_flw_tbl_free(struct enic *enic);
+int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+ u16 rxq_index, u32 flow_id);
+#endif /* CONFIG_RFS_ACCEL */
+
#endif /* _ENIC_CLSF_H_ */
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index f4508d9..67a12a7 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -52,6 +52,7 @@
#include "enic.h"
#include "enic_dev.h"
#include "enic_pp.h"
+#include "enic_clsf.h"
#define ENIC_NOTIFY_TIMER_PERIOD (2 * HZ)
#define WQ_ENET_MAX_DESC_LEN (1 << WQ_ENET_LEN_BITS)
@@ -1539,6 +1540,9 @@ static int enic_open(struct net_device *netdev)
vnic_intr_unmask(&enic->intr[i]);
enic_notify_timer_start(enic);
+#ifdef CONFIG_RFS_ACCEL
+ enic_rfs_flw_tbl_init(enic);
+#endif
return 0;
@@ -1565,6 +1569,9 @@ static int enic_stop(struct net_device *netdev)
enic_synchronize_irqs(enic);
del_timer_sync(&enic->notify_timer);
+#ifdef CONFIG_RFS_ACCEL
+ enic_rfs_flw_tbl_free(enic);
+#endif
enic_dev_disable(enic);
@@ -2057,6 +2064,9 @@ static const struct net_device_ops enic_netdev_dynamic_ops = {
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = enic_poll_controller,
#endif
+#ifdef CONFIG_RFS_ACCEL
+ .ndo_rx_flow_steer = enic_rx_flow_steer,
+#endif
};
static const struct net_device_ops enic_netdev_ops = {
@@ -2077,6 +2087,9 @@ static const struct net_device_ops enic_netdev_ops = {
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = enic_poll_controller,
#endif
+#ifdef CONFIG_RFS_ACCEL
+ .ndo_rx_flow_steer = enic_rx_flow_steer,
+#endif
};
static void enic_dev_deinit(struct enic *enic)
@@ -2422,6 +2435,10 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
netdev->features |= netdev->hw_features;
+#ifdef CONFIG_RFS_ACCEL
+ netdev->hw_features |= NETIF_F_NTUPLE;
+#endif
+
if (using_dac)
netdev->features |= NETIF_F_HIGHDMA;
diff --git a/drivers/net/ethernet/cisco/enic/enic_res.c b/drivers/net/ethernet/cisco/enic/enic_res.c
index 31d6588..9c96911 100644
--- a/drivers/net/ethernet/cisco/enic/enic_res.c
+++ b/drivers/net/ethernet/cisco/enic/enic_res.c
@@ -71,6 +71,7 @@ int enic_get_vnic_config(struct enic *enic)
GET_CONFIG(intr_mode);
GET_CONFIG(intr_timer_usec);
GET_CONFIG(loop_tag);
+ GET_CONFIG(num_arfs);
c->wq_desc_count =
min_t(u32, ENIC_MAX_WQ_DESCS,
diff --git a/drivers/net/ethernet/cisco/enic/vnic_enet.h b/drivers/net/ethernet/cisco/enic/vnic_enet.h
index 6095428..75aced2 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_enet.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_enet.h
@@ -32,6 +32,8 @@ struct vnic_enet_config {
char devname[16];
u32 intr_timer_usec;
u16 loop_tag;
+ u16 vf_rq_count;
+ u16 num_arfs;
};
#define VENETF_TSO 0x1 /* TSO enabled */
--
2.0.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 6/8] enic: fix lockdep around devcmd_lock
2014-06-09 18:32 [PATCH net-next 0/8] enic updates Govindarajulu Varadarajan
` (4 preceding siblings ...)
2014-06-09 18:32 ` [PATCH net-next 5/8] enic: Add Accelerated RFS support Govindarajulu Varadarajan
@ 2014-06-09 18:32 ` Govindarajulu Varadarajan
2014-06-09 18:32 ` [PATCH net-next 7/8] enic: add low latency socket busy_poll support Govindarajulu Varadarajan
6 siblings, 0 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-09 18:32 UTC (permalink / raw)
To: davem, netdev; +Cc: ssujith, gvaradar, benve, _govind, Tony Camuso
From: Tony Camuso <tcamuso@redhat.com>
We were experiencing occasional "BUG: scheduling while atomic" splats
in our testing. Enabling DEBUG_SPINLOCK and DEBUG_LOCKDEP in the kernel
exposed a lockdep in the enic driver.
enic 0000:0b:00.0 eth2: Link UP
======================================================
[ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ]
3.12.0-rc1.x86_64-dbg+ #2 Tainted: GF W
------------------------------------------------------
NetworkManager/4209 [HC0[0]:SC0[2]:HE1:SE0] is trying to acquire:
(&(&enic->devcmd_lock)->rlock){+.+...}, at: [<ffffffffa026b7e4>] enic_dev_packet_filter+0x44/0x90 [enic]
The fix was to replace spin_lock with spin_lock_bh for the enic
devcmd_lock, so that soft irqs would be disabled while the lock
is held.
Signed-off-by: Sujith Sankar <ssujith@cisco.com>
Signed-off-by: Tony Camuso <tcamuso@redhat.com>
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
drivers/net/ethernet/cisco/enic/enic_api.c | 4 +-
drivers/net/ethernet/cisco/enic/enic_dev.c | 80 ++++++++++++++---------------
drivers/net/ethernet/cisco/enic/enic_dev.h | 4 +-
drivers/net/ethernet/cisco/enic/enic_main.c | 16 +++---
4 files changed, 52 insertions(+), 52 deletions(-)
diff --git a/drivers/net/ethernet/cisco/enic/enic_api.c b/drivers/net/ethernet/cisco/enic/enic_api.c
index e13efbd..b161f24 100644
--- a/drivers/net/ethernet/cisco/enic/enic_api.c
+++ b/drivers/net/ethernet/cisco/enic/enic_api.c
@@ -34,13 +34,13 @@ int enic_api_devcmd_proxy_by_index(struct net_device *netdev, int vf,
struct vnic_dev *vdev = enic->vdev;
spin_lock(&enic->enic_api_lock);
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
vnic_dev_cmd_proxy_by_index_start(vdev, vf);
err = vnic_dev_cmd(vdev, cmd, a0, a1, wait);
vnic_dev_cmd_proxy_end(vdev);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
spin_unlock(&enic->enic_api_lock);
return err;
diff --git a/drivers/net/ethernet/cisco/enic/enic_dev.c b/drivers/net/ethernet/cisco/enic/enic_dev.c
index 3e27df5..87ddc44 100644
--- a/drivers/net/ethernet/cisco/enic/enic_dev.c
+++ b/drivers/net/ethernet/cisco/enic/enic_dev.c
@@ -29,9 +29,9 @@ int enic_dev_fw_info(struct enic *enic, struct vnic_devcmd_fw_info **fw_info)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_fw_info(enic->vdev, fw_info);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -40,9 +40,9 @@ int enic_dev_stats_dump(struct enic *enic, struct vnic_stats **vstats)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_stats_dump(enic->vdev, vstats);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -54,9 +54,9 @@ int enic_dev_add_station_addr(struct enic *enic)
if (!is_valid_ether_addr(enic->netdev->dev_addr))
return -EADDRNOTAVAIL;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_add_addr(enic->vdev, enic->netdev->dev_addr);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -68,9 +68,9 @@ int enic_dev_del_station_addr(struct enic *enic)
if (!is_valid_ether_addr(enic->netdev->dev_addr))
return -EADDRNOTAVAIL;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_del_addr(enic->vdev, enic->netdev->dev_addr);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -80,10 +80,10 @@ int enic_dev_packet_filter(struct enic *enic, int directed, int multicast,
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_packet_filter(enic->vdev, directed,
multicast, broadcast, promisc, allmulti);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -92,9 +92,9 @@ int enic_dev_add_addr(struct enic *enic, const u8 *addr)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_add_addr(enic->vdev, addr);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -103,9 +103,9 @@ int enic_dev_del_addr(struct enic *enic, const u8 *addr)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_del_addr(enic->vdev, addr);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -114,9 +114,9 @@ int enic_dev_notify_unset(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_notify_unset(enic->vdev);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -125,9 +125,9 @@ int enic_dev_hang_notify(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_hang_notify(enic->vdev);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -136,10 +136,10 @@ int enic_dev_set_ig_vlan_rewrite_mode(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_set_ig_vlan_rewrite_mode(enic->vdev,
IG_VLAN_REWRITE_MODE_PRIORITY_TAG_DEFAULT_VLAN);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -148,9 +148,9 @@ int enic_dev_enable(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_enable_wait(enic->vdev);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -159,9 +159,9 @@ int enic_dev_disable(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_disable(enic->vdev);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -170,9 +170,9 @@ int enic_dev_intr_coal_timer_info(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_intr_coal_timer_info(enic->vdev);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -181,9 +181,9 @@ int enic_vnic_dev_deinit(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_deinit(enic->vdev);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -192,10 +192,10 @@ int enic_dev_init_prov2(struct enic *enic, struct vic_provinfo *vp)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_init_prov2(enic->vdev,
(u8 *)vp, vic_provinfo_size(vp));
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -204,9 +204,9 @@ int enic_dev_deinit_done(struct enic *enic, int *status)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_deinit_done(enic->vdev, status);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -217,9 +217,9 @@ int enic_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid)
struct enic *enic = netdev_priv(netdev);
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = enic_add_vlan(enic, vid);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -230,9 +230,9 @@ int enic_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid)
struct enic *enic = netdev_priv(netdev);
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = enic_del_vlan(enic, vid);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -241,9 +241,9 @@ int enic_dev_enable2(struct enic *enic, int active)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_enable2(enic->vdev, active);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -252,9 +252,9 @@ int enic_dev_enable2_done(struct enic *enic, int *status)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = vnic_dev_enable2_done(enic->vdev, status);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
diff --git a/drivers/net/ethernet/cisco/enic/enic_dev.h b/drivers/net/ethernet/cisco/enic/enic_dev.h
index 36ea1ab..10bb970 100644
--- a/drivers/net/ethernet/cisco/enic/enic_dev.h
+++ b/drivers/net/ethernet/cisco/enic/enic_dev.h
@@ -28,7 +28,7 @@
*/
#define ENIC_DEVCMD_PROXY_BY_INDEX(vf, err, enic, vnicdevcmdfn, ...) \
do { \
- spin_lock(&enic->devcmd_lock); \
+ spin_lock_bh(&enic->devcmd_lock); \
if (enic_is_valid_vf(enic, vf)) { \
vnic_dev_cmd_proxy_by_index_start(enic->vdev, vf); \
err = vnicdevcmdfn(enic->vdev, ##__VA_ARGS__); \
@@ -36,7 +36,7 @@
} else { \
err = vnicdevcmdfn(enic->vdev, ##__VA_ARGS__); \
} \
- spin_unlock(&enic->devcmd_lock); \
+ spin_unlock_bh(&enic->devcmd_lock); \
} while (0)
int enic_dev_fw_info(struct enic *enic, struct vnic_devcmd_fw_info **fw_info);
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 67a12a7..33decff 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -1458,7 +1458,7 @@ static int enic_dev_notify_set(struct enic *enic)
{
int err;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
switch (vnic_dev_get_intr_mode(enic->vdev)) {
case VNIC_DEV_INTR_MODE_INTX:
err = vnic_dev_notify_set(enic->vdev,
@@ -1472,7 +1472,7 @@ static int enic_dev_notify_set(struct enic *enic)
err = vnic_dev_notify_set(enic->vdev, -1 /* no intr */);
break;
}
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
@@ -1801,11 +1801,11 @@ static int enic_set_rsskey(struct enic *enic)
memcpy(rss_key_buf_va, &rss_key, sizeof(union vnic_rss_key));
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = enic_set_rss_key(enic,
rss_key_buf_pa,
sizeof(union vnic_rss_key));
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
pci_free_consistent(enic->pdev, sizeof(union vnic_rss_key),
rss_key_buf_va, rss_key_buf_pa);
@@ -1828,11 +1828,11 @@ static int enic_set_rsscpu(struct enic *enic, u8 rss_hash_bits)
for (i = 0; i < (1 << rss_hash_bits); i++)
(*rss_cpu_buf_va).cpu[i/4].b[i%4] = i % enic->rq_count;
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = enic_set_rss_cpu(enic,
rss_cpu_buf_pa,
sizeof(union vnic_rss_cpu));
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
pci_free_consistent(enic->pdev, sizeof(union vnic_rss_cpu),
rss_cpu_buf_va, rss_cpu_buf_pa);
@@ -1850,13 +1850,13 @@ static int enic_set_niccfg(struct enic *enic, u8 rss_default_cpu,
/* Enable VLAN tag stripping.
*/
- spin_lock(&enic->devcmd_lock);
+ spin_lock_bh(&enic->devcmd_lock);
err = enic_set_nic_cfg(enic,
rss_default_cpu, rss_hash_type,
rss_hash_bits, rss_base_cpu,
rss_enable, tso_ipid_split_en,
ig_vlan_strip_en);
- spin_unlock(&enic->devcmd_lock);
+ spin_unlock_bh(&enic->devcmd_lock);
return err;
}
--
2.0.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH net-next 7/8] enic: add low latency socket busy_poll support
2014-06-09 18:32 [PATCH net-next 0/8] enic updates Govindarajulu Varadarajan
` (5 preceding siblings ...)
2014-06-09 18:32 ` [PATCH net-next 6/8] enic: fix lockdep around devcmd_lock Govindarajulu Varadarajan
@ 2014-06-09 18:32 ` Govindarajulu Varadarajan
6 siblings, 0 replies; 36+ messages in thread
From: Govindarajulu Varadarajan @ 2014-06-09 18:32 UTC (permalink / raw)
To: davem, netdev; +Cc: ssujith, gvaradar, benve, _govind
This patch adds support for low latency busy_poll.
* Introduce drivers ndo_busy_poll function enic_busy_poll, which is called by
socket waiting for data.
* Introduce locking between napi_poll nad busy_poll
* enic_busy_poll cleans up all the rx pkts possible. While in busy_poll, rq
holds the state ENIC_POLL_STATE_POLL. While in napi_poll, rq holds the state
ENIC_POLL_STATE_NAPI.
* in napi_poll we return if we are in busy_poll. Incase of INTx & msix, we just
service wq and return if busy_poll is going on.
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
drivers/net/ethernet/cisco/enic/enic_main.c | 85 ++++++++++++++++---
drivers/net/ethernet/cisco/enic/vnic_rq.h | 122 ++++++++++++++++++++++++++++
2 files changed, 195 insertions(+), 12 deletions(-)
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 33decff..558ee45 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -42,6 +42,9 @@
#ifdef CONFIG_RFS_ACCEL
#include <linux/cpu_rmap.h>
#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#include <net/busy_poll.h>
+#endif
#include "cq_enet_desc.h"
#include "vnic_dev.h"
@@ -1053,10 +1056,12 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq,
if (vlan_stripped)
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci);
- if (netdev->features & NETIF_F_GRO)
- napi_gro_receive(&enic->napi[q_number], skb);
- else
+ skb_mark_napi_id(skb, &enic->napi[rq->index]);
+ if (enic_poll_busy_polling(rq) ||
+ !(netdev->features & NETIF_F_GRO))
netif_receive_skb(skb);
+ else
+ napi_gro_receive(&enic->napi[q_number], skb);
if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce)
enic_intr_update_pkt_size(&cq->pkt_size_counter,
bytes_written);
@@ -1093,16 +1098,22 @@ static int enic_poll(struct napi_struct *napi, int budget)
unsigned int work_done, rq_work_done = 0, wq_work_done;
int err;
- /* Service RQ (first) and WQ
- */
+ wq_work_done = vnic_cq_service(&enic->cq[cq_wq], wq_work_to_do,
+ enic_wq_service, NULL);
+
+ if (!enic_poll_lock_napi(&enic->rq[cq_rq])) {
+ if (wq_work_done > 0)
+ vnic_intr_return_credits(&enic->intr[intr],
+ wq_work_done,
+ 0 /* dont unmask intr */,
+ 0 /* dont reset intr timer */);
+ return rq_work_done;
+ }
if (budget > 0)
rq_work_done = vnic_cq_service(&enic->cq[cq_rq],
rq_work_to_do, enic_rq_service, NULL);
- wq_work_done = vnic_cq_service(&enic->cq[cq_wq],
- wq_work_to_do, enic_wq_service, NULL);
-
/* Accumulate intr event credits for this polling
* cycle. An intr event is the completion of a
* a WQ or RQ packet.
@@ -1134,6 +1145,7 @@ static int enic_poll(struct napi_struct *napi, int budget)
napi_complete(napi);
vnic_intr_unmask(&enic->intr[intr]);
}
+ enic_poll_unlock_napi(&enic->rq[cq_rq]);
return rq_work_done;
}
@@ -1223,6 +1235,34 @@ static inline void enic_set_rx_cpu_rmap(struct enic *enic)
}
#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+int enic_busy_poll(struct napi_struct *napi)
+{
+ struct net_device *netdev = napi->dev;
+ struct enic *enic = netdev_priv(netdev);
+ unsigned int rq = (napi - &enic->napi[0]);
+ unsigned int cq = enic_cq_rq(enic, rq);
+ unsigned int intr = enic_msix_rq_intr(enic, rq);
+ unsigned int work_to_do = -1; /* clean all pkts possible */
+ unsigned int work_done;
+
+ if (!enic_poll_lock_poll(&enic->rq[rq]))
+ return LL_FLUSH_BUSY;
+ work_done = vnic_cq_service(&enic->cq[cq], work_to_do,
+ enic_rq_service, NULL);
+
+ if (work_done > 0)
+ vnic_intr_return_credits(&enic->intr[intr],
+ work_done, 0, 0);
+ vnic_rq_fill(&enic->rq[rq], enic_rq_alloc_buf);
+ if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce)
+ enic_calc_int_moderation(enic, &enic->rq[rq]);
+ enic_poll_unlock_poll(&enic->rq[rq]);
+
+ return work_done;
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
static int enic_poll_msix(struct napi_struct *napi, int budget)
{
struct net_device *netdev = napi->dev;
@@ -1234,6 +1274,8 @@ static int enic_poll_msix(struct napi_struct *napi, int budget)
unsigned int work_done = 0;
int err;
+ if (!enic_poll_lock_napi(&enic->rq[rq]))
+ return work_done;
/* Service RQ
*/
@@ -1279,6 +1321,7 @@ static int enic_poll_msix(struct napi_struct *napi, int budget)
enic_set_int_moderation(enic, &enic->rq[rq]);
vnic_intr_unmask(&enic->intr[intr]);
}
+ enic_poll_unlock_napi(&enic->rq[rq]);
return work_done;
}
@@ -1531,8 +1574,10 @@ static int enic_open(struct net_device *netdev)
netif_tx_wake_all_queues(netdev);
- for (i = 0; i < enic->rq_count; i++)
+ for (i = 0; i < enic->rq_count; i++) {
+ enic_busy_poll_init_lock(&enic->rq[i]);
napi_enable(&enic->napi[i]);
+ }
enic_dev_enable(enic);
@@ -1575,8 +1620,13 @@ static int enic_stop(struct net_device *netdev)
enic_dev_disable(enic);
- for (i = 0; i < enic->rq_count; i++)
+ local_bh_disable();
+ for (i = 0; i < enic->rq_count; i++) {
napi_disable(&enic->napi[i]);
+ while (!enic_poll_lock_napi(&enic->rq[i]))
+ mdelay(1);
+ }
+ local_bh_enable();
netif_carrier_off(netdev);
netif_tx_disable(netdev);
@@ -2067,6 +2117,9 @@ static const struct net_device_ops enic_netdev_dynamic_ops = {
#ifdef CONFIG_RFS_ACCEL
.ndo_rx_flow_steer = enic_rx_flow_steer,
#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ .ndo_busy_poll = enic_busy_poll,
+#endif
};
static const struct net_device_ops enic_netdev_ops = {
@@ -2090,14 +2143,19 @@ static const struct net_device_ops enic_netdev_ops = {
#ifdef CONFIG_RFS_ACCEL
.ndo_rx_flow_steer = enic_rx_flow_steer,
#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ .ndo_busy_poll = enic_busy_poll,
+#endif
};
static void enic_dev_deinit(struct enic *enic)
{
unsigned int i;
- for (i = 0; i < enic->rq_count; i++)
+ for (i = 0; i < enic->rq_count; i++) {
+ napi_hash_del(&enic->napi[i]);
netif_napi_del(&enic->napi[i]);
+ }
enic_free_vnic_resources(enic);
enic_clear_intr_mode(enic);
@@ -2163,11 +2221,14 @@ static int enic_dev_init(struct enic *enic)
switch (vnic_dev_get_intr_mode(enic->vdev)) {
default:
netif_napi_add(netdev, &enic->napi[0], enic_poll, 64);
+ napi_hash_add(&enic->napi[0]);
break;
case VNIC_DEV_INTR_MODE_MSIX:
- for (i = 0; i < enic->rq_count; i++)
+ for (i = 0; i < enic->rq_count; i++) {
netif_napi_add(netdev, &enic->napi[i],
enic_poll_msix, 64);
+ napi_hash_add(&enic->napi[i]);
+ }
break;
}
diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.h b/drivers/net/ethernet/cisco/enic/vnic_rq.h
index ee7bc95..8111d52 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_rq.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_rq.h
@@ -85,6 +85,21 @@ struct vnic_rq {
struct vnic_rq_buf *to_clean;
void *os_buf_head;
unsigned int pkts_outstanding;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#define ENIC_POLL_STATE_IDLE 0
+#define ENIC_POLL_STATE_NAPI (1 << 0) /* NAPI owns this poll */
+#define ENIC_POLL_STATE_POLL (1 << 1) /* poll owns this poll */
+#define ENIC_POLL_STATE_NAPI_YIELD (1 << 2) /* NAPI yielded this poll */
+#define ENIC_POLL_STATE_POLL_YIELD (1 << 3) /* poll yielded this poll */
+#define ENIC_POLL_YIELD (ENIC_POLL_STATE_NAPI_YIELD | \
+ ENIC_POLL_STATE_POLL_YIELD)
+#define ENIC_POLL_LOCKED (ENIC_POLL_STATE_NAPI | \
+ ENIC_POLL_STATE_POLL)
+#define ENIC_POLL_USER_PEND (ENIC_POLL_STATE_POLL | \
+ ENIC_POLL_STATE_POLL_YIELD)
+ unsigned int bpoll_state;
+ spinlock_t bpoll_lock;
+#endif /* CONFIG_NET_RX_BUSY_POLL */
};
static inline unsigned int vnic_rq_desc_avail(struct vnic_rq *rq)
@@ -197,6 +212,113 @@ static inline int vnic_rq_fill(struct vnic_rq *rq,
return 0;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void enic_busy_poll_init_lock(struct vnic_rq *rq)
+{
+ spin_lock_init(&rq->bpoll_lock);
+ rq->bpoll_state = ENIC_POLL_STATE_IDLE;
+}
+
+static inline bool enic_poll_lock_napi(struct vnic_rq *rq)
+{
+ bool rc = true;
+
+ spin_lock(&rq->bpoll_lock);
+ if (rq->bpoll_state & ENIC_POLL_LOCKED) {
+ WARN_ON(rq->bpoll_state & ENIC_POLL_STATE_NAPI);
+ rq->bpoll_state |= ENIC_POLL_STATE_NAPI_YIELD;
+ rc = false;
+ } else {
+ rq->bpoll_state = ENIC_POLL_STATE_NAPI;
+ }
+ spin_unlock(&rq->bpoll_lock);
+
+ return rc;
+}
+
+static inline bool enic_poll_unlock_napi(struct vnic_rq *rq)
+{
+ bool rc = false;
+
+ spin_lock(&rq->bpoll_lock);
+ WARN_ON(rq->bpoll_state &
+ (ENIC_POLL_STATE_POLL | ENIC_POLL_STATE_NAPI_YIELD));
+ if (rq->bpoll_state & ENIC_POLL_STATE_POLL_YIELD)
+ rc = true;
+ rq->bpoll_state = ENIC_POLL_STATE_IDLE;
+ spin_unlock(&rq->bpoll_lock);
+
+ return rc;
+}
+
+static inline bool enic_poll_lock_poll(struct vnic_rq *rq)
+{
+ bool rc = true;
+
+ spin_lock_bh(&rq->bpoll_lock);
+ if (rq->bpoll_state & ENIC_POLL_LOCKED) {
+ rq->bpoll_state |= ENIC_POLL_STATE_POLL_YIELD;
+ rc = false;
+ } else {
+ rq->bpoll_state |= ENIC_POLL_STATE_POLL;
+ }
+ spin_unlock_bh(&rq->bpoll_lock);
+
+ return rc;
+}
+
+static inline bool enic_poll_unlock_poll(struct vnic_rq *rq)
+{
+ bool rc = false;
+
+ spin_lock_bh(&rq->bpoll_lock);
+ WARN_ON(rq->bpoll_state & ENIC_POLL_STATE_NAPI);
+ if (rq->bpoll_state & ENIC_POLL_STATE_POLL_YIELD)
+ rc = true;
+ rq->bpoll_state = ENIC_POLL_STATE_IDLE;
+ spin_unlock_bh(&rq->bpoll_lock);
+
+ return rc;
+}
+
+static inline bool enic_poll_busy_polling(struct vnic_rq *rq)
+{
+ WARN_ON(!(rq->bpoll_state & ENIC_POLL_LOCKED));
+ return rq->bpoll_state & ENIC_POLL_USER_PEND;
+}
+
+#else
+
+static inline void enic_busy_poll_init_lock(struct vnic_rq *rq)
+{
+}
+
+static inline bool enic_poll_lock_napi(struct vnic_rq *rq)
+{
+ return true;
+}
+
+static inline bool enic_poll_unlock_napi(struct vnic_rq *rq)
+{
+ return false;
+}
+
+static inline bool enic_poll_lock_poll(struct vnic_rq *rq)
+{
+ return false;
+}
+
+static inline bool enic_poll_unlock_poll(struct vnic_rq *rq)
+{
+ return false;
+}
+
+static inline bool enic_poll_ll_polling(struct vnic_rq *rq)
+{
+ return false;
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
void vnic_rq_free(struct vnic_rq *rq);
int vnic_rq_alloc(struct vnic_dev *vdev, struct vnic_rq *rq, unsigned int index,
unsigned int desc_count, unsigned int desc_size);
--
2.0.0
^ permalink raw reply related [flat|nested] 36+ messages in thread