* [PATCH 2/5] soreuseport: TCP/IPv4 implementation
@ 2013-01-22 19:50 Tom Herbert
2013-01-25 8:08 ` Steffen Klassert
0 siblings, 1 reply; 9+ messages in thread
From: Tom Herbert @ 2013-01-22 19:50 UTC (permalink / raw)
To: netdev, davem; +Cc: netdev, eric.dumazet
Allow multiple listener sockets to bind to the same port.
Motivation for so_reuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have its own listener socket. This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads. In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming a simple event loop:
while (1) { accept(); process() }), since wakeup does not promote
fairness among the sockets. We have seen the disproportion to be as high
as a 3:1 ratio between the thread accepting the most connections and the
one accepting the fewest. With so_reuseport the distribution is
uniform.
Signed-off-by: Tom Herbert <therbert@google.com>
---
include/net/inet_hashtables.h | 13 ++++++--
include/net/netfilter/nf_tproxy_core.h | 1 +
net/ipv4/inet_connection_sock.c | 48 ++++++++++++++++++++++++-------
net/ipv4/inet_hashtables.c | 28 ++++++++++++++----
net/ipv4/tcp_ipv4.c | 4 ++-
5 files changed, 73 insertions(+), 21 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 67a8fa0..7b2ae9d 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -81,7 +81,9 @@ struct inet_bind_bucket {
struct net *ib_net;
#endif
unsigned short port;
- signed short fastreuse;
+ signed char fastreuse;
+ signed char fastreuseport;
+ kuid_t fastuid;
int num_owners;
struct hlist_node node;
struct hlist_head owners;
@@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
extern struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr,
+ const __be16 sport,
const __be32 daddr,
const unsigned short hnum,
const int dif);
static inline struct sock *inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
+ return __inet_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, ntohs(dport), dif);
}
/* Socket demux engine toys. */
@@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif);
- return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
+ return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, hnum, dif);
}
static inline struct sock *inet_lookup(struct net *net,
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 75ca929..1937964 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
break;
case NFT_LOOKUP_LISTENER:
sk = inet_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
daddr, dport,
in->ifindex);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f0..8bb623d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
struct sock *sk2;
struct hlist_node *node;
int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
/*
* Unlike other sk lookup places we do not check
@@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2))))) {
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
+ kuid_t uid = sock_i_uid(sk);
local_bh_disable();
if (!snum) {
@@ -125,9 +131,12 @@ again:
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
@@ -185,14 +194,17 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) {
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
@@ -212,9 +224,23 @@ tb_not_found:
tb->fastreuse = 1;
else
tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else {
+ tb->fastreuseport = 0;
+ tb->fastuid = 0;
+ }
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
+ tb->fastreuseport = 0;
+ tb->fastuid = 0;
+ }
+ }
success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fa3ae81..0ce0595 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum;
tb->fastreuse = 0;
+ tb->fastreuseport = 0;
tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
@@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
__be32 rcv_saddr = inet->inet_rcv_saddr;
- score = sk->sk_family == PF_INET ? 1 : 0;
+ score = sk->sk_family == PF_INET ? 2 : 1;
if (rcv_saddr) {
if (rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -176,6 +177,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
@@ -183,17 +185,29 @@ struct sock *__inet_lookup_listener(struct net *net,
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore;
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
rcu_read_lock();
begin:
result = NULL;
- hiscore = -1;
+ hiscore = 0;
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
result = sk;
hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (((u64)phash * matches) >> 32 == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
}
}
/*
@@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_bind_bucket_for_each(tb, node, &head->chain) {
if (net_eq(ib_net(tb), net) &&
tb->port == port) {
- if (tb->fastreuse >= 0)
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
@@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
break;
}
tb->fastreuse = -1;
+ tb->fastreuseport = -1;
goto ok;
next_port:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c6ce9ca..bbbdcc5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match.
*/
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
- &tcp_hashinfo, ip_hdr(skb)->daddr,
+ &tcp_hashinfo, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
if (!sk1)
@@ -2074,6 +2075,7 @@ do_time_wait:
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo,
+ iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
--
1.7.7.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/5] soreuseport: TCP/IPv4 implementation
2013-01-22 19:50 [PATCH 2/5] soreuseport: TCP/IPv4 implementation Tom Herbert
@ 2013-01-25 8:08 ` Steffen Klassert
2013-01-26 3:40 ` Tom Herbert
0 siblings, 1 reply; 9+ messages in thread
From: Steffen Klassert @ 2013-01-25 8:08 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, davem, netdev, eric.dumazet
On Tue, Jan 22, 2013 at 11:50:24AM -0800, Tom Herbert wrote:
> - } else if (tb->fastreuse &&
> - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
> - tb->fastreuse = 0;
> + if (sk->sk_reuseport) {
> + tb->fastreuseport = 1;
> + tb->fastuid = uid;
> + } else {
> + tb->fastreuseport = 0;
> + tb->fastuid = 0;
> + }
> + } else {
> + if (tb->fastreuse &&
> + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
> + tb->fastreuse = 0;
> + if (tb->fastreuseport &&
> + (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
> + tb->fastreuseport = 0;
> + tb->fastuid = 0;
> + }
I'm getting the following compile error due to the 0 assignment
to tb->fastuid:
net/ipv4/inet_connection_sock.c: In function ‘inet_csk_get_port’:
net/ipv4/inet_connection_sock.c:232:16: error: incompatible types when assigning to type ‘kuid_t’ from type ‘int’
net/ipv4/inet_connection_sock.c:241:16: error: incompatible types when assigning to type ‘kuid_t’ from type ‘int’
CC lib/show_mem.o
CC net/ipv6/ipv6_sockglue.o
make[3]: *** [net/ipv4/inet_connection_sock.o] Error 1
make[2]: *** [net/ipv4] Error 2
I have not seen this reported so far, which surprises me a bit.
This is net-next from today compiled with
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/5] soreuseport: TCP/IPv4 implementation
2013-01-25 8:08 ` Steffen Klassert
@ 2013-01-26 3:40 ` Tom Herbert
0 siblings, 0 replies; 9+ messages in thread
From: Tom Herbert @ 2013-01-26 3:40 UTC (permalink / raw)
To: Steffen Klassert; +Cc: netdev, davem, netdev, eric.dumazet
Thanks Steffen. I'm looking at it.
Tom
On Fri, Jan 25, 2013 at 12:08 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> On Tue, Jan 22, 2013 at 11:50:24AM -0800, Tom Herbert wrote:
>> - } else if (tb->fastreuse &&
>> - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
>> - tb->fastreuse = 0;
>> + if (sk->sk_reuseport) {
>> + tb->fastreuseport = 1;
>> + tb->fastuid = uid;
>> + } else {
>> + tb->fastreuseport = 0;
>> + tb->fastuid = 0;
>> + }
>> + } else {
>> + if (tb->fastreuse &&
>> + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
>> + tb->fastreuse = 0;
>> + if (tb->fastreuseport &&
>> + (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
>> + tb->fastreuseport = 0;
>> + tb->fastuid = 0;
>> + }
>
> I'm getting the following compile error due to the 0 assignment
> to tb->fastuid:
>
> net/ipv4/inet_connection_sock.c: In function ‘inet_csk_get_port’:
> net/ipv4/inet_connection_sock.c:232:16: error: incompatible types when assigning to type ‘kuid_t’ from type ‘int’
> net/ipv4/inet_connection_sock.c:241:16: error: incompatible types when assigning to type ‘kuid_t’ from type ‘int’
> CC lib/show_mem.o
> CC net/ipv6/ipv6_sockglue.o
> make[3]: *** [net/ipv4/inet_connection_sock.o] Error 1
> make[2]: *** [net/ipv4] Error 2
>
> I have not seen this reported so far, which surprises me a bit.
> This is net-next from today compiled with
> gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/5] soreuseport: TCP/IPv4 implementation
2013-01-16 21:30 ` Tom Herbert
@ 2013-01-16 22:14 ` Vijay Subramanian
0 siblings, 0 replies; 9+ messages in thread
From: Vijay Subramanian @ 2013-01-16 22:14 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, davem, netdev, eric.dumazet
On 16 January 2013 13:30, Tom Herbert <therbert@google.com> wrote:
> I'll send updated patches today fixing the issue raised so far. You
> might want to put these on top of those?
>
Apologies for letting the mailer mangle the patch. I think I have fixed it now.
Instead of resending this version, I will wait till your patches are
applied and send a version on top of those.
Thanks,
Vijay
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/5] soreuseport: TCP/IPv4 implementation
2013-01-16 21:09 ` Vijay Subramanian
2013-01-16 21:10 ` David Miller
@ 2013-01-16 21:30 ` Tom Herbert
2013-01-16 22:14 ` Vijay Subramanian
1 sibling, 1 reply; 9+ messages in thread
From: Tom Herbert @ 2013-01-16 21:30 UTC (permalink / raw)
To: Vijay Subramanian; +Cc: netdev, davem, netdev, eric.dumazet
I'll send updated patches today fixing the issue raised so far. You
might want to put these on top of those?
On Wed, Jan 16, 2013 at 1:09 PM, Vijay Subramanian
<subramanian.vijay@gmail.com> wrote:
>
>
>> * Unlike other sk lookup places we do not check
>> @@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
>> (!sk->sk_bound_dev_if ||
>> !sk2->sk_bound_dev_if ||
>> sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
>> - if (!reuse || !sk2->sk_reuse ||
>> - sk2->sk_state == TCP_LISTEN) {
>> + if ((!reuse || !sk2->sk_reuse ||
>> + sk2->sk_state == TCP_LISTEN) &&
>> + (!reuseport || !sk2->sk_reuseport ||
>> + (sk2->sk_state != TCP_TIME_WAIT &&
>> + uid != sock_i_uid(sk2)))) {
>> const __be32 sk2_rcv_saddr =
>> sk_rcv_saddr(sk2);
>> if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
>> sk2_rcv_saddr == sk_rcv_saddr(sk))
>
>
>
> How about introducing some helper functions to make inet_csk_bind_conflict()
> and inet6_csk_bind_conflict() more readable, as in the patch below? We can add
> another test for reuseport() for the soreuseport patches.
>
> udp.c already has ipv4_rcv_saddr_equal(), but it seems to call
> ipv6_only_sock() and not inet_v6_ipv6only(), which is needed in
> inet{6}_csk_bind_conflict(). So I added sk_rcv_saddr_equal().
>
> Also the bind_conflict functions can return bool instead of int (not
> implemented in patch below).
>
>
> If patch idea below is ok, I will send it officially.
>
> Thanks,
> Vijay
>
>
> diff --git a/include/net/inet_connection_sock.h
> b/include/net/inet_connection_sock.h
> index 1832927..c15d2eb 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -22,6 +22,8 @@
>
> #include <net/inet_sock.h>
> #include <net/request_sock.h>
> +#include <net/tcp_states.h>
> +#include <net/inet_timewait_sock.h>
>
> #define INET_CSK_DEBUG 1
>
> @@ -205,6 +207,37 @@ static inline void inet_csk_clear_xmit_timer(struct
> sock *sk, const int what)
> #endif
> }
>
> +/* The port cannot be reused if the older socket is in LISTEN state or if
> + * either the old or new one does not allow reuse
> + */
> +static inline bool sk_reuse_equal(int reuse, const struct sock *sk2)
> +{
> + return !reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN;
> +}
> +
> +/* The port cannot be reused if both sockets are bound to the same device
> or
> + * if either one is not bound
> + */
> +static inline bool sk_bound_dev_equal(const struct sock *sk,
> + const struct sock *sk2)
> +{
> + return !sk->sk_bound_dev_if || !sk2->sk_bound_dev_if ||
> + sk->sk_bound_dev_if == sk2->sk_bound_dev_if;
> +}
> +
> +/* The port cannot be reused if both sockets have the same rcv_saddr
> + * or if either rcv_saddr is NULL
> + */
> +static inline bool sk_rcv_saddr_equal(const struct sock *sk1,
> + const struct sock *sk2)
> +{
> + __be32 sk1_rcv_saddr = sk_rcv_saddr(sk1);
> + __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
> +
> + return !sk2_rcv_saddr || !sk1_rcv_saddr ||
> + sk2_rcv_saddr == sk1_rcv_saddr;
> +}
> +
> /*
> * Reset the retransmission timer
> */
> diff --git a/net/ipv4/inet_connection_sock.c
> b/net/ipv4/inet_connection_sock.c
> index d0670f0..375cca3 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -68,26 +68,15 @@ int inet_csk_bind_conflict(const struct sock *sk,
> */
>
> sk_for_each_bound(sk2, node, &tb->owners) {
> - if (sk != sk2 &&
> - !inet_v6_ipv6only(sk2) &&
> - (!sk->sk_bound_dev_if ||
> - !sk2->sk_bound_dev_if ||
>
> - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> - if (!reuse || !sk2->sk_reuse ||
> - sk2->sk_state == TCP_LISTEN) {
> - const __be32 sk2_rcv_saddr =
> sk_rcv_saddr(sk2);
> - if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
> - sk2_rcv_saddr == sk_rcv_saddr(sk))
> - break;
> - }
> - if (!relax && reuse && sk2->sk_reuse &&
> - sk2->sk_state != TCP_LISTEN) {
> - const __be32 sk2_rcv_saddr =
> sk_rcv_saddr(sk2);
> + if (sk != sk2 && !inet_v6_ipv6only(sk2) &&
> + sk_bound_dev_equal(sk, sk2)) {
> + if (sk_reuse_equal(reuse, sk2) &&
> + sk_rcv_saddr_equal(sk, sk2))
> + break;
>
> - if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
> - sk2_rcv_saddr == sk_rcv_saddr(sk))
> - break;
> - }
> + if (!relax && sk_reuse_equal(reuse, sk2) &&
> + sk_rcv_saddr_equal(sk, sk2))
> + break;
> }
> }
> return node != NULL;
> diff --git a/net/ipv6/inet6_connection_sock.c
> b/net/ipv6/inet6_connection_sock.c
> index 3064785..8ebe20d 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -39,13 +39,9 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> * vs net namespaces issues.
> */
> sk_for_each_bound(sk2, node, &tb->owners) {
> - if (sk != sk2 &&
> - (!sk->sk_bound_dev_if ||
> - !sk2->sk_bound_dev_if ||
> - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> - (!sk->sk_reuse || !sk2->sk_reuse ||
> - sk2->sk_state == TCP_LISTEN) &&
> - ipv6_rcv_saddr_equal(sk, sk2))
> + if (sk != sk2 && sk_bound_dev_equal(sk, sk2) &&
> + sk_reuse_equal(sk->sk_reuse, sk2) &&
> + ipv6_rcv_saddr_equal(sk, sk2))
> break;
> }
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/5] soreuseport: TCP/IPv4 implementation
2013-01-16 21:09 ` Vijay Subramanian
@ 2013-01-16 21:10 ` David Miller
2013-01-16 21:30 ` Tom Herbert
1 sibling, 0 replies; 9+ messages in thread
From: David Miller @ 2013-01-16 21:10 UTC (permalink / raw)
To: subramanian.vijay; +Cc: therbert, netdev, netdev, eric.dumazet
From: Vijay Subramanian <subramanian.vijay@gmail.com>
Date: Wed, 16 Jan 2013 13:09:10 -0800 (PST)
>
> +/* The port cannot be reused if the older socket is in LISTEN state
> or if
> + * either the old or new one does not allow reuse
> + */
Please post patches without them being destroyed by your mailer.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/5] soreuseport: TCP/IPv4 implementation
2013-01-14 20:00 Tom Herbert
2013-01-14 20:20 ` YOSHIFUJI Hideaki
@ 2013-01-16 21:09 ` Vijay Subramanian
2013-01-16 21:10 ` David Miller
2013-01-16 21:30 ` Tom Herbert
1 sibling, 2 replies; 9+ messages in thread
From: Vijay Subramanian @ 2013-01-16 21:09 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, davem, netdev, eric.dumazet
> * Unlike other sk lookup places we do not check
> @@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
> (!sk->sk_bound_dev_if ||
> !sk2->sk_bound_dev_if ||
> sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> - if (!reuse || !sk2->sk_reuse ||
> - sk2->sk_state == TCP_LISTEN) {
> + if ((!reuse || !sk2->sk_reuse ||
> + sk2->sk_state == TCP_LISTEN) &&
> + (!reuseport || !sk2->sk_reuseport ||
> + (sk2->sk_state != TCP_TIME_WAIT &&
> + uid != sock_i_uid(sk2)))) {
> const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
> if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
> sk2_rcv_saddr == sk_rcv_saddr(sk))
How about introducing some helper functions to make
inet_csk_bind_conflict() and inet6_csk_bind_conflict() more readable, as in
the patch below? We can add another test for reuseport() for the soreuseport
patches.
udp.c already has ipv4_rcv_saddr_equal(), but it seems to call
ipv6_only_sock() and not inet_v6_ipv6only(), which is needed in
inet{6}_csk_bind_conflict(). So I added sk_rcv_saddr_equal().
Also the bind_conflict functions can return bool instead of int (not
implemented in patch below).
If patch idea below is ok, I will send it officially.
Thanks,
Vijay
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 1832927..c15d2eb 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -22,6 +22,8 @@
#include <net/inet_sock.h>
#include <net/request_sock.h>
+#include <net/tcp_states.h>
+#include <net/inet_timewait_sock.h>
#define INET_CSK_DEBUG 1
@@ -205,6 +207,37 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
#endif
}
+/* The port cannot be reused if the older socket is in LISTEN state or if
+ * either the old or new one does not allow reuse
+ */
+static inline bool sk_reuse_equal(int reuse, const struct sock *sk2)
+{
+ return !reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN;
+}
+
+/* The port cannot be reused if both sockets are bound to the same device or
+ * if either one is not bound
+ */
+static inline bool sk_bound_dev_equal(const struct sock *sk,
+ const struct sock *sk2)
+{
+ return !sk->sk_bound_dev_if || !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if;
+}
+
+/* The port cannot be reused if both sockets have the same rcv_saddr
+ * or if either rcv_saddr is NULL
+ */
+static inline bool sk_rcv_saddr_equal(const struct sock *sk1,
+ const struct sock *sk2)
+{
+ __be32 sk1_rcv_saddr = sk_rcv_saddr(sk1);
+ __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
+
+ return !sk2_rcv_saddr || !sk1_rcv_saddr ||
+ sk2_rcv_saddr == sk1_rcv_saddr;
+}
+
/*
* Reset the retransmission timer
*/
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f0..375cca3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -68,26 +68,15 @@ int inet_csk_bind_conflict(const struct sock *sk,
*/
sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- !inet_v6_ipv6only(sk2) &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
- const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
- sk2_rcv_saddr == sk_rcv_saddr(sk))
- break;
- }
- if (!relax && reuse && sk2->sk_reuse &&
- sk2->sk_state != TCP_LISTEN) {
- const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
+ if (sk != sk2 && !inet_v6_ipv6only(sk2) &&
+ sk_bound_dev_equal(sk, sk2)) {
+ if (sk_reuse_equal(reuse, sk2) &&
+ sk_rcv_saddr_equal(sk, sk2))
+ break;
- if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
- sk2_rcv_saddr == sk_rcv_saddr(sk))
- break;
- }
+ if (!relax && sk_reuse_equal(reuse, sk2) &&
+ sk_rcv_saddr_equal(sk, sk2))
+ break;
}
}
return node != NULL;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 3064785..8ebe20d 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -39,13 +39,9 @@ int inet6_csk_bind_conflict(const struct sock *sk,
* vs net namespaces issues.
*/
sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
- (!sk->sk_reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- ipv6_rcv_saddr_equal(sk, sk2))
+ if (sk != sk2 && sk_bound_dev_equal(sk, sk2) &&
+ sk_reuse_equal(sk->sk_reuse, sk2) &&
+ ipv6_rcv_saddr_equal(sk, sk2))
break;
}
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/5] soreuseport: TCP/IPv4 implementation
2013-01-14 20:00 Tom Herbert
@ 2013-01-14 20:20 ` YOSHIFUJI Hideaki
2013-01-16 21:09 ` Vijay Subramanian
1 sibling, 0 replies; 9+ messages in thread
From: YOSHIFUJI Hideaki @ 2013-01-14 20:20 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, davem, netdev, eric.dumazet
Tom Herbert wrote:
> static inline struct sock *inet_lookup_listener(struct net *net,
> struct inet_hashinfo *hashinfo,
> + __be32 saddr, __be16 sport,
> __be32 daddr, __be16 dport, int dif)
> {
> - return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
> + return __inet_lookup_listener(net, hashinfo, saddr, sport,
> + daddr, ntohs(dport), dif);
> }
>
> /* Socket demux engine toys. */
> @@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
> struct sock *sk = __inet_lookup_established(net, hashinfo,
> saddr, sport, daddr, hnum, dif);
>
> - return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
> + return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
> + daddr, hnum, dif);
> }
>
Please fix indent.
--yoshfuji
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/5] soreuseport: TCP/IPv4 implementation
@ 2013-01-14 20:00 Tom Herbert
2013-01-14 20:20 ` YOSHIFUJI Hideaki
2013-01-16 21:09 ` Vijay Subramanian
0 siblings, 2 replies; 9+ messages in thread
From: Tom Herbert @ 2013-01-14 20:00 UTC (permalink / raw)
To: netdev, davem; +Cc: netdev, eric.dumazet
Allow multiple listener sockets to bind to the same port.
Motivation for so_reuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have its own listener socket. This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads. In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming a simple event loop:
while (1) { accept(); process() }), since wakeup does not promote
fairness among the sockets. We have seen the disproportion to be as high
as a 3:1 ratio between the thread accepting the most connections and the
one accepting the fewest. With so_reuseport the distribution is
uniform.
Signed-off-by: Tom Herbert <therbert@google.com>
---
include/net/inet_hashtables.h | 13 ++++++++--
net/ipv4/inet_connection_sock.c | 48 ++++++++++++++++++++++++++++++---------
net/ipv4/inet_hashtables.c | 28 +++++++++++++++++-----
net/ipv4/tcp_ipv4.c | 4 ++-
4 files changed, 72 insertions(+), 21 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 67a8fa0..2968c8f 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -81,7 +81,9 @@ struct inet_bind_bucket {
struct net *ib_net;
#endif
unsigned short port;
- signed short fastreuse;
+ signed char fastreuse;
+ signed char fastreuseport;
+ int fastuid;
int num_owners;
struct hlist_node node;
struct hlist_head owners;
@@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
extern struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr,
+ const __be16 sport,
const __be32 daddr,
const unsigned short hnum,
const int dif);
static inline struct sock *inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
+ return __inet_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, ntohs(dport), dif);
}
/* Socket demux engine toys. */
@@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif);
- return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
+ return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, hnum, dif);
}
static inline struct sock *inet_lookup(struct net *net,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f0..c2ce445 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
struct sock *sk2;
struct hlist_node *node;
int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ int uid = sock_i_uid((struct sock *)sk);
/*
* Unlike other sk lookup places we do not check
@@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ uid != sock_i_uid(sk2)))) {
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
+ int uid = sock_i_uid(sk);
local_bh_disable();
if (!snum) {
@@ -125,9 +131,12 @@ again:
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ tb->fastuid == uid)) &&
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
@@ -185,14 +194,17 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && tb->fastuid == uid)) &&
smallest_size == -1) {
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (sk->sk_reuseport && tb->fastuid == uid)) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
@@ -212,9 +224,23 @@ tb_not_found:
tb->fastreuse = 1;
else
tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else {
+ tb->fastreuseport = 0;
+ tb->fastuid = 0;
+ }
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || tb->fastuid != uid)) {
+ tb->fastreuseport = 0;
+ tb->fastuid = 0;
+ }
+ }
success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fa3ae81..491cb85 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum;
tb->fastreuse = 0;
+ tb->fastreuseport = 0;
tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
@@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
__be32 rcv_saddr = inet->inet_rcv_saddr;
- score = sk->sk_family == PF_INET ? 1 : 0;
+ score = sk->sk_family == PF_INET ? 2 : 1;
if (rcv_saddr) {
if (rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -176,6 +177,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
@@ -183,17 +185,29 @@ struct sock *__inet_lookup_listener(struct net *net,
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore;
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
rcu_read_lock();
begin:
result = NULL;
- hiscore = -1;
+ hiscore = 0;
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
result = sk;
hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (((u64)phash * matches) >> 32 == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
}
}
/*
@@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_bind_bucket_for_each(tb, node, &head->chain) {
if (net_eq(ib_net(tb), net) &&
tb->port == port) {
- if (tb->fastreuse >= 0)
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
@@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
break;
}
tb->fastreuse = -1;
+ tb->fastreuseport = -1;
goto ok;
next_port:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c6ce9ca..bbbdcc5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match.
*/
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
- &tcp_hashinfo, ip_hdr(skb)->daddr,
+ &tcp_hashinfo, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
if (!sk1)
@@ -2074,6 +2075,7 @@ do_time_wait:
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo,
+ iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
--
1.7.7.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2013-01-26 3:40 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-01-22 19:50 [PATCH 2/5] soreuseport: TCP/IPv4 implementation Tom Herbert
2013-01-25 8:08 ` Steffen Klassert
2013-01-26 3:40 ` Tom Herbert
-- strict thread matches above, loose matches on Subject: below --
2013-01-14 20:00 Tom Herbert
2013-01-14 20:20 ` YOSHIFUJI Hideaki
2013-01-16 21:09 ` Vijay Subramanian
2013-01-16 21:10 ` David Miller
2013-01-16 21:30 ` Tom Herbert
2013-01-16 22:14 ` Vijay Subramanian
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.