netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
@ 2015-09-27  0:30 Tolga Ceylan
  2015-09-27  1:04 ` Eric Dumazet
  2015-09-27  1:44 ` Aaron Conole
  0 siblings, 2 replies; 61+ messages in thread
From: Tolga Ceylan @ 2015-09-27  0:30 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Tolga Ceylan

For applications using SO_REUSEPORT listeners, there is
no clean way to switch traffic on/off or add/remove
listeners without dropping pending connections. With this
patch, applications can turn off queueing of new connections
for a specific listener socket which enables implementation of
zero down time server applications.

For example, a popular web server nginx handles application
configuration changes by forking new processes (listeners)
and waiting for old processes (listeners) to finish up their
processing. However, this approach is disruptive as removal
of a listener will drop pending connections for that listener.
Instead, with this patch, nginx can maintain two sets of listener
socket pools to be used by old/new processes and switch traffic off/on
using this socket option. Old processes set this socket option
to drain their existing queues.

Tested on a x86_64 kernel.

Signed-off-by: Tolga Ceylan <tolga.ceylan@gmail.com>
---
 arch/alpha/include/uapi/asm/socket.h   | 2 ++
 arch/avr32/include/uapi/asm/socket.h   | 2 ++
 arch/frv/include/uapi/asm/socket.h     | 2 ++
 arch/ia64/include/uapi/asm/socket.h    | 2 ++
 arch/m32r/include/uapi/asm/socket.h    | 2 ++
 arch/mips/include/uapi/asm/socket.h    | 2 ++
 arch/mn10300/include/uapi/asm/socket.h | 2 ++
 arch/parisc/include/uapi/asm/socket.h  | 2 ++
 arch/powerpc/include/uapi/asm/socket.h | 2 ++
 arch/sparc/include/uapi/asm/socket.h   | 2 ++
 include/net/sock.h                     | 3 +++
 include/uapi/asm-generic/socket.h      | 2 ++
 net/core/sock.c                        | 3 +++
 net/ipv4/inet_hashtables.c             | 5 ++++-
 14 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 9a20821..d2ad268 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 2b65ed6..6b6d0af 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index 4823ad1..23d6b82 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -85,5 +85,7 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 59be3d8..c3d5ada 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 7bc4cb2..602f4b4 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index dec3c85..e0880e2 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -103,4 +103,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index cab7d6d..d60f747 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index a5cd40c..0ffa8de 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -84,4 +84,6 @@
 #define SO_ATTACH_BPF		0x402B
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index c046666..6935839 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index e6a16c4..e5ecf16 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -81,6 +81,8 @@
 #define SO_ATTACH_BPF		0x0034
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 0x0035
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/include/net/sock.h b/include/net/sock.h
index 94dff7f..ebb3c08 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -142,6 +142,7 @@ typedef __u64 __bitwise __addrpair;
  *	@skc_state: Connection state
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_reuseport: %SO_REUSEPORT setting
+ *	@skc_reuseport_listen_off: %SO_REUSEPORT_LISTEN_OFF setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -183,6 +184,7 @@ struct sock_common {
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse:4;
 	unsigned char		skc_reuseport:1;
+	unsigned char       skc_reuseport_listen_off:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
 	int			skc_bound_dev_if;
@@ -322,6 +324,7 @@ struct sock {
 #define sk_state		__sk_common.skc_state
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_reuseport		__sk_common.skc_reuseport
+#define sk_reuseport_listen_off		__sk_common.skc_reuseport_listen_off
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 5c15c2a..ed22ee4 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -87,4 +87,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 3307c02..5861513 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -714,6 +714,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	case SO_REUSEPORT:
 		sk->sk_reuseport = valbool;
 		break;
+	case SO_REUSEPORT_LISTEN_OFF:
+		sk->sk_reuseport_listen_off = valbool;
+		break;
 	case SO_TYPE:
 	case SO_PROTOCOL:
 	case SO_DOMAIN:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8912019..59e8540 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -224,10 +224,13 @@ begin:
 				phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
 				matches = 1;
+				if (sk->sk_reuseport_listen_off)
+					result = NULL;
 			}
 		} else if (score == hiscore && reuseport) {
 			matches++;
-			if (reciprocal_scale(phash, matches) == 0)
+			if (reciprocal_scale(phash, matches) == 0 &&
+			    !sk->sk_reuseport_listen_off)
 				result = sk;
 			phash = next_pseudo_random32(phash);
 		}
-- 
2.5.3

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-09-27  0:30 [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode Tolga Ceylan
@ 2015-09-27  1:04 ` Eric Dumazet
  2015-09-27  1:37   ` Tolga Ceylan
  2015-09-27  1:44 ` Aaron Conole
  1 sibling, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-09-27  1:04 UTC (permalink / raw)
  To: Tolga Ceylan; +Cc: David S. Miller, netdev

On Sat, 2015-09-26 at 17:30 -0700, Tolga Ceylan wrote:
> For applications using SO_REUSEPORT listeners, there is
> no clean way to switch traffic on/off or add/remove
> listeners without dropping pending connections. With this
> patch, applications can turn off queueing of new connections
> for a specific listener socket which enables implementation of
> zero down time server applications.
> 
> For example, a popular web server nginx handles application
> configuration changes by forking new processes (listeners)
> and waiting for old processes (listeners) to finish up their
> processing. However, this approach is disruptive as removal
> of a listener will drop pending connections for that listener.
> Instead, with this patch, nginx can maintain two sets of listener
> socket pools to be used by old/new processes and switch traffic off/on
> using this socket option. Old processes set this socket option
> to drain their existing queues.

What about listen(fd, 0) ?

Not sure we need to add a new socket option.

It makes sense to extend reuseport logic to ignore listeners with a 0
backlog (if not already done, I did not check)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-09-27  1:04 ` Eric Dumazet
@ 2015-09-27  1:37   ` Tolga Ceylan
  0 siblings, 0 replies; 61+ messages in thread
From: Tolga Ceylan @ 2015-09-27  1:37 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

On Sat, Sep 26, 2015 at 6:04 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> What about listen(fd, 0) ?
>
> Not sure we need to add a new socket option.
>
> It makes sense to extend reuseport logic to ignore listeners with a 0
> backlog (if not already done, I did not check)
>
>

Just checked this: no, listen(fd, 0) still schedules new
connections despite the zero accept backlog for a reuseport
socket. I'll resubmit this patch with your suggestion.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-09-27  0:30 [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode Tolga Ceylan
  2015-09-27  1:04 ` Eric Dumazet
@ 2015-09-27  1:44 ` Aaron Conole
  2015-09-27  2:02   ` Tolga Ceylan
  1 sibling, 1 reply; 61+ messages in thread
From: Aaron Conole @ 2015-09-27  1:44 UTC (permalink / raw)
  To: Tolga Ceylan; +Cc: David S. Miller, netdev

Greetings.

Tolga Ceylan <tolga.ceylan@gmail.com> writes:
> +#define SO_REUSEPORT_LISTEN_OFF 51
> +
For all of these, I think the space should be tab.

> 	unsigned char		skc_reuseport:1;
>+	unsigned char       skc_reuseport_listen_off:1;
> 	unsigned char		skc_ipv6only:1;
The spacing here is wrong.

> @@ -224,10 +224,13 @@ begin:
>  				phash = inet_ehashfn(net, daddr, hnum,
>  						     saddr, sport);
>  				matches = 1;
> +				if (sk->sk_reuseport_listen_off)
> +					result = NULL;
I am concerned here. I think you need to reset hiscore and matches as
well, not just result.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-09-27  1:44 ` Aaron Conole
@ 2015-09-27  2:02   ` Tolga Ceylan
  2015-09-27  2:24     ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Tolga Ceylan @ 2015-09-27  2:02 UTC (permalink / raw)
  To: Aaron Conole; +Cc: David S. Miller, netdev

On Sat, Sep 26, 2015 at 6:44 PM, Aaron Conole <aconole@bytheb.org> wrote:
> Greetings.
>
> Tolga Ceylan <tolga.ceylan@gmail.com> writes:
>> +#define SO_REUSEPORT_LISTEN_OFF 51
>> +
> For all of these, I think the space should be tab.
>
>>       unsigned char           skc_reuseport:1;
>>+      unsigned char       skc_reuseport_listen_off:1;
>>       unsigned char           skc_ipv6only:1;
> The spacing here is wrong.
>
>> @@ -224,10 +224,13 @@ begin:
>>                               phash = inet_ehashfn(net, daddr, hnum,
>>                                                    saddr, sport);
>>                               matches = 1;
>> +                             if (sk->sk_reuseport_listen_off)
>> +                                     result = NULL;
> I am concerned here. I think you need to reset hiscore and matches as
> well, not just result.

By keeping hiscore/matches as is, I'm trying to keep the hashing consistent.
Otherwise, this would behave similar to removing a listener which
drops connections.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-09-27  2:02   ` Tolga Ceylan
@ 2015-09-27  2:24     ` Eric Dumazet
  2015-11-11  5:41       ` Tom Herbert
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-09-27  2:24 UTC (permalink / raw)
  To: Tolga Ceylan; +Cc: Aaron Conole, David S. Miller, netdev

On Sat, 2015-09-26 at 19:02 -0700, Tolga Ceylan wrote:
> By keeping hiscore/matches as is, I'm trying to keep the hashing consistent.
> Otherwise, this would behave similar to removing a listener which
> drops connections.

Right, this problem will soon disappear when listener rewrite is
complete.

Only SYN packet will have to select a listener.

Then when ACK packet comes, the SYN_RECV will be found in ehash table,
and req->rsk_listener will be used to get the listener that was chosen
at SYN time.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-09-27  2:24     ` Eric Dumazet
@ 2015-11-11  5:41       ` Tom Herbert
  2015-11-11  6:19         ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Tom Herbert @ 2015-11-11  5:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

Tolga, are you still planning to respin this patch (when tree opens?)

Thanks,
Tom


On Sat, Sep 26, 2015 at 7:24 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Sat, 2015-09-26 at 19:02 -0700, Tolga Ceylan wrote:
>> By keeping hiscore/matches as is, I'm trying to keep the hashing consistent.
>> Otherwise, this would behave similar to removing a listener which
>> drops connections.
>
> Right, this problem will soon disappear when listener rewrite is
> complete.
>
> Only SYN packet will have to select a listener.
>
> Then when ACK packet comes, the SYN_RECV will be found in ehash table,
> and req->rsk_listener will be used to get the listener that was chosen
> at SYN time.
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-11-11  5:41       ` Tom Herbert
@ 2015-11-11  6:19         ` Eric Dumazet
  2015-11-11 17:05           ` Tom Herbert
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-11-11  6:19 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, 2015-11-10 at 21:41 -0800, Tom Herbert wrote:
> Tolga, are you still planning to respin this patch (when tree opens?)

I was planning to add an union on skc_tx_queue_mapping and
sk_max_ack_backlog, so that adding a check on sk_max_ack_backlog in
listener lookup would not add an additional cache line miss.

This would remove false sharing because sk_ack_backlog is often dirtied
when a socket is added into accept queue.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-11-11  6:19         ` Eric Dumazet
@ 2015-11-11 17:05           ` Tom Herbert
  2015-11-11 17:23             ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Tom Herbert @ 2015-11-11 17:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, Nov 10, 2015 at 10:19 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2015-11-10 at 21:41 -0800, Tom Herbert wrote:
>> Tolga, are you still planning to respin this patch (when tree opens?)
>
> I was planning to add an union on skc_tx_queue_mapping and
> sk_max_ack_backlog, so that adding a check on sk_max_ack_backlog in
> listener lookup would not add an additional cache line miss.
>
> This would remove false sharing because sk_ack_backlog is often dirtied
> when a socket is added into accept queue.
>
That's sounds like good fixes, but my question was more about the
problem originally described by Tolga where we are transitioning
processing for a listener port from one process to another. I think
the conclusion in this thread was to modify the code so that
listen(fd, 0) would stop new connections from being assigned to a
socket (as opposed to explicit SO_REUSEPORT_LISTEN_OFF option). Does
this still seem reasonable?

Tom

>
>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-11-11 17:05           ` Tom Herbert
@ 2015-11-11 17:23             ` Eric Dumazet
  2015-11-11 18:23               ` Tom Herbert
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-11-11 17:23 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Wed, 2015-11-11 at 09:05 -0800, Tom Herbert wrote:
> On Tue, Nov 10, 2015 at 10:19 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Tue, 2015-11-10 at 21:41 -0800, Tom Herbert wrote:
> >> Tolga, are you still planning to respin this patch (when tree opens?)
> >
> > I was planning to add an union on skc_tx_queue_mapping and
> > sk_max_ack_backlog, so that adding a check on sk_max_ack_backlog in
> > listener lookup would not add an additional cache line miss.
> >
> > This would remove false sharing because sk_ack_backlog is often dirtied
> > when a socket is added into accept queue.
> >
> That's sounds like good fixes, but my question was more about the
> problem originally described by Tolga where we are transitioning
> processing for a listener port from one process to another. I think
> the conclusion in this thread was to modify the code so that
> listen(fd, 0) would stop new connections from being assigned to a
> socket (as opposed to explicit SO_REUSEPORT_LISTEN_OFF option). Does
> this still seem reasonable?

Actually listen(fd, 0) is not going to work well :

For request_sock that were created (by incoming SYN packet) before this
listen(fd, 0) call, the 3rd packet (ACK coming from client) would not be
able to create a child attached to this listener.

sk_acceptq_is_full() test in tcp_v4_syn_recv_sock() would simply drop
the thing.

I was mainly objecting adding yet another socket option.

Maybe setsockopt(... SO_REUSEPORT, &off, sizeof(off)) could detect the
condition automatically ?

(I am not sure current behavior of setting sk->sk_reuseport = valbool;
is correct if valbool==0 and current sk_reuseport is 1)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-11-11 17:23             ` Eric Dumazet
@ 2015-11-11 18:23               ` Tom Herbert
  2015-11-11 18:43                 ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Tom Herbert @ 2015-11-11 18:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Wed, Nov 11, 2015 at 9:23 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2015-11-11 at 09:05 -0800, Tom Herbert wrote:
>> On Tue, Nov 10, 2015 at 10:19 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > On Tue, 2015-11-10 at 21:41 -0800, Tom Herbert wrote:
>> >> Tolga, are you still planning to respin this patch (when tree opens?)
>> >
>> > I was planning to add an union on skc_tx_queue_mapping and
>> > sk_max_ack_backlog, so that adding a check on sk_max_ack_backlog in
>> > listener lookup would not add an additional cache line miss.
>> >
>> > This would remove false sharing because sk_ack_backlog is often dirtied
>> > when a socket is added into accept queue.
>> >
>> That's sounds like good fixes, but my question was more about the
>> problem originally described by Tolga where we are transitioning
>> processing for a listener port from one process to another. I think
>> the conclusion in this thread was to modify the code so that
>> listen(fd, 0) would stop new connections from being assigned to a
>> socket (as opposed to explicit SO_REUSEPORT_LISTEN_OFF option). Does
>> this still seem reasonable?
>
> Actually listen(fd, 0) is not going to work well :
>
> For request_sock that were created (by incoming SYN packet) before this
> listen(fd, 0) call, the 3rd packet (ACK coming from client) would not be
> able to create a child attached to this listener.
>
> sk_acceptq_is_full() test in tcp_v4_syn_recv_sock() would simply drop
> the thing.
>
> I was mainly objecting adding yet another socket option.
>
> Maybe setsockopt(... SO_REUSEPORT, &off, sizeof(off)) could detect the
> condition automatically ?
>
How about doing this in shutdown called for a listener?

> (I am not sure current behavior of setting sk->sk_reuseport = valbool;
> is correct if valbool==0 and current sk_reuseport is 1)
>
>
>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-11-11 18:23               ` Tom Herbert
@ 2015-11-11 18:43                 ` Eric Dumazet
  2015-11-12  1:09                   ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-11-11 18:43 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Wed, 2015-11-11 at 10:23 -0800, Tom Herbert wrote:

> How about doing this in shutdown called for a listener?

Seems a good idea, I will try it, thanks !

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-11-11 18:43                 ` Eric Dumazet
@ 2015-11-12  1:09                   ` Eric Dumazet
  2015-12-15 16:14                     ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-11-12  1:09 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Wed, 2015-11-11 at 10:43 -0800, Eric Dumazet wrote:
> On Wed, 2015-11-11 at 10:23 -0800, Tom Herbert wrote:
> 
> > How about doing this in shutdown called for a listener?
> 
> Seems a good idea, I will try it, thanks !
> 

Arg, I forgot about this shutdown() discussion we had recently
with Oracle guys.

It is currently used in linux to unblock potential threads in accept()
system call.

This would prevent syn_recv sockets to be finally accepted.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-11-12  1:09                   ` Eric Dumazet
@ 2015-12-15 16:14                     ` Willy Tarreau
  2015-12-15 17:10                       ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-15 16:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

[-- Attachment #1: Type: text/plain, Size: 1913 bytes --]

Hi Eric,

On Wed, Nov 11, 2015 at 05:09:01PM -0800, Eric Dumazet wrote:
> On Wed, 2015-11-11 at 10:43 -0800, Eric Dumazet wrote:
> > On Wed, 2015-11-11 at 10:23 -0800, Tom Herbert wrote:
> > 
> > > How about doing this in shutdown called for a listener?
> > 
> > Seems a good idea, I will try it, thanks !
> > 
> 
> Arg, I forgot about this shutdown() discussion we had recently
> with Oracle guys.
> 
> It is currently used in linux to unblock potential threads in accept()
> system call.
> 
> This would prevent syn_recv sockets to be finally accepted.

I had a conversation with an haproxy user who's concerned with the
connection drops during the reload operation and we stumbled upon
this thread. I was considering improving shutdown() as well for this
as haproxy already performs a shutdown(RD) during a "pause" operation
(ie: workaround for kernels missing SO_REUSEPORT).

And I found that the code clearly doesn't make this possible since
shutdown(RD) flushes the queue and stops the listening.

However I found what I consider an elegant solution which works
pretty well : by simply adding a test in compute_score(), we can
ensure that a previous socket ranks lower than the current ones,
and is never considered as long as the new ones are present. Here I
achieved this using setsockopt(SO_LINGER). The old process just has
to do this with a non-zero value on the socket it wants to put into
lingering mode and that's all.

I find this elegant since it keeps the same semantics as for a
connected socket in that it avoids killing the queue, and that it
doesn't change the behaviour for existing applications. It just
turns out that listening sockets are set up without any lingering
by default so we don't need to add any new socket options nor
anything.

Please let me know what you think about it (patch attached), if
it's accepted it's trivial to adapt haproxy to this new behaviour.

Thanks!
Willy


[-- Attachment #2: 0001-net-make-lingering-sockets-score-less-in-compute_sco.patch --]
[-- Type: text/plain, Size: 2494 bytes --]

>From 7b79e362479fa7084798e6aa41da2a2045f0d6bb Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 15 Dec 2015 16:40:00 +0100
Subject: net: make lingering sockets score less in compute_score()

When multiple processes use SO_REUSEPORT for a seamless restart
operation, there's a tiny window during which both the old and the new
process are bound to the same port, and there's no way for the old
process to gracefully stop receiving connections without dropping
the few that are left in the queue between the last poll() and the
shutdown() or close() operation.

Incoming connections are distributed between multiple listening sockets
in inet_lookup_listener() according to multiple criteria. The first
criterion is a score based on a number of attributes for each socket,
then a hash computation ensures that the connections are evenly
distributed between sockets of equal weight.

This patch provides an elegant approach by which the old process can
simply decrease its score by setting the lingering time to non-zero
on its listening socket. Thus, the sockets from the new process
(which start without any lingering) always score higher and are always
preferred.

The old process can then safely drain incoming connections and stop
after meeting the -1 EAGAIN condition, as shown in the example below :

         process A (old one)    |  process B (new one)
                                |
          listen() >= 0         |
          ...                   |
          accept()              |
          ...                   |
          ...                   |  listen()

       From now on, both processes receive incoming connections

          ...                   |  kill(process A, go_away)
          setsockopt(SO_LINGER) |  accept() >= 0

       Here process A stops receiving new connections

          accept() >= 0         |  accept() >= 0
          ...                   |
          accept() = -1 EAGAIN  |  accept() >= 0
          close()               |
          exit()                |

Signed-off-by: Willy Tarreau <w@1wt.eu>
---
 net/ipv4/inet_hashtables.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c6fb80b..473b16f 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -191,6 +191,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			score += 4;
 		}
 	}
+	if (!sock_flag(sk, SOCK_LINGER))
+		score++;
 	return score;
 }
 
-- 
1.7.12.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-15 16:14                     ` Willy Tarreau
@ 2015-12-15 17:10                       ` Eric Dumazet
  2015-12-15 17:43                         ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-12-15 17:10 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, 2015-12-15 at 17:14 +0100, Willy Tarreau wrote:
> Hi Eric,
> 
> On Wed, Nov 11, 2015 at 05:09:01PM -0800, Eric Dumazet wrote:
> > On Wed, 2015-11-11 at 10:43 -0800, Eric Dumazet wrote:
> > > On Wed, 2015-11-11 at 10:23 -0800, Tom Herbert wrote:
> > > 
> > > > How about doing this in shutdown called for a listener?
> > > 
> > > Seems a good idea, I will try it, thanks !
> > > 
> > 
> > Arg, I forgot about this shutdown() discussion we had recently
> > with Oracle guys.
> > 
> > It is currently used in linux to unblock potential threads in accept()
> > system call.
> > 
> > This would prevent syn_recv sockets to be finally accepted.
> 
> I had a conversation with an haproxy user who's concerned with the
> connection drops during the reload operation and we stumbled upon
> this thread. I was considering improving shutdown() as well for this
> as haproxy already performs a shutdown(RD) during a "pause" operation
> (ie: workaround for kernels missing SO_REUSEPORT).
> 
> And I found that the code clearly doesn't make this possible since
> shutdown(RD) flushes the queue and stops the listening.
> 
> However I found what I consider an elegant solution which works
> pretty well : by simply adding a test in compute_score(), we can
> ensure that a previous socket ranks lower than the current ones,
> and is never considered as long as the new ones are present. Here I
> achieved this using setsockopt(SO_LINGER). The old process just has
> to do this with a non-zero value on the socket it wants to put into
> lingering mode and that's all.
> 
> I find this elegant since it keeps the same semantics as for a
> connected socket in that it avoids killing the queue, and that it
> doesn't change the behaviour for existing applications. It just
> turns out that listening sockets are set up without any lingering
> by default so we don't need to add any new socket options nor
> anything.
> 
> Please let me know what you think about it (patch attached), if
> it's accepted it's trivial to adapt haproxy to this new behaviour.

Well, problem is : some applications use LINGER on the listener,
you can not really hijack this flag.

Thanks.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-15 17:10                       ` Eric Dumazet
@ 2015-12-15 17:43                         ` Willy Tarreau
  2015-12-15 18:21                           ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-15 17:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, Dec 15, 2015 at 09:10:24AM -0800, Eric Dumazet wrote:
> On Tue, 2015-12-15 at 17:14 +0100, Willy Tarreau wrote:
> > Hi Eric,
> > 
> > On Wed, Nov 11, 2015 at 05:09:01PM -0800, Eric Dumazet wrote:
> > > On Wed, 2015-11-11 at 10:43 -0800, Eric Dumazet wrote:
> > > > On Wed, 2015-11-11 at 10:23 -0800, Tom Herbert wrote:
> > > > 
> > > > > How about doing this in shutdown called for a listener?
> > > > 
> > > > Seems a good idea, I will try it, thanks !
> > > > 
> > > 
> > > Arg, I forgot about this shutdown() discussion we had recently
> > > with Oracle guys.
> > > 
> > > It is currently used in linux to unblock potential threads in accept()
> > > system call.
> > > 
> > > This would prevent syn_recv sockets to be finally accepted.
> > 
> > I had a conversation with an haproxy user who's concerned with the
> > connection drops during the reload operation and we stumbled upon
> > this thread. I was considering improving shutdown() as well for this
> > as haproxy already performs a shutdown(RD) during a "pause" operation
> > (ie: workaround for kernels missing SO_REUSEPORT).
> > 
> > And I found that the code clearly doesn't make this possible since
> > shutdown(RD) flushes the queue and stops the listening.
> > 
> > However I found what I consider an elegant solution which works
> > pretty well : by simply adding a test in compute_score(), we can
> > ensure that a previous socket ranks lower than the current ones,
> > and is never considered as long as the new ones are present. Here I
> > achieved this using setsockopt(SO_LINGER). The old process just has
> > to do this with a non-zero value on the socket it wants to put into
> > lingering mode and that's all.
> > 
> > I find this elegant since it keeps the same semantics as for a
> > connected socket in that it avoids killing the queue, and that it
> > doesn't change the behaviour for existing applications. It just
> > turns out that listening sockets are set up without any lingering
> > by default so we don't need to add any new socket options nor
> > anything.
> > 
> > Please let me know what you think about it (patch attached), if
> > it's accepted it's trivial to adapt haproxy to this new behaviour.
> 
> Well, problem is : some applications use LINGER on the listener,
> you can not really hijack this flag.

Ah ? but what does it bring in this case ? I'm not seeing it used
anywhere on a listening socket. The code took care of not breaking
them though (ie they still accept if no other socket shows up with
a higher score). Otherwise we'll have to switch to Tolga's patch,
unless we find another socket option that can safely be combined
and which makes sense (I often find it better not to make userland
depend on new updates of includes when possible).

Cheers,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-15 17:43                         ` Willy Tarreau
@ 2015-12-15 18:21                           ` Eric Dumazet
  2015-12-15 19:44                             ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-12-15 18:21 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, 2015-12-15 at 18:43 +0100, Willy Tarreau wrote:

> Ah ? but what does it bring in this case ? I'm not seeing it used
> anywhere on a listening socket. The code took care of not breaking
> them though (ie they still accept if no other socket shows up with
> a higher score). Otherwise we'll have to switch to Tolga's patch,
> unless we find another socket option that can safely be combined
> and which makes sense (I often find it better not to make userland
> depend on new updates of includes when possible).

Socket options set on the listener before the accept() are inherited.

Applications wanting SO_LINGER special settings on all their sockets can
use a single system call right before listen().

Some servers having to deal with TIME_WAIT proliferation very often use
SO_LINGER with timeout 0

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-15 18:21                           ` Eric Dumazet
@ 2015-12-15 19:44                             ` Willy Tarreau
  2015-12-15 21:21                               ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-15 19:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, Dec 15, 2015 at 10:21:52AM -0800, Eric Dumazet wrote:
> On Tue, 2015-12-15 at 18:43 +0100, Willy Tarreau wrote:
> 
> > Ah ? but what does it bring in this case ? I'm not seeing it used
> > anywhere on a listening socket. The code took care of not breaking
> > them though (ie they still accept if no other socket shows up with
> > a higher score). Otherwise we'll have to switch to Tolga's patch,
> > unless we find another socket option that can safely be combined
> > and which makes sense (I often find it better not to make userland
> > depend on new updates of includes when possible).
> 
> Socket options set on the listener before the accept() are inherited.

I completely forgot about this use case, stupid me!

> Applications wanting SO_LINGER special settings on all their sockets can
> use a single system call right before listen().
> 
> Some servers having to deal with TIME_WAIT proliferation very often use
> SO_LINGER with timeout 0

Yes definitely, it's just that I was focused on the listening socket not
taking into account the fact that it would be inherited by the (rare) few
sockets that are accepted from the queue afterwards. And indeed it's a
perfectly legitimate usage to save a syscall per incoming connection.

Thus do you think it's worth adding a new option as Tolga proposed ?

Another solution I considered (but found a bit dirty) was to make use of
the unimplemented shutdown(WR) for this. While it's easy to do, I don't
like it simply because it looks like a hack and not logical at all from
the users perspective.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-15 19:44                             ` Willy Tarreau
@ 2015-12-15 21:21                               ` Eric Dumazet
  2015-12-16  7:38                                 ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-12-15 21:21 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, 2015-12-15 at 20:44 +0100, Willy Tarreau wrote:

> Thus do you think it's worth adding a new option as Tolga proposed ?


I thought we tried hard to avoid adding the option but determined
we could not avoid it ;)

So I would simply resend the patch for another review.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-15 21:21                               ` Eric Dumazet
@ 2015-12-16  7:38                                 ` Willy Tarreau
  2015-12-16 16:15                                   ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-16  7:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Tue, Dec 15, 2015 at 01:21:15PM -0800, Eric Dumazet wrote:
> On Tue, 2015-12-15 at 20:44 +0100, Willy Tarreau wrote:
> 
> > Thus do you think it's worth adding a new option as Tolga proposed ?
> 
> 
> I thought we tried hard to avoid adding the option but determined
> we could not avoid it ;)

Not yet, your other proposal of disabling SO_REUSEPORT makes sense if
we combine it with the proposal to change the score in my patch. If
we say that a socket which has SO_REUSEPORT scores higher, then the
connections which don't want to accept new connections anymore will
simply have to drop it and not be elected. I find this even cleaner
since the sole purpose of the loop is to find the best socket in case
of SO_REUSEPORT.

I'll give this a try and will submit such a proposal if that works
for me.

Cheers!
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-16  7:38                                 ` Willy Tarreau
@ 2015-12-16 16:15                                   ` Willy Tarreau
  2015-12-18 16:33                                     ` Josh Snyder
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-16 16:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tom Herbert, Tolga Ceylan, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

[-- Attachment #1: Type: text/plain, Size: 2023 bytes --]

Hi Eric,

On Wed, Dec 16, 2015 at 08:38:14AM +0100, Willy Tarreau wrote:
> On Tue, Dec 15, 2015 at 01:21:15PM -0800, Eric Dumazet wrote:
> > On Tue, 2015-12-15 at 20:44 +0100, Willy Tarreau wrote:
> > 
> > > Thus do you think it's worth adding a new option as Tolga proposed ?
> > 
> > 
> > I thought we tried hard to avoid adding the option but determined
> > we could not avoid it ;)
> 
> Not yet, your other proposal of disabling SO_REUSEPORT makes sense if
> we combine it with the proposal to change the score in my patch. If
> we say that a socket which has SO_REUSEPORT scores higher, then the
> connections which don't want to accept new connections anymore will
> simply have to drop it an not be elected. I find this even cleaner
> since the sole purpose of the loop is to find the best socket in case
> of SO_REUSEPORT.

So I tried this and am pretty satisfied with the results, as I couldn't
see any single reset on 4.4-rc5 with it. On 4.1 I got a few very rare
resets at the exact moment the new process binds to the socket, because
I suspect some ACKs end up in the wrong queue exactly there. But
apparently the changes you did in 4.4 totally got rid of this, which is
great!

I suspected that I could enter a situation where a new process could
fail to bind if generations n-1 and n-2 were still present, because
n-2 would be running without SO_REUSEPORT and that should make this
test fail in inet_csk_bind_conflict(), but it never failed for me :

                        if ((!reuse || !sk2->sk_reuse ||
                            sk2->sk_state == TCP_LISTEN) &&
                            (!reuseport || !sk2->sk_reuseport ||
                            (sk2->sk_state != TCP_TIME_WAIT &&
                             !uid_eq(uid, sock_i_uid(sk2))))) {
...

So I'm clearly missing something and can't spot what. I mean, I'd
prefer to see my patch occasionally fail than not understanding why
it always works! If anyone has a suggestion I'm interested.

Here's the updated patch.

Best regards,
Willy


[-- Attachment #2: 0001-net-make-lingering-sockets-score-less-in-compute_sco.patch --]
[-- Type: text/plain, Size: 2647 bytes --]

>From c060a5db92274402a0178d7c777a1e37c15eadb9 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 15 Dec 2015 16:40:00 +0100
Subject: net: make lingering sockets score less in compute_score()

When multiple processes use SO_REUSEPORT for a seamless restart
operation, there's a tiny window during which both the old and the new
process are bound to the same port, and there's no way for the old
process to gracefully stop receiving connections without dropping
the few that are left in the queue between the last poll() and the
shutdown() or close() operation.

Incoming connections are distributed between multiple listening sockets
in inet_lookup_listener() according to multiple criteria. The first
criterion is a score based on a number of attributes for each socket,
then a hash computation ensures that the connections are evenly
distributed between sockets of equal weight.

This patch provides a simple approach by which the old process can
simply decrease its score by disabling SO_REUSEPORT on its listening
sockets. Thus, the sockets from the new process always score higher
and are always preferred.

The old process can then safely drain incoming connections and stop
after meeting the -1 EAGAIN condition, as shown in the example below :

         process A (old one)          |  process B (new one)
                                      |
          setsockopt(SO_REUSEPORT, 1) |
          listen() >= 0               |
          ...                         |
          accept()                    |
          ...                         |  setsockopt(SO_REUSEPORT, 1)
          ...                         |  listen()

       From now on, both processes receive incoming connections

          ...                         |  kill(process A, go_away)
          setsockopt(SO_REUSEPORT, 0) |  accept() >= 0

       Here process A stops receiving new connections

          accept() >= 0               |  accept() >= 0
          ...                         |
          accept() = -1 EAGAIN        |  accept() >= 0
          close()                     |
          exit()                      |

Signed-off-by: Willy Tarreau <w@1wt.eu>
---
 net/ipv4/inet_hashtables.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980..1c950ba 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score += 4;
 		}
+		if (sk->sk_reuseport)
+			score++;
 		if (sk->sk_incoming_cpu == raw_smp_processor_id())
 			score++;
 	}
-- 
1.7.12.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-16 16:15                                   ` Willy Tarreau
@ 2015-12-18 16:33                                     ` Josh Snyder
  2015-12-18 18:58                                       ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Josh Snyder @ 2015-12-18 16:33 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Eric Dumazet, Tom Herbert, Tolga Ceylan, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

I was also puzzled that binding succeeded. Looking into the code paths
involved, in inet_csk_get_port, we quickly goto have_snum. From there, we end
up dropping into tb_found. Since !hlist_empty(&tb->owners), we end up checking
that (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)).
This test passes, so we goto success and bind.

Crucially, we are checking the fastreuseport field on the inet_bind_bucket, and
not the sk_reuseport variable on the other sockets in the bucket. Since this
bit is set based on sk_reuseport at the time the first socket binds (see
tb_not_found), I can see no reason why sockets need to keep SO_REUSEPORT set
beyond initial binding.

Given this, I believe Willy's patch elegantly solves the problem at hand.

Josh

On Wed, Dec 16, 2015 at 8:15 AM, Willy Tarreau <w@1wt.eu> wrote:
> Hi Eric,
>
> On Wed, Dec 16, 2015 at 08:38:14AM +0100, Willy Tarreau wrote:
>> On Tue, Dec 15, 2015 at 01:21:15PM -0800, Eric Dumazet wrote:
>> > On Tue, 2015-12-15 at 20:44 +0100, Willy Tarreau wrote:
>> >
>> > > Thus do you think it's worth adding a new option as Tolga proposed ?
>> >
>> >
>> > I thought we tried hard to avoid adding the option but determined
>> > we could not avoid it ;)
>>
>> Not yet, your other proposal of disabling SO_REUSEPORT makes sense if
>> we combine it with the proposal to change the score in my patch. If
>> we say that a socket which has SO_REUSEPORT scores higher, then the
>> connections which don't want to accept new connections anymore will
>> simply have to drop it an not be elected. I find this even cleaner
>> since the sole purpose of the loop is to find the best socket in case
>> of SO_REUSEPORT.
>
> So I tried this and am pretty satisfied with the results, as I couldn't
> see any single reset on 4.4-rc5 with it. On 4.1 I got a few very rare
> resets at the exact moment the new process binds to the socket, because
> I suspect some ACKs end up in the wrong queue exactly there. But
> apparently the changes you did in 4.4 totally got rid of this, which is
> great!
>
> I suspected that I could enter a situation where a new process could
> fail to bind if generations n-1 and n-2 were still present, because
> n-2 would be running without SO_REUSEPORT and that should make this
> test fail in inet_csk_bind_conflict(), but it never failed for me :
>
>                         if ((!reuse || !sk2->sk_reuse ||
>                             sk2->sk_state == TCP_LISTEN) &&
>                             (!reuseport || !sk2->sk_reuseport ||
>                             (sk2->sk_state != TCP_TIME_WAIT &&
>                              !uid_eq(uid, sock_i_uid(sk2))))) {
> ...
>
> So I'm clearly missing something and can't spot what. I mean, I'd
> prefer to see my patch occasionally fail than not understanding why
> it always works! If anyone has an suggestion I'm interested.
>
> Here's the updated patch.
>
> Best regards,
> Willy
>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-18 16:33                                     ` Josh Snyder
@ 2015-12-18 18:58                                       ` Willy Tarreau
  2015-12-19  2:38                                         ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-18 18:58 UTC (permalink / raw)
  To: Josh Snyder
  Cc: Eric Dumazet, Tom Herbert, Tolga Ceylan, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

Hi Josh,

On Fri, Dec 18, 2015 at 08:33:45AM -0800, Josh Snyder wrote:
> I was also puzzled that binding succeeded. Looking into the code paths
> involved, in inet_csk_get_port, we quickly goto have_snum. From there, we end
> up dropping into tb_found. Since !hlist_empty(&tb->owners), we end up checking
> that (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)).
> This test passes, so we goto success and bind.
> 
> Crucially, we are checking the fastreuseport field on the inet_bind_bucket, and
> not the sk_reuseport variable on the other sockets in the bucket. Since this
> bit is set based on sk_reuseport at the time the first socket binds (see
> tb_not_found), I can see no reason why sockets need to keep SO_REUSEPORT set
> beyond initial binding.
> 
> Given this, I believe Willy's patch elegantly solves the problem at hand.

Great, thanks for your in-depth explanation.

Eric, do you think that this patch may be acceptable material for next
merge window (given that it's not a fix per-se) ? If so I'll resubmit
later.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-18 18:58                                       ` Willy Tarreau
@ 2015-12-19  2:38                                         ` Eric Dumazet
  2015-12-19  7:00                                           ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2015-12-19  2:38 UTC (permalink / raw)
  To: Willy Tarreau, cgallek
  Cc: Josh Snyder, Tom Herbert, Tolga Ceylan, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Fri, 2015-12-18 at 19:58 +0100, Willy Tarreau wrote:
> Hi Josh,
> 
> On Fri, Dec 18, 2015 at 08:33:45AM -0800, Josh Snyder wrote:
> > I was also puzzled that binding succeeded. Looking into the code paths
> > involved, in inet_csk_get_port, we quickly goto have_snum. From there, we end
> > up dropping into tb_found. Since !hlist_empty(&tb->owners), we end up checking
> > that (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)).
> > This test passes, so we goto success and bind.
> > 
> > Crucially, we are checking the fastreuseport field on the inet_bind_bucket, and
> > not the sk_reuseport variable on the other sockets in the bucket. Since this
> > bit is set based on sk_reuseport at the time the first socket binds (see
> > tb_not_found), I can see no reason why sockets need to keep SO_REUSEPORT set
> > beyond initial binding.
> > 
> > Given this, I believe Willy's patch elegantly solves the problem at hand.
> 
> Great, thanks for your in-depth explanation.
> 
> Eric, do you think that this patch may be acceptable material for next
> merge window (given that it's not a fix per-se) ? If so I'll resubmit
> later.

I need to check with Craig Gallek, because he was about to upstream a
change to make SO_REUSEPORT more scalable & sexy (like having an [e]BPF
filter to perform the selection in an array of sockets)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-19  2:38                                         ` Eric Dumazet
@ 2015-12-19  7:00                                           ` Willy Tarreau
  2015-12-21 20:38                                             ` Tom Herbert
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-19  7:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: cgallek, Josh Snyder, Tom Herbert, Tolga Ceylan, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Fri, Dec 18, 2015 at 06:38:03PM -0800, Eric Dumazet wrote:
> On Fri, 2015-12-18 at 19:58 +0100, Willy Tarreau wrote:
> > Hi Josh,
> > 
> > On Fri, Dec 18, 2015 at 08:33:45AM -0800, Josh Snyder wrote:
> > > I was also puzzled that binding succeeded. Looking into the code paths
> > > involved, in inet_csk_get_port, we quickly goto have_snum. From there, we end
> > > up dropping into tb_found. Since !hlist_empty(&tb->owners), we end up checking
> > > that (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)).
> > > This test passes, so we goto success and bind.
> > > 
> > > Crucially, we are checking the fastreuseport field on the inet_bind_bucket, and
> > > not the sk_reuseport variable on the other sockets in the bucket. Since this
> > > bit is set based on sk_reuseport at the time the first socket binds (see
> > > tb_not_found), I can see no reason why sockets need to keep SO_REUSEPORT set
> > > beyond initial binding.
> > > 
> > > Given this, I believe Willy's patch elegantly solves the problem at hand.
> > 
> > Great, thanks for your in-depth explanation.
> > 
> > Eric, do you think that this patch may be acceptable material for next
> > merge window (given that it's not a fix per-se) ? If so I'll resubmit
> > later.
> 
> I need to check with Craig Gallek, because he was about to upstream a
> change to make SO_REUSEPORT more scalable & sexy (like having an [e]BPF
> filter to perform the selection in an array of sockets)

OK fine. Please note that I also considered using a new value instead of
zero there but I preferred to avoid it since the man talked about zero/
non-zero so I wanted to limit any API change. If Craig adds new values
there then this is something we can reconsider.

Have a nice week-end,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-19  7:00                                           ` Willy Tarreau
@ 2015-12-21 20:38                                             ` Tom Herbert
  2015-12-21 20:41                                               ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Tom Herbert @ 2015-12-21 20:38 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Eric Dumazet, cgallek, Josh Snyder, Tolga Ceylan, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Fri, Dec 18, 2015 at 11:00 PM, Willy Tarreau <w@1wt.eu> wrote:
> On Fri, Dec 18, 2015 at 06:38:03PM -0800, Eric Dumazet wrote:
>> On Fri, 2015-12-18 at 19:58 +0100, Willy Tarreau wrote:
>> > Hi Josh,
>> >
>> > On Fri, Dec 18, 2015 at 08:33:45AM -0800, Josh Snyder wrote:
>> > > I was also puzzled that binding succeeded. Looking into the code paths
>> > > involved, in inet_csk_get_port, we quickly goto have_snum. From there, we end
>> > > up dropping into tb_found. Since !hlist_empty(&tb->owners), we end up checking
>> > > that (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)).
>> > > This test passes, so we goto success and bind.
>> > >
>> > > Crucially, we are checking the fastreuseport field on the inet_bind_bucket, and
>> > > not the sk_reuseport variable on the other sockets in the bucket. Since this
>> > > bit is set based on sk_reuseport at the time the first socket binds (see
>> > > tb_not_found), I can see no reason why sockets need to keep SO_REUSEPORT set
>> > > beyond initial binding.
>> > >
>> > > Given this, I believe Willy's patch elegantly solves the problem at hand.
>> >
>> > Great, thanks for your in-depth explanation.
>> >
>> > Eric, do you think that this patch may be acceptable material for next
>> > merge window (given that it's not a fix per-se) ? If so I'll resubmit
>> > later.
>>
>> I need to check with Craig Gallek, because he was about to upstream a
>> change to make SO_REUSEPORT more scalable & sexy (like having an [e]BPF
>> filter to perform the selection in an array of sockets)
>
> OK fine. Please note that I also considered using a new value instead of
> zero there but I preferred to avoid it since the man talked about zero/
> non-zero so I wanted to limit any API change. If Craig adds new values
> there then this is something we can reconsider.
>
Is there any reason why this turning off a soreuseport socket should
not apply to UDP also? (seems like we have a need to turn off RX but
not TX for a UDP socket).

Tom

> Have a nice week-end,
> Willy
>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-21 20:38                                             ` Tom Herbert
@ 2015-12-21 20:41                                               ` Willy Tarreau
  2016-03-24  5:10                                                 ` Tolga Ceylan
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2015-12-21 20:41 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Eric Dumazet, cgallek, Josh Snyder, Tolga Ceylan, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Mon, Dec 21, 2015 at 12:38:27PM -0800, Tom Herbert wrote:
> On Fri, Dec 18, 2015 at 11:00 PM, Willy Tarreau <w@1wt.eu> wrote:
> > On Fri, Dec 18, 2015 at 06:38:03PM -0800, Eric Dumazet wrote:
> >> On Fri, 2015-12-18 at 19:58 +0100, Willy Tarreau wrote:
> >> > Hi Josh,
> >> >
> >> > On Fri, Dec 18, 2015 at 08:33:45AM -0800, Josh Snyder wrote:
> >> > > I was also puzzled that binding succeeded. Looking into the code paths
> >> > > involved, in inet_csk_get_port, we quickly goto have_snum. From there, we end
> >> > > up dropping into tb_found. Since !hlist_empty(&tb->owners), we end up checking
> >> > > that (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)).
> >> > > This test passes, so we goto success and bind.
> >> > >
> >> > > Crucially, we are checking the fastreuseport field on the inet_bind_bucket, and
> >> > > not the sk_reuseport variable on the other sockets in the bucket. Since this
> >> > > bit is set based on sk_reuseport at the time the first socket binds (see
> >> > > tb_not_found), I can see no reason why sockets need to keep SO_REUSEPORT set
> >> > > beyond initial binding.
> >> > >
> >> > > Given this, I believe Willy's patch elegantly solves the problem at hand.
> >> >
> >> > Great, thanks for your in-depth explanation.
> >> >
> >> > Eric, do you think that this patch may be acceptable material for next
> >> > merge window (given that it's not a fix per-se) ? If so I'll resubmit
> >> > later.
> >>
> >> I need to check with Craig Gallek, because he was about to upstream a
> >> change to make SO_REUSEPORT more scalable & sexy (like having an [e]BPF
> >> filter to perform the selection in an array of sockets)
> >
> > OK fine. Please note that I also considered using a new value instead of
> > zero there but I preferred to avoid it since the man talked about zero/
> > non-zero so I wanted to limit any API change. If Craig adds new values
> > there then this is something we can reconsider.
> >
> Is there any reason why this turning off a soreuseport socket should
> not apply to UDP also? (seems like we have a need to turn off RX but
> not TX for a UDP socket).

I didn't know it was supported for UDP :-) I guess that's the only reason.

willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2015-12-21 20:41                                               ` Willy Tarreau
@ 2016-03-24  5:10                                                 ` Tolga Ceylan
  2016-03-24  6:12                                                   ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Tolga Ceylan @ 2016-03-24  5:10 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tom Herbert, Eric Dumazet, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Mon, Dec 21, 2015 at 12:41 PM, Willy Tarreau <w@1wt.eu> wrote:
> On Mon, Dec 21, 2015 at 12:38:27PM -0800, Tom Herbert wrote:
>> On Fri, Dec 18, 2015 at 11:00 PM, Willy Tarreau <w@1wt.eu> wrote:
>> > On Fri, Dec 18, 2015 at 06:38:03PM -0800, Eric Dumazet wrote:
>> >> On Fri, 2015-12-18 at 19:58 +0100, Willy Tarreau wrote:
>> >> > Hi Josh,
>> >> >
>> >> > On Fri, Dec 18, 2015 at 08:33:45AM -0800, Josh Snyder wrote:
>> >> > > I was also puzzled that binding succeeded. Looking into the code paths
>> >> > > involved, in inet_csk_get_port, we quickly goto have_snum. From there, we end
>> >> > > up dropping into tb_found. Since !hlist_empty(&tb->owners), we end up checking
>> >> > > that (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)).
>> >> > > This test passes, so we goto success and bind.
>> >> > >
>> >> > > Crucially, we are checking the fastreuseport field on the inet_bind_bucket, and
>> >> > > not the sk_reuseport variable on the other sockets in the bucket. Since this
>> >> > > bit is set based on sk_reuseport at the time the first socket binds (see
>> >> > > tb_not_found), I can see no reason why sockets need to keep SO_REUSEPORT set
>> >> > > beyond initial binding.
>> >> > >
>> >> > > Given this, I believe Willy's patch elegantly solves the problem at hand.
>> >> >
>> >> > Great, thanks for your in-depth explanation.
>> >> >
>> >> > Eric, do you think that this patch may be acceptable material for next
>> >> > merge window (given that it's not a fix per-se) ? If so I'll resubmit
>> >> > later.
>> >>
>> >> I need to check with Craig Gallek, because he was about to upstream a
>> >> change to make SO_REUSEPORT more scalable & sexy (like having an [e]BPF
>> >> filter to perform the selection in an array of sockets)
>> >

Hi All,

I apologize for not properly following up on this. I had the
impression that we did not want to merge my original patch and then I
also noticed that it fails to keep the hash consistent. Recently, I
read the follow ups on it as well as Willy's patch/proposals.

Is there any update on Willy's SO_REUSEPORT patch? IMHO, it solves the
problem and it is simpler than adding new sock option.

Best Regards,
Tolga Ceylan

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24  5:10                                                 ` Tolga Ceylan
@ 2016-03-24  6:12                                                   ` Willy Tarreau
  2016-03-24 14:13                                                     ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2016-03-24  6:12 UTC (permalink / raw)
  To: Tolga Ceylan
  Cc: Tom Herbert, Eric Dumazet, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

Hi,

On Wed, Mar 23, 2016 at 10:10:06PM -0700, Tolga Ceylan wrote:
> I apologize for not properly following up on this. I had the
> impression that we did not want to merge my original patch and then I
> also noticed that it fails to keep the hash consistent. Recently, I
> read the follow ups on it as well as Willy's patch/proposals.
> 
> Is there any update on Willy's SO_REUSEPORT patch? IMHO, it solves the
> problem and it is simpler than adding new sock option.

no, Craig's changes were merged, and I haven't checked yet if my patch
needs to be rebased or still applies. Feel free to check it and resubmit
if you have time.

Best regards,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24  6:12                                                   ` Willy Tarreau
@ 2016-03-24 14:13                                                     ` Eric Dumazet
  2016-03-24 14:22                                                       ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2016-03-24 14:13 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Thu, 2016-03-24 at 07:12 +0100, Willy Tarreau wrote:
> Hi,
> 
> On Wed, Mar 23, 2016 at 10:10:06PM -0700, Tolga Ceylan wrote:
> > I apologize for not properly following up on this. I had the
> > impression that we did not want to merge my original patch and then I
> > also noticed that it fails to keep the hash consistent. Recently, I
> > read the follow ups on it as well as Willy's patch/proposals.
> > 
> > Is there any update on Willy's SO_REUSEPORT patch? IMHO, it solves the
> > problem and it is simpler than adding new sock option.
> 
> no, Craig's changes were merged, and I haven't checked yet if my patch
> needs to be rebased or still applies. Feel free to check it and resubmit
> if you have time.

No need for a patch AFAIK.

BPF solution is generic enough.

All user space needs to do is to update the BPF filter so that the
listener needing to be dismantled does not receive any new packet.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 14:13                                                     ` Eric Dumazet
@ 2016-03-24 14:22                                                       ` Willy Tarreau
  2016-03-24 14:45                                                         ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2016-03-24 14:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

Hi Eric,

On Thu, Mar 24, 2016 at 07:13:33AM -0700, Eric Dumazet wrote:
> On Thu, 2016-03-24 at 07:12 +0100, Willy Tarreau wrote:
> > Hi,
> > 
> > On Wed, Mar 23, 2016 at 10:10:06PM -0700, Tolga Ceylan wrote:
> > > I apologize for not properly following up on this. I had the
> > > impression that we did not want to merge my original patch and then I
> > > also noticed that it fails to keep the hash consistent. Recently, I
> > > read the follow ups on it as well as Willy's patch/proposals.
> > > 
> > > Is there any update on Willy's SO_REUSEPORT patch? IMHO, it solves the
> > > problem and it is simpler than adding new sock option.
> > 
> > no, Craig's changes were merged, and I haven't checked yet if my patch
> > needs to be rebased or still applies. Feel free to check it and resubmit
> > if you have time.
> 
> No need for a patch AFAIK.
> 
> BPF solution is generic enough.
> 
> All user space needs to do is to update the BPF filter so that the
> listener needing to be dismantled does not receive any new packet.

But that means that any software making use of SO_REUSEPORT needs to
also implement BPF on Linux to achieve the same as what it does on
other OSes ? Also I found a case where a dying process would still
cause trouble in the accept queue, maybe it's not redistributed, I
don't remember, all I remember is that my traffic stopped after a
segfault of only one of them :-/ I'll have to dig a bit regarding
this.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 14:22                                                       ` Willy Tarreau
@ 2016-03-24 14:45                                                         ` Eric Dumazet
  2016-03-24 15:30                                                           ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2016-03-24 14:45 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Thu, 2016-03-24 at 15:22 +0100, Willy Tarreau wrote:
> Hi Eric,

> But that means that any software making use of SO_REUSEPORT needs to
> also implement BPF on Linux to achieve the same as what it does on
> other OSes ? Also I found a case where a dying process would still
> cause trouble in the accept queue, maybe it's not redistributed, I
> don't remember, all I remember is that my traffic stopped after a
> segfault of only one of them :-/ I'll have to dig a bit regarding
> this.

Hi Willy

Problem is : If we add a SO_REUSEPORT_LISTEN_OFF, this won't work with
BPF. 

BPF makes a decision without knowing individual listeners states.

Or we would need to extend BPF to access these kind of states.
Doable certainly, but we need to be convinced it is necessary.

And yes, if a listener is closed while children are still in the accept
queue, we drop all the children; we do not care about redistributing them
to other listeners. Really too complex to be worth it.

For example, we could probably revert
70da268b569d32a9fddeea85dc18043de9d89f89
("net: SO_INCOMING_CPU setsockopt() support") as this can be handled by
BPF as well, and would remove extra tests in fast path (when
SO_REUSEPORT is not used at all)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 14:45                                                         ` Eric Dumazet
@ 2016-03-24 15:30                                                           ` Willy Tarreau
  2016-03-24 16:33                                                             ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2016-03-24 15:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

[-- Attachment #1: Type: text/plain, Size: 2982 bytes --]

Hi Eric,

(just lost my e-mail, trying not to forget some points)

On Thu, Mar 24, 2016 at 07:45:44AM -0700, Eric Dumazet wrote:
> On Thu, 2016-03-24 at 15:22 +0100, Willy Tarreau wrote:
> > Hi Eric,
> 
> > But that means that any software making use of SO_REUSEPORT needs to
> > also implement BPF on Linux to achieve the same as what it does on
> > other OSes ? Also I found a case where a dying process would still
> > cause trouble in the accept queue, maybe it's not redistributed, I
> > don't remember, all I remember is that my traffic stopped after a
> > segfault of only one of them :-/ I'll have to dig a bit regarding
> > this.
> 
> Hi Willy
> 
> Problem is : If we add a SO_REUSEPORT_LISTEN_OFF, this wont work with
> BPF. 

I wasn't for adding SO_REUSEPORT_LISTEN_OFF either. Instead the idea was
just to modify the score in compute_score() so that a socket which disables
SO_REUSEPORT scores less than one which still has it. The application
wishing to terminate just has to clear the SO_REUSEPORT flag and wait for
accept() reporting EAGAIN. The patch simply looked like this (copy-pasted
hence space-mangled) :

--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
                                return -1;
                        score += 4;
                }
+               if (sk->sk_reuseport)
+                       score++;
                if (sk->sk_incoming_cpu == raw_smp_processor_id())
                        score++;
        }

> BPF makes a decision without knowing individual listeners states.

But is the decision taken without considering compute_score() ? The point
really was to be the least possibly intrusive and quite logical for the
application : "disable SO_REUSEPORT when you don't want to participate in
incoming load balancing anymore".

> Or we would need to extend BPF to access these kind of states.
> Doable certainly, but we need to be convinced it is necessary.

You know that I don't like complex designs to address simple issues if
possible :-)

> And yes, if a listener is closed while children are still in accept
> queue, we drop all the children, we do not care of redistributing them
> to another listeners. Really too complex to be worth it.

Forget this, I mixed two issues here. Yes I know that redistributing is
almost impossible, I've read that code a year ago or so and realized how
complex this would be, without providing even 100% success rate. I wasn't
speaking about redistribution of incoming queue but about an issue I've
met where when I have 4 processes bound to the same port, if one dies,
its share of incoming traffic is definitely lost. The only fix was to
restart the processes to create new listeners. But I don't remember the
conditions where I met this case, I don't even remember if it was on an
-rc kernel or a final one, so I'd prefer to discuss this only once I have
more elements.

Cheers,
Willy

[-- Attachment #2: 0001-net-make-lingering-sockets-score-less-in-compute_sco.patch --]
[-- Type: text/plain, Size: 2647 bytes --]

>From c060a5db92274402a0178d7c777a1e37c15eadb9 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 15 Dec 2015 16:40:00 +0100
Subject: net: make lingering sockets score less in compute_score()

When multiple processes use SO_REUSEPORT for a seamless restart
operation, there's a tiny window during which both the old and the new
process are bound to the same port, and there's no way for the old
process to gracefully stop receiving connections without dropping
the few that are left in the queue between the last poll() and the
shutdown() or close() operation.

Incoming connections are distributed between multiple listening sockets
in inet_lookup_listener() according to multiple criteria. The first
criterion is a score based on a number of attributes for each socket,
then a hash computation ensures that the connections are evenly
distributed between sockets of equal weight.

This patch provides a simple approach by which the old process can
simply decrease its score by disabling SO_REUSEPORT on its listening
sockets. Thus, the sockets from the new process always score higher
and are always preferred.

The old process can then safely drain incoming connections and stop
after meeting the -1 EAGAIN condition, as shown in the example below :

         process A (old one)          |  process B (new one)
                                      |
          setsockopt(SO_REUSEPORT, 1) |
          listen() >= 0               |
          ...                         |
          accept()                    |
          ...                         |  setsockopt(SO_REUSEPORT, 1)
          ...                         |  listen()

       From now on, both processes receive incoming connections

          ...                         |  kill(process A, go_away)
          setsockopt(SO_REUSEPORT, 0) |  accept() >= 0

       Here process A stops receiving new connections

          accept() >= 0               |  accept() >= 0
          ...                         |
          accept() = -1 EAGAIN        |  accept() >= 0
          close()                     |
          exit()                      |

Signed-off-by: Willy Tarreau <w@1wt.eu>
---
 net/ipv4/inet_hashtables.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980..1c950ba 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score += 4;
 		}
+		if (sk->sk_reuseport)
+			score++;
 		if (sk->sk_incoming_cpu == raw_smp_processor_id())
 			score++;
 	}
-- 
1.7.12.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 15:30                                                           ` Willy Tarreau
@ 2016-03-24 16:33                                                             ` Eric Dumazet
  2016-03-24 16:50                                                               ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2016-03-24 16:33 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers


On Thu, 2016-03-24 at 16:30 +0100, Willy Tarreau wrote:
> Hi Eric,
> 
> (just lost my e-mail, trying not to forget some points)
> 
> On Thu, Mar 24, 2016 at 07:45:44AM -0700, Eric Dumazet wrote:
> > On Thu, 2016-03-24 at 15:22 +0100, Willy Tarreau wrote:
> > > Hi Eric,
> > 
> > > But that means that any software making use of SO_REUSEPORT needs to
> > > also implement BPF on Linux to achieve the same as what it does on
> > > other OSes ? Also I found a case where a dying process would still
> > > cause trouble in the accept queue, maybe it's not redistributed, I
> > > don't remember, all I remember is that my traffic stopped after a
> > > segfault of only one of them :-/ I'll have to dig a bit regarding
> > > this.
> > 
> > Hi Willy
> > 
> > Problem is : If we add a SO_REUSEPORT_LISTEN_OFF, this wont work with
> > BPF. 
> 
> I wasn't for adding SO_REUSEPORT_LISTEN_OFF either. Instead the idea was
> just to modify the score in compute_score() so that a socket which disables
> SO_REUSEPORT scores less than one which still has it. The application
> wishing to terminate just has to clear the SO_REUSEPORT flag and wait for
> accept() reporting EAGAIN. The patch simply looked like this (copy-pasted
> hence space-mangled) :
> 
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score += 4;
>                 }
> +               if (sk->sk_reuseport)
> +                       score++;

This wont work with BPF

>                 if (sk->sk_incoming_cpu == raw_smp_processor_id())
>                         score++;

This one does not work either with BPF

>         }
> 
> > BPF makes a decision without knowing individual listeners states.
> 
> But is the decision taken without considering compute_score() ? The point
> really was to be the least possibly intrusive and quite logical for the
> application : "disable SO_REUSEPORT when you don't want to participate to
> incoming load balancing anymore".

The whole point of BPF was to avoid iterating through all sockets [1],
and let user space use whatever selection logic it needs.

[1] This was okay with up to 16 sockets. But with 128 it does not scale.

If you really look at how BPF works, implementing another 'per listener' flag
would break the BPF selection.

You can certainly implement the SO_REUSEPORT_LISTEN_OFF by loading an
updated BPF, why should we add another way in the kernel to do the same,
in a way that would not work in some cases ?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 16:33                                                             ` Eric Dumazet
@ 2016-03-24 16:50                                                               ` Willy Tarreau
  2016-03-24 17:01                                                                 ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2016-03-24 16:50 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 09:33:11AM -0700, Eric Dumazet wrote:
> > --- a/net/ipv4/inet_hashtables.c
> > +++ b/net/ipv4/inet_hashtables.c
> > @@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
> >                                 return -1;
> >                         score += 4;
> >                 }
> > +               if (sk->sk_reuseport)
> > +                       score++;
> 
> This wont work with BPF
> 
> >                 if (sk->sk_incoming_cpu == raw_smp_processor_id())
> >                         score++;
> 
> This one does not work either with BPF

But this *is* in 4.5. Does this mean that this part doesn't work anymore or
just that it's not usable in conjunction with BPF ? In this case I'm less
worried, because it would mean that we have a solution for non-BPF aware
applications and that BPF-aware applications can simply use BPF.

> Whole point of BPF was to avoid iterate through all sockets [1],
> and let user space use whatever selection logic it needs.
> 
> [1] This was okay with up to 16 sockets. But with 128 it does not scale.

Indeed.

> If you really look at how BPF works, implementing another 'per listener' flag
> would break the BPF selection.

OK.

> You can certainly implement the SO_REUSEPORT_LISTEN_OFF by loading an
> updated BPF, why should we add another way in the kernel to do the same,
> in a way that would not work in some cases ?

I don't try to reimplement something already available, but I'm confused
by a few points :
  - the code above already exists and you mention it cannot be used with BPF
  - for the vast majority of applications not using BPF, would the above *still*
    work (it worked in 4.4-rc at least)
  - it seems to me that for BPF to be usable on process shutting down, we'd
    need to have some form of central knowledge if the goal is to redefine
    how to distribute the load. In my case there are multiple independent
    processes forked on startup, so it's unclear to me how each of them could
    reconfigure BPF when shutting down without risking breaking the other ones.
  - the doc makes me believe that BPF would require privileges to be unset, so
    that would not be compatible with a process shutting down which has already
    dropped its privileges after startup, but I could be wrong.

Thanks for your help on this,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 16:50                                                               ` Willy Tarreau
@ 2016-03-24 17:01                                                                 ` Eric Dumazet
  2016-03-24 17:26                                                                   ` Tom Herbert
  2016-03-24 18:00                                                                   ` Willy Tarreau
  0 siblings, 2 replies; 61+ messages in thread
From: Eric Dumazet @ 2016-03-24 17:01 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Thu, 2016-03-24 at 17:50 +0100, Willy Tarreau wrote:
> On Thu, Mar 24, 2016 at 09:33:11AM -0700, Eric Dumazet wrote:
> > > --- a/net/ipv4/inet_hashtables.c
> > > +++ b/net/ipv4/inet_hashtables.c
> > > @@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
> > >                                 return -1;
> > >                         score += 4;
> > >                 }
> > > +               if (sk->sk_reuseport)
> > > +                       score++;
> > 
> > This wont work with BPF
> > 
> > >                 if (sk->sk_incoming_cpu == raw_smp_processor_id())
> > >                         score++;
> > 
> > This one does not work either with BPF
> 
> But this *is* in 4.5. Does this mean that this part doesn't work anymore or
> just that it's not usable in conjunction with BPF ? In this case I'm less
> worried, because it would mean that we have a solution for non-BPF aware
> applications and that BPF-aware applications can simply use BPF.
> 

BPF can implement the CPU choice/pref itself. It has everything needed.

> I don't try to reimplement something already available, but I'm confused
> by a few points :
>   - the code above already exists and you mention it cannot be used with BPF

_If_ you use BPF, then you can implement a CPU preference using BPF
instructions. It is a user choice.

>   - for the vast majority of applications not using BPF, would the above *still*
>     work (it worked in 4.4-rc at least)


>   - it seems to me that for BPF to be usable on process shutting down, we'd
>     need to have some form of central knowledge if the goal is to redefine
>     how to distribute the load. In my case there are multiple independant
>     processes forked on startup, so it's unclear to me how each of them could
>     reconfigure BPF when shutting down without risking to break the other ones.
>   - the doc makes me believe that BPF would require privileges to be unset, so
>     that would not be compatible with a process shutting down which has already
>     dropped its privileges after startup, but I could be wrong.
> 
> Thanks for your help on this,
> Willy
> 

The point is : BPF is the way to go, because it is expandable.

No more hard points coded forever in the kernel.

Really, when BPF can be the solution, we won't allow adding new stuff in
the kernel in the old way.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 17:01                                                                 ` Eric Dumazet
@ 2016-03-24 17:26                                                                   ` Tom Herbert
  2016-03-24 17:55                                                                     ` Daniel Borkmann
  2016-03-24 18:00                                                                   ` Willy Tarreau
  1 sibling, 1 reply; 61+ messages in thread
From: Tom Herbert @ 2016-03-24 17:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Willy Tarreau, Tolga Ceylan, Craig Gallek, Josh Snyder,
	Aaron Conole, David S. Miller, Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 10:01 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-03-24 at 17:50 +0100, Willy Tarreau wrote:
>> On Thu, Mar 24, 2016 at 09:33:11AM -0700, Eric Dumazet wrote:
>> > > --- a/net/ipv4/inet_hashtables.c
>> > > +++ b/net/ipv4/inet_hashtables.c
>> > > @@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>> > >                                 return -1;
>> > >                         score += 4;
>> > >                 }
>> > > +               if (sk->sk_reuseport)
>> > > +                       score++;
>> >
>> > This wont work with BPF
>> >
>> > >                 if (sk->sk_incoming_cpu == raw_smp_processor_id())
>> > >                         score++;
>> >
>> > This one does not work either with BPF
>>
>> But this *is* in 4.5. Does this mean that this part doesn't work anymore or
>> just that it's not usable in conjunction with BPF ? In this case I'm less
>> worried, because it would mean that we have a solution for non-BPF aware
>> applications and that BPF-aware applications can simply use BPF.
>>
>
> BPF can implement the CPU choice/pref itself. It has everything needed.
>
>> I don't try to reimplement something already available, but I'm confused
>> by a few points :
>>   - the code above already exists and you mention it cannot be used with BPF
>
> _If_ you use BPF, then you can implement a CPU preference using BPF
> instructions. It is a user choice.
>
>>   - for the vast majority of applications not using BPF, would the above *still*
>>     work (it worked in 4.4-rc at least)
>
>
>>   - it seems to me that for BPF to be usable on process shutting down, we'd
>>     need to have some form of central knowledge if the goal is to redefine
>>     how to distribute the load. In my case there are multiple independant
>>     processes forked on startup, so it's unclear to me how each of them could
>>     reconfigure BPF when shutting down without risking to break the other ones.
>>   - the doc makes me believe that BPF would require privileges to be unset, so
>>     that would not be compatible with a process shutting down which has already
>>     dropped its privileges after startup, but I could be wrong.
>>
>> Thanks for your help on this,
>> Willy
>>
>
> The point is : BPF is the way to go, because it is expandable.
>
> No more hard points coded forever in the kernel.
>
> Really, when BPF can be the solution, we wont allow adding new stuff in
> the kernel in the old way.

I completely agree with this, but I wonder if we now need a repository
of useful BPF modules. So in the case of implementing functionality
like in SO_REUSEPORT_LISTEN_OFF that might just become a common BPF
program we could direct people to use.

Tom

>
>
>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 17:26                                                                   ` Tom Herbert
@ 2016-03-24 17:55                                                                     ` Daniel Borkmann
  2016-03-24 18:20                                                                       ` Tolga Ceylan
  2016-03-24 22:40                                                                       ` Yann Ylavic
  0 siblings, 2 replies; 61+ messages in thread
From: Daniel Borkmann @ 2016-03-24 17:55 UTC (permalink / raw)
  To: Tom Herbert, Eric Dumazet
  Cc: Willy Tarreau, Tolga Ceylan, Craig Gallek, Josh Snyder,
	Aaron Conole, David S. Miller, Linux Kernel Network Developers

On 03/24/2016 06:26 PM, Tom Herbert wrote:
> On Thu, Mar 24, 2016 at 10:01 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Thu, 2016-03-24 at 17:50 +0100, Willy Tarreau wrote:
>>> On Thu, Mar 24, 2016 at 09:33:11AM -0700, Eric Dumazet wrote:
>>>>> --- a/net/ipv4/inet_hashtables.c
>>>>> +++ b/net/ipv4/inet_hashtables.c
>>>>> @@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>>>>>                                  return -1;
>>>>>                          score += 4;
>>>>>                  }
>>>>> +               if (sk->sk_reuseport)
>>>>> +                       score++;
>>>>
>>>> This wont work with BPF
>>>>
>>>>>                  if (sk->sk_incoming_cpu == raw_smp_processor_id())
>>>>>                          score++;
>>>>
>>>> This one does not work either with BPF
>>>
>>> But this *is* in 4.5. Does this mean that this part doesn't work anymore or
>>> just that it's not usable in conjunction with BPF ? In this case I'm less
>>> worried, because it would mean that we have a solution for non-BPF aware
>>> applications and that BPF-aware applications can simply use BPF.
>>
>> BPF can implement the CPU choice/pref itself. It has everything needed.
>>
>>> I don't try to reimplement something already available, but I'm confused
>>> by a few points :
>>>    - the code above already exists and you mention it cannot be used with BPF
>>
>> _If_ you use BPF, then you can implement a CPU preference using BPF
>> instructions. It is a user choice.
>>
>>>    - for the vast majority of applications not using BPF, would the above *still*
>>>      work (it worked in 4.4-rc at least)
>>
>>>    - it seems to me that for BPF to be usable on process shutting down, we'd
>>>      need to have some form of central knowledge if the goal is to redefine
>>>      how to distribute the load. In my case there are multiple independant
>>>      processes forked on startup, so it's unclear to me how each of them could
>>>      reconfigure BPF when shutting down without risking to break the other ones.
>>>    - the doc makes me believe that BPF would require privileges to be unset, so
>>>      that would not be compatible with a process shutting down which has already
>>>      dropped its privileges after startup, but I could be wrong.
>>>
>>> Thanks for your help on this,
>>> Willy
>>
>> The point is : BPF is the way to go, because it is expandable.
>>
>> No more hard points coded forever in the kernel.
>>
>> Really, when BPF can be the solution, we wont allow adding new stuff in
>> the kernel in the old way.
>
> I completely agree with this, but I wonder if we now need a repository
> of useful BPF modules. So in the case of implementing functionality
> like in SO_REUSEPORT_LISTEN_OFF that might just become a common BPF
> program we could direct people to use.

Good point. There's tools/testing/selftests/net/ containing already reuseport
BPF example, maybe it could be extended.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 17:01                                                                 ` Eric Dumazet
  2016-03-24 17:26                                                                   ` Tom Herbert
@ 2016-03-24 18:00                                                                   ` Willy Tarreau
  2016-03-24 18:21                                                                     ` Willy Tarreau
  2016-03-24 18:32                                                                     ` Eric Dumazet
  1 sibling, 2 replies; 61+ messages in thread
From: Willy Tarreau @ 2016-03-24 18:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 10:01:37AM -0700, Eric Dumazet wrote:
> On Thu, 2016-03-24 at 17:50 +0100, Willy Tarreau wrote:
> > On Thu, Mar 24, 2016 at 09:33:11AM -0700, Eric Dumazet wrote:
> > > > --- a/net/ipv4/inet_hashtables.c
> > > > +++ b/net/ipv4/inet_hashtables.c
> > > > @@ -189,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
> > > >                                 return -1;
> > > >                         score += 4;
> > > >                 }
> > > > +               if (sk->sk_reuseport)
> > > > +                       score++;
> > > 
> > > This wont work with BPF
> > > 
> > > >                 if (sk->sk_incoming_cpu == raw_smp_processor_id())
> > > >                         score++;
> > > 
> > > This one does not work either with BPF
> > 
> > But this *is* in 4.5. Does this mean that this part doesn't work anymore or
> > just that it's not usable in conjunction with BPF ? In this case I'm less
> > worried, because it would mean that we have a solution for non-BPF aware
> > applications and that BPF-aware applications can simply use BPF.
> > 
> 
> BPF can implement the CPU choice/pref itself. It has everything needed.

Well I don't need the CPU choice, it was already there, it's not my code,
I only need the ability for an independent process to stop receiving new
connections without altering the other processes nor dropping some of these
connections.

In fact initially I didn't even need anything related to incoming connection
load-balancing, just the ability to start a new process without stopping the
old one, as it used to work in 2.2 and for which I used to keep a patch in
2.4 and 2.6. When SO_REUSEPORT was reintroduced in 3.9, that solved the issue
and some users started to complain that between the old and the new processes,
some connections were lost. Hence the proposal above. Since it's not about
load distribution and that processes are totally independent, I don't see
well how to (ab)use BPF to achieve this.

The pattern is :

  t0 : unprivileged processes 1 and 2 are listening to the same port
       (sock1@pid1) (sock2@pid2)
       <------ listening ------>

  t1 : new processes are started to replace the old ones
       (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
       <------ listening ------> <------ listening ------>

  t2 : new processes signal the old ones they must stop
       (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
       <------- draining ------> <------ listening ------>

  t3 : pids 1 and 2 have finished, they go away
                                 (sock3@pid3) (sock4@pid4)
        <------ gone ----->      <------ listening ------>

> >   - it seems to me that for BPF to be usable on process shutting down, we'd
> >     need to have some form of central knowledge if the goal is to redefine
> >     how to distribute the load. In my case there are multiple independant
> >     processes forked on startup, so it's unclear to me how each of them could
> >     reconfigure BPF when shutting down without risking to break the other ones.
> >   - the doc makes me believe that BPF would require privileges to be unset, so
> >     that would not be compatible with a process shutting down which has already
> >     dropped its privileges after startup, but I could be wrong.
> > 
> > Thanks for your help on this,
> > Willy
> > 
> 
> The point is : BPF is the way to go, because it is expandable.

OK so this means we have to find a way to expand it to allow an individual
non-privileged process to change the distribution algorithm without impacting
other processes.

I need to discover it better to find what can be done, but unfortunately at
this point the sole principle makes me think of a level of complexity that
doesn't seem obvious to solve at all :-/

Regards,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 17:55                                                                     ` Daniel Borkmann
@ 2016-03-24 18:20                                                                       ` Tolga Ceylan
  2016-03-24 18:24                                                                         ` Willy Tarreau
  2016-03-24 18:37                                                                         ` Eric Dumazet
  2016-03-24 22:40                                                                       ` Yann Ylavic
  1 sibling, 2 replies; 61+ messages in thread
From: Tolga Ceylan @ 2016-03-24 18:20 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Tom Herbert, Eric Dumazet, Willy Tarreau, Craig Gallek,
	Josh Snyder, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 10:55 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 03/24/2016 06:26 PM, Tom Herbert wrote:
>>
>> I completely agree with this, but I wonder if we now need a repository
>> of useful BPF modules. So in the case of implementing functionality
>> like in SO_REUSEPORT_LISTEN_OFF that might just become a common BPF
>> program we could direct people to use.
>
>
> Good point. There's tools/testing/selftests/net/ containing already
> reuseport
> BPF example, maybe it could be extended.

I would appreciate a conceptual description on how this would work
especially for a common scenario
as described by Willy. My initial impression was that a coordinator
(master) process takes this
responsibility to adjust BPF filters as children come and go.

Two popular software packages have similar use cases: nginx and haproxy.
Another concern is the introduction of BPF itself: should we expect a
performance drop in these applications?

Best Regards,
Tolga Ceylan

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 18:00                                                                   ` Willy Tarreau
@ 2016-03-24 18:21                                                                     ` Willy Tarreau
  2016-03-24 18:32                                                                     ` Eric Dumazet
  1 sibling, 0 replies; 61+ messages in thread
From: Willy Tarreau @ 2016-03-24 18:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 07:00:11PM +0100, Willy Tarreau wrote:
> Since it's not about
> load distribution and that processes are totally independant, I don't see
> well how to (ab)use BPF to achieve this.
> 
> The pattern is :
> 
>   t0 : unprivileged processes 1 and 2 are listening to the same port
>        (sock1@pid1) (sock2@pid2)
>        <------ listening ------>
> 
>   t1 : new processes are started to replace the old ones
>        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>        <------ listening ------> <------ listening ------>
> 
>   t2 : new processes signal the old ones they must stop
>        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>        <------- draining ------> <------ listening ------>
> 
>   t3 : pids 1 and 2 have finished, they go away
>                                  (sock3@pid3) (sock4@pid4)
>         <------ gone ----->      <------ listening ------>
> 

Thinking a bit more about it, would it make sense to consider that in order
to address such a scenario, only the new (still privileged) process
reconfigures the BPF to deliver traffic only to its own sockets, and that
by doing so it will result in the old one not receiving any of it anymore ?
If so that could possibly be reasonably doable then. Ie: the old processes
don't have to do anything to stop receiving traffic.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 18:20                                                                       ` Tolga Ceylan
@ 2016-03-24 18:24                                                                         ` Willy Tarreau
  2016-03-24 18:37                                                                         ` Eric Dumazet
  1 sibling, 0 replies; 61+ messages in thread
From: Willy Tarreau @ 2016-03-24 18:24 UTC (permalink / raw)
  To: Tolga Ceylan
  Cc: Daniel Borkmann, Tom Herbert, Eric Dumazet, Craig Gallek,
	Josh Snyder, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 11:20:49AM -0700, Tolga Ceylan wrote:
> I would appreciate a conceptual description on how this would work
> especially for a common scenario
> as described by Willy. My initial impression was that a coordinator
> (master) process takes this
> responsibility to adjust BPF filters as children come and go.

Indeed that would help, I don't know where to start from for now.

> Two popular software has similar use cases: nginx and haproxy. Another
> concern is with the
> introduction of BPF itself, should we expect a performance drop in
> these applications?

Knowing how picky Eric is about performance in such areas, I'm not
worried a single second about adopting something he recommends :-)
I just need to ensure it covers our users' needs. And maybe the
solution I mentioned in the other e-mail could work.

Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 18:00                                                                   ` Willy Tarreau
  2016-03-24 18:21                                                                     ` Willy Tarreau
@ 2016-03-24 18:32                                                                     ` Eric Dumazet
  1 sibling, 0 replies; 61+ messages in thread
From: Eric Dumazet @ 2016-03-24 18:32 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tolga Ceylan, Tom Herbert, cgallek, Josh Snyder, Aaron Conole,
	David S. Miller, Linux Kernel Network Developers

On Thu, 2016-03-24 at 19:00 +0100, Willy Tarreau wrote:

> OK so this means we have to find a way to expand it to allow an individual
> non-privileged process to change the distribution algorithm without impacting
> other processes.

Just to clarify : Installing a BPF filter on a SO_REUSEPORT socket is
not a privileged operation.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 18:20                                                                       ` Tolga Ceylan
  2016-03-24 18:24                                                                         ` Willy Tarreau
@ 2016-03-24 18:37                                                                         ` Eric Dumazet
  1 sibling, 0 replies; 61+ messages in thread
From: Eric Dumazet @ 2016-03-24 18:37 UTC (permalink / raw)
  To: Tolga Ceylan
  Cc: Daniel Borkmann, Tom Herbert, Willy Tarreau, Craig Gallek,
	Josh Snyder, Aaron Conole, David S. Miller,
	Linux Kernel Network Developers

On Thu, 2016-03-24 at 11:20 -0700, Tolga Ceylan wrote:
> On Thu, Mar 24, 2016 at 10:55 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > On 03/24/2016 06:26 PM, Tom Herbert wrote:
> >>
> >> I completely agree with this, but I wonder if we now need a repository
> >> of useful BPF modules. So in the case of implementing functionality
> >> like in SO_REUSEPORT_LISTEN_OFF that might just become a common BPF
> >> program we could direct people to use.
> >
> >
> > Good point. There's tools/testing/selftests/net/ containing already
> > reuseport
> > BPF example, maybe it could be extended.
> 
> I would appreciate a conceptual description on how this would work
> especially for a common scenario
> as described by Willy. My initial impression was that a coordinator
> (master) process takes this
> responsibility to adjust BPF filters as children come and go.
> 
> Two popular software has similar use cases: nginx and haproxy. Another
> concern is with the
> introduction of BPF itself, should we expect a performance drop in
> these applications?

Just to make it clear : 

BPF allows proper siloing if you have multi queue NIC, instead of a
random hashing that was reducing performance.

BPF on SO_REUSEPORT can reduce false sharing between cpus and increase
NUMA locality.

BPF allows us to use whatever number of silos, without having to scan
the whole socket list for every incoming packet.

Huge gain really.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 17:55                                                                     ` Daniel Borkmann
  2016-03-24 18:20                                                                       ` Tolga Ceylan
@ 2016-03-24 22:40                                                                       ` Yann Ylavic
  2016-03-24 22:49                                                                         ` Eric Dumazet
  2016-03-25  0:24                                                                         ` David Miller
  1 sibling, 2 replies; 61+ messages in thread
From: Yann Ylavic @ 2016-03-24 22:40 UTC (permalink / raw)
  To: Linux Kernel Network Developers
  Cc: Tom Herbert, Eric Dumazet, Willy Tarreau, Tolga Ceylan,
	Craig Gallek, Josh Snyder, Aaron Conole, David S. Miller,
	Daniel Borkmann

On Thu, Mar 24, 2016 at 6:55 PM, Daniel Borkmann wrote:
> On 03/24/2016 06:26 PM, Tom Herbert wrote:
>>
>> On Thu, Mar 24, 2016 at 10:01 AM, Eric Dumazet wrote:
>>>
>>> Really, when BPF can be the solution, we wont allow adding new stuff in
>>> the kernel in the old way.
>>
>> I completely agree with this, but I wonder if we now need a repository
>> of useful BPF modules. So in the case of implementing functionality
>> like in SO_REUSEPORT_LISTEN_OFF that might just become a common BPF
>> program we could direct people to use.
>
> Good point. There's tools/testing/selftests/net/ containing already
> reuseport
> BPF example, maybe it could be extended.

FWIW, I find:

    const struct bpf_insn prog[] = {
        /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */
        { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 },
        /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */
        { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 },
        /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */
        { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod },
        /* BPF_EXIT_INSN() */
        { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
    };
(and all the way to make it run)

something quite unintuitive from a web server developer's perspective,
simply to make SO_REUSEPORT work with forked TCP listeners (probably
as it should out of the box)...

Regards,
Yann.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 22:40                                                                       ` Yann Ylavic
@ 2016-03-24 22:49                                                                         ` Eric Dumazet
  2016-03-24 23:40                                                                           ` Yann Ylavic
  2016-03-25  0:25                                                                           ` David Miller
  2016-03-25  0:24                                                                         ` David Miller
  1 sibling, 2 replies; 61+ messages in thread
From: Eric Dumazet @ 2016-03-24 22:49 UTC (permalink / raw)
  To: Yann Ylavic
  Cc: Linux Kernel Network Developers, Tom Herbert, Willy Tarreau,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Thu, 2016-03-24 at 23:40 +0100, Yann Ylavic wrote:

> FWIW, I find:
> 
>     const struct bpf_insn prog[] = {
>         /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */
>         { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 },
>         /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */
>         { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 },
>         /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */
>         { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod },
>         /* BPF_EXIT_INSN() */
>         { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
>     };
> (and all the way to make it run)
> 
> something quite unintuitive from a web server developper perspective,
> simply to make SO_REUSEPORT work with forked TCP listeners (probably
> as it should out of the box)...


That is why EBPF has LLVM backend.

Basically you can write your "BPF" program in C, and let llvm convert it
into EBPF.

Sure, you still can write BPF manually, as you could write HTTPS server
in assembly.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 22:49                                                                         ` Eric Dumazet
@ 2016-03-24 23:40                                                                           ` Yann Ylavic
  2016-03-24 23:54                                                                             ` Tom Herbert
  2016-03-25  0:25                                                                           ` David Miller
  1 sibling, 1 reply; 61+ messages in thread
From: Yann Ylavic @ 2016-03-24 23:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linux Kernel Network Developers, Tom Herbert, Willy Tarreau,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Thu, Mar 24, 2016 at 11:49 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-03-24 at 23:40 +0100, Yann Ylavic wrote:
>
>> FWIW, I find:
>>
>>     const struct bpf_insn prog[] = {
>>         /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */
>>         { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 },
>>         /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */
>>         { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 },
>>         /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */
>>         { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod },
>>         /* BPF_EXIT_INSN() */
>>         { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
>>     };
>> (and all the way to make it run)
>>
>> something quite unintuitive from a web server developper perspective,
>> simply to make SO_REUSEPORT work with forked TCP listeners (probably
>> as it should out of the box)...
>
>
> That is why EBPF has LLVM backend.
>
> Basically you can write your "BPF" program in C, and let llvm convert it
> into EBPF.

I'll learn how to do this to get the best performances from the
server, but having to do so to work around what looks like a defect
(for simple/default SMP configurations at least, no NUMA or clever
CPU-affinity or queuing policy involved) seems odd in the first place.

From this POV, draining the (ending) listeners is already non obvious
but might be reasonable, (e)BPF sounds really overkill.

But there are surely plenty of good reasons for it, and I won't be
able to dispute your technical arguments in any case ;)

>
> Sure, you still can write BPF manually, as you could write HTTPS server
> in assembly.

OK, I'll take your previous proposal :)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 23:40                                                                           ` Yann Ylavic
@ 2016-03-24 23:54                                                                             ` Tom Herbert
  2016-03-25  0:01                                                                               ` Yann Ylavic
  2016-03-25  5:28                                                                               ` Willy Tarreau
  0 siblings, 2 replies; 61+ messages in thread
From: Tom Herbert @ 2016-03-24 23:54 UTC (permalink / raw)
  To: Yann Ylavic
  Cc: Eric Dumazet, Linux Kernel Network Developers, Willy Tarreau,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Thu, Mar 24, 2016 at 4:40 PM, Yann Ylavic <ylavic.dev@gmail.com> wrote:
> On Thu, Mar 24, 2016 at 11:49 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Thu, 2016-03-24 at 23:40 +0100, Yann Ylavic wrote:
>>
>>> FWIW, I find:
>>>
>>>     const struct bpf_insn prog[] = {
>>>         /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */
>>>         { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 },
>>>         /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */
>>>         { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 },
>>>         /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */
>>>         { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod },
>>>         /* BPF_EXIT_INSN() */
>>>         { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
>>>     };
>>> (and all the way to make it run)
>>>
>>> something quite unintuitive from a web server developper perspective,
>>> simply to make SO_REUSEPORT work with forked TCP listeners (probably
>>> as it should out of the box)...
>>
>>
>> That is why EBPF has LLVM backend.
>>
>> Basically you can write your "BPF" program in C, and let llvm convert it
>> into EBPF.
>
> I'll learn how to do this to get the best performances from the
> server, but having to do so to work around what looks like a defect
> (for simple/default SMP configurations at least, no NUMA or clever
> CPU-affinity or queuing policy involved) seems odd in the first place.
>
I disagree with your assessment that there is a defect. SO_REUSEPORT
is designed to spread packets amongst _equivalent_ connections. In the
server draining case sockets are no longer equivalent, but that is a
special case.

> From this POV, draining the (ending) listeners is already non obvious
> but might be reasonable, (e)BPF sounds really overkill.
>
Just the opposite, it's a simplification. With BPF we no longer need to add
interfaces for all these special cases. This is an important point,
because the question is going to be raised for any proposed interface
change that could be accomplished with BPF (i.e. adding new interfaces
in the kernel becomes the overkill).

Please try to work with it. As I mentioned, the part that we may be
missing are some real world programs that we can direct people to use,
but aside from that I don't think we've seen any arguments that BPF is
overkill or too hard to use for stuff like this.

Tom

> But there are surely plenty of good reasons for it, and I won't be
> able to dispute your technical arguments in any case ;)
>
>>
>> Sure, you still can write BPF manually, as you could write HTTPS server
>> in assembly.
>
> OK, I'll take your previous proposal :)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 23:54                                                                             ` Tom Herbert
@ 2016-03-25  0:01                                                                               ` Yann Ylavic
  2016-03-25  5:28                                                                               ` Willy Tarreau
  1 sibling, 0 replies; 61+ messages in thread
From: Yann Ylavic @ 2016-03-25  0:01 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Eric Dumazet, Linux Kernel Network Developers, Willy Tarreau,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Fri, Mar 25, 2016 at 12:54 AM, Tom Herbert <tom@herbertland.com> wrote:
> On Thu, Mar 24, 2016 at 4:40 PM, Yann Ylavic <ylavic.dev@gmail.com> wrote:
>>
>> From this POV, draining the (ending) listeners is already non obvious
>> but might be reasonable, (e)BPF sounds really overkill.
>>
> Just the opposite, it's a simplification. With BPF we no longer to add
> interfaces for all these special cases. This is an important point,
> because the question is going to be raised for any proposed interface
> change that could be accomplished with BPF (i.e. adding new interfaces
> in the kernel becomes the overkill).
>
> Please try to work with it. As I mentioned, the part that we may be
> missing are some real world programs that we can direct people to use,
> but aside from that I don't think we've seen any arguments that BPF is
> overkill or too hard to use for stuff like this.

OK, you certainly know better than me.
Really looking forward these real world BPF programs to enlighten me.

Thanks for your time.

Regards,
Yann.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 22:40                                                                       ` Yann Ylavic
  2016-03-24 22:49                                                                         ` Eric Dumazet
@ 2016-03-25  0:24                                                                         ` David Miller
  1 sibling, 0 replies; 61+ messages in thread
From: David Miller @ 2016-03-25  0:24 UTC (permalink / raw)
  To: ylavic.dev
  Cc: netdev, tom, eric.dumazet, w, tolga.ceylan, cgallek, josh,
	aconole, daniel

From: Yann Ylavic <ylavic.dev@gmail.com>
Date: Thu, 24 Mar 2016 23:40:30 +0100

> On Thu, Mar 24, 2016 at 6:55 PM, Daniel Borkmann wrote:
>> On 03/24/2016 06:26 PM, Tom Herbert wrote:
>>>
>>> On Thu, Mar 24, 2016 at 10:01 AM, Eric Dumazet wrote:
>>>>
>>>> Really, when BPF can be the solution, we wont allow adding new stuff in
>>>> the kernel in the old way.
>>>
>>> I completely agree with this, but I wonder if we now need a repository
>>> of useful BPF modules. So in the case of implementing functionality
>>> like in SO_REUSEPORT_LISTEN_OFF that might just become a common BPF
>>> program we could direct people to use.
>>
>> Good point. There's tools/testing/selftests/net/ containing already
>> reuseport
>> BPF example, maybe it could be extended.
> 
> FWIW, I find:
> 
>     const struct bpf_insn prog[] = {
>         /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */
>         { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 },
>         /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */
>         { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 },
>         /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */
>         { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod },
>         /* BPF_EXIT_INSN() */
>         { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
>     };
> (and all the way to make it run)
> 
> something quite unintuitive from a web server developper perspective,
> simply to make SO_REUSEPORT work with forked TCP listeners (probably
> as it should out of the box)...

If we encapsulate this into libraries and helper wrappers, there is
no reason web server developers should be looking at these details
anyways.

Please don't make a mountain out of a mole-hill.

We build things on top of good infrastructure, rather than build
duplicate ways to do the same exact thing.

BPF is good infrastructure, therefore that is what things will be
built on top of.

Thanks.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 22:49                                                                         ` Eric Dumazet
  2016-03-24 23:40                                                                           ` Yann Ylavic
@ 2016-03-25  0:25                                                                           ` David Miller
  1 sibling, 0 replies; 61+ messages in thread
From: David Miller @ 2016-03-25  0:25 UTC (permalink / raw)
  To: eric.dumazet
  Cc: ylavic.dev, netdev, tom, w, tolga.ceylan, cgallek, josh, aconole, daniel

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Mar 2016 15:49:48 -0700

> That is why EBPF has LLVM backend.
> 
> Basically you can write your "BPF" program in C, and let llvm convert it
> into EBPF.
> 
> Sure, you still can write BPF manually, as you could write HTTPS server
> in assembly.

+1

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-24 23:54                                                                             ` Tom Herbert
  2016-03-25  0:01                                                                               ` Yann Ylavic
@ 2016-03-25  5:28                                                                               ` Willy Tarreau
  2016-03-25  6:49                                                                                 ` Eric Dumazet
  1 sibling, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2016-03-25  5:28 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Yann Ylavic, Eric Dumazet, Linux Kernel Network Developers,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Thu, Mar 24, 2016 at 04:54:03PM -0700, Tom Herbert wrote:
> On Thu, Mar 24, 2016 at 4:40 PM, Yann Ylavic <ylavic.dev@gmail.com> wrote:
> > I'll learn how to do this to get the best performances from the
> > server, but having to do so to work around what looks like a defect
> > (for simple/default SMP configurations at least, no NUMA or clever
> > CPU-affinity or queuing policy involved) seems odd in the first place.
> >
> I disagree with your assessment that there is a defect. SO_REUSEPORT
> is designed to spread packets amongst _equivalent_ connections. In the
> server draining case sockets are no longer equivalent, but that is a
> special case.

I partially disagree with you here Tom. Initially SO_REUSEPORT was not
used to spread packets but to allow soft restart in some applications.
I've been using it since 2001 in haproxy on *BSD and linux 2.2. It was
removed during 2.3 and I used to keep a patch to reimplement it in 2.4
(basically 2 or 3 lines, the infrastructure was still present), but the
patch was not accepted. The same patch worked for 2.6 and 3.x, allowing
me to continue to perform soft-restarts on Linux just like I used to do
on *BSD. When SO_REUSEPORT was reimplemented in 3.9 with load balancing,
I was happy because it at last allowed me to drop my patch and I got
the extra benefit of better load balancing of incoming connections.

But the main use we have for it (at least historically) is for soft
restarts, where one process replaces another one. Very few people use
more than one process in our case.

However given the benefits of the load spreading for extreme loads,
I'm willing to find how to achieve the same with BPF, but it's pretty
clear that at this point I have no idea where to start from and that
for a single process replacing a single one, it looks quite complicated.

For me quite frankly the special case is the load balancing which is
a side benefit (and a nice one, don't get me wrong).

That's why I would have found it nice to "fix" the process replacement
to avoid dropping incoming connections, though I don't want it to become
a problem for future improvements on BPF. I don't think the two lines I
proposed could become an issue but I'll live without them (or continue
to apply this patch).

BTW, I have no problem with having to write a little bit of assembly for
fast interfaces if it remains untouched for years, we do already have a
bit in haproxy. It's just a longterm investment.

Best regards,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25  5:28                                                                               ` Willy Tarreau
@ 2016-03-25  6:49                                                                                 ` Eric Dumazet
  2016-03-25  8:53                                                                                   ` Willy Tarreau
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2016-03-25  6:49 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Tom Herbert, Yann Ylavic, Linux Kernel Network Developers,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Fri, 2016-03-25 at 06:28 +0100, Willy Tarreau wrote:
> On Thu, Mar 24, 2016 at 04:54:03PM -0700, Tom Herbert wrote:
> > On Thu, Mar 24, 2016 at 4:40 PM, Yann Ylavic <ylavic.dev@gmail.com> wrote:
> > > I'll learn how to do this to get the best performances from the
> > > server, but having to do so to work around what looks like a defect
> > > (for simple/default SMP configurations at least, no NUMA or clever
> > > CPU-affinity or queuing policy involved) seems odd in the first place.
> > >
> > I disagree with your assessment that there is a defect. SO_REUSEPORT
> > is designed to spread packets amongst _equivalent_ connections. In the
> > server draining case sockets are no longer equivalent, but that is a
> > special case.
> 
> I partially disagree with you here Tom. Initially SO_REUSEPORT was not
> used to spread packets but to allow soft restart in some applications.
> I've been using it since 2001 in haproxy on *BSD and linux 2.2. It was
> removed during 2.3 and I used to keep a patch to reimplement it in 2.4
> (basically 2 or 3 lines, the infrastructure was still present), but the
> patch was not accepted. The same patch worked for 2.6 and 3.x, allowing
> me to continue to perform soft-restarts on Linux just like I used to do
> on *BSD. When SO_REUSEPORT was reimplemented in 3.9 with load balancing,
> I was happy because it at last allowed me to drop my patch and I got
> the extra benefit of better load balancing of incoming connections.
> 
> But the main use we have for it (at least historically) is for soft
> restarts, where one process replaces another one. Very few people use
> more than one process in our case.
> 
> However given the benefits of the load spreading for extreme loads,
> I'm willing to find how to achieve the same with BPF, but it's pretty
> clear that at this point I have no idea where to start from and that
> for a single process replacing a single one, it looks quite complicated.
> 
> For me quite frankly the special case is the load balancing which is
> a side benefit (and a nice one, don't get me wrong).
> 
> That's why I would have found it nice to "fix" the process replacement
> to avoid dropping incoming connections, though I don't want it to become
> a problem for future improvements on BPF. I don't think the two lines I
> proposed could become an issue but I'll live without them (or continue
> to apply this patch).
> 
> BTW, I have no problem with having to write a little bit of assembly for
> fast interfaces if it remains untouched for years, we do already have a
> bit in haproxy. It's just a longterm investment.

Everything is possible, but do not complain because BPF went in the
kernel before your changes.

Just rework your patch.

Supporting multiple SO_REUSEPORT groups on the same port should not be
too hard really. Making sure BPF still works for them is feasible.

But the semantic of the socket option would be really different.

You need to not control an individual listener, but a group of listeners.

Your dying haproxy would issue a single system call to tell the kernel :
My SO_REUSEPORT group should not accept new SYN packets, so that the new
haproxy can setup a working new SO_REUSEPORT group.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25  6:49                                                                                 ` Eric Dumazet
@ 2016-03-25  8:53                                                                                   ` Willy Tarreau
  2016-03-25 11:21                                                                                     ` Yann Ylavic
  0 siblings, 1 reply; 61+ messages in thread
From: Willy Tarreau @ 2016-03-25  8:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tom Herbert, Yann Ylavic, Linux Kernel Network Developers,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

Hi Eric,

On Thu, Mar 24, 2016 at 11:49:41PM -0700, Eric Dumazet wrote:
> Everything is possible, but do not complain because BPF went in the
> kernel before your changes.

Don't get me wrong, I'm not complaining, I'm more asking for help to
try to elaborate the alternate solution. I understood well what my
proposal did because it was pretty straightforward, and the new one
I'll have to do is of an immense complexity for me by now, since it
will require learning a new language and finding doc on how all this
works together. But as I said I'm totally sold to the benefits it can
provide for large scale deployments and I'm well aware of the ugly
socket scan there was in the previous one.

> Just rework your patch.
> 
> Supporting multiple SO_REUSEPORT groups on the same port should not be
> too hard really. Making sure BPF still works for them is feasible.
> 
> But the semantic of the socket option would be really different.

I don't care much about the socket options themselves as long as I can
continue to support seamless reloads. I could even get rid of SO_REUSEPORT
on Linux to use something else instead if I have a reliable way to detect
that the alternative will work.

> You need to not control an individual listener, but a group of listener.
> 
> Your dying haproxy would issue a single system call to tell the kernel :
> My SO_REUSEPORT group should not accept new SYN packets, so that the new
> haproxy can setup a working new SO_REUSEPORT group.

Normally it's the other way around :-) The new process first grabs the
socket, there's a tiny window where both are attached, and only then the
old process leaves. That's the only way to ensure there's no loss nor
added latency in the processing.

If you could share a pointer to some good starter documentation for this,
I would appreciate it. I really have no idea where to start from and the
only elements I found on the net didn't give a single hint regarding all
this :-/

Thanks,
Willy

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25  8:53                                                                                   ` Willy Tarreau
@ 2016-03-25 11:21                                                                                     ` Yann Ylavic
  2016-03-25 13:17                                                                                       ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Yann Ylavic @ 2016-03-25 11:21 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Eric Dumazet, Tom Herbert, Linux Kernel Network Developers,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Fri, Mar 25, 2016 at 9:53 AM, Willy Tarreau wrote:
>
> On Thu, Mar 24, 2016 at 11:49:41PM -0700, Eric Dumazet wrote:
>> Everything is possible, but do not complain because BPF went in the
>> kernel before your changes.
>
> Don't get me wrong, I'm not complaining, I'm more asking for help to
> try to elaborate the alternate solution. I understood well what my
> proposal did because it was pretty straightforward, and the new one
> I'll have to do is of an immense complexity for me by now, since it
> will require learning a new language and finding doc on how all this
> works together. But as I said I'm totally sold to the benefits it can
> provide for large scale deployments and I'm well aware of the ugly
> socket scan there was in the previous one.

+1 (all the work done on the listeners, including SO_REUSEPORT, and
until lately very much appreciated in any case!)

>
> If you could share a pointer to some good starter documentation for this,
> I would appreciate it. I really have no idea where to start from and the
> only elements I found on the net didn't give a single hint regarding all
> this :-/

+1

On Fri, Mar 25, 2016 at 1:24 AM, David Miller wrote:
>
> If we encapsulate this into libraries and helper wrappers, there is
> no reason web server developers should be looking at these details
> anyways.

++1

>
> Please don't make a mountain out of a mole-hill.

Not my intention, you guys know what's better for the kernel and its APIs.
My concern is (was) admittedly due to my own lack of knowledge of
(e)BPF, hence how much of kernel internals I'd need to know to make
the SO_REUSEPORT work in my case.

But sure, any of the help suggested above would make it go away, I'll
stay tuned.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25 11:21                                                                                     ` Yann Ylavic
@ 2016-03-25 13:17                                                                                       ` Eric Dumazet
  0 siblings, 0 replies; 61+ messages in thread
From: Eric Dumazet @ 2016-03-25 13:17 UTC (permalink / raw)
  To: Yann Ylavic
  Cc: Willy Tarreau, Tom Herbert, Linux Kernel Network Developers,
	Tolga Ceylan, Craig Gallek, Josh Snyder, Aaron Conole,
	David S. Miller, Daniel Borkmann

On Fri, 2016-03-25 at 12:21 +0100, Yann Ylavic wrote:

> Not my intention, you guys know what's better for the kernel and its APIs.
> My concern is (was) admittedly due to my own lack of knowledge of
> (e)BPF, hence how much of kernel internals I'd need to know to make
> the SO_REUSEPORT work in my case.
> 
> But sure, any of the help suggested above would make it go away, I'll
> stay tuned.

Most of the time, the BPF filter can be trivial.

If your server has a multi queue NIC with 16 queues on a 16 cpus host,
it would be natural to use 16 listeners, to properly use the NIC
features (RSS).

BPF program would be

A = QUEUE_NUMBER(skb);
RETURN A;

If you chose to have 4 listeners instead, for whatever reasons.

A = QUEUE_NUMBER(skb);
A = A & 3;
RETURN A;


You also can use 

A = CURRENT_CPU;
RETURN A;

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25 17:00     ` Eric Dumazet
@ 2016-03-25 18:31       ` Willem de Bruijn
  0 siblings, 0 replies; 61+ messages in thread
From: Willem de Bruijn @ 2016-03-25 18:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Craig Gallek, Linux Kernel Network Developers, Alexei Starovoitov

On Fri, Mar 25, 2016 at 1:00 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2016-03-25 at 12:31 -0400, Craig Gallek wrote:
>
>> I believe the issue here is that closing the listen sockets will drop
>> any connections that are in the listen queue but have not been
>> accepted yet.  In the case of reuseport, you could in theory drain
>> those queues into the non-closed sockets, but that probably has some
>> interesting consequences...
>
> It is more complicated than this.
>
> Ideally, no TCP connection should be dropped during a server change.
>
> The idea is to let old program running as long as :
> 1) It has established TCP sessions
> 2) Some SYN_RECV pseudo requests are still around
>
> Once 3WHS completes for these SYN_RECV, children are queued into
> listener accept queues.
>
> But the idea is to direct all new SYN packets to the 'new' process and
> its listeners. (New SYN_RECV should be created on behalf on the new
> listeners only)
>
>
> In some environments, the listeners are simply transferred via FD
> passing, from the 'old process' to the new one.

Right. Comparatively, one of the nice features of the BPF variant is
that the sockets in the old process can passively enter listen_off
state solely with changes initiated by the new process (change the bpf
filter for the group).

By the way, if I read correctly, the listen_off feature was already
possible without kernel changes prior to fast reuseport by changing
SO_BINDTODEVICE on the old process's sockets to effectively segment
them into a separate reuseport group. With fast reuseport,
sk_bound_dev_if state equivalence is checked on joining a group, but
the socket is not removed from the array when that syscall is made, so
this does not work.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25 16:31   ` Craig Gallek
@ 2016-03-25 17:00     ` Eric Dumazet
  2016-03-25 18:31       ` Willem de Bruijn
  0 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2016-03-25 17:00 UTC (permalink / raw)
  To: Craig Gallek; +Cc: Linux Kernel Network Developers, Alexei Starovoitov

On Fri, 2016-03-25 at 12:31 -0400, Craig Gallek wrote:

> I believe the issue here is that closing the listen sockets will drop
> any connections that are in the listen queue but have not been
> accepted yet.  In the case of reuseport, you could in theory drain
> those queues into the non-closed sockets, but that probably has some
> interesting consequences...

It is more complicated than this.

Ideally, no TCP connection should be dropped during a server change.

The idea is to let old program running as long as :
1) It has established TCP sessions
2) Some SYN_RECV pseudo requests are still around

Once 3WHS completes for these SYN_RECV, children are queued into
listener accept queues.

But the idea is to direct all new SYN packets to the 'new' process and
its listeners. (New SYN_RECV should be created on behalf of the new
listeners only)


In some environments, the listeners are simply transferred via FD
passing, from the 'old process' to the new one.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25 16:21 ` Alexei Starovoitov
@ 2016-03-25 16:31   ` Craig Gallek
  2016-03-25 17:00     ` Eric Dumazet
  0 siblings, 1 reply; 61+ messages in thread
From: Craig Gallek @ 2016-03-25 16:31 UTC (permalink / raw)
  To: Linux Kernel Network Developers; +Cc: Alexei Starovoitov

On Fri, Mar 25, 2016 at 12:21 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, Mar 25, 2016 at 11:29:10AM -0400, Craig Gallek wrote:
>> On Thu, Mar 24, 2016 at 2:00 PM, Willy Tarreau <w@1wt.eu> wrote:
>> > The pattern is :
>> >
>> >   t0 : unprivileged processes 1 and 2 are listening to the same port
>> >        (sock1@pid1) (sock2@pid2)
>> >        <------ listening ------>
>> >
>> >   t1 : new processes are started to replace the old ones
>> >        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>> >        <------ listening ------> <------ listening ------>
>> >
>> >   t2 : new processes signal the old ones they must stop
>> >        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>> >        <------- draining ------> <------ listening ------>
>> >
>> >   t3 : pids 1 and 2 have finished, they go away
>> >                                  (sock3@pid3) (sock4@pid4)
>> >         <------ gone ----->      <------ listening ------>
> ...
>> t3: Close the first two sockets and only use the last two.  This is
>> the tricky step.  Before this point, the sockets are numbered 0
>> through 3 from the perspective of the BPF program (in the order
>> listen() was called).  As soon as socket 0 is closed, the last socket
>> in the list replaces it (what was 3 becomes 0).  When socket 1 is
>> closed, socket 2 moves into that position.  The assumptions about the
>> socket indexes in the BPF program need to change as the indexes change
>> as a result of closing them.
>
> yeah, the way reuseport_detach_sock() was done makes it hard to manage
> such transitions from bpf program, but I don't see yet what stops
> pid1 and pid2 at stage t2 to just close their sockets.
> If these 'draining' pids don't want to receive packets, they should
> close their sockets. Complicating bpf side to redistribute spraying
> to sock3 and sock4 only (while sock1 and sock2 are still open) is possible,
> but looks unnecessary complex to me.
> Just close sock1 and sock2 at t2 time and then exit pid1, pid2 later.
> If they are tcp sockets with rpc protocol on top and have a problem of
> partial messages, then kcm can solve that and it will simplify
> the user space side as well.
I believe the issue here is that closing the listen sockets will drop
any connections that are in the listen queue but have not been
accepted yet.  In the case of reuseport, you could in theory drain
those queues into the non-closed sockets, but that probably has some
interesting consequences...

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
  2016-03-25 15:29 Craig Gallek
@ 2016-03-25 16:21 ` Alexei Starovoitov
  2016-03-25 16:31   ` Craig Gallek
  0 siblings, 1 reply; 61+ messages in thread
From: Alexei Starovoitov @ 2016-03-25 16:21 UTC (permalink / raw)
  To: Craig Gallek; +Cc: Linux Kernel Network Developers

On Fri, Mar 25, 2016 at 11:29:10AM -0400, Craig Gallek wrote:
> On Thu, Mar 24, 2016 at 2:00 PM, Willy Tarreau <w@1wt.eu> wrote:
> > The pattern is :
> >
> >   t0 : unprivileged processes 1 and 2 are listening to the same port
> >        (sock1@pid1) (sock2@pid2)
> >        <------ listening ------>
> >
> >   t1 : new processes are started to replace the old ones
> >        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
> >        <------ listening ------> <------ listening ------>
> >
> >   t2 : new processes signal the old ones they must stop
> >        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
> >        <------- draining ------> <------ listening ------>
> >
> >   t3 : pids 1 and 2 have finished, they go away
> >                                  (sock3@pid3) (sock4@pid4)
> >         <------ gone ----->      <------ listening ------>
...
> t3: Close the first two sockets and only use the last two.  This is
> the tricky step.  Before this point, the sockets are numbered 0
> through 3 from the perspective of the BPF program (in the order
> listen() was called).  As soon as socket 0 is closed, the last socket
> in the list replaces it (what was 3 becomes 0).  When socket 1 is
> closed, socket 2 moves into that position.  The assumptions about the
> socket indexes in the BPF program need to change as the indexes change
> as a result of closing them.

yeah, the way reuseport_detach_sock() was done makes it hard to manage
such transitions from bpf program, but I don't see yet what stops
pid1 and pid2 at stage t2 to just close their sockets.
If these 'draining' pids don't want to receive packets, they should
close their sockets. Complicating bpf side to redistribute spraying
to sock3 and sock4 only (while sock1 and sock2 are still open) is possible,
but looks unnecessary complex to me.
Just close sock1 and sock2 at t2 time and then exit pid1, pid2 later.
If they are tcp sockets with rpc protocol on top and have a problem of
partial messages, then kcm can solve that and it will simplify
the user space side as well.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
@ 2016-03-25 15:29 Craig Gallek
  2016-03-25 16:21 ` Alexei Starovoitov
  0 siblings, 1 reply; 61+ messages in thread
From: Craig Gallek @ 2016-03-25 15:29 UTC (permalink / raw)
  To: Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 2:00 PM, Willy Tarreau <w@1wt.eu> wrote:
> The pattern is :
>
>   t0 : unprivileged processes 1 and 2 are listening to the same port
>        (sock1@pid1) (sock2@pid2)
>        <------ listening ------>
>
>   t1 : new processes are started to replace the old ones
>        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>        <------ listening ------> <------ listening ------>
>
>   t2 : new processes signal the old ones they must stop
>        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>        <------- draining ------> <------ listening ------>
>
>   t3 : pids 1 and 2 have finished, they go away
>                                  (sock3@pid3) (sock4@pid4)
>         <------ gone ----->      <------ listening ------>

To address the documentation issues, I'd like to reference the following:
- The filter.txt document in the kernel tree:
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/filter.txt
- It uses (and extends) the BPF instruction set defined in the
original BSD BPF paper: http://www.tcpdump.org/papers/bpf-usenix93.pdf
- The kernel headers define all of the user-space structures used:
  * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/filter.h
  * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/bpf.h

I've been trying to come up with an example BPF program for use in the
example Willy gave earlier in this thread (using 4 points in time and
describing one process with two listening sockets replacing another
with two listening sockets).  Everything except the last step is
pretty straight forward using what is currently available in the
kernel.  I'm using random distribution for simplicity, but you could
easily do something smarter using more information about the specific
hardware:

t0: Evenly distribute load to two SO_REUSEPORT sockets in a single process:
  ld rand
  mod #2
  ret a

t1: Fork a new process, create two new listening sockets in the same
group. Even after calling listen(), but before updating the BPF
program, only the first two sockets will see new connections.  The
program is trivially modified to use all 4.
  ld rand
  mod #4
  ret a

t2: Stop sending new connections to the first two sockets (draining)
  ld rand
  mod #2
  add #2
  ret a

t3: Close the first two sockets and only use the last two.  This is
the tricky step.  Before this point, the sockets are numbered 0
through 3 from the perspective of the BPF program (in the order
listen() was called).  As soon as socket 0 is closed, the last socket
in the list replaces it (what was 3 becomes 0).  When socket 1 is
closed, socket 2 moves into that position.  The assumptions about the
socket indexes in the BPF program need to change as the indexes change
as a result of closing them.

Even if you use an EBPF map as a level of indirection here, you still
have the issue that the socket indexes change as a result of some of
them leaving the group.  I'm not sure yet how to properly fix this,
but it will probably mean changing the way the socket indexing
works...  The current scheme is really an implementation detail
optimized for efficiency.  It may be worth modifying or creating a
mode which results in a stable mapping.  This will probably be
necessary for any scheme which expects sockets to regularly enter or
leave the group.

^ permalink raw reply	[flat|nested] 61+ messages in thread

end of thread, other threads:[~2016-03-25 18:32 UTC | newest]

Thread overview: 61+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-27  0:30 [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode Tolga Ceylan
2015-09-27  1:04 ` Eric Dumazet
2015-09-27  1:37   ` Tolga Ceylan
2015-09-27  1:44 ` Aaron Conole
2015-09-27  2:02   ` Tolga Ceylan
2015-09-27  2:24     ` Eric Dumazet
2015-11-11  5:41       ` Tom Herbert
2015-11-11  6:19         ` Eric Dumazet
2015-11-11 17:05           ` Tom Herbert
2015-11-11 17:23             ` Eric Dumazet
2015-11-11 18:23               ` Tom Herbert
2015-11-11 18:43                 ` Eric Dumazet
2015-11-12  1:09                   ` Eric Dumazet
2015-12-15 16:14                     ` Willy Tarreau
2015-12-15 17:10                       ` Eric Dumazet
2015-12-15 17:43                         ` Willy Tarreau
2015-12-15 18:21                           ` Eric Dumazet
2015-12-15 19:44                             ` Willy Tarreau
2015-12-15 21:21                               ` Eric Dumazet
2015-12-16  7:38                                 ` Willy Tarreau
2015-12-16 16:15                                   ` Willy Tarreau
2015-12-18 16:33                                     ` Josh Snyder
2015-12-18 18:58                                       ` Willy Tarreau
2015-12-19  2:38                                         ` Eric Dumazet
2015-12-19  7:00                                           ` Willy Tarreau
2015-12-21 20:38                                             ` Tom Herbert
2015-12-21 20:41                                               ` Willy Tarreau
2016-03-24  5:10                                                 ` Tolga Ceylan
2016-03-24  6:12                                                   ` Willy Tarreau
2016-03-24 14:13                                                     ` Eric Dumazet
2016-03-24 14:22                                                       ` Willy Tarreau
2016-03-24 14:45                                                         ` Eric Dumazet
2016-03-24 15:30                                                           ` Willy Tarreau
2016-03-24 16:33                                                             ` Eric Dumazet
2016-03-24 16:50                                                               ` Willy Tarreau
2016-03-24 17:01                                                                 ` Eric Dumazet
2016-03-24 17:26                                                                   ` Tom Herbert
2016-03-24 17:55                                                                     ` Daniel Borkmann
2016-03-24 18:20                                                                       ` Tolga Ceylan
2016-03-24 18:24                                                                         ` Willy Tarreau
2016-03-24 18:37                                                                         ` Eric Dumazet
2016-03-24 22:40                                                                       ` Yann Ylavic
2016-03-24 22:49                                                                         ` Eric Dumazet
2016-03-24 23:40                                                                           ` Yann Ylavic
2016-03-24 23:54                                                                             ` Tom Herbert
2016-03-25  0:01                                                                               ` Yann Ylavic
2016-03-25  5:28                                                                               ` Willy Tarreau
2016-03-25  6:49                                                                                 ` Eric Dumazet
2016-03-25  8:53                                                                                   ` Willy Tarreau
2016-03-25 11:21                                                                                     ` Yann Ylavic
2016-03-25 13:17                                                                                       ` Eric Dumazet
2016-03-25  0:25                                                                           ` David Miller
2016-03-25  0:24                                                                         ` David Miller
2016-03-24 18:00                                                                   ` Willy Tarreau
2016-03-24 18:21                                                                     ` Willy Tarreau
2016-03-24 18:32                                                                     ` Eric Dumazet
2016-03-25 15:29 Craig Gallek
2016-03-25 16:21 ` Alexei Starovoitov
2016-03-25 16:31   ` Craig Gallek
2016-03-25 17:00     ` Eric Dumazet
2016-03-25 18:31       ` Willem de Bruijn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).