netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
@ 2015-09-27  0:30 Tolga Ceylan
  2015-09-27  1:04 ` Eric Dumazet
  2015-09-27  1:44 ` Aaron Conole
  0 siblings, 2 replies; 61+ messages in thread
From: Tolga Ceylan @ 2015-09-27  0:30 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Tolga Ceylan

For applications using SO_REUSEPORT listeners, there is
no clean way to switch traffic on/off or add/remove
listeners without dropping pending connections. With this
patch, applications can turn off queueing of new connections
for a specific listener socket which enables implementation of
zero down time server applications.

For example, a popular web server nginx handles application
configuration changes by forking new processes (listeners)
and waiting for old processes (listeners) to finish up their
processing. However, this approach is disruptive as removal
of a listener will drop pending connections for that listener.
Instead, with this patch, nginx can maintain two sets of listener
socket pools to be used by old/new processes and switch traffic off/on
using this socket option. Old processes set this socket option
to drain their existing queues.

Tested on an x86_64 kernel.

Signed-off-by: Tolga Ceylan <tolga.ceylan@gmail.com>
---
 arch/alpha/include/uapi/asm/socket.h   | 2 ++
 arch/avr32/include/uapi/asm/socket.h   | 2 ++
 arch/frv/include/uapi/asm/socket.h     | 2 ++
 arch/ia64/include/uapi/asm/socket.h    | 2 ++
 arch/m32r/include/uapi/asm/socket.h    | 2 ++
 arch/mips/include/uapi/asm/socket.h    | 2 ++
 arch/mn10300/include/uapi/asm/socket.h | 2 ++
 arch/parisc/include/uapi/asm/socket.h  | 2 ++
 arch/powerpc/include/uapi/asm/socket.h | 2 ++
 arch/sparc/include/uapi/asm/socket.h   | 2 ++
 include/net/sock.h                     | 3 +++
 include/uapi/asm-generic/socket.h      | 2 ++
 net/core/sock.c                        | 3 +++
 net/ipv4/inet_hashtables.c             | 5 ++++-
 14 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 9a20821..d2ad268 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 2b65ed6..6b6d0af 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index 4823ad1..23d6b82 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -85,5 +85,7 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 59be3d8..c3d5ada 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 7bc4cb2..602f4b4 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index dec3c85..e0880e2 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -103,4 +103,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index cab7d6d..d60f747 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index a5cd40c..0ffa8de 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -84,4 +84,6 @@
 #define SO_ATTACH_BPF		0x402B
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index c046666..6935839 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index e6a16c4..e5ecf16 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -81,6 +81,8 @@
 #define SO_ATTACH_BPF		0x0034
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 0x0035
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/include/net/sock.h b/include/net/sock.h
index 94dff7f..ebb3c08 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -142,6 +142,7 @@ typedef __u64 __bitwise __addrpair;
  *	@skc_state: Connection state
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_reuseport: %SO_REUSEPORT setting
+ *	@skc_reuseport_listen_off: %SO_REUSEPORT_LISTEN_OFF setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -183,6 +184,7 @@ struct sock_common {
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse:4;
 	unsigned char		skc_reuseport:1;
+	unsigned char       skc_reuseport_listen_off:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
 	int			skc_bound_dev_if;
@@ -322,6 +324,7 @@ struct sock {
 #define sk_state		__sk_common.skc_state
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_reuseport		__sk_common.skc_reuseport
+#define sk_reuseport_listen_off		__sk_common.skc_reuseport_listen_off
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 5c15c2a..ed22ee4 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -87,4 +87,6 @@
 #define SO_ATTACH_BPF		50
 #define SO_DETACH_BPF		SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 3307c02..5861513 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -714,6 +714,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	case SO_REUSEPORT:
 		sk->sk_reuseport = valbool;
 		break;
+	case SO_REUSEPORT_LISTEN_OFF:
+		sk->sk_reuseport_listen_off = valbool;
+		break;
 	case SO_TYPE:
 	case SO_PROTOCOL:
 	case SO_DOMAIN:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8912019..59e8540 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -224,10 +224,13 @@ begin:
 				phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
 				matches = 1;
+				if (sk->sk_reuseport_listen_off)
+					result = NULL;
 			}
 		} else if (score == hiscore && reuseport) {
 			matches++;
-			if (reciprocal_scale(phash, matches) == 0)
+			if (reciprocal_scale(phash, matches) == 0 &&
+			    !sk->sk_reuseport_listen_off)
 				result = sk;
 			phash = next_pseudo_random32(phash);
 		}
-- 
2.5.3

^ permalink raw reply related	[flat|nested] 61+ messages in thread
* Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode
@ 2016-03-25 15:29 Craig Gallek
  2016-03-25 16:21 ` Alexei Starovoitov
  0 siblings, 1 reply; 61+ messages in thread
From: Craig Gallek @ 2016-03-25 15:29 UTC (permalink / raw)
  To: Linux Kernel Network Developers

On Thu, Mar 24, 2016 at 2:00 PM, Willy Tarreau <w@1wt.eu> wrote:
> The pattern is :
>
>   t0 : unprivileged processes 1 and 2 are listening to the same port
>        (sock1@pid1) (sock2@pid2)
>        <------ listening ------>
>
>   t1 : new processes are started to replace the old ones
>        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>        <------ listening ------> <------ listening ------>
>
>   t2 : new processes signal the old ones they must stop
>        (sock1@pid1) (sock2@pid2) (sock3@pid3) (sock4@pid4)
>        <------- draining ------> <------ listening ------>
>
>   t3 : pids 1 and 2 have finished, they go away
>                                  (sock3@pid3) (sock4@pid4)
>         <------ gone ----->      <------ listening ------>

To address the documentation issues, I'd like to reference the following:
- The filter.txt document in the kernel tree:
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/filter.txt
- It uses (and extends) the BPF instruction set defined in the
original BSD BPF paper: http://www.tcpdump.org/papers/bpf-usenix93.pdf
- The kernel headers define all of the user-space structures used:
  * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/filter.h
  * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/bpf.h

I've been trying to come up with an example BPF program for use in the
example Willy gave earlier in this thread (using 4 points in time and
describing one process with two listening sockets replacing another
with two listening sockets).  Everything except the last step is
pretty straightforward using what is currently available in the
kernel.  I'm using random distribution for simplicity, but you could
easily do something smarter using more information about the specific
hardware:

t0: Evenly distribute load to two SO_REUSEPORT sockets in a single process:
  ld rand
  mod #2
  ret a

t1: Fork a new process, create two new listening sockets in the same
group. Even after calling listen(), but before updating the BPF
program, only the first two sockets will see new connections.  The
program is trivially modified to use all 4.
  ld rand
  mod #4
  ret a

t2: Stop sending new connections to the first two sockets (draining)
  ld rand
  mod #2
  add #2
  ret a

t3: Close the first two sockets and only use the last two.  This is
the tricky step.  Before this point, the sockets are numbered 0
through 3 from the perspective of the BPF program (in the order
listen() was called).  As soon as socket 0 is closed, the last socket
in the list replaces it (what was 3 becomes 0).  When socket 1 is
closed, socket 2 moves into that position.  The assumptions about the
socket indexes in the BPF program need to change as the indexes change
as a result of closing them.

Even if you use an eBPF map as a level of indirection here, you still
have the issue that the socket indexes change as a result of some of
them leaving the group.  I'm not sure yet how to properly fix this,
but it will probably mean changing the way the socket indexing
works...  The current scheme is really an implementation detail
optimized for efficiency.  It may be worth modifying or creating a
mode which results in a stable mapping.  This will probably be
necessary for any scheme which expects sockets to regularly enter or
leave the group.

^ permalink raw reply	[flat|nested] 61+ messages in thread

end of thread, other threads:[~2016-03-25 18:32 UTC | newest]

Thread overview: 61+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-27  0:30 [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain mode Tolga Ceylan
2015-09-27  1:04 ` Eric Dumazet
2015-09-27  1:37   ` Tolga Ceylan
2015-09-27  1:44 ` Aaron Conole
2015-09-27  2:02   ` Tolga Ceylan
2015-09-27  2:24     ` Eric Dumazet
2015-11-11  5:41       ` Tom Herbert
2015-11-11  6:19         ` Eric Dumazet
2015-11-11 17:05           ` Tom Herbert
2015-11-11 17:23             ` Eric Dumazet
2015-11-11 18:23               ` Tom Herbert
2015-11-11 18:43                 ` Eric Dumazet
2015-11-12  1:09                   ` Eric Dumazet
2015-12-15 16:14                     ` Willy Tarreau
2015-12-15 17:10                       ` Eric Dumazet
2015-12-15 17:43                         ` Willy Tarreau
2015-12-15 18:21                           ` Eric Dumazet
2015-12-15 19:44                             ` Willy Tarreau
2015-12-15 21:21                               ` Eric Dumazet
2015-12-16  7:38                                 ` Willy Tarreau
2015-12-16 16:15                                   ` Willy Tarreau
2015-12-18 16:33                                     ` Josh Snyder
2015-12-18 18:58                                       ` Willy Tarreau
2015-12-19  2:38                                         ` Eric Dumazet
2015-12-19  7:00                                           ` Willy Tarreau
2015-12-21 20:38                                             ` Tom Herbert
2015-12-21 20:41                                               ` Willy Tarreau
2016-03-24  5:10                                                 ` Tolga Ceylan
2016-03-24  6:12                                                   ` Willy Tarreau
2016-03-24 14:13                                                     ` Eric Dumazet
2016-03-24 14:22                                                       ` Willy Tarreau
2016-03-24 14:45                                                         ` Eric Dumazet
2016-03-24 15:30                                                           ` Willy Tarreau
2016-03-24 16:33                                                             ` Eric Dumazet
2016-03-24 16:50                                                               ` Willy Tarreau
2016-03-24 17:01                                                                 ` Eric Dumazet
2016-03-24 17:26                                                                   ` Tom Herbert
2016-03-24 17:55                                                                     ` Daniel Borkmann
2016-03-24 18:20                                                                       ` Tolga Ceylan
2016-03-24 18:24                                                                         ` Willy Tarreau
2016-03-24 18:37                                                                         ` Eric Dumazet
2016-03-24 22:40                                                                       ` Yann Ylavic
2016-03-24 22:49                                                                         ` Eric Dumazet
2016-03-24 23:40                                                                           ` Yann Ylavic
2016-03-24 23:54                                                                             ` Tom Herbert
2016-03-25  0:01                                                                               ` Yann Ylavic
2016-03-25  5:28                                                                               ` Willy Tarreau
2016-03-25  6:49                                                                                 ` Eric Dumazet
2016-03-25  8:53                                                                                   ` Willy Tarreau
2016-03-25 11:21                                                                                     ` Yann Ylavic
2016-03-25 13:17                                                                                       ` Eric Dumazet
2016-03-25  0:25                                                                           ` David Miller
2016-03-25  0:24                                                                         ` David Miller
2016-03-24 18:00                                                                   ` Willy Tarreau
2016-03-24 18:21                                                                     ` Willy Tarreau
2016-03-24 18:32                                                                     ` Eric Dumazet
2016-03-25 15:29 Craig Gallek
2016-03-25 16:21 ` Alexei Starovoitov
2016-03-25 16:31   ` Craig Gallek
2016-03-25 17:00     ` Eric Dumazet
2016-03-25 18:31       ` Willem de Bruijn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).