All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace API support
@ 2017-01-23 18:59 Wei Wang
  2017-01-23 18:59 ` [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic Wei Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 30+ messages in thread
From: Wei Wang @ 2017-01-23 18:59 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Eric Dumazet, Yuchung Cheng, Wei Wang

From: Wei Wang <weiwan@google.com>

The patch series is to add support for new userspace API for TCP fastopen
sockets. 
In the current code, user has to call sendto()/sendmsg() with special flag
MSG_FASTOPEN for TCP fastopen sockets. This API is quite different from the
normal TCP socket API and can be cumbersome for applications to make use
fastopen sockets.
So this new patch introduces a new way of using TCP fastopen sockets which
is similar to normal TCP sockets with a new sockopt TCP_FASTOPEN_CONNECT.
More details about it is described in the third patch.
(First 2 patches are preparations for the third patch.)

Wei Wang (3):
  net/tcp-fastopen: refactor cookie check logic
  net: Remove __sk_dst_reset() in tcp_v6_connect()
  net/tcp-fastopen: Add new API support

 include/linux/tcp.h      |  3 ++-
 include/net/inet_sock.h  |  6 +++++-
 include/net/tcp.h        |  3 +++
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/af_inet.c       | 31 ++++++++++++++++++++-------
 net/ipv4/tcp.c           | 35 ++++++++++++++++++++++++++++++-
 net/ipv4/tcp_fastopen.c  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  7 ++++++-
 net/ipv4/tcp_output.c    | 16 ++------------
 net/ipv6/tcp_ipv6.c      |  6 +++++-
 10 files changed, 136 insertions(+), 26 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic
  2017-01-23 18:59 [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace API support Wei Wang
@ 2017-01-23 18:59 ` Wei Wang
  2017-01-23 19:13   ` Eric Dumazet
  2017-01-23 20:51   ` Yuchung Cheng
  2017-01-23 18:59 ` [PATCH net-next 2/3] net: Remove __sk_dst_reset() in tcp_v6_connect() Wei Wang
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 30+ messages in thread
From: Wei Wang @ 2017-01-23 18:59 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Eric Dumazet, Yuchung Cheng, Wei Wang

From: Wei Wang <weiwan@google.com>

Refactor the cookie check logic in tcp_send_syn_data() into a function.
This function will be called else where in later changes.

Signed-off-by: Wei Wang <weiwan@google.com>
---
 include/net/tcp.h       |  2 ++
 net/ipv4/tcp_fastopen.c | 21 +++++++++++++++++++++
 net/ipv4/tcp_output.c   | 16 ++--------------
 3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c55d65f74f7f..de67541d7adf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1493,6 +1493,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct tcp_fastopen_cookie *foc,
 			      struct dst_entry *dst);
 void tcp_fastopen_init_key_once(bool publish);
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			     struct tcp_fastopen_cookie *cookie);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
 /* Fastopen key context */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f51919535ca7..f90e09e1ff4c 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -325,3 +325,24 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 	*foc = valid_foc;
 	return NULL;
 }
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			       struct tcp_fastopen_cookie *cookie)
+{
+	unsigned long last_syn_loss = 0;
+	int syn_loss = 0;
+
+	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+	/* Recurring FO SYN losses: no cookie or data in SYN */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		cookie->len = -1;
+		return false;
+	}
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+		cookie->len = -1;
+		return true;
+	}
+	return cookie->len > 0;
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9a1a1494b9dd..671c69535671 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3267,23 +3267,11 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, err = 0;
-	unsigned long last_syn_loss = 0;
+	int space, err = 0;
 	struct sk_buff *syn_data;
 
 	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
-			       &syn_loss, &last_syn_loss);
-	/* Recurring FO SYN losses: revert to regular handshake temporarily */
-	if (syn_loss > 1 &&
-	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
-		fo->cookie.len = -1;
-		goto fallback;
-	}
-
-	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
-		fo->cookie.len = -1;
-	else if (fo->cookie.len <= 0)
+	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
 		goto fallback;
 
 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
-- 
2.11.0.483.g087da7b7c-goog

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 2/3] net: Remove __sk_dst_reset() in tcp_v6_connect()
  2017-01-23 18:59 [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace API support Wei Wang
  2017-01-23 18:59 ` [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic Wei Wang
@ 2017-01-23 18:59 ` Wei Wang
  2017-01-23 19:14   ` Eric Dumazet
  2017-01-23 18:59 ` [PATCH net-next 3/3] net/tcp-fastopen: Add new API support Wei Wang
  2017-01-25 19:09 ` [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace " David Miller
  3 siblings, 1 reply; 30+ messages in thread
From: Wei Wang @ 2017-01-23 18:59 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Eric Dumazet, Yuchung Cheng, Wei Wang

From: Wei Wang <weiwan@google.com>

Remove __sk_dst_reset() in the failure handling because __sk_dst_reset()
will eventually get called when sk is released. No need to handle it in
the protocol specific connect call.
This is also to make the code path consistent with ipv4.

Signed-off-by: Wei Wang <weiwan@google.com>
---
 net/ipv6/tcp_ipv6.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f72100eedd5d..0b7cd3d009b6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -295,7 +295,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 late_failure:
 	tcp_set_state(sk, TCP_CLOSE);
-	__sk_dst_reset(sk);
 failure:
 	inet->inet_dport = 0;
 	sk->sk_route_caps = 0;
-- 
2.11.0.483.g087da7b7c-goog

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 18:59 [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace API support Wei Wang
  2017-01-23 18:59 ` [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic Wei Wang
  2017-01-23 18:59 ` [PATCH net-next 2/3] net: Remove __sk_dst_reset() in tcp_v6_connect() Wei Wang
@ 2017-01-23 18:59 ` Wei Wang
  2017-01-23 19:15   ` Eric Dumazet
                     ` (3 more replies)
  2017-01-25 19:09 ` [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace " David Miller
  3 siblings, 4 replies; 30+ messages in thread
From: Wei Wang @ 2017-01-23 18:59 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Eric Dumazet, Yuchung Cheng, Wei Wang

From: Wei Wang <weiwan@google.com>

This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
alternative way to perform Fast Open on the active side (client). Prior
to this patch, a client needs to replace the connect() call with
sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
to use Fast Open: these socket operations are often done in lower layer
libraries used by many other applications. Changing these libraries
and/or the socket call sequences are not trivial. A more convenient
approach is to perform Fast Open by simply enabling a socket option when
the socket is created w/o changing other socket calls sequence:
  s = socket()
    create a new socket
  setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …);
    newly introduced sockopt
    If set, new functionality described below will be used.
    Return ENOTSUPP if TFO is not supported or not enabled in the
    kernel.

  connect()
    With cookie present, return 0 immediately.
    With no cookie, initiate 3WHS with TFO cookie-request option and
    return -1 with errno = EINPROGRESS.

  write()/sendmsg()
    With cookie present, send out SYN with data and return the number of
    bytes buffered.
    With no cookie, and 3WHS not yet completed, return -1 with errno =
    EINPROGRESS.
    No MSG_FASTOPEN flag is needed.

  read()
    Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but
    write() is not called yet.
    Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is
    established but no msg is received yet.
    Return number of bytes read if socket is established and there is
    msg received.

The new API simplifies life for applications that always perform a write()
immediately after a successful connect(). Such applications can now take
advantage of Fast Open by merely making one new setsockopt() call at the time
of creating the socket. Nothing else about the application's socket call
sequence needs to change.

Signed-off-by: Wei Wang <weiwan@google.com>
---
 include/linux/tcp.h      |  3 ++-
 include/net/inet_sock.h  |  6 +++++-
 include/net/tcp.h        |  1 +
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/af_inet.c       | 31 ++++++++++++++++++++++++-------
 net/ipv4/tcp.c           | 35 ++++++++++++++++++++++++++++++++++-
 net/ipv4/tcp_fastopen.c  | 33 +++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  7 ++++++-
 net/ipv6/tcp_ipv6.c      |  5 +++++
 9 files changed, 111 insertions(+), 11 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 5371b3d70cfe..f88f4649ba6f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -222,7 +222,8 @@ struct tcp_sock {
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
-		unused:5;
+		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
+		unused:4;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1	    : 1,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index c9cff977a7fb..aa95053dfc78 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -206,7 +206,11 @@ struct inet_sock {
 				transparent:1,
 				mc_all:1,
 				nodefrag:1;
-	__u8			bind_address_no_port:1;
+	__u8			bind_address_no_port:1,
+				defer_connect:1; /* Indicates that fastopen_connect is set
+						  * and cookie exists so we defer connect
+						  * until first data frame is written
+						  */
 	__u8			rcv_tos;
 	__u8			convert_csum;
 	int			uc_index;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de67541d7adf..6ec4ea652f3f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1495,6 +1495,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 void tcp_fastopen_init_key_once(bool publish);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
 /* Fastopen key context */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index c53de2691cec..6ff35eb48d10 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -116,6 +116,7 @@ enum {
 #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
 #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
+#define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index aae410bb655a..d8a0dc076f97 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -576,13 +576,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;
 
-	if (addr_len < sizeof(uaddr->sa_family))
-		return -EINVAL;
+	/*
+	 * uaddr can be NULL and addr_len can be 0 if:
+	 * sk is a TCP fastopen active socket and
+	 * TCP_FASTOPEN_CONNECT sockopt is set and
+	 * we already have a valid cookie for this socket.
+	 * In this case, user can call write() after connect().
+	 * write() will invoke tcp_sendmsg_fastopen() which calls
+	 * __inet_stream_connect().
+	 */
+	if (uaddr) {
+		if (addr_len < sizeof(uaddr->sa_family))
+			return -EINVAL;
 
-	if (uaddr->sa_family == AF_UNSPEC) {
-		err = sk->sk_prot->disconnect(sk, flags);
-		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
-		goto out;
+		if (uaddr->sa_family == AF_UNSPEC) {
+			err = sk->sk_prot->disconnect(sk, flags);
+			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+			goto out;
+		}
 	}
 
 	switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		err = -EISCONN;
 		goto out;
 	case SS_CONNECTING:
-		err = -EALREADY;
+		if (inet_sk(sk)->defer_connect)
+			err = -EINPROGRESS;
+		else
+			err = -EALREADY;
 		/* Fall out of switch with err, set for this state */
 		break;
 	case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 		sock->state = SS_CONNECTING;
 
+		if (!err && inet_sk(sk)->defer_connect)
+			goto out;
+
 		/* Just entered SS_CONNECTING state; the only
 		 * difference is that return value in non-blocking
 		 * case is EINPROGRESS, rather than EALREADY.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c43eb1a831d7..d9735b76d073 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -533,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
+	} else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+		/* Active TCP fastopen socket with defer_connect
+		 * Return POLLOUT so application can call write()
+		 * in order for kernel to generate SYN+data
+		 */
+		mask |= POLLOUT | POLLWRNORM;
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
@@ -1071,6 +1077,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 				int *copied, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	int err, flags;
 
 	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1085,9 +1092,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	tp->fastopen_req->data = msg;
 	tp->fastopen_req->size = size;
 
+	if (inet->defer_connect) {
+		err = tcp_connect(sk);
+		/* Same failure procedure as in tcp_v4/6_connect */
+		if (err) {
+			tcp_set_state(sk, TCP_CLOSE);
+			inet->inet_dport = 0;
+			sk->sk_route_caps = 0;
+		}
+	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
 				    msg->msg_namelen, flags);
+	inet->defer_connect = 0;
 	*copied = tp->fastopen_req->copied;
 	tcp_free_fastopen_req(tp);
 	return err;
@@ -1107,7 +1124,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 
 	flags = msg->msg_flags;
-	if (flags & MSG_FASTOPEN) {
+	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
 			goto out;
@@ -2656,6 +2673,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		}
 		break;
+	case TCP_FASTOPEN_CONNECT:
+		if (val > 1 || val < 0) {
+			err = -EINVAL;
+		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+			if (sk->sk_state == TCP_CLOSE)
+				tp->fastopen_connect = val;
+			else
+				err = -EINVAL;
+		} else {
+			err = -EOPNOTSUPP;
+		}
+		break;
 	case TCP_TIMESTAMP:
 		if (!tp->repair)
 			err = -EPERM;
@@ -3016,6 +3045,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
 		break;
 
+	case TCP_FASTOPEN_CONNECT:
+		val = tp->fastopen_connect;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f90e09e1ff4c..9674bec4a0f8 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -346,3 +346,36 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 	}
 	return cookie->len > 0;
 }
+
+/* This function checks if we want to defer sending SYN until the first
+ * write().  We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ *               return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+	struct tcp_fastopen_cookie cookie = { .len = 0 };
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 mss;
+
+	if (tp->fastopen_connect && !tp->fastopen_req) {
+		if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+			inet_sk(sk)->defer_connect = 1;
+			return true;
+		}
+
+		/* Alloc fastopen_req in order for FO option to be included
+		 * in SYN
+		 */
+		tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+					   sk->sk_allocation);
+		if (tp->fastopen_req)
+			tp->fastopen_req->cookie = cookie;
+		else
+			*err = -ENOBUFS;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f7325b25b06e..27b726c96459 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
+	rt = NULL;
 
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -242,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	inet->inet_id = tp->write_seq ^ jiffies;
 
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto failure;
+
 	err = tcp_connect(sk);
 
-	rt = NULL;
 	if (err)
 		goto failure;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0b7cd3d009b6..95c05e5293b1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -287,6 +287,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 							     inet->inet_dport,
 							     &tp->tsoffset);
 
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto late_failure;
+
 	err = tcp_connect(sk);
 	if (err)
 		goto late_failure;
-- 
2.11.0.483.g087da7b7c-goog

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic
  2017-01-23 18:59 ` [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic Wei Wang
@ 2017-01-23 19:13   ` Eric Dumazet
  2017-01-23 20:51   ` Yuchung Cheng
  1 sibling, 0 replies; 30+ messages in thread
From: Eric Dumazet @ 2017-01-23 19:13 UTC (permalink / raw)
  To: Wei Wang; +Cc: netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

On Mon, 2017-01-23 at 10:59 -0800, Wei Wang wrote:
> From: Wei Wang <weiwan@google.com>
> 
> Refactor the cookie check logic in tcp_send_syn_data() into a function.
> This function will be called else where in later changes.
> 
> Signed-off-by: Wei Wang <weiwan@google.com>
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 2/3] net: Remove __sk_dst_reset() in tcp_v6_connect()
  2017-01-23 18:59 ` [PATCH net-next 2/3] net: Remove __sk_dst_reset() in tcp_v6_connect() Wei Wang
@ 2017-01-23 19:14   ` Eric Dumazet
  0 siblings, 0 replies; 30+ messages in thread
From: Eric Dumazet @ 2017-01-23 19:14 UTC (permalink / raw)
  To: Wei Wang; +Cc: netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

On Mon, 2017-01-23 at 10:59 -0800, Wei Wang wrote:
> From: Wei Wang <weiwan@google.com>
> 
> Remove __sk_dst_reset() in the failure handling because __sk_dst_reset()
> will eventually get called when sk is released. No need to handle it in
> the protocol specific connect call.
> This is also to make the code path consistent with ipv4.
> 
> Signed-off-by: Wei Wang <weiwan@google.com>
> ---

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>

Thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 18:59 ` [PATCH net-next 3/3] net/tcp-fastopen: Add new API support Wei Wang
@ 2017-01-23 19:15   ` Eric Dumazet
  2017-01-23 20:55   ` Yuchung Cheng
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 30+ messages in thread
From: Eric Dumazet @ 2017-01-23 19:15 UTC (permalink / raw)
  To: Wei Wang; +Cc: netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

On Mon, 2017-01-23 at 10:59 -0800, Wei Wang wrote:
> From: Wei Wang <weiwan@google.com>
> 
> This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
> alternative way to perform Fast Open on the active side (client). Prior
> to this patch, a client needs to replace the connect() call with
> sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
> to use Fast Open: these socket operations are often done in lower layer
> libraries used by many other applications. Changing these libraries
> and/or the socket call sequences are not trivial. A more convenient
> approach is to perform Fast Open by simply enabling a socket option when
> the socket is created w/o changing other socket calls sequence:

> Signed-off-by: Wei Wang <weiwan@google.com>
> ---

Thanks for this hard work Wei.

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic
  2017-01-23 18:59 ` [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic Wei Wang
  2017-01-23 19:13   ` Eric Dumazet
@ 2017-01-23 20:51   ` Yuchung Cheng
  1 sibling, 0 replies; 30+ messages in thread
From: Yuchung Cheng @ 2017-01-23 20:51 UTC (permalink / raw)
  To: Wei Wang; +Cc: netdev, David Miller, Eric Dumazet, Wei Wang

On Mon, Jan 23, 2017 at 10:59 AM, Wei Wang <tracywwnj@gmail.com> wrote:
>
> From: Wei Wang <weiwan@google.com>
>
> Refactor the cookie check logic in tcp_send_syn_data() into a function.
> This function will be called else where in later changes.
>
> Signed-off-by: Wei Wang <weiwan@google.com>
> ---
Acked-by: Yuchung Cheng <ycheng@google.com>

>  include/net/tcp.h       |  2 ++
>  net/ipv4/tcp_fastopen.c | 21 +++++++++++++++++++++
>  net/ipv4/tcp_output.c   | 16 ++--------------
>  3 files changed, 25 insertions(+), 14 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index c55d65f74f7f..de67541d7adf 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1493,6 +1493,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
>                               struct tcp_fastopen_cookie *foc,
>                               struct dst_entry *dst);
>  void tcp_fastopen_init_key_once(bool publish);
> +bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
> +                            struct tcp_fastopen_cookie *cookie);
>  #define TCP_FASTOPEN_KEY_LENGTH 16
>
>  /* Fastopen key context */
> diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
> index f51919535ca7..f90e09e1ff4c 100644
> --- a/net/ipv4/tcp_fastopen.c
> +++ b/net/ipv4/tcp_fastopen.c
> @@ -325,3 +325,24 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
>         *foc = valid_foc;
>         return NULL;
>  }
> +
> +bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
> +                              struct tcp_fastopen_cookie *cookie)
> +{
> +       unsigned long last_syn_loss = 0;
> +       int syn_loss = 0;
> +
> +       tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
> +
> +       /* Recurring FO SYN losses: no cookie or data in SYN */
> +       if (syn_loss > 1 &&
> +           time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
> +               cookie->len = -1;
> +               return false;
> +       }
> +       if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
> +               cookie->len = -1;
> +               return true;
> +       }
> +       return cookie->len > 0;
> +}
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 9a1a1494b9dd..671c69535671 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3267,23 +3267,11 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>         struct tcp_fastopen_request *fo = tp->fastopen_req;
> -       int syn_loss = 0, space, err = 0;
> -       unsigned long last_syn_loss = 0;
> +       int space, err = 0;
>         struct sk_buff *syn_data;
>
>         tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
> -       tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
> -                              &syn_loss, &last_syn_loss);
> -       /* Recurring FO SYN losses: revert to regular handshake temporarily */
> -       if (syn_loss > 1 &&
> -           time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
> -               fo->cookie.len = -1;
> -               goto fallback;
> -       }
> -
> -       if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
> -               fo->cookie.len = -1;
> -       else if (fo->cookie.len <= 0)
> +       if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
>                 goto fallback;
>
>         /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
> --
> 2.11.0.483.g087da7b7c-goog
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 18:59 ` [PATCH net-next 3/3] net/tcp-fastopen: Add new API support Wei Wang
  2017-01-23 19:15   ` Eric Dumazet
@ 2017-01-23 20:55   ` Yuchung Cheng
  2017-01-23 21:16   ` Willy Tarreau
  2017-01-24  7:30   ` Willy Tarreau
  3 siblings, 0 replies; 30+ messages in thread
From: Yuchung Cheng @ 2017-01-23 20:55 UTC (permalink / raw)
  To: Wei Wang; +Cc: netdev, David Miller, Eric Dumazet, Wei Wang

On Mon, Jan 23, 2017 at 10:59 AM, Wei Wang <tracywwnj@gmail.com> wrote:
> From: Wei Wang <weiwan@google.com>
>
> This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
> alternative way to perform Fast Open on the active side (client). Prior
> to this patch, a client needs to replace the connect() call with
> sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
> to use Fast Open: these socket operations are often done in lower layer
> libraries used by many other applications. Changing these libraries
> and/or the socket call sequences are not trivial. A more convenient
> approach is to perform Fast Open by simply enabling a socket option when
> the socket is created w/o changing other socket calls sequence:
>   s = socket()
>     create a new socket
>   setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …);
>     newly introduced sockopt
>     If set, new functionality described below will be used.
>     Return ENOTSUPP if TFO is not supported or not enabled in the
>     kernel.
>
>   connect()
>     With cookie present, return 0 immediately.
>     With no cookie, initiate 3WHS with TFO cookie-request option and
>     return -1 with errno = EINPROGRESS.
>
>   write()/sendmsg()
>     With cookie present, send out SYN with data and return the number of
>     bytes buffered.
>     With no cookie, and 3WHS not yet completed, return -1 with errno =
>     EINPROGRESS.
>     No MSG_FASTOPEN flag is needed.
>
>   read()
>     Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but
>     write() is not called yet.
>     Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is
>     established but no msg is received yet.
>     Return number of bytes read if socket is established and there is
>     msg received.
>
> The new API simplifies life for applications that always perform a write()
> immediately after a successful connect(). Such applications can now take
> advantage of Fast Open by merely making one new setsockopt() call at the time
> of creating the socket. Nothing else about the application's socket call
> sequence needs to change.
>
> Signed-off-by: Wei Wang <weiwan@google.com>
> ---
Acked-by: Yuchung Cheng <ycheng@google.com>

Thanks for making this happen.

>  include/linux/tcp.h      |  3 ++-
>  include/net/inet_sock.h  |  6 +++++-
>  include/net/tcp.h        |  1 +
>  include/uapi/linux/tcp.h |  1 +
>  net/ipv4/af_inet.c       | 31 ++++++++++++++++++++++++-------
>  net/ipv4/tcp.c           | 35 ++++++++++++++++++++++++++++++++++-
>  net/ipv4/tcp_fastopen.c  | 33 +++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_ipv4.c      |  7 ++++++-
>  net/ipv6/tcp_ipv6.c      |  5 +++++
>  9 files changed, 111 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 5371b3d70cfe..f88f4649ba6f 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -222,7 +222,8 @@ struct tcp_sock {
>         u32     chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
>         u8      chrono_type:2,  /* current chronograph type */
>                 rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
> -               unused:5;
> +               fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
> +               unused:4;
>         u8      nonagle     : 4,/* Disable Nagle algorithm?             */
>                 thin_lto    : 1,/* Use linear timeouts for thin streams */
>                 unused1     : 1,
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index c9cff977a7fb..aa95053dfc78 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -206,7 +206,11 @@ struct inet_sock {
>                                 transparent:1,
>                                 mc_all:1,
>                                 nodefrag:1;
> -       __u8                    bind_address_no_port:1;
> +       __u8                    bind_address_no_port:1,
> +                               defer_connect:1; /* Indicates that fastopen_connect is set
> +                                                 * and cookie exists so we defer connect
> +                                                 * until first data frame is written
> +                                                 */
>         __u8                    rcv_tos;
>         __u8                    convert_csum;
>         int                     uc_index;
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index de67541d7adf..6ec4ea652f3f 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1495,6 +1495,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
>  void tcp_fastopen_init_key_once(bool publish);
>  bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
>                              struct tcp_fastopen_cookie *cookie);
> +bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
>  #define TCP_FASTOPEN_KEY_LENGTH 16
>
>  /* Fastopen key context */
> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
> index c53de2691cec..6ff35eb48d10 100644
> --- a/include/uapi/linux/tcp.h
> +++ b/include/uapi/linux/tcp.h
> @@ -116,6 +116,7 @@ enum {
>  #define TCP_SAVE_SYN           27      /* Record SYN headers for new connections */
>  #define TCP_SAVED_SYN          28      /* Get SYN headers recorded for connection */
>  #define TCP_REPAIR_WINDOW      29      /* Get/set window parameters */
> +#define TCP_FASTOPEN_CONNECT   30      /* Attempt FastOpen with connect */
>
>  struct tcp_repair_opt {
>         __u32   opt_code;
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index aae410bb655a..d8a0dc076f97 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -576,13 +576,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
>         int err;
>         long timeo;
>
> -       if (addr_len < sizeof(uaddr->sa_family))
> -               return -EINVAL;
> +       /*
> +        * uaddr can be NULL and addr_len can be 0 if:
> +        * sk is a TCP fastopen active socket and
> +        * TCP_FASTOPEN_CONNECT sockopt is set and
> +        * we already have a valid cookie for this socket.
> +        * In this case, user can call write() after connect().
> +        * write() will invoke tcp_sendmsg_fastopen() which calls
> +        * __inet_stream_connect().
> +        */
> +       if (uaddr) {
> +               if (addr_len < sizeof(uaddr->sa_family))
> +                       return -EINVAL;
>
> -       if (uaddr->sa_family == AF_UNSPEC) {
> -               err = sk->sk_prot->disconnect(sk, flags);
> -               sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
> -               goto out;
> +               if (uaddr->sa_family == AF_UNSPEC) {
> +                       err = sk->sk_prot->disconnect(sk, flags);
> +                       sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
> +                       goto out;
> +               }
>         }
>
>         switch (sock->state) {
> @@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
>                 err = -EISCONN;
>                 goto out;
>         case SS_CONNECTING:
> -               err = -EALREADY;
> +               if (inet_sk(sk)->defer_connect)
> +                       err = -EINPROGRESS;
> +               else
> +                       err = -EALREADY;
>                 /* Fall out of switch with err, set for this state */
>                 break;
>         case SS_UNCONNECTED:
> @@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
>
>                 sock->state = SS_CONNECTING;
>
> +               if (!err && inet_sk(sk)->defer_connect)
> +                       goto out;
> +
>                 /* Just entered SS_CONNECTING state; the only
>                  * difference is that return value in non-blocking
>                  * case is EINPROGRESS, rather than EALREADY.
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index c43eb1a831d7..d9735b76d073 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -533,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
>
>                 if (tp->urg_data & TCP_URG_VALID)
>                         mask |= POLLPRI;
> +       } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
> +               /* Active TCP fastopen socket with defer_connect
> +                * Return POLLOUT so application can call write()
> +                * in order for kernel to generate SYN+data
> +                */
> +               mask |= POLLOUT | POLLWRNORM;
>         }
>         /* This barrier is coupled with smp_wmb() in tcp_reset() */
>         smp_rmb();
> @@ -1071,6 +1077,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
>                                 int *copied, size_t size)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
> +       struct inet_sock *inet = inet_sk(sk);
>         int err, flags;
>
>         if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
> @@ -1085,9 +1092,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
>         tp->fastopen_req->data = msg;
>         tp->fastopen_req->size = size;
>
> +       if (inet->defer_connect) {
> +               err = tcp_connect(sk);
> +               /* Same failure procedure as in tcp_v4/6_connect */
> +               if (err) {
> +                       tcp_set_state(sk, TCP_CLOSE);
> +                       inet->inet_dport = 0;
> +                       sk->sk_route_caps = 0;
> +               }
> +       }
>         flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
>         err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
>                                     msg->msg_namelen, flags);
> +       inet->defer_connect = 0;
>         *copied = tp->fastopen_req->copied;
>         tcp_free_fastopen_req(tp);
>         return err;
> @@ -1107,7 +1124,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
>         lock_sock(sk);
>
>         flags = msg->msg_flags;
> -       if (flags & MSG_FASTOPEN) {
> +       if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
>                 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
>                 if (err == -EINPROGRESS && copied_syn > 0)
>                         goto out;
> @@ -2656,6 +2673,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
>                         err = -EINVAL;
>                 }
>                 break;
> +       case TCP_FASTOPEN_CONNECT:
> +               if (val > 1 || val < 0) {
> +                       err = -EINVAL;
> +               } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
> +                       if (sk->sk_state == TCP_CLOSE)
> +                               tp->fastopen_connect = val;
> +                       else
> +                               err = -EINVAL;
> +               } else {
> +                       err = -EOPNOTSUPP;
> +               }
> +               break;
>         case TCP_TIMESTAMP:
>                 if (!tp->repair)
>                         err = -EPERM;
> @@ -3016,6 +3045,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
>                 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
>                 break;
>
> +       case TCP_FASTOPEN_CONNECT:
> +               val = tp->fastopen_connect;
> +               break;
> +
>         case TCP_TIMESTAMP:
>                 val = tcp_time_stamp + tp->tsoffset;
>                 break;
> diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
> index f90e09e1ff4c..9674bec4a0f8 100644
> --- a/net/ipv4/tcp_fastopen.c
> +++ b/net/ipv4/tcp_fastopen.c
> @@ -346,3 +346,36 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
>         }
>         return cookie->len > 0;
>  }
> +
> +/* This function checks if we want to defer sending SYN until the first
> + * write().  We defer under the following conditions:
> + * 1. fastopen_connect sockopt is set
> + * 2. we have a valid cookie
> + * Return value: return true if we want to defer until application writes data
> + *               return false if we want to send out SYN immediately
> + */
> +bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
> +{
> +       struct tcp_fastopen_cookie cookie = { .len = 0 };
> +       struct tcp_sock *tp = tcp_sk(sk);
> +       u16 mss;
> +
> +       if (tp->fastopen_connect && !tp->fastopen_req) {
> +               if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
> +                       inet_sk(sk)->defer_connect = 1;
> +                       return true;
> +               }
> +
> +               /* Alloc fastopen_req in order for FO option to be included
> +                * in SYN
> +                */
> +               tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
> +                                          sk->sk_allocation);
> +               if (tp->fastopen_req)
> +                       tp->fastopen_req->cookie = cookie;
> +               else
> +                       *err = -ENOBUFS;
> +       }
> +       return false;
> +}
> +EXPORT_SYMBOL(tcp_fastopen_defer_connect);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index f7325b25b06e..27b726c96459 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>         /* OK, now commit destination to socket.  */
>         sk->sk_gso_type = SKB_GSO_TCPV4;
>         sk_setup_caps(sk, &rt->dst);
> +       rt = NULL;
>
>         if (!tp->write_seq && likely(!tp->repair))
>                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
> @@ -242,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>
>         inet->inet_id = tp->write_seq ^ jiffies;
>
> +       if (tcp_fastopen_defer_connect(sk, &err))
> +               return err;
> +       if (err)
> +               goto failure;
> +
>         err = tcp_connect(sk);
>
> -       rt = NULL;
>         if (err)
>                 goto failure;
>
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 0b7cd3d009b6..95c05e5293b1 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -287,6 +287,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
>                                                              inet->inet_dport,
>                                                              &tp->tsoffset);
>
> +       if (tcp_fastopen_defer_connect(sk, &err))
> +               return err;
> +       if (err)
> +               goto late_failure;
> +
>         err = tcp_connect(sk);
>         if (err)
>                 goto late_failure;
> --
> 2.11.0.483.g087da7b7c-goog
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 18:59 ` [PATCH net-next 3/3] net/tcp-fastopen: Add new API support Wei Wang
  2017-01-23 19:15   ` Eric Dumazet
  2017-01-23 20:55   ` Yuchung Cheng
@ 2017-01-23 21:16   ` Willy Tarreau
       [not found]     ` <CAEA6p_DxVMAry1PCz_idmk=TGpnnTib3WpWso03FB1oMVXN+sg@mail.gmail.com>
  2017-01-24 17:44     ` Eric Dumazet
  2017-01-24  7:30   ` Willy Tarreau
  3 siblings, 2 replies; 30+ messages in thread
From: Willy Tarreau @ 2017-01-23 21:16 UTC (permalink / raw)
  To: Wei Wang; +Cc: netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

Hi Wei,

first, thanks a lot for doing this, it's really awesome!

I'm testing it on 4.9 on haproxy and I met a corner case : when I
perform a connect() to a server and I have nothing to send, upon
POLLOUT notification since I have nothing to send I simply probe the
connection using connect() again to see if it returns EISCONN or
anything else. But here now I'm seeing EINPROGRESS loops.

To illustrate this, here's what I'm doing :

                :8000          :8001
  [ client ] ---> [ proxy ] ---> [ server ]

The proxy is configured to enable TFO to the server and the server
supports TFO as well. The proxy and the server are in fact two proxy
instances in haproxy running in the same process for convenience.

When I already have data to send here's what I'm seeing (so it works fine) :

06:29:16.861190 accept4(7, {sa_family=AF_INET, sin_port=htons(33986), sin_addr=inet_addr("192.168.0.176")}, [128->16], SOCK
_NONBLOCK) = 9
06:29:16.861277 setsockopt(9, SOL_TCP, TCP_NODELAY, [1], 4) = 0
06:29:16.861342 accept4(7, 0x7ffd0d794430, [128], SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
06:29:16.861417 recvfrom(9, "BLAH\n", 7006, 0, NULL, NULL) = 5
06:29:16.861509 recvfrom(9, 0x2619329, 7001, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
06:29:16.861657 socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 10
06:29:16.861730 fcntl(10, F_SETFL, O_RDONLY|O_NONBLOCK) = 0
06:29:16.861779 setsockopt(10, SOL_TCP, TCP_NODELAY, [1], 4) = 0
06:29:16.861882 setsockopt(10, SOL_TCP, 0x1e /* TCP_??? */, [1], 4) = 0
06:29:16.861942 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
06:29:16.862015 epoll_ctl(3, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}) = 0
06:29:16.862072 epoll_wait(3, [], 200, 0) = 0
06:29:16.862126 sendto(10, "BLAH\n", 5, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 5
06:29:16.862281 epoll_wait(3, [{EPOLLIN, {u32=8, u64=8}}], 200, 0) = 1
06:29:16.862334 recvfrom(10, 0x26173a4, 8030, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
06:29:16.862385 accept4(8, {sa_family=AF_INET, sin_port=htons(46760), sin_addr=inet_addr("127.0.0.1")}, [128->16], SOCK_NON
BLOCK) = 11
06:29:16.862450 setsockopt(11, SOL_TCP, TCP_NODELAY, [1], 4) = 0
06:29:16.862504 accept4(8, 0x7ffd0d794430, [128], SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
06:29:16.862564 recvfrom(11, "BLAH\n", 7006, 0, NULL, NULL) = 5


When I don't have data, here's what I'm seeing :

06:29:24.047801 accept4(7, {sa_family=AF_INET, sin_port=htons(33988), sin_addr=inet_addr("192.168.0.176")}, [128->16], SOCK
_NONBLOCK) = 9
06:29:24.047899 setsockopt(9, SOL_TCP, TCP_NODELAY, [1], 4) = 0
06:29:24.047966 accept4(7, 0x7ffdedb2c7f0, [128], SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
06:29:24.048043 recvfrom(9, 0xd31324, 7006, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
06:29:24.048281 socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 10
06:29:24.048342 fcntl(10, F_SETFL, O_RDONLY|O_NONBLOCK) = 0
06:29:24.048392 setsockopt(10, SOL_TCP, TCP_NODELAY, [1], 4) = 0
06:29:24.048447 setsockopt(10, SOL_TCP, 0x1e /* TCP_??? */, [1], 4) = 0
06:29:24.048508 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
06:29:24.048593 epoll_ctl(3, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}) = 0
06:29:24.048651 epoll_wait(3, [], 200, 0) = 0
06:29:24.048699 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
06:29:24.048751 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
S (Operation now in progress)
06:29:24.048808 epoll_ctl(3, EPOLL_CTL_ADD, 10, {EPOLLOUT, {u32=10, u64=10}}) = 0
06:29:24.048860 epoll_wait(3, [{EPOLLOUT, {u32=10, u64=10}}], 200, 1000) = 1
06:29:24.048912 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
06:29:24.048963 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
S (Operation now in progress)
06:29:24.049018 epoll_wait(3, [{EPOLLOUT, {u32=10, u64=10}}], 200, 1000) = 1
06:29:24.049072 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
06:29:24.049122 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
S (Operation now in progress)


I theorically understand why but I think we have something wrong here
and instead we should have -1 EISCONN (to pretend the connection is
established) or return EALREADY (to mention that a previous request was
already made and that we're waiting for the next step).

While I can instrument my connect() *not* to use TFO when connecting
without any pending data, I don't always know this (eg when I use
openssl and cross fingers so that it decides to quickly send something
on the next round).

I think it's easy to fall into this tricky corner case and am wondering
what can be done about it. Does the EINPROGRESS happen only because there
is no cookie yet ? If so, shouldn't the connect's status change in this
case ?

Thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
       [not found]     ` <CAEA6p_DxVMAry1PCz_idmk=TGpnnTib3WpWso03FB1oMVXN+sg@mail.gmail.com>
@ 2017-01-23 21:37       ` Willy Tarreau
  2017-01-23 22:01         ` Willy Tarreau
  0 siblings, 1 reply; 30+ messages in thread
From: Willy Tarreau @ 2017-01-23 21:37 UTC (permalink / raw)
  To: Wei Wang
  Cc: Wei Wang, Linux Kernel Network Developers, David Miller,
	Eric Dumazet, Yuchung Cheng

On Mon, Jan 23, 2017 at 01:28:53PM -0800, Wei Wang wrote:
> Hi Willy,
> 
> True. If you call connect() multiple times on a socket which already has
> cookie without a write(), the second and onward connect() call will return
> EINPROGRESS.
> It is basically because the following code block in __inet_stream_connect()
> can't distinguish if it is the first time connect() is called or not:
> 
> case SS_CONNECTING:
>                 if (inet_sk(sk)->defer_connect)  <----- defer_connect will
> be 0 only after a write() is called
>                         err = -EINPROGRESS;
>                 else
>                         err = -EALREADY;
>                 /* Fall out of switch with err, set for this state */
>                 break;

Ah OK that totally makes sense, thanks for the explanation!

> I guess we can add some extra logic here to address this issue. So the
> second connect() and onwards will return EALREADY.

If that's possible at little cost it would be nice, because your patch
makes it so easy to enable TFO on outgoing connections now that I
expect many people will blindly run the setsockopt() before connect().

Do not hesitate to ask me to run some tests. While 4 years ago it was
not easy, here it's very simple for me. By the way I'm seeing an ~10%
performance increase on haproxy by enabling this, it's really cool!

Thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 21:37       ` Willy Tarreau
@ 2017-01-23 22:01         ` Willy Tarreau
  2017-01-23 22:33           ` Willy Tarreau
  0 siblings, 1 reply; 30+ messages in thread
From: Willy Tarreau @ 2017-01-23 22:01 UTC (permalink / raw)
  To: Wei Wang
  Cc: Wei Wang, Linux Kernel Network Developers, David Miller,
	Eric Dumazet, Yuchung Cheng

On Mon, Jan 23, 2017 at 10:37:32PM +0100, Willy Tarreau wrote:
> On Mon, Jan 23, 2017 at 01:28:53PM -0800, Wei Wang wrote:
> > Hi Willy,
> > 
> > True. If you call connect() multiple times on a socket which already has
> > cookie without a write(), the second and onward connect() call will return
> > EINPROGRESS.
> > It is basically because the following code block in __inet_stream_connect()
> > can't distinguish if it is the first time connect() is called or not:
> > 
> > case SS_CONNECTING:
> >                 if (inet_sk(sk)->defer_connect)  <----- defer_connect will
> > be 0 only after a write() is called
> >                         err = -EINPROGRESS;
> >                 else
> >                         err = -EALREADY;
> >                 /* Fall out of switch with err, set for this state */
> >                 break;
> 
> Ah OK that totally makes sense, thanks for the explanation!
> 
> > I guess we can add some extra logic here to address this issue. So the
> > second connect() and onwards will return EALREADY.

Thinking about it a bit more, I really think it would make more
sense to return -EISCONN here if we want to match the semantics
of a connect() returning zero on the first call. This way the
caller knows it can write whenever it wants and can disable
write polling until needed.

I'm currently rebuilding a kernel with this change to see if it
behaves any better :

-                        err = -EINPROGRESS;
+                        err = -EISCONN;

I'll keep you updated.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 22:01         ` Willy Tarreau
@ 2017-01-23 22:33           ` Willy Tarreau
       [not found]             ` <CAC15z3g8OxZNET+OnvcyYwHYuHq7QBamgGmjkBHzDr-XnUSGDQ@mail.gmail.com>
  0 siblings, 1 reply; 30+ messages in thread
From: Willy Tarreau @ 2017-01-23 22:33 UTC (permalink / raw)
  To: Wei Wang
  Cc: Wei Wang, Linux Kernel Network Developers, David Miller,
	Eric Dumazet, Yuchung Cheng

On Mon, Jan 23, 2017 at 11:01:21PM +0100, Willy Tarreau wrote:
> On Mon, Jan 23, 2017 at 10:37:32PM +0100, Willy Tarreau wrote:
> > On Mon, Jan 23, 2017 at 01:28:53PM -0800, Wei Wang wrote:
> > > Hi Willy,
> > > 
> > > True. If you call connect() multiple times on a socket which already has
> > > cookie without a write(), the second and onward connect() call will return
> > > EINPROGRESS.
> > > It is basically because the following code block in __inet_stream_connect()
> > > can't distinguish if it is the first time connect() is called or not:
> > > 
> > > case SS_CONNECTING:
> > >                 if (inet_sk(sk)->defer_connect)  <----- defer_connect will
> > > be 0 only after a write() is called
> > >                         err = -EINPROGRESS;
> > >                 else
> > >                         err = -EALREADY;
> > >                 /* Fall out of switch with err, set for this state */
> > >                 break;
> > 
> > Ah OK that totally makes sense, thanks for the explanation!
> > 
> > > I guess we can add some extra logic here to address this issue. So the
> > > second connect() and onwards will return EALREADY.
> 
> Thinking about it a bit more, I really think it would make more
> sense to return -EISCONN here if we want to match the semantics
> of a connect() returning zero on the first call. This way the
> caller knows it can write whenever it wants and can disable
> write polling until needed.
> 
> I'm currently rebuilding a kernel with this change to see if it
> behaves any better :
> 
> -                        err = -EINPROGRESS;
> +                        err = -EISCONN;

OK so obviously it didn't work since sendmsg() goes there as well.

But that made me realize that there really are 3 states, not 2 :

  - after connect() and before sendmsg() :
     defer_accept = 1, we want to lie to userland and pretend we're
     connected so that userland can call send(). A connect() must
     return either zero or -EISCONN.

  - during first sendmsg(), still connecting :
     the connection is in progress, EINPROGRESS must be returned to
     the first sendmsg().

  - after the first sendmsg() :
     defer_accept = 0 ; connect() must return -EALREADY. We want to
     return real socket states from now on.

Thus I modified defer_accept to take two bits to represent the extra
state we need to indicate the transition. Now sendmsg() upgrades
defer_accept from 1 to 2 before calling __inet_stream_connect(), which
then knows it must return EINPROGRESS to sendmsg().

This way we correctly cover all these situations. Even if we call
connect() again after the first connect() attempt it still matches
the first result :

accept4(7, {sa_family=AF_INET, sin_port=htons(36860), sin_addr=inet_addr("192.168.0.176")}, [128->16], SOCK_NONBLOCK) = 1
setsockopt(1, SOL_TCP, TCP_NODELAY, [1], 4) = 0
accept4(7, 0x7ffc2282fcb0, [128], SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
recvfrom(1, 0x7b53a4, 7006, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 2
fcntl(2, F_SETFL, O_RDONLY|O_NONBLOCK)  = 0
setsockopt(2, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(2, SOL_TCP, 0x1e /* TCP_??? */, [1], 4) = 0
connect(2, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
epoll_ctl(0, EPOLL_CTL_ADD, 1, {EPOLLIN|EPOLLRDHUP, {u32=1, u64=1}}) = 0
epoll_wait(0, [], 200, 0)               = 0
connect(2, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EISCONN (Transport endpoint is already connected)
epoll_wait(0, [], 200, 0)               = 0
recvfrom(2, 0x7b53a4, 8030, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
epoll_ctl(0, EPOLL_CTL_ADD, 2, {EPOLLIN|EPOLLRDHUP, {u32=2, u64=2}}) = 0
epoll_wait(0, [], 200, 1000)            = 0
epoll_wait(0, [], 200, 1000)            = 0
epoll_wait(0, [], 200, 1000)            = 0
epoll_wait(0, [{EPOLLIN, {u32=1, u64=1}}], 200, 1000) = 1
recvfrom(1, "GET / HTTP/1.1\r\n", 8030, 0, NULL, NULL) = 16
sendto(2, "GET / HTTP/1.1\r\n", 16, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 16
epoll_wait(0, [{EPOLLIN, {u32=1, u64=1}}], 200, 1000) = 1
recvfrom(1, "\r\n", 8030, 0, NULL, NULL) = 2
sendto(2, "\r\n", 2, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 2
epoll_wait(0, [{EPOLLIN|EPOLLRDHUP, {u32=2, u64=2}}], 200, 1000) = 1
recvfrom(2, "HTTP/1.1 302 Found\r\nCache-Contro"..., 8030, 0, NULL, NULL) = 98
sendto(1, "HTTP/1.1 302 Found\r\nCache-Contro"..., 98, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_MORE, NULL, 0) = 98
shutdown(1, SHUT_WR)                    = 0
epoll_ctl(0, EPOLL_CTL_DEL, 2, 0x6ff55c) = 0
epoll_wait(0, [{EPOLLIN|EPOLLHUP|EPOLLRDHUP, {u32=1, u64=1}}], 200, 1000) = 1
recvfrom(1, "", 8030, 0, NULL, NULL)    = 0
close(1)                                = 0
shutdown(2, SHUT_WR)                    = 0
close(2)                                = 0

Here's what I changed on top of your patchset :

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0042fed..dc53f7f 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -207,9 +207,10 @@ struct inet_sock {
 				mc_all:1,
 				nodefrag:1;
 	__u8			bind_address_no_port:1,
-				defer_connect:1; /* Indicates that fastopen_connect is set
+				defer_connect:2; /* Indicates that fastopen_connect is set
 						  * and cookie exists so we defer connect
-						  * until first data frame is written
+						  * until first data frame is written.
+						  * Switches from 1 to 2 during first write()
 						  */
 	__u8			rcv_tos;
 	__u8			convert_csum;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e67d572..6dda9d5a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -594,8 +594,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		err = -EISCONN;
 		goto out;
 	case SS_CONNECTING:
-		if (inet_sk(sk)->defer_connect)
-			err = -EINPROGRESS;
+		if (inet_sk(sk)->defer_connect == 2)
+			err = -EINPROGRESS; /* sendmsg started */
+		else if (inet_sk(sk)->defer_connect == 1)
+			err = -EISCONN;     /* suggest to send now */
 		else
 			err = -EALREADY;
 		/* Fall out of switch with err, set for this state */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3c8938d..bd71f60 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1103,6 +1103,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 			inet->inet_dport = 0;
 			sk->sk_route_caps = 0;
 		}
+		/* tell __inet_stream_connect() that we're doing the
+		 * first write.
+		 */
+		inet->defer_connect = 2;
 	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,


Is this also what you had in mind ?

thanks,
Willy

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
       [not found]             ` <CAC15z3g8OxZNET+OnvcyYwHYuHq7QBamgGmjkBHzDr-XnUSGDQ@mail.gmail.com>
@ 2017-01-23 23:01               ` Willy Tarreau
  0 siblings, 0 replies; 30+ messages in thread
From: Willy Tarreau @ 2017-01-23 23:01 UTC (permalink / raw)
  To: Wei Wang
  Cc: Wei Wang, Linux Kernel Network Developers, David Miller,
	Eric Dumazet, Yuchung Cheng

On Mon, Jan 23, 2017 at 02:57:31PM -0800, Wei Wang wrote:
> Yes. That seems to be a valid fix to it.
> Let me try it with my existing test cases as well to see if it works for
> all scenarios I have.

Perfect. Note that since the state 2 is transient I initially thought
about abusing the flags passed to __inet_stream_connect() to say "hey
I'm sendmsg() and not connect()" but that would have been a ugly hack
while here we really have the 3 socket states represented eventhough
one changes twice around a call.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 18:59 ` [PATCH net-next 3/3] net/tcp-fastopen: Add new API support Wei Wang
                     ` (2 preceding siblings ...)
  2017-01-23 21:16   ` Willy Tarreau
@ 2017-01-24  7:30   ` Willy Tarreau
  2017-01-24 17:26     ` Yuchung Cheng
  3 siblings, 1 reply; 30+ messages in thread
From: Willy Tarreau @ 2017-01-24  7:30 UTC (permalink / raw)
  To: Wei Wang; +Cc: netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

On Mon, Jan 23, 2017 at 10:59:22AM -0800, Wei Wang wrote:
> This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
> alternative way to perform Fast Open on the active side (client).

Wei, I think that nothing prevents from reusin the original TCP_FASTOPEN
sockopt instead of adding a new one. The original one does this :

        case TCP_FASTOPEN:
                if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
                    TCPF_LISTEN))) {
                        tcp_fastopen_init_key_once(true);

                        fastopen_queue_tune(sk, val);
                } else {
                        err = -EINVAL;
                }
                break;

and your new option does this :

	case TCP_FASTOPEN_CONNECT:
		if (val > 1 || val < 0) {
			err = -EINVAL;
		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
			if (sk->sk_state == TCP_CLOSE)
				tp->fastopen_connect = val;
			else
				err = -EINVAL;
		} else {
			err = -EOPNOTSUPP;
		}
		break;

Now if we compare :
  - the value ranges are the same (0,1)
  - tcp_fastopen_init_key_once() only performs an initialization once
  - fastopen_queue_tune() only sets sk->max_qlen based on the backlog,
    this has no effect on an outgoing connection ;
  - tp->fastopen_connect can be applied to a listening socket without
    side effect.

Thus I think we can merge them this way :

        case TCP_FASTOPEN:
                if (val >= 0) {
		        if ((sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) &&
			    (sk->sk_state == TCP_CLOSE)
				tp->fastopen_connect = val;

			if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
	                        tcp_fastopen_init_key_once(true);
                        	fastopen_queue_tune(sk, val);
			}
                } else {
                        err = -EINVAL;
                }
                break;

And for the userland, the API is even simpler because we can use the
same TCP_FASTOPEN sockopt regardless of the socket direction. Also,
I don't know if TCP_FASTOPEN is supported on simultaneous connect,
but at least if it works it would be easier to understand this way.

Do you think there's a compelling reason for adding a new option or
are you interested in a small patch to perform the change above ?

Regards,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-24  7:30   ` Willy Tarreau
@ 2017-01-24 17:26     ` Yuchung Cheng
  2017-01-24 17:42       ` Eric Dumazet
  0 siblings, 1 reply; 30+ messages in thread
From: Yuchung Cheng @ 2017-01-24 17:26 UTC (permalink / raw)
  To: Willy Tarreau; +Cc: Wei Wang, netdev, David Miller, Eric Dumazet, Wei Wang

On Mon, Jan 23, 2017 at 11:30 PM, Willy Tarreau <w@1wt.eu> wrote:
>
> On Mon, Jan 23, 2017 at 10:59:22AM -0800, Wei Wang wrote:
> > This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
> > alternative way to perform Fast Open on the active side (client).
>
> Wei, I think that nothing prevents from reusin the original TCP_FASTOPEN
> sockopt instead of adding a new one. The original one does this :
>
>         case TCP_FASTOPEN:
>                 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
>                     TCPF_LISTEN))) {
>                         tcp_fastopen_init_key_once(true);
>
>                         fastopen_queue_tune(sk, val);
>                 } else {
>                         err = -EINVAL;
>                 }
>                 break;
>
> and your new option does this :
>
>         case TCP_FASTOPEN_CONNECT:
>                 if (val > 1 || val < 0) {
>                         err = -EINVAL;
>                 } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
>                         if (sk->sk_state == TCP_CLOSE)
>                                 tp->fastopen_connect = val;
>                         else
>                                 err = -EINVAL;
>                 } else {
>                         err = -EOPNOTSUPP;
>                 }
>                 break;
>
> Now if we compare :
>   - the value ranges are the same (0,1)
>   - tcp_fastopen_init_key_once() only performs an initialization once
>   - fastopen_queue_tune() only sets sk->max_qlen based on the backlog,
>     this has no effect on an outgoing connection ;
>   - tp->fastopen_connect can be applied to a listening socket without
>     side effect.
>
> Thus I think we can merge them this way :
>
>         case TCP_FASTOPEN:
>                 if (val >= 0) {
>                         if ((sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) &&
>                             (sk->sk_state == TCP_CLOSE)
>                                 tp->fastopen_connect = val;
>
>                         if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
>                                 tcp_fastopen_init_key_once(true);
>                                 fastopen_queue_tune(sk, val);
>                         }
>                 } else {
>                         err = -EINVAL;
>                 }
>                 break;
>
> And for the userland, the API is even simpler because we can use the
> same TCP_FASTOPEN sockopt regardless of the socket direction. Also,
> I don't know if TCP_FASTOPEN is supported on simultaneous connect,
> but at least if it works it would be easier to understand this way.
It supports partially (i.e. send SYN data but not accept simul.
SYN-data crossing).
Here is the snippet of packetdrill test we use internally:

`sysctl net.ipv4.tcp_timestamps=0`

// Cache warmup: send a Fast Open cookie request
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
0.000 sendto(3, ..., 0, MSG_FASTOPEN, ..., ...) = -1 EINPROGRESS
(Operation is now in progress)
0.000 > S 0:0(0) <mss 1460,nop,nop,sackOK,nop,wscale 8,FO,nop,nop>
0.010 < S. 123:123(0) ack 1 win 14600 <mss
1460,nop,nop,sackOK,nop,wscale 6,FO abcd1234,nop,nop>
0.010 > . 1:1(0) ack 1
0.020 close(3) = 0
0.020 > F. 1:1(0) ack 1
0.030 < F. 1:1(0) ack 2 win 92
0.030 > .  2:2(0) ack 2


//
// Test: simulatenous fast open
//
+.010 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+.000 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+.000 sendto(4, ..., 1000, MSG_FASTOPEN, ..., ...) = 1000
+.000 > S 0:1000(1000) <mss 1460,nop,nop,sackOK,nop,wscale 8,FO
abcd1234,nop,nop>
// Simul. SYN-data crossing: we don't support that yet so ack only remote ISN
+.005 < S 1234:1734(500) win 14600 <mss 1040,nop,nop,sackOK,nop,wscale
6,FO 87654321,nop,nop>
+.000 > S. 0:0(0) ack 1235 <mss 1460,nop,nop,sackOK,nop,wscale 8>

// SYN data is never retried.
+.045 < S. 1234:1234(0) ack 1001 win 14600 <mss
940,nop,nop,sackOK,nop,wscale 6,FO 12345678,nop,nop>
+.000 > . 1001:1001(0) ack 1
// The other end retries
+.100 < P. 1:501(500) ack 1000 win 257
+.000 > . 1001:1001(0) ack 501
+.000 read(4, ..., 4096) = 500
+.000 close(4) = 0
+.000 > F. 1001:1001(0) ack 501
+.050 < F. 501:501(0) ack 1002 win 257
+.000 > . 1002:1002(0) ack 502


>
> Do you think there's a compelling reason for adding a new option or
> are you interested in a small patch to perform the change above ?
I like the proposal especially other stack also uses TCP_FASTOPEN
https://msdn.microsoft.com/en-us/library/windows/desktop/ms738596(v=vs.85).aspx


>
> Regards,
> Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-24 17:26     ` Yuchung Cheng
@ 2017-01-24 17:42       ` Eric Dumazet
  2017-01-24 18:43         ` Willy Tarreau
  0 siblings, 1 reply; 30+ messages in thread
From: Eric Dumazet @ 2017-01-24 17:42 UTC (permalink / raw)
  To: Yuchung Cheng
  Cc: Willy Tarreau, Wei Wang, netdev, David Miller, Eric Dumazet, Wei Wang

On Tue, 2017-01-24 at 09:26 -0800, Yuchung Cheng wrote:

> >
> > Do you think there's a compelling reason for adding a new option or
> > are you interested in a small patch to perform the change above ?
> I like the proposal especially other stack also uses TCP_FASTOPEN
> https://msdn.microsoft.com/en-us/library/windows/desktop/ms738596(v=vs.85).aspx


Problem is that might break existing applications that were using
TCP_FASTOPEN before a connect() (it was a NOP until now)

I prefer we use a separate new option to be 100% safe, not adding
regressions.

Only new applications, tested, will use this new feature at their risk.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-23 21:16   ` Willy Tarreau
       [not found]     ` <CAEA6p_DxVMAry1PCz_idmk=TGpnnTib3WpWso03FB1oMVXN+sg@mail.gmail.com>
@ 2017-01-24 17:44     ` Eric Dumazet
  2017-01-24 18:34       ` Willy Tarreau
  1 sibling, 1 reply; 30+ messages in thread
From: Eric Dumazet @ 2017-01-24 17:44 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Wei Wang, netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

On Mon, 2017-01-23 at 22:16 +0100, Willy Tarreau wrote:
> Hi Wei,
> 
> first, thanks a lot for doing this, it's really awesome!
> 
> I'm testing it on 4.9 on haproxy and I met a corner case : when I
> perform a connect() to a server and I have nothing to send, upon
> POLLOUT notification since I have nothing to send I simply probe the
> connection using connect() again to see if it returns EISCONN or
> anything else. But here now I'm seeing EINPROGRESS loops.
> 
> To illustrate this, here's what I'm doing :
> 
>                 :8000          :8001
>   [ client ] ---> [ proxy ] ---> [ server ]
> 
> The proxy is configured to enable TFO to the server and the server
> supports TFO as well. The proxy and the server are in fact two proxy
> instances in haproxy running in the same process for convenience.
> 
> When I already have data to send here's what I'm seeing (so it works fine) :
> 
> 06:29:16.861190 accept4(7, {sa_family=AF_INET, sin_port=htons(33986), sin_addr=inet_addr("192.168.0.176")}, [128->16], SOCK
> _NONBLOCK) = 9
> 06:29:16.861277 setsockopt(9, SOL_TCP, TCP_NODELAY, [1], 4) = 0
> 06:29:16.861342 accept4(7, 0x7ffd0d794430, [128], SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
> 06:29:16.861417 recvfrom(9, "BLAH\n", 7006, 0, NULL, NULL) = 5
> 06:29:16.861509 recvfrom(9, 0x2619329, 7001, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
> 06:29:16.861657 socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 10
> 06:29:16.861730 fcntl(10, F_SETFL, O_RDONLY|O_NONBLOCK) = 0
> 06:29:16.861779 setsockopt(10, SOL_TCP, TCP_NODELAY, [1], 4) = 0
> 06:29:16.861882 setsockopt(10, SOL_TCP, 0x1e /* TCP_??? */, [1], 4) = 0
> 06:29:16.861942 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
> 06:29:16.862015 epoll_ctl(3, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}) = 0
> 06:29:16.862072 epoll_wait(3, [], 200, 0) = 0
> 06:29:16.862126 sendto(10, "BLAH\n", 5, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 5
> 06:29:16.862281 epoll_wait(3, [{EPOLLIN, {u32=8, u64=8}}], 200, 0) = 1
> 06:29:16.862334 recvfrom(10, 0x26173a4, 8030, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
> 06:29:16.862385 accept4(8, {sa_family=AF_INET, sin_port=htons(46760), sin_addr=inet_addr("127.0.0.1")}, [128->16], SOCK_NON
> BLOCK) = 11
> 06:29:16.862450 setsockopt(11, SOL_TCP, TCP_NODELAY, [1], 4) = 0
> 06:29:16.862504 accept4(8, 0x7ffd0d794430, [128], SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
> 06:29:16.862564 recvfrom(11, "BLAH\n", 7006, 0, NULL, NULL) = 5
> 
> 
> When I don't have data, here's what I'm seeing :
> 
> 06:29:24.047801 accept4(7, {sa_family=AF_INET, sin_port=htons(33988), sin_addr=inet_addr("192.168.0.176")}, [128->16], SOCK
> _NONBLOCK) = 9
> 06:29:24.047899 setsockopt(9, SOL_TCP, TCP_NODELAY, [1], 4) = 0
> 06:29:24.047966 accept4(7, 0x7ffdedb2c7f0, [128], SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
> 06:29:24.048043 recvfrom(9, 0xd31324, 7006, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
> 06:29:24.048281 socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 10
> 06:29:24.048342 fcntl(10, F_SETFL, O_RDONLY|O_NONBLOCK) = 0
> 06:29:24.048392 setsockopt(10, SOL_TCP, TCP_NODELAY, [1], 4) = 0
> 06:29:24.048447 setsockopt(10, SOL_TCP, 0x1e /* TCP_??? */, [1], 4) = 0
> 06:29:24.048508 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = 0

I believe there is a bug in this application.

It does not check connect() return value.

When 0 is returned, it makes no sense to wait 200 ms :

> 06:29:24.048593 epoll_ctl(3, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}) = 0
> 06:29:24.048651 epoll_wait(3, [], 200, 0) = 0
> 06:29:24.048699 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0

And it makes no sense to call connect() again :

> 06:29:24.048751 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
> S (Operation now in progress)

man connect

<quote>
Generally,  connection-based protocol sockets may successfully connect()
only once;
</quote>


I would prefer we do not add yet another bit in tcp kernel sockets, to
work around some oddity in your program Willy.

> 06:29:24.048808 epoll_ctl(3, EPOLL_CTL_ADD, 10, {EPOLLOUT, {u32=10, u64=10}}) = 0
> 06:29:24.048860 epoll_wait(3, [{EPOLLOUT, {u32=10, u64=10}}], 200, 1000) = 1
> 06:29:24.048912 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
> 06:29:24.048963 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
> S (Operation now in progress)
> 06:29:24.049018 epoll_wait(3, [{EPOLLOUT, {u32=10, u64=10}}], 200, 1000) = 1
> 06:29:24.049072 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
> 06:29:24.049122 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
> S (Operation now in progress)
> 
> 
> I theorically understand why but I think we have something wrong here
> and instead we should have -1 EISCONN (to pretend the connection is
> established) or return EALREADY (to mention that a previous request was
> already made and that we're waiting for the next step).
> 
> While I can instrument my connect() *not* to use TFO when connecting
> without any pending data, I don't always know this (eg when I use
> openssl and cross fingers so that it decides to quickly send something
> on the next round).
> 
> I think it's easy to fall into this tricky corner case and am wondering
> what can be done about it. Does the EINPROGRESS happen only because there
> is no cookie yet ? If so, shouldn't the connect's status change in this
> case ?
> 
> Thanks,
> Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-24 17:44     ` Eric Dumazet
@ 2017-01-24 18:34       ` Willy Tarreau
  2017-01-24 18:51         ` Eric Dumazet
  0 siblings, 1 reply; 30+ messages in thread
From: Willy Tarreau @ 2017-01-24 18:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wei Wang, netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

Hi Eric,

On Tue, Jan 24, 2017 at 09:44:49AM -0800, Eric Dumazet wrote:
> I believe there is a bug in this application.
> 
> It does not check connect() return value.

Yes in fact it does but I noticed the same thing, there's something causing
the event not to be registered or something like this.

> When 0 is returned, it makes no sense to wait 200 ms :
> 
> > 06:29:24.048593 epoll_ctl(3, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}) = 0
> > 06:29:24.048651 epoll_wait(3, [], 200, 0) = 0
> > 06:29:24.048699 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
> 
> And it makes no sense to call connect() again :
> 
> > 06:29:24.048751 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
> > S (Operation now in progress)

I totally agree.

> man connect
> 
> <quote>
> Generally,  connection-based protocol sockets may successfully connect()
> only once;
> </quote>
> 
> 
> I would prefer we do not add yet another bit in tcp kernel sockets, to
> work around some oddity in your program Willy.

I'm fine with chasing the bug on my side and fixing it, but there's a
semantic trouble anyway with returning -EINPROGRESS :
  - connect() = 0 indicates that the connection is established
  - then a further connect() should return -EISCONN, and does so when
    not using TFO

man connect says this regarding EINPROGRESS :

    The socket is nonblocking and the connection cannot be completed immediately.
    It is possible to  select(2)  or  poll(2)  for completion  by  selecting  the
    socket  for  writing.  After  select(2) indicates writability, use getsockopt(2)
    to read the SO_ERROR option at level SOL_SOCKET to determine whether connect()
    completed successfully (SO_ERROR is  zero)  or  unsuccess-fully (SO_ERROR is
    one of the usual error codes listed here, explaining the reason for the failure).

Here we clearly have an incompatibility between this EINPROGRESS saying
that we must poll, and poll returning POLLOUT suggesting that it's now
OK.

I'm totally fine with not using an extra bit in a scarce area, but then
we can either add an extra argument to __inet_stream_connect() to say
"this is sendmsg" or just add an extra flag in the last argument.

But in general I don't feel comfortable with a semantics that doesn't
completely match the current and documented one :-/

Thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-24 17:42       ` Eric Dumazet
@ 2017-01-24 18:43         ` Willy Tarreau
  0 siblings, 0 replies; 30+ messages in thread
From: Willy Tarreau @ 2017-01-24 18:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Yuchung Cheng, Wei Wang, netdev, David Miller, Eric Dumazet, Wei Wang

On Tue, Jan 24, 2017 at 09:42:07AM -0800, Eric Dumazet wrote:
> On Tue, 2017-01-24 at 09:26 -0800, Yuchung Cheng wrote:
> 
> > >
> > > Do you think there's a compelling reason for adding a new option or
> > > are you interested in a small patch to perform the change above ?
> > I like the proposal especially other stack also uses TCP_FASTOPEN
> > https://msdn.microsoft.com/en-us/library/windows/desktop/ms738596(v=vs.85).aspx
> 
> 
> Problem is that might break existing applications that were using
> TCP_FASTOPEN before a connect() (it was a NOP until now)
> 
> I prefer we use a separate new option to be 100% safe, not adding
> regressions.
> 
> Only new applications, tested, will use this new feature at their risk.

That's indeed a good point. I Yuchung's comment above made me wonder
about application's portability but very few OSes will use this and in
the end it might be that portable applications will just add :

   #define TCP_FASTOPEN_CONNECT TCP_FASTOPEN

For other OSes and use TCP_FASTOPEN_CONNECT only for the connect() case.

Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-24 18:34       ` Willy Tarreau
@ 2017-01-24 18:51         ` Eric Dumazet
  2017-01-24 19:11           ` Willy Tarreau
  0 siblings, 1 reply; 30+ messages in thread
From: Eric Dumazet @ 2017-01-24 18:51 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Wei Wang, netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

On Tue, 2017-01-24 at 19:34 +0100, Willy Tarreau wrote:
> Hi Eric,
> 
> On Tue, Jan 24, 2017 at 09:44:49AM -0800, Eric Dumazet wrote:
> > I believe there is a bug in this application.
> > 
> > It does not check connect() return value.
> 
> Yes in fact it does but I noticed the same thing, there's something causing
> the event not to be registered or something like this.
> 
> > When 0 is returned, it makes no sense to wait 200 ms :
> > 
> > > 06:29:24.048593 epoll_ctl(3, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}) = 0
> > > 06:29:24.048651 epoll_wait(3, [], 200, 0) = 0
> > > 06:29:24.048699 getsockopt(10, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
> > 
> > And it makes no sense to call connect() again :
> > 
> > > 06:29:24.048751 connect(10, {sa_family=AF_INET, sin_port=htons(8001), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRES
> > > S (Operation now in progress)
> 
> I totally agree.
> 
> > man connect
> > 
> > <quote>
> > Generally,  connection-based protocol sockets may successfully connect()
> > only once;
> > </quote>
> > 
> > 
> > I would prefer we do not add yet another bit in tcp kernel sockets, to
> > work around some oddity in your program Willy.
> 
> I'm fine with chasing the bug on my side and fixing it, but there's a
> semantic trouble anyway with returning -EINPROGRESS :
>   - connect() = 0 indicates that the connection is established
>   - then a further connect() should return -EISCONN, and does so when
>     not using TFO
> 
> man connect says this regarding EINPROGRESS :
> 
>     The socket is nonblocking and the connection cannot be completed immediately.
>     It is possible to  select(2)  or  poll(2)  for completion  by  selecting  the
>     socket  for  writing.  After  select(2) indicates writability, use getsockopt(2)
>     to read the SO_ERROR option at level SOL_SOCKET to determine whether connect()
>     completed successfully (SO_ERROR is  zero)  or  unsuccess-fully (SO_ERROR is
>     one of the usual error codes listed here, explaining the reason for the failure).
> 
> Here we clearly have an incompatibility between this EINPROGRESS saying
> that we must poll, and poll returning POLLOUT suggesting that it's now
> OK.
> 
> I'm totally fine with not using an extra bit in a scarce area, but then
> we can either add an extra argument to __inet_stream_connect() to say
> "this is sendmsg" or just add an extra flag in the last argument.
> 
> But in general I don't feel comfortable with a semantics that doesn't
> completely match the current and documented one :-/
> 
> Thanks,
> Willy


We do not return -1 / EINPROGRESS but 0

Do not call connect() twice, it is clearly not supposed to work.

Fact that it happened to work is still kept for applications not using
new features (like TCP_FASTOPEN_CONNECT), we wont break this.

I would prefer you submit _if_ needed a patch on top of Wei patch, which
was carefully tested with our ~500 packetdrill tests.



TCP_FASTOPEN_CONNECT + connect() returning 0 is already a violation of
past behavior, in the sense that no connection really happened yet.

An application exploiting this return value and consider the server is
reachable would be mistaken.

We do not support connect() + TCP_FASTOPEN_CONNECT + read(), because we
do not want to add yet another conditional test in recvmsg() fast path
for such feature, while existing sendmsg() can already be used to send a
SYN with FastOpen option.

So people are not expected to blindly add TCP_FASTOPEN_CONNECT to every
TCP socket they allocate/use. This would add too much bloat to the
kernel.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-24 18:51         ` Eric Dumazet
@ 2017-01-24 19:11           ` Willy Tarreau
       [not found]             ` <CAC15z3izWNh7th_yxA_a90vgLnU5XzVDm6vyojq3cMDgQZ71YA@mail.gmail.com>
  0 siblings, 1 reply; 30+ messages in thread
From: Willy Tarreau @ 2017-01-24 19:11 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wei Wang, netdev, David Miller, Eric Dumazet, Yuchung Cheng, Wei Wang

On Tue, Jan 24, 2017 at 10:51:25AM -0800, Eric Dumazet wrote:
> We do not return -1 / EINPROGRESS but 0
> 
> Do not call connect() twice, it is clearly not supposed to work.

Yes it is, it normally returns -1 / EISCONN on a regular socket :

      EISCONN
              The socket is already connected.

> Fact that it happened to work is still kept for applications not using
> new features (like TCP_FASTOPEN_CONNECT), we wont break this.

Sure but as we saw, deeply burried silent bugs having no effect in
existing applications can suddenly become problematic once TFO is
enabled, and the semantics difference between the two are minimal
enough to warrant being closed.

> I would prefer you submit _if_ needed a patch on top of Wei patch, which
> was carefully tested with our ~500 packetdrill tests.

I totally understand and rest assured that I have a great respect for
this amount of test, which is also why I find the feature really exciting.
I'll probably propose something involving an extra argument then, this
will be much easier to review in the perspective of the existing tests.

> TCP_FASTOPEN_CONNECT + connect() returning 0 is already a violation of
> past behavior, in the sense that no connection really happened yet.

I agree but semantically it could be considered that it means "connect()
already called successfully, feel free to proceed with send() whenever
you want" and that's why it's appealing ;-)

> An application exploiting this return value and consider the server is
> reachable would be mistaken.

100% agree, I even had a private discussion regarding this, mentionning
that I already added a test in haproxy to only enable it if there are
data scheduled for leaving. In my case it's easy because I already have
the same test to decide whether or not to disable TCP_QUICKACK to save
one packet by sending the payload with the first ACK. So in short it will
be :

     if (data) {
          if (disable_quick_ack)
               setsockopt(fd, SOL_TCP, TCP_QUICKACK, &zero, sizeof(&zero));
          if (enable_fastopen)
               setsockopt(fd, SOL_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(&one));
     }
     connect(fd, ...);

But I certainly understand that in some implementations it's could be
trickier. That just reminds me that I haven't tested it combined with
splicing. I'll have to try this.

> We do not support connect() + TCP_FASTOPEN_CONNECT + read(), because we
> do not want to add yet another conditional test in recvmsg() fast path
> for such feature, while existing sendmsg() can already be used to send a
> SYN with FastOpen option.

Yes, I think the mechanism is complex enough internally not to try to
make it even more complex :-)

> So people are not expected to blindly add TCP_FASTOPEN_CONNECT to every
> TCP socket they allocate/use. This would add too much bloat to the
> kernel.

I really think that the true benefit of TFO is for HTTP and SSL where
the client speaks first and already has something to say when the decision
to connect is made. It should be clear in implementors' minds that it
cannot be a default setting and that it doesn't make sense.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
       [not found]             ` <CAC15z3izWNh7th_yxA_a90vgLnU5XzVDm6vyojq3cMDgQZ71YA@mail.gmail.com>
@ 2017-01-25 17:22               ` David Miller
  2017-01-25 17:54                 ` Willy Tarreau
  2017-01-25 17:53               ` Willy Tarreau
  1 sibling, 1 reply; 30+ messages in thread
From: David Miller @ 2017-01-25 17:22 UTC (permalink / raw)
  To: tracywwnj; +Cc: w, eric.dumazet, netdev, edumazet, ycheng, weiwan

From: Wei Wang <tracywwnj@gmail.com>
Date: Wed, 25 Jan 2017 09:15:34 -0800

> Looks like you sent a separate patch on top of this patch series to
> address double connect().  Then I think this patch series should be
> good to go.

Indeed, Willy please give some kind of ACK.

Thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
       [not found]             ` <CAC15z3izWNh7th_yxA_a90vgLnU5XzVDm6vyojq3cMDgQZ71YA@mail.gmail.com>
  2017-01-25 17:22               ` David Miller
@ 2017-01-25 17:53               ` Willy Tarreau
  1 sibling, 0 replies; 30+ messages in thread
From: Willy Tarreau @ 2017-01-25 17:53 UTC (permalink / raw)
  To: Wei Wang
  Cc: Eric Dumazet, Linux Kernel Network Developers, David Miller,
	Eric Dumazet, Yuchung Cheng, Wei Wang

Hi Wei,

On Wed, Jan 25, 2017 at 09:15:34AM -0800, Wei Wang wrote:
> Willy,
> 
> Looks like you sent a separate patch on top of this patch series to address
> double connect().

Yes, sorry, I wanted to reply to this thread after the git-send-email
and got caught immediately after :-)

So as suggested by Eric in order to make the review easier, it was done
on top of your series.

> Then I think this patch series should be good to go.
> I will get your patch tested with our TFO test cases.

I think so as well. Thanks for running the tests. On my side I could fix
the haproxy bug which triggered this, and could verify the the whole
series works fine both with and without the haproxy fix. So I think we're
good now.

Thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-25 17:22               ` David Miller
@ 2017-01-25 17:54                 ` Willy Tarreau
  2017-01-25 18:54                   ` Wei Wang
  0 siblings, 1 reply; 30+ messages in thread
From: Willy Tarreau @ 2017-01-25 17:54 UTC (permalink / raw)
  To: David Miller; +Cc: tracywwnj, eric.dumazet, netdev, edumazet, ycheng, weiwan

On Wed, Jan 25, 2017 at 12:22:05PM -0500, David Miller wrote:
> From: Wei Wang <tracywwnj@gmail.com>
> Date: Wed, 25 Jan 2017 09:15:34 -0800
> 
> > Looks like you sent a separate patch on top of this patch series to
> > address double connect().  Then I think this patch series should be
> > good to go.
> 
> Indeed, Willy please give some kind of ACK.

Yes sorry David, for me it's OK. If Wei runs his whole series of tests
on it again, it should be perfect.

thanks,
Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-25 17:54                 ` Willy Tarreau
@ 2017-01-25 18:54                   ` Wei Wang
  2017-01-25 19:03                     ` Eric Dumazet
  2017-01-25 19:03                     ` David Miller
  0 siblings, 2 replies; 30+ messages in thread
From: Wei Wang @ 2017-01-25 18:54 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: David Miller, Eric Dumazet, Linux Kernel Network Developers,
	Eric Dumazet, Yuchung Cheng, Wei Wang

> Yes sorry David, for me it's OK. If Wei runs his whole series of tests
> on it again, it should be perfect.

I just ran all the TFO related tests with Willy's patch on top of this
patch series.
And everything passes.

On Wed, Jan 25, 2017 at 9:54 AM, Willy Tarreau <w@1wt.eu> wrote:
> On Wed, Jan 25, 2017 at 12:22:05PM -0500, David Miller wrote:
>> From: Wei Wang <tracywwnj@gmail.com>
>> Date: Wed, 25 Jan 2017 09:15:34 -0800
>>
>> > Looks like you sent a separate patch on top of this patch series to
>> > address double connect().  Then I think this patch series should be
>> > good to go.
>>
>> Indeed, Willy please give some kind of ACK.
>
> Yes sorry David, for me it's OK. If Wei runs his whole series of tests
> on it again, it should be perfect.
>
> thanks,
> Willy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-25 18:54                   ` Wei Wang
@ 2017-01-25 19:03                     ` Eric Dumazet
  2017-01-25 19:03                     ` David Miller
  1 sibling, 0 replies; 30+ messages in thread
From: Eric Dumazet @ 2017-01-25 19:03 UTC (permalink / raw)
  To: Wei Wang
  Cc: Willy Tarreau, David Miller, Linux Kernel Network Developers,
	Eric Dumazet, Yuchung Cheng, Wei Wang

On Wed, 2017-01-25 at 10:54 -0800, Wei Wang wrote:
> > Yes sorry David, for me it's OK. If Wei runs his whole series of tests
> > on it again, it should be perfect.
> 
> I just ran all the TFO related tests with Willy's patch on top of this
> patch series.
> And everything passes.

Note that I am not sure we correctly test that a second connect() can be
done, and I am not sure kernel would check that the remote IP and
destination port is the same.

Ie what should happen for

setsockopt(fd, SOL_TCP, TCP_FASTOPEN_CONNECT, &on, 4)
connect(fd,  "1.2.3.4:80")
connect(fd, "55.66.77.88:4000")

This multiple connect() thing should have been forbidden in the first
place really.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-25 18:54                   ` Wei Wang
  2017-01-25 19:03                     ` Eric Dumazet
@ 2017-01-25 19:03                     ` David Miller
  2017-01-25 19:30                       ` Wei Wang
  1 sibling, 1 reply; 30+ messages in thread
From: David Miller @ 2017-01-25 19:03 UTC (permalink / raw)
  To: tracywwnj; +Cc: w, eric.dumazet, netdev, edumazet, ycheng, weiwan

From: Wei Wang <tracywwnj@gmail.com>
Date: Wed, 25 Jan 2017 10:54:50 -0800

>> Yes sorry David, for me it's OK. If Wei runs his whole series of tests
>> on it again, it should be perfect.
> 
> I just ran all the TFO related tests with Willy's patch on top of this
> patch series.
> And everything passes.

Great, I'll apply everything, thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace API support
  2017-01-23 18:59 [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace API support Wei Wang
                   ` (2 preceding siblings ...)
  2017-01-23 18:59 ` [PATCH net-next 3/3] net/tcp-fastopen: Add new API support Wei Wang
@ 2017-01-25 19:09 ` David Miller
  3 siblings, 0 replies; 30+ messages in thread
From: David Miller @ 2017-01-25 19:09 UTC (permalink / raw)
  To: tracywwnj; +Cc: netdev, edumazet, ycheng, weiwan

From: Wei Wang <tracywwnj@gmail.com>
Date: Mon, 23 Jan 2017 10:59:19 -0800

> The patch series is to add support for new userspace API for TCP fastopen
> sockets. 
> In the current code, user has to call sendto()/sendmsg() with special flag
> MSG_FASTOPEN for TCP fastopen sockets. This API is quite different from the
> normal TCP socket API and can be cumbersome for applications to make use
> fastopen sockets.
> So this new patch introduces a new way of using TCP fastopen sockets which
> is similar to normal TCP sockets with a new sockopt TCP_FASTOPEN_CONNECT.
> More details about it is described in the third patch.
> (First 2 patches are preparations for the third patch.)

Series applied.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 3/3] net/tcp-fastopen: Add new API support
  2017-01-25 19:03                     ` David Miller
@ 2017-01-25 19:30                       ` Wei Wang
  0 siblings, 0 replies; 30+ messages in thread
From: Wei Wang @ 2017-01-25 19:30 UTC (permalink / raw)
  To: David Miller
  Cc: Willy Tarreau, Eric Dumazet, Linux Kernel Network Developers,
	Eric Dumazet, Yuchung Cheng, Wei Wang

> Note that I am not sure we correctly test that a second connect() can be
> done, and I am not sure kernel would check that the remote IP and
> destination port is the same.

> Ie what should happen for

> setsockopt(fd, SOL_TCP, TCP_FASTOPEN_CONNECT, &on, 4)
> connect(fd,  "1.2.3.4:80")
> connect(fd, "55.66.77.88:4000")

I wrote a simple code to test this scenario and the second connect()
returns EISCONN as well even though the destination IP is different.

On Wed, Jan 25, 2017 at 11:03 AM, David Miller <davem@davemloft.net> wrote:
> From: Wei Wang <tracywwnj@gmail.com>
> Date: Wed, 25 Jan 2017 10:54:50 -0800
>
>>> Yes sorry David, for me it's OK. If Wei runs his whole series of tests
>>> on it again, it should be perfect.
>>
>> I just ran all the TFO related tests with Willy's patch on top of this
>> patch series.
>> And everything passes.
>
> Great, I'll apply everything, thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2017-01-25 19:30 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-23 18:59 [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace API support Wei Wang
2017-01-23 18:59 ` [PATCH net-next 1/3] net/tcp-fastopen: refactor cookie check logic Wei Wang
2017-01-23 19:13   ` Eric Dumazet
2017-01-23 20:51   ` Yuchung Cheng
2017-01-23 18:59 ` [PATCH net-next 2/3] net: Remove __sk_dst_reset() in tcp_v6_connect() Wei Wang
2017-01-23 19:14   ` Eric Dumazet
2017-01-23 18:59 ` [PATCH net-next 3/3] net/tcp-fastopen: Add new API support Wei Wang
2017-01-23 19:15   ` Eric Dumazet
2017-01-23 20:55   ` Yuchung Cheng
2017-01-23 21:16   ` Willy Tarreau
     [not found]     ` <CAEA6p_DxVMAry1PCz_idmk=TGpnnTib3WpWso03FB1oMVXN+sg@mail.gmail.com>
2017-01-23 21:37       ` Willy Tarreau
2017-01-23 22:01         ` Willy Tarreau
2017-01-23 22:33           ` Willy Tarreau
     [not found]             ` <CAC15z3g8OxZNET+OnvcyYwHYuHq7QBamgGmjkBHzDr-XnUSGDQ@mail.gmail.com>
2017-01-23 23:01               ` Willy Tarreau
2017-01-24 17:44     ` Eric Dumazet
2017-01-24 18:34       ` Willy Tarreau
2017-01-24 18:51         ` Eric Dumazet
2017-01-24 19:11           ` Willy Tarreau
     [not found]             ` <CAC15z3izWNh7th_yxA_a90vgLnU5XzVDm6vyojq3cMDgQZ71YA@mail.gmail.com>
2017-01-25 17:22               ` David Miller
2017-01-25 17:54                 ` Willy Tarreau
2017-01-25 18:54                   ` Wei Wang
2017-01-25 19:03                     ` Eric Dumazet
2017-01-25 19:03                     ` David Miller
2017-01-25 19:30                       ` Wei Wang
2017-01-25 17:53               ` Willy Tarreau
2017-01-24  7:30   ` Willy Tarreau
2017-01-24 17:26     ` Yuchung Cheng
2017-01-24 17:42       ` Eric Dumazet
2017-01-24 18:43         ` Willy Tarreau
2017-01-25 19:09 ` [PATCH net-next 0/3] net/tcp-fastopen: Add new userspace " David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.