bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests
@ 2021-10-02  0:37 Cong Wang
  2021-10-02  0:37 ` [Patch bpf v3 1/4] net: rename ->stream_memory_read to ->sock_is_readable Cong Wang
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Cong Wang @ 2021-10-02  0:37 UTC (permalink / raw)
  To: netdev; +Cc: bpf, Cong Wang

From: Cong Wang <cong.wang@bytedance.com>

This patchset fixes ->poll() for sockets in sockmap and updates
selftests accordingly with select(). Please check each patch
for more details.

Fixes: c50524ec4e3a ("Merge branch 'sockmap: add sockmap support for unix datagram socket'")
Fixes: 89d69c5d0fbc ("Merge branch 'sockmap: introduce BPF_SK_SKB_VERDICT and support UDP'")

---
v3: drop sk_psock_get_checked()
    reuse tcp_bpf_sock_is_readable()

v2: rename and reuse ->stream_memory_read()
    fix a compile error in sk_psock_get_checked()

Cong Wang (3):
  net: rename ->stream_memory_read to ->sock_is_readable
  skmsg: extract and reuse sk_msg_is_readable()
  net: implement ->sock_is_readable() for UDP and AF_UNIX

Yucong Sun (1):
  selftests/bpf: use recv_timeout() instead of retries

 include/linux/skmsg.h                         |  1 +
 include/net/sock.h                            |  8 +-
 include/net/tls.h                             |  2 +-
 net/core/skmsg.c                              | 14 ++++
 net/ipv4/tcp.c                                |  5 +-
 net/ipv4/tcp_bpf.c                            | 15 +---
 net/ipv4/udp.c                                |  2 +
 net/ipv4/udp_bpf.c                            |  1 +
 net/tls/tls_main.c                            |  4 +-
 net/tls/tls_sw.c                              |  2 +-
 net/unix/af_unix.c                            |  4 +
 net/unix/unix_bpf.c                           |  2 +
 .../selftests/bpf/prog_tests/sockmap_listen.c | 75 +++++--------------
 13 files changed, 57 insertions(+), 78 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Patch bpf v3 1/4] net: rename ->stream_memory_read to ->sock_is_readable
  2021-10-02  0:37 [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests Cong Wang
@ 2021-10-02  0:37 ` Cong Wang
  2021-10-02  0:37 ` [Patch bpf v3 2/4] skmsg: extract and reuse sk_msg_is_readable() Cong Wang
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Cong Wang @ 2021-10-02  0:37 UTC (permalink / raw)
  To: netdev
  Cc: bpf, Cong Wang, John Fastabend, Daniel Borkmann, Jakub Sitnicki,
	Lorenz Bauer

From: Cong Wang <cong.wang@bytedance.com>

The proto ops ->stream_memory_read() is currently only used
by TCP to check whether psock queue is empty or not. We need
to rename it before reusing it for non-TCP protocols, and
adjust the exsiting users accordingly.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/net/sock.h | 8 +++++++-
 include/net/tls.h  | 2 +-
 net/ipv4/tcp.c     | 5 +----
 net/ipv4/tcp_bpf.c | 4 ++--
 net/tls/tls_main.c | 4 ++--
 net/tls/tls_sw.c   | 2 +-
 6 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750e8..88395943f7b9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1205,7 +1205,7 @@ struct proto {
 #endif
 
 	bool			(*stream_memory_free)(const struct sock *sk, int wake);
-	bool			(*stream_memory_read)(const struct sock *sk);
+	bool			(*sock_is_readable)(struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
@@ -2788,4 +2788,10 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs);
 
 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
 
+static inline bool sk_is_readable(struct sock *sk)
+{
+	if (sk->sk_prot->sock_is_readable)
+		return sk->sk_prot->sock_is_readable(sk);
+	return false;
+}
 #endif	/* _SOCK_H */
diff --git a/include/net/tls.h b/include/net/tls.h
index be4b3e1cac46..01d2e3744393 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -375,7 +375,7 @@ void tls_sw_release_resources_rx(struct sock *sk);
 void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
-bool tls_sw_stream_read(const struct sock *sk);
+bool tls_sw_sock_is_readable(struct sock *sk);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..f5c336f8b0c8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,10 +486,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target)
 {
 	if (tcp_epollin_ready(sk, target))
 		return true;
-
-	if (sk->sk_prot->stream_memory_read)
-		return sk->sk_prot->stream_memory_read(sk);
-	return false;
+	return sk_is_readable(sk);
 }
 
 /*
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index d3e9386b493e..0175dbcb7722 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -150,7 +150,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
 EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
 
 #ifdef CONFIG_BPF_SYSCALL
-static bool tcp_bpf_stream_read(const struct sock *sk)
+static bool tcp_bpf_sock_is_readable(struct sock *sk)
 {
 	struct sk_psock *psock;
 	bool empty = true;
@@ -479,7 +479,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
 	prot[TCP_BPF_BASE].unhash		= sock_map_unhash;
 	prot[TCP_BPF_BASE].close		= sock_map_close;
 	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
-	prot[TCP_BPF_BASE].stream_memory_read	= tcp_bpf_stream_read;
+	prot[TCP_BPF_BASE].sock_is_readable	= tcp_bpf_sock_is_readable;
 
 	prot[TCP_BPF_TX]			= prot[TCP_BPF_BASE];
 	prot[TCP_BPF_TX].sendmsg		= tcp_bpf_sendmsg;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index fde56ff49163..9ab81db8a654 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -681,12 +681,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 
 	prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
 	prot[TLS_BASE][TLS_SW].recvmsg		  = tls_sw_recvmsg;
-	prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
+	prot[TLS_BASE][TLS_SW].sock_is_readable   = tls_sw_sock_is_readable;
 	prot[TLS_BASE][TLS_SW].close		  = tls_sk_proto_close;
 
 	prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
 	prot[TLS_SW][TLS_SW].recvmsg		= tls_sw_recvmsg;
-	prot[TLS_SW][TLS_SW].stream_memory_read	= tls_sw_stream_read;
+	prot[TLS_SW][TLS_SW].sock_is_readable   = tls_sw_sock_is_readable;
 	prot[TLS_SW][TLS_SW].close		= tls_sk_proto_close;
 
 #ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4feb95e34b64..d5d09bd817b7 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2026,7 +2026,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 	return copied ? : err;
 }
 
-bool tls_sw_stream_read(const struct sock *sk)
+bool tls_sw_sock_is_readable(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [Patch bpf v3 2/4] skmsg: extract and reuse sk_msg_is_readable()
  2021-10-02  0:37 [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests Cong Wang
  2021-10-02  0:37 ` [Patch bpf v3 1/4] net: rename ->stream_memory_read to ->sock_is_readable Cong Wang
@ 2021-10-02  0:37 ` Cong Wang
  2021-10-02  0:37 ` [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX Cong Wang
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Cong Wang @ 2021-10-02  0:37 UTC (permalink / raw)
  To: netdev
  Cc: bpf, Cong Wang, John Fastabend, Daniel Borkmann, Jakub Sitnicki,
	Lorenz Bauer

From: Cong Wang <cong.wang@bytedance.com>

tcp_bpf_sock_is_readable() is pretty much generic,
we can extract it and reuse it for non-TCP sockets.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/skmsg.h |  1 +
 net/core/skmsg.c      | 14 ++++++++++++++
 net/ipv4/tcp_bpf.c    | 15 +--------------
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 14ab0c0bc924..1ce9a9eb223b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -128,6 +128,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     struct sk_msg *msg, u32 bytes);
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		   int len, int flags);
+bool sk_msg_is_readable(struct sock *sk);
 
 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
 {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2d6249b28928..a86ef7e844f8 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -474,6 +474,20 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
 
+bool sk_msg_is_readable(struct sock *sk)
+{
+	struct sk_psock *psock;
+	bool empty = true;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock))
+		empty = list_empty(&psock->ingress_msg);
+	rcu_read_unlock();
+	return !empty;
+}
+EXPORT_SYMBOL_GPL(sk_msg_is_readable);
+
 static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
 						  struct sk_buff *skb)
 {
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 0175dbcb7722..dabbc93e31b6 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
 EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
 
 #ifdef CONFIG_BPF_SYSCALL
-static bool tcp_bpf_sock_is_readable(struct sock *sk)
-{
-	struct sk_psock *psock;
-	bool empty = true;
-
-	rcu_read_lock();
-	psock = sk_psock(sk);
-	if (likely(psock))
-		empty = list_empty(&psock->ingress_msg);
-	rcu_read_unlock();
-	return !empty;
-}
-
 static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
 			     long timeo)
 {
@@ -479,7 +466,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
 	prot[TCP_BPF_BASE].unhash		= sock_map_unhash;
 	prot[TCP_BPF_BASE].close		= sock_map_close;
 	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
-	prot[TCP_BPF_BASE].sock_is_readable	= tcp_bpf_sock_is_readable;
+	prot[TCP_BPF_BASE].sock_is_readable	= sk_msg_is_readable;
 
 	prot[TCP_BPF_TX]			= prot[TCP_BPF_BASE];
 	prot[TCP_BPF_TX].sendmsg		= tcp_bpf_sendmsg;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX
  2021-10-02  0:37 [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests Cong Wang
  2021-10-02  0:37 ` [Patch bpf v3 1/4] net: rename ->stream_memory_read to ->sock_is_readable Cong Wang
  2021-10-02  0:37 ` [Patch bpf v3 2/4] skmsg: extract and reuse sk_msg_is_readable() Cong Wang
@ 2021-10-02  0:37 ` Cong Wang
  2021-10-07 20:00   ` Daniel Borkmann
  2021-10-02  0:37 ` [Patch bpf v3 4/4] selftests/bpf: use recv_timeout() instead of retries Cong Wang
  2021-10-06 15:24 ` [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests John Fastabend
  4 siblings, 1 reply; 9+ messages in thread
From: Cong Wang @ 2021-10-02  0:37 UTC (permalink / raw)
  To: netdev
  Cc: bpf, Cong Wang, Yucong Sun, John Fastabend, Daniel Borkmann,
	Jakub Sitnicki, Lorenz Bauer

From: Cong Wang <cong.wang@bytedance.com>

Yucong noticed we can't poll() sockets in sockmap even
when they are the destination sockets of redirections.
This is because we never poll any psock queues in ->poll(),
except for TCP. With ->sock_is_readable() now we can
overwrite >sock_is_readable(), invoke and implement it for
both UDP and AF_UNIX sockets.

Reported-by: Yucong Sun <sunyucong@gmail.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 net/ipv4/udp.c      | 2 ++
 net/ipv4/udp_bpf.c  | 1 +
 net/unix/af_unix.c  | 4 ++++
 net/unix/unix_bpf.c | 2 ++
 4 files changed, 9 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2a7825a5b842..4a7e15a43a68 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2866,6 +2866,8 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
 		mask &= ~(EPOLLIN | EPOLLRDNORM);
 
+	if (sk_is_readable(sk))
+		mask |= EPOLLIN | EPOLLRDNORM;
 	return mask;
 
 }
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 7a1d5f473878..bbe6569c9ad3 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -114,6 +114,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
 	*prot        = *base;
 	prot->close  = sock_map_close;
 	prot->recvmsg = udp_bpf_recvmsg;
+	prot->sock_is_readable = sk_msg_is_readable;
 }
 
 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f505b89bda6a..3e65d9f5531d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3029,6 +3029,8 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 	/* readable? */
 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
+	if (sk_is_readable(sk))
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
@@ -3068,6 +3070,8 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 	/* readable? */
 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
+	if (sk_is_readable(sk))
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
 	if (sk->sk_type == SOCK_SEQPACKET) {
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index b927e2baae50..452376c6f419 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -102,6 +102,7 @@ static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto
 	*prot        = *base;
 	prot->close  = sock_map_close;
 	prot->recvmsg = unix_bpf_recvmsg;
+	prot->sock_is_readable = sk_msg_is_readable;
 }
 
 static void unix_stream_bpf_rebuild_protos(struct proto *prot,
@@ -110,6 +111,7 @@ static void unix_stream_bpf_rebuild_protos(struct proto *prot,
 	*prot        = *base;
 	prot->close  = sock_map_close;
 	prot->recvmsg = unix_bpf_recvmsg;
+	prot->sock_is_readable = sk_msg_is_readable;
 	prot->unhash  = sock_map_unhash;
 }
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [Patch bpf v3 4/4] selftests/bpf: use recv_timeout() instead of retries
  2021-10-02  0:37 [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests Cong Wang
                   ` (2 preceding siblings ...)
  2021-10-02  0:37 ` [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX Cong Wang
@ 2021-10-02  0:37 ` Cong Wang
  2021-10-06 15:24 ` [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests John Fastabend
  4 siblings, 0 replies; 9+ messages in thread
From: Cong Wang @ 2021-10-02  0:37 UTC (permalink / raw)
  To: netdev
  Cc: bpf, Yucong Sun, John Fastabend, Daniel Borkmann, Jakub Sitnicki,
	Lorenz Bauer, Cong Wang

From: Yucong Sun <sunyucong@gmail.com>

We use non-blocking sockets in those tests, retrying for
EAGAIN is ugly because there is no upper bound for the packet
arrival time, at least in theory. After we fix poll() on
sockmap sockets, now we can switch to select()+recv().

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Yucong Sun <sunyucong@gmail.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 .../selftests/bpf/prog_tests/sockmap_listen.c | 75 +++++--------------
 1 file changed, 20 insertions(+), 55 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index 5c5979046523..d88bb65b74cc 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -949,7 +949,6 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
 	int err, n;
 	u32 key;
 	char b;
-	int retries = 100;
 
 	zero_verdict_count(verd_mapfd);
 
@@ -1002,17 +1001,11 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
 		goto close_peer1;
 	if (pass != 1)
 		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
-again:
-	n = read(c0, &b, 1);
-	if (n < 0) {
-		if (errno == EAGAIN && retries--) {
-			usleep(1000);
-			goto again;
-		}
-		FAIL_ERRNO("%s: read", log_prefix);
-	}
+	n = recv_timeout(c0, &b, 1, 0, IO_TIMEOUT_SEC);
+	if (n < 0)
+		FAIL_ERRNO("%s: recv_timeout", log_prefix);
 	if (n == 0)
-		FAIL("%s: incomplete read", log_prefix);
+		FAIL("%s: incomplete recv", log_prefix);
 
 close_peer1:
 	xclose(p1);
@@ -1571,7 +1564,6 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
 	const char *log_prefix = redir_mode_str(mode);
 	int c0, c1, p0, p1;
 	unsigned int pass;
-	int retries = 100;
 	int err, n;
 	int sfd[2];
 	u32 key;
@@ -1606,17 +1598,11 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
 	if (pass != 1)
 		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-	n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-	if (n < 0) {
-		if (errno == EAGAIN && retries--) {
-			usleep(1000);
-			goto again;
-		}
-		FAIL_ERRNO("%s: read", log_prefix);
-	}
+	n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+	if (n < 0)
+		FAIL_ERRNO("%s: recv_timeout", log_prefix);
 	if (n == 0)
-		FAIL("%s: incomplete read", log_prefix);
+		FAIL("%s: incomplete recv", log_prefix);
 
 close:
 	xclose(c1);
@@ -1748,7 +1734,6 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
 	const char *log_prefix = redir_mode_str(mode);
 	int c0, c1, p0, p1;
 	unsigned int pass;
-	int retries = 100;
 	int err, n;
 	u32 key;
 	char b;
@@ -1781,17 +1766,11 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
 	if (pass != 1)
 		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-	n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-	if (n < 0) {
-		if (errno == EAGAIN && retries--) {
-			usleep(1000);
-			goto again;
-		}
-		FAIL_ERRNO("%s: read", log_prefix);
-	}
+	n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+	if (n < 0)
+		FAIL_ERRNO("%s: recv_timeout", log_prefix);
 	if (n == 0)
-		FAIL("%s: incomplete read", log_prefix);
+		FAIL("%s: incomplete recv", log_prefix);
 
 close_cli1:
 	xclose(c1);
@@ -1841,7 +1820,6 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
 	const char *log_prefix = redir_mode_str(mode);
 	int c0, c1, p0, p1;
 	unsigned int pass;
-	int retries = 100;
 	int err, n;
 	int sfd[2];
 	u32 key;
@@ -1876,17 +1854,11 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
 	if (pass != 1)
 		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-	n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-	if (n < 0) {
-		if (errno == EAGAIN && retries--) {
-			usleep(1000);
-			goto again;
-		}
-		FAIL_ERRNO("%s: read", log_prefix);
-	}
+	n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+	if (n < 0)
+		FAIL_ERRNO("%s: recv_timeout", log_prefix);
 	if (n == 0)
-		FAIL("%s: incomplete read", log_prefix);
+		FAIL("%s: incomplete recv", log_prefix);
 
 close_cli1:
 	xclose(c1);
@@ -1932,7 +1904,6 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
 	int sfd[2];
 	u32 key;
 	char b;
-	int retries = 100;
 
 	zero_verdict_count(verd_mapfd);
 
@@ -1963,17 +1934,11 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
 	if (pass != 1)
 		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-	n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-	if (n < 0) {
-		if (errno == EAGAIN && retries--) {
-			usleep(1000);
-			goto again;
-		}
-		FAIL_ERRNO("%s: read", log_prefix);
-	}
+	n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+	if (n < 0)
+		FAIL_ERRNO("%s: recv_timeout", log_prefix);
 	if (n == 0)
-		FAIL("%s: incomplete read", log_prefix);
+		FAIL("%s: incomplete recv", log_prefix);
 
 close:
 	xclose(c1);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests
  2021-10-02  0:37 [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests Cong Wang
                   ` (3 preceding siblings ...)
  2021-10-02  0:37 ` [Patch bpf v3 4/4] selftests/bpf: use recv_timeout() instead of retries Cong Wang
@ 2021-10-06 15:24 ` John Fastabend
  4 siblings, 0 replies; 9+ messages in thread
From: John Fastabend @ 2021-10-06 15:24 UTC (permalink / raw)
  To: Cong Wang, netdev; +Cc: bpf, Cong Wang

Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
> 
> This patchset fixes ->poll() for sockets in sockmap and updates
> selftests accordingly with select(). Please check each patch
> for more details.
> 
> Fixes: c50524ec4e3a ("Merge branch 'sockmap: add sockmap support for unix datagram socket'")
> Fixes: 89d69c5d0fbc ("Merge branch 'sockmap: introduce BPF_SK_SKB_VERDICT and support UDP'")

Looks good thanks.

Acked-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX
  2021-10-02  0:37 ` [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX Cong Wang
@ 2021-10-07 20:00   ` Daniel Borkmann
  2021-10-07 20:53     ` Cong Wang
  0 siblings, 1 reply; 9+ messages in thread
From: Daniel Borkmann @ 2021-10-07 20:00 UTC (permalink / raw)
  To: Cong Wang, netdev
  Cc: bpf, Cong Wang, Yucong Sun, John Fastabend, Jakub Sitnicki, Lorenz Bauer

On 10/2/21 2:37 AM, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
> 
> Yucong noticed we can't poll() sockets in sockmap even
> when they are the destination sockets of redirections.
> This is because we never poll any psock queues in ->poll(),
> except for TCP. With ->sock_is_readable() now we can
> overwrite >sock_is_readable(), invoke and implement it for
> both UDP and AF_UNIX sockets.
> 
> Reported-by: Yucong Sun <sunyucong@gmail.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
>   net/ipv4/udp.c      | 2 ++
>   net/ipv4/udp_bpf.c  | 1 +
>   net/unix/af_unix.c  | 4 ++++
>   net/unix/unix_bpf.c | 2 ++
>   4 files changed, 9 insertions(+)
> 
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 2a7825a5b842..4a7e15a43a68 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -2866,6 +2866,8 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
>   	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
>   		mask &= ~(EPOLLIN | EPOLLRDNORM);
>   
> +	if (sk_is_readable(sk))
> +		mask |= EPOLLIN | EPOLLRDNORM;

udp_poll() has this extra logic around first_packet_length() which drops all bad csum'ed
skbs. How does this stand in relation to sk_msg_is_readable()? Is this a concern as well
there? Maybe makes sense to elaborate a bit more in the commit message for context / future
reference.

Thanks,
Daniel

>   	return mask;
>   
>   }



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX
  2021-10-07 20:00   ` Daniel Borkmann
@ 2021-10-07 20:53     ` Cong Wang
  2021-10-08 15:26       ` John Fastabend
  0 siblings, 1 reply; 9+ messages in thread
From: Cong Wang @ 2021-10-07 20:53 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Linux Kernel Network Developers, bpf, Cong Wang, Yucong Sun,
	John Fastabend, Jakub Sitnicki, Lorenz Bauer

On Thu, Oct 7, 2021 at 1:00 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 10/2/21 2:37 AM, Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > Yucong noticed we can't poll() sockets in sockmap even
> > when they are the destination sockets of redirections.
> > This is because we never poll any psock queues in ->poll(),
> > except for TCP. With ->sock_is_readable() now we can
> > overwrite >sock_is_readable(), invoke and implement it for
> > both UDP and AF_UNIX sockets.
> >
> > Reported-by: Yucong Sun <sunyucong@gmail.com>
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
> >   net/ipv4/udp.c      | 2 ++
> >   net/ipv4/udp_bpf.c  | 1 +
> >   net/unix/af_unix.c  | 4 ++++
> >   net/unix/unix_bpf.c | 2 ++
> >   4 files changed, 9 insertions(+)
> >
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index 2a7825a5b842..4a7e15a43a68 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -2866,6 +2866,8 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
> >           !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
> >               mask &= ~(EPOLLIN | EPOLLRDNORM);
> >
> > +     if (sk_is_readable(sk))
> > +             mask |= EPOLLIN | EPOLLRDNORM;
>
> udp_poll() has this extra logic around first_packet_length() which drops all bad csum'ed
> skbs. How does this stand in relation to sk_msg_is_readable()? Is this a concern as well
> there? Maybe makes sense to elaborate a bit more in the commit message for context / future
> reference.

We don't validate UDP checksums on sockmap RX path, so
it is okay to leave it as it is, but it is worth a comment like
you suggest. I will add a comment in this code.

If we really need to validate the checksum, it should be addressed
in a separate patch(set), not in this one.

Thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX
  2021-10-07 20:53     ` Cong Wang
@ 2021-10-08 15:26       ` John Fastabend
  0 siblings, 0 replies; 9+ messages in thread
From: John Fastabend @ 2021-10-08 15:26 UTC (permalink / raw)
  To: Cong Wang, Daniel Borkmann
  Cc: Linux Kernel Network Developers, bpf, Cong Wang, Yucong Sun,
	John Fastabend, Jakub Sitnicki, Lorenz Bauer

Cong Wang wrote:
> On Thu, Oct 7, 2021 at 1:00 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >
> > On 10/2/21 2:37 AM, Cong Wang wrote:
> > > From: Cong Wang <cong.wang@bytedance.com>
> > >
> > > Yucong noticed we can't poll() sockets in sockmap even
> > > when they are the destination sockets of redirections.
> > > This is because we never poll any psock queues in ->poll(),
> > > except for TCP. With ->sock_is_readable() now we can
> > > overwrite >sock_is_readable(), invoke and implement it for
> > > both UDP and AF_UNIX sockets.
> > >
> > > Reported-by: Yucong Sun <sunyucong@gmail.com>
> > > Cc: John Fastabend <john.fastabend@gmail.com>
> > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > > ---
> > >   net/ipv4/udp.c      | 2 ++
> > >   net/ipv4/udp_bpf.c  | 1 +
> > >   net/unix/af_unix.c  | 4 ++++
> > >   net/unix/unix_bpf.c | 2 ++
> > >   4 files changed, 9 insertions(+)
> > >
> > > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > > index 2a7825a5b842..4a7e15a43a68 100644
> > > --- a/net/ipv4/udp.c
> > > +++ b/net/ipv4/udp.c
> > > @@ -2866,6 +2866,8 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
> > >           !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
> > >               mask &= ~(EPOLLIN | EPOLLRDNORM);
> > >
> > > +     if (sk_is_readable(sk))
> > > +             mask |= EPOLLIN | EPOLLRDNORM;
> >
> > udp_poll() has this extra logic around first_packet_length() which drops all bad csum'ed
> > skbs. How does this stand in relation to sk_msg_is_readable()? Is this a concern as well
> > there? Maybe makes sense to elaborate a bit more in the commit message for context / future
> > reference.
> 
> We don't validate UDP checksums on sockmap RX path, so
> it is okay to leave it as it is, but it is worth a comment like
> you suggest. I will add a comment in this code.
> 
> If we really need to validate the checksum, it should be addressed
> in a separate patch(set), not in this one.
> 
> Thanks.

We should validate the checksum before creating the sk_msg so that any parsers running
on top of UDP don't parse a bad checksum payload or pass a bad checksum up to user
space. I thought there would be a check in the read_sock side, but I didn't see it.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-10-08 15:26 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-02  0:37 [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests Cong Wang
2021-10-02  0:37 ` [Patch bpf v3 1/4] net: rename ->stream_memory_read to ->sock_is_readable Cong Wang
2021-10-02  0:37 ` [Patch bpf v3 2/4] skmsg: extract and reuse sk_msg_is_readable() Cong Wang
2021-10-02  0:37 ` [Patch bpf v3 3/4] net: implement ->sock_is_readable() for UDP and AF_UNIX Cong Wang
2021-10-07 20:00   ` Daniel Borkmann
2021-10-07 20:53     ` Cong Wang
2021-10-08 15:26       ` John Fastabend
2021-10-02  0:37 ` [Patch bpf v3 4/4] selftests/bpf: use recv_timeout() instead of retries Cong Wang
2021-10-06 15:24 ` [Patch bpf v3 0/4] sock_map: fix ->poll() and update selftests John Fastabend

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).