All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 net-next 0/8] inet: more data-race fixes
@ 2023-09-22  3:42 Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 1/8] inet: implement lockless IP_MULTICAST_TTL Eric Dumazet
                   ` (8 more replies)
  0 siblings, 9 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

This series fixes some existing data-races on inet fields:

inet->mc_ttl, inet->pmtudisc, inet->tos, inet->uc_index,
inet->mc_index and inet->mc_addr.

While fixing them, we convert eight socket options
to lockless implementations.

v2: addressed David Ahern's feedback on ("inet: implement lockless IP_TOS")
    Added David's Reviewed-by: tag on other patches.

Eric Dumazet (8):
  inet: implement lockless IP_MULTICAST_TTL
  inet: implement lockless IP_MTU_DISCOVER
  inet: implement lockless IP_TOS
  inet: lockless getsockopt(IP_OPTIONS)
  inet: lockless getsockopt(IP_MTU)
  inet: implement lockless getsockopt(IP_UNICAST_IF)
  inet: lockless IP_PKTOPTIONS implementation
  inet: implement lockless getsockopt(IP_MULTICAST_IF)

 include/net/ip.h                              |  16 +-
 include/net/route.h                           |   4 +-
 net/dccp/ipv4.c                               |   2 +-
 net/ipv4/datagram.c                           |   6 +-
 net/ipv4/inet_diag.c                          |   2 +-
 net/ipv4/ip_output.c                          |  13 +-
 net/ipv4/ip_sockglue.c                        | 192 ++++++++----------
 net/ipv4/ping.c                               |   8 +-
 net/ipv4/raw.c                                |  19 +-
 net/ipv4/tcp_ipv4.c                           |   9 +-
 net/ipv4/udp.c                                |  18 +-
 net/mptcp/sockopt.c                           |   8 +-
 net/netfilter/ipvs/ip_vs_sync.c               |   4 +-
 net/sctp/protocol.c                           |   4 +-
 .../selftests/net/mptcp/mptcp_connect.sh      |   2 +-
 15 files changed, 150 insertions(+), 157 deletions(-)

-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 1/8] inet: implement lockless IP_MULTICAST_TTL
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 2/8] inet: implement lockless IP_MTU_DISCOVER Eric Dumazet
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

inet->mc_ttl can be read locklessly.

Implement proper lockless reads and writes to inet->mc_ttl

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
---
 net/ipv4/ip_output.c            |  2 +-
 net/ipv4/ip_sockglue.c          | 31 ++++++++++++++++---------------
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4ab877cf6d35f229761986d5c6a17eb2a3ad4043..adad16f1e872ce20941a087b3965fdb040868d4e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1430,7 +1430,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	if (cork->ttl != 0)
 		ttl = cork->ttl;
 	else if (rt->rt_type == RTN_MULTICAST)
-		ttl = inet->mc_ttl;
+		ttl = READ_ONCE(inet->mc_ttl);
 	else
 		ttl = ip_select_ttl(inet, &rt->dst);
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cce9cb25f3b31cd57fa883ae0dedb6829d8da2fa..4ad3003378ae6b186513000264f77b54a7babe6d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1039,6 +1039,17 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 
 		WRITE_ONCE(inet->min_ttl, val);
 		return 0;
+	case IP_MULTICAST_TTL:
+		if (sk->sk_type == SOCK_STREAM)
+			return -EINVAL;
+		if (optlen < 1)
+			return -EINVAL;
+		if (val == -1)
+			val = 1;
+		if (val < 0 || val > 255)
+			return -EINVAL;
+		WRITE_ONCE(inet->mc_ttl, val);
+		return 0;
 	}
 
 	err = 0;
@@ -1101,17 +1112,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
-	case IP_MULTICAST_TTL:
-		if (sk->sk_type == SOCK_STREAM)
-			goto e_inval;
-		if (optlen < 1)
-			goto e_inval;
-		if (val == -1)
-			val = 1;
-		if (val < 0 || val > 255)
-			goto e_inval;
-		inet->mc_ttl = val;
-		break;
 	case IP_UNICAST_IF:
 	{
 		struct net_device *dev = NULL;
@@ -1592,6 +1592,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_MINTTL:
 		val = READ_ONCE(inet->min_ttl);
 		goto copyval;
+	case IP_MULTICAST_TTL:
+		val = READ_ONCE(inet->mc_ttl);
+		goto copyval;
 	}
 
 	if (needs_rtnl)
@@ -1649,9 +1652,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		}
 		break;
 	}
-	case IP_MULTICAST_TTL:
-		val = inet->mc_ttl;
-		break;
 	case IP_UNICAST_IF:
 		val = (__force int)htonl((__u32) inet->uc_index);
 		break;
@@ -1718,7 +1718,8 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
 		}
 		if (inet_test_bit(TTL, sk)) {
-			int hlim = inet->mc_ttl;
+			int hlim = READ_ONCE(inet->mc_ttl);
+
 			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
 		}
 		if (inet_test_bit(TOS, sk)) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 5820a8156c4701bb163f569d735c389d7a8e3820..3eed1670224888acf639cff06537ddf2505461bb 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1316,7 +1316,7 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
 
 	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
 	lock_sock(sk);
-	inet->mc_ttl = ttl;
+	WRITE_ONCE(inet->mc_ttl, ttl);
 #ifdef CONFIG_IP_VS_IPV6
 	if (sk->sk_family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 2/8] inet: implement lockless IP_MTU_DISCOVER
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 1/8] inet: implement lockless IP_MULTICAST_TTL Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

inet->pmtudisc can be read locklessly.

Implement proper lockless reads and writes to inet->pmtudisc

ip_sock_set_mtu_discover() can now be called from arbitrary
contexts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
---
 include/net/ip.h                | 13 ++++++++-----
 net/ipv4/ip_output.c            |  7 ++++---
 net/ipv4/ip_sockglue.c          | 17 ++++++-----------
 net/ipv4/ping.c                 |  2 +-
 net/ipv4/raw.c                  |  2 +-
 net/ipv4/udp.c                  |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 7 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 3489a1cca5e7bc315ba646f6bc125b2b6ded9416..46933a0d98eac2db40c2e88006125588b8f8143e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -434,19 +434,22 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 
 static inline bool ip_sk_accept_pmtu(const struct sock *sk)
 {
-	return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE &&
-	       inet_sk(sk)->pmtudisc != IP_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+	return pmtudisc != IP_PMTUDISC_INTERFACE &&
+	       pmtudisc != IP_PMTUDISC_OMIT;
 }
 
 static inline bool ip_sk_use_pmtu(const struct sock *sk)
 {
-	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
+	return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE;
 }
 
 static inline bool ip_sk_ignore_df(const struct sock *sk)
 {
-	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
-	       inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+	return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT;
 }
 
 static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index adad16f1e872ce20941a087b3965fdb040868d4e..2be281f184a5fe5a695ccd51fabe69fa45bea0b8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1387,8 +1387,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	struct ip_options *opt = NULL;
 	struct rtable *rt = (struct rtable *)cork->dst;
 	struct iphdr *iph;
+	u8 pmtudisc, ttl;
 	__be16 df = 0;
-	__u8 ttl;
 
 	skb = __skb_dequeue(queue);
 	if (!skb)
@@ -1418,8 +1418,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If ignore_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc == IP_PMTUDISC_DO ||
-	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
+	pmtudisc = READ_ONCE(inet->pmtudisc);
+	if (pmtudisc == IP_PMTUDISC_DO ||
+	    pmtudisc == IP_PMTUDISC_PROBE ||
 	    (skb->len <= dst_mtu(&rt->dst) &&
 	     ip_dont_fragment(sk, &rt->dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4ad3003378ae6b186513000264f77b54a7babe6d..6d874cc03c8b4e88d79ebc50a6db105606b6ae60 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -622,9 +622,7 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val)
 {
 	if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
 		return -EINVAL;
-	lock_sock(sk);
-	inet_sk(sk)->pmtudisc = val;
-	release_sock(sk);
+	WRITE_ONCE(inet_sk(sk)->pmtudisc, val);
 	return 0;
 }
 EXPORT_SYMBOL(ip_sock_set_mtu_discover);
@@ -1050,6 +1048,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		WRITE_ONCE(inet->mc_ttl, val);
 		return 0;
+	case IP_MTU_DISCOVER:
+		return ip_sock_set_mtu_discover(sk, val);
 	}
 
 	err = 0;
@@ -1107,11 +1107,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 	case IP_TOS:	/* This sets both TOS and Precedence */
 		__ip_sock_set_tos(sk, val);
 		break;
-	case IP_MTU_DISCOVER:
-		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
-			goto e_inval;
-		inet->pmtudisc = val;
-		break;
 	case IP_UNICAST_IF:
 	{
 		struct net_device *dev = NULL;
@@ -1595,6 +1590,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_MULTICAST_TTL:
 		val = READ_ONCE(inet->mc_ttl);
 		goto copyval;
+	case IP_MTU_DISCOVER:
+		val = READ_ONCE(inet->pmtudisc);
+		goto copyval;
 	}
 
 	if (needs_rtnl)
@@ -1634,9 +1632,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_TOS:
 		val = inet->tos;
 		break;
-	case IP_MTU_DISCOVER:
-		val = inet->pmtudisc;
-		break;
 	case IP_MTU:
 	{
 		struct dst_entry *dst;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4dd809b7b18867154df42bc28809b886913e253c..50d12b0c8d46fdcd9b448c3ebc90395ebf426075 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -551,7 +551,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
 		case ICMP_DEST_UNREACH:
 			if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
 				ipv4_sk_update_pmtu(skb, sk, info);
-				if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+				if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
 					err = EMSGSIZE;
 					harderr = 1;
 					break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4b5db5d1edc279df1fd7412af2845a7a79c95ec8..ade1aecd7c71184d753a28a67bc9b30087247db4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -239,7 +239,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 		if (code > NR_ICMP_UNREACH)
 			break;
 		if (code == ICMP_FRAG_NEEDED) {
-			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+			harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT;
 			err = EMSGSIZE;
 		} else {
 			err = icmp_err_convert[code].errno;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ff984b63547daf0ecfb4ab96956aee2f8d589d..731a723dc80816f0b5b0803d7397f7e9e8cd8b09 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -750,7 +750,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	case ICMP_DEST_UNREACH:
 		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
 			ipv4_sk_update_pmtu(skb, sk, info);
-			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+			if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
 				err = EMSGSIZE;
 				harderr = 1;
 				break;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3eed1670224888acf639cff06537ddf2505461bb..4f6c795588fbdbf084154025b8172e0fd2ea7384 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1335,7 +1335,7 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
 
 	/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
 	lock_sock(sk);
-	inet->pmtudisc = val;
+	WRITE_ONCE(inet->pmtudisc, val);
 #ifdef CONFIG_IP_VS_IPV6
 	if (sk->sk_family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 1/8] inet: implement lockless IP_MULTICAST_TTL Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 2/8] inet: implement lockless IP_MTU_DISCOVER Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-09-22 13:07   ` David Ahern
  2023-10-18  3:37   ` Christoph Paasch
  2023-09-22  3:42 ` [PATCH v2 net-next 4/8] inet: lockless getsockopt(IP_OPTIONS) Eric Dumazet
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

Some reads of inet->tos are racy.

Add needed READ_ONCE() annotations and make the IP_TOS option lockless.

v2: missing changes in include/net/route.h (David Ahern)

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/ip.h                              |  3 +-
 include/net/route.h                           |  4 +--
 net/dccp/ipv4.c                               |  2 +-
 net/ipv4/inet_diag.c                          |  2 +-
 net/ipv4/ip_output.c                          |  4 +--
 net/ipv4/ip_sockglue.c                        | 29 ++++++++-----------
 net/ipv4/tcp_ipv4.c                           |  9 +++---
 net/mptcp/sockopt.c                           |  8 ++---
 net/sctp/protocol.c                           |  4 +--
 .../selftests/net/mptcp/mptcp_connect.sh      |  2 +-
 10 files changed, 31 insertions(+), 36 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 46933a0d98eac2db40c2e88006125588b8f8143e..6fbc0dcf4b9780d60b5e5d6f84d6017fbf57d0ae 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -258,7 +258,7 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
 
 static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet)
 {
-	return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(inet->tos);
+	return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(READ_ONCE(inet->tos));
 }
 
 /* datagram.c */
@@ -810,6 +810,5 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val);
 void ip_sock_set_pktinfo(struct sock *sk);
 void ip_sock_set_recverr(struct sock *sk);
 void ip_sock_set_tos(struct sock *sk, int val);
-void  __ip_sock_set_tos(struct sock *sk, int val);
 
 #endif	/* _IP_H */
diff --git a/include/net/route.h b/include/net/route.h
index 51a45b1887b562bfb473f9f8c50897d5d3073476..5c248a8e3d0e3ed757ad95f546032c2c49729eec 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -37,7 +37,7 @@
 
 #define RTO_ONLINK	0x01
 
-#define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
+#define RT_CONN_FLAGS(sk)   (RT_TOS(READ_ONCE(inet_sk(sk)->tos)) | sock_flag(sk, SOCK_LOCALROUTE))
 #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
 static inline __u8 ip_sock_rt_scope(const struct sock *sk)
@@ -50,7 +50,7 @@ static inline __u8 ip_sock_rt_scope(const struct sock *sk)
 
 static inline __u8 ip_sock_rt_tos(const struct sock *sk)
 {
-	return RT_TOS(inet_sk(sk)->tos);
+	return RT_TOS(READ_ONCE(inet_sk(sk)->tos));
 }
 
 struct ip_tunnel_info;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 69453b936bd557c77a790a27ff64cc91e5a58296..1b8cbfda6e5dbd098a58d92639a64bc8db83ff23 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -511,7 +511,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
 					    rcu_dereference(ireq->ireq_opt),
-					    inet_sk(sk)->tos);
+					    READ_ONCE(inet_sk(sk)->tos));
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e13a84433413ed88088435ff8e11efeb30fc3cca..1f2d7a8bd060e59baeb00fcb1c6aabfcb3bb213d 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -134,7 +134,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 	 * hence this needs to be included regardless of socket family.
 	 */
 	if (ext & (1 << (INET_DIAG_TOS - 1)))
-		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+		if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
 			goto errout;
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2be281f184a5fe5a695ccd51fabe69fa45bea0b8..85320f92e8363d59e92c54139044cbab7e0561fa 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -544,7 +544,7 @@ EXPORT_SYMBOL(__ip_queue_xmit);
 
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 {
-	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+	return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
 }
 EXPORT_SYMBOL(ip_queue_xmit);
 
@@ -1438,7 +1438,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	iph = ip_hdr(skb);
 	iph->version = 4;
 	iph->ihl = 5;
-	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+	iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
 	iph->frag_off = df;
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6d874cc03c8b4e88d79ebc50a6db105606b6ae60..50c008efbb6de7303621dd30b178c90cb3f5a2fc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -585,25 +585,20 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	return err;
 }
 
-void __ip_sock_set_tos(struct sock *sk, int val)
+void ip_sock_set_tos(struct sock *sk, int val)
 {
+	u8 old_tos = READ_ONCE(inet_sk(sk)->tos);
+
 	if (sk->sk_type == SOCK_STREAM) {
 		val &= ~INET_ECN_MASK;
-		val |= inet_sk(sk)->tos & INET_ECN_MASK;
+		val |= old_tos & INET_ECN_MASK;
 	}
-	if (inet_sk(sk)->tos != val) {
-		inet_sk(sk)->tos = val;
+	if (old_tos != val) {
+		WRITE_ONCE(inet_sk(sk)->tos, val);
 		WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
 		sk_dst_reset(sk);
 	}
 }
-
-void ip_sock_set_tos(struct sock *sk, int val)
-{
-	lock_sock(sk);
-	__ip_sock_set_tos(sk, val);
-	release_sock(sk);
-}
 EXPORT_SYMBOL(ip_sock_set_tos);
 
 void ip_sock_set_freebind(struct sock *sk)
@@ -1050,6 +1045,9 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 		return 0;
 	case IP_MTU_DISCOVER:
 		return ip_sock_set_mtu_discover(sk, val);
+	case IP_TOS:	/* This sets both TOS and Precedence */
+		ip_sock_set_tos(sk, val);
+		return 0;
 	}
 
 	err = 0;
@@ -1104,9 +1102,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			}
 		}
 		break;
-	case IP_TOS:	/* This sets both TOS and Precedence */
-		__ip_sock_set_tos(sk, val);
-		break;
 	case IP_UNICAST_IF:
 	{
 		struct net_device *dev = NULL;
@@ -1593,6 +1588,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_MTU_DISCOVER:
 		val = READ_ONCE(inet->pmtudisc);
 		goto copyval;
+	case IP_TOS:
+		val = READ_ONCE(inet->tos);
+		goto copyval;
 	}
 
 	if (needs_rtnl)
@@ -1629,9 +1627,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 		return 0;
 	}
-	case IP_TOS:
-		val = inet->tos;
-		break;
 	case IP_MTU:
 	{
 		struct dst_entry *dst;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f13eb7e23d03f3681055257e6ebea0612ae3f9b3..1f89ba58e71eff74d8ed75019de9e70d2f4d5926 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1024,10 +1024,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 
-		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
-				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
-				(inet_sk(sk)->tos & INET_ECN_MASK) :
-				inet_sk(sk)->tos;
+		tos = READ_ONCE(inet_sk(sk)->tos);
+
+		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+			      (tos & INET_ECN_MASK);
 
 		if (!INET_ECN_is_capable(tos) &&
 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 8260202c00669fd7d2eed2f94a3c2cf225a0d89c..155e8472ba9b83c35c6f827b2bb35c0be4127917 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -734,11 +734,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
 
 	lock_sock(sk);
 	sockopt_seq_inc(msk);
-	val = inet_sk(sk)->tos;
+	val = READ_ONCE(inet_sk(sk)->tos);
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		__ip_sock_set_tos(ssk, val);
+		ip_sock_set_tos(ssk, val);
 	}
 	release_sock(sk);
 
@@ -1343,7 +1343,7 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
 
 	switch (optname) {
 	case IP_TOS:
-		return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
+		return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
 	}
 
 	return -EOPNOTSUPP;
@@ -1411,7 +1411,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
 	ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
 	ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
 	ssk->sk_ipv6only = sk->sk_ipv6only;
-	__ip_sock_set_tos(ssk, inet_sk(sk)->tos);
+	ip_sock_set_tos(ssk, inet_sk(sk)->tos);
 
 	if (sk->sk_userlocks & tx_rx_locks) {
 		ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2185f44198deb002bc8ed7f1b0f3fe02d6bb9f09..94c6dd53cd62d1fa6236d07946e8d5ff68eb587d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,7 +426,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	struct dst_entry *dst = NULL;
 	union sctp_addr *daddr = &t->ipaddr;
 	union sctp_addr dst_saddr;
-	__u8 tos = inet_sk(sk)->tos;
+	u8 tos = READ_ONCE(inet_sk(sk)->tos);
 
 	if (t->dscp & SCTP_DSCP_SET_MASK)
 		tos = t->dscp & SCTP_DSCP_VAL_MASK;
@@ -1057,7 +1057,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	struct flowi4 *fl4 = &t->fl.u.ip4;
 	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
-	__u8 dscp = inet->tos;
+	__u8 dscp = READ_ONCE(inet->tos);
 	__be16 df = 0;
 
 	pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index b1fc8afd072dc6ddde8d561a675a5549a9a37dba..61a2a1988ce69ffa17e0dd8e629eac550f4f7d99 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -716,7 +716,7 @@ run_test_transparent()
 	# the required infrastructure in MPTCP sockopt code. To support TOS, the
 	# following function has been exported (T). Not great but better than
 	# checking for a specific kernel version.
-	if ! mptcp_lib_kallsyms_has "T __ip_sock_set_tos$"; then
+	if ! mptcp_lib_kallsyms_has "T ip_sock_set_tos$"; then
 		echo "INFO: ${msg} not supported by the kernel: SKIP"
 		mptcp_lib_result_skip "${TEST_GROUP}"
 		return
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 4/8] inet: lockless getsockopt(IP_OPTIONS)
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
                   ` (2 preceding siblings ...)
  2023-09-22  3:42 ` [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 5/8] inet: lockless getsockopt(IP_MTU) Eric Dumazet
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

inet->inet_opt being RCU protected, we can use RCU instead
of locking the socket.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
---
 net/ipv4/ip_sockglue.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 50c008efbb6de7303621dd30b178c90cb3f5a2fc..45d89487914a12061f05c192004ad79f0abbf756 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1591,27 +1591,20 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_TOS:
 		val = READ_ONCE(inet->tos);
 		goto copyval;
-	}
-
-	if (needs_rtnl)
-		rtnl_lock();
-	sockopt_lock_sock(sk);
-
-	switch (optname) {
 	case IP_OPTIONS:
 	{
 		unsigned char optbuf[sizeof(struct ip_options)+40];
 		struct ip_options *opt = (struct ip_options *)optbuf;
 		struct ip_options_rcu *inet_opt;
 
-		inet_opt = rcu_dereference_protected(inet->inet_opt,
-						     lockdep_sock_is_held(sk));
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
 		opt->optlen = 0;
 		if (inet_opt)
 			memcpy(optbuf, &inet_opt->opt,
 			       sizeof(struct ip_options) +
 			       inet_opt->opt.optlen);
-		sockopt_release_sock(sk);
+		rcu_read_unlock();
 
 		if (opt->optlen == 0) {
 			len = 0;
@@ -1627,6 +1620,13 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 		return 0;
 	}
+	}
+
+	if (needs_rtnl)
+		rtnl_lock();
+	sockopt_lock_sock(sk);
+
+	switch (optname) {
 	case IP_MTU:
 	{
 		struct dst_entry *dst;
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 5/8] inet: lockless getsockopt(IP_MTU)
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
                   ` (3 preceding siblings ...)
  2023-09-22  3:42 ` [PATCH v2 net-next 4/8] inet: lockless getsockopt(IP_OPTIONS) Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF) Eric Dumazet
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

sk_dst_get() does not require socket lock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
---
 net/ipv4/ip_sockglue.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 45d89487914a12061f05c192004ad79f0abbf756..04579e390ddd4dadb8a107ef0b5da15e7a60f1ff 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1620,13 +1620,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 		return 0;
 	}
-	}
-
-	if (needs_rtnl)
-		rtnl_lock();
-	sockopt_lock_sock(sk);
-
-	switch (optname) {
 	case IP_MTU:
 	{
 		struct dst_entry *dst;
@@ -1636,12 +1629,17 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			val = dst_mtu(dst);
 			dst_release(dst);
 		}
-		if (!val) {
-			sockopt_release_sock(sk);
+		if (!val)
 			return -ENOTCONN;
-		}
-		break;
+		goto copyval;
+	}
 	}
+
+	if (needs_rtnl)
+		rtnl_lock();
+	sockopt_lock_sock(sk);
+
+	switch (optname) {
 	case IP_UNICAST_IF:
 		val = (__force int)htonl((__u32) inet->uc_index);
 		break;
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF)
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
                   ` (4 preceding siblings ...)
  2023-09-22  3:42 ` [PATCH v2 net-next 5/8] inet: lockless getsockopt(IP_MTU) Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 7/8] inet: lockless IP_PKTOPTIONS implementation Eric Dumazet
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

Add missing READ_ONCE() annotations when reading inet->uc_index

Implementing getsockopt(IP_UNICAST_IF) locklessly seems possible;
the setsockopt() part might not be possible at the moment.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
---
 net/ipv4/datagram.c    |  2 +-
 net/ipv4/ip_sockglue.c | 10 +++++-----
 net/ipv4/ping.c        |  2 +-
 net/ipv4/raw.c         | 13 +++++++------
 net/ipv4/udp.c         | 12 +++++++-----
 5 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index cb5dbee9e018fbba1bc1e5705e8bec6c4203af56..1480e9ebdfef445960e1f70f34f33a0e0c52b65b 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -43,7 +43,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 		if (!saddr)
 			saddr = inet->mc_addr;
 	} else if (!oif) {
-		oif = inet->uc_index;
+		oif = READ_ONCE(inet->uc_index);
 	}
 	fl4 = &inet->cork.fl.u.ip4;
 	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 04579e390ddd4dadb8a107ef0b5da15e7a60f1ff..58995526c6e965d613b8cdea61b84916d608a6fb 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1113,7 +1113,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 
 		ifindex = (__force int)ntohl((__force __be32)val);
 		if (ifindex == 0) {
-			inet->uc_index = 0;
+			WRITE_ONCE(inet->uc_index, 0);
 			err = 0;
 			break;
 		}
@@ -1130,7 +1130,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 		if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
 			break;
 
-		inet->uc_index = ifindex;
+		WRITE_ONCE(inet->uc_index, ifindex);
 		err = 0;
 		break;
 	}
@@ -1633,6 +1633,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -ENOTCONN;
 		goto copyval;
 	}
+	case IP_UNICAST_IF:
+		val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
+		goto copyval;
 	}
 
 	if (needs_rtnl)
@@ -1640,9 +1643,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	sockopt_lock_sock(sk);
 
 	switch (optname) {
-	case IP_UNICAST_IF:
-		val = (__force int)htonl((__u32) inet->uc_index);
-		break;
 	case IP_MULTICAST_IF:
 	{
 		struct in_addr addr;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 50d12b0c8d46fdcd9b448c3ebc90395ebf426075..66ad1f95af49f222afe0ee75b9163dd0af0a2c49 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -777,7 +777,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		if (!saddr)
 			saddr = inet->mc_addr;
 	} else if (!ipc.oif)
-		ipc.oif = inet->uc_index;
+		ipc.oif = READ_ONCE(inet->uc_index);
 
 	flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ade1aecd7c71184d753a28a67bc9b30087247db4..e2357d23202e5a39832bb1550c365de9a836c363 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -482,7 +482,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int free = 0;
 	__be32 daddr;
 	__be32 saddr;
-	int err;
+	int uc_index, err;
 	struct ip_options_data opt_copy;
 	struct raw_frag_vec rfv;
 	int hdrincl;
@@ -576,24 +576,25 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	tos = get_rttos(&ipc, inet);
 	scope = ip_sendmsg_scope(inet, &ipc, msg);
 
+	uc_index = READ_ONCE(inet->uc_index);
 	if (ipv4_is_multicast(daddr)) {
 		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
 			ipc.oif = inet->mc_index;
 		if (!saddr)
 			saddr = inet->mc_addr;
 	} else if (!ipc.oif) {
-		ipc.oif = inet->uc_index;
-	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+		ipc.oif = uc_index;
+	} else if (ipv4_is_lbcast(daddr) && uc_index) {
 		/* oif is set, packet is to local broadcast
 		 * and uc_index is set. oif is most likely set
 		 * by sk_bound_dev_if. If uc_index != oif check if the
 		 * oif is an L3 master and uc_index is an L3 slave.
 		 * If so, we want to allow the send using the uc_index.
 		 */
-		if (ipc.oif != inet->uc_index &&
+		if (ipc.oif != uc_index &&
 		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
-							      inet->uc_index)) {
-			ipc.oif = inet->uc_index;
+							      uc_index)) {
+			ipc.oif = uc_index;
 		}
 	}
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 731a723dc80816f0b5b0803d7397f7e9e8cd8b09..1e0c3aba1e5a88c7ba50a28511412a1710f1bab5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1055,6 +1055,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
 	struct sk_buff *skb;
 	struct ip_options_data opt_copy;
+	int uc_index;
 
 	if (len > 0xFFFF)
 		return -EMSGSIZE;
@@ -1173,6 +1174,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	if (scope == RT_SCOPE_LINK)
 		connected = 0;
 
+	uc_index = READ_ONCE(inet->uc_index);
 	if (ipv4_is_multicast(daddr)) {
 		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
 			ipc.oif = inet->mc_index;
@@ -1180,18 +1182,18 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			saddr = inet->mc_addr;
 		connected = 0;
 	} else if (!ipc.oif) {
-		ipc.oif = inet->uc_index;
-	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+		ipc.oif = uc_index;
+	} else if (ipv4_is_lbcast(daddr) && uc_index) {
 		/* oif is set, packet is to local broadcast and
 		 * uc_index is set. oif is most likely set
 		 * by sk_bound_dev_if. If uc_index != oif check if the
 		 * oif is an L3 master and uc_index is an L3 slave.
 		 * If so, we want to allow the send using the uc_index.
 		 */
-		if (ipc.oif != inet->uc_index &&
+		if (ipc.oif != uc_index &&
 		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
-							      inet->uc_index)) {
-			ipc.oif = inet->uc_index;
+							      uc_index)) {
+			ipc.oif = uc_index;
 		}
 	}
 
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 7/8] inet: lockless IP_PKTOPTIONS implementation
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
                   ` (5 preceding siblings ...)
  2023-09-22  3:42 ` [PATCH v2 net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF) Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-09-22  3:42 ` [PATCH v2 net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF) Eric Dumazet
  2023-10-01 18:40 ` [PATCH v2 net-next 0/8] inet: more data-race fixes patchwork-bot+netdevbpf
  8 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

Current implementation is already lockless, because the socket
lock is released before reading socket fields.

Add missing READ_ONCE() annotations.

Note that the corresponding WRITE_ONCE() annotations are needed; the
order of the patches does not really matter.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
---
 net/ipv4/ip_sockglue.c | 76 ++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 58995526c6e965d613b8cdea61b84916d608a6fb..1ee01ff64171c94b6b244589518a53ce807a212d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1633,6 +1633,43 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -ENOTCONN;
 		goto copyval;
 	}
+	case IP_PKTOPTIONS:
+	{
+		struct msghdr msg;
+
+		if (sk->sk_type != SOCK_STREAM)
+			return -ENOPROTOOPT;
+
+		if (optval.is_kernel) {
+			msg.msg_control_is_user = false;
+			msg.msg_control = optval.kernel;
+		} else {
+			msg.msg_control_is_user = true;
+			msg.msg_control_user = optval.user;
+		}
+		msg.msg_controllen = len;
+		msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
+
+		if (inet_test_bit(PKTINFO, sk)) {
+			struct in_pktinfo info;
+
+			info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+			info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+			info.ipi_ifindex = READ_ONCE(inet->mc_index);
+			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+		}
+		if (inet_test_bit(TTL, sk)) {
+			int hlim = READ_ONCE(inet->mc_ttl);
+
+			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+		}
+		if (inet_test_bit(TOS, sk)) {
+			int tos = READ_ONCE(inet->rcv_tos);
+			put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+		}
+		len -= msg.msg_controllen;
+		return copy_to_sockptr(optlen, &len, sizeof(int));
+	}
 	case IP_UNICAST_IF:
 		val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
 		goto copyval;
@@ -1678,45 +1715,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		else
 			err = ip_get_mcast_msfilter(sk, optval, optlen, len);
 		goto out;
-	case IP_PKTOPTIONS:
-	{
-		struct msghdr msg;
-
-		sockopt_release_sock(sk);
-
-		if (sk->sk_type != SOCK_STREAM)
-			return -ENOPROTOOPT;
-
-		if (optval.is_kernel) {
-			msg.msg_control_is_user = false;
-			msg.msg_control = optval.kernel;
-		} else {
-			msg.msg_control_is_user = true;
-			msg.msg_control_user = optval.user;
-		}
-		msg.msg_controllen = len;
-		msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
-
-		if (inet_test_bit(PKTINFO, sk)) {
-			struct in_pktinfo info;
-
-			info.ipi_addr.s_addr = inet->inet_rcv_saddr;
-			info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
-			info.ipi_ifindex = inet->mc_index;
-			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
-		}
-		if (inet_test_bit(TTL, sk)) {
-			int hlim = READ_ONCE(inet->mc_ttl);
-
-			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
-		}
-		if (inet_test_bit(TOS, sk)) {
-			int tos = inet->rcv_tos;
-			put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
-		}
-		len -= msg.msg_controllen;
-		return copy_to_sockptr(optlen, &len, sizeof(int));
-	}
 	case IP_LOCAL_PORT_RANGE:
 		val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
 		break;
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF)
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
                   ` (6 preceding siblings ...)
  2023-09-22  3:42 ` [PATCH v2 net-next 7/8] inet: lockless IP_PKTOPTIONS implementation Eric Dumazet
@ 2023-09-22  3:42 ` Eric Dumazet
  2023-10-01 18:40 ` [PATCH v2 net-next 0/8] inet: more data-race fixes patchwork-bot+netdevbpf
  8 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-09-22  3:42 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet

Add missing annotations to inet->mc_index and inet->mc_addr
to fix data-races.

getsockopt(IP_MULTICAST_IF) can be lockless.

setsockopt() side is left for later.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
---
 net/ipv4/datagram.c    |  4 ++--
 net/ipv4/ip_sockglue.c | 25 ++++++++++++-------------
 net/ipv4/ping.c        |  4 ++--
 net/ipv4/raw.c         |  4 ++--
 net/ipv4/udp.c         |  4 ++--
 5 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 1480e9ebdfef445960e1f70f34f33a0e0c52b65b..2cc50cbfc2a31ec91fbdc4a541cb89df689cd9ae 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,9 +39,9 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 	saddr = inet->inet_saddr;
 	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
 		if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
-			oif = inet->mc_index;
+			oif = READ_ONCE(inet->mc_index);
 		if (!saddr)
-			saddr = inet->mc_addr;
+			saddr = READ_ONCE(inet->mc_addr);
 	} else if (!oif) {
 		oif = READ_ONCE(inet->uc_index);
 	}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 1ee01ff64171c94b6b244589518a53ce807a212d..0b74ac49d6a6f82f5e8ffe5279dba3baf30f874e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1168,8 +1168,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 
 		if (!mreq.imr_ifindex) {
 			if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
-				inet->mc_index = 0;
-				inet->mc_addr  = 0;
+				WRITE_ONCE(inet->mc_index, 0);
+				WRITE_ONCE(inet->mc_addr, 0);
 				err = 0;
 				break;
 			}
@@ -1194,8 +1194,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 		    midx != sk->sk_bound_dev_if)
 			break;
 
-		inet->mc_index = mreq.imr_ifindex;
-		inet->mc_addr  = mreq.imr_address.s_addr;
+		WRITE_ONCE(inet->mc_index, mreq.imr_ifindex);
+		WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr);
 		err = 0;
 		break;
 	}
@@ -1673,19 +1673,11 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_UNICAST_IF:
 		val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
 		goto copyval;
-	}
-
-	if (needs_rtnl)
-		rtnl_lock();
-	sockopt_lock_sock(sk);
-
-	switch (optname) {
 	case IP_MULTICAST_IF:
 	{
 		struct in_addr addr;
 		len = min_t(unsigned int, len, sizeof(struct in_addr));
-		addr.s_addr = inet->mc_addr;
-		sockopt_release_sock(sk);
+		addr.s_addr = READ_ONCE(inet->mc_addr);
 
 		if (copy_to_sockptr(optlen, &len, sizeof(int)))
 			return -EFAULT;
@@ -1693,6 +1685,13 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 		return 0;
 	}
+	}
+
+	if (needs_rtnl)
+		rtnl_lock();
+	sockopt_lock_sock(sk);
+
+	switch (optname) {
 	case IP_MSFILTER:
 	{
 		struct ip_msfilter msf;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 66ad1f95af49f222afe0ee75b9163dd0af0a2c49..2c61f444e1c7d322e75e020c41af02977d8814f0 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -773,9 +773,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	if (ipv4_is_multicast(daddr)) {
 		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
-			ipc.oif = inet->mc_index;
+			ipc.oif = READ_ONCE(inet->mc_index);
 		if (!saddr)
-			saddr = inet->mc_addr;
+			saddr = READ_ONCE(inet->mc_addr);
 	} else if (!ipc.oif)
 		ipc.oif = READ_ONCE(inet->uc_index);
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index e2357d23202e5a39832bb1550c365de9a836c363..27da9d7294c0b4fb9027bb7feb704063dc6302db 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -579,9 +579,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	uc_index = READ_ONCE(inet->uc_index);
 	if (ipv4_is_multicast(daddr)) {
 		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
-			ipc.oif = inet->mc_index;
+			ipc.oif = READ_ONCE(inet->mc_index);
 		if (!saddr)
-			saddr = inet->mc_addr;
+			saddr = READ_ONCE(inet->mc_addr);
 	} else if (!ipc.oif) {
 		ipc.oif = uc_index;
 	} else if (ipv4_is_lbcast(daddr) && uc_index) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1e0c3aba1e5a88c7ba50a28511412a1710f1bab5..7f7724beca33781f8ff12750d1c9c9ccc420f481 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1177,9 +1177,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	uc_index = READ_ONCE(inet->uc_index);
 	if (ipv4_is_multicast(daddr)) {
 		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
-			ipc.oif = inet->mc_index;
+			ipc.oif = READ_ONCE(inet->mc_index);
 		if (!saddr)
-			saddr = inet->mc_addr;
+			saddr = READ_ONCE(inet->mc_addr);
 		connected = 0;
 	} else if (!ipc.oif) {
 		ipc.oif = uc_index;
-- 
2.42.0.515.g380fc7ccd1-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS
  2023-09-22  3:42 ` [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
@ 2023-09-22 13:07   ` David Ahern
  2023-10-18  3:37   ` Christoph Paasch
  1 sibling, 0 replies; 13+ messages in thread
From: David Ahern @ 2023-09-22 13:07 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, eric.dumazet

On 9/21/23 9:42 PM, Eric Dumazet wrote:
> Some reads of inet->tos are racy.
> 
> Add needed READ_ONCE() annotations and convert IP_TOS option lockless.
> 
> v2: missing changes in include/net/route.h (David Ahern)
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/net/ip.h                              |  3 +-
>  include/net/route.h                           |  4 +--
>  net/dccp/ipv4.c                               |  2 +-
>  net/ipv4/inet_diag.c                          |  2 +-
>  net/ipv4/ip_output.c                          |  4 +--
>  net/ipv4/ip_sockglue.c                        | 29 ++++++++-----------
>  net/ipv4/tcp_ipv4.c                           |  9 +++---
>  net/mptcp/sockopt.c                           |  8 ++---
>  net/sctp/protocol.c                           |  4 +--
>  .../selftests/net/mptcp/mptcp_connect.sh      |  2 +-
>  10 files changed, 31 insertions(+), 36 deletions(-)
> 

Reviewed-by: David Ahern <dsahern@kernel.org>



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 net-next 0/8] inet: more data-race fixes
  2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
                   ` (7 preceding siblings ...)
  2023-09-22  3:42 ` [PATCH v2 net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF) Eric Dumazet
@ 2023-10-01 18:40 ` patchwork-bot+netdevbpf
  8 siblings, 0 replies; 13+ messages in thread
From: patchwork-bot+netdevbpf @ 2023-10-01 18:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, kuba, pabeni, dsahern, netdev, eric.dumazet

Hello:

This series was applied to netdev/net-next.git (main)
by David S. Miller <davem@davemloft.net>:

On Fri, 22 Sep 2023 03:42:13 +0000 you wrote:
> This series fixes some existing data-races on inet fields:
> 
> inet->mc_ttl, inet->pmtudisc, inet->tos, inet->uc_index,
> inet->mc_index and inet->mc_addr.
> 
> While fixing them, we convert eight socket options
> to lockless implementation.
> 
> [...]

Here is the summary with links:
  - [v2,net-next,1/8] inet: implement lockless IP_MULTICAST_TTL
    https://git.kernel.org/netdev/net-next/c/c9746e6a19c2
  - [v2,net-next,2/8] inet: implement lockless IP_MTU_DISCOVER
    https://git.kernel.org/netdev/net-next/c/ceaa714138a3
  - [v2,net-next,3/8] inet: implement lockless IP_TOS
    https://git.kernel.org/netdev/net-next/c/e08d0b3d1723
  - [v2,net-next,4/8] inet: lockless getsockopt(IP_OPTIONS)
    https://git.kernel.org/netdev/net-next/c/a4725d0d8935
  - [v2,net-next,5/8] inet: lockless getsockopt(IP_MTU)
    https://git.kernel.org/netdev/net-next/c/3523bc91e4b4
  - [v2,net-next,6/8] inet: implement lockless getsockopt(IP_UNICAST_IF)
    https://git.kernel.org/netdev/net-next/c/959d5c11601b
  - [v2,net-next,7/8] inet: lockless IP_PKTOPTIONS implementation
    https://git.kernel.org/netdev/net-next/c/c4480eb5504c
  - [v2,net-next,8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF)
    https://git.kernel.org/netdev/net-next/c/02715925222c

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS
  2023-09-22  3:42 ` [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
  2023-09-22 13:07   ` David Ahern
@ 2023-10-18  3:37   ` Christoph Paasch
  2023-10-18  7:56     ` Eric Dumazet
  1 sibling, 1 reply; 13+ messages in thread
From: Christoph Paasch @ 2023-10-18  3:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, David Ahern, netdev,
	Eric Dumazet

Hello Eric,

> On Sep 21, 2023, at 8:42 PM, Eric Dumazet <edumazet@google.com> wrote:
> 
> Some reads of inet->tos are racy.
> 
> Add needed READ_ONCE() annotations and convert IP_TOS option lockless.
> 
> v2: missing changes in include/net/route.h (David Ahern)
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> include/net/ip.h                              |  3 +-
> include/net/route.h                           |  4 +--
> net/dccp/ipv4.c                               |  2 +-
> net/ipv4/inet_diag.c                          |  2 +-
> net/ipv4/ip_output.c                          |  4 +--
> net/ipv4/ip_sockglue.c                        | 29 ++++++++-----------
> net/ipv4/tcp_ipv4.c                           |  9 +++---
> net/mptcp/sockopt.c                           |  8 ++---
> net/sctp/protocol.c                           |  4 +--
> .../selftests/net/mptcp/mptcp_connect.sh      |  2 +-
> 10 files changed, 31 insertions(+), 36 deletions(-)

This patch causes a NULL-pointer deref in my syzkaller instances:

BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 12bad6067 P4D 12bad6067 PUD 12bad5067 PMD 0 
Oops: 0000 [#1] PREEMPT SMP
CPU: 1 PID: 2750 Comm: syz-executor.5 Not tainted 6.6.0-rc4-g7a5720a344e7 #49
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
RIP: 0010:tcp_get_metrics+0x118/0x8f0 net/ipv4/tcp_metrics.c:321
Code: c7 44 24 70 02 00 8b 03 89 44 24 48 c7 44 24 4c 00 00 00 00 66 c7 44 24 58 02 00 66 ba 02 00 b1 01 89 4c 24 04 4c 89 7c 24 10 <49> 8b 0f 48 8b 89 50 05 00 00 48 89 4c 24 30 33 81 00 02 00 00 69
RSP: 0018:ffffc90000af79b8 EFLAGS: 00010293
RAX: 000000000100007f RBX: ffff88812ae8f500 RCX: ffff88812b5f8f01
RDX: 0000000000000002 RSI: ffffffff8300f080 RDI: 0000000000000002
RBP: 0000000000000002 R08: 0000000000000003 R09: ffffffff8205eca0
R10: 0000000000000002 R11: ffff88812b5f8f00 R12: ffff88812a9e0580
R13: 0000000000000000 R14: ffff88812ae8fbd2 R15: 0000000000000000
FS: 00007f70a006b640(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000012bad7003 CR4: 0000000000170ee0
Call Trace:
<TASK>
tcp_fastopen_cache_get+0x32/0x140 net/ipv4/tcp_metrics.c:567
tcp_fastopen_cookie_check+0x28/0x180 net/ipv4/tcp_fastopen.c:419
tcp_connect+0x9c8/0x12a0 net/ipv4/tcp_output.c:3839
tcp_v4_connect+0x645/0x6e0 net/ipv4/tcp_ipv4.c:323
__inet_stream_connect+0x120/0x590 net/ipv4/af_inet.c:676
tcp_sendmsg_fastopen+0x2d6/0x3a0 net/ipv4/tcp.c:1021
tcp_sendmsg_locked+0x1957/0x1b00 net/ipv4/tcp.c:1073
tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1336
__sock_sendmsg+0x83/0xd0 net/socket.c:730
__sys_sendto+0x20a/0x2a0 net/socket.c:2194
__do_sys_sendto net/socket.c:2206 [inline]
__se_sys_sendto net/socket.c:2202 [inline]
__x64_sys_sendto+0x28/0x30 net/socket.c:2202
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x47/0xa0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x6e/0xd8

The reason is that setting IP_TOS calls sk_dst_reset(), which then causes these issues in the places where we assume that the dst in the socket is set (specifically, the tcp_connect() path).

Here is the syzkaller reproducer:

# {Threaded:true Repeat:true RepeatTimes:0 Procs:1 Slowdown:1 Sandbox:none SandboxArg:0 Leak:false NetInjection:false NetDevices:true NetReset:false Cgroups:false BinfmtMisc:false CloseFDs:true KCSAN:false DevlinkPCI:false NicVF:false USB:false VhciInjection:false Wifi:false IEEE802154:false Sysctl:false Swap:false UseTmpDir:false HandleSegv:false Repro:false Trace:false LegacyOptions:{Collide:false Fault:false FaultCall:0 FaultNth:0}}
r0 = socket$inet(0x2, 0x1, 0x0)
sendto$inet(r0, 0x0, 0x0, 0x20000841, &(0x7f0000000080)={0x2, 0x4e20, @dev={0xac, 0x14, 0x14, 0x15}}, 0x10) (async)
setsockopt$inet_int(r0, 0x0, 0x1, &(0x7f00000002c0)=0x81, 0x4)


Cheers,
Christoph

> 
> diff --git a/include/net/ip.h b/include/net/ip.h
> index 46933a0d98eac2db40c2e88006125588b8f8143e..6fbc0dcf4b9780d60b5e5d6f84d6017fbf57d0ae 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -258,7 +258,7 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
> 
> static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet)
> {
> - return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(inet->tos);
> + return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(READ_ONCE(inet->tos));
> }
> 
> /* datagram.c */
> @@ -810,6 +810,5 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val);
> void ip_sock_set_pktinfo(struct sock *sk);
> void ip_sock_set_recverr(struct sock *sk);
> void ip_sock_set_tos(struct sock *sk, int val);
> -void  __ip_sock_set_tos(struct sock *sk, int val);
> 
> #endif /* _IP_H */
> diff --git a/include/net/route.h b/include/net/route.h
> index 51a45b1887b562bfb473f9f8c50897d5d3073476..5c248a8e3d0e3ed757ad95f546032c2c49729eec 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -37,7 +37,7 @@
> 
> #define RTO_ONLINK 0x01
> 
> -#define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
> +#define RT_CONN_FLAGS(sk)   (RT_TOS(READ_ONCE(inet_sk(sk)->tos)) | sock_flag(sk, SOCK_LOCALROUTE))
> #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
> 
> static inline __u8 ip_sock_rt_scope(const struct sock *sk)
> @@ -50,7 +50,7 @@ static inline __u8 ip_sock_rt_scope(const struct sock *sk)
> 
> static inline __u8 ip_sock_rt_tos(const struct sock *sk)
> {
> - return RT_TOS(inet_sk(sk)->tos);
> + return RT_TOS(READ_ONCE(inet_sk(sk)->tos));
> }
> 
> struct ip_tunnel_info;
> diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
> index 69453b936bd557c77a790a27ff64cc91e5a58296..1b8cbfda6e5dbd098a58d92639a64bc8db83ff23 100644
> --- a/net/dccp/ipv4.c
> +++ b/net/dccp/ipv4.c
> @@ -511,7 +511,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
> err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
>    ireq->ir_rmt_addr,
>    rcu_dereference(ireq->ireq_opt),
> -    inet_sk(sk)->tos);
> +    READ_ONCE(inet_sk(sk)->tos));
> rcu_read_unlock();
> err = net_xmit_eval(err);
> }
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index e13a84433413ed88088435ff8e11efeb30fc3cca..1f2d7a8bd060e59baeb00fcb1c6aabfcb3bb213d 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -134,7 +134,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
> * hence this needs to be included regardless of socket family.
> */
> if (ext & (1 << (INET_DIAG_TOS - 1)))
> - if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
> + if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
> goto errout;
> 
> #if IS_ENABLED(CONFIG_IPV6)
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 2be281f184a5fe5a695ccd51fabe69fa45bea0b8..85320f92e8363d59e92c54139044cbab7e0561fa 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -544,7 +544,7 @@ EXPORT_SYMBOL(__ip_queue_xmit);
> 
> int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
> {
> - return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> + return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
> }
> EXPORT_SYMBOL(ip_queue_xmit);
> 
> @@ -1438,7 +1438,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
> iph = ip_hdr(skb);
> iph->version = 4;
> iph->ihl = 5;
> - iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
> + iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
> iph->frag_off = df;
> iph->ttl = ttl;
> iph->protocol = sk->sk_protocol;
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 6d874cc03c8b4e88d79ebc50a6db105606b6ae60..50c008efbb6de7303621dd30b178c90cb3f5a2fc 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -585,25 +585,20 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
> return err;
> }
> 
> -void __ip_sock_set_tos(struct sock *sk, int val)
> +void ip_sock_set_tos(struct sock *sk, int val)
> {
> + u8 old_tos = READ_ONCE(inet_sk(sk)->tos);
> +
> if (sk->sk_type == SOCK_STREAM) {
> val &= ~INET_ECN_MASK;
> - val |= inet_sk(sk)->tos & INET_ECN_MASK;
> + val |= old_tos & INET_ECN_MASK;
> }
> - if (inet_sk(sk)->tos != val) {
> - inet_sk(sk)->tos = val;
> + if (old_tos != val) {
> + WRITE_ONCE(inet_sk(sk)->tos, val);
> WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
> sk_dst_reset(sk);
> }
> }
> -
> -void ip_sock_set_tos(struct sock *sk, int val)
> -{
> - lock_sock(sk);
> - __ip_sock_set_tos(sk, val);
> - release_sock(sk);
> -}
> EXPORT_SYMBOL(ip_sock_set_tos);
> 
> void ip_sock_set_freebind(struct sock *sk)
> @@ -1050,6 +1045,9 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
> return 0;
> case IP_MTU_DISCOVER:
> return ip_sock_set_mtu_discover(sk, val);
> + case IP_TOS: /* This sets both TOS and Precedence */
> + ip_sock_set_tos(sk, val);
> + return 0;
> }
> 
> err = 0;
> @@ -1104,9 +1102,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
> }
> }
> break;
> - case IP_TOS: /* This sets both TOS and Precedence */
> - __ip_sock_set_tos(sk, val);
> - break;
> case IP_UNICAST_IF:
> {
> struct net_device *dev = NULL;
> @@ -1593,6 +1588,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
> case IP_MTU_DISCOVER:
> val = READ_ONCE(inet->pmtudisc);
> goto copyval;
> + case IP_TOS:
> + val = READ_ONCE(inet->tos);
> + goto copyval;
> }
> 
> if (needs_rtnl)
> @@ -1629,9 +1627,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
> return -EFAULT;
> return 0;
> }
> - case IP_TOS:
> - val = inet->tos;
> - break;
> case IP_MTU:
> {
> struct dst_entry *dst;
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index f13eb7e23d03f3681055257e6ebea0612ae3f9b3..1f89ba58e71eff74d8ed75019de9e70d2f4d5926 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1024,10 +1024,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
> if (skb) {
> __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
> 
> - tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
> - (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
> - (inet_sk(sk)->tos & INET_ECN_MASK) :
> - inet_sk(sk)->tos;
> + tos = READ_ONCE(inet_sk(sk)->tos);
> +
> + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
> + tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
> +      (tos & INET_ECN_MASK);
> 
> if (!INET_ECN_is_capable(tos) &&
>    tcp_bpf_ca_needs_ecn((struct sock *)req))
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index 8260202c00669fd7d2eed2f94a3c2cf225a0d89c..155e8472ba9b83c35c6f827b2bb35c0be4127917 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -734,11 +734,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
> 
> lock_sock(sk);
> sockopt_seq_inc(msk);
> - val = inet_sk(sk)->tos;
> + val = READ_ONCE(inet_sk(sk)->tos);
> mptcp_for_each_subflow(msk, subflow) {
> struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> 
> - __ip_sock_set_tos(ssk, val);
> + ip_sock_set_tos(ssk, val);
> }
> release_sock(sk);
> 
> @@ -1343,7 +1343,7 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
> 
> switch (optname) {
> case IP_TOS:
> - return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
> + return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
> }
> 
> return -EOPNOTSUPP;
> @@ -1411,7 +1411,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
> ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
> ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
> ssk->sk_ipv6only = sk->sk_ipv6only;
> - __ip_sock_set_tos(ssk, inet_sk(sk)->tos);
> + ip_sock_set_tos(ssk, inet_sk(sk)->tos);
> 
> if (sk->sk_userlocks & tx_rx_locks) {
> ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 2185f44198deb002bc8ed7f1b0f3fe02d6bb9f09..94c6dd53cd62d1fa6236d07946e8d5ff68eb587d 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -426,7 +426,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
> struct dst_entry *dst = NULL;
> union sctp_addr *daddr = &t->ipaddr;
> union sctp_addr dst_saddr;
> - __u8 tos = inet_sk(sk)->tos;
> + u8 tos = READ_ONCE(inet_sk(sk)->tos);
> 
> if (t->dscp & SCTP_DSCP_SET_MASK)
> tos = t->dscp & SCTP_DSCP_VAL_MASK;
> @@ -1057,7 +1057,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
> struct flowi4 *fl4 = &t->fl.u.ip4;
> struct sock *sk = skb->sk;
> struct inet_sock *inet = inet_sk(sk);
> - __u8 dscp = inet->tos;
> + __u8 dscp = READ_ONCE(inet->tos);
> __be16 df = 0;
> 
> pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
> diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
> index b1fc8afd072dc6ddde8d561a675a5549a9a37dba..61a2a1988ce69ffa17e0dd8e629eac550f4f7d99 100755
> --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
> +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
> @@ -716,7 +716,7 @@ run_test_transparent()
> # the required infrastructure in MPTCP sockopt code. To support TOS, the
> # following function has been exported (T). Not great but better than
> # checking for a specific kernel version.
> - if ! mptcp_lib_kallsyms_has "T __ip_sock_set_tos$"; then
> + if ! mptcp_lib_kallsyms_has "T ip_sock_set_tos$"; then
> echo "INFO: ${msg} not supported by the kernel: SKIP"
> mptcp_lib_result_skip "${TEST_GROUP}"
> return
> -- 
> 2.42.0.515.g380fc7ccd1-goog
> 
> 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS
  2023-10-18  3:37   ` Christoph Paasch
@ 2023-10-18  7:56     ` Eric Dumazet
  0 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2023-10-18  7:56 UTC (permalink / raw)
  To: Christoph Paasch
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, David Ahern, netdev,
	Eric Dumazet

On Wed, Oct 18, 2023 at 5:37 AM Christoph Paasch
<christophpaasch@icloud.com> wrote:
>
> Hello Eric,
>
> > On Sep 21, 2023, at 8:42 PM, Eric Dumazet <edumazet@google.com> wrote:
> >
> > Some reads of inet->tos are racy.
> >
> > Add needed READ_ONCE() annotations and convert the IP_TOS option to a lockless implementation.
> >
> > v2: missing changes in include/net/route.h (David Ahern)
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> > include/net/ip.h                              |  3 +-
> > include/net/route.h                           |  4 +--
> > net/dccp/ipv4.c                               |  2 +-
> > net/ipv4/inet_diag.c                          |  2 +-
> > net/ipv4/ip_output.c                          |  4 +--
> > net/ipv4/ip_sockglue.c                        | 29 ++++++++-----------
> > net/ipv4/tcp_ipv4.c                           |  9 +++---
> > net/mptcp/sockopt.c                           |  8 ++---
> > net/sctp/protocol.c                           |  4 +--
> > .../selftests/net/mptcp/mptcp_connect.sh      |  2 +-
> > 10 files changed, 31 insertions(+), 36 deletions(-)
>
> This patch causes a NULL-pointer deref in my syzkaller instances:
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 12bad6067 P4D 12bad6067 PUD 12bad5067 PMD 0
> Oops: 0000 [#1] PREEMPT SMP
> CPU: 1 PID: 2750 Comm: syz-executor.5 Not tainted 6.6.0-rc4-g7a5720a344e7 #49
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
> RIP: 0010:tcp_get_metrics+0x118/0x8f0 net/ipv4/tcp_metrics.c:321
> Code: c7 44 24 70 02 00 8b 03 89 44 24 48 c7 44 24 4c 00 00 00 00 66 c7 44 24 58 02 00 66 ba 02 00 b1 01 89 4c 24 04 4c 89 7c 24 10 <49> 8b 0f 48 8b 89 50 05 00 00 48 89 4c 24 30 33 81 00 02 00 00 69
> RSP: 0018:ffffc90000af79b8 EFLAGS: 00010293
> RAX: 000000000100007f RBX: ffff88812ae8f500 RCX: ffff88812b5f8f01
> RDX: 0000000000000002 RSI: ffffffff8300f080 RDI: 0000000000000002
> RBP: 0000000000000002 R08: 0000000000000003 R09: ffffffff8205eca0
> R10: 0000000000000002 R11: ffff88812b5f8f00 R12: ffff88812a9e0580
> R13: 0000000000000000 R14: ffff88812ae8fbd2 R15: 0000000000000000
> FS: 00007f70a006b640(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000000 CR3: 000000012bad7003 CR4: 0000000000170ee0
> Call Trace:
> <TASK>
> tcp_fastopen_cache_get+0x32/0x140 net/ipv4/tcp_metrics.c:567
> tcp_fastopen_cookie_check+0x28/0x180 net/ipv4/tcp_fastopen.c:419
> tcp_connect+0x9c8/0x12a0 net/ipv4/tcp_output.c:3839
> tcp_v4_connect+0x645/0x6e0 net/ipv4/tcp_ipv4.c:323
> __inet_stream_connect+0x120/0x590 net/ipv4/af_inet.c:676
> tcp_sendmsg_fastopen+0x2d6/0x3a0 net/ipv4/tcp.c:1021
> tcp_sendmsg_locked+0x1957/0x1b00 net/ipv4/tcp.c:1073
> tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1336
> __sock_sendmsg+0x83/0xd0 net/socket.c:730
> __sys_sendto+0x20a/0x2a0 net/socket.c:2194
> __do_sys_sendto net/socket.c:2206 [inline]
> __se_sys_sendto net/socket.c:2202 [inline]
> __x64_sys_sendto+0x28/0x30 net/socket.c:2202
> do_syscall_x64 arch/x86/entry/common.c:50 [inline]
> do_syscall_64+0x47/0xa0 arch/x86/entry/common.c:80
> entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
> The reason is that setting IP_TOS calls sk_dst_reset(), which then causes these issues in the places where we assume that the dst in the socket is set (specifically, the tcp_connect() path).
>

Thanks for the report.

You are right: too many places calling __sk_dst_get() would have to
properly use RCU, and
this does not seem worth the pain.

I will send a fix.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2023-10-18  7:56 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-22  3:42 [PATCH v2 net-next 0/8] inet: more data-race fixes Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 1/8] inet: implement lockless IP_MULTICAST_TTL Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 2/8] inet: implement lockless IP_MTU_DISCOVER Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
2023-09-22 13:07   ` David Ahern
2023-10-18  3:37   ` Christoph Paasch
2023-10-18  7:56     ` Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 4/8] inet: lockless getsockopt(IP_OPTIONS) Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 5/8] inet: lockless getsockopt(IP_MTU) Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF) Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 7/8] inet: lockless IP_PKTOPTIONS implementation Eric Dumazet
2023-09-22  3:42 ` [PATCH v2 net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF) Eric Dumazet
2023-10-01 18:40 ` [PATCH v2 net-next 0/8] inet: more data-race fixes patchwork-bot+netdevbpf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.