All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: stable@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	patches@lists.linux.dev, Kuniyuki Iwashima <kuniyu@amazon.com>,
	Jakub Kicinski <kuba@kernel.org>
Subject: [PATCH 4.14 26/40] tcp/udp: Make early_demux back namespacified.
Date: Tue,  8 Nov 2022 14:39:11 +0100	[thread overview]
Message-ID: <20221108133329.363720341@linuxfoundation.org> (raw)
In-Reply-To: <20221108133328.351887714@linuxfoundation.org>

From: Kuniyuki Iwashima <kuniyu@amazon.com>

commit 11052589cf5c0bab3b4884d423d5f60c38fcf25d upstream.

Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made
it possible to enable/disable early_demux on a per-netns basis.  Then, we
introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for
TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for
tcp and udp").  However, the .proc_handler() was wrong and actually
disabled us from changing the behaviour in each netns.

We can execute early_demux if net.ipv4.ip_early_demux is on and each proto
.early_demux() handler is not NULL.  When we toggle (tcp|udp)_early_demux,
the change itself is saved in each netns variable, but the .early_demux()
handler is a global variable, so the handler is switched based on the
init_net's sysctl variable.  Thus, netns (tcp|udp)_early_demux knobs have
nothing to do with the logic.  Whether we CAN execute proto .early_demux()
is always decided by init_net's sysctl knob, and whether we DO it or not is
by each netns ip_early_demux knob.

This patch namespacifies (tcp|udp)_early_demux again.  For now, the users
of the .early_demux() handler are TCP and UDP only, and they are called
directly to avoid retpoline.  So, we can remove the .early_demux() handler
from inet6?_protos and need not dereference them in ip6?_rcv_finish_core().
If another proto needs .early_demux(), we can restore it at that time.

Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/net/protocol.h     |    4 ---
 include/net/tcp.h          |    2 +
 include/net/udp.h          |    1 
 net/ipv4/af_inet.c         |   14 +---------
 net/ipv4/ip_input.c        |   32 ++++++++++++++++--------
 net/ipv4/sysctl_net_ipv4.c |   59 +--------------------------------------------
 net/ipv6/ip6_input.c       |   23 +++++++++++------
 net/ipv6/tcp_ipv6.c        |    9 +-----
 net/ipv6/udp.c             |    9 +-----
 9 files changed, 47 insertions(+), 106 deletions(-)

--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -39,8 +39,6 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb);
-	int			(*early_demux_handler)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	unsigned int		no_policy:1,
@@ -54,8 +52,6 @@ struct net_protocol {
 
 #if IS_ENABLED(CONFIG_IPV6)
 struct inet6_protocol {
-	void	(*early_demux)(struct sk_buff *skb);
-	void    (*early_demux_handler)(struct sk_buff *skb);
 	int	(*handler)(struct sk_buff *skb);
 
 	void	(*err_handler)(struct sk_buff *skb,
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -890,6 +890,8 @@ static inline int tcp_v6_sdif(const stru
 #endif
 	return 0;
 }
+
+void tcp_v6_early_demux(struct sk_buff *skb);
 #endif
 
 static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -173,6 +173,7 @@ typedef struct sock *(*udp_lookup_t)(str
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh, udp_lookup_t lookup);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
+void udp_v6_early_demux(struct sk_buff *skb);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1608,12 +1608,7 @@ static const struct net_protocol igmp_pr
 };
 #endif
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol tcp_protocol = {
-	.early_demux	=	tcp_v4_early_demux,
-	.early_demux_handler =  tcp_v4_early_demux,
+static const struct net_protocol tcp_protocol = {
 	.handler	=	tcp_v4_rcv,
 	.err_handler	=	tcp_v4_err,
 	.no_policy	=	1,
@@ -1621,12 +1616,7 @@ static struct net_protocol tcp_protocol
 	.icmp_strict_tag_validation = 1,
 };
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol udp_protocol = {
-	.early_demux =	udp_v4_early_demux,
-	.early_demux_handler =	udp_v4_early_demux,
+static const struct net_protocol udp_protocol = {
 	.handler =	udp_rcv,
 	.err_handler =	udp_err,
 	.no_policy =	1,
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,10 +307,11 @@ drop:
 	return true;
 }
 
+int udp_v4_early_demux(struct sk_buff *);
+int tcp_v4_early_demux(struct sk_buff *);
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	int (*edemux)(struct sk_buff *skb);
 	struct net_device *dev = skb->dev;
 	struct rtable *rt;
 	int err;
@@ -322,20 +323,29 @@ static int ip_rcv_finish(struct net *net
 	if (!skb)
 		return NET_RX_SUCCESS;
 
-	if (net->ipv4.sysctl_ip_early_demux &&
+	if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
 	    !ip_is_fragment(iph)) {
-		const struct net_protocol *ipprot;
-		int protocol = iph->protocol;
+		switch (iph->protocol) {
+		case IPPROTO_TCP:
+			if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+				tcp_v4_early_demux(skb);
 
-		ipprot = rcu_dereference(inet_protos[protocol]);
-		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
-			err = edemux(skb);
-			if (unlikely(err))
-				goto drop_error;
-			/* must reload iph, skb->head might have changed */
-			iph = ip_hdr(skb);
+				/* must reload iph, skb->head might have changed */
+				iph = ip_hdr(skb);
+			}
+			break;
+		case IPPROTO_UDP:
+			if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+				err = udp_v4_early_demux(skb);
+				if (unlikely(err))
+					goto drop_error;
+
+				/* must reload iph, skb->head might have changed */
+				iph = ip_hdr(skb);
+			}
+			break;
 		}
 	}
 
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -311,61 +311,6 @@ bad_key:
 	return ret;
 }
 
-static void proc_configure_early_demux(int enabled, int protocol)
-{
-	struct net_protocol *ipprot;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct inet6_protocol *ip6prot;
-#endif
-
-	rcu_read_lock();
-
-	ipprot = rcu_dereference(inet_protos[protocol]);
-	if (ipprot)
-		ipprot->early_demux = enabled ? ipprot->early_demux_handler :
-						NULL;
-
-#if IS_ENABLED(CONFIG_IPV6)
-	ip6prot = rcu_dereference(inet6_protos[protocol]);
-	if (ip6prot)
-		ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
-						 NULL;
-#endif
-	rcu_read_unlock();
-}
-
-static int proc_tcp_early_demux(struct ctl_table *table, int write,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
-	if (write && !ret) {
-		int enabled = init_net.ipv4.sysctl_tcp_early_demux;
-
-		proc_configure_early_demux(enabled, IPPROTO_TCP);
-	}
-
-	return ret;
-}
-
-static int proc_udp_early_demux(struct ctl_table *table, int write,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
-	if (write && !ret) {
-		int enabled = init_net.ipv4.sysctl_udp_early_demux;
-
-		proc_configure_early_demux(enabled, IPPROTO_UDP);
-	}
-
-	return ret;
-}
-
 static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
 					     int write,
 					     void __user *buffer,
@@ -853,14 +798,14 @@ static struct ctl_table ipv4_net_table[]
 		.data           = &init_net.ipv4.sysctl_udp_early_demux,
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
-		.proc_handler   = proc_udp_early_demux
+		.proc_handler   = proc_douintvec,
 	},
 	{
 		.procname       = "tcp_early_demux",
 		.data           = &init_net.ipv4.sysctl_tcp_early_demux,
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
-		.proc_handler   = proc_tcp_early_demux
+		.proc_handler   = proc_douintvec,
 	},
 	{
 		.procname	= "ip_default_ttl",
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -47,10 +47,10 @@
 #include <net/inet_ecn.h>
 #include <net/dst_metadata.h>
 
+void udp_v6_early_demux(struct sk_buff *);
+void tcp_v6_early_demux(struct sk_buff *);
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	void (*edemux)(struct sk_buff *skb);
-
 	/* if ingress device is enslaved to an L3 master device pass the
 	 * skb to its handler for processing
 	 */
@@ -58,13 +58,20 @@ int ip6_rcv_finish(struct net *net, stru
 	if (!skb)
 		return NET_RX_SUCCESS;
 
-	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
-		const struct inet6_protocol *ipprot;
-
-		ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
-		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
-			edemux(skb);
+	if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+	    !skb_dst(skb) && !skb->sk) {
+		switch (ipv6_hdr(skb)->nexthdr) {
+		case IPPROTO_TCP:
+			if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
+				tcp_v6_early_demux(skb);
+			break;
+		case IPPROTO_UDP:
+			if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
+				udp_v6_early_demux(skb);
+			break;
+		}
 	}
+
 	if (!skb_valid_dst(skb))
 		ip6_route_input(skb);
 
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1635,7 +1635,7 @@ do_time_wait:
 	goto discard_it;
 }
 
-static void tcp_v6_early_demux(struct sk_buff *skb)
+void tcp_v6_early_demux(struct sk_buff *skb)
 {
 	const struct ipv6hdr *hdr;
 	const struct tcphdr *th;
@@ -1991,12 +1991,7 @@ struct proto tcpv6_prot = {
 	.diag_destroy		= tcp_abort,
 };
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol tcpv6_protocol = {
-	.early_demux	=	tcp_v6_early_demux,
-	.early_demux_handler =  tcp_v6_early_demux,
+static const struct inet6_protocol tcpv6_protocol = {
 	.handler	=	tcp_v6_rcv,
 	.err_handler	=	tcp_v6_err,
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -932,7 +932,7 @@ static struct sock *__udp6_lib_demux_loo
 	return NULL;
 }
 
-static void udp_v6_early_demux(struct sk_buff *skb)
+void udp_v6_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct udphdr *uh;
@@ -1491,12 +1491,7 @@ int compat_udpv6_getsockopt(struct sock
 }
 #endif
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol udpv6_protocol = {
-	.early_demux	=	udp_v6_early_demux,
-	.early_demux_handler =  udp_v6_early_demux,
+static const struct inet6_protocol udpv6_protocol = {
 	.handler	=	udpv6_rcv,
 	.err_handler	=	udpv6_err,
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,



  parent reply	other threads:[~2022-11-08 13:43 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-08 13:38 [PATCH 4.14 00/40] 4.14.299-rc1 review Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 01/40] NFSv4.1: Handle RECLAIM_COMPLETE trunking errors Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 02/40] NFSv4.1: We must always send RECLAIM_COMPLETE after a reboot Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 03/40] nfs4: Fix kmemleak when allocate slot failed Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 04/40] net: dsa: Fix possible memory leaks in dsa_loop_init() Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 05/40] nfc: s3fwrn5: Fix potential memory leak in s3fwrn5_nci_send() Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 06/40] nfc: nfcmrvl: Fix potential memory leak in nfcmrvl_i2c_nci_send() Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 07/40] net: fec: fix improper use of NETDEV_TX_BUSY Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 08/40] ata: pata_legacy: fix pdc20230_set_piomode() Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 09/40] net: sched: Fix use after free in red_enqueue() Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 10/40] ipvs: use explicitly signed chars Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 11/40] rose: Fix NULL pointer dereference in rose_send_frame() Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 12/40] mISDN: fix possible memory leak in mISDN_register_device() Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 13/40] isdn: mISDN: netjet: fix wrong check of device registration Greg Kroah-Hartman
2022-11-08 13:38 ` [PATCH 4.14 14/40] btrfs: fix inode list leak during backref walking at resolve_indirect_refs() Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 15/40] btrfs: fix ulist leaks in error paths of qgroup self tests Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 16/40] Bluetooth: L2CAP: Fix use-after-free caused by l2cap_reassemble_sdu Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 17/40] Bluetooth: L2CAP: fix use-after-free in l2cap_conn_del() Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 18/40] net: mdio: fix undefined behavior in bit shift for __mdiobus_register Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 19/40] net, neigh: Fix null-ptr-deref in neigh_table_clear() Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 20/40] media: s5p_cec: limit msg.len to CEC_MAX_MSG_SIZE Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 21/40] media: dvb-frontends/drxk: initialize err to 0 Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 22/40] i2c: xiic: Add platform module alias Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 23/40] Bluetooth: L2CAP: Fix attempting to access uninitialized memory Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 24/40] block, bfq: protect bfqd->queued by bfqd->lock Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 25/40] btrfs: fix type of parameter generation in btrfs_get_dentry Greg Kroah-Hartman
2022-11-08 13:39 ` Greg Kroah-Hartman [this message]
2022-11-08 13:39 ` [PATCH 4.14 27/40] capabilities: fix potential memleak on error path from vfs_getxattr_alloc() Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 28/40] ALSA: usb-audio: Add quirks for MacroSilicon MS2100/MS2106 devices Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 29/40] efi: random: reduce seed size to 32 bytes Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 30/40] parisc: Make 8250_gsc driver dependend on CONFIG_PARISC Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 31/40] parisc: Export iosapic_serial_irq() symbol for serial port driver Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 32/40] ext4: fix warning in ext4_da_release_space Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 33/40] KVM: x86: Mask off reserved bits in CPUID.80000008H Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 34/40] KVM: x86: emulator: em_sysexit should update ctxt->mode Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 35/40] KVM: x86: emulator: introduce emulator_recalc_and_set_mode Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 36/40] KVM: x86: emulator: update the emulation mode after CR0 write Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 37/40] linux/const.h: prefix include guard of uapi/linux/const.h with _UAPI Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 38/40] linux/const.h: move UL() macro to include/linux/const.h Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 39/40] linux/bits.h: make BIT(), GENMASK(), and friends available in assembly Greg Kroah-Hartman
2022-11-08 13:39 ` [PATCH 4.14 40/40] wifi: brcmfmac: Fix potential buffer overflow in brcmf_fweh_event_worker() Greg Kroah-Hartman
2022-11-09  2:58 ` [PATCH 4.14 00/40] 4.14.299-rc1 review Guenter Roeck
2022-11-09 10:47 ` Jon Hunter
2022-11-09 15:46 ` Naresh Kamboju

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221108133329.363720341@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=kuba@kernel.org \
    --cc=kuniyu@amazon.com \
    --cc=patches@lists.linux.dev \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.