* [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19 23:39 UTC (permalink / raw)
  To: netdev


Input packet processing for local sockets involves two major demuxes.
One for the route and one for the socket.

But we can optimize this down to one demux for certain kinds of local
sockets.

Currently we only do this for established TCP sockets, but it could
at least in theory be expanded to other kinds of connections.

If a TCP socket is established, then its identity is fully specified.

This means that whatever input route was used during the three-way
handshake must work equally well for the rest of the connection since
the keys will not change.

Once we move to the established state, we cache the received packet's
input route to use later.

Like the existing cached route in sk->sk_dst_cache used for output
packets, we have to check for route invalidations using dst->obsolete
and dst->ops->check().

Early demux occurs outside of a socket locked section, so when a route
invalidation occurs we defer the fixup of sk->sk_rx_dst until we are
actually inside of established state packet processing and thus have
the socket locked.

Signed-off-by: David S. Miller <davem@davemloft.net>
---

Changes since v1:

1) Remove unlikely() from __inet_lookup_skb()

2) Check for cached route invalidations.

3) Hook up the RX dst when an outgoing connection moves to established
   too; previously only incoming connections were handled.
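
In outline, the split of work around the socket lock looks like this
(a condensation of the diff below, not a literal quote of it):

	/* Early demux, no socket lock held: only _use_ a cached route
	 * that is still valid, and take no new reference on it.
	 */
	dst = sk->sk_rx_dst;
	if (dst)
		dst = dst_check(dst, 0);	/* NULL once invalidated */
	if (dst)
		skb_dst_set_noref(skb, dst);

	/* tcp_rcv_established(), socket locked: repair the cache. */
	if (sk->sk_rx_dst && unlikely(sk->sk_rx_dst->obsolete) &&
	    sk->sk_rx_dst->ops->check(sk->sk_rx_dst, 0) == NULL) {
		dst_release(sk->sk_rx_dst);
		sk->sk_rx_dst = NULL;
	}
	if (sk->sk_rx_dst == NULL)
		sk->sk_rx_dst = dst_clone(skb_dst(skb));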

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 808fc5f..54be028 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     const __be16 sport,
 					     const __be16 dport)
 {
-	struct sock *sk;
+	struct sock *sk = skb_steal_sock(skb);
 	const struct iphdr *iph = ip_hdr(skb);
 
-	if (unlikely(sk = skb_steal_sock(skb)))
+	if (sk)
 		return sk;
 	else
 		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 875f489..6c47bf8 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -34,6 +34,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4a45216..87b424a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,7 @@ struct sock {
 	unsigned long 		sk_flags;
 	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
+	struct dst_entry	*sk_rx_dst;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_edemux(struct sk_buff *skb);
 
 extern int			sock_setsockopt(struct socket *sock, int level,
 						int op, char __user *optval,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9332f34..6660ffc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e5b71f..929bdcc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
 
 int sock_i_uid(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00..a2bd2d2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -1520,14 +1521,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.handler =	tcp_v4_rcv,
-	.err_handler =	tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment =	tcp_tso_segment,
-	.gro_receive =	tcp4_gro_receive,
-	.gro_complete =	tcp4_gro_complete,
-	.no_policy =	1,
-	.netns_ok =	1,
+	.early_demux	=	tcp_v4_early_demux,
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.gso_send_check	=	tcp_v4_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp4_gro_receive,
+	.gro_complete	=	tcp4_gro_complete,
+	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144..cb883e1 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
-		if (unlikely(err)) {
-			if (err == -EHOSTUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INADDRERRORS);
-			else if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INNOROUTES);
-			else if (err == -EXDEV)
-				NET_INC_STATS_BH(dev_net(skb->dev),
-						 LINUX_MIB_IPRPFILTER);
-			goto drop;
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+		int hash, err;
+
+		hash = protocol & (MAX_INET_PROTOS - 1);
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[hash]);
+		err = -ENOENT;
+		if (ipprot && ipprot->early_demux)
+			err = ipprot->early_demux(skb);
+		rcu_read_unlock();
+
+		if (err) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb->dev);
+			if (unlikely(err)) {
+				if (err == -EHOSTUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INADDRERRORS);
+				else if (err == -ENETUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INNOROUTES);
+				else if (err == -EXDEV)
+					NET_INC_STATS_BH(dev_net(skb->dev),
+							 LINUX_MIB_IPRPFILTER);
+				goto drop;
+			}
 		}
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8..8416f8a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	int res;
 
+	if (sk->sk_rx_dst) {
+		struct dst_entry *dst = sk->sk_rx_dst;
+		if (unlikely(dst->obsolete)) {
+			if (dst->ops->check(dst, 0) == NULL) {
+				dst_release(dst);
+				sk->sk_rx_dst = NULL;
+			}
+		}
+	}
+	if (unlikely(sk->sk_rx_dst == NULL))
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
-	if (skb != NULL)
+	if (skb != NULL) {
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
 		security_inet_conn_established(sk, skb);
+	}
 
 	/* Make sure socket is routed, for correct metrics.  */
 	icsk->icsk_af_ops->rebuild_header(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca1..13857df 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,52 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+	int err;
+
+	err = -ENOENT;
+	if (skb->pkt_type != PACKET_HOST)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+		goto out_err;
+
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+		goto out_err;
+
+	sk = __inet_lookup_established(net, &tcp_hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, th->dest,
+				       skb->dev->ifindex);
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk->sk_state != TCP_TIME_WAIT) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst)
+				dst = dst_check(dst, 0);
+			if (dst) {
+				skb_dst_set_noref(skb, dst);
+				err = 0;
+			}
+		}
+	}
+
+out_err:
+	return err;
+}
+
 /*
  *	From tcp_input.c
  */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb01531..72b7c63 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
+		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Ben Hutchings @ 2012-06-20  0:21 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Tue, 2012-06-19 at 16:39 -0700, David Miller wrote:
> Input packet processing for local sockets involves two major demuxes.
> One for the route and one for the socket.
> 
> But we can optimize this down to one demux for certain kinds of local
> sockets.
[...]
> --- a/net/ipv4/ip_input.c
> +++ b/net/ipv4/ip_input.c
> @@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
>  	 *	how the packet travels inside Linux networking.
>  	 */
>  	if (skb_dst(skb) == NULL) {
> -		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> -					       iph->tos, skb->dev);
> -		if (unlikely(err)) {
> -			if (err == -EHOSTUNREACH)
> -				IP_INC_STATS_BH(dev_net(skb->dev),
> -						IPSTATS_MIB_INADDRERRORS);
> -			else if (err == -ENETUNREACH)
> -				IP_INC_STATS_BH(dev_net(skb->dev),
> -						IPSTATS_MIB_INNOROUTES);
> -			else if (err == -EXDEV)
> -				NET_INC_STATS_BH(dev_net(skb->dev),
> -						 LINUX_MIB_IPRPFILTER);
> -			goto drop;
> +		const struct net_protocol *ipprot;
> +		int protocol = iph->protocol;
> +		int hash, err;
> +
> +		hash = protocol & (MAX_INET_PROTOS - 1);
[...]

This 'hashing' threw me when I read v1, because nowhere do we actually
check that the protocol (as opposed to hash) matches that for the
selected ipprot.  (And this also turns out to be true for the current
receive path.)

This works only because MAX_INET_PROTOS is defined as 256, so that hash
== protocol.  If we were ever to change MAX_INET_PROTOS then we would
need to add a whole lot of protocol checks, but this isn't particularly
obvious!  Perhaps it would be better to remove the 'hashing' altogether?
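
That is, with MAX_INET_PROTOS == 256 the mask is 0xff, so the 'hash' is
the identity for every possible 8-bit protocol value:

	int hash = protocol & (MAX_INET_PROTOS - 1);	/* == protocol */
	ipprot = rcu_dereference(inet_protos[hash]);	/* really [protocol] */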

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  0:54 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 20 Jun 2012 01:21:06 +0100

> This works only because MAX_INET_PROTOS is defined as 256, so that hash
> == protocol.  If we were ever to change MAX_INET_PROTOS then we would
> need to add a whole lot of protocol checks, but this isn't particularly
> obvious!  Perhaps it would be better to remove the 'hashing' altogether?

We never have changed the value and we never will.  The hash is
perfect, what's the big deal?


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Ben Hutchings @ 2012-06-20  1:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Tue, 2012-06-19 at 17:54 -0700, David Miller wrote:
> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Wed, 20 Jun 2012 01:21:06 +0100
> 
> > This works only because MAX_INET_PROTOS is defined as 256, so that hash
> > == protocol.  If we were ever to change MAX_INET_PROTOS then we would
> > need to add a whole lot of protocol checks, but this isn't particularly
> > obvious!  Perhaps it would be better to remove the 'hashing' altogether?
> 
> We never have changed the value and we never will.

Well that's what I expected.

> The hash is perfect, what's the big deal?

It obscures what we're really doing and relying on.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  1:05 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 20 Jun 2012 02:03:26 +0100

> On Tue, 2012-06-19 at 17:54 -0700, David Miller wrote:
>> The hash is perfect, what's the big deal?
> 
> It obscures what we're really doing and relying on.

If it matters to you, patches are always welcome :-)


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  2:02 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev

From: David Miller <davem@davemloft.net>
Date: Tue, 19 Jun 2012 18:05:27 -0700 (PDT)

> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Wed, 20 Jun 2012 02:03:26 +0100
> 
>> On Tue, 2012-06-19 at 17:54 -0700, David Miller wrote:
>>> The hash is perfect, what's the big deal?
>> 
>> It obscures what we're really doing and relying on.
> 
> If it matters to you, patches are always welcome :-)

Nevermind, I just committed the following to net-next:

--------------------
inet: Sanitize inet{,6} protocol demux.

Don't pretend that inet_protos[] and inet6_protos[] are hashes; they
are just straight arrays.  Remove all unnecessary hash masking.

Document MAX_INET_PROTOS.

Use RAW_HTABLE_SIZE when appropriate.

Reported-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h |    7 +++++--
 net/ipv4/af_inet.c     |   26 ++++++++++++--------------
 net/ipv4/icmp.c        |    9 ++++-----
 net/ipv4/ip_input.c    |    5 ++---
 net/ipv4/protocol.c    |    8 +++-----
 net/ipv6/icmp.c        |    7 ++-----
 net/ipv6/ip6_input.c   |    9 +++------
 net/ipv6/protocol.c    |    8 +++-----
 net/ipv6/raw.c         |    4 ++--
 9 files changed, 36 insertions(+), 47 deletions(-)

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 875f489..a1b1b53 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -29,8 +29,11 @@
 #include <linux/ipv6.h>
 #endif
 
-#define MAX_INET_PROTOS	256		/* Must be a power of 2		*/
-
+/* This is one larger than the largest protocol value that can be
+ * found in an ipv4 or ipv6 header.  Since in both cases the protocol
+ * value is presented in a __u8, this is defined to be 256.
+ */
+#define MAX_INET_PROTOS		256
 
 /* This is used to register protocols. */
 struct net_protocol {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00..85a3b17 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -242,20 +242,18 @@ void build_ehash_secret(void)
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
-static inline int inet_netns_ok(struct net *net, int protocol)
+static inline int inet_netns_ok(struct net *net, __u8 protocol)
 {
-	int hash;
 	const struct net_protocol *ipprot;
 
 	if (net_eq(net, &init_net))
 		return 1;
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
-	ipprot = rcu_dereference(inet_protos[hash]);
-
-	if (ipprot == NULL)
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot == NULL) {
 		/* raw IP is OK */
 		return 1;
+	}
 	return ipprot->netns_ok;
 }
 
@@ -1216,8 +1214,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 
 static int inet_gso_send_check(struct sk_buff *skb)
 {
-	const struct iphdr *iph;
 	const struct net_protocol *ops;
+	const struct iphdr *iph;
 	int proto;
 	int ihl;
 	int err = -EINVAL;
@@ -1236,7 +1234,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	err = -EPROTONOSUPPORT;
 
 	rcu_read_lock();
@@ -1253,8 +1251,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	struct iphdr *iph;
 	const struct net_protocol *ops;
+	struct iphdr *iph;
 	int proto;
 	int ihl;
 	int id;
@@ -1286,7 +1284,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
 	id = ntohs(iph->id);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
 	rcu_read_lock();
@@ -1340,7 +1338,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 
 	rcu_read_lock();
 	ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1396,11 @@ out:
 
 static int inet_gro_complete(struct sk_buff *skb)
 {
-	const struct net_protocol *ops;
+	__be16 newlen = htons(skb->len - skb_network_offset(skb));
 	struct iphdr *iph = ip_hdr(skb);
-	int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	const struct net_protocol *ops;
+	int proto = iph->protocol;
 	int err = -ENOSYS;
-	__be16 newlen = htons(skb->len - skb_network_offset(skb));
 
 	csum_replace2(&iph->check, iph->tot_len, newlen);
 	iph->tot_len = newlen;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e1caa1a..49a74cc 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -637,12 +637,12 @@ EXPORT_SYMBOL(icmp_send);
 
 static void icmp_unreach(struct sk_buff *skb)
 {
+	const struct net_protocol *ipprot;
 	const struct iphdr *iph;
 	struct icmphdr *icmph;
-	int hash, protocol;
-	const struct net_protocol *ipprot;
-	u32 info = 0;
 	struct net *net;
+	u32 info = 0;
+	int protocol;
 
 	net = dev_net(skb_dst(skb)->dev);
 
@@ -731,9 +731,8 @@ static void icmp_unreach(struct sk_buff *skb)
 	 */
 	raw_icmp_error(skb, protocol, info);
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
 	rcu_read_lock();
-	ipprot = rcu_dereference(inet_protos[hash]);
+	ipprot = rcu_dereference(inet_protos[protocol]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, info);
 	rcu_read_unlock();
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144..c4fe1d2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 	rcu_read_lock();
 	{
 		int protocol = ip_hdr(skb)->protocol;
-		int hash, raw;
 		const struct net_protocol *ipprot;
+		int raw;
 
 	resubmit:
 		raw = raw_local_deliver(skb, protocol);
 
-		hash = protocol & (MAX_INET_PROTOS - 1);
-		ipprot = rcu_dereference(inet_protos[hash]);
+		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot != NULL) {
 			int ret;
 
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01..8918eff 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash = protocol & (MAX_INET_PROTOS - 1);
-
-	return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
 
 int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret;
 
-	ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+	ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 5247d5c..c7da142 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -600,9 +600,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 {
 	const struct inet6_protocol *ipprot;
 	int inner_offset;
-	int hash;
-	u8 nexthdr;
 	__be16 frag_off;
+	u8 nexthdr;
 
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		return;
@@ -629,10 +628,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 	   --ANK (980726)
 	 */
 
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
-
 	rcu_read_lock();
-	ipprot = rcu_dereference(inet6_protos[hash]);
+	ipprot = rcu_dereference(inet6_protos[nexthdr]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
 	rcu_read_unlock();
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 21a15df..5ab923e 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -168,13 +168,12 @@ drop:
 
 static int ip6_input_finish(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	const struct inet6_protocol *ipprot;
+	struct inet6_dev *idev;
 	unsigned int nhoff;
 	int nexthdr;
 	bool raw;
-	u8 hash;
-	struct inet6_dev *idev;
-	struct net *net = dev_net(skb_dst(skb)->dev);
 
 	/*
 	 *	Parse extension headers
@@ -189,9 +188,7 @@ resubmit:
 	nexthdr = skb_network_header(skb)[nhoff];
 
 	raw = raw6_local_deliver(skb, nexthdr);
-
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
-	if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
+	if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) {
 		int ret;
 
 		if (ipprot->flags & INET6_PROTO_FINAL) {
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 9a7978f..053082d 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -29,9 +29,7 @@ const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int hash = protocol & (MAX_INET_PROTOS - 1);
-
-	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet6_add_protocol);
@@ -42,9 +40,9 @@ EXPORT_SYMBOL(inet6_add_protocol);
 
 int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret;
 
-	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 43b0042..b5c1dcb 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -165,7 +165,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 	saddr = &ipv6_hdr(skb)->saddr;
 	daddr = saddr + 1;
 
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
+	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
 
 	read_lock(&raw_v6_hashinfo.lock);
 	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
@@ -229,7 +229,7 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
 {
 	struct sock *raw_sk;
 
-	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]);
+	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]);
 	if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
 		raw_sk = NULL;
 
-- 
1.7.10


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Stephen Hemminger @ 2012-06-20  2:35 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Tue, 19 Jun 2012 16:39:11 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> 
> Input packet processing for local sockets involves two major demuxes.
> One for the route and one for the socket.
> 
> But we can optimize this down to one demux for certain kinds of local
> sockets.
[...]


Any benchmark numbers?

I think the number of ref count operations per packet is going to be
the next line in the sand.


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  4:46 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 19 Jun 2012 19:35:49 -0700

> Any benchmark numbers?

Measuring the path from ip_rcv_finish() to where we lock the socket in
tcp_v4_rcv(), on a SPARC-T3, with a pre-warmed routing cache:

Both sk and RT lookup:	~4200 cycles
Optimized early demux:	~2800 cycles

These numbers can be decreased further, because since we're already
looking at the TCP header we can pre-cook the TCP control block in the
SKB and skip much of the stuff that tcp_v4_rcv() does since we've done
it already in the early demux code.

> I think the number of ref count operations per packet is going to be
> the next line in the sand.

There is only one, for the socket.  We haven't taken a reference on the
route for years.
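
(Condensed from the patch: __inet_lookup_established() takes the socket
reference, which sock_edemux() later drops, while the route is attached
with skb_dst_set_noref() and so takes no reference at all:)

	skb->sk = sk;			/* ref taken by the lookup */
	skb->destructor = sock_edemux;	/* does sock_put(skb->sk) */
	skb_dst_set_noref(skb, dst);	/* no dst refcount touched */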


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-20  5:49 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, netdev

On Tue, 2012-06-19 at 21:46 -0700, David Miller wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 19 Jun 2012 19:35:49 -0700
> 
> > Any benchmark numbers?
> 
> Measuring the path from ip_rcv_finish() to where we lock the socket in
> tcp_v4_rcv(), on a SPARC-T3, with a pre-warmed routing cache:
> 
> Both sk and RT lookup:	~4200 cycles
> Optimized early demux:	~2800 cycles
> 
> These numbers can be decreased further, because since we're already
> looking at the TCP header we can pre-cook the TCP control block in the
> SKB and skip much of the stuff that tcp_v4_rcv() does since we've done
> it already in the early demux code.
> 
> > I think the number of ref count operations per packet is going to be
> > the next line in the sand.
> 
> There is only one, for the socket.  We haven't taken a reference on the
> route for years.

Actually this patch probably makes things slower for:

1) routers:

   Each incoming TCP packet has to perform socket lookups
   (ESTABLISHED and TIMEWAIT), adding one cache miss.

2) short-lived TCP sessions:

   The input dst is now dirtied because of the additional
   dst_clone()/dst_release().


1) could be solved using a knob, as suggested by Changli, possibly
using a JUMP_LABEL shadowing ip_forward?
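
A rough sketch of what such a knob could look like (the key name and
the sysctl hook are invented here, not code from any posted patch):

	static struct static_key ip_early_demux_on = STATIC_KEY_INIT_TRUE;

	/* toggled from the ip_forward sysctl handler */
	static void ip_forward_changed(bool forwarding)
	{
		if (forwarding)
			static_key_slow_dec(&ip_early_demux_on);
		else
			static_key_slow_inc(&ip_early_demux_on);
	}

	/* in ip_rcv_finish() */
	if (static_key_true(&ip_early_demux_on) &&
	    ipprot && ipprot->early_demux)
		err = ipprot->early_demux(skb);

The point of the jump label is that hosts which never enable forwarding
would pay nothing for the check on the hot path.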


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-20  5:51 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, netdev

On Wed, 2012-06-20 at 07:49 +0200, Eric Dumazet wrote:

> 2) short-lived TCP sessions:
> 
>    The input dst is now dirtied because of the additional
> dst_clone()/dst_release().

Not really a concern, because we dirty the cache line anyway:

dst_use_noref(struct dst_entry *dst, unsigned long time)
{
	dst->__use++;
	dst->lastuse = time;
}


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-20  5:59 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, netdev

On Tue, 2012-06-19 at 21:46 -0700, David Miller wrote:

> These numbers can be decreased further, because since we're already
> looking at the TCP header we can pre-cook the TCP control block in the
> SKB and skip much of the stuff that tcp_v4_rcv() does since we've done
> it already in the early demux code.

It could be done at the GRO level, and it would remove yet another demux.

As routers probably have no use for GRO, there is no need for an
additional knob.


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  6:14 UTC (permalink / raw)
  To: eric.dumazet; +Cc: shemminger, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Jun 2012 07:51:29 +0200

> On Wed, 2012-06-20 at 07:49 +0200, Eric Dumazet wrote:
> 
>> 2) short-lived TCP sessions:
>> 
>>    The input dst is now dirtied because of the additional
>> dst_clone()/dst_release().
> 
> Not really a concern, because we dirty the cache line anyway:
> 
> dst_use_noref(struct dst_entry *dst, unsigned long time)
> {
> 	dst->__use++;
> 	dst->lastuse = time;
> }

Right, the costs probably even out for short TCP flows.

But better to do real tests than to believe what any of
us say. :-)


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  6:14 UTC (permalink / raw)
  To: eric.dumazet; +Cc: shemminger, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Jun 2012 07:59:00 +0200

> On Tue, 2012-06-19 at 21:46 -0700, David Miller wrote:
> 
>> These numbers can be decreased further, because since we're already
>> looking at the TCP header we can pre-cook the TCP control block in the
>> SKB and skip much of the stuff that tcp_v4_rcv() does since we've done
>> it already in the early demux code.
> 
> It could be done at the GRO level, and it would remove yet another demux.
> 
> As routers probably have no use for GRO, there is no need for an
> additional knob.

That's a great idea.


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20 10:15 UTC (permalink / raw)
  To: eric.dumazet; +Cc: shemminger, netdev

From: David Miller <davem@davemloft.net>
Date: Tue, 19 Jun 2012 23:14:12 -0700 (PDT)

> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 20 Jun 2012 07:59:00 +0200
> 
>> On Tue, 2012-06-19 at 21:46 -0700, David Miller wrote:
>> 
>>> These numbers can be decreased further, because since we're already
>>> looking at the TCP header we can pre-cook the TCP control block in the
>>> SKB and skip much of the stuff that tcp_v4_rcv() does since we've done
>>> it already in the early demux code.
>> 
>> It could be done at the GRO level, and it would remove yet another demux.
>> 
>> As routers probably have no use for GRO, there is no need for an
>> additional knob.
> 
> That's a great idea.

Here's what I have so far, the ipv6 implementation we get nearly for
free :-)

Initially I tried to use ->gro_complete() for this as it was more
natural, but we abort before we get there for a lot of cases where we
want to use the early demux and cached route (ACKs, FINs, sub-mss
sized packets, etc.)

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 967b926..a1b1b53 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,6 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5b21522..c1b5626 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2956,6 +2956,12 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		return -ENOMEM;
 
 	__copy_skb_header(nskb, p);
+	if (p->sk) {
+		nskb->sk = p->sk;
+		nskb->destructor = p->destructor;
+		p->sk = NULL;
+		p->destructor = NULL;
+	}
 	nskb->mac_len = p->mac_len;
 
 	skb_reserve(nskb, headroom);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 07a02f6..0aabad7 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1519,7 +1519,6 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.early_demux	=	tcp_v4_early_demux,
 	.handler	=	tcp_v4_rcv,
 	.err_handler	=	tcp_v4_err,
 	.gso_send_check	=	tcp_v4_gso_send_check,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 93b092c..c4fe1d2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -323,32 +323,19 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		const struct net_protocol *ipprot;
-		int protocol = iph->protocol;
-		int err;
-
-		rcu_read_lock();
-		ipprot = rcu_dereference(inet_protos[protocol]);
-		err = -ENOENT;
-		if (ipprot && ipprot->early_demux)
-			err = ipprot->early_demux(skb);
-		rcu_read_unlock();
-
-		if (err) {
-			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-						   iph->tos, skb->dev);
-			if (unlikely(err)) {
-				if (err == -EHOSTUNREACH)
-					IP_INC_STATS_BH(dev_net(skb->dev),
-							IPSTATS_MIB_INADDRERRORS);
-				else if (err == -ENETUNREACH)
-					IP_INC_STATS_BH(dev_net(skb->dev),
-							IPSTATS_MIB_INNOROUTES);
-				else if (err == -EXDEV)
-					NET_INC_STATS_BH(dev_net(skb->dev),
-							 LINUX_MIB_IPRPFILTER);
-				goto drop;
-			}
+		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					       iph->tos, skb->dev);
+		if (unlikely(err)) {
+			if (err == -EHOSTUNREACH)
+				IP_INC_STATS_BH(dev_net(skb->dev),
+						IPSTATS_MIB_INADDRERRORS);
+			else if (err == -ENETUNREACH)
+				IP_INC_STATS_BH(dev_net(skb->dev),
+						IPSTATS_MIB_INNOROUTES);
+			else if (err == -EXDEV)
+				NET_INC_STATS_BH(dev_net(skb->dev),
+						 LINUX_MIB_IPRPFILTER);
+			goto drop;
 		}
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13857df..2a483ad 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,52 +1671,6 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-int tcp_v4_early_demux(struct sk_buff *skb)
-{
-	struct net *net = dev_net(skb->dev);
-	const struct iphdr *iph;
-	const struct tcphdr *th;
-	struct sock *sk;
-	int err;
-
-	err = -ENOENT;
-	if (skb->pkt_type != PACKET_HOST)
-		goto out_err;
-
-	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
-		goto out_err;
-
-	iph = ip_hdr(skb);
-	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
-
-	if (th->doff < sizeof(struct tcphdr) / 4)
-		goto out_err;
-
-	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
-		goto out_err;
-
-	sk = __inet_lookup_established(net, &tcp_hashinfo,
-				       iph->saddr, th->source,
-				       iph->daddr, th->dest,
-				       skb->dev->ifindex);
-	if (sk) {
-		skb->sk = sk;
-		skb->destructor = sock_edemux;
-		if (sk->sk_state != TCP_TIME_WAIT) {
-			struct dst_entry *dst = sk->sk_rx_dst;
-			if (dst)
-				dst = dst_check(dst, 0);
-			if (dst) {
-				skb_dst_set_noref(skb, dst);
-				err = 0;
-			}
-		}
-	}
-
-out_err:
-	return err;
-}
-
 /*
  *	From tcp_input.c
  */
@@ -2576,6 +2530,7 @@ void tcp4_proc_exit(void)
 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
 	const struct iphdr *iph = skb_gro_network_header(skb);
+	struct sk_buff **pp;
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
@@ -2591,7 +2546,36 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		return NULL;
 	}
 
-	return tcp_gro_receive(head, skb);
+	pp = tcp_gro_receive(head, skb);
+
+	if (!NAPI_GRO_CB(skb)->same_flow) {
+		const struct tcphdr *th = tcp_hdr(skb);
+		struct net_device *dev = skb->dev;
+		struct sock *sk;
+
+		sk = __inet_lookup_established(dev_net(dev), &tcp_hashinfo,
+					       iph->saddr, th->source,
+					       iph->daddr, th->dest,
+					       dev->ifindex);
+		if (sk) {
+			skb_orphan(skb);
+			skb->sk = sk;
+			skb->destructor = sock_edemux;
+			if (!skb_dst(skb) &&
+			    sk->sk_state != TCP_TIME_WAIT) {
+				struct dst_entry *dst = sk->sk_rx_dst;
+				if (dst)
+					dst = dst_check(dst, 0);
+				if (dst) {
+					struct rtable *rt = (struct rtable *) dst;
+
+					if (rt->rt_iif == dev->ifindex)
+						skb_dst_set_noref(skb, dst);
+				}
+			}
+		}
+	}
+	return pp;
 }
 
 int tcp4_gro_complete(struct sk_buff *skb)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 26a8862..b8ea463 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -797,6 +797,7 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
 					 struct sk_buff *skb)
 {
 	const struct ipv6hdr *iph = skb_gro_network_header(skb);
+	struct sk_buff **pp;
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
@@ -812,7 +813,32 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
 		return NULL;
 	}
 
-	return tcp_gro_receive(head, skb);
+	pp = tcp_gro_receive(head, skb);
+
+	if (!NAPI_GRO_CB(skb)->same_flow) {
+		const struct tcphdr *th = tcp_hdr(skb);
+		struct net_device *dev = skb->dev;
+		struct sock *sk;
+
+		sk = __inet6_lookup_established(dev_net(dev), &tcp_hashinfo,
+						&iph->saddr, th->source,
+						&iph->daddr, th->dest,
+						dev->ifindex);
+		if (sk) {
+			skb_orphan(skb);
+			skb->sk = sk;
+			skb->destructor = sock_edemux;
+			if (!skb_dst(skb) &&
+			    sk->sk_state != TCP_TIME_WAIT) {
+				struct dst_entry *dst = sk->sk_rx_dst;
+				if (dst)
+					dst = dst_check(dst, 0);
+				if (dst)
+					skb_dst_set(skb, dst);
+			}
+		}
+	}
+	return pp;
 }
 
 static int tcp6_gro_complete(struct sk_buff *skb)


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-20 11:03 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, netdev

On Wed, 2012-06-20 at 03:15 -0700, David Miller wrote:

> Here's what I have so far, the ipv6 implementation we get nearly for
> free :-)
> 
> Initially I tried to use ->gro_complete() for this as it was more
> natural, but we abort before we get there for a lot of cases where we
> want to use the early demux and cached route (ACKs, FINs, sub-mss
> sized packets, etc.)
> 

Seems very good, I only have one remark :


>  /*
>   *	From tcp_input.c
>   */
> @@ -2576,6 +2530,7 @@ void tcp4_proc_exit(void)
>  struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
>  {
>  	const struct iphdr *iph = skb_gro_network_header(skb);
> +	struct sk_buff **pp;
>  
>  	switch (skb->ip_summed) {
>  	case CHECKSUM_COMPLETE:
> @@ -2591,7 +2546,36 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
>  		return NULL;
>  	}
>  
> -	return tcp_gro_receive(head, skb);
> +	pp = tcp_gro_receive(head, skb);
> +
> +	if (!NAPI_GRO_CB(skb)->same_flow) {
> +		const struct tcphdr *th = tcp_hdr(skb);
> +		struct net_device *dev = skb->dev;
> +		struct sock *sk;
> +
> +		sk = __inet_lookup_established(dev_net(dev), &tcp_hashinfo,
> +					       iph->saddr, th->source,
> +					       iph->daddr, th->dest,
> +					       dev->ifindex);
> +		if (sk) {
> +			skb_orphan(skb);
> +			skb->sk = sk;
> +			skb->destructor = sock_edemux;
> +			if (!skb_dst(skb) &&

I am not sure we need the skb_dst(skb) test here; it should be NULL
anyway in the GRO layer? (the loopback device doesn't use GRO ;))

> +			    sk->sk_state != TCP_TIME_WAIT) {
> +				struct dst_entry *dst = sk->sk_rx_dst;
> +				if (dst)
> +					dst = dst_check(dst, 0);
> +				if (dst) {
> +					struct rtable *rt = (struct rtable *) dst;
> +
> +					if (rt->rt_iif == dev->ifindex)
> +						skb_dst_set_noref(skb, dst);
> +				}
> +			}
> +		}
> +	}
> +	return pp;
>  }
>  
>  int tcp4_gro_complete(struct sk_buff *skb)


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20 11:09 UTC (permalink / raw)
  To: eric.dumazet; +Cc: shemminger, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Jun 2012 13:03:26 +0200

> I am not sure we need the skb_dst(skb) test here; it should be NULL
> anyway in the GRO layer? (the loopback device doesn't use GRO ;))

Thanks, I was too lazy to check this and just assumed that a non-NULL
skb_dst(skb) was a very real possibility.


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-20 12:38 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, netdev

On Wed, 2012-06-20 at 03:15 -0700, David Miller wrote:
> 			       dev->ifindex);
> +		if (sk) {
> +			skb_orphan(skb);
> +			skb->sk = sk;
> +			skb->destructor = sock_edemux;
> +			if (!skb_dst(skb) &&
> +			    sk->sk_state != TCP_TIME_WAIT) {
> +				struct dst_entry *dst = sk->sk_rx_dst;
> +				if (dst)
> +					dst = dst_check(dst, 0);
> +				if (dst) {
> +					struct rtable *rt = (struct rtable *) dst;
> +
> +					if (rt->rt_iif == dev->ifindex)
> +						skb_dst_set_noref(skb, dst);
> +				}
> +			}
> +		}
> +	}
> +	return pp;

I am trying to convince myself it's safe.

skb_dst_set_noref() assumes the caller holds rcu_read_lock() until the
skb dst is used.

And dev_gro_receive() releases RCU...

A problem could happen if sk->sk_rx_dst is freed while some packets are
still in the napi or socket backlog (which can happen with some network
reordering).

1) The socket backlog must be flushed before freeing sk->sk_rx_dst.

2) Even if we move rcu_read_lock() into net_rx_action(), we need some
napi_gro_forcedstrefs() in case we hit softnet_break.

Or maybe just use napi_gro_flush()?

diff --git a/net/core/dev.c b/net/core/dev.c
index 57c4f9b..c0f71a0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3861,6 +3861,9 @@ static void net_rx_action(struct softirq_action *h)
 
 		budget -= work;
 
+		if (work == weight)
+			napi_gro_flush(n);
+
 		local_irq_disable();
 
 		/* Drivers must not modify the NAPI state if they


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Rick Jones @ 2012-06-20 16:21 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, shemminger, netdev

On 06/19/2012 11:14 PM, David Miller wrote:
> From: Eric Dumazet<eric.dumazet@gmail.com>
> Date: Wed, 20 Jun 2012 07:51:29 +0200
>
>> On Wed, 2012-06-20 at 07:49 +0200, Eric Dumazet wrote:
>>
>>> 2) short-lived TCP sessions:
>>>
>>>     The input dst is now dirtied because of the additional
>>> dst_clone()/dst_release().
>>
>> Not really a concern, because we dirty the cache line anyway:
>>
>> dst_use_noref(struct dst_entry *dst, unsigned long time)
>> {
>> 	dst->__use++;
>> 	dst->lastuse = time;
>> }
>
> Right, the costs probably even out for short TCP flows.
>
> But better to do real tests than to believe what any of
> us say. :-)

netperf -c -C  -t TCP_CC ...  #just connect/close

or

netperf -c -C -t TCP_CRR ...  # with a request/response pair in there

For some definition of "real" anyway :)

rick jones


* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Ben Hutchings @ 2012-06-20 18:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, shemminger, netdev

On Wed, 2012-06-20 at 07:59 +0200, Eric Dumazet wrote:
> On Tue, 2012-06-19 at 21:46 -0700, David Miller wrote:
> 
> > These numbers can be decreased further, because since we're already
> > looking at the TCP header we can pre-cook the TCP control block in the
> > SKB and skip much of the stuff that tcp_v4_rcv() does since we've done
> > it already in the early demux code.
> 
> It could be done at GRO level and remove one another demux.
> 
> As routers probably have no use for GRO, there is no need for an additional knob.

I don't know what you mean by 'have no use'.  It's enabled anyway, and I
would expect that it's beneficial for some smaller routers.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] ipv4: Early TCP socket demux.
  2012-06-20  2:02         ` David Miller
@ 2012-06-20 18:16           ` Ben Hutchings
  0 siblings, 0 replies; 26+ messages in thread
From: Ben Hutchings @ 2012-06-20 18:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Tue, 2012-06-19 at 19:02 -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Tue, 19 Jun 2012 18:05:27 -0700 (PDT)
> 
> > From: Ben Hutchings <bhutchings@solarflare.com>
> > Date: Wed, 20 Jun 2012 02:03:26 +0100
> > 
> >> On Tue, 2012-06-19 at 17:54 -0700, David Miller wrote:
> >>> The hash is perfect, what's the big deal?
> >> 
> >> It obscures what we're really doing and relying on.
> > 
> > If it matters to you, patches are always welcome :-)
> 
> Nevermind, I just committed the following to net-next:
[...]

Thanks very much, David.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] ipv4: Early TCP socket demux.
  2012-06-20 18:07       ` Ben Hutchings
@ 2012-06-20 18:40         ` Eric Dumazet
  2012-06-20 21:01           ` David Miller
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2012-06-20 18:40 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, shemminger, netdev

On Wed, 2012-06-20 at 19:07 +0100, Ben Hutchings wrote:

> I don't know what you mean by 'have no use'.  It's enabled anyway, and I
> would expect that it's beneficial for some smaller routers.
> 

I believe the vast majority of Linux-powered devices are hosts,
not routers (the ip_forward default value is 0, by the way).

If someone wants to tune their Linux router, they have probably already
disabled GRO because of various issues with too-big packets.

GRO adds a significant cost to the forwarding path.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] ipv4: Early TCP socket demux.
  2012-06-20 18:40         ` Eric Dumazet
@ 2012-06-20 21:01           ` David Miller
  2012-06-20 21:04             ` Stephen Hemminger
  0 siblings, 1 reply; 26+ messages in thread
From: David Miller @ 2012-06-20 21:01 UTC (permalink / raw)
  To: eric.dumazet; +Cc: bhutchings, shemminger, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Jun 2012 20:40:04 +0200

> If someone wants to tune their Linux router, they have probably already
> disabled GRO because of various issues with too-big packets.
> 
> GRO adds a significant cost to the forwarding path.

No, Ben is right, Eric.  GRO decreases the costs, because it means we
only need to make one forwarding/netfilter/classification decision for
N packets instead of one per packet.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] ipv4: Early TCP socket demux.
  2012-06-20 21:01           ` David Miller
@ 2012-06-20 21:04             ` Stephen Hemminger
  2012-06-20 21:17               ` Eric Dumazet
  0 siblings, 1 reply; 26+ messages in thread
From: Stephen Hemminger @ 2012-06-20 21:04 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, bhutchings, netdev

On Wed, 20 Jun 2012 14:01:21 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 20 Jun 2012 20:40:04 +0200
> 
> > If someone wants to tune their Linux router, they have probably already
> > disabled GRO because of various issues with too-big packets.
> > 
> > GRO adds a significant cost to the forwarding path.
> 
> No, Ben is right, Eric.  GRO decreases the costs, because it means we
> only need to make one forwarding/netfilter/classification decision for
> N packets instead of one per packet.

GRO is also important for routers that interact with VMs.
It helps reduce per-packet wakeups of the guest VMs.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] ipv4: Early TCP socket demux.
  2012-06-20 21:04             ` Stephen Hemminger
@ 2012-06-20 21:17               ` Eric Dumazet
  2012-06-20 21:26                 ` David Miller
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2012-06-20 21:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, bhutchings, netdev

On Wed, 2012-06-20 at 14:04 -0700, Stephen Hemminger wrote:
> On Wed, 20 Jun 2012 14:01:21 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
> 
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 20 Jun 2012 20:40:04 +0200
> > 
> > > If someone wants to tune their Linux router, they have probably already
> > > disabled GRO because of various issues with too-big packets.
> > > 
> > > GRO adds a significant cost to the forwarding path.
> > 
> > No, Ben is right, Eric.  GRO decreases the costs, because it means we
> > only need to make one forwarding/netfilter/classification decision for
> > N packets instead of one per packet.
> 
> GRO is also important for routers that interact with VM's.
> It helps reduce the per-packet wakeup of the guest VM's.

I spoke of mere routers; I was _not_ saying GRO is useless.

In most router setups I have used, I had to disable GRO, because 64-Kbyte
packets on the output path broke the tc setups (SFQ).

The netfilter cost was hardly a problem once configured correctly.
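
(For reference, GRO can be toggled per interface with ethtool; "eth0"
below is a placeholder:)

	ethtool -K eth0 gro off
	ethtool -k eth0 | grep generic-receive-offload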

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] ipv4: Early TCP socket demux.
  2012-06-20 21:17               ` Eric Dumazet
@ 2012-06-20 21:26                 ` David Miller
  0 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2012-06-20 21:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: shemminger, bhutchings, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Jun 2012 23:17:56 +0200

> In most router setups I have used, I had to disable GRO, because 64-Kbyte
> packets on the output path broke the tc setups (SFQ).

Then you are speaking of bugs and mis-features, rather than real
fundamental disadvantages of using GRO on a router :-)

> The netfilter cost was hardly a problem once configured correctly.

But the cost is not zero, and if you can divide it by N then you do it.
And GRO is what allows this.

Every demux, lookup, etc. is a transaction cost.

Even a routing cache lookup with no dst reference, which is _very_
cheap, takes a serious number of CPU cycles.  Enough that we think
early demux is worth it, right?

And such a routing cache lookup is significantly cheaper than a trip
down into netfilter.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] ipv4: Early TCP socket demux.
  2012-06-20 12:38           ` Eric Dumazet
@ 2012-06-20 22:29             ` David Miller
  0 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2012-06-20 22:29 UTC (permalink / raw)
  To: eric.dumazet; +Cc: shemminger, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Jun 2012 14:38:40 +0200

> A problem could happen if sk->sk_rx_dst is freed while some packets are
> still in the napi or socket backlog (which can happen with some network
> reordering):
> 
> 1) The socket backlog must be flushed before sk->sk_rx_dst is freed.
> 
> 2) Even if we move rcu_read_lock() into net_rx_action(), we need some
> napi_gro_forcedstrefs() in case we hit softnet_break.
> 
> Or maybe just use napi_gro_flush() ?

Good catch, but I've just figured out a more fundamental issue
with doing this at the GRO layer.

The IPv4 input path is going to undo our early socket demux by
orphaning the SKB in ip_rcv().  So we'll end up looking up the
socket twice.
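
For context, skb_orphan() at the time was roughly the following
(paraphrased sketch, not exact source):

	static inline void skb_orphan(struct sk_buff *skb)
	{
		if (skb->destructor)
			skb->destructor(skb);	/* here: sock_edemux */
		skb->destructor = NULL;
		skb->sk		= NULL;
	}

Once skb->sk is cleared, skb_steal_sock() in __inet_lookup_skb() finds
nothing and the full socket lookup runs a second time.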

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread

Thread overview: 26+ messages
2012-06-19 23:39 [PATCH v2] ipv4: Early TCP socket demux David Miller
2012-06-20  0:21 ` Ben Hutchings
2012-06-20  0:54   ` David Miller
2012-06-20  1:03     ` Ben Hutchings
2012-06-20  1:05       ` David Miller
2012-06-20  2:02         ` David Miller
2012-06-20 18:16           ` Ben Hutchings
2012-06-20  2:35 ` Stephen Hemminger
2012-06-20  4:46   ` David Miller
2012-06-20  5:49     ` Eric Dumazet
2012-06-20  5:51       ` Eric Dumazet
2012-06-20  6:14         ` David Miller
2012-06-20 16:21           ` Rick Jones
2012-06-20  5:59     ` Eric Dumazet
2012-06-20  6:14       ` David Miller
2012-06-20 10:15         ` David Miller
2012-06-20 11:03           ` Eric Dumazet
2012-06-20 11:09             ` David Miller
2012-06-20 12:38           ` Eric Dumazet
2012-06-20 22:29             ` David Miller
2012-06-20 18:07       ` Ben Hutchings
2012-06-20 18:40         ` Eric Dumazet
2012-06-20 21:01           ` David Miller
2012-06-20 21:04             ` Stephen Hemminger
2012-06-20 21:17               ` Eric Dumazet
2012-06-20 21:26                 ` David Miller
