All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ipv4: Early TCP socket demux.
@ 2012-06-19  2:40 David Miller
  2012-06-19  4:03 ` Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: David Miller @ 2012-06-19  2:40 UTC (permalink / raw)
  To: netdev


You know you want it.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 875f489..6c47bf8 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -34,6 +34,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4a45216..87b424a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,7 @@ struct sock {
 	unsigned long 		sk_flags;
 	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
+	struct dst_entry	*sk_rx_dst;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_edemux(struct sk_buff *skb);
 
 extern int			sock_setsockopt(struct socket *sock, int level,
 						int op, char __user *optval,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9332f34..6660ffc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e5b71f..929bdcc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
 
 int sock_i_uid(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00..a2bd2d2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -1520,14 +1521,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.handler =	tcp_v4_rcv,
-	.err_handler =	tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment =	tcp_tso_segment,
-	.gro_receive =	tcp4_gro_receive,
-	.gro_complete =	tcp4_gro_complete,
-	.no_policy =	1,
-	.netns_ok =	1,
+	.early_demux	=	tcp_v4_early_demux,
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.gso_send_check	=	tcp_v4_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp4_gro_receive,
+	.gro_complete	=	tcp4_gro_complete,
+	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144..cb883e1 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
-		if (unlikely(err)) {
-			if (err == -EHOSTUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INADDRERRORS);
-			else if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INNOROUTES);
-			else if (err == -EXDEV)
-				NET_INC_STATS_BH(dev_net(skb->dev),
-						 LINUX_MIB_IPRPFILTER);
-			goto drop;
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+		int hash, err;
+
+		hash = protocol & (MAX_INET_PROTOS - 1);
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[hash]);
+		err = -ENOENT;
+		if (ipprot && ipprot->early_demux)
+			err = ipprot->early_demux(skb);
+		rcu_read_unlock();
+
+		if (err) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb->dev);
+			if (unlikely(err)) {
+				if (err == -EHOSTUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INADDRERRORS);
+				else if (err == -ENETUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INNOROUTES);
+				else if (err == -EXDEV)
+					NET_INC_STATS_BH(dev_net(skb->dev),
+							 LINUX_MIB_IPRPFILTER);
+				goto drop;
+			}
 		}
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca1..bd90181 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,50 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+	int err;
+
+	err = -ENOENT;
+	if (skb->pkt_type != PACKET_HOST)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+		goto out_err;
+
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+		goto out_err;
+
+	sk = __inet_lookup_established(net, &tcp_hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, th->dest,
+				       skb->dev->ifindex);
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk->sk_state != TCP_TIME_WAIT) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst) {
+				skb_dst_set_noref(skb, dst);
+				err = 0;
+			}
+		}
+	}
+
+out_err:
+	return err;
+}
+
 /*
  *	From tcp_input.c
  */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb01531..72b7c63 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
+		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  2:40 [PATCH] ipv4: Early TCP socket demux David Miller
@ 2012-06-19  4:03 ` Stephen Hemminger
  2012-06-19  4:10   ` David Miller
  2012-06-19  4:13   ` Changli Gao
  2012-06-19  4:07 ` Eric Dumazet
  2012-06-19  4:43 ` Changli Gao
  2 siblings, 2 replies; 19+ messages in thread
From: Stephen Hemminger @ 2012-06-19  4:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


> 
> You know you want it.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>

David, I understand it, Eric understands it, and maybe one or
two others. But on the principal of what is "good for the goose
is good for the gander", you really need to provide a reasonable
change log entry. Just because you are the network maintainer
doesn't mean you get to skip all the documented rules about submitting
patches.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  2:40 [PATCH] ipv4: Early TCP socket demux David Miller
  2012-06-19  4:03 ` Stephen Hemminger
@ 2012-06-19  4:07 ` Eric Dumazet
  2012-06-19  4:15   ` David Miller
  2012-06-19  4:43 ` Changli Gao
  2 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2012-06-19  4:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
> You know you want it.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>

Yeah, very good idea David ;)

needs some polishing of course.

This reminds the idea of having seperate dst per tcp socket, to remove
the dst refcnt contention as well.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:03 ` Stephen Hemminger
@ 2012-06-19  4:10   ` David Miller
  2012-06-19  4:13   ` Changli Gao
  1 sibling, 0 replies; 19+ messages in thread
From: David Miller @ 2012-06-19  4:10 UTC (permalink / raw)
  To: stephen.hemminger; +Cc: netdev

From: Stephen Hemminger <stephen.hemminger@vyatta.com>
Date: Mon, 18 Jun 2012 21:03:26 -0700 (PDT)

> David, I understand it, Eric understands it, and maybe one or
> two others. But on the principal of what is "good for the goose
> is good for the gander", you really need to provide a reasonable
> change log entry. Just because you are the network maintainer
> doesn't mean you get to skip all the documented rules about submitting
> patches.

This wasn't going to be the final commit log entry.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:03 ` Stephen Hemminger
  2012-06-19  4:10   ` David Miller
@ 2012-06-19  4:13   ` Changli Gao
  2012-06-19  4:16     ` David Miller
  1 sibling, 1 reply; 19+ messages in thread
From: Changli Gao @ 2012-06-19  4:13 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Tue, Jun 19, 2012 at 12:03 PM, Stephen Hemminger
<stephen.hemminger@vyatta.com> wrote:
>
>>
>> You know you want it.
>>
>> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> David, I understand it, Eric understands it, and maybe one or
> two others. But on the principal of what is "good for the goose
> is good for the gander", you really need to provide a reasonable
> change log entry. Just because you are the network maintainer
> doesn't mean you get to skip all the documented rules about submitting
> patches.

Agree. Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:07 ` Eric Dumazet
@ 2012-06-19  4:15   ` David Miller
  2012-06-19  4:23     ` Eric Dumazet
  2012-06-19  4:31     ` Eric Dumazet
  0 siblings, 2 replies; 19+ messages in thread
From: David Miller @ 2012-06-19  4:15 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Jun 2012 06:07:26 +0200

> On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
>> You know you want it.
>> 
>> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> Yeah, very good idea David ;)
> 
> needs some polishing of course.

Such as?  IPv6 support?

> This reminds the idea of having seperate dst per tcp socket, to remove
> the dst refcnt contention as well.

I'm leery of this.

We're going to move towards having dst entries more strongly shared
as we move to remove the routing cache.

In fact I have another short-term change planned that adjusts which
keys the routing cache uses based upon what kinds of keys are actually
active in the current FIB rule configuration.

I think we want to encourage sharing and make the route footprint
smaller rather than expanding it's size artificually on even the
socket level.

If you really care about this refcount problem, then it's another
reason to never orphan socket sourced packets.  Then we wouldn't need
to ever refcount the route just to send a packet, we'd just use the
implicit reference held by the socket instead.  Socket route releasing
would be held back by the presence of any packets in the socket send
queue.  If we have to reset the dst mis-lifetime due to route flushes,
we'd need to use a specific packet as a sequence point.

That to me sounds like a more reasonable approach than just making
more and more routes.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:13   ` Changli Gao
@ 2012-06-19  4:16     ` David Miller
  2012-06-19 18:31       ` Stephen Hemminger
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-19  4:16 UTC (permalink / raw)
  To: xiaosuo; +Cc: stephen.hemminger, netdev

From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 19 Jun 2012 12:13:48 +0800

> On Tue, Jun 19, 2012 at 12:03 PM, Stephen Hemminger
> <stephen.hemminger@vyatta.com> wrote:
>>
>>>
>>> You know you want it.
>>>
>>> Signed-off-by: David S. Miller <davem@davemloft.net>
>>
>> David, I understand it, Eric understands it, and maybe one or
>> two others. But on the principal of what is "good for the goose
>> is good for the gander", you really need to provide a reasonable
>> change log entry. Just because you are the network maintainer
>> doesn't mean you get to skip all the documented rules about submitting
>> patches.
> 
> Agree. Thanks.

That's the last time I try to be even slightly humerous on this
list.

Thanks for killing the fun Stephen.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:15   ` David Miller
@ 2012-06-19  4:23     ` Eric Dumazet
  2012-06-19  4:25       ` Eric Dumazet
  2012-06-19  4:26       ` David Miller
  2012-06-19  4:31     ` Eric Dumazet
  1 sibling, 2 replies; 19+ messages in thread
From: Eric Dumazet @ 2012-06-19  4:23 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 19 Jun 2012 06:07:26 +0200
> 
> > On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
> >> You know you want it.
> >> 
> >> Signed-off-by: David S. Miller <davem@davemloft.net>
> > 
> > Yeah, very good idea David ;)
> > 
> > needs some polishing of course.
> 
> Such as?  IPv6 support?
> 

I was referring to socket leak in :

+       if (sk) {
+               skb->sk = sk;
+               skb->destructor = sock_edemux;
+               if (sk->sk_state != TCP_TIME_WAIT) {
+                       struct dst_entry *dst = sk->sk_rx_dst;
+                       if (dst) {
+                               skb_dst_set_noref(skb, dst);
+                               err = 0;
+                       }
+               }
+       }

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:23     ` Eric Dumazet
@ 2012-06-19  4:25       ` Eric Dumazet
  2012-06-19  4:27         ` David Miller
  2012-06-19  4:26       ` David Miller
  1 sibling, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2012-06-19  4:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Tue, 2012-06-19 at 06:23 +0200, Eric Dumazet wrote:
> On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Tue, 19 Jun 2012 06:07:26 +0200
> > 
> > > On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
> > >> You know you want it.
> > >> 
> > >> Signed-off-by: David S. Miller <davem@davemloft.net>
> > > 
> > > Yeah, very good idea David ;)
> > > 
> > > needs some polishing of course.
> > 
> > Such as?  IPv6 support?
> > 
> 
> I was referring to socket leak in :
> 
> +       if (sk) {
> +               skb->sk = sk;
> +               skb->destructor = sock_edemux;
> +               if (sk->sk_state != TCP_TIME_WAIT) {
> +                       struct dst_entry *dst = sk->sk_rx_dst;
> +                       if (dst) {
> +                               skb_dst_set_noref(skb, dst);
> +                               err = 0;
> +                       }
> +               }
> +       }
> 
> 

It's not a leak, but seems strange to keep it around if we dont use it
yet.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:23     ` Eric Dumazet
  2012-06-19  4:25       ` Eric Dumazet
@ 2012-06-19  4:26       ` David Miller
  1 sibling, 0 replies; 19+ messages in thread
From: David Miller @ 2012-06-19  4:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Jun 2012 06:23:33 +0200

> I was referring to socket leak in :
> 
> +       if (sk) {
> +               skb->sk = sk;
> +               skb->destructor = sock_edemux;
> +               if (sk->sk_state != TCP_TIME_WAIT) {
> +                       struct dst_entry *dst = sk->sk_rx_dst;
> +                       if (dst) {
> +                               skb_dst_set_noref(skb, dst);
> +                               err = 0;
> +                       }
> +               }
> +       }
> 

I see no leak.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:25       ` Eric Dumazet
@ 2012-06-19  4:27         ` David Miller
  2012-06-19  4:37           ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-19  4:27 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Jun 2012 06:25:38 +0200

> On Tue, 2012-06-19 at 06:23 +0200, Eric Dumazet wrote:
>> On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
>> > From: Eric Dumazet <eric.dumazet@gmail.com>
>> > Date: Tue, 19 Jun 2012 06:07:26 +0200
>> > 
>> > > On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
>> > >> You know you want it.
>> > >> 
>> > >> Signed-off-by: David S. Miller <davem@davemloft.net>
>> > > 
>> > > Yeah, very good idea David ;)
>> > > 
>> > > needs some polishing of course.
>> > 
>> > Such as?  IPv6 support?
>> > 
>> 
>> I was referring to socket leak in :
>> 
>> +       if (sk) {
>> +               skb->sk = sk;
>> +               skb->destructor = sock_edemux;
>> +               if (sk->sk_state != TCP_TIME_WAIT) {
>> +                       struct dst_entry *dst = sk->sk_rx_dst;
>> +                       if (dst) {
>> +                               skb_dst_set_noref(skb, dst);
>> +                               err = 0;
>> +                       }
>> +               }
>> +       }
>> 
>> 
> 
> It's not a leak, but seems strange to keep it around if we dont use it
> yet.

How are we not using it?  We use the cached SKB socket no matter what
happens.

Look at how inet hash lookup works.

The error tells the caller solely whether a route lookup is still
necessary.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:15   ` David Miller
  2012-06-19  4:23     ` Eric Dumazet
@ 2012-06-19  4:31     ` Eric Dumazet
  1 sibling, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2012-06-19  4:31 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>

> > This reminds the idea of having seperate dst per tcp socket, to remove
> > the dst refcnt contention as well.
> 
> I'm leery of this.
> 
> We're going to move towards having dst entries more strongly shared
> as we move to remove the routing cache.
> 
> In fact I have another short-term change planned that adjusts which
> keys the routing cache uses based upon what kinds of keys are actually
> active in the current FIB rule configuration.
> 
> I think we want to encourage sharing and make the route footprint
> smaller rather than expanding it's size artificually on even the
> socket level.
> 
> If you really care about this refcount problem, then it's another
> reason to never orphan socket sourced packets.  Then we wouldn't need
> to ever refcount the route just to send a packet, we'd just use the
> implicit reference held by the socket instead.  Socket route releasing
> would be held back by the presence of any packets in the socket send
> queue.  If we have to reset the dst mis-lifetime due to route flushes,
> we'd need to use a specific packet as a sequence point.
> 
> That to me sounds like a more reasonable approach than just making
> more and more routes.

We already don't touch dst refcnt on TCP xmit path, unless packet is
parked in Qdisc queue...

But with BQL this is becoming less effective.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:27         ` David Miller
@ 2012-06-19  4:37           ` Eric Dumazet
  2012-06-19  6:07             ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2012-06-19  4:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, 2012-06-18 at 21:27 -0700, David Miller wrote:

> How are we not using it?  We use the cached SKB socket no matter what
> happens.
> 
> Look at how inet hash lookup works.
> 
> The error tells the caller solely whether a route lookup is still
> necessary.

OK, remove the unlikely() in __inet_lookup_skb() so that its obvious we
have this skb_steal_sock() thing :)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  2:40 [PATCH] ipv4: Early TCP socket demux David Miller
  2012-06-19  4:03 ` Stephen Hemminger
  2012-06-19  4:07 ` Eric Dumazet
@ 2012-06-19  4:43 ` Changli Gao
  2012-06-19  4:47   ` Eric Dumazet
  2 siblings, 1 reply; 19+ messages in thread
From: Changli Gao @ 2012-06-19  4:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Tom Herbert

On Tue, Jun 19, 2012 at 10:40 AM, David Miller <davem@davemloft.net> wrote:
> @@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
>         *      how the packet travels inside Linux networking.
>         */
>        if (skb_dst(skb) == NULL) {
> -               int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> -                                              iph->tos, skb->dev);
> -               if (unlikely(err)) {
> -                       if (err == -EHOSTUNREACH)
> -                               IP_INC_STATS_BH(dev_net(skb->dev),
> -                                               IPSTATS_MIB_INADDRERRORS);
> -                       else if (err == -ENETUNREACH)
> -                               IP_INC_STATS_BH(dev_net(skb->dev),
> -                                               IPSTATS_MIB_INNOROUTES);
> -                       else if (err == -EXDEV)
> -                               NET_INC_STATS_BH(dev_net(skb->dev),
> -                                                LINUX_MIB_IPRPFILTER);
> -                       goto drop;
> +               const struct net_protocol *ipprot;
> +               int protocol = iph->protocol;
> +               int hash, err;
> +
> +               hash = protocol & (MAX_INET_PROTOS - 1);
> +
> +               rcu_read_lock();
> +               ipprot = rcu_dereference(inet_protos[hash]);
> +               err = -ENOENT;
> +               if (ipprot && ipprot->early_demux)
> +                       err = ipprot->early_demux(skb);

I am afraid that this lookup with hurt the performance of the
forwarding path. A knob?

If this approach is acceptable, maybe we can use sockets to do finer RFS.

Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:43 ` Changli Gao
@ 2012-06-19  4:47   ` Eric Dumazet
  2012-06-19  4:51     ` Changli Gao
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2012-06-19  4:47 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, netdev, Tom Herbert

On Tue, 2012-06-19 at 12:43 +0800, Changli Gao wrote:

> I am afraid that this lookup with hurt the performance of the
> forwarding path. A knob?
> 

ip_rcv() & ip_rcv_finish() in the forwarding path ?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:47   ` Eric Dumazet
@ 2012-06-19  4:51     ` Changli Gao
  0 siblings, 0 replies; 19+ messages in thread
From: Changli Gao @ 2012-06-19  4:51 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Tom Herbert

On Tue, Jun 19, 2012 at 12:47 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2012-06-19 at 12:43 +0800, Changli Gao wrote:
>
>> I am afraid that this lookup with hurt the performance of the
>> forwarding path. A knob?
>>
>
> ip_rcv() & ip_rcv_finish() in the forwarding path ?
>
>

Yes, the two routines are shared by both. I think you mean
ip_local_deliver() and ip_local_deliver_finish().

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:37           ` Eric Dumazet
@ 2012-06-19  6:07             ` David Miller
  2012-06-19  6:28               ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-19  6:07 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Jun 2012 06:37:41 +0200

> On Mon, 2012-06-18 at 21:27 -0700, David Miller wrote:
> 
>> How are we not using it?  We use the cached SKB socket no matter what
>> happens.
>> 
>> Look at how inet hash lookup works.
>> 
>> The error tells the caller solely whether a route lookup is still
>> necessary.
> 
> OK, remove the unlikely() in __inet_lookup_skb() so that its obvious we
> have this skb_steal_sock() thing :)

Sure thing.

We also need to add some dst->ops->check() handling as well.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  6:07             ` David Miller
@ 2012-06-19  6:28               ` Eric Dumazet
  0 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2012-06-19  6:28 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, 2012-06-18 at 23:07 -0700, David Miller wrote:

> Sure thing.
> 
> We also need to add some dst->ops->check() handling as well.

Yes, rp_filter comes to mind.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] ipv4: Early TCP socket demux.
  2012-06-19  4:16     ` David Miller
@ 2012-06-19 18:31       ` Stephen Hemminger
  0 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2012-06-19 18:31 UTC (permalink / raw)
  To: David Miller; +Cc: xiaosuo, stephen.hemminger, netdev

On Mon, 18 Jun 2012 21:16:13 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Changli Gao <xiaosuo@gmail.com>
> Date: Tue, 19 Jun 2012 12:13:48 +0800
> 
> > On Tue, Jun 19, 2012 at 12:03 PM, Stephen Hemminger
> > <stephen.hemminger@vyatta.com> wrote:
> >>
> >>>
> >>> You know you want it.
> >>>
> >>> Signed-off-by: David S. Miller <davem@davemloft.net>
> >>
> >> David, I understand it, Eric understands it, and maybe one or
> >> two others. But on the principal of what is "good for the goose
> >> is good for the gander", you really need to provide a reasonable
> >> change log entry. Just because you are the network maintainer
> >> doesn't mean you get to skip all the documented rules about submitting
> >> patches.
> > 
> > Agree. Thanks.
> 
> That's the last time I try to be even slightly humerous on this
> list.
> 
> Thanks for killing the fun Stephen.

We all need to lighten up, keep trying to be humorous.
Maybe you just need better material. The IP stack is just in too good
a shape, how about something with more natural whackiness like bluetooth.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2012-06-19 18:31 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-19  2:40 [PATCH] ipv4: Early TCP socket demux David Miller
2012-06-19  4:03 ` Stephen Hemminger
2012-06-19  4:10   ` David Miller
2012-06-19  4:13   ` Changli Gao
2012-06-19  4:16     ` David Miller
2012-06-19 18:31       ` Stephen Hemminger
2012-06-19  4:07 ` Eric Dumazet
2012-06-19  4:15   ` David Miller
2012-06-19  4:23     ` Eric Dumazet
2012-06-19  4:25       ` Eric Dumazet
2012-06-19  4:27         ` David Miller
2012-06-19  4:37           ` Eric Dumazet
2012-06-19  6:07             ` David Miller
2012-06-19  6:28               ` Eric Dumazet
2012-06-19  4:26       ` David Miller
2012-06-19  4:31     ` Eric Dumazet
2012-06-19  4:43 ` Changli Gao
2012-06-19  4:47   ` Eric Dumazet
2012-06-19  4:51     ` Changli Gao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.