All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
@ 2013-10-10  6:33 Steffen Klassert
  2013-10-10  6:33 ` [PATCH RFC 1/2] xfrm: Remove ancient sleeping when the SA is in acquire state Steffen Klassert
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Steffen Klassert @ 2013-10-10  6:33 UTC (permalink / raw)
  To: netdev

Does anyone still rely on the ancient sleeping when the SA is in
acquire state? It is disabled by default since more that five years,
but can cause indefinite task hangs if enabled and the needed state
does not get resolved.

We now queue packets to the policy if the states are not yet resolved
if we are in a code path that can not sleep. We could do this even in
the case we can sleep. As a bonus, we can remove the FLOWI_FLAG_CAN_SLEEP
flag because the only thing this flag does, is to notify xfrm that we are
in a codepath that can sleep.

The two RFC patches to remove the sleeping code are in reply to this
mail. I'd add this to the ipsec-next tree if there are no objections.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH RFC 1/2] xfrm: Remove ancient sleeping when the SA is in acquire state
  2013-10-10  6:33 [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Steffen Klassert
@ 2013-10-10  6:33 ` Steffen Klassert
  2013-10-10  6:34 ` [PATCH RFC 2/2] net: Remove FLOWI_FLAG_CAN_SLEEP Steffen Klassert
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 17+ messages in thread
From: Steffen Klassert @ 2013-10-10  6:33 UTC (permalink / raw)
  To: netdev

We now queue packets to the policy if the states are not yet resolved,
this replaces the ancient sleeping code. Also the sleeping can cause
indefinite task hangs if the needed state does not get resolved.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/netns/xfrm.h |    2 --
 net/key/af_key.c         |    5 ++---
 net/xfrm/xfrm_policy.c   |   21 ++-------------------
 net/xfrm/xfrm_state.c    |   21 +--------------------
 4 files changed, 5 insertions(+), 44 deletions(-)

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 5299e69..bda8039 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -33,8 +33,6 @@ struct netns_xfrm {
 	struct hlist_head	state_gc_list;
 	struct work_struct	state_gc_work;
 
-	wait_queue_head_t	km_waitq;
-
 	struct list_head	policy_all;
 	struct hlist_head	*policy_byidx;
 	unsigned int		policy_idx_hmask;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9d58537..239289d 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1379,10 +1379,9 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb
 		return 0;
 
 	spin_lock_bh(&x->lock);
-	if (x->km.state == XFRM_STATE_ACQ) {
+	if (x->km.state == XFRM_STATE_ACQ)
 		x->km.state = XFRM_STATE_ERROR;
-		wake_up(&net->xfrm.km_waitq);
-	}
+
 	spin_unlock_bh(&x->lock);
 	xfrm_state_put(x);
 	return 0;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ed38d5d..b885c76 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1875,8 +1875,7 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
 	if (IS_ERR(xdst))
 		return xdst;
 
-	if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0 ||
-	    (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP))
+	if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0)
 		return xdst;
 
 	dst1 = &xdst->u.dst;
@@ -2051,7 +2050,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
 	int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
 
-restart:
 	dst = NULL;
 	xdst = NULL;
 	route = NULL;
@@ -2131,23 +2129,8 @@ restart:
 
 			return make_blackhole(net, family, dst_orig);
 		}
-		if (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP) {
-			DECLARE_WAITQUEUE(wait, current);
 
-			add_wait_queue(&net->xfrm.km_waitq, &wait);
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
-			set_current_state(TASK_RUNNING);
-			remove_wait_queue(&net->xfrm.km_waitq, &wait);
-
-			if (!signal_pending(current)) {
-				dst_release(dst);
-				goto restart;
-			}
-
-			err = -ERESTART;
-		} else
-			err = -EAGAIN;
+		err = -EAGAIN;
 
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
 		goto error;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index b9c3f9e..14518eb 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -374,8 +374,6 @@ static void xfrm_state_gc_task(struct work_struct *work)
 
 	hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
 		xfrm_state_gc_destroy(x);
-
-	wake_up(&net->xfrm.km_waitq);
 }
 
 static inline unsigned long make_jiffies(long secs)
@@ -390,7 +388,6 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me)
 {
 	struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
 	struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
-	struct net *net = xs_net(x);
 	unsigned long now = get_seconds();
 	long next = LONG_MAX;
 	int warn = 0;
@@ -460,12 +457,8 @@ resched:
 	goto out;
 
 expired:
-	if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) {
+	if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0)
 		x->km.state = XFRM_STATE_EXPIRED;
-		wake_up(&net->xfrm.km_waitq);
-		next = 2;
-		goto resched;
-	}
 
 	err = __xfrm_state_delete(x);
 	if (!err && x->id.spi)
@@ -637,7 +630,6 @@ restart:
 
 out:
 	spin_unlock_bh(&xfrm_state_lock);
-	wake_up(&net->xfrm.km_waitq);
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_flush);
@@ -950,8 +942,6 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 	if (x->replay_maxage)
 		mod_timer(&x->rtimer, jiffies + x->replay_maxage);
 
-	wake_up(&net->xfrm.km_waitq);
-
 	net->xfrm.state_num++;
 
 	xfrm_hash_grow_check(net, x->bydst.next != NULL);
@@ -1655,16 +1645,12 @@ EXPORT_SYMBOL(km_state_notify);
 
 void km_state_expired(struct xfrm_state *x, int hard, u32 portid)
 {
-	struct net *net = xs_net(x);
 	struct km_event c;
 
 	c.data.hard = hard;
 	c.portid = portid;
 	c.event = XFRM_MSG_EXPIRE;
 	km_state_notify(x, &c);
-
-	if (hard)
-		wake_up(&net->xfrm.km_waitq);
 }
 
 EXPORT_SYMBOL(km_state_expired);
@@ -1707,16 +1693,12 @@ EXPORT_SYMBOL(km_new_mapping);
 
 void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid)
 {
-	struct net *net = xp_net(pol);
 	struct km_event c;
 
 	c.data.hard = hard;
 	c.portid = portid;
 	c.event = XFRM_MSG_POLEXPIRE;
 	km_policy_notify(pol, dir, &c);
-
-	if (hard)
-		wake_up(&net->xfrm.km_waitq);
 }
 EXPORT_SYMBOL(km_policy_expired);
 
@@ -2025,7 +2007,6 @@ int __net_init xfrm_state_init(struct net *net)
 	INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
 	INIT_HLIST_HEAD(&net->xfrm.state_gc_list);
 	INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task);
-	init_waitqueue_head(&net->xfrm.km_waitq);
 	return 0;
 
 out_byspi:
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH RFC 2/2] net: Remove FLOWI_FLAG_CAN_SLEEP
  2013-10-10  6:33 [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Steffen Klassert
  2013-10-10  6:33 ` [PATCH RFC 1/2] xfrm: Remove ancient sleeping when the SA is in acquire state Steffen Klassert
@ 2013-10-10  6:34 ` Steffen Klassert
  2013-10-10  7:02 ` [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Fan Du
  2013-10-11 19:01 ` David Miller
  3 siblings, 0 replies; 17+ messages in thread
From: Steffen Klassert @ 2013-10-10  6:34 UTC (permalink / raw)
  To: netdev

FLOWI_FLAG_CAN_SLEEP was used to notify xfrm about the posibility
to sleep until the needed states are resolved. This code is gone,
so FLOWI_FLAG_CAN_SLEEP is not needed anymore.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/flow.h               |    3 +--
 include/net/ipv6.h               |    6 ++----
 include/net/route.h              |    8 +++-----
 net/dccp/ipv4.c                  |    2 +-
 net/dccp/ipv6.c                  |    8 ++++----
 net/decnet/dn_route.c            |    2 --
 net/ipv4/af_inet.c               |    2 +-
 net/ipv4/datagram.c              |    2 +-
 net/ipv4/raw.c                   |    2 +-
 net/ipv4/tcp_ipv4.c              |    2 +-
 net/ipv4/udp.c                   |    2 +-
 net/ipv6/af_inet6.c              |    2 +-
 net/ipv6/datagram.c              |    2 +-
 net/ipv6/inet6_connection_sock.c |    4 ++--
 net/ipv6/ip6_output.c            |   12 ++----------
 net/ipv6/ping.c                  |    2 +-
 net/ipv6/raw.c                   |    2 +-
 net/ipv6/syncookies.c            |    2 +-
 net/ipv6/tcp_ipv6.c              |    4 ++--
 net/ipv6/udp.c                   |    2 +-
 net/l2tp/l2tp_ip6.c              |    2 +-
 net/sctp/ipv6.c                  |    4 ++--
 22 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 65ce471..d23e7fa 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -20,8 +20,7 @@ struct flowi_common {
 	__u8	flowic_proto;
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
-#define FLOWI_FLAG_CAN_SLEEP		0x02
-#define FLOWI_FLAG_KNOWN_NH		0x04
+#define FLOWI_FLAG_KNOWN_NH		0x02
 	__u32	flowic_secid;
 };
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index fe1c7f6..4f91706 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -710,11 +710,9 @@ void ip6_flush_pending_frames(struct sock *sk);
 
 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6);
 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
-				      const struct in6_addr *final_dst,
-				      bool can_sleep);
+				      const struct in6_addr *final_dst);
 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
-					 const struct in6_addr *final_dst,
-					 bool can_sleep);
+					 const struct in6_addr *final_dst);
 struct dst_entry *ip6_blackhole_route(struct net *net,
 				      struct dst_entry *orig_dst);
 
diff --git a/include/net/route.h b/include/net/route.h
index 0ad8e01..a25fcca 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -247,14 +247,12 @@ static inline char rt_tos2priority(u8 tos)
 static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src,
 					 u32 tos, int oif, u8 protocol,
 					 __be16 sport, __be16 dport,
-					 struct sock *sk, bool can_sleep)
+					 struct sock *sk)
 {
 	__u8 flow_flags = 0;
 
 	if (inet_sk(sk)->transparent)
 		flow_flags |= FLOWI_FLAG_ANYSRC;
-	if (can_sleep)
-		flow_flags |= FLOWI_FLAG_CAN_SLEEP;
 
 	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 			   protocol, flow_flags, dst, src, dport, sport);
@@ -264,13 +262,13 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
 					      __be32 dst, __be32 src, u32 tos,
 					      int oif, u8 protocol,
 					      __be16 sport, __be16 dport,
-					      struct sock *sk, bool can_sleep)
+					      struct sock *sk)
 {
 	struct net *net = sock_net(sk);
 	struct rtable *rt;
 
 	ip_route_connect_init(fl4, dst, src, tos, oif, protocol,
-			      sport, dport, sk, can_sleep);
+			      sport, dport, sk);
 
 	if (!dst || !src) {
 		rt = __ip_route_output_key(net, fl4);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index ebc54fe..751637a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -75,7 +75,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 			      IPPROTO_DCCP,
-			      orig_sport, orig_dport, sk, true);
+			      orig_sport, orig_dport, sk);
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6cf9f77..3d2574e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -237,7 +237,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
 
 	final_p = fl6_update_dst(&fl6, np->opt, &final);
 
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
 		dst = NULL;
@@ -302,7 +302,7 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
 	security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
 
 	/* sk = NULL, but it is safe for now. RST socket required. */
-	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
+	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(skb, dst);
 		ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
@@ -513,7 +513,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		fl6.fl6_sport = inet_rsk(req)->loc_port;
 		security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+		dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 		if (IS_ERR(dst))
 			goto out;
 	}
@@ -933,7 +933,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	final_p = fl6_update_dst(&fl6, np->opt, &final);
 
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
 		goto failure;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index fe32388..ad2efa5 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1288,8 +1288,6 @@ int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *fl, stru
 
 	err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD);
 	if (err == 0 && fl->flowidn_proto) {
-		if (!(flags & MSG_DONTWAIT))
-			fl->flowidn_flags |= FLOWI_FLAG_CAN_SLEEP;
 		*pprt = xfrm_lookup(&init_net, *pprt,
 				    flowidn_to_flowi(fl), sk, 0);
 		if (IS_ERR(*pprt)) {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7a1874b..c3bf51e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1162,7 +1162,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	fl4 = &inet->cork.fl.u.ip4;
 	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
 			      sk->sk_bound_dev_if, sk->sk_protocol,
-			      inet->inet_sport, inet->inet_dport, sk, false);
+			      inet->inet_sport, inet->inet_dport, sk);
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
 
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b28e863..dcd4148 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -53,7 +53,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
 			      RT_CONN_FLAGS(sk), oif,
 			      sk->sk_protocol,
-			      inet->inet_sport, usin->sin_port, sk, true);
+			      inet->inet_sport, usin->sin_port, sk);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a3fe534..6fd4d0c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -573,7 +573,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
 			   RT_SCOPE_UNIVERSE,
 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
-			   inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP |
+			   inet_sk_flowi_flags(sk) |
 			    (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
 			   daddr, saddr, 0, 0);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b14266b..6959be9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -173,7 +173,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 			      IPPROTO_TCP,
-			      orig_sport, orig_dport, sk, true);
+			      orig_sport, orig_dport, sk);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 22462d94..70a9e28 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -966,7 +966,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		fl4 = &fl4_stack;
 		flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
 				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
-				   inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP,
+				   inet_sk_flowi_flags(sk),
 				   faddr, saddr, dport, inet->inet_sport);
 
 		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4966b12..cd3c181 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -666,7 +666,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 
 		final_p = fl6_update_dst(&fl6, np->opt, &final);
 
-		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+		dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 		if (IS_ERR(dst)) {
 			sk->sk_route_caps = 0;
 			sk->sk_err_soft = -PTR_ERR(dst);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 48b6bd2..0bc3297 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -171,7 +171,7 @@ ipv4_connected:
 	opt = flowlabel ? flowlabel->opt : np->opt;
 	final_p = fl6_update_dst(&fl6, opt, &final);
 
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 	err = 0;
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e4311cb..b65ca8c 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -86,7 +86,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
 	fl6->fl6_sport = inet_rsk(req)->loc_port;
 	security_req_classify_flow(req, flowi6_to_flowi(fl6));
 
-	dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
+	dst = ip6_dst_lookup_flow(sk, fl6, final_p);
 	if (IS_ERR(dst))
 		return NULL;
 
@@ -217,7 +217,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
 
 	dst = __inet6_csk_dst_check(sk, np->dst_cookie);
 	if (!dst) {
-		dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
+		dst = ip6_dst_lookup_flow(sk, fl6, final_p);
 
 		if (!IS_ERR(dst))
 			__inet6_csk_dst_store(sk, dst, NULL, NULL);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3a692d5..7a0d3c4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -937,7 +937,6 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup);
  *	@sk: socket which provides route info
  *	@fl6: flow to lookup
  *	@final_dst: final destination address for ipsec lookup
- *	@can_sleep: we are in a sleepable context
  *
  *	This function performs a route lookup on the given flow.
  *
@@ -945,8 +944,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup);
  *	error code.
  */
 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
-				      const struct in6_addr *final_dst,
-				      bool can_sleep)
+				      const struct in6_addr *final_dst)
 {
 	struct dst_entry *dst = NULL;
 	int err;
@@ -956,8 +954,6 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 		return ERR_PTR(err);
 	if (final_dst)
 		fl6->daddr = *final_dst;
-	if (can_sleep)
-		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 
 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 }
@@ -968,7 +964,6 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
  *	@sk: socket which provides the dst cache and route info
  *	@fl6: flow to lookup
  *	@final_dst: final destination address for ipsec lookup
- *	@can_sleep: we are in a sleepable context
  *
  *	This function performs a route lookup on the given flow with the
  *	possibility of using the cached route in the socket if it is valid.
@@ -979,8 +974,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
  *	error code.
  */
 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
-					 const struct in6_addr *final_dst,
-					 bool can_sleep)
+					 const struct in6_addr *final_dst)
 {
 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
 	int err;
@@ -992,8 +986,6 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 		return ERR_PTR(err);
 	if (final_dst)
 		fl6->daddr = *final_dst;
-	if (can_sleep)
-		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 
 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 }
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 18f19df..4ec91bb 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -144,7 +144,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	else if (!fl6.flowi6_oif)
 		fl6.flowi6_oif = np->ucast_oif;
 
-	dst = ip6_sk_dst_lookup_flow(sk, &fl6,  daddr, 1);
+	dst = ip6_sk_dst_lookup_flow(sk, &fl6,  daddr);
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 	rt = (struct rt6_info *) dst;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 58916bb..c74425d 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -866,7 +866,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		fl6.flowi6_oif = np->ucast_oif;
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
 		goto out;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index d703218..b0fbbfc 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -243,7 +243,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		fl6.fl6_sport = inet_sk(sk)->inet_sport;
 		security_req_classify_flow(req, flowi6_to_flowi(&fl6));
 
-		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+		dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 		if (IS_ERR(dst))
 			goto out_free;
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5c71501..b762603 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -258,7 +258,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
 		goto failure;
@@ -800,7 +800,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 	 * Underlying function will use this to retrieve the network
 	 * namespace
 	 */
-	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
+	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(buff, dst);
 		ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f405815..a20b562 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1204,7 +1204,7 @@ do_udp_sendmsg:
 
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, true);
+	dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p);
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
 		dst = NULL;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index b8a6039..d1149aa 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -598,7 +598,7 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk,
 
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
 		goto out;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e7b2d4f..2b3d523 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -263,7 +263,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	}
 
 	final_p = fl6_update_dst(fl6, np->opt, &final);
-	dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
+	dst = ip6_dst_lookup_flow(sk, fl6, final_p);
 	if (!asoc || saddr)
 		goto out;
 
@@ -320,7 +320,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 		fl6->saddr = baddr->v6.sin6_addr;
 		fl6->fl6_sport = baddr->v6.sin6_port;
 		final_p = fl6_update_dst(fl6, np->opt, &final);
-		dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
+		dst = ip6_dst_lookup_flow(sk, fl6, final_p);
 	}
 
 out:
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-10  6:33 [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Steffen Klassert
  2013-10-10  6:33 ` [PATCH RFC 1/2] xfrm: Remove ancient sleeping when the SA is in acquire state Steffen Klassert
  2013-10-10  6:34 ` [PATCH RFC 2/2] net: Remove FLOWI_FLAG_CAN_SLEEP Steffen Klassert
@ 2013-10-10  7:02 ` Fan Du
  2013-10-10  8:57   ` Steffen Klassert
  2013-10-11 19:01 ` David Miller
  3 siblings, 1 reply; 17+ messages in thread
From: Fan Du @ 2013-10-10  7:02 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: netdev



On 2013年10月10日 14:33, Steffen Klassert wrote:
> Does anyone still rely on the ancient sleeping when the SA is in
> acquire state? It is disabled by default since more that five years,
> but can cause indefinite task hangs if enabled and the needed state
> does not get resolved.

I saw that "can_sleep" is set true in ip_route_connect which upper layer
protocol relies on it, which ensure not dropping *any* skb.
And acquire timer will make sure the task will not hangs indefinitely.

In xfrm policy queue, XFRM_MAX_QUEUE_LEN is 100, which means 101th skb
will be dropped, how about make it configurable? If CAN_SLEEP flags is
removed, user could adjust this knob if needed in any circumstance.


> We now queue packets to the policy if the states are not yet resolved
> if we are in a code path that can not sleep. We could do this even in
> the case we can sleep. As a bonus, we can remove the FLOWI_FLAG_CAN_SLEEP
> flag because the only thing this flag does, is to notify xfrm that we are
> in a codepath that can sleep.
>
> The two RFC patches to remove the sleeping code are in reply to this
> mail. I'd add this to the ipsec-next tree if there are no objections.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

-- 
浮沉随浪只记今朝笑

--fan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-10  7:02 ` [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Fan Du
@ 2013-10-10  8:57   ` Steffen Klassert
  2013-10-11  7:18     ` Fan Du
  0 siblings, 1 reply; 17+ messages in thread
From: Steffen Klassert @ 2013-10-10  8:57 UTC (permalink / raw)
  To: Fan Du; +Cc: netdev

On Thu, Oct 10, 2013 at 03:02:14PM +0800, Fan Du wrote:
> 
> 
> On 2013年10月10日 14:33, Steffen Klassert wrote:
> >Does anyone still rely on the ancient sleeping when the SA is in
> >acquire state? It is disabled by default since more that five years,
> >but can cause indefinite task hangs if enabled and the needed state
> >does not get resolved.
> 
> I saw that "can_sleep" is set true in ip_route_connect which upper layer
> protocol relies on it, which ensure not dropping *any* skb.

'Any' means one per task in this context. Also, we can't ensure that
this packet reaches it's destination. So where is the difference
between dropping the packet locally or on the network?

> And acquire timer will make sure the task will not hangs indefinitely.
> 

Did you try that? It makes sure that the task wakes up from time to time,
but it goes immediately back to sleep if the needed state is not resolved.
The only terminating contition is when the task gets a signal to exit.

> In xfrm policy queue, XFRM_MAX_QUEUE_LEN is 100, which means 101th skb
> will be dropped, how about make it configurable?

IMO we would have yet another useless knob then. Currently we send all
packets by default to a blackhole as long as the state is not resolved
and most people are fine with it. The queueing is mostly to speed up
tcp handshakes, so 100 packets should be enough. If it really turnes
out that we need more that 100 packets in some cases, we can add a
sysctl then.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-10  8:57   ` Steffen Klassert
@ 2013-10-11  7:18     ` Fan Du
  2013-10-11  9:21       ` Steffen Klassert
  0 siblings, 1 reply; 17+ messages in thread
From: Fan Du @ 2013-10-11  7:18 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: netdev

hehe, I didn't object this cleanup except learning from it :)

On 2013年10月10日 16:57, Steffen Klassert wrote:
> On Thu, Oct 10, 2013 at 03:02:14PM +0800, Fan Du wrote:
>>
>>
>> On 2013年10月10日 14:33, Steffen Klassert wrote:
>>> Does anyone still rely on the ancient sleeping when the SA is in
>>> acquire state? It is disabled by default since more that five years,
>>> but can cause indefinite task hangs if enabled and the needed state
>>> does not get resolved.
>>
>> I saw that "can_sleep" is set true in ip_route_connect which upper layer
>> protocol relies on it, which ensure not dropping *any* skb.
>
> 'Any' means one per task in this context. Also, we can't ensure that
> this packet reaches it's destination. So where is the difference
> between dropping the packet locally or on the network?
>
>> And acquire timer will make sure the task will not hangs indefinitely.
>>
>
> Did you try that? It makes sure that the task wakes up from time to time,
> but it goes immediately back to sleep if the needed state is not resolved.
> The only terminating contition is when the task gets a signal to exit.
>
>> In xfrm policy queue, XFRM_MAX_QUEUE_LEN is 100, which means 101th skb
>> will be dropped, how about make it configurable?
>
> IMO we would have yet another useless knob then. Currently we send all
> packets by default to a blackhole as long as the state is not resolved
> and most people are fine with it. The queueing is mostly to speed up
> tcp handshakes,

I cannot follow on this part. Would you please mind to explain how making a
policy queue will speed up TCP handshakes than orignal CAN_SLEEP mechanism?


Thanks

-- 
浮沉随浪只记今朝笑

--fan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-11  7:18     ` Fan Du
@ 2013-10-11  9:21       ` Steffen Klassert
  0 siblings, 0 replies; 17+ messages in thread
From: Steffen Klassert @ 2013-10-11  9:21 UTC (permalink / raw)
  To: Fan Du; +Cc: netdev

On Fri, Oct 11, 2013 at 03:18:40PM +0800, Fan Du wrote:
> >>In xfrm policy queue, XFRM_MAX_QUEUE_LEN is 100, which means 101th skb
> >>will be dropped, how about make it configurable?
> >
> >IMO we would have yet another useless knob then. Currently we send all
> >packets by default to a blackhole as long as the state is not resolved
> >and most people are fine with it. The queueing is mostly to speed up
> >tcp handshakes,
> 
> I cannot follow on this part. Would you please mind to explain how making a
> policy queue will speed up TCP handshakes than orignal CAN_SLEEP mechanism?
> 

I have not said that queueing is any faster than the sleeping mechanism.
But it is faster than the current default that simply sends the first
packets to a blackhole and can be used regardless if we can sleep.

All I wanted to say is that there are not many packets needed to establish
tcp connections, so I think a 100 packets queue is sufficient.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-10  6:33 [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Steffen Klassert
                   ` (2 preceding siblings ...)
  2013-10-10  7:02 ` [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Fan Du
@ 2013-10-11 19:01 ` David Miller
  2013-10-15  7:30   ` Steffen Klassert
  3 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2013-10-11 19:01 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Thu, 10 Oct 2013 08:33:01 +0200

> Does anyone still rely on the ancient sleeping when the SA is in
> acquire state? It is disabled by default since more that five years,
> but can cause indefinite task hangs if enabled and the needed state
> does not get resolved.
> 
> We now queue packets to the policy if the states are not yet resolved
> if we are in a code path that can not sleep. We could do this even in
> the case we can sleep. As a bonus, we can remove the FLOWI_FLAG_CAN_SLEEP
> flag because the only thing this flag does, is to notify xfrm that we are
> in a codepath that can sleep.
> 
> The two RFC patches to remove the sleeping code are in reply to this
> mail. I'd add this to the ipsec-next tree if there are no objections.

The sleep path has the slight benefit that the TCP retransmit timers
for the initial SYN packet will not be started until the IPSEC rule
is resolved and the SYN actually goes out.

With the packet queue, if the IPSEC resolution is slow then we'll have
spurious SYN retransmits.

It makes no sense for TCP to keep queueing up SYNs if they will just
all get stuck in the packet queue.  The first one is enough.

On the other hand we do want TCP to timeout, we do want the user to
be able to "Ctrl-C" (ie. send a SIGINT) during a connect, etc.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-11 19:01 ` David Miller
@ 2013-10-15  7:30   ` Steffen Klassert
  2013-10-15 23:14     ` David Miller
  2013-10-15 23:58     ` Eric Dumazet
  0 siblings, 2 replies; 17+ messages in thread
From: Steffen Klassert @ 2013-10-15  7:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Fri, Oct 11, 2013 at 03:01:24PM -0400, David Miller wrote:
> From: Steffen Klassert <steffen.klassert@secunet.com>
> Date: Thu, 10 Oct 2013 08:33:01 +0200
> 
> > The two RFC patches to remove the sleeping code are in reply to this
> > mail. I'd add this to the ipsec-next tree if there are no objections.
> 
> The sleep path has the slight benefit that the TCP retransmit timers
> for the initial SYN packet will not be started until the IPSEC rule
> is resolved and the SYN actually goes out.

Yes, that's a slight advantage of the sleeping. But if the IPsec state
does not get resolved for whatever reason, the retransmit timer will
never start. The task will wake up but goes back to sleep immediately
because the needed state is not resolved.

> 
> With the packet queue, if the IPSEC resolution is slow then we'll have
> spurious SYN retransmits.
> 
> It makes no sense for TCP to keep queueing up SYNs if they will just
> all get stuck in the packet queue.  The first one is enough.

Right, that's why I've limited the queue to 100 packets. We can
queue the SYNs of up to 100 tcp connestions that want to use
this IPsec state. It surely can happen that we queue multiple
retransmitted SYNs if the IPsec resolution is slow. But the
queueing code tries at least to get the packets out before
the first tcp retransmit. I think there is still room for
optimizations, maybe reducing the queue lenght or the queue
timeout to avoid queueing retransmitted SYNs as much as possible.

> 
> On the other hand we do want TCP to timeout, we do want the user to
> be able to "Ctrl-C" (ie. send a SIGINT) during a connect, etc.

As mentioned above, tcp does not timeout if the state is not
getting resolved and the task that tried to open the tcp
conection hangs indefinitely.

We could fiddle something to get a terminating condition if the
state is not resolved after some time, but my plan was to disable
the larval_drop sysctl by default some day again. At best without
any notable change to userspace. That's why I would prefer to
remove the sleeping entirely.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-15  7:30   ` Steffen Klassert
@ 2013-10-15 23:14     ` David Miller
  2013-10-15 23:58     ` Eric Dumazet
  1 sibling, 0 replies; 17+ messages in thread
From: David Miller @ 2013-10-15 23:14 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 15 Oct 2013 09:30:20 +0200

> We could fiddle something to get a terminating condition if the
> state is not resolved after some time, but my plan was to disable
> the larval_drop sysctl by default some day again. At best without
> any notable change to userspace. That's why I would prefer to
> remove the sleeping entirely.

Ok I have no problem with removing the sleeping stuff.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-15  7:30   ` Steffen Klassert
  2013-10-15 23:14     ` David Miller
@ 2013-10-15 23:58     ` Eric Dumazet
  2013-10-16  9:45       ` Steffen Klassert
  1 sibling, 1 reply; 17+ messages in thread
From: Eric Dumazet @ 2013-10-15 23:58 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: David Miller, netdev

On Tue, 2013-10-15 at 09:30 +0200, Steffen Klassert wrote:

> Right, that's why I've limited the queue to 100 packets. We can
> queue the SYNs of up to 100 tcp connestions that want to use
> this IPsec state. It surely can happen that we queue multiple
> retransmitted SYNs if the IPsec resolution is slow. But the
> queueing code tries at least to get the packets out before
> the first tcp retransmit. I think there is still room for
> optimizations, maybe reducing the queue lenght or the queue
> timeout to avoid queueing retransmitted SYNs as much as possible.

Note that its totally possible to avoid retransmitting SYN if original
SYN is still in a host queue.

We currently increment a SNMP counter when we detect this, we could
do something else (like not queuing a copy of the packet)

http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=0e280af026a5662ffd57c4e623b822df1f7f47ff

Another work in progress is to delay RTO arming at the time TCP
packet leaves the host queues, instead of at the enqueue time.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
  2013-10-15 23:58     ` Eric Dumazet
@ 2013-10-16  9:45       ` Steffen Klassert
  2013-10-16 11:42         ` [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host Steffen Klassert
  0 siblings, 1 reply; 17+ messages in thread
From: Steffen Klassert @ 2013-10-16  9:45 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Tue, Oct 15, 2013 at 04:58:34PM -0700, Eric Dumazet wrote:
> On Tue, 2013-10-15 at 09:30 +0200, Steffen Klassert wrote:
> 
> > Right, that's why I've limited the queue to 100 packets. We can
> > queue the SYNs of up to 100 tcp connestions that want to use
> > this IPsec state. It surely can happen that we queue multiple
> > retransmitted SYNs if the IPsec resolution is slow. But the
> > queueing code tries at least to get the packets out before
> > the first tcp retransmit. I think there is still room for
> > optimizations, maybe reducing the queue lenght or the queue
> > timeout to avoid queueing retransmitted SYNs as much as possible.
> 
> Note that its totally possible to avoid retransmitting SYN if original
> SYN is still in a host queue.
> 
> We currently increment a SNMP counter when we detect this, we could
> do something else (like not queuing a copy of the packet)
> 
> http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=0e280af026a5662ffd57c4e623b822df1f7f47ff

This is interesting, we could do the same check before we queue
the packet and drop it if the original packet is still in a host
queue.

I'll do a RFC patch.

> 
> Another work in progress is to delay RTO arming at the time TCP
> packet leaves the host queues, instead of at the enqueue time.
> 

That would be even better, are there already patches publicly
available?

Thanks!

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host
  2013-10-16  9:45       ` Steffen Klassert
@ 2013-10-16 11:42         ` Steffen Klassert
  2013-10-18 20:19           ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Steffen Klassert @ 2013-10-16 11:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

It does not make sense to queue retransmitted packets if the
original packet is still in some queue of this host. So add
a check to xdst_queue_output() and drop the packet if the
original packet is not yet sent.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c |    7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ed38d5d..e09edfc 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1832,6 +1832,13 @@ static int xdst_queue_output(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
 	struct xfrm_policy_queue *pq = &xdst->pols[0]->polq;
+	const struct sk_buff *fclone = skb + 1;
+
+	if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+		     fclone->fclone == SKB_FCLONE_CLONE)) {
+		kfree_skb(skb);
+		return 0;
+	}
 
 	if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
 		kfree_skb(skb);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host
  2013-10-16 11:42         ` [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host Steffen Klassert
@ 2013-10-18 20:19           ` David Miller
  2013-10-18 20:23             ` Eric Dumazet
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2013-10-18 20:19 UTC (permalink / raw)
  To: steffen.klassert; +Cc: eric.dumazet, netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 16 Oct 2013 13:42:47 +0200

> It does not make sense to queue retransmitted packets if the
> original packet is still in some queue of this host. So add
> a check to xdst_queue_output() and drop the packet if the
> original packet is not yet sent.
> 
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>

I have no problems with this, what about you Eric?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host
  2013-10-18 20:19           ` David Miller
@ 2013-10-18 20:23             ` Eric Dumazet
  2013-10-18 20:34               ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Eric Dumazet @ 2013-10-18 20:23 UTC (permalink / raw)
  To: David Miller; +Cc: steffen.klassert, netdev

On Fri, 2013-10-18 at 16:19 -0400, David Miller wrote:
> From: Steffen Klassert <steffen.klassert@secunet.com>
> Date: Wed, 16 Oct 2013 13:42:47 +0200
> 
> > It does not make sense to queue retransmitted packets if the
> > original packet is still in some queue of this host. So add
> > a check to xdst_queue_output() and drop the packet if the
> > original packet is not yet sent.
> > 
> > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> 
> I have no problems with this, what about you Eric?

It looks fine to me.

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host
  2013-10-18 20:23             ` Eric Dumazet
@ 2013-10-18 20:34               ` David Miller
  2013-10-21 14:51                 ` Steffen Klassert
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2013-10-18 20:34 UTC (permalink / raw)
  To: eric.dumazet; +Cc: steffen.klassert, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 18 Oct 2013 13:23:45 -0700

> On Fri, 2013-10-18 at 16:19 -0400, David Miller wrote:
>> From: Steffen Klassert <steffen.klassert@secunet.com>
>> Date: Wed, 16 Oct 2013 13:42:47 +0200
>> 
>> > It does not make sense to queue retransmitted packets if the
>> > original packet is still in some queue of this host. So add
>> > a check to xdst_queue_output() and drop the packet if the
>> > original packet is not yet sent.
>> > 
>> > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
>> 
>> I have no problems with this, what about you Eric?
> 
> It looks fine to me.
> 
> Acked-by: Eric Dumazet <edumazet@google.com>

Great, Steffen I assume you'll merge this in via your ipsec tree.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host
  2013-10-18 20:34               ` David Miller
@ 2013-10-21 14:51                 ` Steffen Klassert
  0 siblings, 0 replies; 17+ messages in thread
From: Steffen Klassert @ 2013-10-21 14:51 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev

On Fri, Oct 18, 2013 at 04:34:12PM -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Fri, 18 Oct 2013 13:23:45 -0700
> 
> > On Fri, 2013-10-18 at 16:19 -0400, David Miller wrote:
> >> From: Steffen Klassert <steffen.klassert@secunet.com>
> >> Date: Wed, 16 Oct 2013 13:42:47 +0200
> >> 
> >> > It does not make sense to queue retransmitted packets if the
> >> > original packet is still in some queue of this host. So add
> >> > a check to xdst_queue_output() and drop the packet if the
> >> > original packet is not yet sent.
> >> > 
> >> > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> >> 
> >> I have no problems with this, what about you Eric?
> > 
> > It looks fine to me.
> > 
> > Acked-by: Eric Dumazet <edumazet@google.com>
> 
> Great, Steffen I assume you'll merge this in via your ipsec tree.

Now applied to the ipsec-next tree. Thanks everyone for the review!

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2013-10-21 14:51 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-10  6:33 [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Steffen Klassert
2013-10-10  6:33 ` [PATCH RFC 1/2] xfrm: Remove ancient sleeping when the SA is in acquire state Steffen Klassert
2013-10-10  6:34 ` [PATCH RFC 2/2] net: Remove FLOWI_FLAG_CAN_SLEEP Steffen Klassert
2013-10-10  7:02 ` [PATCH RFC 0/2] xfrm: Remove ancient sleeping code Fan Du
2013-10-10  8:57   ` Steffen Klassert
2013-10-11  7:18     ` Fan Du
2013-10-11  9:21       ` Steffen Klassert
2013-10-11 19:01 ` David Miller
2013-10-15  7:30   ` Steffen Klassert
2013-10-15 23:14     ` David Miller
2013-10-15 23:58     ` Eric Dumazet
2013-10-16  9:45       ` Steffen Klassert
2013-10-16 11:42         ` [PATCH RFC] xfrm: Don't queue retransmitted packets if the original is still on the host Steffen Klassert
2013-10-18 20:19           ` David Miller
2013-10-18 20:23             ` Eric Dumazet
2013-10-18 20:34               ` David Miller
2013-10-21 14:51                 ` Steffen Klassert

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.