netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* xfrm, ip tunnel: non released device reference upon device unregistration
@ 2018-02-04 11:21 Eyal Birger
  2018-02-04 13:32 ` Eyal Birger
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Eyal Birger @ 2018-02-04 11:21 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev, herbert, davem, shmulik

Hi,

We've encountered a non released device reference upon device
unregistration which seems to stem from xfrm policy code.

The setup includes:
- an underlay device (e.g. eth0) using IPv4
- an xfrm IPv6 over IPv4 tunnel routed via the underlay device
- an ipip6 tunnel over the xfrm IPv6 tunnel

When tearing down the underlay device, after traffic had passed via the ipip6
tunnel, log messages of the following form are observed:

unregister_netdevice: waiting for eth0 to become free. Usage count = 2

The below synthetic script reproduces this consistently on a fresh ubuntu vm
running net-next v4.15-6066-ge9522a5:
---------------------------------------------------------
#!/bin/bash

ipsec_underlay_dst=192.168.6.1
ipsec_underlay_src=192.168.5.2
ipv6_pfx=1234
local_ipv6_addr="$ipv6_pfx::1"
remote_ipv6_addr="$ipv6_pfx::2"

# create dummy ipsec underlay
ip l add dev dummy1 type dummy
ip l set dev dummy1 up
ip r add "$ipsec_underlay_dst/32" dev dummy1
ip -6 r add "$ipv6_pfx::/16" dev dummy1

ip a add dev dummy1 "$local_ipv6_addr/128"
ip a add dev dummy1 "$ipsec_underlay_src/24"

# add xfrm policy and state
ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tunnel
ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp spi 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' 0x34a546d309031628962b814ef073aff1a638ad21 96 enc 'cbc(aes)' 0xf31e14149c328297fe7925ad7448420e encap espinudp 4500 4500 0.0.0.0

# add 4o6 tunnel
ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote "$remote_ipv6_addr"
ip l set dev tnl46 up
ip r add 10.64.0.0/10 dev tnl46 

# pass traffic so route is cached
ping -w 1 -c 1 10.64.0.1

# remove dummy underlay
ip l del dummy1
---------------------------------------------------------

Analysis:

ip6_tunnel holds a dst_cache which caches its underlay dst objects.
When devices are unregistered, non-xfrm dst objects are invlidated by their
original creators (ipv4/ipv6/...) and thus are wiped from dst_cache.

xfrm created routes otoh are not tracked by xfrm, and are not invalidated upon
device unregistration, thus hold the device upon unregistration.

The following rough sketch patch illustrates an approach overcoming this
issue:
---------------------------------------------------------
From e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal@metanetworks.com>
Date: Sun, 4 Feb 2018 13:08:02 +0200
Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device
 unregistration

---
 include/net/xfrm.h     | 10 ++-----
 net/xfrm/xfrm_policy.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 7 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 7d20776..c1c8493 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -994,6 +994,8 @@ struct xfrm_dst {
 	u32 child_mtu_cached;
 	u32 route_cookie;
 	u32 path_cookie;
+	struct list_head xfrm_dst_gc;
+	struct xfrm_dst_list *xfrm_dst_gc_list;
 };
 
 static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
@@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *c
 	xdst->child = child;
 }
 
-static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
-{
-	xfrm_pols_put(xdst->pols, xdst->num_pols);
-	dst_release(xdst->route);
-	if (likely(xdst->u.dst.xfrm))
-		xfrm_state_put(xdst->u.dst.xfrm);
-}
+void xfrm_dst_destroy(struct xfrm_dst *xdst);
 #endif
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078..d446641 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
 		(fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
 }
 
+struct xfrm_dst_list {
+	spinlock_t		lock;
+	struct list_head	head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list);
+
+static void xfrm_add_dst_list(struct xfrm_dst *xdst)
+{
+	struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list);
+
+	xdst->xfrm_dst_gc_list = xl;
+
+	spin_lock_bh(&xl->lock);
+	list_add_tail(&xdst->xfrm_dst_gc, &xl->head);
+	spin_unlock_bh(&xl->lock);
+}
+
+void xfrm_dst_destroy(struct xfrm_dst *xdst)
+{
+	xfrm_pols_put(xdst->pols, xdst->num_pols);
+	dst_release(xdst->route);
+	if (likely(xdst->u.dst.xfrm))
+		xfrm_state_put(xdst->u.dst.xfrm);
+	if (!list_empty(&xdst->xfrm_dst_gc)) {
+		struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list;
+
+		spin_lock_bh(&xl->lock);
+		list_del(&xdst->xfrm_dst_gc);
+		spin_unlock_bh(&xl->lock);
+	}
+}
+EXPORT_SYMBOL_GPL(xfrm_dst_destroy);
+
+void xfrm_flush_dev(struct net_device *dev)
+{
+	struct xfrm_dst *xdst;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
+
+		spin_lock_bh(&xl->lock);
+		list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) {
+			if (xdst->u.dst.dev != dev)
+				continue;
+			dst_dev_put(&xdst->u.dst);
+		}
+		spin_unlock_bh(&xl->lock);
+	}
+}
+
 bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
 			 unsigned short family)
 {
@@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		}
 
 		bundle[i] = xdst;
+		xfrm_add_dst_list(xdst);
+
 		if (!xdst_prev)
 			xdst0 = xdst;
 		else
@@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
 	.exit = xfrm_net_exit,
 };
 
+static int xfrm_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event == NETDEV_UNREGISTER)
+		xfrm_flush_dev(dev);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block xfrm_netdev_notifier = {
+	.notifier_call = xfrm_netdev_event,
+};
+
 void __init xfrm_init(void)
 {
+	int cpu;
 	int i;
 
 	xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
@@ -2998,6 +3066,13 @@ void __init xfrm_init(void)
 	register_pernet_subsys(&xfrm_net_ops);
 	seqcount_init(&xfrm_policy_hash_generation);
 	xfrm_input_init();
+	for_each_possible_cpu(cpu) {
+		struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
+
+		INIT_LIST_HEAD(&xl->head);
+		spin_lock_init(&xl->lock);
+	}
+	register_netdevice_notifier(&xfrm_netdev_notifier);
 }
 
 #ifdef CONFIG_AUDITSYSCALL
-- 
2.7.4

---------------------------------------------------------

This approach has the unfortunate side effects of adding a spin lock for the
tracked list, as well as increasing struct xfrm_dst.

Any thoughts on how to best approach this within xfrm would be most welcomed.

Thanks,
Eyal.

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-04 11:21 xfrm, ip tunnel: non released device reference upon device unregistration Eyal Birger
@ 2018-02-04 13:32 ` Eyal Birger
  2018-02-06  8:53 ` Steffen Klassert
  2018-02-06 12:56 ` Florian Westphal
  2 siblings, 0 replies; 12+ messages in thread
From: Eyal Birger @ 2018-02-04 13:32 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev, herbert, davem, shmulik

On Sun, 4 Feb 2018 13:21:18 +0200
Eyal Birger <eyal.birger@gmail.com> wrote:

> Hi,
> 
> We've encountered a non released device reference upon device
> unregistration which seems to stem from xfrm policy code.
> 
> The setup includes:
> - an underlay device (e.g. eth0) using IPv4
> - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> - an ipip6 tunnel over the xfrm IPv6 tunnel
> 
> When tearing down the underlay device, after traffic had passed via
> the ipip6 tunnel, log messages of the following form are observed:
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 2
> 
> The below synthetic script reproduces this consistently on a fresh
> ubuntu vm running net-next v4.15-6066-ge9522a5:

Minor correction: net-next version v4.15-10442-g617aebe

Eyal.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-04 11:21 xfrm, ip tunnel: non released device reference upon device unregistration Eyal Birger
  2018-02-04 13:32 ` Eyal Birger
@ 2018-02-06  8:53 ` Steffen Klassert
  2018-02-06 10:32   ` Florian Westphal
  2018-02-06 10:42   ` Eyal Birger
  2018-02-06 12:56 ` Florian Westphal
  2 siblings, 2 replies; 12+ messages in thread
From: Steffen Klassert @ 2018-02-06  8:53 UTC (permalink / raw)
  To: Eyal Birger; +Cc: netdev, herbert, davem, shmulik, Wei Wang

Cc Wei Wang

On Sun, Feb 04, 2018 at 01:21:18PM +0200, Eyal Birger wrote:
> Hi,
> 
> We've encountered a non released device reference upon device
> unregistration which seems to stem from xfrm policy code.
> 
> The setup includes:
> - an underlay device (e.g. eth0) using IPv4
> - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> - an ipip6 tunnel over the xfrm IPv6 tunnel
> 
> When tearing down the underlay device, after traffic had passed via the ipip6
> tunnel, log messages of the following form are observed:
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 2

Looks like this happened when the dst garbage collection code was
removed. I could not point to a commit that introduced it so I
did a bisection and this pointed to:

commit 9514528d92d4cbe086499322370155ed69f5d06c
ipv6: call dst_dev_put() properly

With this commit we leak the one refcount and some further commit
leaked the second one.

> 
> The below synthetic script reproduces this consistently on a fresh ubuntu vm
> running net-next v4.15-6066-ge9522a5:
> ---------------------------------------------------------
> #!/bin/bash
> 
> ipsec_underlay_dst=192.168.6.1
> ipsec_underlay_src=192.168.5.2
> ipv6_pfx=1234
> local_ipv6_addr="$ipv6_pfx::1"
> remote_ipv6_addr="$ipv6_pfx::2"
> 
> # create dummy ipsec underlay
> ip l add dev dummy1 type dummy
> ip l set dev dummy1 up
> ip r add "$ipsec_underlay_dst/32" dev dummy1
> ip -6 r add "$ipv6_pfx::/16" dev dummy1
> 
> ip a add dev dummy1 "$local_ipv6_addr/128"
> ip a add dev dummy1 "$ipsec_underlay_src/24"
> 
> # add xfrm policy and state
> ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tunnel
> ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp spi 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' 0x34a546d309031628962b814ef073aff1a638ad21 96 enc 'cbc(aes)' 0xf31e14149c328297fe7925ad7448420e encap espinudp 4500 4500 0.0.0.0
> 
> # add 4o6 tunnel
> ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote "$remote_ipv6_addr"
> ip l set dev tnl46 up
> ip r add 10.64.0.0/10 dev tnl46 
> 
> # pass traffic so route is cached
> ping -w 1 -c 1 10.64.0.1
> 
> # remove dummy underlay
> ip l del dummy1
> ---------------------------------------------------------
> 
> Analysis:
> 
> ip6_tunnel holds a dst_cache which caches its underlay dst objects.
> When devices are unregistered, non-xfrm dst objects are invlidated by their
> original creators (ipv4/ipv6/...) and thus are wiped from dst_cache.
> 
> xfrm created routes otoh are not tracked by xfrm, and are not invalidated upon
> device unregistration, thus hold the device upon unregistration.
> 
> The following rough sketch patch illustrates an approach overcoming this
> issue:
> ---------------------------------------------------------
> >From e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001
> From: Eyal Birger <eyal@metanetworks.com>
> Date: Sun, 4 Feb 2018 13:08:02 +0200
> Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device
>  unregistration
> 
> ---
>  include/net/xfrm.h     | 10 ++-----
>  net/xfrm/xfrm_policy.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 78 insertions(+), 7 deletions(-)
> 
> diff --git a/include/net/xfrm.h b/include/net/xfrm.h
> index 7d20776..c1c8493 100644
> --- a/include/net/xfrm.h
> +++ b/include/net/xfrm.h
> @@ -994,6 +994,8 @@ struct xfrm_dst {
>  	u32 child_mtu_cached;
>  	u32 route_cookie;
>  	u32 path_cookie;
> +	struct list_head xfrm_dst_gc;
> +	struct xfrm_dst_list *xfrm_dst_gc_list;
>  };
>  
>  static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
> @@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *c
>  	xdst->child = child;
>  }
>  
> -static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
> -{
> -	xfrm_pols_put(xdst->pols, xdst->num_pols);
> -	dst_release(xdst->route);
> -	if (likely(xdst->u.dst.xfrm))
> -		xfrm_state_put(xdst->u.dst.xfrm);
> -}
> +void xfrm_dst_destroy(struct xfrm_dst *xdst);
>  #endif
>  
>  void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> index 7a23078..d446641 100644
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
>  		(fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
>  }
>  
> +struct xfrm_dst_list {
> +	spinlock_t		lock;
> +	struct list_head	head;
> +};
> +
> +static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list);
> +
> +static void xfrm_add_dst_list(struct xfrm_dst *xdst)
> +{
> +	struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list);
> +
> +	xdst->xfrm_dst_gc_list = xl;
> +
> +	spin_lock_bh(&xl->lock);
> +	list_add_tail(&xdst->xfrm_dst_gc, &xl->head);
> +	spin_unlock_bh(&xl->lock);
> +}
> +
> +void xfrm_dst_destroy(struct xfrm_dst *xdst)
> +{
> +	xfrm_pols_put(xdst->pols, xdst->num_pols);
> +	dst_release(xdst->route);
> +	if (likely(xdst->u.dst.xfrm))
> +		xfrm_state_put(xdst->u.dst.xfrm);
> +	if (!list_empty(&xdst->xfrm_dst_gc)) {
> +		struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list;
> +
> +		spin_lock_bh(&xl->lock);
> +		list_del(&xdst->xfrm_dst_gc);
> +		spin_unlock_bh(&xl->lock);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(xfrm_dst_destroy);
> +
> +void xfrm_flush_dev(struct net_device *dev)
> +{
> +	struct xfrm_dst *xdst;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
> +
> +		spin_lock_bh(&xl->lock);
> +		list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) {
> +			if (xdst->u.dst.dev != dev)
> +				continue;
> +			dst_dev_put(&xdst->u.dst);
> +		}
> +		spin_unlock_bh(&xl->lock);
> +	}
> +}
> +
>  bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
>  			 unsigned short family)
>  {
> @@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
>  		}
>  
>  		bundle[i] = xdst;
> +		xfrm_add_dst_list(xdst);
> +
>  		if (!xdst_prev)
>  			xdst0 = xdst;
>  		else
> @@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
>  	.exit = xfrm_net_exit,
>  };
>  
> +static int xfrm_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
> +{
> +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +
> +	if (event == NETDEV_UNREGISTER)
> +		xfrm_flush_dev(dev);
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block xfrm_netdev_notifier = {
> +	.notifier_call = xfrm_netdev_event,
> +};
> +
>  void __init xfrm_init(void)
>  {
> +	int cpu;
>  	int i;
>  
>  	xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
> @@ -2998,6 +3066,13 @@ void __init xfrm_init(void)
>  	register_pernet_subsys(&xfrm_net_ops);
>  	seqcount_init(&xfrm_policy_hash_generation);
>  	xfrm_input_init();
> +	for_each_possible_cpu(cpu) {
> +		struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
> +
> +		INIT_LIST_HEAD(&xl->head);
> +		spin_lock_init(&xl->lock);
> +	}
> +	register_netdevice_notifier(&xfrm_netdev_notifier);
>  }
>  
>  #ifdef CONFIG_AUDITSYSCALL
> -- 
> 2.7.4
> 
> ---------------------------------------------------------
> 
> This approach has the unfortunate side effects of adding a spin lock for the
> tracked list, as well as increasing struct xfrm_dst.

Reintroducing garbage collection is probably not a so good idea. I think
the patch below should fix it a bit less intrusive.


Subject: [PATCH RFC] xfrm: Fix netdev refcount leak when flushing the percpu dst cache.

The dst garbage collection code is removed, so we need to call
dst_dev_put() on cached dst entries before we release them.
Otherwise we leak the refcount to the netdev.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078132cf..7836b7601b49 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1715,8 +1715,10 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
 static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
 {
 	this_cpu_write(xfrm_last_dst, xdst);
-	if (old)
+	if (old) {
+		dst_dev_put(&old->u.dst);
 		dst_release(&old->u.dst);
+	}
 }
 
 static void __xfrm_pcpu_work_fn(void)
@@ -1787,6 +1789,7 @@ void xfrm_policy_cache_flush(void)
 		old = per_cpu(xfrm_last_dst, cpu);
 		if (old && !xfrm_bundle_ok(old)) {
 			per_cpu(xfrm_last_dst, cpu) = NULL;
+			dst_dev_put(&old->u.dst);
 			dst_release(&old->u.dst);
 		}
 		rcu_read_unlock();
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-06  8:53 ` Steffen Klassert
@ 2018-02-06 10:32   ` Florian Westphal
  2018-02-06 10:42   ` Eyal Birger
  1 sibling, 0 replies; 12+ messages in thread
From: Florian Westphal @ 2018-02-06 10:32 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Eyal Birger, netdev, herbert, davem, shmulik, Wei Wang

Steffen Klassert <steffen.klassert@secunet.com> wrote:
> Cc Wei Wang
> 
> On Sun, Feb 04, 2018 at 01:21:18PM +0200, Eyal Birger wrote:
> > Hi,
> > 
> > We've encountered a non released device reference upon device
> > unregistration which seems to stem from xfrm policy code.
> > 
> > The setup includes:
> > - an underlay device (e.g. eth0) using IPv4
> > - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> > - an ipip6 tunnel over the xfrm IPv6 tunnel
> > 
> > When tearing down the underlay device, after traffic had passed via the ipip6
> > tunnel, log messages of the following form are observed:
> > 
> > unregister_netdevice: waiting for eth0 to become free. Usage count = 2
> 
> Looks like this happened when the dst garbage collection code was
> removed. I could not point to a commit that introduced it so I
> did a bisection and this pointed to:
> 
> commit 9514528d92d4cbe086499322370155ed69f5d06c
> ipv6: call dst_dev_put() properly
> 
> With this commit we leak the one refcount and some further commit
> leaked the second one.
> Subject: [PATCH RFC] xfrm: Fix netdev refcount leak when flushing the percpu dst cache.
> 
> The dst garbage collection code is removed, so we need to call
> dst_dev_put() on cached dst entries before we release them.
> Otherwise we leak the refcount to the netdev.

I don't think this is related to the xfrm pcpu cache at all.

AFAIU any xfrm dst that gets cached in a tunnel dst cache will
hold the device reference.

Perhaps its best to add a device notifier to the tunnel code
and put device refcount there.

I'll try to come up with a patch *unless* I'm wrong and this is
really just because of xfrm pcpu cache.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-06  8:53 ` Steffen Klassert
  2018-02-06 10:32   ` Florian Westphal
@ 2018-02-06 10:42   ` Eyal Birger
  1 sibling, 0 replies; 12+ messages in thread
From: Eyal Birger @ 2018-02-06 10:42 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: netdev, herbert, davem, shmulik, Wei Wang, fw

Hi Steffen,

On Tue, 6 Feb 2018 09:53:38 +0100
Steffen Klassert <steffen.klassert@secunet.com> wrote:

> Cc Wei Wang
> 
> On Sun, Feb 04, 2018 at 01:21:18PM +0200, Eyal Birger wrote:
> > Hi,
> > 
> > We've encountered a non released device reference upon device
> > unregistration which seems to stem from xfrm policy code.
> > 
> > The setup includes:
> > - an underlay device (e.g. eth0) using IPv4
> > - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> > - an ipip6 tunnel over the xfrm IPv6 tunnel
> > 
> > When tearing down the underlay device, after traffic had passed via
> > the ipip6 tunnel, log messages of the following form are observed:
> > 
> > unregister_netdevice: waiting for eth0 to become free. Usage count
> > = 2  
> 
> Looks like this happened when the dst garbage collection code was
> removed. I could not point to a commit that introduced it so I
> did a bisection and this pointed to:
> 
> commit 9514528d92d4cbe086499322370155ed69f5d06c
> ipv6: call dst_dev_put() properly
> 
> With this commit we leak the one refcount and some further commit
> leaked the second one.
> 
> > 
> > The below synthetic script reproduces this consistently on a fresh
> > ubuntu vm running net-next v4.15-6066-ge9522a5:
> > ---------------------------------------------------------
> > #!/bin/bash
> > 
> > ipsec_underlay_dst=192.168.6.1
> > ipsec_underlay_src=192.168.5.2
> > ipv6_pfx=1234
> > local_ipv6_addr="$ipv6_pfx::1"
> > remote_ipv6_addr="$ipv6_pfx::2"
> > 
> > # create dummy ipsec underlay
> > ip l add dev dummy1 type dummy
> > ip l set dev dummy1 up
> > ip r add "$ipsec_underlay_dst/32" dev dummy1
> > ip -6 r add "$ipv6_pfx::/16" dev dummy1
> > 
> > ip a add dev dummy1 "$local_ipv6_addr/128"
> > ip a add dev dummy1 "$ipsec_underlay_src/24"
> > 
> > # add xfrm policy and state
> > ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out
> > tmpl src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp
> > reqid 1 mode tunnel ip x s add src "$ipsec_underlay_src" dst
> > "$ipsec_underlay_dst" proto esp spi 0xcd440ce6 reqid 1 mode tunnel
> > auth-trunc 'hmac(sha1)' 0x34a546d309031628962b814ef073aff1a638ad21
> > 96 enc 'cbc(aes)' 0xf31e14149c328297fe7925ad7448420e encap espinudp
> > 4500 4500 0.0.0.0
> > 
> > # add 4o6 tunnel
> > ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr"
> > remote "$remote_ipv6_addr" ip l set dev tnl46 up
> > ip r add 10.64.0.0/10 dev tnl46 
> > 
> > # pass traffic so route is cached
> > ping -w 1 -c 1 10.64.0.1
> > 
> > # remove dummy underlay
> > ip l del dummy1
> > ---------------------------------------------------------
> > 
> > Analysis:
> > 
> > ip6_tunnel holds a dst_cache which caches its underlay dst objects.
> > When devices are unregistered, non-xfrm dst objects are invlidated
> > by their original creators (ipv4/ipv6/...) and thus are wiped from
> > dst_cache.
> > 
> > xfrm created routes otoh are not tracked by xfrm, and are not
> > invalidated upon device unregistration, thus hold the device upon
> > unregistration.
> > 
> > The following rough sketch patch illustrates an approach overcoming
> > this issue:
> > ---------------------------------------------------------  

[snip]

> > ---------------------------------------------------------
> > 
> > This approach has the unfortunate side effects of adding a spin
> > lock for the tracked list, as well as increasing struct xfrm_dst.  
> 
> Reintroducing garbage collection is probably not a so good idea. I
> think the patch below should fix it a bit less intrusive.
> 
> 
> Subject: [PATCH RFC] xfrm: Fix netdev refcount leak when flushing the
> percpu dst cache.
> 
> The dst garbage collection code is removed, so we need to call
> dst_dev_put() on cached dst entries before we release them.
> Otherwise we leak the refcount to the netdev.
> 
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> ---
>  net/xfrm/xfrm_policy.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> index 7a23078132cf..7836b7601b49 100644
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -1715,8 +1715,10 @@ static int xfrm_expand_policies(const struct
> flowi *fl, u16 family, static void xfrm_last_dst_update(struct
> xfrm_dst *xdst, struct xfrm_dst *old) {
>  	this_cpu_write(xfrm_last_dst, xdst);
> -	if (old)
> +	if (old) {
> +		dst_dev_put(&old->u.dst);
>  		dst_release(&old->u.dst);
> +	}
>  }
>  
>  static void __xfrm_pcpu_work_fn(void)
> @@ -1787,6 +1789,7 @@ void xfrm_policy_cache_flush(void)
>  		old = per_cpu(xfrm_last_dst, cpu);
>  		if (old && !xfrm_bundle_ok(old)) {
>  			per_cpu(xfrm_last_dst, cpu) = NULL;
> +			dst_dev_put(&old->u.dst);
>  			dst_release(&old->u.dst);
>  		}
>  		rcu_read_unlock();
I have tested this and indeed it prevents the leak.

But... IIUC the xfrm_last_dst cache is a single instance that is updated
every time a new bundle is created, whereas ip6_tunnel uses a different
dst_cache for each tunnel.

Invalidating the dst every time a new bundle is created effectively means
that in a multiple tunnels scenario (multiple ip6_tunnels over multiple
xfrm policies) there is only one active ip6_tunnel dst_cache at a time.

In case multiple tunnels are used at the same times, I think this
essentially renders the ip6_tunnel dst_cache useless.

Eyal.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-04 11:21 xfrm, ip tunnel: non released device reference upon device unregistration Eyal Birger
  2018-02-04 13:32 ` Eyal Birger
  2018-02-06  8:53 ` Steffen Klassert
@ 2018-02-06 12:56 ` Florian Westphal
  2018-02-06 13:09   ` Steffen Klassert
  2 siblings, 1 reply; 12+ messages in thread
From: Florian Westphal @ 2018-02-06 12:56 UTC (permalink / raw)
  To: Eyal Birger; +Cc: steffen.klassert, netdev, herbert, davem, shmulik

Eyal Birger <eyal.birger@gmail.com> wrote:
> Hi,
> 
> We've encountered a non released device reference upon device
> unregistration which seems to stem from xfrm policy code.
> 
> The setup includes:
> - an underlay device (e.g. eth0) using IPv4
> - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> - an ipip6 tunnel over the xfrm IPv6 tunnel
> 
> When tearing down the underlay device, after traffic had passed via the ipip6
> tunnel, log messages of the following form are observed:
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 2

This looks like it might be related to a bug reported by Hangbin Liu.

Setup is:

       Host A                --        Host B
tun0 -- br0 (ipsec) -- eth0  --  eth0 (ipsec) -- tun0

... where tun0 are ipip tunnels.

module removal then hangs forever as device refcount is held by
dst_cache.
Hangbin gave following reproducer script:

Host B:
ip link add tun0 type ipip local 192.168.7.1 remote 192.168.7.10
ip link set tun0 up
ip addr add 10.0.0.2/24 dev tun0
ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000 proto ah
auth sha1 beef_fish_pork_salad mode transport
ip xfrm state add src 192.168.7.1 dst 192.168.7.1 spi 1000 proto ah auth
sha1 beef_fish_pork_salad mode transport
ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir out tmpl src
192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport
ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir in tmpl src
192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport level use

On Host A (tun0 10.0.0.1, br0 192.168.7.10):
ip addr flush dev eth0
brctl addbr br0
brctl addif br0 eth0
ip link set br0 up
ip addr add 192.168.7.10/24 dev br0

ip link add tun0 type ipip local 192.168.7.10 remote 192.168.7.1
ip link set tun0 up
ip addr add 10.0.0.1/24 dev tun0

ip xfrm state add src 192.168.7.10 dst 192.168.7.1 spi 1000
proto ah auth sha1 beef_fish_pork_salad mode transport
ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000
proto ah auth sha1 beef_fish_pork_salad mode transport
ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir out tmpl
src 192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport
ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir in tmpl src 192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport level use

# should work, else something is broken:
ping 10.0.0.2 -c 2

# This hangs:
modprobe -r bridge

This hangs even when I remove the ipsec xfrm pcpu cache.

What does work in above setup is this:

diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index 72fd5067c353..d5bcd6ea77f3 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -95,4 +95,13 @@ int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);
  */
 void dst_cache_destroy(struct dst_cache *dst_cache);
 
+/**
+ *	dst_cache_clear_dev - put device from cached tunnel
+ *	@dst_cache: the cache
+ *	@dev: device being removed
+ *
+ *	puts the device and resets the cache in case the device is
+ *	used by the cached dst.
+ */
+void dst_cache_clear_dev(struct dst_cache *dstcache, const struct net_device *dev);
 #endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 1f16773cfd76..4e337b99fa96 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -289,6 +289,8 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p, __u32 fwmark);
 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
+void ip_tunnel_device_down(const struct net_device *dev, unsigned int net_id);
+
 struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 554d36449231..56cafe08756d 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -166,3 +166,32 @@ void dst_cache_destroy(struct dst_cache *dst_cache)
 	free_percpu(dst_cache->cache);
 }
 EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_clear_dev(struct dst_cache *dst_cache, const struct net_device *dev)
+{
+	struct dst_entry *dst;
+	bool reset = false;
+	int i;
+
+	if (!dst_cache->cache)
+		return;
+
+	rcu_read_lock();
+	for_each_possible_cpu(i) {
+		dst = per_cpu_ptr(dst_cache->cache, i)->dst;
+		if (!dst)
+			continue;
+
+		if (!dst_hold_safe(dst))
+			continue;
+
+		if (dst->dev == dev) {
+			dst_dev_put(dst);
+			reset = true;
+		}
+		dst_release(dst);
+	}
+	rcu_read_unlock();
+	if (reset)
+		dst_cache_reset(dst_cache);
+}
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d786a8441bce..cea578190258 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1224,4 +1224,25 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
 
+void ip_tunnel_device_down(const struct net_device *dev, unsigned int id)
+{
+	struct net *net = dev_net(dev);
+	struct ip_tunnel_net *itn = net_generic(net,  id);
+	unsigned int i;
+
+	ASSERT_RTNL();
+
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++) {
+		struct ip_tunnel *t;
+		struct hlist_node *n;
+		struct hlist_head *thead = &itn->tunnels[i];
+
+		hlist_for_each_entry_safe(t, n, thead, hash_node)
+			dst_cache_clear_dev(&t->dst_cache, dev);
+
+		cond_resched();
+	}
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_device_down);
+
 MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..da9640e082eb 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -357,6 +357,21 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return 0;
 }
 
+int ipip_tunnel_device_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_DOWN:
+		ip_tunnel_device_down(dev, ipip_net_id);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(ipip_tunnel_device_event);
+
 static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_init       = ipip_tunnel_init,
 	.ndo_uninit     = ip_tunnel_uninit,
@@ -671,6 +686,10 @@ static struct pernet_operations ipip_net_ops = {
 	.size = sizeof(struct ip_tunnel_net),
 };
 
+static struct notifier_block ipip_device_notifier = {
+	.notifier_call	= ipip_tunnel_device_event,
+};
+
 static int __init ipip_init(void)
 {
 	int err;
@@ -696,6 +715,11 @@ static int __init ipip_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	err = register_netdevice_notifier(&ipip_device_notifier);
+	if (err < 0) {
+		rtnl_link_unregister(&ipip_link_ops);
+		goto rtnl_link_failed;
+	}
 out:
 	return err;
 
@@ -713,6 +737,7 @@ static int __init ipip_init(void)
 
 static void __exit ipip_fini(void)
 {
+	unregister_netdevice_notifier(&ipip_device_notifier);
 	rtnl_link_unregister(&ipip_link_ops);
 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
 		pr_info("%s: can't deregister tunnel\n", __func__);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-06 12:56 ` Florian Westphal
@ 2018-02-06 13:09   ` Steffen Klassert
  2018-02-06 13:15     ` Florian Westphal
  0 siblings, 1 reply; 12+ messages in thread
From: Steffen Klassert @ 2018-02-06 13:09 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Eyal Birger, netdev, herbert, davem, shmulik

On Tue, Feb 06, 2018 at 01:56:24PM +0100, Florian Westphal wrote:
> Eyal Birger <eyal.birger@gmail.com> wrote:
> > Hi,
> > 
> > We've encountered a non released device reference upon device
> > unregistration which seems to stem from xfrm policy code.
> > 
> > The setup includes:
> > - an underlay device (e.g. eth0) using IPv4
> > - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> > - an ipip6 tunnel over the xfrm IPv6 tunnel
> > 
> > When tearing down the underlay device, after traffic had passed via the ipip6
> > tunnel, log messages of the following form are observed:
> > 
> > unregister_netdevice: waiting for eth0 to become free. Usage count = 2
> 
> This looks like it might be related to a bug reported by Hangbin Liu.
> 
> Setup is:
> 
>        Host A                --        Host B
> tun0 -- br0 (ipsec) -- eth0  --  eth0 (ipsec) -- tun0
> 
> ... where tun0 are ipip tunnels.
> 
> module removal then hangs forever as device refcount is held by
> dst_cache.
> Hangbin gave following reproducer script:
> 
> Host B:
> ip link add tun0 type ipip local 192.168.7.1 remote 192.168.7.10
> ip link set tun0 up
> ip addr add 10.0.0.2/24 dev tun0
> ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000 proto ah
> auth sha1 beef_fish_pork_salad mode transport
> ip xfrm state add src 192.168.7.1 dst 192.168.7.1 spi 1000 proto ah auth
> sha1 beef_fish_pork_salad mode transport
> ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir out tmpl src
> 192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport
> ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir in tmpl src
> 192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport level use
> 
> On Host A (tun0 10.0.0.1, br0 192.168.7.10):
> ip addr flush dev eth0
> brctl addbr br0
> brctl addif br0 eth0
> ip link set br0 up
> ip addr add 192.168.7.10/24 dev br0
> 
> ip link add tun0 type ipip local 192.168.7.10 remote 192.168.7.1
> ip link set tun0 up
> ip addr add 10.0.0.1/24 dev tun0
> 
> ip xfrm state add src 192.168.7.10 dst 192.168.7.1 spi 1000
> proto ah auth sha1 beef_fish_pork_salad mode transport
> ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000
> proto ah auth sha1 beef_fish_pork_salad mode transport
> ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir out tmpl
> src 192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport
> ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir in tmpl src 192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport level use
> 
> # should work, else something is broken:
> ping 10.0.0.2 -c 2
> 
> # This hangs:
> modprobe -r bridge
> 
> This hangs even when I remove the ipsec xfrm pcpu cache.
> 
> What does work in above setup is this:

I gave the patch a quick try, but still I get this:

unregister_netdevice: waiting for dummy1 to become free. Usage count = 2

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-06 13:09   ` Steffen Klassert
@ 2018-02-06 13:15     ` Florian Westphal
  2018-02-06 13:21       ` Steffen Klassert
  2018-02-06 19:19       ` Eyal Birger
  0 siblings, 2 replies; 12+ messages in thread
From: Florian Westphal @ 2018-02-06 13:15 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Florian Westphal, Eyal Birger, netdev, herbert, davem, shmulik

Steffen Klassert <steffen.klassert@secunet.com> wrote:
> I gave the patch a quick try, but still I get this:
> 
> unregister_netdevice: waiting for dummy1 to become free. Usage count = 2

Was that with Eyals setup or the bridge one I posted?

If it was Eyals setup, its possible the patch missed hookup
to whatever tunnel infra is used (the setup I used has ipip tunnel,
everything is ipv4).

Also, perhaps it would be best to not bother with checking the
device in question at all and unconditionally put device reference
of all the dst_caches.  With setups that have e.g. 1k devices going down
per second (ppp dialin and the like) doing the full search for every
notify event would be rather expensive.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-06 13:15     ` Florian Westphal
@ 2018-02-06 13:21       ` Steffen Klassert
  2018-02-06 19:19       ` Eyal Birger
  1 sibling, 0 replies; 12+ messages in thread
From: Steffen Klassert @ 2018-02-06 13:21 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Eyal Birger, netdev, herbert, davem, shmulik

On Tue, Feb 06, 2018 at 02:15:09PM +0100, Florian Westphal wrote:
> Steffen Klassert <steffen.klassert@secunet.com> wrote:
> > I gave the patch a quick try, but still I get this:
> > 
> > unregister_netdevice: waiting for dummy1 to become free. Usage count = 2
> 
> Was that with Eyals setup or the bridge one I posted?

It was with Eyals setup, sorry for not mentioning this.

> 
> If it was Eyals setup, its possible the patch missed hookup
> to whatever tunnel infra is used (the setup I used has ipip tunnel,
> everything is ipv4).

That might be, some IPv6 is involved with Eyals setup.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-06 13:15     ` Florian Westphal
  2018-02-06 13:21       ` Steffen Klassert
@ 2018-02-06 19:19       ` Eyal Birger
  2018-02-11 15:46         ` Florian Westphal
  1 sibling, 1 reply; 12+ messages in thread
From: Eyal Birger @ 2018-02-06 19:19 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Steffen Klassert, netdev, herbert, davem, shmulik

On Tue, 6 Feb 2018 14:15:09 +0100
Florian Westphal <fw@strlen.de> wrote:

> Steffen Klassert <steffen.klassert@secunet.com> wrote:
> > I gave the patch a quick try, but still I get this:
> > 
> > unregister_netdevice: waiting for dummy1 to become free. Usage
> > count = 2  
> 
> Was that with Eyals setup or the bridge one I posted?
> 
> If it was Eyals setup, its possible the patch missed hookup
> to whatever tunnel infra is used (the setup I used has ipip tunnel,
> everything is ipv4).
> 

Thanks!

Indeed the setup I'm testing uses ip6_tunnel.
I have tested a fix in the spirit of the patch and it looks valid 
for ip6_tunnel as well.

It looks though that this change would need to be added to any tunnel
device using dst_cache (vxlan, geneve, gre, ...).

> Also, perhaps it would be best to not bother with checking the
> device in question at all and unconditionally put device reference
> of all the dst_caches.  With setups that have e.g. 1k devices going
> down per second (ppp dialin and the like) doing the full search for
> every notify event would be rather expensive.
> 

I'm wondering - non-xfrm dsts are already correctly invalidated,
so do you think it makes sense to invalidate caches for devices that
have no xfrm dsts? or maybe I didn't understand the suggestion?

Eyal.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-06 19:19       ` Eyal Birger
@ 2018-02-11 15:46         ` Florian Westphal
  2018-02-12 11:54           ` Eyal Birger
  0 siblings, 1 reply; 12+ messages in thread
From: Florian Westphal @ 2018-02-11 15:46 UTC (permalink / raw)
  To: Eyal Birger
  Cc: Florian Westphal, Steffen Klassert, netdev, herbert, davem, shmulik

Eyal Birger <eyal.birger@gmail.com> wrote:

Sorry for taking so long to respond.

> On Tue, 6 Feb 2018 14:15:09 +0100
> Florian Westphal <fw@strlen.de> wrote:
> 
> > Steffen Klassert <steffen.klassert@secunet.com> wrote:
> > > I gave the patch a quick try, but still I get this:
> > > 
> > > unregister_netdevice: waiting for dummy1 to become free. Usage
> > > count = 2  
> > 
> > Was that with Eyals setup or the bridge one I posted?
> > 
> > If it was Eyals setup, its possible the patch missed hookup
> > to whatever tunnel infra is used (the setup I used has ipip tunnel,
> > everything is ipv4).
> > 
> 
> Thanks!
> 
> Indeed the setup I'm testing uses ip6_tunnel.
> I have tested a fix in the spirit of the patch and it looks valid 
> for ip6_tunnel as well.
>
> It looks though that this change would need to be added to any tunnel
> device using dst_cache (vxlan, geneve, gre, ...).

Yes.  Meanwhile I tested your patch and it works for me too.
As your patch is shorter and ipv4/ipv6 seem to take care of refcount
put just fine I think your patch is the right way to go.

The xfrm_dst size incrase isn't much of a big deal, there is ample of
padding at the end so it will still be allocated from same slab.

We could reduce num_pols and num_xfrms to u8, which creates a 16 bit
hole, then store the cpu number instead of a list pointer.

This would limit growth to 16 instead of 24.

But, as I said, i do not think its a big deal.

> I'm wondering - non-xfrm dsts are already correctly invalidated,
> so do you think it makes sense to invalidate caches for devices that
> have no xfrm dsts? or maybe I didn't understand the suggestion?

See above, I think your patch is the way to go.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: xfrm, ip tunnel: non released device reference upon device unregistration
  2018-02-11 15:46         ` Florian Westphal
@ 2018-02-12 11:54           ` Eyal Birger
  0 siblings, 0 replies; 12+ messages in thread
From: Eyal Birger @ 2018-02-12 11:54 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Steffen Klassert, netdev, herbert, davem, shmulik

On Sun, 11 Feb 2018 16:46:48 +0100
Florian Westphal <fw@strlen.de> wrote:

> Eyal Birger <eyal.birger@gmail.com> wrote:
> 
> Sorry for taking so long to respond.
> 
> > On Tue, 6 Feb 2018 14:15:09 +0100
> > Florian Westphal <fw@strlen.de> wrote:
> >   
> > > Steffen Klassert <steffen.klassert@secunet.com> wrote:  
> > > > I gave the patch a quick try, but still I get this:
> > > > 
> > > > unregister_netdevice: waiting for dummy1 to become free. Usage
> > > > count = 2    
> > > 
> > > Was that with Eyals setup or the bridge one I posted?
> > > 
> > > If it was Eyals setup, its possible the patch missed hookup
> > > to whatever tunnel infra is used (the setup I used has ipip
> > > tunnel, everything is ipv4).
> > >   
> > 
> > Thanks!
> > 
> > Indeed the setup I'm testing uses ip6_tunnel.
> > I have tested a fix in the spirit of the patch and it looks valid 
> > for ip6_tunnel as well.
> >
> > It looks though that this change would need to be added to any
> > tunnel device using dst_cache (vxlan, geneve, gre, ...).  
> 
> Yes.  Meanwhile I tested your patch and it works for me too.
> As your patch is shorter and ipv4/ipv6 seem to take care of refcount
> put just fine I think your patch is the right way to go.
> 
> The xfrm_dst size incrase isn't much of a big deal, there is ample of
> padding at the end so it will still be allocated from same slab.
> 
> We could reduce num_pols and num_xfrms to u8, which creates a 16 bit
> hole, then store the cpu number instead of a list pointer.
> 
> This would limit growth to 16 instead of 24.
> 
> But, as I said, i do not think its a big deal.
> 
> > I'm wondering - non-xfrm dsts are already correctly invalidated,
> > so do you think it makes sense to invalidate caches for devices that
> > have no xfrm dsts? or maybe I didn't understand the suggestion?  
> 
> See above, I think your patch is the way to go.

Ok, thanks. Will submit a formal patch.

Eyal.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2018-02-12 11:54 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-04 11:21 xfrm, ip tunnel: non released device reference upon device unregistration Eyal Birger
2018-02-04 13:32 ` Eyal Birger
2018-02-06  8:53 ` Steffen Klassert
2018-02-06 10:32   ` Florian Westphal
2018-02-06 10:42   ` Eyal Birger
2018-02-06 12:56 ` Florian Westphal
2018-02-06 13:09   ` Steffen Klassert
2018-02-06 13:15     ` Florian Westphal
2018-02-06 13:21       ` Steffen Klassert
2018-02-06 19:19       ` Eyal Birger
2018-02-11 15:46         ` Florian Westphal
2018-02-12 11:54           ` Eyal Birger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).