* [PATCH 0/4] caching bundles, iteration 3
@ 2010-04-01 12:52 Timo Teras
  2010-04-01 12:52 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
                   ` (4 more replies)
  0 siblings, 5 replies; 42+ messages in thread
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

Applies on top of the previous patches I sent.

Changes to previous iteration:
 - "flow: delayed deletion of flow cache entries" refactored to go
   after the other patches, per Herbert's request
 - fixed hlist searching in the above-mentioned patch
 - now uses ERR_CAST and similar helpers for better readability
 - added basic gc for per-socket bundles
 - some other minor cleanups

I'm now running this on a test box with my specific setup, and it
seems to be working pretty well. However, this changes quite a few
things, so detailed review is needed.

Timo Teras (4):
  flow: virtualize flow cache entry methods
  xfrm: cache bundles instead of policies for outgoing flows
  xfrm: remove policy garbage collection
  flow: delayed deletion of flow cache entries

 include/net/flow.h      |   18 +-
 include/net/xfrm.h      |   12 +-
 net/core/flow.c         |  203 +++++++-----
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  822 +++++++++++++++++++++++++----------------------
 6 files changed, 583 insertions(+), 525 deletions(-)


* [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-01 12:52 [PATCH 0/4] caching bundles, iteration 3 Timo Teras
@ 2010-04-01 12:52 ` Timo Teras
  2010-04-01 13:05   ` Eric Dumazet
                     ` (2 more replies)
  2010-04-01 12:52 ` [PATCH 2/4] xfrm: cache bundles instead of policies for outgoing flows Timo Teras
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 42+ messages in thread
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

This allows validating the cached object before returning it.
It also allows destroying the object properly if the last
reference was held in the flow cache. This is also a preparation
for caching bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed, since the
  getter function notices the dead policy and re-resolves the entry
- not having to call flow_cache_flush from the policy gc, since the
  flow cache now properly releases the object when dropping its
  reference

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
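A note for reviewers: the cache handle is the address of the
object's fc_ops member, and users recover the owning object with
container_of(). A minimal sketch of the pattern, using a made-up
example_obj user (illustrative only, not part of the diff):

	struct example_obj {
		atomic_t			refcnt;
		struct flow_cache_entry_ops	*fc_ops; /* -> static ops table */
	};

	static struct flow_cache_entry_ops **example_get_fce(
			struct flow_cache_entry_ops **ops)
	{
		struct example_obj *obj =
			container_of(ops, struct example_obj, fc_ops);

		atomic_inc(&obj->refcnt);  /* reference handed to the caller */
		return ops;                /* returning NULL forces re-resolution */
	}

	static struct flow_cache_entry_ops example_fc_ops __read_mostly = {
		.get	= example_get_fce,
		/* .check and .delete use the same container_of() pattern */
	};
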
 include/net/flow.h     |   18 ++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  118 ++++++++++++++++++++++++-----------------------
 net/xfrm/xfrm_policy.c |  113 ++++++++++++++++++++++++++++++---------------
 4 files changed, 151 insertions(+), 100 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..65d68a6 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,21 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
+struct flow_cache_entry_ops {
+	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
+	int (*check)(struct flow_cache_entry_ops **);
+	void (*delete)(struct flow_cache_entry_ops **);
+};
+
+typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);
+
+extern struct flow_cache_entry_ops **flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
+
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..cb8934b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@ struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_entry_ops *fc_ops;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..f938137 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,13 +26,12 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry *	next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_entry_ops **	ops;
 };
 
 struct flow_cache_percpu {
@@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->ops && !(*fle->ops)->check(fle->ops))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
-	if (fle->object)
-		atomic_dec(fle->object_ref);
+	if (fle->ops)
+		(*fle->ops)->delete(fle->ops);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_entry_ops **flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family, u8 dir,
+		flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_entry_ops **ops;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	ops = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,24 +198,14 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
 	if (!fle) {
@@ -215,37 +218,37 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
+			fle->ops = NULL;
 			memcpy(&fle->key, key, sizeof(*key));
-			fle->object = NULL;
+			fle->ops = NULL;
 			fcp->hash_count++;
 		}
+	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
+		ops = fle->ops;
+		if (!ops)
+			goto ret_ops;
+		ops = (*ops)->get(ops);
+		if (ops)
+			goto ret_ops;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
+	ops = resolver(net, key, family, dir, fle ? fle->ops : NULL, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (IS_ERR(ops)) {
+			fle->genid--;
+			fle->ops = NULL;
+		} else {
+			fle->ops = ops;
 		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	} else {
+		if (ops && !IS_ERR(ops))
+			(*ops)->delete(ops);
 	}
+ret_ops:
+	local_bh_enable();
+	return ops;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +264,12 @@ static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
-			fle->object = NULL;
-			atomic_dec(fle->object_ref);
+			if (fle->ops)
+				(*fle->ops)->delete(fle->ops);
+			fle->ops = NULL;
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..20f5b01 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,36 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_entry_ops **xfrm_policy_get_fce(
+		struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_policy *pol = container_of(ops, struct xfrm_policy, fc_ops);
+
+	if (unlikely(pol->walk.dead))
+		ops = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return ops;
+}
+
+static int xfrm_policy_check_fce(struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_policy *pol = container_of(ops, struct xfrm_policy, fc_ops);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_delete_fce(struct flow_cache_entry_ops **ops)
+{
+	xfrm_pol_put(container_of(ops, struct xfrm_policy, fc_ops));
+}
+
+static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
+	.get = xfrm_policy_get_fce,
+	.check = xfrm_policy_check_fce,
+	.delete = xfrm_policy_delete_fce,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +266,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->fc_ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +300,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +689,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +729,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +846,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +999,35 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_entry_ops **xfrm_policy_lookup(
+		struct net *net, struct flowi *fl, u16 family,
+		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_ops)
+		xfrm_pol_put(container_of(old_ops, struct xfrm_policy, fc_ops));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->fc_ops;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1117,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1602,24 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_entry_ops **ops;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		ops = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(ops);
+		if (IS_ERR(ops)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (ops)
+			policy = container_of(ops, struct xfrm_policy, fc_ops);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1969,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_entry_ops **ops;
+
+		ops = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR(ops))
+			pol = ERR_CAST(ops);
+		else if (ops)
+			pol = container_of(ops, struct xfrm_policy, fc_ops);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
1.6.3.3


* [PATCH 2/4] xfrm: cache bundles instead of policies for outgoing flows
  2010-04-01 12:52 [PATCH 0/4] caching bundles, iteration 3 Timo Teras
  2010-04-01 12:52 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
@ 2010-04-01 12:52 ` Timo Teras
  2010-04-01 12:52 ` [PATCH 3/4] xfrm: remove policy garbage collection Timo Teras
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 42+ messages in thread
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

__xfrm_lookup() is called for each packet transmitted out of the
system. xfrm_find_bundle() does a linear search, which can kill
system performance depending on how many bundles are required per
policy.

This modifies __xfrm_lookup() to store bundles directly in the
flow cache. If we do not get a hit, we just create a new bundle
instead of doing a slow search. This means that we can now get
multiple xfrm_dst's for the same flow (on a per-cpu basis).

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
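For reviewers, the outbound fast path after this change becomes,
roughly, the following (a condensed sketch of the __xfrm_lookup()
hunk below; error handling and the no-policy shortcut are omitted):

	struct flow_cache_entry_ops **ops;
	struct xfrm_dst *xdst;

	ops = flow_cache_lookup(net, fl, family, dir,
				xfrm_bundle_lookup, dst_orig);
	if (ops != NULL && !IS_ERR(ops)) {
		/* hit or freshly resolved: the cache entry embeds the
		 * bundle, and ->get() already took a dst reference */
		xdst = container_of(ops, struct xfrm_dst, fc_ops);
		/* xdst->u.dst is ready for transmission */
	}
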
 include/net/xfrm.h      |   10 +-
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  712 +++++++++++++++++++++++++----------------------
 4 files changed, 386 insertions(+), 389 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index cb8934b..6ce593f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -267,7 +267,6 @@ struct xfrm_policy_afinfo {
 					       xfrm_address_t *saddr,
 					       xfrm_address_t *daddr);
 	int			(*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr);
-	struct dst_entry	*(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy);
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
@@ -483,13 +482,13 @@ struct xfrm_policy {
 	struct timer_list	timer;
 
 	struct flow_cache_entry_ops *fc_ops;
+	atomic_t		genid;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
 	struct xfrm_selector	selector;
 	struct xfrm_lifetime_cfg lft;
 	struct xfrm_lifetime_cur curlft;
-	struct dst_entry       *bundles;
 	struct xfrm_policy_walk_entry walk;
 	u8			type;
 	u8			action;
@@ -879,11 +878,15 @@ struct xfrm_dst {
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+	struct flow_cache_entry_ops *fc_ops;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	int num_pols, num_xfrms;
 #ifdef CONFIG_XFRM_SUB_POLICY
 	struct flowi *origin;
 	struct xfrm_selector *partner;
 #endif
-	u32 genid;
+	u32 xfrm_genid;
+	u32 policy_genid;
 	u32 route_mtu_cached;
 	u32 child_mtu_cached;
 	u32 route_cookie;
@@ -893,6 +896,7 @@ struct xfrm_dst {
 #ifdef CONFIG_XFRM
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 {
+	xfrm_pols_put(xdst->pols, xdst->num_pols);
 	dst_release(xdst->route);
 	if (likely(xdst->u.dst.xfrm))
 		xfrm_state_put(xdst->u.dst.xfrm);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e4a1483..1705476 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -59,27 +59,6 @@ static int xfrm4_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-		if (xdst->u.rt.fl.oif == fl->oif &&	/*XXX*/
-		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
-		    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
-		    xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm4_get_tos(struct flowi *fl)
 {
 	return fl->fl4_tos;
@@ -259,7 +238,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
-	.find_bundle = 		__xfrm4_find_bundle,
 	.decode_session =	_decode_session4,
 	.get_tos =		xfrm4_get_tos,
 	.init_path =		xfrm4_init_path,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ae18165..8c452fd 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -67,36 +67,6 @@ static int xfrm6_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	/* Still not clear if we should set fl->fl6_{src,dst}... */
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
-		struct in6_addr fl_dst_prefix, fl_src_prefix;
-
-		ipv6_addr_prefix(&fl_dst_prefix,
-				 &fl->fl6_dst,
-				 xdst->u.rt6.rt6i_dst.plen);
-		ipv6_addr_prefix(&fl_src_prefix,
-				 &fl->fl6_src,
-				 xdst->u.rt6.rt6i_src.plen);
-		if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
-		    ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET6,
-				   (xdst->u.rt6.rt6i_dst.plen != 128 ||
-				    xdst->u.rt6.rt6i_src.plen != 128))) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm6_get_tos(struct flowi *fl)
 {
 	return 0;
@@ -291,7 +261,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.get_saddr = 		xfrm6_get_saddr,
-	.find_bundle =		__xfrm6_find_bundle,
 	.decode_session =	_decode_session6,
 	.get_tos =		xfrm6_get_tos,
 	.init_path =		xfrm6_init_path,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 20f5b01..3489e2d 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,6 +37,8 @@
 DEFINE_MUTEX(xfrm_cfg_mutex);
 EXPORT_SYMBOL(xfrm_cfg_mutex);
 
+static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
+static struct dst_entry *xfrm_policy_sk_bundles;
 static DEFINE_RWLOCK(xfrm_policy_lock);
 
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
@@ -50,6 +52,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
+static int stale_bundle(struct dst_entry *dst);
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
@@ -278,8 +281,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 {
 	BUG_ON(!policy->walk.dead);
 
-	BUG_ON(policy->bundles);
-
 	if (del_timer(&policy->timer))
 		BUG();
 
@@ -290,12 +291,7 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
 
 static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 {
-	struct dst_entry *dst;
-
-	while ((dst = policy->bundles) != NULL) {
-		policy->bundles = dst->next;
-		dst_free(dst);
-	}
+	atomic_inc(&policy->genid);
 
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
@@ -573,7 +569,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	struct xfrm_policy *delpol;
 	struct hlist_head *chain;
 	struct hlist_node *entry, *newpos;
-	struct dst_entry *gc_list;
 	u32 mark = policy->mark.v & policy->mark.m;
 
 	write_lock_bh(&xfrm_policy_lock);
@@ -624,33 +619,11 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		schedule_work(&net->xfrm.policy_hash_work);
 
 	read_lock_bh(&xfrm_policy_lock);
-	gc_list = NULL;
 	entry = &policy->bydst;
-	hlist_for_each_entry_continue(policy, entry, bydst) {
-		struct dst_entry *dst;
-
-		write_lock(&policy->lock);
-		dst = policy->bundles;
-		if (dst) {
-			struct dst_entry *tail = dst;
-			while (tail->next)
-				tail = tail->next;
-			tail->next = gc_list;
-			gc_list = dst;
-
-			policy->bundles = NULL;
-		}
-		write_unlock(&policy->lock);
-	}
+	hlist_for_each_entry_continue(policy, entry, bydst)
+		atomic_inc(&policy->genid);
 	read_unlock_bh(&xfrm_policy_lock);
 
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-
-		gc_list = dst->next;
-		dst_free(dst);
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
@@ -999,6 +972,20 @@ fail:
 	return ret;
 }
 
+static struct xfrm_policy *__xfrm_policy_lookup(
+		struct net *net, struct flowi *fl,
+		u16 family, u8 dir)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_policy *pol;
+
+	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+	if (pol != NULL)
+		return pol;
+#endif
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+}
+
 static struct flow_cache_entry_ops **xfrm_policy_lookup(
 		struct net *net, struct flowi *fl, u16 family,
 		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx)
@@ -1008,21 +995,10 @@ static struct flow_cache_entry_ops **xfrm_policy_lookup(
 	if (old_ops)
 		xfrm_pol_put(container_of(old_ops, struct xfrm_policy, fc_ops));
 
-#ifdef CONFIG_XFRM_SUB_POLICY
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol))
+	pol = __xfrm_policy_lookup(net, fl, family, dir);
+	if (IS_ERR_OR_NULL(pol))
 		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-#endif
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol))
-		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-	return NULL;
 
-found:
 	/* Resolver returns two references:
 	 * one for cache and one for caller of flow_cache_lookup() */
 	xfrm_pol_hold(pol);
@@ -1314,18 +1290,6 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
  * still valid.
  */
 
-static struct dst_entry *
-xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
-{
-	struct dst_entry *x;
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return ERR_PTR(-EINVAL);
-	x = afinfo->find_bundle(fl, policy);
-	xfrm_policy_put_afinfo(afinfo);
-	return x;
-}
-
 static inline int xfrm_get_tos(struct flowi *fl, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1341,6 +1305,55 @@ static inline int xfrm_get_tos(struct flowi *fl, int family)
 	return tos;
 }
 
+static struct flow_cache_entry_ops **xfrm_bundle_get_fce(
+		struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (xdst->route == NULL) {
+		/* Dummy bundle - if it has xfrms we were not
+		 * able to build bundle as template resolution failed.
+		 * It means we need to try again resolving. */
+		if (xdst->num_xfrms > 0)
+			return NULL;
+	} else {
+		/* Real bundle */
+		if (stale_bundle(dst))
+			return NULL;
+	}
+
+	dst_hold(dst);
+	return ops;
+}
+
+static int xfrm_bundle_check_fce(struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (!xdst->route)
+		return 0;
+	if (stale_bundle(dst))
+		return 0;
+
+	return 1;
+}
+
+static void xfrm_bundle_delete_fce(struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	dst_free(dst);
+}
+
+static struct flow_cache_entry_ops xfrm_bundle_fc_ops __read_mostly = {
+	.get = xfrm_bundle_get_fce,
+	.check = xfrm_bundle_check_fce,
+	.delete = xfrm_bundle_delete_fce,
+};
+
 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1363,9 +1376,10 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 		BUG();
 	}
 	xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS);
-
 	xfrm_policy_put_afinfo(afinfo);
 
+	xdst->fc_ops = &xfrm_bundle_fc_ops;
+
 	return xdst;
 }
 
@@ -1403,6 +1417,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	return err;
 }
 
+
 /* Allocate chain of dst_entry's, attach known xfrm's, calculate
  * all the metrics... Shortly, bundle a bundle.
  */
@@ -1466,7 +1481,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			dst_hold(dst);
 
 		dst1->xfrm = xfrm[i];
-		xdst->genid = xfrm[i]->genid;
+		xdst->xfrm_genid = xfrm[i]->genid;
 
 		dst1->obsolete = -1;
 		dst1->flags |= DST_HOST;
@@ -1559,7 +1574,185 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
 #endif
 }
 
-static int stale_bundle(struct dst_entry *dst);
+static int xfrm_expand_policies(struct flowi *fl, u16 family,
+				struct xfrm_policy **pols,
+				int *num_pols, int *num_xfrms)
+{
+	int i;
+
+	if (*num_pols == 0 || !pols[0]) {
+		*num_pols = 0;
+		*num_xfrms = 0;
+		return 0;
+	}
+	if (IS_ERR(pols[0]))
+		return PTR_ERR(pols[0]);
+
+	*num_xfrms = pols[0]->xfrm_nr;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
+	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
+						    XFRM_POLICY_TYPE_MAIN,
+						    fl, family,
+						    XFRM_POLICY_OUT);
+		if (pols[1]) {
+			if (IS_ERR(pols[1])) {
+				xfrm_pols_put(pols, *num_pols);
+				return PTR_ERR(pols[1]);
+			}
+			(*num_pols) ++;
+			(*num_xfrms) += pols[1]->xfrm_nr;
+		}
+	}
+#endif
+	for (i = 0; i < *num_pols; i++) {
+		if (pols[i]->action != XFRM_POLICY_ALLOW) {
+			*num_xfrms = -1;
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+static struct xfrm_dst *xfrm_resolve_and_create_bundle(
+		struct xfrm_policy **pols, int num_pols,
+		struct flowi *fl, u16 family, struct dst_entry *dst_orig)
+{
+	struct net *net = xp_net(pols[0]);
+	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct dst_entry *dst;
+	struct xfrm_dst *xdst;
+	int err;
+
+	/* Try to instantiate a bundle */
+	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
+	if (err < 0) {
+		if (err != -EAGAIN)
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		return ERR_PTR(err);
+	}
+
+	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+	if (IS_ERR(dst)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+		return ERR_CAST(dst);
+	}
+
+	xdst = (struct xfrm_dst *)dst;
+	xdst->num_xfrms = err;
+	if (num_pols > 1)
+		err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+	else
+		err = xfrm_dst_update_origin(dst, fl);
+	if (unlikely(err)) {
+		dst_free(dst);
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+		return ERR_PTR(err);
+	}
+
+	xdst->num_pols = num_pols;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+	xdst->policy_genid = atomic_read(&pols[0]->genid);
+
+	return xdst;
+}
+
+static struct flow_cache_entry_ops **xfrm_bundle_lookup(
+		struct net *net, struct flowi *fl, u16 family, u8 dir,
+		struct flow_cache_entry_ops **old_ops, void *ctx)
+{
+	struct dst_entry *dst_orig = (struct dst_entry *)ctx;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	struct xfrm_dst *xdst, *new_xdst;
+	int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
+
+	/* Check if the policies from old bundle are usable */
+	xdst = NULL;
+	if (old_ops) {
+		xdst = container_of(old_ops, struct xfrm_dst, fc_ops);
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		pol_dead = 0;
+		for (i = 0; i < num_pols; i++) {
+			pols[i] = xdst->pols[i];
+			pol_dead |= pols[i]->walk.dead;
+		}
+		if (pol_dead) {
+			dst_free(&xdst->u.dst);
+			xdst = NULL;
+			num_pols = 0;
+			num_xfrms = 0;
+			old_ops = NULL;
+		}
+	}
+
+	/* Resolve policies to use if we couldn't get them from
+	 * previous cache entry */
+	if (xdst == NULL) {
+		num_pols = 1;
+		pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
+			goto inc_error;
+		if (num_pols == 0)
+			return NULL;
+		if (num_xfrms <= 0)
+			goto make_dummy_bundle;
+	}
+
+	new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig);
+	if (IS_ERR(new_xdst)) {
+		err = PTR_ERR(new_xdst);
+		if (err != -EAGAIN)
+			goto error;
+		if (old_ops == NULL)
+			goto make_dummy_bundle;
+		dst_hold(&xdst->u.dst);
+		return old_ops;
+	}
+
+	/* Kill the previous bundle */
+	if (xdst) {
+		/* The policies were stolen for newly generated bundle */
+		xdst->num_pols = 0;
+		dst_free(&xdst->u.dst);
+	}
+
+	/* Flow cache does not have reference, it dst_free()'s,
+	 * but we do need to return one reference for original caller */
+	dst_hold(&new_xdst->u.dst);
+	return &new_xdst->fc_ops;
+
+make_dummy_bundle:
+	/* We found policies, but there's no bundles to instantiate:
+	 * either because the policy blocks, has no transformations or
+	 * we could not build template (no xfrm_states).*/
+	xdst = xfrm_alloc_dst(net, family);
+	if (IS_ERR(xdst)) {
+		xfrm_pols_put(pols, num_pols);
+		return ERR_CAST(xdst);
+	}
+	xdst->num_pols = num_pols;
+	xdst->num_xfrms = num_xfrms;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+
+	dst_hold(&xdst->u.dst);
+	return &xdst->fc_ops;
+
+inc_error:
+	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+error:
+	if (xdst != NULL)
+		dst_free(&xdst->u.dst);
+	else
+		xfrm_pols_put(pols, num_pols);
+	return ERR_PTR(err);
+}
 
 /* Main function: finds/creates a bundle for given flow.
  *
@@ -1569,248 +1762,152 @@ static int stale_bundle(struct dst_entry *dst);
 int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 		  struct sock *sk, int flags)
 {
-	struct xfrm_policy *policy;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-	int npols;
-	int pol_dead;
-	int xfrm_nr;
-	int pi;
-	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
-	struct dst_entry *dst, *dst_orig = *dst_p;
-	int nx = 0;
-	int err;
-	u32 genid;
-	u16 family;
+	struct flow_cache_entry_ops **ops;
+	struct xfrm_dst *xdst;
+	struct dst_entry *dst, *dst_orig = *dst_p, *route;
+	u16 family = dst_orig->ops->family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	int i, err, num_pols, num_xfrms, drop_pols = 0;
 
 restart:
-	genid = atomic_read(&flow_cache_genid);
-	policy = NULL;
-	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
-		pols[pi] = NULL;
-	npols = 0;
-	pol_dead = 0;
-	xfrm_nr = 0;
+	dst = NULL;
+	xdst = NULL;
+	route = NULL;
 
 	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
-		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		num_pols = 1;
+		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
 			goto dropdst;
+
+		if (num_pols) {
+			if (num_xfrms <= 0) {
+				drop_pols = num_pols;
+				goto no_transform;
+			}
+
+			xdst = xfrm_resolve_and_create_bundle(
+					pols, num_pols, fl,
+					family, dst_orig);
+			if (IS_ERR(xdst)) {
+				xfrm_pols_put(pols, num_pols);
+				err = PTR_ERR(xdst);
+				goto dropdst;
+			}
+
+			spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+			xdst->u.dst.next = xfrm_policy_sk_bundles;
+			xfrm_policy_sk_bundles = &xdst->u.dst;
+			spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+
+			route = xdst->route;
 		}
 	}
 
-	if (!policy) {
-		struct flow_cache_entry_ops **ops;
-
+	if (xdst == NULL) {
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		ops = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					dir, xfrm_policy_lookup, NULL);
-		err = PTR_ERR(ops);
+		ops = flow_cache_lookup(net, fl, family, dir,
+					xfrm_bundle_lookup, dst_orig);
+		if (ops == NULL)
+			goto nopol;
 		if (IS_ERR(ops)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+			err = PTR_ERR(ops);
 			goto dropdst;
 		}
-		if (ops)
-			policy = container_of(ops, struct xfrm_policy, fc_ops);
-		else
-			policy = NULL;
+		xdst = container_of(ops, struct xfrm_dst, fc_ops);
+
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols);
+		route = xdst->route;
+	}
+
+	dst = &xdst->u.dst;
+	if (route == NULL && num_xfrms > 0) {
+		/* The only case when xfrm_bundle_lookup() returns a
+		 * bundle with null route, is when the template could
+		 * not be resolved. It means policies are there, but
+		 * bundle could not be created, since we don't yet
+		 * have the xfrm_state's. We need to wait for KM to
+		 * negotiate new SA's or bail out with error.*/
+		if (net->xfrm.sysctl_larval_drop) {
+			/* EREMOTE tells the caller to generate
+			 * a one-shot blackhole route. */
+			dst_release(dst);
+			xfrm_pols_put(pols, num_pols);
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+			return -EREMOTE;
+		}
+		if (flags & XFRM_LOOKUP_WAIT) {
+			DECLARE_WAITQUEUE(wait, current);
+
+			add_wait_queue(&net->xfrm.km_waitq, &wait);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&net->xfrm.km_waitq, &wait);
+
+			if (!signal_pending(current)) {
+				dst_release(dst);
+				goto restart;
+			}
+
+			err = -ERESTART;
+		} else
+			err = -EAGAIN;
+
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+		goto error;
 	}
 
-	if (!policy)
+no_transform:
+	if (num_pols == 0)
 		goto nopol;
 
-	family = dst_orig->ops->family;
-	pols[0] = policy;
-	npols ++;
-	xfrm_nr += pols[0]->xfrm_nr;
-
-	err = -ENOENT;
-	if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP))
+	if ((flags & XFRM_LOOKUP_ICMP) &&
+	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
+		err = -ENOENT;
 		goto error;
+	}
 
-	policy->curlft.use_time = get_seconds();
+	for (i = 0; i < num_pols; i++)
+		pols[i]->curlft.use_time = get_seconds();
 
-	switch (policy->action) {
-	default:
-	case XFRM_POLICY_BLOCK:
+	if (num_xfrms < 0) {
 		/* Prohibit the flow */
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
 		err = -EPERM;
 		goto error;
-
-	case XFRM_POLICY_ALLOW:
-#ifndef CONFIG_XFRM_SUB_POLICY
-		if (policy->xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pol_put(policy);
-			return 0;
-		}
-#endif
-
-		/* Try to find matching bundle.
-		 *
-		 * LATER: help from flow cache. It is optional, this
-		 * is required only for output policy.
-		 */
-		dst = xfrm_find_bundle(fl, policy, family);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = PTR_ERR(dst);
-			goto error;
-		}
-
-		if (dst)
-			break;
-
-#ifdef CONFIG_XFRM_SUB_POLICY
-		if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
-			pols[1] = xfrm_policy_lookup_bytype(net,
-							    XFRM_POLICY_TYPE_MAIN,
-							    fl, family,
-							    XFRM_POLICY_OUT);
-			if (pols[1]) {
-				if (IS_ERR(pols[1])) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
-					err = PTR_ERR(pols[1]);
-					goto error;
-				}
-				if (pols[1]->action == XFRM_POLICY_BLOCK) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
-					err = -EPERM;
-					goto error;
-				}
-				npols ++;
-				xfrm_nr += pols[1]->xfrm_nr;
-			}
-		}
-
-		/*
-		 * Because neither flowi nor bundle information knows about
-		 * transformation template size. On more than one policy usage
-		 * we can realize whether all of them is bypass or not after
-		 * they are searched. See above not-transformed bypass
-		 * is surrounded by non-sub policy configuration, too.
-		 */
-		if (xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-#endif
-		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-		if (unlikely(nx<0)) {
-			err = nx;
-			if (err == -EAGAIN && net->xfrm.sysctl_larval_drop) {
-				/* EREMOTE tells the caller to generate
-				 * a one-shot blackhole route.
-				 */
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				xfrm_pol_put(policy);
-				return -EREMOTE;
-			}
-			if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) {
-				DECLARE_WAITQUEUE(wait, current);
-
-				add_wait_queue(&net->xfrm.km_waitq, &wait);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule();
-				set_current_state(TASK_RUNNING);
-				remove_wait_queue(&net->xfrm.km_waitq, &wait);
-
-				nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-				if (nx == -EAGAIN && signal_pending(current)) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-					err = -ERESTART;
-					goto error;
-				}
-				if (nx == -EAGAIN ||
-				    genid != atomic_read(&flow_cache_genid)) {
-					xfrm_pols_put(pols, npols);
-					goto restart;
-				}
-				err = nx;
-			}
-			if (err < 0) {
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				goto error;
-			}
-		}
-		if (nx == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-		dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig);
-		err = PTR_ERR(dst);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
-			goto error;
-		}
-
-		for (pi = 0; pi < npols; pi++)
-			pol_dead |= pols[pi]->walk.dead;
-
-		write_lock_bh(&policy->lock);
-		if (unlikely(pol_dead || stale_bundle(dst))) {
-			/* Wow! While we worked on resolving, this
-			 * policy has gone. Retry. It is not paranoia,
-			 * we just cannot enlist new bundle to dead object.
-			 * We can't enlist stable bundles either.
-			 */
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-
-			if (pol_dead)
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLDEAD);
-			else
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = -EHOSTUNREACH;
-			goto error;
-		}
-
-		if (npols > 1)
-			err = xfrm_dst_update_parent(dst, &pols[1]->selector);
-		else
-			err = xfrm_dst_update_origin(dst, fl);
-		if (unlikely(err)) {
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			goto error;
-		}
-
-		dst->next = policy->bundles;
-		policy->bundles = dst;
-		dst_hold(dst);
-		write_unlock_bh(&policy->lock);
+	} else if (num_xfrms > 0) {
+		/* Flow transformed */
+		*dst_p = dst;
+		dst_release(dst_orig);
+	} else {
+		/* Flow passes untransformed */
+		dst_release(dst);
 	}
-	*dst_p = dst;
-	dst_release(dst_orig);
-	xfrm_pols_put(pols, npols);
+ok:
+	xfrm_pols_put(pols, drop_pols);
 	return 0;
 
+nopol:
+	if (!(flags & XFRM_LOOKUP_ICMP))
+		goto ok;
+	err = -ENOENT;
 error:
-	xfrm_pols_put(pols, npols);
+	dst_release(dst);
 dropdst:
 	dst_release(dst_orig);
 	*dst_p = NULL;
+	xfrm_pols_put(pols, drop_pols);
 	return err;
-
-nopol:
-	err = -ENOENT;
-	if (flags & XFRM_LOOKUP_ICMP)
-		goto dropdst;
-	return 0;
 }
 EXPORT_SYMBOL(__xfrm_lookup);
 
@@ -2162,71 +2259,24 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
-static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
-{
-	struct dst_entry *dst, **dstp;
-
-	write_lock(&pol->lock);
-	dstp = &pol->bundles;
-	while ((dst=*dstp) != NULL) {
-		if (func(dst)) {
-			*dstp = dst->next;
-			dst->next = *gc_list_p;
-			*gc_list_p = dst;
-		} else {
-			dstp = &dst->next;
-		}
-	}
-	write_unlock(&pol->lock);
-}
-
-static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *))
+static void __xfrm_garbage_collect(struct net *net)
 {
-	struct dst_entry *gc_list = NULL;
-	int dir;
+	struct dst_entry *head, *next;
 
-	read_lock_bh(&xfrm_policy_lock);
-	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
-		struct xfrm_policy *pol;
-		struct hlist_node *entry;
-		struct hlist_head *table;
-		int i;
+	flow_cache_flush();
 
-		hlist_for_each_entry(pol, entry,
-				     &net->xfrm.policy_inexact[dir], bydst)
-			prune_one_bundle(pol, func, &gc_list);
+	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+	head = xfrm_policy_sk_bundles;
+	xfrm_policy_sk_bundles = NULL;
+	spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
 
-		table = net->xfrm.policy_bydst[dir].table;
-		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
-			hlist_for_each_entry(pol, entry, table + i, bydst)
-				prune_one_bundle(pol, func, &gc_list);
-		}
-	}
-	read_unlock_bh(&xfrm_policy_lock);
-
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-		gc_list = dst->next;
-		dst_free(dst);
+	while (head) {
+		next = head->next;
+		dst_free(head);
+		head = next;
 	}
 }
 
-static int unused_bundle(struct dst_entry *dst)
-{
-	return !atomic_read(&dst->__refcnt);
-}
-
-static void __xfrm_garbage_collect(struct net *net)
-{
-	xfrm_prune_bundles(net, unused_bundle);
-}
-
-static int xfrm_flush_bundles(struct net *net)
-{
-	xfrm_prune_bundles(net, stale_bundle);
-	return 0;
-}
-
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
@@ -2284,7 +2334,9 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 			return 0;
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
-		if (xdst->genid != dst->xfrm->genid)
+		if (xdst->xfrm_genid != dst->xfrm->genid)
+			return 0;
+		if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
 		if (strict && fl &&
@@ -2445,11 +2497,9 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 
 static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
-	struct net_device *dev = ptr;
-
 	switch (event) {
 	case NETDEV_DOWN:
-		xfrm_flush_bundles(dev_net(dev));
+		flow_cache_flush();
 	}
 	return NOTIFY_DONE;
 }
@@ -2781,7 +2831,6 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       struct xfrm_migrate *m, int num_migrate)
 {
 	struct xfrm_migrate *mp;
-	struct dst_entry *dst;
 	int i, j, n = 0;
 
 	write_lock_bh(&pol->lock);
@@ -2806,10 +2855,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       sizeof(pol->xfrm_vec[i].saddr));
 			pol->xfrm_vec[i].encap_family = mp->new_family;
 			/* flush bundles */
-			while ((dst = pol->bundles) != NULL) {
-				pol->bundles = dst->next;
-				dst_free(dst);
-			}
+			atomic_inc(&pol->genid);
 		}
 	}
 
-- 
1.6.3.3


* [PATCH 3/4] xfrm: remove policy garbage collection
  2010-04-01 12:52 [PATCH 0/4] caching bundles, iteration 3 Timo Teras
  2010-04-01 12:52 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
  2010-04-01 12:52 ` [PATCH 2/4] xfrm: cache bundles instead of policies for outgoing flows Timo Teras
@ 2010-04-01 12:52 ` Timo Teras
  2010-04-01 12:52 ` [PATCH 4/4] flow: delayed deletion of flow cache entries Timo Teras
  2010-04-02  3:00 ` [PATCH 0/4] caching bundles, iteration 3 David Miller
  4 siblings, 0 replies; 42+ messages in thread
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

Policies are now properly reference counted and destroyed from
all code paths. The delayed gc is just overhead now and can be
removed.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
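For context, the refcounting that makes the deferred gc
unnecessary is the plain put helper (as defined in
include/net/xfrm.h; shown here for reference only): the final put
destroys the policy directly, from whatever context drops the last
reference.

	static inline void xfrm_pol_put(struct xfrm_policy *policy)
	{
		if (atomic_dec_and_test(&policy->refcnt))
			xfrm_policy_destroy(policy);
	}
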
 net/xfrm/xfrm_policy.c |   39 +++++----------------------------------
 1 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 3489e2d..0a80978 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,9 +46,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 
-static HLIST_HEAD(xfrm_policy_gc_list);
-static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
-
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
@@ -289,32 +286,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 }
 EXPORT_SYMBOL(xfrm_policy_destroy);
 
-static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
-{
-	atomic_inc(&policy->genid);
-
-	if (del_timer(&policy->timer))
-		atomic_dec(&policy->refcnt);
-
-	xfrm_pol_put(policy);
-}
-
-static void xfrm_policy_gc_task(struct work_struct *work)
-{
-	struct xfrm_policy *policy;
-	struct hlist_node *entry, *tmp;
-	struct hlist_head gc_list;
-
-	spin_lock_bh(&xfrm_policy_gc_lock);
-	gc_list.first = xfrm_policy_gc_list.first;
-	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
-	spin_unlock_bh(&xfrm_policy_gc_lock);
-
-	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
-		xfrm_policy_gc_kill(policy);
-}
-static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
-
 /* Rule must be locked. Release descentant resources, announce
  * entry dead. The rule must be unlinked from lists to the moment.
  */
@@ -323,11 +294,12 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 {
 	policy->walk.dead = 1;
 
-	spin_lock_bh(&xfrm_policy_gc_lock);
-	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
-	spin_unlock_bh(&xfrm_policy_gc_lock);
+	atomic_inc(&policy->genid);
 
-	schedule_work(&xfrm_policy_gc_work);
+	if (del_timer(&policy->timer))
+		xfrm_pol_put(policy);
+
+	xfrm_pol_put(policy);
 }
 
 static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
@@ -2605,7 +2577,6 @@ static void xfrm_policy_fini(struct net *net)
 	audit_info.sessionid = -1;
 	audit_info.secid = 0;
 	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
-	flush_work(&xfrm_policy_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.policy_all));
 
-- 
1.6.3.3


* [PATCH 4/4] flow: delayed deletion of flow cache entries
  2010-04-01 12:52 [PATCH 0/4] caching bundles, iteration 3 Timo Teras
                   ` (2 preceding siblings ...)
  2010-04-01 12:52 ` [PATCH 3/4] xfrm: remove policy garbage collection Timo Teras
@ 2010-04-01 12:52 ` Timo Teras
  2010-04-02  3:00 ` [PATCH 0/4] caching bundles, iteration 3 David Miller
  4 siblings, 0 replies; 42+ messages in thread
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

Speed up lookups by freeing flow cache entries later. After
virtualizing the flow cache entry operations, the flow cache may
end up calling a policy or bundle destructor, which can be slow.

As the gc_list is more efficient with a doubly linked list, the
flow cache is converted to use the common hlist and list macros
where appropriate.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
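A note on the union introduced below: an entry is linked on
exactly one list at any time (a hash chain while it is cached, the
gc list once unhooked), so the two link types can safely share
storage:

	struct flow_cache_entry {
		union {
			struct hlist_node	hlist;   /* while in the hash */
			struct list_head	gc_list; /* while awaiting free */
		} u;
		/* ... family, dir, genid, key, ops ... */
	};
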
 net/core/flow.c |  101 ++++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 70 insertions(+), 31 deletions(-)

diff --git a/net/core/flow.c b/net/core/flow.c
index f938137..d3a8f80 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,7 +26,10 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry *	next;
+	union {
+		struct hlist_node	hlist;
+		struct list_head	gc_list;
+	} u;
 	u16				family;
 	u8				dir;
 	u32				genid;
@@ -35,7 +38,7 @@ struct flow_cache_entry {
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry **	hash_table;
+	struct hlist_head *		hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -62,6 +65,9 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0);
 static struct flow_cache flow_cache_global;
 static struct kmem_cache *flow_cachep;
 
+static DEFINE_SPINLOCK(flow_cache_gc_lock);
+static LIST_HEAD(flow_cache_gc_list);
+
 #define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
 #define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
@@ -86,38 +92,66 @@ static int flow_entry_valid(struct flow_cache_entry *fle)
 	return 1;
 }
 
-static void flow_entry_kill(struct flow_cache *fc,
-			    struct flow_cache_percpu *fcp,
-			    struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle)
 {
 	if (fle->ops)
 		(*fle->ops)->delete(fle->ops);
 	kmem_cache_free(flow_cachep, fle);
-	fcp->hash_count--;
+}
+
+static void flow_cache_gc_task(struct work_struct *work)
+{
+	struct list_head gc_list;
+	struct flow_cache_entry *fce, *n;
+
+	INIT_LIST_HEAD(&gc_list);
+	spin_lock_bh(&flow_cache_gc_lock);
+	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
+	spin_unlock_bh(&flow_cache_gc_lock);
+
+	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+		flow_entry_kill(fce);
+}
+static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
+
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+				     int deleted, struct list_head *gc_list)
+{
+	if (deleted) {
+		fcp->hash_count -= deleted;
+		spin_lock_bh(&flow_cache_gc_lock);
+		list_splice_tail(gc_list, &flow_cache_gc_list);
+		spin_unlock_bh(&flow_cache_gc_lock);
+		schedule_work(&flow_cache_gc_work);
+	}
 }
 
 static void __flow_cache_shrink(struct flow_cache *fc,
 				struct flow_cache_percpu *fcp,
 				int shrink_to)
 {
-	struct flow_cache_entry *fle, **flp;
-	int i;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int saved = 0;
 
-		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
 			if (saved < shrink_to &&
 			    flow_entry_valid(fle)) {
 				saved++;
-				flp = &fle->next;
 			} else {
-				*flp = fle->next;
-				flow_entry_kill(fc, fcp, fle);
+				deleted++;
+				hlist_del(&fle->u.hlist);
+				list_add_tail(&fle->u.gc_list, &gc_list);
 			}
 		}
 	}
+
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
 }
 
 static void flow_cache_shrink(struct flow_cache *fc,
@@ -182,8 +216,9 @@ struct flow_cache_entry_ops **flow_cache_lookup(
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
-	struct flow_cache_entry *fle, **head;
 	struct flow_cache_entry_ops **ops;
+	struct flow_cache_entry *fle, *tfle;
+	struct hlist_node *entry;
 	unsigned int hash;
 
 	local_bh_disable();
@@ -200,12 +235,13 @@ struct flow_cache_entry_ops **flow_cache_lookup(
 		flow_new_hash_rnd(fc, fcp);
 
 	hash = flow_hash_code(fc, fcp, key);
-	head = &fcp->hash_table[hash];
-	for (fle = *head; fle; fle = fle->next) {
-		if (fle->family == family &&
-		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0)
+	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
+		if (tfle->family == family &&
+		    tfle->dir == dir &&
+		    flow_key_compare(key, &tfle->key) == 0) {
+			fle = tfle;
 			break;
+		}
 	}
 
 	if (!fle) {
@@ -214,13 +250,13 @@ struct flow_cache_entry_ops **flow_cache_lookup(
 
 		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
-			fle->next = *head;
-			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
 			fle->ops = NULL;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->ops = NULL;
+
+			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
 			fcp->hash_count++;
 		}
 	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
@@ -256,23 +292,26 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	struct flow_flush_info *info = (void *)data;
 	struct flow_cache *fc = info->cache;
 	struct flow_cache_percpu *fcp;
-	int i;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		struct flow_cache_entry *fle;
-
-		fle = fcp->hash_table[i];
-		for (; fle; fle = fle->next) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
 			if (flow_entry_valid(fle))
 				continue;
 
-			if (fle->ops)
-				(*fle->ops)->delete(fle->ops);
-			fle->ops = NULL;
+			deleted++;
+			hlist_del(&fle->u.hlist);
+			list_add_tail(&fle->u.gc_list, &gc_list);
 		}
 	}
 
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
 }
@@ -314,7 +353,7 @@ void flow_cache_flush(void)
 static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
 					  struct flow_cache_percpu *fcp)
 {
-	fcp->hash_table = (struct flow_cache_entry **)
+	fcp->hash_table = (struct hlist_head *)
 		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
 	if (!fcp->hash_table)
 		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
@@ -348,7 +387,7 @@ static int flow_cache_init(struct flow_cache *fc)
 
 	for (order = 0;
 	     (PAGE_SIZE << order) <
-		     (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc));
+		     (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
 	     order++)
 		/* NOTHING */;
 	fc->order = order;
-- 
1.6.3.3


* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-01 12:52 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
@ 2010-04-01 13:05   ` Eric Dumazet
  2010-04-01 13:07     ` Timo Teräs
  2010-04-03  3:38   ` Herbert Xu
  2010-04-04 10:42   ` Herbert Xu
  2 siblings, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2010-04-01 13:05 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev, Herbert Xu

On Thursday, 1 April 2010 at 15:52 +0300, Timo Teras wrote:
> This allows validating the cached object before returning it.
> It also allows destroying the object properly if the last
> reference was held in the flow cache. This is also a preparation
> for caching bundles in the flow cache.
> 
> In return for virtualizing the methods, we save on:
> - not having to regenerate the whole flow cache on policy removal:
>   each flow matching a killed policy gets refreshed, since the
>   getter function notices the dead policy and re-resolves the entry
> - not having to call flow_cache_flush from the policy gc, since the
>   flow cache now properly releases the object when dropping its
>   reference
> 
> Signed-off-by: Timo Teras <timo.teras@iki.fi>
> ---

...

> @@ -481,6 +482,7 @@ struct xfrm_policy {
>  	atomic_t		refcnt;
>  	struct timer_list	timer;
>  
> +	struct flow_cache_entry_ops *fc_ops;
>  	u32			priority;
>  	u32			index;
>  	struct xfrm_mark	mark;
...

> +static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
> +	.get = xfrm_policy_get_fce,
> +	.check = xfrm_policy_check_fce,
> +	.delete = xfrm_policy_delete_fce,
> +};
>  

Any particular reason these flow_cache_entry_ops are not const?

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-01 13:05   ` Eric Dumazet
@ 2010-04-01 13:07     ` Timo Teräs
  0 siblings, 0 replies; 42+ messages in thread
From: Timo Teräs @ 2010-04-01 13:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Herbert Xu

Eric Dumazet wrote:
> Le jeudi 01 avril 2010 à 15:52 +0300, Timo Teras a écrit :
>> @@ -481,6 +482,7 @@ struct xfrm_policy {
>>  	atomic_t		refcnt;
>>  	struct timer_list	timer;
>>  
>> +	struct flow_cache_entry_ops *fc_ops;
>>  	u32			priority;
>>  	u32			index;
>>  	struct xfrm_mark	mark;
> ...
> 
>> +static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
>> +	.get = xfrm_policy_get_fce,
>> +	.check = xfrm_policy_check_fce,
>> +	.delete = xfrm_policy_delete_fce,
>> +};
>>  
> 
> Any particular reason these flow_cache_entry_ops are not const ?

Umm... No. I'll constify them. Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 0/4] caching bundles, iteration 3
  2010-04-01 12:52 [PATCH 0/4] caching bundles, iteration 3 Timo Teras
                   ` (3 preceding siblings ...)
  2010-04-01 12:52 ` [PATCH 4/4] flow: delayed deletion of flow cache entries Timo Teras
@ 2010-04-02  3:00 ` David Miller
  2010-04-02 13:12   ` Herbert Xu
  4 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2010-04-02  3:00 UTC (permalink / raw)
  To: timo.teras; +Cc: netdev, herbert

From: Timo Teras <timo.teras@iki.fi>
Date: Thu,  1 Apr 2010 15:52:16 +0300

> Applies on top of the previous patches I sent.
> 
> Changes to previous iteration:
>  - "flow: delayed deletion of flow cache entries" refactored to go
>    after the other patches per Herbert's request
>  - fixed hlist searching in the above mentioned patch
>  - uses now ERR_CAST and other similar functions for better readability
>  - added basic gc for per-socket bundles
>  - some other minor clean ups
> 
> I'm now running this on a test box, with my specific setup, and it
> seems to be working pretty well. However, this changes quite a bit
> of things, so detailed review is needed.

I'm kind of burnt out on patch review for today, so I hope Herbert
can take a look at this when he wakes up.

Otherwise I'll study it in detail tomorrow.

Thanks Timo!

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 0/4] caching bundles, iteration 3
  2010-04-02  3:00 ` [PATCH 0/4] caching bundles, iteration 3 David Miller
@ 2010-04-02 13:12   ` Herbert Xu
  0 siblings, 0 replies; 42+ messages in thread
From: Herbert Xu @ 2010-04-02 13:12 UTC (permalink / raw)
  To: David Miller; +Cc: timo.teras, netdev

On Thu, Apr 01, 2010 at 08:00:42PM -0700, David Miller wrote:
>
> I'm kind of burnt out on patch review for today, so I hope Herbert
> can take a look at this when he wakes up.

I've been busy with dash stuff today, but I should be able to
review them tomorrow.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-01 12:52 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
  2010-04-01 13:05   ` Eric Dumazet
@ 2010-04-03  3:38   ` Herbert Xu
  2010-04-03  8:36     ` Herbert Xu
  2010-04-04 10:42   ` Herbert Xu
  2 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-03  3:38 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev

On Thu, Apr 01, 2010 at 03:52:17PM +0300, Timo Teras wrote:
> This allows validating the cached object before returning it.
> It also allows destructing the object properly if the last
> reference was held in the flow cache. This is also a preparation
> for caching bundles in the flow cache.
> 
> In return for virtualizing the methods, we save on:
> - not having to regenerate the whole flow cache on policy removal:
>   each flow matching a killed policy gets refreshed as the getter
>   function notices the stale entry.
> - not having to call flow_cache_flush from the policy gc, since the
>   flow cache now properly deletes the object if it held any references
> 
> Signed-off-by: Timo Teras <timo.teras@iki.fi>

With respect to removing the cache flush upon policy removal,
what takes care of the timely purging of the corresponding cache
entries if no new traffic comes through?

The concern is that if they're not purged in the absence of new
traffic, then we may hold references on all sorts of objects,
leading to consequences such as the inability to unregister net
devices.
  
>  struct flow_cache_entry {
> -	struct flow_cache_entry	*next;
> -	u16			family;
> -	u8			dir;
> -	u32			genid;
> -	struct flowi		key;
> -	void			*object;
> -	atomic_t		*object_ref;
> +	struct flow_cache_entry *	next;

Please follow the existing coding style.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-03  3:38   ` Herbert Xu
@ 2010-04-03  8:36     ` Herbert Xu
  2010-04-03 13:50       ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-03  8:36 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev

On Sat, Apr 03, 2010 at 11:38:57AM +0800, Herbert Xu wrote:
> 
> With respect to removing the cache flush upon policy removal,
> what takes care of the timely purging of the corresponding cache
> entries if no new traffic comes through?

In fact this change would seem to render the existing bundle
pruning mechanism ineffective when devices are unregistered.

xfrm_prune_bundles walks through active policy lists to find
the bundles to purge.  However, if a policy has been deleted
while a bundle referencing it is still in the cache, that bundle
will not be pruned.
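
Roughly, the per-policy half of that walk in the current tree looks
like this (simplified sketch of net/xfrm/xfrm_policy.c, not a
verbatim quote):

	static void prune_one_bundle(struct xfrm_policy *pol,
				     int (*func)(struct dst_entry *),
				     struct dst_entry **gc_list_p)
	{
		struct dst_entry *dst, **dstp;

		write_lock(&pol->lock);
		dstp = &pol->bundles;
		while ((dst = *dstp) != NULL) {
			if (func(dst)) {
				/* unlink stale bundle, chain it for gc */
				*dstp = dst->next;
				dst->next = *gc_list_p;
				*gc_list_p = dst;
			} else {
				dstp = &dst->next;
			}
		}
		write_unlock(&pol->lock);
	}

Since only policies still on the active lists are handed to this,
a bundle hanging off an already-deleted policy is never visited.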

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-03  8:36     ` Herbert Xu
@ 2010-04-03 13:50       ` Timo Teräs
  2010-04-03 14:17         ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-03 13:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> With respect to removing the cache flush upon policy removal,
> what takes care of the timely purging of the corresponding cache
> entries if no new traffic comes through?
> 
> The concern is that if they're not purged in the absence of new
> traffic, then we may hold references on all sorts of objects,
> leading to consequences such as the inability to unregister net
> devices.

The flow cache is randomized every ten minutes. Thus all flow
cache entries get recreated regularly.
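
(For reference, the reseeding is driven by a plain timer in
net/core/flow.c; roughly, per the code this series touches:

	static void flow_cache_new_hashrnd(unsigned long arg)
	{
		struct flow_cache *fc = (void *) arg;
		int i;

		/* make each cpu pick a new hash seed on its next
		 * lookup; doing so also drops all of its entries */
		for_each_possible_cpu(i)
			per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

		fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
		add_timer(&fc->rnd_timer);
	}

with FLOW_HASH_RND_PERIOD being the ten minutes mentioned above.)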

> On Sat, Apr 03, 2010 at 11:38:57AM +0800, Herbert Xu wrote:
>> With respect to removing the cache flush upon policy removal,
>> what takes care of the timely purging of the corresponding cache
>> entries if no new traffic comes through?
> 
> In fact this change would seem to render the existing bundle
> pruning mechanism ineffective when devices are unregistered.
> 
> xfrm_prune_bundles walks through active policy lists to find
> the bundles to purge.  However, if a policy has been deleted
> while a bundle referencing it is still in the cache, that bundle
> will not be pruned.

When a policy is killed, policy->genid is incremented, which
makes the xfrm_bundle_ok check fail and causes the bundle to be
pruned immediately on the next flush.
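
Concretely, the check in xfrm_bundle_ok (added in 2/4) is along
these lines (a sketch, not a verbatim quote of the patch):

	/* bundle was built against an older generation of the policy */
	if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
		return 0;

so stale_bundle() fails for such bundles and the next flush prunes
them.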

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-03 13:50       ` Timo Teräs
@ 2010-04-03 14:17         ` Herbert Xu
  2010-04-03 14:26           ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-03 14:17 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sat, Apr 03, 2010 at 04:50:08PM +0300, Timo Teräs wrote:
>
> The flow cache is randomized every ten minutes. Thus all flow
> cache entries get recreated regularly.

Having rmmod <netdrv> block for up to ten minutes is hardly
ideal.

Besides, in future we may want to get rid of the regular reseeding
just like we did for IPv4 routes.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-03 14:17         ` Herbert Xu
@ 2010-04-03 14:26           ` Timo Teräs
  2010-04-03 15:53             ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-03 14:26 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Sat, Apr 03, 2010 at 04:50:08PM +0300, Timo Teräs wrote:
>> The flow cache is randomized every ten minutes. Thus all flow
>> cache entries get recreated regularly.
> 
> Having rmmod <netdrv> block for up to ten minutes is hardly
> ideal.

Why would this block? The device down hook calls flow cache
flush. On flush all bundles with non-up devices get pruned
immediately (via stale_bundle check).
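
(stale_bundle itself is only a thin wrapper that runs the full
bundle validation, roughly:

	static int stale_bundle(struct dst_entry *dst)
	{
		return !xfrm_bundle_ok(NULL, (struct xfrm_dst *)dst,
				       NULL, AF_UNSPEC, 0);
	}

and xfrm_bundle_ok, among other things, fails for bundles whose
device is no longer up.)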

> Besides, in future we may want to get rid of the regular reseeding
> just like we did for IPv4 routes.

Right. It certainly sounds good. And needs a separate change
then.

If this is done, we will still need some sort of periodic gc
for the flow cache, just like ipv4 routes have. On each tick,
the gc would scan just some of the flow cache hash chains.
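
Hypothetically, such a gc tick could reuse the flow_entry_valid()
and flow_entry_kill() helpers from 1/4; in this sketch the rover
field and the chains-per-tick count are made-up names, not part of
this series:

	static void flow_cache_gc_tick(struct flow_cache *fc,
				       struct flow_cache_percpu *fcp)
	{
		struct flow_cache_entry *fle, **flp;
		int n;

		for (n = 0; n < 4; n++) {	/* a few chains per tick */
			flp = &fcp->hash_table[fcp->rover];
			while ((fle = *flp) != NULL) {
				if (flow_entry_valid(fle)) {
					flp = &fle->next;
				} else {
					/* unlink and free invalid entry */
					*flp = fle->next;
					flow_entry_kill(fc, fcp, fle);
				}
			}
			fcp->rover = (fcp->rover + 1) &
				     (flow_cache_hash_size(fc) - 1);
		}
	}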

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-03 14:26           ` Timo Teräs
@ 2010-04-03 15:53             ` Herbert Xu
  2010-04-03 20:19               ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-03 15:53 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sat, Apr 03, 2010 at 05:26:00PM +0300, Timo Teräs wrote:
>
> Why would this block? The device down hook calls flow cache
> flush. On flush all bundles with non-up devices get pruned
> immediately (via stale_bundle check).

Perhaps I missed something in your patch, but the flush that
we currently perform is limited to the bundles from hashed policies.
So if a policy has just recently been removed, then its bundles
won't be flushed.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-03 15:53             ` Herbert Xu
@ 2010-04-03 20:19               ` Timo Teräs
  2010-04-04  2:06                 ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-03 20:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Sat, Apr 03, 2010 at 05:26:00PM +0300, Timo Teräs wrote:
>> Why would this block? The device down hook calls flow cache
>> flush. On flush all bundles with non-up devices get pruned
>> immediately (via stale_bundle check).
> 
> Perhaps I missed something in your patch, but the flush that
> we currently perform is limited to the bundles from hashed policies.
> So if a policy has just recently been removed, then its bundles
> won't be flushed.

If a policy is removed, policy->genid is incremented, invalidating
the bundles. Those bundles get freed when:
 - a specific flow gets hit
 - the cache is flushed due to a GC call, or an interface going down
 - the flow cache is randomized

If someone is then removing a net driver, we still execute
flush on the 'device down' hook, and all stale bundles
get flushed.

But yes, this means that the xfrm_policy struct can now be held
allocated for up to ten extra minutes. But it's only memory that
it's holding, not any extra refs. And it's still reclaimable
by the GC.

If this feels troublesome, we could add an asynchronous flush
request that would be called on policy removal. Or even stick
to the synchronous one.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-03 20:19               ` Timo Teräs
@ 2010-04-04  2:06                 ` Herbert Xu
  2010-04-04  5:50                   ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-04  2:06 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sat, Apr 03, 2010 at 11:19:04PM +0300, Timo Teräs wrote:
>
> If someone is then removing a net driver, we still execute
> flush on the 'device down' hook, and all stale bundles
> get flushed.

Not if the bundle belongs to a policy recently deleted.

> But yes, this means that the xfrm_policy struct can now be held
> allocated for up to ten extra minutes. But it's only memory that
> it's holding, not any extra refs. And it's still reclaimable
> by the GC.

You also hold down the bundle xdst's along with it, which can
hold netdev references preventing modules from being unloaded.

> If this feels troublesome, we could add an asynchronous flush
> request that would be called on policy removal. Or even stick
> to the synchronous one.

How about change xfrm_flush_bundles to flush bundles from the
cache instead of xfrm_policy?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04  2:06                 ` Herbert Xu
@ 2010-04-04  5:50                   ` Timo Teräs
  2010-04-04  5:58                     ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-04  5:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Sat, Apr 03, 2010 at 11:19:04PM +0300, Timo Teräs wrote:
>> If someone is then removing a net driver, we still execute
>> flush on the 'device down' hook, and all stale bundles
>> get flushed.
> 
> Not if the bundle belongs to a policy recently deleted.
> 
>> But yes, this means that the xfrm_policy struct can now be held
>> allocated for up to ten extra minutes. But it's only memory that
>> it's holding, not any extra refs. And it's still reclaimable
>> by the GC.
> 
> You also hold down the bundle xdst's along with it, which can
> hold netdev references preventing modules from being unloaded.
> 
>> If this feels troublesome, we could add an asynchronous flush
>> request that would be called on policy removal. Or even stick
>> to the synchronous one.
> 
> How about change xfrm_flush_bundles to flush bundles from the
> cache instead of xfrm_policy?

For the common case:

1. Policy deleted; policy->walk.dead set, policy->genid incremented
2. NETDEV_DOWN hook called, calls flow_cache_flush()
3. flow_cache_flush enumerates all policy and bundle refs
   in its cache
4. for each bundle xfrm_bundle_check_fce() is called, which
   calls stale_bundle()
 5. all bundles using a stale policy fail that check because
     xdst->policy_genid != xdst->pols[0]->genid
   (checked in xfrm_bundle_ok)
6. flow cache calls entry's ->delete which is dst_free for bundles
7. flow_cache_flush() returns

flow_cache_flush really does free the bundles it holds.
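
(For reference, the bundle check callback in 2/4 is essentially
just a thin wrapper around stale_bundle(), along these lines,
modulo the exact field names:

	static int xfrm_bundle_check_fce(struct flow_cache_entry_ops **ops)
	{
		struct xfrm_dst *xdst =
			container_of(ops, struct xfrm_dst, fc_ops);

		/* reuse the existing bundle validation */
		return !stale_bundle(&xdst->u.dst);
	}

)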

But now that I look at my code again, your statement is true for
per-socket bundles: they would not get deleted in this case.
I'll change NETDEV_DOWN to call garbage collect instead of
flow cache flush, which will then also free the per-socket bundles.
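
I.e. something along these lines, assuming the existing notifier is
kept and only its action changes (xfrm_garbage_collect here stands
for whatever the gc entry point ends up being named):

	static int xfrm_dev_event(struct notifier_block *this,
				  unsigned long event, void *ptr)
	{
		struct net_device *dev = ptr;

		switch (event) {
		case NETDEV_DOWN:
			/* gc also reaps per-socket bundles, which a
			 * plain flow cache flush would miss */
			xfrm_garbage_collect(dev_net(dev));
		}
		return NOTIFY_DONE;
	}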

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04  5:50                   ` Timo Teräs
@ 2010-04-04  5:58                     ` Herbert Xu
  2010-04-04  6:07                       ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-04  5:58 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sun, Apr 04, 2010 at 08:50:41AM +0300, Timo Teräs wrote:
>
> For the common case:
>
> 1. Policy deleted; policy->walk.dead set, policy->genid incremented
> 2. NETDEV_DOWN hook called, calls flow_cache_flush()
> 3. flow_cache_flush enumerates all policy and bundle refs
>   in its cache
> 4. for each bundle xfrm_bundle_check_fce() is called, which
>   calls stale_bundle()
> 5. all bundles using a stale policy fail that check because
>     xdst->policy_genid != xdst->pols[0]->genid
>   (checked in xfrm_bundle_ok)
> 6. flow cache calls entry's ->delete which is dst_free for bundles
> 7. flow_cache_flush() returns

Ah, you're doing it in 2/4.  Can we please have each patch be
a self-contained unit? It should be possible to apply 1/4 and
have a resulting kernel that works properly without having to
apply the rest of your patches.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04  5:58                     ` Herbert Xu
@ 2010-04-04  6:07                       ` Timo Teräs
  2010-04-04  6:19                         ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-04  6:07 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 08:50:41AM +0300, Timo Teräs wrote:
>> For the common case:
>>
>> 1. Policy deleted; policy->walk.dead set, policy->genid incremented
>> 2. NETDEV_DOWN hook called, calls flow_cache_flush()
>> 3. flow_cache_flush enumerates all policy and bundle refs
>>   in its cache
>> 4. for each bundle xfrm_bundle_check_fce() is called, which
>>   calls stale_bundle()
>> 5. all bundles using a stale policy fail that check because
>>     xdst->policy_genid != xdst->pols[0]->genid
>>   (checked in xfrm_bundle_ok)
>> 6. flow cache calls entry's ->delete which is dst_free for bundles
>> 7. flow_cache_flush() returns
> 
> Ah, you're doing it in 2/4.  Can we please have each patch be
> a self-contained unit? It should be possible to apply 1/4 and
> have a resulting kernel that works properly without having to
> apply the rest of your patches.

With 1/4 only, the bundle deletion is not touched. In that case
the policy GC explicitly deletes the bundles. The bundles get
deleted immediately, and only the struct xfrm_policy might stay
allocated longer.

The code flow would be:
 1. xfrm_policy_kill() queues to GC
 2. xfrm_policy_gc_kill() called from xfrm_policy_gc_task()
    frees all bundles in that policy
 3. xfrm_policy_gc_kill() releases its reference
 4. ... time passes (flush, randomization, or flow hit occurs)
 5. flow cache releases its final reference, calls
    xfrm_policy_destroy() which only frees the xfrm_policy memory
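
I.e. with only 1/4 applied, xfrm_policy_gc_kill() stays roughly:

	static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
	{
		struct dst_entry *dst;

		/* bundles are still freed here, synchronously */
		while ((dst = policy->bundles) != NULL) {
			policy->bundles = dst->next;
			dst_free(dst);
		}

		if (del_timer(&policy->timer))
			atomic_dec(&policy->refcnt);

		xfrm_pol_put(policy);
	}

with only the flow_cache_flush() call gone from it.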

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04  6:07                       ` Timo Teräs
@ 2010-04-04  6:19                         ` Herbert Xu
  2010-04-04  6:28                           ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-04  6:19 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sun, Apr 04, 2010 at 09:07:12AM +0300, Timo Teräs wrote:
>
> With 1/4 only, the bundle deletion is not touched. In that case
> the policy GC explicitly deletes the bundles. The bundles get
> deleted immediately, and only the struct xfrm_policy might stay
> allocated longer.
>
> The code flow would be:
> 1. xfrm_policy_kill() queues to GC
> 2. xfrm_policy_gc_kill() called from xfrm_policy_gc_task()
>    frees all bundles in that policy
> 3. xfrm_policy_gc_kill() releases its reference
> 4. ... time passes (flush, randomization, or flow hit occurs)
> 5. flow cache releases its final reference, calls
>    xfrm_policy_destroy() which only frees the xfrm_policy memory

With 1/4 only, you've removed the flow cache flush when policies
are deleted.  However, you don't add the flow cache flush to the
NETDEV_DOWN case until 2/4.  So with only 1/4, bundles (and hence
netdev objects) may be held for up to 10 minutes.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04  6:19                         ` Herbert Xu
@ 2010-04-04  6:28                           ` Timo Teräs
  2010-04-04  8:35                             ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-04  6:28 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 09:07:12AM +0300, Timo Teräs wrote:
>> With 1/4 only, the bundle deletion is not touched. In that case
>> the policy GC explicitly deletes the bundles. The bundles get
>> deleted immediately, and only the struct xfrm_policy might stay
>> allocated longer.
>>
>> The code flow would be:
>> 1. xfrm_policy_kill() queues to GC
>> 2. xfrm_policy_gc_kill() called from xfrm_policy_gc_task()
>>    frees all bundles in that policy
>> 3. xfrm_policy_gc_kill() releases its reference
>> 4. ... time passes (flush, randomization, or flow hit occurs)
>> 5. flow cache releases its final reference, calls
>>    xfrm_policy_destroy() which only frees the xfrm_policy memory
> 
> With 1/4 only, you've removed the flow cache flush when policies
> are deleted.  However, you don't add the flow cache flush to the
> NETDEV_DOWN case until 2/4.  So with only 1/4, bundles (and hence
> netdev objects) may be held for up to 10 minutes.

No. The flow cache flush removal does not prevent bundle deletion.
In the current code, the flow cache flush happens *after* the bundles
are deleted from the policy. Freeing bundles and flushing the cache
are two completely separate things in the current code. Only in 2/4
does the bundle deletion become dependent on the flow cache flush.

Please read xfrm_policy_kill() and xfrm_policy_gc_kill() in the current
code, and after applying 1/4. The diff of 1/4 is not as informative,
since the bundle deletion code is not shown there (it's not touched).

The only reason why the current code does a flow cache flush on policy
removal is to make sure that it's not the flow cache's atomic_dec
that drops the last reference; if that happened, xfrm_policy_destroy
would never get called.
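
That is, in the old scheme the cache's drop path is just

	if (fle->object)
		atomic_dec(fle->object_ref);

so if that atomic_dec were to drop the count to zero, nothing would
ever call xfrm_policy_destroy(); hence the forced flush while a real
reference is known to exist elsewhere.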

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04  6:28                           ` Timo Teräs
@ 2010-04-04  8:35                             ` Herbert Xu
  0 siblings, 0 replies; 42+ messages in thread
From: Herbert Xu @ 2010-04-04  8:35 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sun, Apr 04, 2010 at 09:28:43AM +0300, Timo Teräs wrote:
>
> No. The flow cache flush removal does not prevent bundle deletion.
> In the current code, the flow cache flush happens *after* the bundles
> are deleted from the policy. Freeing bundles and flushing the cache
> are two completely separate things in the current code. Only in 2/4
> does the bundle deletion become dependent on the flow cache flush.

Ah yes I confused myself.  The problem I thought would occur
doesn't because 1/4 doesn't put the bundles in the flow cache
yet.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-01 12:52 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
  2010-04-01 13:05   ` Eric Dumazet
  2010-04-03  3:38   ` Herbert Xu
@ 2010-04-04 10:42   ` Herbert Xu
  2010-04-04 10:50     ` Timo Teräs
  2 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-04 10:42 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev

On Thu, Apr 01, 2010 at 03:52:17PM +0300, Timo Teras wrote:
>
> -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
> -			       u8 dir, flow_resolve_t resolver);
> +struct flow_cache_entry_ops {
> +	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
> +	int (*check)(struct flow_cache_entry_ops **);
> +	void (*delete)(struct flow_cache_entry_ops **);
> +};
> +
> +typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
> +		struct net *net, struct flowi *key, u16 family,
> +		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);

OK this bit really bugs me.

When I first looked at it, my reaction was why on earth are we
returning an ops pointer? Only after some digging around did I see
that this ops pointer is in fact embedded in xfrm_policy.

How about embedding flow_cache_entry in xfrm_policy instead? Returning
flow_cache_entry * would make a lot more sense than a nested pointer
to flow_cache_entry_ops.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04 10:42   ` Herbert Xu
@ 2010-04-04 10:50     ` Timo Teräs
  2010-04-04 11:00       ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-04 10:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Thu, Apr 01, 2010 at 03:52:17PM +0300, Timo Teras wrote:
>> -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
>> -			       u8 dir, flow_resolve_t resolver);
>> +struct flow_cache_entry_ops {
>> +	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
>> +	int (*check)(struct flow_cache_entry_ops **);
>> +	void (*delete)(struct flow_cache_entry_ops **);
>> +};
>> +
>> +typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
>> +		struct net *net, struct flowi *key, u16 family,
>> +		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);
> 
> OK this bit really bugs me.
> 
> When I first looked at it, my reaction was why on earth are we
> returning an ops pointer? Only after some digging around did I see
> that this ops pointer is in fact embedded in xfrm_policy.
> 
> How about embedding flow_cache_entry in xfrm_policy instead? Returning
> flow_cache_entry * would make a lot more sense than a nested pointer
> to flow_cache_entry_ops.

Because flow_cache_entry is per-cpu, and multiple entries (due to
different flows matching the same policies, or the same flow having
multiple per-cpu entries) can point to the same policy. If we cached
"dummy" objects even for policies, then this would be a better approach.

This would actually make sense, since it'd be useful to cache all
policies involved in the check path (main + sub policy refs). In that
case we might want to make the ops 'per flow cache instance' instead
of 'per cache entry'.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04 10:50     ` Timo Teräs
@ 2010-04-04 11:00       ` Herbert Xu
  2010-04-04 11:06         ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-04 11:00 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sun, Apr 04, 2010 at 01:50:16PM +0300, Timo Teräs wrote:
>
> Because flow_cache_entry is per-cpu, and multiple entries (due to
> different flows matching the same policies, or the same flow having
> multiple per-cpu entries) can point to the same policy. If we cached
> "dummy" objects even for policies, then this would be a better approach.

Oh yes of course.

But what we could do is embed most of flow_cache_entry into
xfrm_policy (and xdst in your latter patches) along with the
ops pointer.

Like this:

struct flow_cache_object {
	u16			family;
	u8			dir;
	u32			genid;
	struct flowi		key;
	struct flow_cache_ops **ops;
};

struct flow_cache_entry {
	struct flow_cache_entry	*next;
	struct flow_cache_object *obj;
};

struct xfrm_policy {
	struct flow_cache_object flo;
	...
};

What do you think?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04 11:00       ` Herbert Xu
@ 2010-04-04 11:06         ` Timo Teräs
  2010-04-04 11:26           ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-04 11:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 01:50:16PM +0300, Timo Teräs wrote:
>> Because flow_cache_entry is per-cpu, and multiple entries (due to
>> different flows matching the same policies, or the same flow having
>> multiple per-cpu entries) can point to the same policy. If we cached
>> "dummy" objects even for policies, then this would be a better approach.
> 
> Oh yes of course.
> 
> But what we could do is embed most of flow_cache_entry into
> xfrm_policy (and xdst in your latter patches) along with the
> ops pointer.
> 
> Like this:
> 
> struct flow_cache_object {
> 	u16			family;
> 	u8			dir;
> 	u32			genid;
> 	struct flowi		key;
> 	struct flow_cache_ops **ops;
> };
> 
> struct flow_cache_entry {
> 	struct flow_cache_entry	*next;
> 	struct flow_cache_object *obj;
> };
> 
> struct xfrm_policy {
> 	struct flow_cache_object flo;
> 	...
> };
> 
> What do you think?

It would still not work for policies. For every policy X we
can get N+1 different matches with separate struct flowi contents.
It's not possible to put a single struct flowi or any other of
the flow details into xfrm_policy. It's an N-to-1 mapping, not
a 1-to-1 mapping.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04 11:06         ` Timo Teräs
@ 2010-04-04 11:26           ` Herbert Xu
  2010-04-04 11:31             ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-04 11:26 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sun, Apr 04, 2010 at 02:06:55PM +0300, Timo Teräs wrote:
>
> It would still not work for policies. For every policy X we
> can get N+1 different matches with separate struct flowi contents.
> It's not possible to put a single struct flowi or any other of
> the flow details into xfrm_policy. It's an N-to-1 mapping, not
> a 1-to-1 mapping.

Fine, move key into flow_cache_entry but the rest should still
work, no?

struct flow_cache_object {
	u16			family;
	u8			dir;
	u32			genid;
	struct flow_cache_ops  *ops;
};

struct flow_cache_entry {
	struct flow_cache_entry	*next;
	struct flowi key;
	struct flow_cache_object *obj;
};

struct xfrm_policy {
	struct flow_cache_object flo;
	...
};

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04 11:26           ` Herbert Xu
@ 2010-04-04 11:31             ` Herbert Xu
  2010-04-04 12:09               ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-04 11:31 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Sun, Apr 04, 2010 at 07:26:36PM +0800, Herbert Xu wrote:
> 
> Fine, move key into flow_cache_entry but the rest should still
> work, no?
> 
> struct flow_cache_object {
> 	u16			family;
> 	u8			dir;
> 	u32			genid;
> 	struct flow_cache_ops  *ops;
> };
> 
> struct flow_cache_entry {
> 	struct flow_cache_entry	*next;
> 	struct flowi key;
> 	struct flow_cache_object *obj;
> };
> 
> struct xfrm_policy {
> 	struct flow_cache_object flo;
> 	...
> };

OK this doesn't work either as we still have NULL objects for
now.  But I still think even if the ops pointer is the only
member in flow_cache_object, it looks better than returning the
nested ops pointer directly from flow_cache_lookup.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-04 11:31             ` Herbert Xu
@ 2010-04-04 12:09               ` Timo Teräs
  0 siblings, 0 replies; 42+ messages in thread
From: Timo Teräs @ 2010-04-04 12:09 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 07:26:36PM +0800, Herbert Xu wrote:
>> Fine, move key into flow_cache_entry but the rest should still
>> work, no?
>
> OK this doesn't work either as we still have NULL objects for
> now.  But I still think even if the ops pointer is the only
> member in flow_cache_object, it looks better than returning the
> nested ops pointer directly from flow_cache_lookup.

Yes, it'll look better. I'll wrap the pointer in a struct.

Ok, so far it's:
 - constify ops
 - indentation fixes for flow.c structs with pointer members
 - wrap ops* in a struct* to avoid ops**

Will fix and resend refreshed patches tomorrow.

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-07 10:30 [PATCH 0/4] caching bundles, iteration 5 Timo Teras
@ 2010-04-07 10:30 ` Timo Teras
  0 siblings, 0 replies; 42+ messages in thread
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

This allows validating the cached object before returning it.
It also allows destructing the object properly if the last
reference was held in the flow cache. This is also a preparation
for caching bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed as the getter
  function notices the stale entry.
- not having to call flow_cache_flush from the policy gc, since the
  flow cache now properly deletes the object if it held any references

Signed-off-by: Timo Teras <timo.teras@iki.fi>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/net/flow.h     |   23 +++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  128 +++++++++++++++++++++++++----------------------
 net/xfrm/xfrm_policy.c |  112 ++++++++++++++++++++++++++++--------------
 4 files changed, 163 insertions(+), 102 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..bb08692 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,26 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
+struct flow_cache_ops;
+
+struct flow_cache_object {
+	const struct flow_cache_ops *ops;
+};
+
+struct flow_cache_ops {
+	struct flow_cache_object *(*get)(struct flow_cache_object *);
+	int (*check)(struct flow_cache_object *);
+	void (*delete)(struct flow_cache_object *);
+};
+
+typedef struct flow_cache_object *(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_object *oldobj, void *ctx);
+
+extern struct flow_cache_object *flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..35396e2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@ struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_object flo;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..521df52 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,17 +26,16 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry		*next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_object	*object;
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry **	hash_table;
+	struct flow_cache_entry		**hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -44,7 +43,7 @@ struct flow_cache_percpu {
 };
 
 struct flow_flush_info {
-	struct flow_cache *		cache;
+	struct flow_cache		*cache;
 	atomic_t			cpuleft;
 	struct completion		completion;
 };
@@ -52,7 +51,7 @@ struct flow_flush_info {
 struct flow_cache {
 	u32				hash_shift;
 	unsigned long			order;
-	struct flow_cache_percpu *	percpu;
+	struct flow_cache_percpu	*percpu;
 	struct notifier_block		hotcpu_notifier;
 	int				low_watermark;
 	int				high_watermark;
@@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->object && !fle->object->ops->check(fle->object))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
 	if (fle->object)
-		atomic_dec(fle->object_ref);
+		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_object *flo;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	flo = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,27 +198,17 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
-	if (!fle) {
+	if (unlikely(!fle)) {
 		if (fcp->hash_count > fc->high_watermark)
 			flow_cache_shrink(fc, fcp);
 
@@ -219,33 +222,39 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			fle->object = NULL;
 			fcp->hash_count++;
 		}
+	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
+	} else if (fle->object) {
+	        flo = fle->object;
+	        flo->ops->delete(flo);
+	        fle->object = NULL;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	flo = NULL;
+	if (fle) {
+		flo = fle->object;
+		fle->object = NULL;
+	}
+	flo = resolver(net, key, family, dir, flo, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (!IS_ERR(flo))
+			fle->object = flo;
+		else
+			fle->genid--;
+	} else {
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
 	}
+ret_object:
+	local_bh_enable();
+	return flo;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +270,12 @@ static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
+			if (fle->object)
+				fle->object->ops->delete(fle->object);
 			fle->object = NULL;
-			atomic_dec(fle->object_ref);
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..7722bae 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,35 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	if (unlikely(pol->walk.dead))
+		flo = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return flo;
+}
+
+static int xfrm_policy_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
+{
+	xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
+}
+
+static const struct flow_cache_ops xfrm_policy_fc_ops = {
+	.get = xfrm_policy_flo_get,
+	.check = xfrm_policy_flo_check,
+	.delete = xfrm_policy_flo_delete,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +265,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->flo.ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +299,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +688,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +728,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +845,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +998,35 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_object *
+xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_obj)
+		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->flo;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1116,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1601,24 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_object *flo;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(flo);
+		if (IS_ERR(flo)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (flo)
+			policy = container_of(flo, struct xfrm_policy, flo);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1968,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_object *flo;
+
+		flo = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR_OR_NULL(flo))
+			pol = ERR_CAST(flo);
+		else
+			pol = container_of(flo, struct xfrm_policy, flo);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-06 13:26                   ` Timo Teräs
@ 2010-04-07  9:15                     ` David Miller
  0 siblings, 0 replies; 42+ messages in thread
From: David Miller @ 2010-04-07  9:15 UTC (permalink / raw)
  To: timo.teras; +Cc: herbert, netdev

From: Timo Teräs <timo.teras@iki.fi>
Date: Tue, 06 Apr 2010 16:26:28 +0300

> As noticed in the review of 2/4, this needs to be fixed by calling
> the flow object's delete() unconditionally if the genid is outdated.
> I'll repost with this fixed for the next iteration.

I went over the current versions of these patches and
the general scheme looks fine to me.

Please resubmit a new set of patches with the final fixes to the
genid delete handling.

Thanks a lot for doing this work!

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-06 12:34                 ` Herbert Xu
@ 2010-04-06 13:26                   ` Timo Teräs
  2010-04-07  9:15                     ` David Miller
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-06 13:26 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Mon, Apr 05, 2010 at 08:01:24PM +0300, Timo Teras wrote:
>> This allows validating the cached object before returning it.
>> It also allows destructing the object properly if the last
>> reference was held in the flow cache. This is also a preparation
>> for caching bundles in the flow cache.
>>
>> In return for virtualizing the methods, we save on:
>> - not having to regenerate the whole flow cache on policy removal:
>>   each flow matching a killed policy gets refreshed as the getter
>>   function notices the stale entry.
>> - not having to call flow_cache_flush from the policy gc, since the
>>   flow cache now properly deletes the object if it held any references
>>
>> Signed-off-by: Timo Teras <timo.teras@iki.fi>
> 
> Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
> 
> Thanks a lot for the patch!

As noticed in the review of 2/4, this needs to be fixed by calling
the flow object's delete() unconditionally if the genid is outdated.
I'll repost with this fixed for the next iteration.
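
I.e. the lookup path will gain an unconditional drop of the stale
object, along these lines:

	} else if (fle->object) {
		flo = fle->object;
		flo->ops->delete(flo);
		fle->object = NULL;
	}

so an entry with an outdated genid releases its object even before
the resolver is re-run for it.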


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05 17:01               ` Timo Teras
@ 2010-04-06 12:34                 ` Herbert Xu
  2010-04-06 13:26                   ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-06 12:34 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev

On Mon, Apr 05, 2010 at 08:01:24PM +0300, Timo Teras wrote:
> This allows validating the cached object before returning it.
> It also allows destructing the object properly if the last
> reference was held in the flow cache. This is also a preparation
> for caching bundles in the flow cache.
> 
> In return for virtualizing the methods, we save on:
> - not having to regenerate the whole flow cache on policy removal:
>   each flow matching a killed policy gets refreshed as the getter
>   function notices the stale entry.
> - not having to call flow_cache_flush from the policy gc, since the
>   flow cache now properly deletes the object if it held any references
> 
> Signed-off-by: Timo Teras <timo.teras@iki.fi>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>

Thanks a lot for the patch!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  9:12             ` Herbert Xu
@ 2010-04-05 17:01               ` Timo Teras
  2010-04-06 12:34                 ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teras @ 2010-04-05 17:01 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, Timo Teras

This allows validating the cached object before returning it.
It also allows destructing the object properly if the last
reference was held in the flow cache. This is also a preparation
for caching bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed as the getter
  function notices the stale entry.
- not having to call flow_cache_flush from the policy gc, since the
  flow cache now properly deletes the object if it held any references

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 include/net/flow.h     |   23 +++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  122 +++++++++++++++++++++++++-----------------------
 net/xfrm/xfrm_policy.c |  112 +++++++++++++++++++++++++++++---------------
 4 files changed, 158 insertions(+), 101 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..bb08692 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,26 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
+struct flow_cache_ops;
+
+struct flow_cache_object {
+	const struct flow_cache_ops *ops;
+};
+
+struct flow_cache_ops {
+	struct flow_cache_object *(*get)(struct flow_cache_object *);
+	int (*check)(struct flow_cache_object *);
+	void (*delete)(struct flow_cache_object *);
+};
+
+typedef struct flow_cache_object *(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_object *oldobj, void *ctx);
+
+extern struct flow_cache_object *flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..35396e2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@ struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_object flo;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..4e9fd37 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,17 +26,16 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry		*next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_object	*object;
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry **	hash_table;
+	struct flow_cache_entry		**hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -44,7 +43,7 @@ struct flow_cache_percpu {
 };
 
 struct flow_flush_info {
-	struct flow_cache *		cache;
+	struct flow_cache		*cache;
 	atomic_t			cpuleft;
 	struct completion		completion;
 };
@@ -52,7 +51,7 @@ struct flow_flush_info {
 struct flow_cache {
 	u32				hash_shift;
 	unsigned long			order;
-	struct flow_cache_percpu *	percpu;
+	struct flow_cache_percpu	*percpu;
 	struct notifier_block		hotcpu_notifier;
 	int				low_watermark;
 	int				high_watermark;
@@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->object && !fle->object->ops->check(fle->object))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
 	if (fle->object)
-		atomic_dec(fle->object_ref);
+		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_object *flo;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	flo = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,24 +198,14 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
 	if (!fle) {
@@ -219,33 +222,35 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			fle->object = NULL;
 			fcp->hash_count++;
 		}
+	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	flo = NULL;
+	if (fle) {
+		flo = fle->object;
+		fle->object = NULL;
+	}
+	flo = resolver(net, key, family, dir, flo, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (!IS_ERR(flo))
+			fle->object = flo;
+		else
+			fle->genid--;
+	} else {
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
 	}
+ret_object:
+	local_bh_enable();
+	return flo;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +266,12 @@ static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
+			if (fle->object)
+				fle->object->ops->delete(fle->object);
 			fle->object = NULL;
-			atomic_dec(fle->object_ref);
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..7722bae 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,35 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	if (unlikely(pol->walk.dead))
+		flo = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return flo;
+}
+
+static int xfrm_policy_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
+{
+	xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
+}
+
+static const struct flow_cache_ops xfrm_policy_fc_ops = {
+	.get = xfrm_policy_flo_get,
+	.check = xfrm_policy_flo_check,
+	.delete = xfrm_policy_flo_delete,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +265,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->flo.ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +299,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +688,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +728,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +845,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +998,35 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_object *
+xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_obj)
+		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->flo;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1116,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1601,24 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_object *flo;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(flo);
+		if (IS_ERR(flo)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (flo)
+			policy = container_of(flo, struct xfrm_policy, flo);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1968,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_object *flo;
+
+		flo = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR_OR_NULL(flo))
+			pol = ERR_CAST(flo);
+		else
+			pol = container_of(flo, struct xfrm_policy, flo);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 42+ messages in thread
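
A minimal, self-contained sketch of the virtualized-methods pattern the
patch introduces may help readers following the diff: the cached object
embeds an ops pointer, and each provider recovers its enclosing structure
with container_of(), as xfrm_policy does above. The names below mirror the
patch, but the cache machinery is stubbed out, so this is only a userspace
approximation, not kernel code:

	#include <stdio.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct flow_cache_object;

	struct flow_cache_ops {
		struct flow_cache_object *(*get)(struct flow_cache_object *);
		int (*check)(struct flow_cache_object *);
		void (*delete)(struct flow_cache_object *);
	};

	struct flow_cache_object {
		const struct flow_cache_ops *ops;
	};

	/* A provider embedding flow_cache_object, like xfrm_policy. */
	struct my_policy {
		int refcnt;
		int dead;
		struct flow_cache_object flo;
	};

	static struct flow_cache_object *my_get(struct flow_cache_object *flo)
	{
		struct my_policy *pol = container_of(flo, struct my_policy, flo);

		if (pol->dead)
			return NULL;	/* stale: force a re-resolve */
		pol->refcnt++;		/* hand out a new reference */
		return flo;
	}

	static int my_check(struct flow_cache_object *flo)
	{
		return !container_of(flo, struct my_policy, flo)->dead;
	}

	static void my_delete(struct flow_cache_object *flo)
	{
		struct my_policy *pol = container_of(flo, struct my_policy, flo);

		if (--pol->refcnt == 0)
			printf("last reference dropped, freeing object\n");
	}

	static const struct flow_cache_ops my_ops = {
		.get	= my_get,
		.check	= my_check,
		.delete	= my_delete,
	};

	int main(void)
	{
		struct my_policy pol = { .refcnt = 1, .flo.ops = &my_ops };
		struct flow_cache_object *flo = &pol.flo;

		if (flo->ops->get(flo))		/* cache hit path */
			flo->ops->delete(flo);	/* caller done with it */
		pol.dead = 1;			/* policy killed */
		printf("check after kill: %d\n", flo->ops->check(flo));
		flo->ops->delete(flo);		/* drop the cache's reference */
		return 0;
	}

This is what lets the cache validate entries lazily: a killed policy fails
->check() and ->get(), so its entries get refreshed or reaped without a
global flush.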

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  8:53           ` Timo Teräs
@ 2010-04-05  9:12             ` Herbert Xu
  2010-04-05 17:01               ` Timo Teras
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-05  9:12 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Mon, Apr 05, 2010 at 11:53:07AM +0300, Timo Teräs wrote:
>
> Right. I'm fine either way. Is there any preference?

I don't mind either.  You can choose either, or maybe come up with
something better :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  8:49         ` Herbert Xu
@ 2010-04-05  8:53           ` Timo Teräs
  2010-04-05  9:12             ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-05  8:53 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Mon, Apr 05, 2010 at 04:44:22PM +0800, Herbert Xu wrote:
>>> It might actually make more sense to pass struct flow_cache_object**
>>> so the resolver can twiddle the flow_cache_entry's object. Then it'd
>>> be more explicit that the resolver is replacing entries.
>> Yes that sounds good.
> 
> Alternatively you can pass in a struct flow_cache_entry *.
> 
> Yet another way would be to keep it the same but move the NULL
> setting before the resolver call:
> 
> 	flo = NULL;
> 	if (fle) {
> 		flo = fle->object;
> 		fle->object = NULL;
> 	}
> 	flo = resolver(..., flo, ...);
> 
> This way it's obvious that we've given the reference over to
> the resolver.

Right. I'm fine either way. Is there any preference?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  8:44       ` Herbert Xu
@ 2010-04-05  8:49         ` Herbert Xu
  2010-04-05  8:53           ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-05  8:49 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Mon, Apr 05, 2010 at 04:44:22PM +0800, Herbert Xu wrote:
>
> > It might actually make more sense to pass struct flow_cache_object**
> > so the resolver can twiddle the flow_cache_entry's object. Then it'd
> > be more explicit that the resolver is replacing entries.
> 
> Yes that sounds good.

Alternatively you can pass in a struct flow_cache_entry *.

Yet another way would be to keep it the same but move the NULL
setting before the resolver call:

	flo = NULL;
	if (fle) {
		flo = fle->object;
		fle->object = NULL;
	}
	flo = resolver(..., flo, ...);

This way it's obvious that we've given the reference over to
the resolver.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread
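
Under the convention sketched above, the resolver owns the old reference in
every outcome, so a resolver starts by consuming it unconditionally -- this
is what xfrm_policy_lookup() does in the patch via xfrm_pol_put(). A
hypothetical fragment, reusing the stub types from the userspace sketch
earlier in the thread and simplifying the kernel signature:

	static struct flow_cache_object *
	my_resolver(struct flow_cache_object *old_obj)
	{
		/* The lookup path detached old_obj from the cache entry
		 * before calling us, so the reference is ours to drop
		 * whether we succeed, fail, or find nothing. */
		if (old_obj)
			old_obj->ops->delete(old_obj);

		/* ... resolve a fresh object here, taking two references:
		 * one for the cache entry and one for the caller ... */
		return NULL;	/* "no object" is a valid outcome */
	}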

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  8:36     ` Timo Teräs
@ 2010-04-05  8:44       ` Herbert Xu
  2010-04-05  8:49         ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-05  8:44 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Mon, Apr 05, 2010 at 11:36:35AM +0300, Timo Teräs wrote:
> Herbert Xu wrote:
>> On Mon, Apr 05, 2010 at 10:00:21AM +0300, Timo Teras wrote:
>>> @@ -219,33 +222,32 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
>>> +	flo = resolver(net, key, family, dir, fle ? fle->object : NULL, ctx);
>>> +	if (fle) {
>>> +		fle->genid = atomic_read(&flow_cache_genid);
>>> +		if (IS_ERR(flo)) {
>>> +			fle->genid--;
>>> +			fle->object = NULL;
>>
>> Shouldn't we call fle->object->ops->delete here?
>
> The resolver function releases the old object.

I see.

> It might actually make more sense to pass struct flow_cache_object**
> so the resolver can twiddle the flow_cache_entry's object. Then it'd
> be more explicit that the resolver is replacing entries.

Yes that sounds good.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  8:33   ` Herbert Xu
@ 2010-04-05  8:36     ` Timo Teräs
  2010-04-05  8:44       ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teräs @ 2010-04-05  8:36 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Mon, Apr 05, 2010 at 10:00:21AM +0300, Timo Teras wrote:
>> @@ -219,33 +222,32 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
>> +	flo = resolver(net, key, family, dir, fle ? fle->object : NULL, ctx);
>> +	if (fle) {
>> +		fle->genid = atomic_read(&flow_cache_genid);
>> +		if (IS_ERR(flo)) {
>> +			fle->genid--;
>> +			fle->object = NULL;
> 
> Shouldn't we call fle->object->ops->delete here?

The resolver function releases the old object.

It might actually make more sense to pass struct flow_cache_object**
so the resolver can twiddle the flow_cache_entry's object. Then it'd
be more explicit that the resolver is replacing entries.


^ permalink raw reply	[flat|nested] 42+ messages in thread
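
For illustration, the double-pointer variant proposed here might have
looked roughly like the following -- hypothetical only, since none of the
posted iterations adopt this signature:

	/* Hypothetical: the resolver updates the cache slot directly
	 * through objp and returns the caller's reference (or ERR_PTR). */
	typedef struct flow_cache_object *(*flow_resolve_t)(
			struct net *net, struct flowi *key, u16 family,
			u8 dir, struct flow_cache_object **objp, void *ctx);

flow_cache_lookup() would then pass &fle->object, making the replacement of
the cached entry explicit at the call site.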

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  7:00 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
@ 2010-04-05  8:33   ` Herbert Xu
  2010-04-05  8:36     ` Timo Teräs
  0 siblings, 1 reply; 42+ messages in thread
From: Herbert Xu @ 2010-04-05  8:33 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev

On Mon, Apr 05, 2010 at 10:00:21AM +0300, Timo Teras wrote:
>
> @@ -219,33 +222,32 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
>  			fle->object = NULL;
>  			fcp->hash_count++;
>  		}
> +	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
> +		flo = fle->object;
> +		if (!flo)
> +			goto ret_object;
> +		flo = flo->ops->get(flo);
> +		if (flo)
> +			goto ret_object;
>  	}
>  
>  nocache:
> -	{
> -		int err;
> -		void *obj;
> -		atomic_t *obj_ref;
> -
> -		err = resolver(net, key, family, dir, &obj, &obj_ref);
> -
> -		if (fle && !err) {
> -			fle->genid = atomic_read(&flow_cache_genid);
> -
> -			if (fle->object)
> -				atomic_dec(fle->object_ref);
> -
> -			fle->object = obj;
> -			fle->object_ref = obj_ref;
> -			if (obj)
> -				atomic_inc(fle->object_ref);
> +	flo = resolver(net, key, family, dir, fle ? fle->object : NULL, ctx);
> +	if (fle) {
> +		fle->genid = atomic_read(&flow_cache_genid);
> +		if (IS_ERR(flo)) {
> +			fle->genid--;
> +			fle->object = NULL;

Shouldn't we call fle->object->ops->delete here?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH 1/4] flow: virtualize flow cache entry methods
  2010-04-05  7:00 [PATCH 0/4] caching bundles, iteration 4 Timo Teras
@ 2010-04-05  7:00 ` Timo Teras
  2010-04-05  8:33   ` Herbert Xu
  0 siblings, 1 reply; 42+ messages in thread
From: Timo Teras @ 2010-04-05  7:00 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

This allows validating the cached object before returning it.
It also allows destructing the object properly if the last
reference was held in the flow cache. This is also a preparation
for caching bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed, since the
  getter function notices the stale entry
- not having to call flow_cache_flush from the policy gc, since the
  flow cache now properly deletes the object even when it holds the
  last reference

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 include/net/flow.h     |   23 ++++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  117 ++++++++++++++++++++++++------------------------
 net/xfrm/xfrm_policy.c |  112 ++++++++++++++++++++++++++++++---------------
 4 files changed, 154 insertions(+), 100 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..bb08692 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,26 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
+struct flow_cache_ops;
+
+struct flow_cache_object {
+	const struct flow_cache_ops *ops;
+};
+
+struct flow_cache_ops {
+	struct flow_cache_object *(*get)(struct flow_cache_object *);
+	int (*check)(struct flow_cache_object *);
+	void (*delete)(struct flow_cache_object *);
+};
+
+typedef struct flow_cache_object *(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_object *oldobj, void *ctx);
+
+extern struct flow_cache_object *flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..35396e2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@ struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_object flo;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..15151f5 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,17 +26,16 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry		*next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_object	*object;
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry **	hash_table;
+	struct flow_cache_entry		**hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -44,7 +43,7 @@ struct flow_cache_percpu {
 };
 
 struct flow_flush_info {
-	struct flow_cache *		cache;
+	struct flow_cache		*cache;
 	atomic_t			cpuleft;
 	struct completion		completion;
 };
@@ -52,7 +51,7 @@ struct flow_flush_info {
 struct flow_cache {
 	u32				hash_shift;
 	unsigned long			order;
-	struct flow_cache_percpu *	percpu;
+	struct flow_cache_percpu	*percpu;
 	struct notifier_block		hotcpu_notifier;
 	int				low_watermark;
 	int				high_watermark;
@@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->object && !fle->object->ops->check(fle->object))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
 	if (fle->object)
-		atomic_dec(fle->object_ref);
+		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_object *flo;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	flo = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,24 +198,14 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
 	if (!fle) {
@@ -219,33 +222,32 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			fle->object = NULL;
 			fcp->hash_count++;
 		}
+	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
+	flo = resolver(net, key, family, dir, fle ? fle->object : NULL, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (IS_ERR(flo)) {
+			fle->genid--;
+			fle->object = NULL;
+		} else {
+			fle->object = flo;
 		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	} else {
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
 	}
+ret_object:
+	local_bh_enable();
+	return flo;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +263,12 @@ static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
+			if (fle->object)
+				fle->object->ops->delete(fle->object);
 			fle->object = NULL;
-			atomic_dec(fle->object_ref);
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..7722bae 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,35 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	if (unlikely(pol->walk.dead))
+		flo = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return flo;
+}
+
+static int xfrm_policy_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
+{
+	xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
+}
+
+static const struct flow_cache_ops xfrm_policy_fc_ops = {
+	.get = xfrm_policy_flo_get,
+	.check = xfrm_policy_flo_check,
+	.delete = xfrm_policy_flo_delete,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +265,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->flo.ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +299,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +688,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +728,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +845,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +998,35 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_object *
+xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_obj)
+		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->flo;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1116,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1601,24 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_object *flo;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(flo);
+		if (IS_ERR(flo)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (flo)
+			policy = container_of(flo, struct xfrm_policy, flo);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1968,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_object *flo;
+
+		flo = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR_OR_NULL(flo))
+			pol = ERR_CAST(flo);
+		else
+			pol = container_of(flo, struct xfrm_policy, flo);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 42+ messages in thread
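
To summarize the caller-side contract the patch ends up with: NULL means no
object, an ERR_PTR() means failure, and anything else is a reference the
caller must eventually drop. The pattern from the __xfrm_policy_check()
hunk above, shown in isolation (note that ERR_CAST() leaves NULL as NULL,
which is why IS_ERR_OR_NULL() suffices here):

	flo = flow_cache_lookup(net, &fl, family, fl_dir,
				xfrm_policy_lookup, NULL);
	if (IS_ERR_OR_NULL(flo))
		pol = ERR_CAST(flo);	/* NULL stays NULL */
	else
		pol = container_of(flo, struct xfrm_policy, flo);

The reference held in pol is the "caller" reference from the resolver
comment above, dropped later with xfrm_pol_put().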

Thread overview: 42+ messages
2010-04-01 12:52 [PATCH 0/4] caching bundles, iteration 3 Timo Teras
2010-04-01 12:52 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
2010-04-01 13:05   ` Eric Dumazet
2010-04-01 13:07     ` Timo Teräs
2010-04-03  3:38   ` Herbert Xu
2010-04-03  8:36     ` Herbert Xu
2010-04-03 13:50       ` Timo Teräs
2010-04-03 14:17         ` Herbert Xu
2010-04-03 14:26           ` Timo Teräs
2010-04-03 15:53             ` Herbert Xu
2010-04-03 20:19               ` Timo Teräs
2010-04-04  2:06                 ` Herbert Xu
2010-04-04  5:50                   ` Timo Teräs
2010-04-04  5:58                     ` Herbert Xu
2010-04-04  6:07                       ` Timo Teräs
2010-04-04  6:19                         ` Herbert Xu
2010-04-04  6:28                           ` Timo Teräs
2010-04-04  8:35                             ` Herbert Xu
2010-04-04 10:42   ` Herbert Xu
2010-04-04 10:50     ` Timo Teräs
2010-04-04 11:00       ` Herbert Xu
2010-04-04 11:06         ` Timo Teräs
2010-04-04 11:26           ` Herbert Xu
2010-04-04 11:31             ` Herbert Xu
2010-04-04 12:09               ` Timo Teräs
2010-04-01 12:52 ` [PATCH 2/4] xfrm: cache bundles instead of policies for outgoing flows Timo Teras
2010-04-01 12:52 ` [PATCH 3/4] xfrm: remove policy garbage collection Timo Teras
2010-04-01 12:52 ` [PATCH 4/4] flow: delayed deletion of flow cache entries Timo Teras
2010-04-02  3:00 ` [PATCH 0/4] caching bundles, iteration 3 David Miller
2010-04-02 13:12   ` Herbert Xu
2010-04-05  7:00 [PATCH 0/4] caching bundles, iteration 4 Timo Teras
2010-04-05  7:00 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
2010-04-05  8:33   ` Herbert Xu
2010-04-05  8:36     ` Timo Teräs
2010-04-05  8:44       ` Herbert Xu
2010-04-05  8:49         ` Herbert Xu
2010-04-05  8:53           ` Timo Teräs
2010-04-05  9:12             ` Herbert Xu
2010-04-05 17:01               ` Timo Teras
2010-04-06 12:34                 ` Herbert Xu
2010-04-06 13:26                   ` Timo Teräs
2010-04-07  9:15                     ` David Miller
2010-04-07 10:30 [PATCH 0/4] caching bundles, iteration 5 Timo Teras
2010-04-07 10:30 ` [PATCH 1/4] flow: virtualize flow cache entry methods Timo Teras
