* [PATCH] xfrm: cache bundle lookup results in flow cache
@ 2010-03-15 12:20 Timo Teras
  2010-03-17 13:07 ` Herbert Xu
  2010-03-19  7:20 ` Herbert Xu
  0 siblings, 2 replies; 41+ messages in thread
From: Timo Teras @ 2010-03-15 12:20 UTC (permalink / raw)
  To: netdev; +Cc: Timo Teras, Herbert Xu

Instead of doing O(n) xfrm_find_bundle() call per-packet, cache
the previous lookup results in flow cache. The flow cache is
updated to be per-netns and more generic.

The flow cache no longer holds a reference (which was not really
used in the first place, as it depended on garbage collection).
Now this is more explicit. The cache validity is maintained as follows:
- On policy insert, the whole cache is invalidated by incrementing
  the generation id. No synchronization is required, as the genid
  checks make sure no old objects are dereferenced.
- On policy removal from the lists, the object is marked deleted,
  and this invalidates the cached policy pointer.
- Policy object deletion requires explicit synchronization to remove
  stale pointers before the policy objects can actually be freed.
  xfrm_policy_gc_task() synchronizes the cache.
- Bundle creation and expiry are reflected in the xfrm_bundle_ok()
  check performed before any cached bundle is used.
- Bundle deletion is done by incrementing policy->bundles_genid and
  synchronizing with the other cpus so that no stale bundle pointers
  are left. After this the bundle objects can be safely deleted.

Basic testing was done on a 2.6.32 based kernel. This gives a
speedup of several orders of magnitude on the transmit path.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/net/flow.h               |   39 ++++-
 include/net/netns/xfrm.h         |    4 +
 include/net/xfrm.h               |    1 +
 net/core/flow.c                  |  342 ++++++++++++++++++--------------------
 net/ipv6/inet6_connection_sock.c |    6 +-
 net/xfrm/xfrm_policy.c           |  271 +++++++++++++++++++++---------
 6 files changed, 394 insertions(+), 269 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..814a9d2 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -8,6 +8,9 @@
 #define _NET_FLOW_H
 
 #include <linux/in6.h>
+#include <linux/notifier.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 
 struct flowi {
@@ -86,13 +89,37 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
-extern void flow_cache_flush(void);
-extern atomic_t flow_cache_genid;
+struct flow_cache_percpu;
+struct flow_cache_entry;
+
+struct flow_cache {
+	u32				hash_shift;
+	u32				order;
+	struct flow_cache_percpu *	percpu;
+	struct notifier_block		hotcpu_notifier;
+	int				low_watermark;
+	int				high_watermark;
+	struct timer_list		rnd_timer;
+	struct kmem_cache *		flow_cachep;
+};
+
+struct flow_cache_entry {
+	struct flow_cache_entry	*next;
+	struct flowi		key;
+	u16			family;
+	u8			dir;
+};
+
+extern struct flow_cache_entry *flow_cache_lookup(
+	struct flow_cache *cache, struct flowi *key,
+	u16 family, u8 dir);
+extern void flow_cache_entry_put(struct flow_cache_entry *fce);
+
+void flow_cache_flush(struct flow_cache *fc,
+		      void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce));
+extern int flow_cache_init(struct flow_cache *cache, size_t entry_size);
+extern void flow_cache_fini(struct flow_cache *cache);
 
 static inline int flow_cache_uli_match(struct flowi *fl1, struct flowi *fl2)
 {
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 74f119a..1b223c9 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -42,6 +42,10 @@ struct netns_xfrm {
 	struct xfrm_policy_hash	policy_bydst[XFRM_POLICY_MAX * 2];
 	unsigned int		policy_count[XFRM_POLICY_MAX * 2];
 	struct work_struct	policy_hash_work;
+	atomic_t		policy_genid;
+	struct hlist_head	policy_gc_list;
+	struct work_struct	policy_gc_work;
+	struct flow_cache	flow_cache;
 
 	struct dst_ops		xfrm4_dst_ops;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..f469b9b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -488,6 +488,7 @@ struct xfrm_policy {
 	struct xfrm_lifetime_cfg lft;
 	struct xfrm_lifetime_cur curlft;
 	struct dst_entry       *bundles;
+	atomic_t		bundles_genid;
 	struct xfrm_policy_walk_entry walk;
 	u8			type;
 	u8			action;
diff --git a/net/core/flow.c b/net/core/flow.c
index 9601587..e3782c2 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -25,114 +25,85 @@
 #include <asm/atomic.h>
 #include <linux/security.h>
 
-struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-
-static u32 flow_hash_shift;
-#define flow_hash_size	(1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-
-static struct kmem_cache *flow_cachep __read_mostly;
 
-static int flow_lwm, flow_hwm;
-
-struct flow_percpu_info {
-	int hash_rnd_recalc;
-	u32 hash_rnd;
-	int count;
+struct flow_cache_percpu {
+	struct flow_cache_entry **	hash_table;
+	int				hash_count;
+	u32				hash_rnd;
+	int				hash_rnd_recalc;
+	struct tasklet_struct		flush_tasklet;
 };
-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
-
-#define flow_hash_rnd_recalc(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
-#define flow_hash_rnd(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd)
-#define flow_count(cpu) \
-	(per_cpu(flow_hash_info, cpu).count)
-
-static struct timer_list flow_hash_rnd_timer;
-
-#define FLOW_HASH_RND_PERIOD	(10 * 60 * HZ)
 
 struct flow_flush_info {
-	atomic_t cpuleft;
-	struct completion completion;
+	void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce);
+	struct flow_cache *		cache;
+	atomic_t			cpuleft;
+	struct completion		completion;
 };
-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
 
-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
+#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
 static void flow_cache_new_hashrnd(unsigned long arg)
 {
+	struct flow_cache *fc = (struct flow_cache *) arg;
 	int i;
 
 	for_each_possible_cpu(i)
-		flow_hash_rnd_recalc(i) = 1;
+		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
 
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 }
 
-static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
-{
-	if (fle->object)
-		atomic_dec(fle->object_ref);
-	kmem_cache_free(flow_cachep, fle);
-	flow_count(cpu)--;
-}
-
-static void __flow_cache_shrink(int cpu, int shrink_to)
+static void __flow_cache_shrink(struct flow_cache *fc,
+				struct flow_cache_percpu *fcp,
+				int shrink_to)
 {
 	struct flow_cache_entry *fle, **flp;
 	int i;
 
-	for (i = 0; i < flow_hash_size; i++) {
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int k = 0;
 
-		flp = &flow_table(cpu)[i];
+		flp = &fcp->hash_table[i];
 		while ((fle = *flp) != NULL && k < shrink_to) {
 			k++;
 			flp = &fle->next;
 		}
 		while ((fle = *flp) != NULL) {
 			*flp = fle->next;
-			flow_entry_kill(cpu, fle);
+
+			kmem_cache_free(fc->flow_cachep, fle);
+			fcp->hash_count--;
 		}
 	}
 }
 
-static void flow_cache_shrink(int cpu)
+static void flow_cache_shrink(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	int shrink_to = flow_lwm / flow_hash_size;
+	int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
 
-	__flow_cache_shrink(cpu, shrink_to);
+	__flow_cache_shrink(fc, fcp, shrink_to);
 }
 
-static void flow_new_hash_rnd(int cpu)
+static void flow_new_hash_rnd(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
-	flow_hash_rnd_recalc(cpu) = 0;
-
-	__flow_cache_shrink(cpu, 0);
+	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
+	fcp->hash_rnd_recalc = 0;
+	__flow_cache_shrink(fc, fcp, 0);
 }
 
-static u32 flow_hash_code(struct flowi *key, int cpu)
+static u32 flow_hash_code(struct flow_cache *fc,
+			  struct flow_cache_percpu *fcp,
+			  struct flowi *key)
 {
 	u32 *k = (u32 *) key;
 
-	return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
-		(flow_hash_size - 1));
+	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+		& (flow_cache_hash_size(fc) - 1));
 }
 
 #if (BITS_PER_LONG == 64)
@@ -165,128 +136,100 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_entry *flow_cache_lookup(struct flow_cache *fc,
+					   struct flowi *key,
+					   u16 family, u8 dir)
 {
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_percpu *fcp;
 	unsigned int hash;
-	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
-	if (!flow_table(cpu))
+	if (!fcp->hash_table)
 		goto nocache;
 
-	if (flow_hash_rnd_recalc(cpu))
-		flow_new_hash_rnd(cpu);
-	hash = flow_hash_code(key, cpu);
+	if (fcp->hash_rnd_recalc)
+		flow_new_hash_rnd(fc, fcp);
+
+	hash = flow_hash_code(fc, fcp, key);
 
-	head = &flow_table(cpu)[hash];
+	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
 		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
-			break;
-		}
-	}
-
-	if (!fle) {
-		if (flow_count(cpu) > flow_hwm)
-			flow_cache_shrink(cpu);
-
-		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
-		if (fle) {
-			fle->next = *head;
-			*head = fle;
-			fle->family = family;
-			fle->dir = dir;
-			memcpy(&fle->key, key, sizeof(*key));
-			fle->object = NULL;
-			flow_count(cpu)++;
+			return fle;
 		}
 	}
 
-nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
+	if (fcp->hash_count > fc->high_watermark)
+		flow_cache_shrink(fc, fcp);
 
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
+	fle = kmem_cache_zalloc(fc->flow_cachep, GFP_ATOMIC);
+	if (!fle)
+		goto nocache;
 
-			if (fle->object)
-				atomic_dec(fle->object_ref);
+	fle->next = *head;
+	*head = fle;
+	fle->family = family;
+	fle->dir = dir;
+	memcpy(&fle->key, key, sizeof(*key));
+	fcp->hash_count++;
+	return fle;
 
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
+nocache:
+	local_bh_enable();
+	return NULL;
+}
 
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
-	}
+void flow_cache_entry_put(struct flow_cache_entry *fce)
+{
+	local_bh_enable();
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
 {
-	struct flow_flush_info *info = (void *)data;
+	struct flow_flush_info *info = (void *) data;
+	struct flow_cache *fc = (void *) info->cache;
+	struct flow_cache_percpu *fcp;
 	int i;
-	int cpu;
 
-	cpu = smp_processor_id();
-	for (i = 0; i < flow_hash_size; i++) {
-		struct flow_cache_entry *fle;
+	if (info->flush == NULL)
+		goto done;
 
-		fle = flow_table(cpu)[i];
-		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
-				continue;
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
+		struct flow_cache_entry *fle;
 
-			fle->object = NULL;
-			atomic_dec(fle->object_ref);
-		}
+		fle = fcp->hash_table[i];
+		for (; fle; fle = fle->next)
+			info->flush(fc, fle);
 	}
 
+done:
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
 }
 
-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
 static void flow_cache_flush_per_cpu(void *data)
 {
 	struct flow_flush_info *info = data;
-	int cpu;
 	struct tasklet_struct *tasklet;
+	int cpu;
 
 	cpu = smp_processor_id();
-
-	tasklet = flow_flush_tasklet(cpu);
-	tasklet->data = (unsigned long)info;
+	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
+	tasklet->data = (unsigned long) data;
 	tasklet_schedule(tasklet);
 }
 
-void flow_cache_flush(void)
+void flow_cache_flush(struct flow_cache *fc,
+		      void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce))
 {
 	struct flow_flush_info info;
 	static DEFINE_MUTEX(flow_flush_sem);
@@ -294,6 +237,8 @@ void flow_cache_flush(void)
 	/* Don't want cpus going down or up during this. */
 	get_online_cpus();
 	mutex_lock(&flow_flush_sem);
+	info.cache = fc;
+	info.flush = flush;
 	atomic_set(&info.cpuleft, num_online_cpus());
 	init_completion(&info.completion);
 
@@ -307,62 +252,99 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
-static void __init flow_cache_cpu_prepare(int cpu)
+static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
+					  struct flow_cache_percpu *fcp)
+{
+	fcp->hash_table = (struct flow_cache_entry **)
+		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
+	fcp->hash_rnd_recalc = 1;
+	fcp->hash_count = 0;
+
+	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+}
+
+static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
+				    unsigned long action,
+				    void *hcpu)
+{
+	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+	int cpu = (unsigned long) hcpu;
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		flow_cache_cpu_prepare(fc, fcp);
+		if (!fcp->hash_table)
+			return NOTIFY_BAD;
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		if (fcp->hash_table) {
+			__flow_cache_shrink(fc, fcp, 0);
+			free_pages((unsigned long) fcp->hash_table, fc->order);
+			fcp->hash_table = NULL;
+		}
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+int flow_cache_init(struct flow_cache *fc, size_t entry_size)
 {
-	struct tasklet_struct *tasklet;
 	unsigned long order;
+	int i, r;
+
+	BUG_ON(entry_size < sizeof(struct flow_cache_entry));
+	fc->flow_cachep = kmem_cache_create("flow_cache",
+					entry_size,
+					0, SLAB_PANIC,
+					NULL);
+	fc->hash_shift = 10;
+	fc->low_watermark = 2 * flow_cache_hash_size(fc);
+	fc->high_watermark = 4 * flow_cache_hash_size(fc);
+	fc->percpu = alloc_percpu(struct flow_cache_percpu);
 
 	for (order = 0;
 	     (PAGE_SIZE << order) <
-		     (sizeof(struct flow_cache_entry *)*flow_hash_size);
+		(sizeof(struct flow_cache_entry *) * flow_cache_hash_size(fc));
 	     order++)
 		/* NOTHING */;
+	fc->order = order;
 
-	flow_table(cpu) = (struct flow_cache_entry **)
-		__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
-	if (!flow_table(cpu))
-		panic("NET: failed to allocate flow cache order %lu\n", order);
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 
-	flow_hash_rnd_recalc(cpu) = 1;
-	flow_count(cpu) = 0;
+	for_each_online_cpu(i) {
+		r = flow_cache_cpu(&fc->hotcpu_notifier,
+				   CPU_UP_PREPARE, (void*) i);
+		if (r != NOTIFY_OK)
+			panic("NET: failed to allocate flow cache order %lu\n", order);
+	}
 
-	tasklet = flow_flush_tasklet(cpu);
-	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
-}
+	fc->hotcpu_notifier = (struct notifier_block){
+		.notifier_call = flow_cache_cpu,
+	};
+	register_hotcpu_notifier(&fc->hotcpu_notifier);
 
-static int flow_cache_cpu(struct notifier_block *nfb,
-			  unsigned long action,
-			  void *hcpu)
-{
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
-		__flow_cache_shrink((unsigned long)hcpu, 0);
-	return NOTIFY_OK;
+	return 0;
 }
 
-static int __init flow_cache_init(void)
+void flow_cache_fini(struct flow_cache *fc)
 {
 	int i;
 
-	flow_cachep = kmem_cache_create("flow_cache",
-					sizeof(struct flow_cache_entry),
-					0, SLAB_PANIC,
-					NULL);
-	flow_hash_shift = 10;
-	flow_lwm = 2 * flow_hash_size;
-	flow_hwm = 4 * flow_hash_size;
-
-	setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0);
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	del_timer(&fc->rnd_timer);
+	unregister_hotcpu_notifier(&fc->hotcpu_notifier);
 
 	for_each_possible_cpu(i)
-		flow_cache_cpu_prepare(i);
+		flow_cache_cpu(&fc->hotcpu_notifier, CPU_DEAD, (void*) i);
 
-	hotcpu_notifier(flow_cache_cpu, 0);
-	return 0;
+	free_percpu(fc->percpu);
+	kmem_cache_destroy(fc->flow_cachep);
 }
 
-module_init(flow_cache_init);
-
-EXPORT_SYMBOL(flow_cache_genid);
 EXPORT_SYMBOL(flow_cache_lookup);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 3516e6f..588ba76 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -151,8 +151,9 @@ void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
 
 #ifdef CONFIG_XFRM
 	{
+		struct net *net = sock_net(sk);
 		struct rt6_info *rt = (struct rt6_info  *)dst;
-		rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid);
+		rt->rt6i_flow_cache_genid = atomic_read(&net->xfrm.policy_genid);
 	}
 #endif
 }
@@ -166,8 +167,9 @@ struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
 
 #ifdef CONFIG_XFRM
 	if (dst) {
+		struct net *net = sock_net(sk);
 		struct rt6_info *rt = (struct rt6_info *)dst;
-		if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) {
+		if (rt->rt6i_flow_cache_genid != atomic_read(&net->xfrm.policy_genid)) {
 			__sk_dst_reset(sk);
 			dst = NULL;
 		}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 843e066..228b813 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -44,7 +44,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 
-static HLIST_HEAD(xfrm_policy_gc_list);
 static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
@@ -53,6 +52,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst);
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
+static int stale_bundle(struct dst_entry *dst);
 
 static inline int
 __xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl)
@@ -216,6 +216,35 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+struct xfrm_flow_cache_entry {
+	struct flow_cache_entry fce;
+	struct xfrm_policy *policy;
+	struct xfrm_dst *dst;
+	u32 policy_genid, bundles_genid;
+};
+#define XFRM_CACHE_NO_POLICY ((struct xfrm_policy *) -1)
+
+void xfrm_flow_cache_entry_validate(struct flow_cache *fc,
+				    struct flow_cache_entry *fce)
+{
+	struct net *net = container_of(fc, struct net, xfrm.flow_cache);
+	struct xfrm_flow_cache_entry *xfc =
+		container_of(fce, struct xfrm_flow_cache_entry, fce);
+
+	if (xfc->policy_genid != atomic_read(&net->xfrm.policy_genid))
+		goto invalid;
+	if (xfc->policy == NULL || xfc->policy == XFRM_CACHE_NO_POLICY)
+		return;
+	if (xfc->policy->walk.dead)
+		goto invalid;
+	if (xfc->bundles_genid != atomic_read(&xfc->policy->bundles_genid))
+		goto invalid_dst;
+	return;
+invalid:
+	xfc->policy = NULL;
+invalid_dst:
+	xfc->dst = NULL;
+}
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -269,27 +298,26 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
 static void xfrm_policy_gc_task(struct work_struct *work)
 {
+	struct net *net = container_of(work, struct net, xfrm.policy_gc_work);
 	struct xfrm_policy *policy;
 	struct hlist_node *entry, *tmp;
 	struct hlist_head gc_list;
 
 	spin_lock_bh(&xfrm_policy_gc_lock);
-	gc_list.first = xfrm_policy_gc_list.first;
-	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
+	gc_list.first = net->xfrm.policy_gc_list.first;
+	INIT_HLIST_HEAD(&net->xfrm.policy_gc_list);
 	spin_unlock_bh(&xfrm_policy_gc_lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, xfrm_flow_cache_entry_validate);
+
 	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
 		xfrm_policy_gc_kill(policy);
 }
-static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
 
 /* Rule must be locked. Release descentant resources, announce
  * entry dead. The rule must be unlinked from lists to the moment.
@@ -297,6 +325,7 @@ static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
 
 static void xfrm_policy_kill(struct xfrm_policy *policy)
 {
+	struct net *net = xp_net(policy);
 	int dead;
 
 	write_lock_bh(&policy->lock);
@@ -310,10 +339,10 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 	}
 
 	spin_lock_bh(&xfrm_policy_gc_lock);
-	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
+	hlist_add_head(&policy->bydst, &net->xfrm.policy_gc_list);
 	spin_unlock_bh(&xfrm_policy_gc_lock);
 
-	schedule_work(&xfrm_policy_gc_work);
+	schedule_work(&net->xfrm.policy_gc_work);
 }
 
 static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
@@ -588,7 +617,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		hlist_add_head(&policy->bydst, chain);
 	xfrm_pol_hold(policy);
 	net->xfrm.policy_count[dir]++;
-	atomic_inc(&flow_cache_genid);
+	atomic_inc(&net->xfrm.policy_genid);
 	if (delpol)
 		__xfrm_policy_unlink(delpol, dir);
 	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir);
@@ -621,11 +650,13 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 			gc_list = dst;
 
 			policy->bundles = NULL;
+			atomic_inc(&policy->bundles_genid);
 		}
 		write_unlock(&policy->lock);
 	}
 	read_unlock_bh(&xfrm_policy_lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, NULL);
 	while (gc_list) {
 		struct dst_entry *dst = gc_list;
 
@@ -672,7 +703,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	write_unlock_bh(&xfrm_policy_lock);
 
 	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+		atomic_inc(&net->xfrm.policy_genid);
 		xfrm_policy_kill(ret);
 	}
 	return ret;
@@ -714,7 +745,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	write_unlock_bh(&xfrm_policy_lock);
 
 	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+		atomic_inc(&net->xfrm.policy_genid);
 		xfrm_policy_kill(ret);
 	}
 	return ret;
@@ -835,7 +866,7 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
+	atomic_inc(&net->xfrm.policy_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -989,32 +1020,18 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct xfrm_policy *xfrm_policy_lookup(
+		struct net *net, struct flowi *fl,
+		u16 family, u8 dir)
 {
+#ifdef CONFIG_XFRM_SUB_POLICY
 	struct xfrm_policy *pol;
-	int err = 0;
 
-#ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (pol != NULL)
+		return pol;
 #endif
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1100,12 +1117,14 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 
 int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 {
+	struct net *net = xp_net(pol);
+
 	write_lock_bh(&xfrm_policy_lock);
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
 		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
+			atomic_inc(&net->xfrm.policy_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1545,13 +1564,34 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
 #endif
 }
 
-static int stale_bundle(struct dst_entry *dst);
-
 /* Main function: finds/creates a bundle for given flow.
  *
  * At the moment we eat a raw IP route. Mostly to speed up lookups
  * on interfaces with disabled IPsec.
  */
+
+static void xfrm_flow_cache_update(struct net *net, struct flowi *key,
+				   u16 family, u8 dir,
+				   struct xfrm_policy *pol,
+				   struct xfrm_dst *dst)
+{
+	struct flow_cache_entry *fce;
+	struct xfrm_flow_cache_entry *xf;
+
+	fce = flow_cache_lookup(&net->xfrm.flow_cache,
+				key, family, dir);
+	if (fce == NULL)
+		return;
+
+	xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
+	xf->policy_genid = atomic_read(&net->xfrm.policy_genid);
+	xf->policy = pol;
+	if (dst != NULL)
+		xf->bundles_genid = atomic_read(&pol->bundles_genid);
+	xf->dst = dst;
+	flow_cache_entry_put(fce);
+}
+
 int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 		  struct sock *sk, int flags)
 {
@@ -1570,8 +1610,10 @@ int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
 
 restart:
-	genid = atomic_read(&flow_cache_genid);
+	family = dst_orig->ops->family;
+	genid = atomic_read(&net->xfrm.policy_genid);
 	policy = NULL;
+	dst = NULL;
 	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
 		pols[pi] = NULL;
 	npols = 0;
@@ -1588,24 +1630,51 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_entry *fce;
+		struct xfrm_flow_cache_entry *xf;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
-			goto dropdst;
+		fce = flow_cache_lookup(&net->xfrm.flow_cache,
+					fl, family, dir);
+		if (fce == NULL)
+			goto no_cache;
+
+		xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
+		xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);
+		if (xf->policy != NULL) {
+			policy = xf->policy;
+			if (policy != XFRM_CACHE_NO_POLICY)
+				xfrm_pol_hold(policy);
+			if (xf->dst != NULL)
+				dst = dst_clone((struct dst_entry *) xf->dst);
+		}
+		flow_cache_entry_put(fce);
+		if (policy == XFRM_CACHE_NO_POLICY)
+			goto nopol;
+		if (dst && !xfrm_bundle_ok(policy, (struct xfrm_dst *) dst, fl, family, 0)) {
+			dst_release(dst);
+			dst = NULL;
 		}
 	}
+no_cache:
+	if (!policy) {
+		policy = xfrm_policy_lookup(net, fl, family, dir);
+		if (!policy) {
+			xfrm_flow_cache_update(
+				net, fl, family, dir,
+				XFRM_CACHE_NO_POLICY, NULL);
+			goto nopol;
+		}
+	}
+	if (IS_ERR(policy)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		goto dropdst;
+	}
 
-	if (!policy)
-		goto nopol;
-
-	family = dst_orig->ops->family;
 	pols[0] = policy;
 	npols ++;
 	xfrm_nr += pols[0]->xfrm_nr;
@@ -1616,6 +1685,9 @@ restart:
 
 	policy->curlft.use_time = get_seconds();
 
+	if (dst)
+		goto dst_found;
+
 	switch (policy->action) {
 	default:
 	case XFRM_POLICY_BLOCK:
@@ -1626,18 +1698,11 @@ restart:
 
 	case XFRM_POLICY_ALLOW:
 #ifndef CONFIG_XFRM_SUB_POLICY
-		if (policy->xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pol_put(policy);
-			return 0;
-		}
+		if (policy->xfrm_nr == 0)
+			goto no_transform;
 #endif
 
-		/* Try to find matching bundle.
-		 *
-		 * LATER: help from flow cache. It is optional, this
-		 * is required only for output policy.
-		 */
+		/* Try to find matching bundle the hard way. */
 		dst = xfrm_find_bundle(fl, policy, family);
 		if (IS_ERR(dst)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
@@ -1677,12 +1742,8 @@ restart:
 		 * they are searched. See above not-transformed bypass
 		 * is surrounded by non-sub policy configuration, too.
 		 */
-		if (xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
+		if (xfrm_nr == 0)
+			goto no_transform;
 #endif
 		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
 
@@ -1713,7 +1774,7 @@ restart:
 					goto error;
 				}
 				if (nx == -EAGAIN ||
-				    genid != atomic_read(&flow_cache_genid)) {
+				    genid != atomic_read(&net->xfrm.policy_genid)) {
 					xfrm_pols_put(pols, npols);
 					goto restart;
 				}
@@ -1724,11 +1785,8 @@ restart:
 				goto error;
 			}
 		}
-		if (nx == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
+		if (nx == 0)
+			goto no_transform;
 
 		dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig);
 		err = PTR_ERR(dst);
@@ -1777,6 +1835,9 @@ restart:
 		dst_hold(dst);
 		write_unlock_bh(&policy->lock);
 	}
+	xfrm_flow_cache_update(net, fl, family, dir,
+			       policy, (struct xfrm_dst *) dst);
+dst_found:
 	*dst_p = dst;
 	dst_release(dst_orig);
 	xfrm_pols_put(pols, npols);
@@ -1794,7 +1855,12 @@ nopol:
 	if (flags & XFRM_LOOKUP_ICMP)
 		goto dropdst;
 	return 0;
+no_transform:
+	/* Flow passes not transformed. */
+	xfrm_pols_put(pols, npols);
+	return 0;
 }
+
 EXPORT_SYMBOL(__xfrm_lookup);
 
 int xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
@@ -1952,10 +2018,35 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
-
+	if (!pol) {
+		struct flow_cache_entry *fce;
+		struct xfrm_flow_cache_entry *xf;
+
+		fce = flow_cache_lookup(&net->xfrm.flow_cache,
+					&fl, family, dir);
+		if (fce != NULL) {
+			xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
+			xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);
+			if (xf->policy != NULL) {
+				pol = xf->policy;
+				if (pol != XFRM_CACHE_NO_POLICY)
+					xfrm_pol_hold(pol);
+				else
+					pol = NULL;
+			} else {
+				pol = xfrm_policy_lookup(net, &fl, family, dir);
+				if (!IS_ERR(pol)) {
+					if (pol)
+						xf->policy = pol;
+					else
+						xf->policy = XFRM_CACHE_NO_POLICY;
+				}
+				xf->dst = NULL;
+				xf->policy_genid = atomic_read(&net->xfrm.policy_genid);
+			}
+			flow_cache_entry_put(fce);
+		}
+	}
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
 		return 0;
@@ -2153,6 +2244,7 @@ static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_ent
 			dstp = &dst->next;
 		}
 	}
+	atomic_inc(&pol->bundles_genid);
 	write_unlock(&pol->lock);
 }
 
@@ -2180,6 +2272,7 @@ static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *))
 	}
 	read_unlock_bh(&xfrm_policy_lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, NULL);
 	while (gc_list) {
 		struct dst_entry *dst = gc_list;
 		gc_list = dst->next;
@@ -2498,6 +2591,9 @@ static int __net_init xfrm_policy_init(struct net *net)
 
 	INIT_LIST_HEAD(&net->xfrm.policy_all);
 	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
+	INIT_HLIST_HEAD(&net->xfrm.policy_gc_list);
+	INIT_WORK(&net->xfrm.policy_gc_work, xfrm_policy_gc_task);
+	flow_cache_init(&net->xfrm.flow_cache, sizeof(struct xfrm_flow_cache_entry));
 	if (net_eq(net, &init_net))
 		register_netdevice_notifier(&xfrm_dev_notifier);
 	return 0;
@@ -2531,7 +2627,7 @@ static void xfrm_policy_fini(struct net *net)
 	audit_info.sessionid = -1;
 	audit_info.secid = 0;
 	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
-	flush_work(&xfrm_policy_gc_work);
+	flush_work(&net->xfrm.policy_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.policy_all));
 
@@ -2549,6 +2645,8 @@ static void xfrm_policy_fini(struct net *net)
 	sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
 	WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
 	xfrm_hash_free(net->xfrm.policy_byidx, sz);
+
+	flow_cache_fini(&net->xfrm.flow_cache);
 }
 
 static int __net_init xfrm_net_init(struct net *net)
@@ -2756,8 +2854,9 @@ static int migrate_tmpl_match(struct xfrm_migrate *m, struct xfrm_tmpl *t)
 static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       struct xfrm_migrate *m, int num_migrate)
 {
+	struct net *net = xp_net(pol);
 	struct xfrm_migrate *mp;
-	struct dst_entry *dst;
+	struct dst_entry *gc_list = NULL, *tail;
 	int i, j, n = 0;
 
 	write_lock_bh(&pol->lock);
@@ -2782,15 +2881,25 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       sizeof(pol->xfrm_vec[i].saddr));
 			pol->xfrm_vec[i].encap_family = mp->new_family;
 			/* flush bundles */
-			while ((dst = pol->bundles) != NULL) {
-				pol->bundles = dst->next;
-				dst_free(dst);
-			}
+			tail = pol->bundles;
+			while (tail->next)
+				tail = tail->next;
+			tail->next = gc_list;
+			gc_list = pol->bundles;
+			pol->bundles = NULL;
+			atomic_inc(&pol->bundles_genid);
 		}
 	}
-
 	write_unlock_bh(&pol->lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, NULL);
+	while (gc_list) {
+		struct dst_entry *dst = gc_list;
+
+		gc_list = dst->next;
+		dst_free(dst);
+	}
+
 	if (!n)
 		return -ENODATA;
 
-- 
1.6.3.3


* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-15 12:20 [PATCH] xfrm: cache bundle lookup results in flow cache Timo Teras
@ 2010-03-17 13:07 ` Herbert Xu
  2010-03-17 14:16   ` Timo Teräs
  2010-03-19  7:20 ` Herbert Xu
  1 sibling, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-17 13:07 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev

On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
> Instead of doing O(n) xfrm_find_bundle() call per-packet, cache
> the previous lookup results in flow cache. The flow cache is
> updated to be per-netns and more generic.

This only works well if the traffic doesn't switch bundles much.
But if that were the case then the number of bundles is likely
to be small anyway.

IOW I think if we're doing this then we should go the whole
distance and directly cache bundles instead of policies in the
flow cache.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-17 13:07 ` Herbert Xu
@ 2010-03-17 14:16   ` Timo Teräs
  2010-03-17 14:58     ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-17 14:16 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
>> Instead of doing O(n) xfrm_find_bundle() call per-packet, cache
>> the previous lookup results in flow cache. The flow cache is
>> updated to be per-netns and more generic.
> 
> This only works well if the traffic doesn't switch bundles much.
> But if that were the case then the number of bundles is likely
> to be small anyway.

The problem is if I have a multipoint gre1 and a policy that says
"encrypt all gre in transport mode".

Thus for each public address, I get one bundle. But
xfrm_lookup() is called for each packet because ipgre_tunnel_xmit()
calls ip_route_output_key() on a per-packet basis.

For my use-case it makes a huge difference.

> IOW I think if we're doing this then we should go the whole
> distance and directly cache bundles instead of policies in the
> flow cache.

Then we cannot maintain policy use time. But if it's not a
requirement, we could drop the policy from cache.

Also, with this and your recent flowi patch, I'm seeing pmtu
issues. It seems xfrm_bundle_ok() uses the original dst which
resulted in the creation of the bundle. Somehow that dst
does not get updated with the pmtu... but the new dst used in
the next xfrm_lookup() for the same target does have the proper
mtu. I'm debugging right now why this is happening. Any ideas?

- Timo

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-17 14:16   ` Timo Teräs
@ 2010-03-17 14:58     ` Herbert Xu
  2010-03-17 15:56       ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-17 14:58 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Wed, Mar 17, 2010 at 04:16:21PM +0200, Timo Teräs wrote:
>
> The problem is if I have multipoint gre1 and policy that says
> "encrypt all gre in transport mode".
>
> Thus for each public address, I get one bundle. But the
> xfrm_lookup() is called for each packet because ipgre_tunnel_xmit()
> calls ip_route_output_key() on per-packet basis.
>
> For my use-case it makes a huge difference.

But if your traffic switches between those tunnels on each packet,
we're back to square one, right?

> Then we cannot maintain policy use time. But if it's not a
> requirement, we could drop the policy from cache.

I don't see why we can't maintain the policy use time if we did
this, all you need is a back-pointer from the top xfrm_dst.

> Also. With this and your recent flowi patch, I'm seeing pmtu
> issues. Seems like xfrm_bundle_ok uses the original dst which
> resulted in the creation of the bundle. Somehow that dst
> does not get updated with pmtu... but the new dst used in
> next xfrm_lookup for same target does have proper mtu.
> I'm debugging right now why this is happening. Any ideas?

The dynamic MTU is always maintained in a normal dst object in
the IPv4 routing cache.  Each xfrm_dst points to such a dst
through xdst->route.

If you were looking at the xfrm_dst's own MTU then that may well
cause problems.
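
In other words, something like the following (a hedged sketch;
bundle_route_mtu() is a made-up name, xdst->route is the real field):

	/* the live, ICMP-updated PMTU is on the cached IPv4 route the
	 * bundle was built on, not on the xfrm_dst itself */
	static inline u32 bundle_route_mtu(struct xfrm_dst *xdst)
	{
		return dst_mtu(xdst->route);
	}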

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-17 14:58     ` Herbert Xu
@ 2010-03-17 15:56       ` Timo Teräs
  2010-03-17 16:32         ` Timo Teräs
  2010-03-18 19:30         ` Timo Teräs
  0 siblings, 2 replies; 41+ messages in thread
From: Timo Teräs @ 2010-03-17 15:56 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Wed, Mar 17, 2010 at 04:16:21PM +0200, Timo Teräs wrote:
>> The problem is if I have multipoint gre1 and policy that says
>> "encrypt all gre in transport mode".
>>
>> Thus for each public address, I get one bundle. But the
>> xfrm_lookup() is called for each packet because ipgre_tunnel_xmit()
>> calls ip_route_output_key() on per-packet basis.
>>
>> For my use-case it makes a huge difference.
> 
> But if your traffic switches between those tunnels on each packet,
> we're back to square one, right?

Not to my understanding. Why would it change?

>> Then we cannot maintain policy use time. But if it's not a
>> requirement, we could drop the policy from cache.
> 
> I don't see why we can't maintain the policy use time if we did
> this, all you need is a back-pointer from the top xfrm_dst.

Sure.

>> Also. With this and your recent flowi patch, I'm seeing pmtu
>> issues. Seems like xfrm_bundle_ok uses the original dst which
>> resulted in the creation of the bundle. Somehow that dst
>> does not get updated with pmtu... but the new dst used in
>> next xfrm_lookup for same target does have proper mtu.
>> I'm debugging right now why this is happening. Any ideas?
> 
> The dynamic MTU is always maintained in a normal dst object in
> the IPv4 routing cache.  Each xfrm_dst points to such a dst
> through xdst->route.
> 
> If you were looking at the xfrm_dst's own MTU then that may well
> cause problems.

I figured out the root cause. The original dst gets expired when
rt_genid goes old, but the xfrm_dst does not notice that, so it
won't create a new bundle. xfrm_bundle_ok() calls dst_check(),
but dst->obsolete is 0, and ipv4_dst_check() is a no-op anyway.

Somehow the rtable object should be able to tell xfrm that the
dst is not good anymore. Any ideas?

- Timo



* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-17 15:56       ` Timo Teräs
@ 2010-03-17 16:32         ` Timo Teräs
  2010-03-18 19:30         ` Timo Teräs
  1 sibling, 0 replies; 41+ messages in thread
From: Timo Teräs @ 2010-03-17 16:32 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Timo Teräs wrote:
>>> Also. With this and your recent flowi patch, I'm seeing pmtu
>>> issues. Seems like xfrm_bundle_ok uses the original dst which
>>> resulted in the creation of the bundle. Somehow that dst
>>> does not get updated with pmtu... but the new dst used in
>>> next xfrm_lookup for same target does have proper mtu.
>>> I'm debugging right now why this is happening. Any ideas?
>>
>> The dynamic MTU is always maintained in a normal dst object in
>> the IPv4 routing cache.  Each xfrm_dst points to such a dst
>> through xdst->route.
>>
>> If you were looking at the xfrm_dst's own MTU then that may well
>> cause problems.
> 
> I figured the root cause. The original dst gets expired
> rt_genid goes old. But xfrm_dst does not notice that so it
> won't create a new bundle. xfrm_bundle_ok calls dst_check,
> but dst->obsolete = 0, and ipv4_dst_check is a no-op anyway.
> 
> Somehow the rtable object should be able to tell back to
> xfrm that the dst is not good anymore. Any ideas?

Checked ipv6; it does it like xfrm: it sets obsolete to -1 and
in dst_check checks the genid. We need to do the same for ipv4.
I just wrote a hack and tested it. It solves the pmtu issues.
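
A sketch of the fix described above, assuming it mirrors the ipv6
behaviour (the proper patch may differ; rt_is_expired() is the genid
comparison helper in net/ipv4/route.c):

	/* set rt->u.dst.obsolete = -1 when inserting routes into the
	 * cache, so dst_check() actually calls into ipv4_dst_check() */
	static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
	{
		struct rtable *rt = (struct rtable *) dst;

		/* declare the entry dead once the routing cache
		 * generation (rt_genid) has moved on */
		return rt_is_expired(rt) ? NULL : dst;
	}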

I will post a proper patch soon.

- Timo

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-17 15:56       ` Timo Teräs
  2010-03-17 16:32         ` Timo Teräs
@ 2010-03-18 19:30         ` Timo Teräs
  2010-03-19  0:31           ` Herbert Xu
  2010-03-19  0:32           ` Herbert Xu
  1 sibling, 2 replies; 41+ messages in thread
From: Timo Teräs @ 2010-03-18 19:30 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Timo Teräs wrote:
> Herbert Xu wrote:
>> On Wed, Mar 17, 2010 at 04:16:21PM +0200, Timo Teräs wrote:
>>> The problem is if I have multipoint gre1 and policy that says
>>> "encrypt all gre in transport mode".
>>>
>>> Thus for each public address, I get one bundle. But the
>>> xfrm_lookup() is called for each packet because ipgre_tunnel_xmit()
>>> calls ip_route_output_key() on per-packet basis.
>>>
>>> For my use-case it makes a huge difference.
>>
>> But if your traffic switches between those tunnels on each packet,
>> we're back to square one, right?
> 
> Not to my understanding. Why would it change?

Here's how things go to my understanding.

When we are forwarding packets, each packet goes through
__xfrm_route_forward(). Or if we are sending from ip_gre (like in
my case), the packets go through ip_route_output_key().

Both of these call xfrm_lookup() to get the xfrm_dst instead
of the real rtable dst. This is done on a per-packet basis.
Basically, the xfrm_dst is never kept referenced directly
on either code path; instead each packet needs to be xfrm'ed
individually.

Now, these created xfrm_dst's get cached in policy->bundles
with a ref count of zero and are not deleted until garbage
collection is required. That's how xfrm_find_bundle() reuses
them (but it does an O(n) lookup).

In the gre+esp case (this should apply to any forward path too)
the caching of bundles speeds up the output path considerably,
as there can be a lot of xfrm_dst's, especially with a wildcard
transport mode policy, which basically gets a bundle for each
destination. So there can be a lot of xfrm_dst's, all valid,
since they refer to a unique xfrm_state on a per-destination
basis.

Now, even more: since an xfrm_dst needs to be regenerated
whenever the underlying ipv4 rtable entry has expired, there
can be a lot of bundles. So the linear search is a major
performance killer.
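
For reference, this is roughly what the per-packet linear walk looks
like (abridged from 2.6.32's __xfrm4_find_bundle()):

	static struct dst_entry *
	__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
	{
		struct dst_entry *dst;

		read_lock_bh(&policy->lock);
		/* O(n): every cached bundle is compared against the flow */
		for (dst = policy->bundles; dst; dst = dst->next) {
			struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
			if (xdst->u.rt.fl.oif == fl->oif &&
			    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
			    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
			    xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
				dst_clone(dst);
				break;
			}
		}
		read_unlock_bh(&policy->lock);
		return dst;
	}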

Btw, it looks like the xfrm_dst garbage collection is broken.
Bundles are only garbage collected if a network device goes down,
or when dst_alloc() calls the collector after the gc threshold is
exceeded. Since the gc threshold became dynamic not long ago, it
can be very big. This causes stale xfrm_dst's to be kept alive
and, what is worse, their inner rtable dst to be kept referenced.
But as they were expired and dst_free'd, the dst core's pass over
still-referenced freed dst's goes through that list over and over
again and will kill the whole system's performance when the list
grows long.

I think as a minimum we should run a stale_bundle() check on all
xfrm_dst's every n minutes or so, as sketched below.
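
A minimal sketch of that idea (the timer field and period are
hypothetical; xfrm_prune_bundles() and stale_bundle() are the existing
helpers):

	static void xfrm_bundle_scan(unsigned long data)
	{
		struct net *net = (struct net *) data;

		/* prune every bundle that fails the existing staleness
		 * check, independent of dst_alloc() pressure */
		xfrm_prune_bundles(net, stale_bundle);
		mod_timer(&net->xfrm.bundle_scan_timer,
			  jiffies + XFRM_BUNDLE_SCAN_PERIOD);
	}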

>>> Then we cannot maintain policy use time. But if it's not a
>>> requirement, we could drop the policy from cache.
>>
>> I don't see why we can't maintain the policy use time if we did
>> this, all you need is a back-pointer from the top xfrm_dst.
> 
> Sure.

Actually no. As the pmtu case showed, it's more likely that the
xfrm_dst needs to be regenerated while the policy stays the
same, since the policy db isn't touched that often. If we keep
them separately, we can avoid the policy lookup (which is also
O(n)) most of the time. Also, the current cache entry validation
needs to check the policy's bundles_genid before allowing any
touching of the xfrm_dst. Otherwise we would have to keep a
global bundle_genid, and we'd lose the parent pointer on a cache
miss.

Caching bundles would be another win too. If we do cache entries
like this, we could track how many cache-miss xfrm_dst's we've
had, and use that to decide when to trigger the xfrm_dst garbage
collector instead of (or in addition to) a fixed timer; a sketch
of that idea follows.
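
Something along these lines (entirely hypothetical field and
threshold names):

	/* on each cache miss that forces a new bundle, bump a counter
	 * and kick the collector once misses accumulate */
	if (atomic_inc_return(&net->xfrm.bundle_miss_count) >
	    XFRM_BUNDLE_MISS_THRESH) {
		atomic_set(&net->xfrm.bundle_miss_count, 0);
		schedule_work(&net->xfrm.bundle_gc_work);
	}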

- Timo

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-18 19:30         ` Timo Teräs
@ 2010-03-19  0:31           ` Herbert Xu
  2010-03-19  5:48             ` Timo Teräs
  2010-03-19  0:32           ` Herbert Xu
  1 sibling, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  0:31 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Thu, Mar 18, 2010 at 09:30:58PM +0200, Timo Teräs wrote:
>
> Now, these created xfrm_dst gets cached in policy->bundles
> with ref count zero and not deleted until garbage collection
> is required. That's how xfrm_find_bundle tries to reuse them
> (but it does O(n) lookup).

Yes but the way you're caching it in the policy means that it
only helps if two consecutive packets happen to match the same
bundle.  If your traffic mix was such that each packet required
a different bundle, then we're back to where we started.

That's why I was asking for you to directly cache the xfrm_dst
objects in the flow cache.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-18 19:30         ` Timo Teräs
  2010-03-19  0:31           ` Herbert Xu
@ 2010-03-19  0:32           ` Herbert Xu
  1 sibling, 0 replies; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  0:32 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Thu, Mar 18, 2010 at 09:30:58PM +0200, Timo Teräs wrote:
>
>>> I don't see why we can't maintain the policy use time if we did
>>> this, all you need is a back-pointer from the top xfrm_dst.
>>
>> Sure.
>
> Actually no. As the pmtu case showed, it's more likely that
> xfrm_dst needs to be regenerated, but the policy stays the
> same since policy db isn't touched that often. If we keep
> them separately we can almost most of the time avoid doing
> policy lookup which is also O(n). Also the currently cache
> entry validation is needs to check policy's bundles_genid
> before allowing touching of xfrm_dst. Otherwise we would have
> to keep global bundle_genid, and we'd lose the parent pointer
> on cache miss.

A back-pointer does not require an O(n) lookup.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  0:31           ` Herbert Xu
@ 2010-03-19  5:48             ` Timo Teräs
  2010-03-19  6:03               ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-19  5:48 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Thu, Mar 18, 2010 at 09:30:58PM +0200, Timo Teräs wrote:
>> Now, these created xfrm_dst gets cached in policy->bundles
>> with ref count zero and not deleted until garbage collection
>> is required. That's how xfrm_find_bundle tries to reuse them
>> (but it does O(n) lookup).
> 
> Yes but the way you're caching it in the policy means that it
> only helps if two consecutive packets happen to match the same
> bundle.  If your traffic mix was such that each packet required
> a different bundle, then we're back to where we started.
> 
> That's why I was asking for you to directly cache the xfrm_dst
> objects in the flow cache.

But it always matches. The caching happens using the inner
flow, and an inner flow always matches the same bundle unless
the bundle expires or goes stale. What happens is that I get
multiple cache entries, one per inner flow, each referencing
the same bundle.

And this is even more useful with gre+esp. No matter what
I send inside gre (with private IPs), it ends up being routed
to a more limited set of outer public IP peers. The bundle
lookup happens on the public-IP flow, so the speedup works
even better.

>> Actually no. As the pmtu case showed, it's more likely that
>> xfrm_dst needs to be regenerated, but the policy stays the
>> same since policy db isn't touched that often. If we keep
>> them separately we can almost most of the time avoid doing
>> policy lookup which is also O(n). Also the currently cache
>> entry validation is needs to check policy's bundles_genid
>> before allowing touching of xfrm_dst. Otherwise we would have
>> to keep global bundle_genid, and we'd lose the parent pointer
>> on cache miss.
> A back-pointer does not require an O(n) lookup.

True. But if we go and prune a bundle because it is bad or
needs garbage collection, we need to invalidate all bundle
pointers, and we cannot access the back-pointer. Alternatively,
we would need to keep xfrm_dst references in the flow cache
again, requiring an expensive iteration over all flow cache
entries whenever an xfrm_dst needs to be deleted (which happens
often).

- Timo

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  5:48             ` Timo Teräs
@ 2010-03-19  6:03               ` Herbert Xu
  2010-03-19  6:21                 ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  6:03 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Fri, Mar 19, 2010 at 07:48:57AM +0200, Timo Teräs wrote:
>
> But it always matches. The caching happens using the inner
> flow. Inner flow always matches with the same bundle unless
> the bundle expires or goes stale. What happens is that I get
> multiple cache entries per-inner flow each referencing to the
> same bundle.

Sorry for being slow, but if it always matches, doesn't that mean
you'll only have a single bundle in the policy bundle list? IOW
why do we need this at all?

Or have I misread your patch? You *are* proposing to cache the last
used bundle in the policy, right?

> True. But if we go and prune a bundle due to it being bad or
> needing garbage collection we need to invalidate all bundles
> pointers, and we cannot access the back-pointer. Alternatively

Why can't you access the back-pointer? You should always have
a reference held on the policy, either explicit or implicit.

> we need to keep xfrm_dst references again in the flow cache
> requiring an expensive iteration of all flow cache entries
> whenever a xfrm_dst needs to be deleted (which happens often).

So does the IPv4 routing cache.  I think what this reflects is
just that the IPsec garbage collection mechanism is broken.

There is no point in doing a GC on every dst_alloc if we know
that it isn't going to go below the threshold.  It should gain
a minimum GC interval like IPv4.  Or perhaps we can move the
minimum GC interval check into the dst core.
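
A sketch of that guard (XFRM_GC_MIN_INTERVAL and the static timestamp
are invented here), modelled on 2.6.32's xfrm4_garbage_collect():

	static int xfrm4_garbage_collect(struct dst_ops *ops)
	{
		struct net *net = container_of(ops, struct net,
					       xfrm.xfrm4_dst_ops);
		static unsigned long last_gc;

		/* don't rescan all bundles on every dst_alloc(); honour
		 * a minimum interval like the ipv4 route cache GC does */
		if (!time_before(jiffies, last_gc + XFRM_GC_MIN_INTERVAL)) {
			last_gc = jiffies;
			xfrm4_policy_afinfo.garbage_collect(net);
		}
		return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
	}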

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  6:03               ` Herbert Xu
@ 2010-03-19  6:21                 ` Timo Teräs
  2010-03-19  7:17                   ` Herbert Xu
  2010-03-19  7:27                   ` Timo Teräs
  0 siblings, 2 replies; 41+ messages in thread
From: Timo Teräs @ 2010-03-19  6:21 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 07:48:57AM +0200, Timo Teräs wrote:
>> But it always matches. The caching happens using the inner
>> flow. Inner flow always matches with the same bundle unless
>> the bundle expires or goes stale. What happens is that I get
>> multiple cache entries per-inner flow each referencing to the
>> same bundle.
> 
> Sorry for being slow, but if it always matches, doesn't that mean
> you'll only have a single bundle in the policy bundle list? IOW
> why do we need this at all?

No. The bundle created for a specific flow always matches that
flow later.

With a transport mode wildcard policy, e.g. a single policy saying
"encrypt traffic with protocol X to all IP addresses", you get a
separate bundle per public IP destination. A bundle matches only
that specific IP, since it gets a separate xfrm_state. But you can
talk to all the hosts on the internet using the same policy, so
you can end up with a whole lot of valid bundles in the same
policy.

I'm not sure how this works in tunnel mode. It might be that a
single bundle can be reused for all packets. But I think the same
applies to tunnel mode, since afinfo->fill_dst() puts the inner
flow into the bundle's xfrm_dst->u.rt.fl, which is later compared
against the inner flow by afinfo->find_bundle(). I think this
implies that each flow traveling inside the tunnel gets its own
xfrm_dst, so again you end up with a whole lot of valid bundles
in the same policy.

> Or have I misread your patch? You *are* proposing to cache the last
> used bundle in the policy, right?

Yes and no. The bundle used is cached on a per-flow basis.
The flow cache can have a lot of entries, each referring to
the same policy but a separate bundle.

>> True. But if we go and prune a bundle due to it being bad or
>> needing garbage collection we need to invalidate all bundles
>> pointers, and we cannot access the back-pointer. Alternatively
> 
> Why can't you access the back-pointer? You should always have
> a reference held on the policy, either explicit or implicit.
> 
>> we need to keep xfrm_dst references again in the flow cache
>> requiring an expensive iteration of all flow cache entries
>> whenever a xfrm_dst needs to be deleted (which happens often).
> 
> So does the IPv4 routing cache.  I think what this reflects is
> just that the IPsec garbage collection mechanism is broken.
> 
> There is no point in doing a GC on every dst_alloc if we know
> that it isn't going to go below the threshold.  It should gain
> a minimum GC interval like IPv4.  Or perhaps we can move the
> minimum GC interval check into the dst core.

Yes, I reported the xfrm_dst GC being broken in an earlier mail.

But keeping both the policy and the bundle in the cache is still
a win. If we kill the xfrm_dst due to GC, we also lose the policy
the flow matched. We might need to kill the xfrm_dst due to the
inner dst going old, but the flow cache would still give a hit
with the policy info (but no bundle) the next time a packet comes
in using the same flow.

- Timo


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  6:21                 ` Timo Teräs
@ 2010-03-19  7:17                   ` Herbert Xu
  2010-03-19  7:27                   ` Timo Teräs
  1 sibling, 0 replies; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  7:17 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Fri, Mar 19, 2010 at 08:21:02AM +0200, Timo Teräs wrote:
>
>> Or have I misread your patch? You *are* proposing to cache the last
>> used bundle in the policy, right?
>
> Yes and no. The bundle used is cached on a per-flow basis.
> The flow cache can have a lot of entries, each referring to the
> same policy but a separate bundle.

OK so I did misread your patch.

In fact it is already doing exactly what I was suggesting; I'll
review your patch again with this new insight :)

> But keeping the policy and bundle in the cache is still a win.
> If we kill the xfrm_dst due to GC, we also lose the policy the
> flow matched. We might need to kill the xfrm_dst because the
> inner dst has gone old, but the flow cache would still give a
> hit with the policy info (but no bundle) the next time a packet
> comes in on the same flow.

We were in complete agreement all along :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-15 12:20 [PATCH] xfrm: cache bundle lookup results in flow cache Timo Teras
  2010-03-17 13:07 ` Herbert Xu
@ 2010-03-19  7:20 ` Herbert Xu
  2010-03-19  7:48   ` Timo Teräs
  1 sibling, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  7:20 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev

On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
>
> -		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
> -					   dir, xfrm_policy_lookup);
> -		err = PTR_ERR(policy);
> -		if (IS_ERR(policy)) {
> -			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
> -			goto dropdst;
> +		fce = flow_cache_lookup(&net->xfrm.flow_cache,
> +					fl, family, dir);
> +		if (fce == NULL)
> +			goto no_cache;
> +
> +		xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
> +		xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);

This doesn't work.

The flow cache operates without locking as it is a per-cpu cache.
To make this work you must ensure that you stay on the same CPU
or use some other form of synchronisation if you write to the
object returned.

AFAICS there is no synchronisation here and you're writing to fce.

So you'll need to disable preemption around the bit that touches
fce.
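
Schematically, something like this (just a sketch, reusing the
names from your patch):

	local_bh_disable();
	fce = flow_cache_lookup(&net->xfrm.flow_cache, fl, family, dir);
	if (fce != NULL) {
		/* all reads and writes of fce must happen inside
		 * this window, with softirqs off on this cpu */
		xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);
	}
	local_bh_enable();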

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  6:21                 ` Timo Teräs
  2010-03-19  7:17                   ` Herbert Xu
@ 2010-03-19  7:27                   ` Timo Teräs
  1 sibling, 0 replies; 41+ messages in thread
From: Timo Teräs @ 2010-03-19  7:27 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Timo Teräs wrote:
> Herbert Xu wrote:
>> On Fri, Mar 19, 2010 at 07:48:57AM +0200, Timo Teräs wrote:
>>> But it always matches. The caching happens using the inner
>>> flow. Inner flow always matches with the same bundle unless
>>> the bundle expires or goes stale. What happens is that I get
>>> multiple cache entries per-inner flow each referencing to the
>>> same bundle.
>>
>> Sorry for being slow, but if it always matches, doesn't that mean
>> you'll only have a single bundle in the policy bundle list? IOW
>> why do we need this at all?
> 
> No. A bundle created for a specific flow always matches that
> same flow later.

Just figured it's easier to explain with an example.

We have SPD:
	10.1.0.0/16 - 10.2.0.0/16 tunnel
		1.2.3.4 - 4.3.2.1

Now we get n+1 clients connecting to a server at 10.2.0.1.
They each get a separate bundle, since the xfrm_dst will be
created and searched using flow ids like:
	src 10.1.x.x dst 10.2.0.1

So there's one xfrm_policy and xfrm_state, but n+1
xfrm_dst's.

Since the flow cache caches the result of lookups on the
inner flow "10.1.x.x->10.2.0.1" basis, it always returns a
matching valid bundle in O(1) time unless the xfrm_dst has
expired.

Currently it is looked up with an O(n) search in find_bundle.

The same thing happens with wildcard transport mode SPDs.

E.g. SPD:
	0.0.0.0/0 - 0.0.0.0/0 proto gre, transport

We are talking gre to n+1 tunnel destinations, so we get n+1
xfrm_dst's in that xfrm_policy. The flow cache works on the
inner flow, using flows like:
	src 1.2.3.4 dst 4.3.2.1 proto gre
and can thus always keep the right policy in the cache, and
the bundle to use as long as it stays valid.

Hopefully this explains why I think the patch is
useful.

- Timo


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  7:20 ` Herbert Xu
@ 2010-03-19  7:48   ` Timo Teräs
  2010-03-19  8:29     ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-19  7:48 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
>> -		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
>> -					   dir, xfrm_policy_lookup);
>> -		err = PTR_ERR(policy);
>> -		if (IS_ERR(policy)) {
>> -			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
>> -			goto dropdst;
>> +		fce = flow_cache_lookup(&net->xfrm.flow_cache,
>> +					fl, family, dir);
>> +		if (fce == NULL)
>> +			goto no_cache;
>> +
>> +		xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
>> +		xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);
> 
> This doesn't work.
> 
> The flow cache operates without locking as it is a per-cpu cache.
> To make this work you must ensure that you stay on the same CPU
> or use some other form of synchronisation if you write to the
> object returned.
> 
> AFAICS there is no synchronisation here and you're writing to fce.
> 
> So you'll need to disable preemption around the bit that touches
> fce.

But flow_cache_lookup disables pre-emption until _put is called.
So it should work. Would there be a cleaner way?

However, I now figured that we need to make walk.dead atomic
because it is sometimes read without taking the policy lock.

- Timo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  7:48   ` Timo Teräs
@ 2010-03-19  8:29     ` Herbert Xu
  2010-03-19  8:37       ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  8:29 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Fri, Mar 19, 2010 at 09:48:17AM +0200, Timo Teräs wrote:
>
> But flow_cache_lookup disables pre-emption until _put is called.
> So it should work. Would there be a cleaner way?

Previously the flow cache returned a policy directly which works
because whenever we modify that policy we'd take the appropriate
lock.

Your patch changes it so that it now returns an fce.  But nothing
is guarding the code that modifies fce.  So two CPUs may end up
modifying the same fce.

However, it would appear that this race could be harmless, provided
that you are careful about dereferencing fce->policy and fce->dst.

IOW, this is not OK

	if (fce->policy)
		use fce->policy;

and this should work

	policy = fce->policy;
	if (policy)
		use policy;
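
To make the single read explicit, it could also be written as
(just a sketch):

	policy = ACCESS_ONCE(fce->policy);

which forces the pointer to be loaded exactly once.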

Actually on second thought, even this isn't totally safe.  Who
is taking a reference count on the policy and dst? I see a ref
count on the fce, but nothing on fce->dst and fce->policy.  Do
you have an implicit reference on them?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  8:29     ` Herbert Xu
@ 2010-03-19  8:37       ` Timo Teräs
  2010-03-19  8:47         ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-19  8:37 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 09:48:17AM +0200, Timo Teräs wrote:
>> But flow_cache_lookup disables pre-emption until _put is called.
>> So it should work. Would there be a cleaner way?
> 
> Previously the flow cache returned a policy directly which works
> because whenever we modify that policy we'd take the appropriate
> lock.
>
> Your patch changes it so that it now returns an fce.  But nothing
> is guarding the code that modifies fce.  So two CPUs may end up
> modifying the same fce.

But I changed that: the flow cache now does *not* call local_bh_enable
if it returns something. This is deferred until the corresponding _put
call. So bh's are disabled while we are touching the lookup results.

It'd probably make sense to remove that, and require _lookup to
be called with bh disabled, so it's more obvious that bh's are
disabled when touching the cache entry.

> However, it would appear that this race could be harmless, provided
> that you are careful about dereferencing fce->policy and fce->dst.
> 
> IOW, this is not OK
> 
> 	if (fce->policy)
> 		use fce->policy;
> 
> and this should work
> 
> 	policy = fce->policy;
> 	if (policy)
> 		use policy;

Not a race. We need to keep bh's disabled while touching fce
for various reasons.

> Actually on second tought, even this isn't totally safe.  Who
> is taking a reference count on the policy and dst? I see a ref
> count on the fce, but nothing on fce->dst and fce->policy.  Do
> you have an implicit reference on them?

No one. When the policy and dst are in the cache there's no
reference. The cache generation ids ensure that the objects
exist while they are in the cache. It might make sense to add
references to both objects and do a BUG_ON if the flow cache
flusher would need to delete an object. I guess that would be
the proper way, since that's how the dst stuff works too.

- Timo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  8:37       ` Timo Teräs
@ 2010-03-19  8:47         ` Herbert Xu
  2010-03-19  9:12           ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  8:47 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Fri, Mar 19, 2010 at 10:37:58AM +0200, Timo Teräs wrote:
>
> But I changed that. the flow cache now does *not* call local_bh_enable
> if it returns something. This is deferred until corresponding _put
> call. So bh's are disabled while we are touching the lookup results.

I'm sorry but making a function like flow_cache_lookup return with
BH disabled is just wrong!

> It'd probably make sense to remove that. And require _lookup to
> be called with bh disabled so it's more obvious that bh's are
> disabled when touching the cache entry.

That would be better but it's still hacky.  Proper reference
counting like we had before would be my preference.

> Not a race. We need to keep bh's disabled while touching fce
> for various reasons.

What are those reasons (apart from this race)?

> No one. When policy and dst is on cache there's no reference.
> The cache generation id's ensure that the object exists when
> they are in cache. It might make sense to add references to
> both objects and do a BUG_ON if the flow cache flusher would
> need to delete an object. I guess this would be the proper
> way, since that's how the dst stuff works too.

The cache genid is not enough:

CPU1			CPU2
check genid == OK
			update genid
			kill policy
			kfree on policy
use policy == BOOM

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  8:47         ` Herbert Xu
@ 2010-03-19  9:12           ` Timo Teräs
  2010-03-19  9:32             ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-19  9:12 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 10:37:58AM +0200, Timo Teräs wrote:
>> But I changed that. the flow cache now does *not* call local_bh_enable
>> if it returns something. This is deferred until corresponding _put
>> call. So bh's are disabled while we are touching the lookup results.
> 
> I'm sorry but making a function like flow_cache_lookup return with
> BH disabled is just wrong!
> 
>> It'd probably make sense to remove that. And require _lookup to
>> be called with bh disabled so it's more obvious that bh's are
>> disabled when touching the cache entry.
> 
> That would be better but it's still hacky.  Proper reference
> counting like we had before would be my preference.

Well, the cache entry is still referenced only very shortly;
I don't see why keeping bh disabled while doing it is considered
a hack. Refcounting the cache entries is trickier. Though,
it could be used to optimize the update process: we could safely
update the entry instead of doing a new lookup later.

>> Not a race. We need to keep bh's disabled while touching fce
>> for various reasons.
> 
> What are those reasons (apart from this race)?

This, and the fact that the cache is synchronized by
flow_cache_flush executing stuff on the other cpus, ensuring
that none of them is running any protected cache-accessing code.
See below.

> 
>> No one. When policy and dst is on cache there's no reference.
>> The cache generation id's ensure that the object exists when
>> they are in cache. It might make sense to add references to
>> both objects and do a BUG_ON if the flow cache flusher would
>> need to delete an object. I guess this would be the proper
>> way, since that's how the dst stuff works too.
> 
> The cache genid is not enough:
> 
> CPU1			CPU2
> check genid == OK
> 			update genid
> 			kill policy
> 			kfree on policy
> use policy == BOOM

The sequence currently goes:

CPU1			CPU2
disable_bh
check genid == OK
			update genid
			call cache_flush
			blocks
use policy == OK
and take refcount
enable_bh
cache_flush smpcall executes and unblocks cpu2
			returns from cache_flush
			kill policy
			kfree on policy

- Timo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  9:12           ` Timo Teräs
@ 2010-03-19  9:32             ` Herbert Xu
  2010-03-19  9:53               ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-19  9:32 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev

On Fri, Mar 19, 2010 at 11:12:21AM +0200, Timo Teräs wrote:
>
>> That would be better but it's still hacky.  Proper reference
>> counting like we had before would be my preference.
>
> Well, the cache entry is still referenced only very shortly,
> I don't see why keeping bh disabled while doing it is considered
> a hack. Refcounting the cache entries is trickier. Though,
> it could be used to optimize the update process: we could safely
> update it instead of doing a new lookup later.

Well we had a nicely type-agnostic cache which is self-contained,
but your patch is bleeding generic code into xfrm_policy.c, that's
why I felt it to be hacky :)

Anyway I see how your scheme works now as far as object life
is concerned, and I agree that it is safe.

However, I wonder if we could do it while still leaving all the
object life-cycle management stuff (and the BH disabling bits)
in flow.c

The crux of the issue is that you now have two objects to track
instead of one.  As the direction is a key in the lookup, we're
really only worried about the outbound case here.

So how about going back to what I suggested earlier, and keeping
a back-pointer from xfrm_dst to the policy? Of course xfrm_dst
would also hold a ref count on the policy.  You'd only have to
do it for the top-level xfrm_dst.

It does mean that you'll need to write a different resolver for
outbound vs. inbound/forward, but that makes sense because we
only use bundles for outbound policies.

What do you think?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  9:32             ` Herbert Xu
@ 2010-03-19  9:53               ` Timo Teräs
  2010-03-20 15:17                 ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-19  9:53 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 11:12:21AM +0200, Timo Teräs wrote:
>>> That would be better but it's still hacky.  Proper reference
>>> counting like we had before would be my preference.
>> Well, the cache entry is still referenced only very shortly,
>> I don't see why keeping bh disabled while doing it is considered
>> a hack. Refcounting the cache entries is trickier. Though,
>> it could be used to optimize the update process: we could safely
>> update it instead of doing a new lookup later.
> 
> Well we had a nicely type-agnostic cache which is self-contained,
> but your patch is bleeding generic code into xfrm_policy.c, that's
> why I felt it to be hacky :)
> 
> Anyway I see how your scheme works now as far as object life
> is concerned, and I agree that it is safe.
> 
> However, I wonder if we could do it while still leaving all the
> object life-cycle management stuff (and the BH disabling bits)
> in flow.c
> 
> The crux of the issue is that you now have two objects to track
> instead of one.  As the direction is a key in the lookup, we're
> really only worried about the outbound case here.
> 
> So how about going back to what I suggested earlier, and keeping
> a back-pointer from xfrm_dst to the policy? Of course xfrm_dst
> would also hold a ref count on the policy.  You'd only have to
> do it for the top-level xfrm_dst.
> 
> It does mean that you'll need to write a different resolver for
> outbound vs. inbound/forward, but that makes sense because we
> only use bundles for outbound policies.
> 
> What do you think?

When I first started reading the code, I got slightly confused
about how the garbage collection happens. What I did not like
in the current code is the atomic_dec() in flow.c that does not
check whether the count dropped to zero. For policy objects that
would mean the object needs to be deleted (which would be a bug
if it happened in flow.c; the policy gc calls flush before
releasing its own reference), but for xfrm_dst it's perfectly ok
to do atomic_dec() and let the dst core garbage collect the items.

But now, thinking more, it would probably make more sense to
just cache xfrm_dst's and keep a ref to the policy on them. So
in general I agree with your recommendation. The only immediate
problem I can think of now is that the resolver would need to
atomically check if the xfrm_dst is valid and, if not, resolve
a new one. But creating a new xfrm_dst involves taking locks and
can sleep, so it cannot be inside the main resolver.

Alternatively, we'd need to:
a) still expose the flow cache entry structs and do locking or
   refcounting on them
b) have two versions of flow lookup: one that calls the resolver
   with bh disabled and can atomically look up and update the
   entry, and a second version that looks up, calls validation,
   and if the entry is not valid calls the resolver with bh
   enabled and does a new flow cache lookup to update the cache

Also, relatedly: is there a way to release the xfrm_dst's child
and route refs when xfrm_bundle_ok fails? This would improve GC
of the ipv4 rtable entries they reference.

- Timo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-19  9:53               ` Timo Teräs
@ 2010-03-20 15:17                 ` Herbert Xu
  2010-03-20 16:26                   ` Timo Teräs
                                     ` (2 more replies)
  0 siblings, 3 replies; 41+ messages in thread
From: Herbert Xu @ 2010-03-20 15:17 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev, David S. Miller

On Fri, Mar 19, 2010 at 11:53:44AM +0200, Timo Teräs wrote:
>
> But now, thinking more, it would probably make more sense to
> just cache xfrm_dst's and keep ref to the policy on them. So
> in general I agree with your recommendation. The only immediate
> problem I can think of now is that the resolver would need to
> atomically check if xfrm_dst is valid, if not, resolve new one.
> But creating new xfrm_dst involves taking locks and can sleep
> so it cannot be inside the main resolver.

OK this brings out my favourite topic :)

The reason we have to sleep is to resolve the template.  So if
we had queueing for pending xfrm_dst objects, we wouldn't have
to sleep at all when creating the top-level xfrm_dst.

Packets using that xfrm_dst can wait in the queue until it is
fully resolved.  Now obviously this code doesn't exist so this
is all just a wet dream.

Setting my favourite topic aside, I have to come to the conclusion
that your patch still doesn't fully resolve the problem you set out
to fix.

The crux of the issue is the linked list of all bundles in a
policy and the obvious problems stemming from walking a linked
list that is unbounded.

The reason I think it doesn't fully resolve this is because of
the flow cache.  Being a per-cpu cache, when you create the xfrm
dst the first time around, you'll at most put it in one CPU's
cache.

The next CPU that comes along will still have to walk that same
bundle linked list.  So we're back to square one.

Now Dave, my impression is that we picked the per-cpu design
because it was the best data structure we had back in 2002,
right?

If so I'd like us to think about the possibility of switching
over to a different design, in particular, an RCU-based hash
table, similar to the one I just used for bridge multicasting.

This would eliminate the need for walking the bundle list apart
from the case when we're destroying the policy, which can be
done in process context.

Actually I just realised that the other way we can fix this is
to make xfrm_dst objects per-cpu just like IPv4 routes.  That
is, when you fail to find an xfrm_dst object in the per-cpu
cache, you don't bother calling xfrm_find_bundle but just make
a new bundle.

This is probably much easier than replacing the whole flow cache.
Can anyone think of any problems with duplicate xfrm_dst objects?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-20 15:17                 ` Herbert Xu
@ 2010-03-20 16:26                   ` Timo Teräs
  2010-03-21  0:46                     ` Herbert Xu
  2010-03-22  1:26                   ` David Miller
  2010-03-22  1:28                   ` David Miller
  2 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-20 16:26 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, David S. Miller

Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 11:53:44AM +0200, Timo Teräs wrote:
>> But now, thinking more, it would probably make more sense to
>> just cache xfrm_dst's and keep ref to the policy on them. So
>> in general I agree with your recommendation. The only immediate
>> problem I can think now is that the resolved would need to
>> atomically check if xfrm_dst is valid, if not, resolve new one.
>> But creating new xfrm_dst involves taking locks and can sleep
>> so it cannot be inside the main resolver.
> 
> OK this brings out my favourite topic :)
> 
> The reason we have to sleep is to resolve the template.  So if
> we had queueing for pending xfrm_dst objects, we wouldn't have
> to sleep at all when creating the top-level xfrm_dst.
> 
> Packets using that xfrm_dst can wait in the queue until it is
> fully resolved.  Now obviously this code doesn't exist so this
> is all just a wet dream.

Right. That sounds useful.

> Setting my favourite topic aside, I have to come to the conclusion
> that your patch still doesn't fully resolve the problem you set out
> to fix.
> 
> The crux of the issue is the linked list of all bundles in a
> policy and the obvious problems stemming from walking a linked
> list that is unbounded.
> 
> The reason I think it doesn't fully resolve this is because of
> the flow cache.  Being a per-cpu cache, when you create the xfrm
> dst the first time around, you'll at most put it in one CPU's
> cache.
> 
> The next CPU that comes along will still have to walk that same
> bundle linked list.  So we're back to square one.

Not exactly: each CPU does one slow lookup, after which it
finds it fast. But yes, it's not a perfect solution, especially
if the cpu happens to get switched between the initial lookup
and the update.

> Now Dave, my impression is that we picked the per-cpu design
> because it was the best data structure we had back in 2002,
> right?
> 
> If so I'd like us to think about the possibility of switching
> over to a different design, in particular, an RCU-based hash
> table, similar to the one I just used for bridge multicasting.
> 
> This would eliminate the need for walking the bundle list apart
> from the case when we're destroying the policy, which can be
> done in process context.

Right. This would speed the bundle lookup in all cases.

Except... we can have an override policy on a per-socket basis.
We should include the per-socket override in the flow lookups
so that those sockets also get a boost from the cache. Though
the usual use case is to disable all policies (so that e.g. IKE
can be spoken without policies applying).

> Actually I just realised that the other way we can fix this is
> to make xfrm_dst objects per-cpu just like IPv4 routes.  That
> is, when you fail to find an xfrm_dst object in the per-cpu
> cache, you don't bother calling xfrm_find_bundle but just make
> a new bundle.
> 
> This is probably much easier than replacing the whole flow cache.
> Can anyone think of any problems with duplicate xfrm_dst objects?

Sounds like a very good idea. If we instantiate a new xfrm_dst,
all that it shares with the others is the xfrm_state and
xfrm_policy (the inner objects will be unique). Since that's
what happens anyway, I don't see any problem with this.

So I should go ahead and:
1. modify flow cache to be more generic (have virtual put and get
   for each object; and remove the atomic_t pointer)
2. modify flow cache to have slow and fast resolvers so we can
   cope with the current sleeping requirement
3. cache bundles instead of policies for outgoing stuff
4. kill find_bundle and just instantiate new ones if we get a
   cache miss
5. put all bundles on a global hlist (since the only place that
   walks through them is the gc, and a stale bundle can be
   dst_free'd right away); use genids for the policy to flush
   old bundles
6. dst_free and unlink a bundle immediately if it's found to be
   stale

- Timo


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-20 16:26                   ` Timo Teräs
@ 2010-03-21  0:46                     ` Herbert Xu
  2010-03-21  7:34                       ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-21  0:46 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev, David S. Miller

On Sat, Mar 20, 2010 at 06:26:00PM +0200, Timo Teräs wrote:
>
> So should go ahead and:
> 1. modify flow cache to be more generic (have virtual put and get
>   for each object; and remove the atomic_t pointer)
> 2. modify flow cache to have slow and fast resolvers so we can
>   cope with the current sleeping requirement

I don't think we need either of these.  To support the sleep
requirement, just return -EAGAIN from the resolver when the
template can't be resolved.  Then the caller of flow_cache_lookup
can sleep as it does now.  It simply has to repeat the flow
cache lookup afterwards.
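
Schematically the caller would then loop like this (a sketch
only; it assumes flow_cache_lookup can report the unresolved
case with ERR_PTR(-EAGAIN)):

	for (;;) {
		fle = flow_cache_lookup(&net->xfrm.flow_cache,
					fl, family, dir);
		if (!IS_ERR(fle))
			break;
		err = PTR_ERR(fle);
		if (err != -EAGAIN)
			return err;
		/* slow path: resolve the templates here, as the
		 * current code does; this may sleep, and then we
		 * simply retry the cache lookup */
	}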

> 3. cache bundles instead of policies for outgoing stuff
> 4. kill find_bundle and just instantiate new ones if we get cache
>   miss
> 5. put all bundles to global hlist (since only place that walks
>   through them is gc, and stale bundle can be dst_free'd right
>   away); use genid's for policy to flush old bundles
> 6. dst_free and unlink bundle immediately if it's found to be stale

Sounds good.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-21  0:46                     ` Herbert Xu
@ 2010-03-21  7:34                       ` Timo Teräs
  2010-03-21  8:31                         ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-21  7:34 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, David S. Miller

Herbert Xu wrote:
> On Sat, Mar 20, 2010 at 06:26:00PM +0200, Timo Teräs wrote:
>> So should go ahead and:
>> 1. modify flow cache to be more generic (have virtual put and get
>>   for each object; and remove the atomic_t pointer)
>> 2. modify flow cache to have slow and fast resolvers so we can
>>   cope with the current sleeping requirement
> 
> I don't think we need either of these.  To support the sleep
> requirement, just return -EAGAIN from the resolver when the
> template can't be resolved.  Then the caller of flow_cache_lookup
> can sleep as it does now.  It simply has to repeat the flow
> cache lookup afterwards.

Ok, we can do that to skip 2. But I think 1 would still be useful.
It'd probably be good to actually have a flow_cache_ops pointer in
each entry instead of the atomic_t pointer.

The reasoning:
- we can then have type-based checks that the reference count
  stays valid (e.g. a policy's refcount must not go to zero,
  that would be a bug, and we can call dst_release which warns
  if the refcount goes negative); imho it's a hack to call
  atomic_dec instead of the real type's xxx_put
- the flow cache needs to somehow know if the entry is stale so
  it'll try to refresh it atomically; e.g. if there's no
  check for 'stale', the lookup returns a stale xfrm_dst. we'd
  then need a new api to update the stale entry, or flush it out
  and repeat the lookup. the virtual get could check for the
  entry being stale (if so, release it) and then return null for
  the generic code to call the resolver atomically
- for paranoia we can actually check the type of the object in
  the cache via the ops (if needed)
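
Roughly, I'm thinking of something like this (a sketch only; all
the names here are invented for illustration):

	struct flow_cache_entry;

	struct flow_cache_ops {
		/* return the entry ready for use, or NULL if it
		 * is stale so the generic code runs the resolver */
		struct flow_cache_entry *(*get)(struct flow_cache_entry *fce);
		void (*put)(struct flow_cache_entry *fce);
	};

	struct flow_cache_entry {
		/* ... key, family, dir as before ... */
		const struct flow_cache_ops *ops;
	};

	/* the xfrm side embeds the generic entry and reaches the
	 * outer object with container_of() in its get/put */
	struct xfrm_flow_cache_entry {
		struct flow_cache_entry	fce;
		struct xfrm_policy	*policy;
		struct xfrm_dst		*xdst;
	};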

>> 3. cache bundles instead of policies for outgoing stuff
>> 4. kill find_bundle and just instantiate new ones if we get cache
>>   miss
>> 5. put all bundles to global hlist (since only place that walks
>>   through them is gc, and stale bundle can be dst_free'd right
>>   away); use genid's for policy to flush old bundles
>> 6. dst_free and unlink bundle immediately if it's found to be stale
> 
> Sounds good.

Okay. Sounds like a plan. I'll work on this next week.
I'll also try to make it a series of patches instead of the big
hunk I sent initially.

- Timo


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-21  7:34                       ` Timo Teräs
@ 2010-03-21  8:31                         ` Timo Teräs
  2010-03-22  3:52                           ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-21  8:31 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, David S. Miller

Timo Teräs wrote:
> Herbert Xu wrote:
>> On Sat, Mar 20, 2010 at 06:26:00PM +0200, Timo Teräs wrote:
>>> So should go ahead and:
>>> 1. modify flow cache to be more generic (have virtual put and get
>>>   for each object; and remove the atomic_t pointer)
>>> 2. modify flow cache to have slow and fast resolvers so we can
>>>   cope with the current sleeping requirement
>>
>> I don't think we need either of these.  To support the sleep
>> requirement, just return -EAGAIN from the resolver when the
>> template can't be resolved.  Then the caller of flow_cache_lookup
>> can sleep as it does now.  It simply has to repeat the flow
>> cache lookup afterwards.
> 
> Ok, we can do that to skip 2. But I think 1 would be still useful.
> It'd probably be good to actually have flow_cache_ops pointer in
> each entry instead of the atomic_t pointer.
> 
> The reasoning:
> - we can then have type-based checks that the reference count
>  is valid (e.g. policy's refcount must not go to zero, it's bug,
>  and we can call dst_release which warns if refcount goes to
>  negative); imho it's hack to call atomic_dec instead of the
>  real type's xxx_put
> - the flow cache needs to somehow know if the entry is stale so
>  it'll try to refresh it atomically; e.g. if there's no
>  check for 'stale', the lookup returns stale xfrm_dst. we'd
>  then need new api to update the stale entry, or flush it out
>  and repeat the lookup. the virtual get could check for it being
>  stale (if so release the entry) and then return null for the
>  generic code to call the resolver atomically
> - for paranoia we can actually check the type of the object in
>  cache via the ops (if needed)

- could cache bundle OR policy for outgoing stuff. it's useful
  to cache the policy in case we need to sleep, or if it's a
  policy forbidding traffic. in those cases there's no bundle
  to cache at all. alternatively we can make dummy bundles that
  are marked invalid and are just used to keep a reference to
  the policy.

Oh, this also implies that the resolver function should be
changed to get the old stale object so it can re-use it to
get the policy object instead of searching it all over again.

- Timo


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-20 15:17                 ` Herbert Xu
  2010-03-20 16:26                   ` Timo Teräs
@ 2010-03-22  1:26                   ` David Miller
  2010-03-22  1:28                   ` David Miller
  2 siblings, 0 replies; 41+ messages in thread
From: David Miller @ 2010-03-22  1:26 UTC (permalink / raw)
  To: herbert; +Cc: timo.teras, netdev

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 20 Mar 2010 23:17:51 +0800

> Now Dave, my impression is that we picked the per-cpu design
> because it was the best data structure we had back in 2002,
> right?

Basically.

It was envisioned that flows at that level of detail would be spread
between different cpus, and that individual flows wouldn't propagate
onto multiple cpus much, if at all.  And if they did, no big deal,
we have an entry in the cache of those cpus.

Do we know cases where it happens often?

In any event, RCU would certainly fit the bill just as easily and I
have no qualms against going in that direction.

Timo mentioned the socket overrides; we handle them at the top
level, right before we look into the flow cache, and I think it
should stay that way: we shouldn't bother tossing those into the
flow cache at all.  Just my humble opinion on this :-)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-20 15:17                 ` Herbert Xu
  2010-03-20 16:26                   ` Timo Teräs
  2010-03-22  1:26                   ` David Miller
@ 2010-03-22  1:28                   ` David Miller
  2010-03-22  1:32                     ` Herbert Xu
  2 siblings, 1 reply; 41+ messages in thread
From: David Miller @ 2010-03-22  1:28 UTC (permalink / raw)
  To: herbert; +Cc: timo.teras, netdev

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 20 Mar 2010 23:17:51 +0800

> Actually I just realised that the other way we can fix this is
> to make xfrm_dst objects per-cpu just like IPv4 routes.  That
> is, when you fail to find an xfrm_dst object in the per-cpu
> cache, you don't bother calling xfrm_find_bundle but just make
> a new bundle.

How are ipv4 routing cache entries per-cpu?  That would screw up route
metrics for TCP sockets quite a lot if they were.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22  1:28                   ` David Miller
@ 2010-03-22  1:32                     ` Herbert Xu
  2010-03-22  1:36                       ` David Miller
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-22  1:32 UTC (permalink / raw)
  To: David Miller; +Cc: timo.teras, netdev

On Sun, Mar 21, 2010 at 06:28:46PM -0700, David Miller wrote:
> From: Herbert Xu <herbert@gondor.apana.org.au>
> Date: Sat, 20 Mar 2010 23:17:51 +0800
> 
> > Actually I just realised that the other way we can fix this is
> > to make xfrm_dst objects per-cpu just like IPv4 routes.  That
> > is, when you fail to find an xfrm_dst object in the per-cpu
> > cache, you don't bother calling xfrm_find_bundle but just make
> > a new bundle.
> 
> How are ipv4 routing cache entries per-cpu?  That would screw up route
> metrics for TCP sockets quite a lot if they were.

You're right of course, s/just like IPv4 routes// :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22  1:32                     ` Herbert Xu
@ 2010-03-22  1:36                       ` David Miller
  2010-03-22  1:40                         ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: David Miller @ 2010-03-22  1:36 UTC (permalink / raw)
  To: herbert; +Cc: timo.teras, netdev

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 22 Mar 2010 09:32:57 +0800

> On Sun, Mar 21, 2010 at 06:28:46PM -0700, David Miller wrote:
>> From: Herbert Xu <herbert@gondor.apana.org.au>
>> Date: Sat, 20 Mar 2010 23:17:51 +0800
>> 
>> > Actually I just realised that the other way we can fix this is
>> > to make xfrm_dst objects per-cpu just like IPv4 routes.  That
>> > is, when you fail to find an xfrm_dst object in the per-cpu
>> > cache, you don't bother calling xfrm_find_bundle but just make
>> > a new bundle.
>> 
>> How are ipv4 routing cache entries per-cpu?  That would screw up route
>> metrics for TCP sockets quite a lot if they were.
> 
> You're right of course, s/just like IPv4 routes// :)

And as a consequence, making the xfrm_dst's be per-cpu would mess with
route metrics for TCP.

If we do something like that, then there is simply no reason any
longer to have such fine-grained routing metrics if the one thing that
would use it heavily (ipsec) stops doing so completely.

At that point we can go to a host cache for metrics just like BSD, and
pull all of the metrics out of struct dst (enormous win as it makes
all routes significantly smaller).

I'm willing to consider this seriously, to be honest.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22  1:36                       ` David Miller
@ 2010-03-22  1:40                         ` Herbert Xu
  2010-03-22  3:12                           ` David Miller
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-22  1:40 UTC (permalink / raw)
  To: David Miller; +Cc: timo.teras, netdev

On Sun, Mar 21, 2010 at 06:36:56PM -0700, David Miller wrote:
>
> And as a consequence, making the xfrm_dst's be per-cpu would mess with
> route metrics for TCP.

Actually xfrm_dst currently relies on IPv4 rt objects to maintain
the metrics.  So as long as IPv4 routes are still global, the
metrics won't be affected as far as I can see.

Did I miss something?

> I'm willing to consider this seriously, to be honest.

I would consider that too.  But right now I'm just looking for
the bare minimum to solve the problem at hand :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22  1:40                         ` Herbert Xu
@ 2010-03-22  3:12                           ` David Miller
  2010-03-22  3:52                             ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: David Miller @ 2010-03-22  3:12 UTC (permalink / raw)
  To: herbert; +Cc: timo.teras, netdev

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 22 Mar 2010 09:40:10 +0800

> On Sun, Mar 21, 2010 at 06:36:56PM -0700, David Miller wrote:
>>
>> And as a consequence, making the xfrm_dst's be per-cpu would mess with
>> route metrics for TCP.
> 
> Actually xfrm_dst currently relies on IPv4 rt objects to maintain
> the metrics.  So as long as IPv4 routes are still global, then the
> metrics won't be affected as far as can I see.
> 
> Did I miss something?

Good point, I was misunderstanding how things work now and how
that would change with your proposal.

Having multiple xfrm_dsts exist for an IPSEC route seems fine
to me.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-21  8:31                         ` Timo Teräs
@ 2010-03-22  3:52                           ` Herbert Xu
  2010-03-22 18:03                             ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-22  3:52 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev, David S. Miller

On Sun, Mar 21, 2010 at 10:31:23AM +0200, Timo Teräs wrote:
>
>> Ok, we can do that to skip 2. But I think 1 would be still useful.
>> It'd probably be good to actually have flow_cache_ops pointer in
>> each entry instead of the atomic_t pointer.
>>
>> The reasoning:
>> - we can then have type-based checks that the reference count
>>  is valid (e.g. policy's refcount must not go to zero, it's bug,
>>  and we can call dst_release which warns if refcount goes to
>>  negative); imho it's hack to call atomic_dec instead of the
>>  real type's xxx_put
>> - the flow cache needs to somehow know if the entry is stale so
>>  it'll try to refresh it atomically; e.g. if there's no
>>  check for 'stale', the lookup returns stale xfrm_dst. we'd
>>  then need new api to update the stale entry, or flush it out
>>  and repeat the lookup. the virtual get could check for it being
>>  stale (if so release the entry) and then return null for the
>>  generic code to call the resolver atomically
>> - for paranoia we can actually check the type of the object in
>>  cache via the ops (if needed)

The reason I'd prefer to keep the current scheme is to avoid
an additional indirect function call on each packet.

The way it would work is (we need flow_cache_lookup to return
fle instead of the object):

	fle = flow_cache_lookup
	xdst = fle->object
	if (xdst is stale) {
		flow_cache_mark_obsolete(fle)
		fle = flow_cache_lookup
		xdst = fle->object
		if (xdst is stale)
			return error
	}

Where flow_cache_mark_obsolete would set a flag in the fle that's
checked by flow_cache_lookup.  To prevent the very rare case
where we mark an entry obsolete incorrectly, the resolver function
should double-check that the existing entry is indeed obsolete
before making a new one.

This way we give the overhead over to the slow path where the
bundle is stale.

You were saying that our bundles are going stale very frequently;
that would sound like a bug that we should look into.  The whole
caching scheme is pointless if the bundle is going stale every
other packet.

> - could cache bundle OR policy for outgoing stuff. it's useful
>  to cache the policy in case we need to sleep, or if it's a
>  policy forbidding traffic. in those cases there's no bundle
>  to cache at all. alternatively we can make dummy bundles that
>  are marked invalid and are just used to keep a reference to
>  the policy.

My instinct is to go with dummy bundles.  That way given the
direction we know exactly what object type it is.  Having mixed
object types is just too much of a pain.

> Oh, this also implies that the resolver function should be
> changed to get the old stale object so it can re-use it to
> get the policy object instead of searching it all over again.

That should be easy to implement.  Just prefill the obj argument
to the resolver with either NULL or the stale object.

For the bundle resolver, it should also remove the stale bundle
from the policy bundle list and drop its reference.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22  3:12                           ` David Miller
@ 2010-03-22  3:52                             ` Herbert Xu
  2010-03-22 18:31                               ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-22  3:52 UTC (permalink / raw)
  To: David Miller; +Cc: timo.teras, netdev

On Sun, Mar 21, 2010 at 08:12:58PM -0700, David Miller wrote:
> 
> Good point, I was misunderstanding how things work now and how
> that would change with your proposal.
> 
> Having multiple xfrm_dsts exist for an IPSEC route seems fine
> to me.

Thanks for the confirmation.

Timo, let's roll along with the per-cpu xfrm_dst approach.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22  3:52                           ` Herbert Xu
@ 2010-03-22 18:03                             ` Timo Teräs
  2010-03-23  7:28                               ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-22 18:03 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, David S. Miller

Herbert Xu wrote:
> On Sun, Mar 21, 2010 at 10:31:23AM +0200, Timo Teräs wrote:
>>> Ok, we can do that to skip 2. But I think 1 would be still useful.
>>> It'd probably be good to actually have flow_cache_ops pointer in
>>> each entry instead of the atomic_t pointer.
>>>
>>> The reasoning:
>>> - we can then have type-based checks that the reference count
>>>  is valid (e.g. policy's refcount must not go to zero, it's bug,
>>>  and we can call dst_release which warns if refcount goes to
>>>  negative); imho it's hack to call atomic_dec instead of the
>>>  real type's xxx_put
>>> - the flow cache needs to somehow know if the entry is stale so
>>>  it'll try to refresh it atomically; e.g. if there's no
>>>  check for 'stale', the lookup returns stale xfrm_dst. we'd
>>>  then need new api to update the stale entry, or flush it out
>>>  and repeat the lookup. the virtual get could check for it being
>>>  stale (if so release the entry) and then return null for the
>>>  generic code to call the resolver atomically
>>> - for paranoia we can actually check the type of the object in
>>>  cache via the ops (if needed)
> 
> The reason I'd prefer to keep the current scheme is to avoid
> an additional indirect function call on each packet.
> 
> The way it would work is (we need flow_cache_lookup to return
> fle instead of the object):
> 
> 	fle = flow_cache_lookup
> 	xdst = fle->object
> 	if (xdst is stale) {
> 		flow_cache_mark_obsolete(fle)
> 		fle = flow_cache_lookup
> 		xdst = fle->object
> 		if (xdst is stale)
> 			return error
> 	}
> 
> Where flow_cache_mark_obsolete would set a flag in the fle that's
> checked by flow_cache_lookup.  To prevent the very rare case
> where we mark an entry obsolete incorrectly, the resolver function
> should double-check that the existing entry is indeed obsolete
> before making a new one.
> 
> This way we give the overhead over to the slow path where the
> bundle is stale.

Well, yes. The fast path would be slightly faster.

However, I still find the indirect call based thingy more elegant.
We would also get more common code, as flow_cache_lookup could then
figure out from the virtual call whether the entry needs refreshing.
And doing just atomic_dec instead of the type-based thingy
feels slightly kludgy. I don't think the speed difference between
a direct and an indirect call is that significant.

Also, the fle would just need a "struct flow_cache_ops *", with
wrappers that use container_of to figure out the real address of
the cached struct. This would allow a truly type-agnostic cache.
So we'd just need the 'ops' pointer instead of the current
object pointer and atomic_t pointer, saving on fle size.
But yes, it does impose the small speed penalty of an indirect
call.

I prefer the 'ops' thingy, but have no strong feelings either way.
Do you feel strongly about going with the current scheme here?

> You were saying that our bundles are going stale very frequently,
> that would sound like a bug that we should look into.  The whole
> caching scheme is pointless if the bundle is going stale every
> other packet.

I mean frequently as in 'minutes', not as in 'milliseconds'. The
bundles go stale only when the policy (mostly by user action) or
the ip route (pmtu / minutes) changes. So no biggie here.

>> - could cache bundle OR policy for outgoing stuff. it's useful
>>  to cache the policy in case we need to sleep, or if it's a
>>  policy forbidding traffic. in those cases there's no bundle
>>  to cache at all. alternatively we can make dummy bundles that
>>  are marked invalid and are just used to keep a reference to
>>  the policy.
> 
> My instinct is to go with dummy bundles.  That way given the
> direction we know exactly what object type it is.  Having mixed
> object types is just too much of a pain.

Sounds good.

>> Oh, this also implies that the resolver function should be
>> changed to get the old stale object so it can re-use it to
>> get the policy object instead of searching it all over again.
> 
> That should be easy to implement.  Just prefill the obj argument
> to the resolver with either NULL or the stale object.
> 
> For the bundle resolver, it should also remove the stale bundle
> from the policy bundle list and drop its reference.

Yup.

Cheers,
 Timo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22  3:52                             ` Herbert Xu
@ 2010-03-22 18:31                               ` Timo Teräs
  0 siblings, 0 replies; 41+ messages in thread
From: Timo Teräs @ 2010-03-22 18:31 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, netdev

Herbert Xu wrote:
> On Sun, Mar 21, 2010 at 08:12:58PM -0700, David Miller wrote:
>> Good point, I was misunderstanding how things work now and how
>> that would change with your proposal.
>>
>> Having multiple xfrm_dsts exist for an IPSEC route seems fine
>> to me.
> 
> Thanks for the confirmation.
> 
> Timo, let's roll along with the per-cpu xfrm_dst approach.

Okay.

- Timo


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-22 18:03                             ` Timo Teräs
@ 2010-03-23  7:28                               ` Timo Teräs
  2010-03-23  7:42                                 ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-23  7:28 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, David S. Miller

Timo Teräs wrote:
> Herbert Xu wrote:
>> On Sun, Mar 21, 2010 at 10:31:23AM +0200, Timo Teräs wrote:
>>>> Ok, we can do that to skip 2. But I think 1 would be still useful.
>>>> It'd probably be good to actually have flow_cache_ops pointer in
>>>> each entry instead of the atomic_t pointer.
>>>>
>>>> The reasoning:
>>>> - we can then have type-based checks that the reference count
>>>>  is valid (e.g. policy's refcount must not go to zero, it's bug,
>>>>  and we can call dst_release which warns if refcount goes to
>>>>  negative); imho it's hack to call atomic_dec instead of the
>>>>  real type's xxx_put
>>>> - the flow cache needs to somehow know if the entry is stale so
>>>>  it'll try to refresh it atomically; e.g. if there's no
>>>>  check for 'stale', the lookup returns stale xfrm_dst. we'd
>>>>  then need new api to update the stale entry, or flush it out
>>>>  and repeat the lookup. the virtual get could check for it being
>>>>  stale (if so release the entry) and then return null for the
>>>>  generic code to call the resolver atomically
>>>> - for paranoia we can actually check the type of the object in
>>>>  cache via the ops (if needed)
>>
>> The reason I'd prefer to keep the current scheme is to avoid
>> an additional indirect function call on each packet.
>>
>> The way it would work is (we need flow_cache_lookup to return
>> fle instead of the object):
>>
>>     fle = flow_cache_lookup
>>     xdst = fle->object
>>     if (xdst is stale) {
>>         flow_cache_mark_obsolete(fle)
>>         fle = flow_cache_lookup
>>         xdst = fle->object
>>         if (xdst is stale)
>>             return error
>>     }

I've been thinking more about doing this, and I think this
approach is fundamentally racy.

The underlying fle->object can be changed underneath us since
bh's are not disabled. This means the refcounting needs to be
done at the fle level, and additional measures are needed to
ensure that fle->object is as expected.

Additionally, the second flow_cache_lookup can happen on another
cpu, and could return a stale object that just needs refreshing
instead of generating an error.

Options:
1. return fle and fle->object separately, refcount both
2. call flow_cache_lookup with bh disabled
3. use flow_cache_entry_ops and virtualize put/get

Since 2 was previously said to bleed generic code into
xfrm_policy.c, it does not sound good.

On 1 vs 3, I'd still choose 3 since:
- adding one more refcount means more atomic calls,
  which have the same (or more) overhead as an indirect call
- two refcounts sound more complicated than one
- exposing the fle and touching it without bh's disabled
  requires a whole lot more extra care; doing
  3 is simpler

So, should I go ahead and do the virtualization of the
getters and putters?

- Timo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-23  7:28                               ` Timo Teräs
@ 2010-03-23  7:42                                 ` Herbert Xu
  2010-03-23  9:19                                   ` Timo Teräs
  0 siblings, 1 reply; 41+ messages in thread
From: Herbert Xu @ 2010-03-23  7:42 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev, David S. Miller

On Tue, Mar 23, 2010 at 09:28:33AM +0200, Timo Teräs wrote:
>
> So, should I go ahead and do the virtualization of the
> getters and putters?

If we're going to have indirect calls per packet then let's at
least minimise them.  I think one should be enough, i.e., just
add a ->stale() function pointer to be used in flow_cache_lookup.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-23  7:42                                 ` Herbert Xu
@ 2010-03-23  9:19                                   ` Timo Teräs
  2010-03-23  9:41                                     ` Herbert Xu
  0 siblings, 1 reply; 41+ messages in thread
From: Timo Teräs @ 2010-03-23  9:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, David S. Miller

Herbert Xu wrote:
> On Tue, Mar 23, 2010 at 09:28:33AM +0200, Timo Teräs wrote:
>> So, should I go ahead and do the virtualization of the
>> getters and putters?
> 
> If we're going to have indirect calls per packet then let's at
> least minimise them.  I think one should be enough, i.e., just
> add a ->stale() function pointer to be used in flow_cache_lookup.

Normally, only the getter is called per-packet. So I'm thinking
of having a get() that checks whether the entry is stale and,
if not, bumps the refcount (a sketch follows below).

The resolver can just swap out the old entry and call the
appropriate _put/_get, so we can avoid virtual calls there.
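
A rough sketch of such a get() for xfrm bundles, with ->object as in
the earlier sketch; bundle_is_stale() is a placeholder here (the real
check would be along the lines of xfrm_bundle_ok()):

    static void *flow_cache_xfrm_get(struct flow_cache_entry *fce)
    {
            struct xfrm_dst *xdst = fce->object;

            /* Stale or empty: make the generic code re-resolve. */
            if (!xdst || bundle_is_stale(xdst))
                    return NULL;
            dst_hold(&xdst->u.dst);  /* the per-packet refcount bump */
            return xdst;
    }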

Thinking more about the flushing of the flow cache: it's basically
only needed before the policies are garbage collected. The flush is
strictly required so that the plain atomic_dec() can never drop the
policy's refcount to zero, which would leak the object.

I'm now wondering if it'd be worth doing a virtual _put(). That way
we would not need any flushing in the policy garbage collector (we
could kill flow_cache_flush). We could also call dst_release
immediately in the flow cache's _put, which would release the memory
sooner. We then would not need any periodic xfrm_dst checker at all,
as the flow cache is regenerated regularly. (A sketch of such a
_put follows below.)
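
A sketch of what that virtual _put might look like (hypothetical
names again), releasing the dst as soon as the cache lets go of it:

    static void flow_cache_xfrm_put(struct flow_cache_entry *fce)
    {
            struct xfrm_dst *xdst = fce->object;

            fce->object = NULL;
            if (xdst)
                    dst_release(&xdst->u.dst);  /* memory freed right away */
    }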

The code paths that would have to call the virtual put are:
- randomization of the flow cache (currently every 10 mins),
  from flow_cache_lookup() with bh disabled
- the flow cache growing too full, from flow_cache_lookup()
  with bh disabled
- the cpu notifier

Do you think the virtual _put doing more work would be
too slow?

If so, the plain atomic_dec sounds ok, but we'd then need to
periodically walk the global bundle list for garbage
collection.

Cheers,
  Timo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] xfrm: cache bundle lookup results in flow cache
  2010-03-23  9:19                                   ` Timo Teräs
@ 2010-03-23  9:41                                     ` Herbert Xu
  0 siblings, 0 replies; 41+ messages in thread
From: Herbert Xu @ 2010-03-23  9:41 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev, David S. Miller

On Tue, Mar 23, 2010 at 11:19:05AM +0200, Timo Teräs wrote:
>
> Normally, only the getter is called per-packet. So I'm thinking
> of having a get() that checks whether the entry is stale and,
> if not, bumps the refcount (a sketch follows below).

OK if it's just one call per packet it sounds good to me.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2010-03-23 10:13 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-03-15 12:20 [PATCH] xfrm: cache bundle lookup results in flow cache Timo Teras
2010-03-17 13:07 ` Herbert Xu
2010-03-17 14:16   ` Timo Teräs
2010-03-17 14:58     ` Herbert Xu
2010-03-17 15:56       ` Timo Teräs
2010-03-17 16:32         ` Timo Teräs
2010-03-18 19:30         ` Timo Teräs
2010-03-19  0:31           ` Herbert Xu
2010-03-19  5:48             ` Timo Teräs
2010-03-19  6:03               ` Herbert Xu
2010-03-19  6:21                 ` Timo Teräs
2010-03-19  7:17                   ` Herbert Xu
2010-03-19  7:27                   ` Timo Teräs
2010-03-19  0:32           ` Herbert Xu
2010-03-19  7:20 ` Herbert Xu
2010-03-19  7:48   ` Timo Teräs
2010-03-19  8:29     ` Herbert Xu
2010-03-19  8:37       ` Timo Teräs
2010-03-19  8:47         ` Herbert Xu
2010-03-19  9:12           ` Timo Teräs
2010-03-19  9:32             ` Herbert Xu
2010-03-19  9:53               ` Timo Teräs
2010-03-20 15:17                 ` Herbert Xu
2010-03-20 16:26                   ` Timo Teräs
2010-03-21  0:46                     ` Herbert Xu
2010-03-21  7:34                       ` Timo Teräs
2010-03-21  8:31                         ` Timo Teräs
2010-03-22  3:52                           ` Herbert Xu
2010-03-22 18:03                             ` Timo Teräs
2010-03-23  7:28                               ` Timo Teräs
2010-03-23  7:42                                 ` Herbert Xu
2010-03-23  9:19                                   ` Timo Teräs
2010-03-23  9:41                                     ` Herbert Xu
2010-03-22  1:26                   ` David Miller
2010-03-22  1:28                   ` David Miller
2010-03-22  1:32                     ` Herbert Xu
2010-03-22  1:36                       ` David Miller
2010-03-22  1:40                         ` Herbert Xu
2010-03-22  3:12                           ` David Miller
2010-03-22  3:52                             ` Herbert Xu
2010-03-22 18:31                               ` Timo Teräs
