* [PATCH 0/2]
@ 2009-06-04 11:07 Pablo Neira Ayuso
  2009-06-04 11:08 ` [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure Pablo Neira Ayuso
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-04 11:07 UTC (permalink / raw)
  To: netfilter-devel; +Cc: kaber

Hi Patrick,

The first patch here reworks the conntrack event cache to use the
extension infrastructure, so that there is an event cache per
conntrack. This is used by the second patch, which aims to improve
ctnetlink reliability.

Please have a look at the patch descriptions for more details.
If you like them, you can pull them from:

git://1984.lsi.us.es/nf-next-2.6 master

Looking forward to your comments!

---

Pablo Neira Ayuso (2):
      netfilter: conntrack: optional reliable conntrack event delivery
      netfilter: conntrack: move event cache to conntrack extension infrastructure


 include/net/netfilter/nf_conntrack.h        |    2 
 include/net/netfilter/nf_conntrack_ecache.h |  133 +++++++++--------
 include/net/netfilter/nf_conntrack_extend.h |    2 
 include/net/netfilter/nf_conntrack_helper.h |    2 
 include/net/netns/conntrack.h               |    7 +
 net/netfilter/nf_conntrack_core.c           |  106 ++++++++++---
 net/netfilter/nf_conntrack_ecache.c         |  216 ++++++++++++++++++---------
 net/netfilter/nf_conntrack_helper.c         |   15 ++
 net/netfilter/nf_conntrack_netlink.c        |   94 +++++++-----
 9 files changed, 379 insertions(+), 198 deletions(-)



* [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure
  2009-06-04 11:07 [PATCH 0/2] Pablo Neira Ayuso
@ 2009-06-04 11:08 ` Pablo Neira Ayuso
  2009-06-04 12:16   ` Pablo Neira Ayuso
  2009-06-05 11:04   ` Patrick McHardy
  2009-06-04 11:08 ` [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery Pablo Neira Ayuso
  2009-06-04 11:17 ` [PATCH 0/2] reliable per-conntrack event cache Pablo Neira Ayuso
  2 siblings, 2 replies; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-04 11:08 UTC (permalink / raw)
  To: netfilter-devel; +Cc: kaber

This patch reworks the event caching infrastructure to use the
conntrack extension infrastructure. As a result, you can enable and
disable event delivery at runtime via
/proc/sys/net/netfilter/nf_conntrack_events, as opposed to at
compilation time. The main drawback is that we consume more memory
per conntrack if event delivery is enabled. This patch is required
by the reliable event delivery patch that follows.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---

 include/net/netfilter/nf_conntrack_ecache.h |  125 +++++++++--------
 include/net/netfilter/nf_conntrack_extend.h |    2 
 include/net/netns/conntrack.h               |    5 -
 net/netfilter/nf_conntrack_core.c           |   17 +-
 net/netfilter/nf_conntrack_ecache.c         |  199 +++++++++++++++++----------
 net/netfilter/nf_conntrack_netlink.c        |   74 ++++++----
 6 files changed, 249 insertions(+), 173 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 1afb907..2fb019a 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -6,59 +6,50 @@
 #define _NF_CONNTRACK_ECACHE_H
 #include <net/netfilter/nf_conntrack.h>
 
-#include <linux/interrupt.h>
 #include <net/net_namespace.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
+#include <net/netfilter/nf_conntrack_extend.h>
 
-/* Connection tracking event bits */
+/* Connection tracking event types */
 enum ip_conntrack_events
 {
-	/* New conntrack */
-	IPCT_NEW_BIT = 0,
-	IPCT_NEW = (1 << IPCT_NEW_BIT),
-
-	/* Expected connection */
-	IPCT_RELATED_BIT = 1,
-	IPCT_RELATED = (1 << IPCT_RELATED_BIT),
-
-	/* Destroyed conntrack */
-	IPCT_DESTROY_BIT = 2,
-	IPCT_DESTROY = (1 << IPCT_DESTROY_BIT),
-
-	/* Status has changed */
-	IPCT_STATUS_BIT = 3,
-	IPCT_STATUS = (1 << IPCT_STATUS_BIT),
-
-	/* Update of protocol info */
-	IPCT_PROTOINFO_BIT = 4,
-	IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT),
-
-	/* New helper for conntrack */
-	IPCT_HELPER_BIT = 5,
-	IPCT_HELPER = (1 << IPCT_HELPER_BIT),
-
-	/* Mark is set */
-	IPCT_MARK_BIT = 6,
-	IPCT_MARK = (1 << IPCT_MARK_BIT),
-
-	/* NAT sequence adjustment */
-	IPCT_NATSEQADJ_BIT = 7,
-	IPCT_NATSEQADJ = (1 << IPCT_NATSEQADJ_BIT),
-
-	/* Secmark is set */
-	IPCT_SECMARK_BIT = 8,
-	IPCT_SECMARK = (1 << IPCT_SECMARK_BIT),
+	IPCT_NEW		= 0,	/* new conntrack */
+	IPCT_RELATED		= 1,	/* related conntrack */
+	IPCT_DESTROY		= 2,	/* destroyed conntrack */
+	IPCT_STATUS		= 3,	/* status has changed */
+	IPCT_PROTOINFO		= 4,	/* protocol information has changed */
+	IPCT_HELPER		= 5,	/* new helper has been set */
+	IPCT_MARK		= 6,	/* new mark has been set */
+	IPCT_NATSEQADJ		= 7,	/* NAT is doing sequence adjustment */
+	IPCT_SECMARK		= 8,	/* new security mark has been set */
 };
 
 enum ip_conntrack_expect_events {
-	IPEXP_NEW_BIT = 0,
-	IPEXP_NEW = (1 << IPEXP_NEW_BIT),
+	IPEXP_NEW		= 0,	/* new expectation */
 };
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 struct nf_conntrack_ecache {
-	struct nf_conn *ct;
-	unsigned int events;
+	unsigned long cache;	/* bitops want long */
+};
+
+static inline struct nf_conntrack_ecache *
+nf_ct_ecache_find(const struct nf_conn *ct)
+{
+	return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE);
+}
+
+static inline struct nf_conntrack_ecache *
+nf_ct_ecache_ext_add(struct nf_conn *ct, gfp_t gfp)
+{
+	struct net *net = nf_ct_net(ct);
+
+	if (!net->ct.sysctl_events)
+		return NULL;
+
+	return nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
 };
 
 /* This structure is passed to event handler */
@@ -76,22 +67,32 @@ extern struct nf_ct_event_notifier *nf_conntrack_event_cb;
 extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb);
 extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb);
 
-extern void nf_ct_deliver_cached_events(const struct nf_conn *ct);
-extern void __nf_ct_event_cache_init(struct nf_conn *ct);
-extern void nf_ct_event_cache_flush(struct net *net);
+extern void nf_ct_deliver_cached_events_report(struct nf_conn *ct, u32 pid, int report);
+
+static inline void nf_ct_deliver_cached_events(struct nf_conn *ct)
+{
+	nf_ct_deliver_cached_events_report(ct, 0, 0);
+}
 
 static inline void
 nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
 {
-	struct net *net = nf_ct_net(ct);
-	struct nf_conntrack_ecache *ecache;
-
-	local_bh_disable();
-	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
-	if (ct != ecache->ct)
-		__nf_ct_event_cache_init(ct);
-	ecache->events |= event;
-	local_bh_enable();
+	struct nf_conntrack_ecache *e;
+	struct nf_ct_event_notifier *notify;
+
+	rcu_read_lock();
+	notify = rcu_dereference(nf_conntrack_event_cb);
+	if (notify == NULL) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+
+	e = nf_ct_ecache_find(ct);
+	if (e == NULL)
+		return;
+
+	set_bit(event, &e->cache);
 }
 
 static inline void
@@ -100,6 +101,7 @@ nf_conntrack_event_report(enum ip_conntrack_events event,
 			  u32 pid,
 			  int report)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_ct_event_notifier *notify;
 
 	rcu_read_lock();
@@ -107,13 +109,16 @@ nf_conntrack_event_report(enum ip_conntrack_events event,
 	if (notify == NULL)
 		goto out_unlock;
 
+	if (!net->ct.sysctl_events)
+		goto out_unlock;
+
 	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
 		struct nf_ct_event item = {
 			.ct 	= ct,
 			.pid	= pid,
 			.report = report
 		};
-		notify->fcn(event, &item);
+		notify->fcn((1 << event), &item);
 	}
 out_unlock:
 	rcu_read_unlock();
@@ -145,6 +150,7 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
 			  u32 pid,
 			  int report)
 {
+	struct net *net = nf_ct_exp_net(exp);
 	struct nf_exp_event_notifier *notify;
 
 	rcu_read_lock();
@@ -152,13 +158,16 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
 	if (notify == NULL)
 		goto out_unlock;
 
+	if (!net->ct.sysctl_events)
+		goto out_unlock;
+
 	{
 		struct nf_exp_event item = {
 			.exp	= exp,
 			.pid	= pid,
 			.report = report
 		};
-		notify->fcn(event, &item);
+		notify->fcn((1 << event), &item);
 	}
 out_unlock:
 	rcu_read_unlock();
@@ -191,12 +200,6 @@ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e,
 					     struct nf_conntrack_expect *exp,
  					     u32 pid,
  					     int report) {}
-static inline void nf_ct_event_cache_flush(struct net *net) {}
-
-static inline int nf_conntrack_ecache_init(struct net *net)
-{
-	return 0;
-}
 
 static inline void nf_conntrack_ecache_fini(struct net *net)
 {
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index da8ee52..7f8fc5d 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -8,12 +8,14 @@ enum nf_ct_ext_id
 	NF_CT_EXT_HELPER,
 	NF_CT_EXT_NAT,
 	NF_CT_EXT_ACCT,
+	NF_CT_EXT_ECACHE,
 	NF_CT_EXT_NUM,
 };
 
 #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
 #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
 #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
+#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 9dc5840..505a51c 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -15,15 +15,14 @@ struct netns_ct {
 	struct hlist_head	*expect_hash;
 	struct hlist_nulls_head	unconfirmed;
 	struct ip_conntrack_stat *stat;
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-	struct nf_conntrack_ecache *ecache;
-#endif
+	int			sysctl_events;
 	int			sysctl_acct;
 	int			sysctl_checksum;
 	unsigned int		sysctl_log_invalid; /* Log invalid packets */
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*sysctl_header;
 	struct ctl_table_header	*acct_sysctl_header;
+	struct ctl_table_header	*event_sysctl_header;
 #endif
 	int			hash_vmalloc;
 	int			expect_vmalloc;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index b54c234..e8905a9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -576,6 +576,7 @@ init_conntrack(struct net *net,
 	}
 
 	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
 
 	spin_lock_bh(&nf_conntrack_lock);
 	exp = nf_ct_find_expectation(net, tuple);
@@ -738,6 +739,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 	NF_CT_ASSERT(skb->nfct);
 
+	/* We may have pending events, deliver them and clear the cache */
+	nf_ct_deliver_cached_events(ct);
+
 	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
 	if (ret <= 0) {
 		/* Invalid: inverse of the return code tells
@@ -1035,8 +1039,6 @@ static void nf_conntrack_cleanup_init_net(void)
 
 static void nf_conntrack_cleanup_net(struct net *net)
 {
-	nf_ct_event_cache_flush(net);
-	nf_conntrack_ecache_fini(net);
  i_see_dead_people:
 	nf_ct_iterate_cleanup(net, kill_all, NULL);
 	if (atomic_read(&net->ct.count) != 0) {
@@ -1049,6 +1051,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
 
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
+	nf_conntrack_ecache_fini(net);
 	nf_conntrack_acct_fini(net);
 	nf_conntrack_expect_fini(net);
 	free_percpu(net->ct.stat);
@@ -1224,9 +1227,6 @@ static int nf_conntrack_init_net(struct net *net)
 		ret = -ENOMEM;
 		goto err_stat;
 	}
-	ret = nf_conntrack_ecache_init(net);
-	if (ret < 0)
-		goto err_ecache;
 	net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
 					     &net->ct.hash_vmalloc, 1);
 	if (!net->ct.hash) {
@@ -1240,6 +1240,9 @@ static int nf_conntrack_init_net(struct net *net)
 	ret = nf_conntrack_acct_init(net);
 	if (ret < 0)
 		goto err_acct;
+	ret = nf_conntrack_ecache_init(net);
+	if (ret < 0)
+		goto err_ecache;
 
 	/* Set up fake conntrack:
 	    - to never be deleted, not in any hashes */
@@ -1252,14 +1255,14 @@ static int nf_conntrack_init_net(struct net *net)
 
 	return 0;
 
+err_ecache:
+	nf_conntrack_acct_fini(net);
 err_acct:
 	nf_conntrack_expect_fini(net);
 err_expect:
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
 			     nf_conntrack_htable_size);
 err_hash:
-	nf_conntrack_ecache_fini(net);
-err_ecache:
 	free_percpu(net->ct.stat);
 err_stat:
 	return ret;
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 5516b3e..3fe2682 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -1,7 +1,7 @@
 /* Event cache for netfilter. */
 
 /* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2009 Netfilter Core Team <coreteam@netfilter.org>
  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -21,6 +21,7 @@
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
 
 static DEFINE_MUTEX(nf_ct_ecache_mutex);
 
@@ -32,93 +33,35 @@ EXPORT_SYMBOL_GPL(nf_expect_event_cb);
 
 /* deliver cached events and clear cache entry - must be called with locally
  * disabled softirqs */
-static inline void
-__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
+void nf_ct_deliver_cached_events_report(struct nf_conn *ct, u32 pid, int report)
 {
 	struct nf_ct_event_notifier *notify;
+	struct nf_conntrack_ecache *e;
 
 	rcu_read_lock();
 	notify = rcu_dereference(nf_conntrack_event_cb);
 	if (notify == NULL)
 		goto out_unlock;
 
-	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
-	    && ecache->events) {
+	e = nf_ct_ecache_find(ct);
+	if (e == NULL)
+		goto out_unlock;
+
+	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && e->cache) {
 		struct nf_ct_event item = {
-			.ct 	= ecache->ct,
-			.pid	= 0,
-			.report	= 0
+			.ct	= ct,
+			.pid	= pid,
+			.report	= report
 		};
 
-		notify->fcn(ecache->events, &item);
+		notify->fcn(e->cache, &item);
 	}
-
-	ecache->events = 0;
-	nf_ct_put(ecache->ct);
-	ecache->ct = NULL;
+	xchg(&e->cache, 0);
 
 out_unlock:
 	rcu_read_unlock();
 }
-
-/* Deliver all cached events for a particular conntrack. This is called
- * by code prior to async packet handling for freeing the skb */
-void nf_ct_deliver_cached_events(const struct nf_conn *ct)
-{
-	struct net *net = nf_ct_net(ct);
-	struct nf_conntrack_ecache *ecache;
-
-	local_bh_disable();
-	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
-	if (ecache->ct == ct)
-		__nf_ct_deliver_cached_events(ecache);
-	local_bh_enable();
-}
-EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
-
-/* Deliver cached events for old pending events, if current conntrack != old */
-void __nf_ct_event_cache_init(struct nf_conn *ct)
-{
-	struct net *net = nf_ct_net(ct);
-	struct nf_conntrack_ecache *ecache;
-
-	/* take care of delivering potentially old events */
-	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
-	BUG_ON(ecache->ct == ct);
-	if (ecache->ct)
-		__nf_ct_deliver_cached_events(ecache);
-	/* initialize for this conntrack/packet */
-	ecache->ct = ct;
-	nf_conntrack_get(&ct->ct_general);
-}
-EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init);
-
-/* flush the event cache - touches other CPU's data and must not be called
- * while packets are still passing through the code */
-void nf_ct_event_cache_flush(struct net *net)
-{
-	struct nf_conntrack_ecache *ecache;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		ecache = per_cpu_ptr(net->ct.ecache, cpu);
-		if (ecache->ct)
-			nf_ct_put(ecache->ct);
-	}
-}
-
-int nf_conntrack_ecache_init(struct net *net)
-{
-	net->ct.ecache = alloc_percpu(struct nf_conntrack_ecache);
-	if (!net->ct.ecache)
-		return -ENOMEM;
-	return 0;
-}
-
-void nf_conntrack_ecache_fini(struct net *net)
-{
-	free_percpu(net->ct.ecache);
-}
+EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events_report);
 
 int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
 {
@@ -185,3 +128,115 @@ void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+#define NF_CT_EVENTS_DEFAULT 1
+#else
+#define NF_CT_EVENTS_DEFAULT 0
+#endif
+
+static int nf_ct_events_switch __read_mostly = NF_CT_EVENTS_DEFAULT;
+
+module_param_named(event, nf_ct_events_switch, bool, 0644);
+MODULE_PARM_DESC(event, "Enable connection tracking event delivery");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table event_sysctl_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nf_conntrack_events",
+		.data		= &init_net.ct.sysctl_events,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type event_extend __read_mostly = {
+	.len	= sizeof(struct nf_conntrack_ecache),
+	.align	= __alignof__(struct nf_conntrack_ecache),
+	.id	= NF_CT_EXT_ECACHE,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	table[0].data = &net->ct.sysctl_events;
+
+	net->ct.event_sysctl_header =
+		register_net_sysctl_table(net,
+					  nf_net_netfilter_sysctl_path, table);
+	if (!net->ct.event_sysctl_header) {
+		printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
+		goto out_register;
+	}
+	return 0;
+
+out_register:
+	kfree(table);
+out:
+	return -ENOMEM;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ct.event_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.event_sysctl_header);
+	kfree(table);
+}
+#else
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_ecache_init(struct net *net)
+{
+	int ret;
+
+	net->ct.sysctl_events = nf_ct_events_switch;
+
+	if (net_eq(net, &init_net)) {
+		ret = nf_ct_extend_register(&event_extend);
+		if (ret < 0) {
+			printk(KERN_ERR "nf_ct_event: Unable to register "
+					"event extension.\n");
+			goto out_extend_register;
+		}
+	}
+
+	ret = nf_conntrack_event_init_sysctl(net);
+	if (ret < 0)
+		goto out_sysctl;
+
+	return 0;
+
+out_sysctl:
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&event_extend);
+out_extend_register:
+	return ret;
+}
+
+void nf_conntrack_ecache_fini(struct net *net)
+{
+	nf_conntrack_event_fini_sysctl(net);
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&event_extend);
+}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4448b06..2dd2910 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -468,10 +468,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	if (ct == &nf_conntrack_untracked)
 		return 0;
 
-	if (events & IPCT_DESTROY) {
+	if (events & (1 << IPCT_DESTROY)) {
 		type = IPCTNL_MSG_CT_DELETE;
 		group = NFNLGRP_CONNTRACK_DESTROY;
-	} else  if (events & (IPCT_NEW | IPCT_RELATED)) {
+	} else  if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
 		type = IPCTNL_MSG_CT_NEW;
 		flags = NLM_F_CREATE|NLM_F_EXCL;
 		group = NFNLGRP_CONNTRACK_NEW;
@@ -519,7 +519,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	if (ctnetlink_dump_status(skb, ct) < 0)
 		goto nla_put_failure;
 
-	if (events & IPCT_DESTROY) {
+	if (events & (1 << IPCT_DESTROY)) {
 		if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
 		    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
 			goto nla_put_failure;
@@ -527,31 +527,31 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 		if (ctnetlink_dump_timeout(skb, ct) < 0)
 			goto nla_put_failure;
 
-		if (events & IPCT_PROTOINFO
+		if (events & (1 << IPCT_PROTOINFO)
 		    && ctnetlink_dump_protoinfo(skb, ct) < 0)
 			goto nla_put_failure;
 
-		if ((events & IPCT_HELPER || nfct_help(ct))
+		if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
 		    && ctnetlink_dump_helpinfo(skb, ct) < 0)
 			goto nla_put_failure;
 
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
-		if ((events & IPCT_SECMARK || ct->secmark)
+		if ((events & (1 << IPCT_SECMARK) || ct->secmark)
 		    && ctnetlink_dump_secmark(skb, ct) < 0)
 			goto nla_put_failure;
 #endif
 
-		if (events & IPCT_RELATED &&
+		if (events & (1 << IPCT_RELATED) &&
 		    ctnetlink_dump_master(skb, ct) < 0)
 			goto nla_put_failure;
 
-		if (events & IPCT_NATSEQADJ &&
+		if (events & (1 << IPCT_NATSEQADJ) &&
 		    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
 			goto nla_put_failure;
 	}
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
-	if ((events & IPCT_MARK || ct->mark)
+	if ((events & (1 << IPCT_MARK) || ct->mark)
 	    && ctnetlink_dump_mark(skb, ct) < 0)
 		goto nla_put_failure;
 #endif
@@ -1123,6 +1123,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[])
 		err = ctnetlink_change_helper(ct, cda);
 		if (err < 0)
 			return err;
+
+		nf_conntrack_event_cache(IPCT_HELPER, ct);
 	}
 
 	if (cda[CTA_TIMEOUT]) {
@@ -1135,17 +1137,23 @@ ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[])
 		err = ctnetlink_change_status(ct, cda);
 		if (err < 0)
 			return err;
+
+		nf_conntrack_event_cache(IPCT_STATUS, ct);
 	}
 
 	if (cda[CTA_PROTOINFO]) {
 		err = ctnetlink_change_protoinfo(ct, cda);
 		if (err < 0)
 			return err;
+
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 	}
 
 #if defined(CONFIG_NF_CONNTRACK_MARK)
-	if (cda[CTA_MARK])
+	if (cda[CTA_MARK]) {
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+		nf_conntrack_event_cache(IPCT_MARK, ct);
+	}
 #endif
 
 #ifdef CONFIG_NF_NAT_NEEDED
@@ -1153,6 +1161,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[])
 		err = ctnetlink_change_nat_seq_adj(ct, cda);
 		if (err < 0)
 			return err;
+
+		nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
 	}
 #endif
 
@@ -1179,6 +1189,7 @@ ctnetlink_create_conntrack(struct nlattr *cda[],
 
 	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
 	ct->status |= IPS_CONFIRMED;
+	nf_conntrack_event_cache(IPCT_STATUS, ct);
 
 	rcu_read_lock();
  	if (cda[CTA_HELP]) {
@@ -1218,12 +1229,17 @@ ctnetlink_create_conntrack(struct nlattr *cda[],
 
 			/* not in hash table yet so not strictly necessary */
 			rcu_assign_pointer(help->helper, helper);
+			nf_conntrack_event_cache(IPCT_HELPER, ct);
 		}
 	} else {
 		/* try an implicit helper assignation */
 		err = __nf_ct_try_assign_helper(ct, GFP_ATOMIC);
 		if (err < 0)
 			goto err2;
+
+		/* we've got a helper, add this to the event cache */
+		if (nfct_help(ct))
+			nf_conntrack_event_cache(IPCT_HELPER, ct);
 	}
 
 	if (cda[CTA_STATUS]) {
@@ -1243,6 +1259,8 @@ ctnetlink_create_conntrack(struct nlattr *cda[],
 		err = ctnetlink_change_nat_seq_adj(ct, cda);
 		if (err < 0)
 			goto err2;
+
+		nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
 	}
 #endif
 
@@ -1250,13 +1268,18 @@ ctnetlink_create_conntrack(struct nlattr *cda[],
 		err = ctnetlink_change_protoinfo(ct, cda);
 		if (err < 0)
 			goto err2;
+
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 	}
 
 	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
 
 #if defined(CONFIG_NF_CONNTRACK_MARK)
-	if (cda[CTA_MARK])
+	if (cda[CTA_MARK]) {
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+		nf_conntrack_event_cache(IPCT_MARK, ct);
+	}
 #endif
 
 	/* setup master conntrack: this is a confirmed expectation */
@@ -1324,7 +1347,6 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		err = -ENOENT;
 		if (nlh->nlmsg_flags & NLM_F_CREATE) {
 			struct nf_conn *ct;
-			enum ip_conntrack_events events;
 
 			ct = ctnetlink_create_conntrack(cda, &otuple,
 							&rtuple, u3);
@@ -1336,17 +1358,13 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 			nf_conntrack_get(&ct->ct_general);
 			spin_unlock_bh(&nf_conntrack_lock);
 			if (test_bit(IPS_EXPECTED_BIT, &ct->status))
-				events = IPCT_RELATED;
+				nf_conntrack_event_cache(IPCT_RELATED, ct);
 			else
-				events = IPCT_NEW;
-
-			nf_conntrack_event_report(IPCT_STATUS |
-						  IPCT_HELPER |
-						  IPCT_PROTOINFO |
-						  IPCT_NATSEQADJ |
-						  IPCT_MARK | events,
-						  ct, NETLINK_CB(skb).pid,
-						  nlmsg_report(nlh));
+				nf_conntrack_event_cache(IPCT_NEW, ct);
+
+			nf_ct_deliver_cached_events_report(ct,
+							   NETLINK_CB(skb).pid,
+							   nlmsg_report(nlh));
 			nf_ct_put(ct);
 		} else
 			spin_unlock_bh(&nf_conntrack_lock);
@@ -1365,13 +1383,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		if (err == 0) {
 			nf_conntrack_get(&ct->ct_general);
 			spin_unlock_bh(&nf_conntrack_lock);
-			nf_conntrack_event_report(IPCT_STATUS |
-						  IPCT_HELPER |
-						  IPCT_PROTOINFO |
-						  IPCT_NATSEQADJ |
-						  IPCT_MARK,
-						  ct, NETLINK_CB(skb).pid,
-						  nlmsg_report(nlh));
+			nf_ct_deliver_cached_events_report(ct,
+							   NETLINK_CB(skb).pid,
+							   nlmsg_report(nlh));
 			nf_ct_put(ct);
 		} else
 			spin_unlock_bh(&nf_conntrack_lock);
@@ -1515,7 +1529,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	unsigned int type;
 	int flags = 0;
 
-	if (events & IPEXP_NEW) {
+	if (events & (1 << IPEXP_NEW)) {
 		type = IPCTNL_MSG_EXP_NEW;
 		flags = NLM_F_CREATE|NLM_F_EXCL;
 	} else



* [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-04 11:07 [PATCH 0/2] Pablo Neira Ayuso
  2009-06-04 11:08 ` [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure Pablo Neira Ayuso
@ 2009-06-04 11:08 ` Pablo Neira Ayuso
  2009-06-05 14:37   ` Patrick McHardy
  2009-06-04 11:17 ` [PATCH 0/2] reliable per-conntrack event cache Pablo Neira Ayuso
  2 siblings, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-04 11:08 UTC (permalink / raw)
  To: netfilter-devel; +Cc: kaber

This patch improves ctnetlink event reliability if one broadcast
listener has set the NETLINK_BROADCAST_ERROR socket option.

The logic is the following: if the event delivery fails, we keep
the undelivered events in the per-conntrack event cache. Thus, once
the next packet arrives, we trigger another event delivery in
nf_conntrack_in(). If things don't go well in this second try,
we accumulate the pending events in the cache but we try to
deliver the current state as soon as possible. Therefore, we may
lose state transitions, but the userspace process gets back in
sync at some point.
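
In short, the delivery path in nf_ct_deliver_cached_events_report()
behaves roughly like this (a simplified sketch; locking and the
confirmed/dying checks are omitted):

	e = nf_ct_ecache_find(ct);
	if (e != NULL && e->cache) {
		/* try to deliver all pending events in one go */
		if (notify->fcn(e->cache, &item) == 0)
			e->cache = 0;	/* delivered, clear the cache */
		/* on failure, keep the bits; the next packet retries */
	}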

In the worst case, if no events were delivered to userspace, we
make sure that destroy events are successfully delivered. This
works as follows: if ctnetlink fails to deliver the destroy event,
we remove the conntrack entry from the hashes and insert it into
the dying list, which contains inactive entries. Then, the
conntrack timer is re-armed with an extra grace timeout of
random32() % 15 seconds to trigger the event again (this grace
timeout is tunable via /proc). The use of a limited random timeout
value spreads out the "destroy" resends, thus avoiding the
accumulation of lots of "destroy" events at the same time.

The maximum number of conntrack entries (active or inactive) is
still handled by nf_conntrack_max. Thus, we may start dropping
packets at some point if we accumulate a lot of inactive conntrack
entries that are waiting to deliver the destroy event to userspace.
During my stress tests, which consisted of setting a very small
buffer of 2048 bytes for conntrackd, setting the
NETLINK_BROADCAST_ERROR socket flag, and generating lots of very
small connections, I noticed very few destroy entries on the fly
waiting to be resent.

A simple way to test this patch consists of creating a lot of
entries, setting a very small Netlink buffer in conntrackd (+ a
patch which is not in the git tree to set the BROADCAST_ERROR flag)
and invoking `conntrack -F'.

For expectations, no changes are introduced in this patch.
Currently, event delivery is only done for new expectations (no
events for expectation removal and confirmation) and, apart from
the conntrack command line tool, I don't see any client that may
benefit from reliable expectation event delivery; we would have to
introduce confirm and destroy events first.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---

 include/net/netfilter/nf_conntrack.h        |    2 +
 include/net/netfilter/nf_conntrack_ecache.h |   10 ++-
 include/net/netfilter/nf_conntrack_helper.h |    2 +
 include/net/netns/conntrack.h               |    2 +
 net/netfilter/nf_conntrack_core.c           |   89 ++++++++++++++++++++++-----
 net/netfilter/nf_conntrack_ecache.c         |   21 ++++++
 net/netfilter/nf_conntrack_helper.c         |   15 +++++
 net/netfilter/nf_conntrack_netlink.c        |   20 ++++--
 8 files changed, 133 insertions(+), 28 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 2ba36dd..8671635 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -293,6 +293,8 @@ extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
 extern unsigned int nf_conntrack_htable_size;
 extern unsigned int nf_conntrack_max;
 
+extern void nf_ct_setup_event_timer(struct nf_conn *ct);
+
 #define NF_CT_STAT_INC(net, count)	\
 	(per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++)
 #define NF_CT_STAT_INC_ATOMIC(net, count)		\
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 2fb019a..a37929d 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -95,12 +95,13 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
 	set_bit(event, &e->cache);
 }
 
-static inline void
+static inline int
 nf_conntrack_event_report(enum ip_conntrack_events event,
 			  struct nf_conn *ct,
 			  u32 pid,
 			  int report)
 {
+	int ret = 0;
 	struct net *net = nf_ct_net(ct);
 	struct nf_ct_event_notifier *notify;
 
@@ -118,16 +119,17 @@ nf_conntrack_event_report(enum ip_conntrack_events event,
 			.pid	= pid,
 			.report = report
 		};
-		notify->fcn((1 << event), &item);
+		ret = notify->fcn((1 << event), &item);
 	}
 out_unlock:
 	rcu_read_unlock();
+	return ret;
 }
 
-static inline void
+static inline int
 nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
 {
-	nf_conntrack_event_report(event, ct, 0, 0);
+	return nf_conntrack_event_report(event, ct, 0, 0);
 }
 
 struct nf_exp_event {
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ee2a4b3..1b70680 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -50,6 +50,8 @@ extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp);
 
 extern int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags);
 
+extern void nf_ct_helper_destroy(struct nf_conn *ct);
+
 static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct)
 {
 	return nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 505a51c..ba1ba0c 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -14,8 +14,10 @@ struct netns_ct {
 	struct hlist_nulls_head	*hash;
 	struct hlist_head	*expect_hash;
 	struct hlist_nulls_head	unconfirmed;
+	struct hlist_nulls_head	dying;
 	struct ip_conntrack_stat *stat;
 	int			sysctl_events;
+	unsigned int		sysctl_events_retry_timeout;
 	int			sysctl_acct;
 	int			sysctl_checksum;
 	unsigned int		sysctl_log_invalid; /* Log invalid packets */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e8905a9..deb7736 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -182,10 +182,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
-	if (!test_bit(IPS_DYING_BIT, &ct->status))
-		nf_conntrack_event(IPCT_DESTROY, ct);
-	set_bit(IPS_DYING_BIT, &ct->status);
-
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to nf_conntrack_lock!!! -HW */
@@ -219,20 +215,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	nf_conntrack_free(ct);
 }
 
-static void death_by_timeout(unsigned long ul_conntrack)
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
-	struct nf_conn *ct = (void *)ul_conntrack;
 	struct net *net = nf_ct_net(ct);
-	struct nf_conn_help *help = nfct_help(ct);
-	struct nf_conntrack_helper *helper;
-
-	if (help) {
-		rcu_read_lock();
-		helper = rcu_dereference(help->helper);
-		if (helper && helper->destroy)
-			helper->destroy(ct);
-		rcu_read_unlock();
-	}
 
 	spin_lock_bh(&nf_conntrack_lock);
 	/* Inside lock so preempt is disabled on module removal path.
@@ -240,6 +225,60 @@ static void death_by_timeout(unsigned long ul_conntrack)
 	NF_CT_STAT_INC(net, delete_list);
 	clean_from_lists(ct);
 	spin_unlock_bh(&nf_conntrack_lock);
+}
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;
+	struct net *net = nf_ct_net(ct);
+
+	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+		/* bad luck, let's retry again */
+		ct->timeout.expires = jiffies +
+			(random32() % net->ct.sysctl_events_retry_timeout);
+		add_timer(&ct->timeout);
+		return;
+	}
+	/* we've got the event delivered, now it's dying */
+	set_bit(IPS_DYING_BIT, &ct->status);
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+	spin_unlock_bh(&nf_conntrack_lock);
+	nf_ct_helper_destroy(ct);
+	nf_ct_put(ct);
+}
+
+void nf_ct_setup_event_timer(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+
+	nf_ct_delete_from_lists(ct);
+	/* add this conntrack to the dying list */
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+				 &net->ct.dying);
+	/* set a new timer to retry event delivery */
+	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+	ct->timeout.expires = jiffies +
+		(random32() % net->ct.sysctl_events_retry_timeout);
+	add_timer(&ct->timeout);
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_setup_event_timer);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;
+
+	if (!test_bit(IPS_DYING_BIT, &ct->status) && 
+	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+		/* destroy event was not delivered */
+		nf_ct_setup_event_timer(ct);
+		return;
+	}
+	set_bit(IPS_DYING_BIT, &ct->status);
+	nf_ct_helper_destroy(ct);
+	nf_ct_delete_from_lists(ct);
 	nf_ct_put(ct);
 }
 
@@ -1030,6 +1069,22 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
+static void nf_ct_release_dying_list(void)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct hlist_nulls_node *n;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		/* never fails to remove them, no listeners at this point */
+		if (del_timer(&ct->timeout))
+			ct->timeout.function((unsigned long)ct);
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+
 static void nf_conntrack_cleanup_init_net(void)
 {
 	nf_conntrack_helper_fini();
@@ -1041,6 +1096,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
 {
  i_see_dead_people:
 	nf_ct_iterate_cleanup(net, kill_all, NULL);
+	nf_ct_release_dying_list();
 	if (atomic_read(&net->ct.count) != 0) {
 		schedule();
 		goto i_see_dead_people;
@@ -1222,6 +1278,7 @@ static int nf_conntrack_init_net(struct net *net)
 
 	atomic_set(&net->ct.count, 0);
 	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0);
+	INIT_HLIST_NULLS_HEAD(&net->ct.dying, 0);
 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
 	if (!net->ct.stat) {
 		ret = -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 3fe2682..ee5d3cc 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -37,6 +37,7 @@ void nf_ct_deliver_cached_events_report(struct nf_conn *ct, u32 pid, int report)
 {
 	struct nf_ct_event_notifier *notify;
 	struct nf_conntrack_ecache *e;
+	int ret = 0, delivered = 0;
 
 	rcu_read_lock();
 	notify = rcu_dereference(nf_conntrack_event_cb);
@@ -54,9 +55,12 @@ void nf_ct_deliver_cached_events_report(struct nf_conn *ct, u32 pid, int report)
 			.report	= report
 		};
 
-		notify->fcn(e->cache, &item);
+		ret = notify->fcn(e->cache, &item);
+		if (ret == 0)
+			delivered = 1;
 	}
-	xchg(&e->cache, 0);
+	if (delivered)
+		xchg(&e->cache, 0);
 
 out_unlock:
 	rcu_read_unlock();
@@ -136,9 +140,12 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
 #endif
 
 static int nf_ct_events_switch __read_mostly = NF_CT_EVENTS_DEFAULT;
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
 
 module_param_named(event, nf_ct_events_switch, bool, 0644);
 MODULE_PARM_DESC(event, "Enable connection tracking event delivery");
+module_param_named(retry_timeout, nf_ct_events_retry_timeout, bool, 0644);
+MODULE_PARM_DESC(retry_timeout, "Event delivery retry timeout");
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table event_sysctl_table[] = {
@@ -150,6 +157,14 @@ static struct ctl_table event_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nf_conntrack_events_retry_timeout",
+		.data		= &init_net.ct.sysctl_events_retry_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{}
 };
 #endif /* CONFIG_SYSCTL */
@@ -171,6 +186,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net)
 		goto out;
 
 	table[0].data = &net->ct.sysctl_events;
+	table[1].data = &net->ct.sysctl_events_retry_timeout;
 
 	net->ct.event_sysctl_header =
 		register_net_sysctl_table(net,
@@ -211,6 +227,7 @@ int nf_conntrack_ecache_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_events = nf_ct_events_switch;
+	net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
 
 	if (net_eq(net, &init_net)) {
 		ret = nf_ct_extend_register(&event_extend);
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 0fa5a42..5fc1fe7 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -136,6 +136,21 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
 	return 0;
 }
 
+void nf_ct_helper_destroy(struct nf_conn *ct)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_helper *helper;
+
+	if (help) {
+		rcu_read_lock();
+		helper = rcu_dereference(help->helper);
+		if (helper && helper->destroy)
+			helper->destroy(ct);
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_destroy);
+
 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 {
 	unsigned int h = helper_hash(&me->tuple);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2dd2910..6695e4b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -463,6 +463,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	struct sk_buff *skb;
 	unsigned int type;
 	unsigned int flags = 0, group;
+	int err;
 
 	/* ignore our fake conntrack entry */
 	if (ct == &nf_conntrack_untracked)
@@ -558,7 +559,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
-	nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+	err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+	if ((err == -ENOBUFS) || (err == -EAGAIN))
+		return -ENOBUFS;
+
 	return 0;
 
 nla_put_failure:
@@ -567,7 +571,7 @@ nla_put_failure:
 nlmsg_failure:
 	kfree_skb(skb);
 errout:
-	nfnetlink_set_err(0, group, -ENOBUFS);
+	nfnetlink_set_err(item->pid, group, -ENOBUFS);
 	return 0;
 }
 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
@@ -798,10 +802,14 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		}
 	}
 
-	nf_conntrack_event_report(IPCT_DESTROY,
-				  ct,
-				  NETLINK_CB(skb).pid,
-				  nlmsg_report(nlh));
+	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+				      NETLINK_CB(skb).pid,
+				      nlmsg_report(nlh)) < 0) {
+		/* we failed to report the event, try later */
+		nf_ct_setup_event_timer(ct);
+		nf_ct_put(ct);
+		return 0;
+	}
 
 	/* death_by_timeout would report the event again */
 	set_bit(IPS_DYING_BIT, &ct->status);



* Re: [PATCH 0/2] reliable per-conntrack event cache
  2009-06-04 11:07 [PATCH 0/2] Pablo Neira Ayuso
  2009-06-04 11:08 ` [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure Pablo Neira Ayuso
  2009-06-04 11:08 ` [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery Pablo Neira Ayuso
@ 2009-06-04 11:17 ` Pablo Neira Ayuso
  2 siblings, 0 replies; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-04 11:17 UTC (permalink / raw)
  To: netfilter-devel; +Cc: kaber

Oops, I forgot to include the subject, sorry.

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers


* Re: [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure
  2009-06-04 11:08 ` [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure Pablo Neira Ayuso
@ 2009-06-04 12:16   ` Pablo Neira Ayuso
  2009-06-05 11:04   ` Patrick McHardy
  1 sibling, 0 replies; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-04 12:16 UTC (permalink / raw)
  To: netfilter-devel; +Cc: kaber

Pablo Neira Ayuso wrote:
>  #ifdef CONFIG_NF_CONNTRACK_EVENTS
>  struct nf_conntrack_ecache {
> -	struct nf_conn *ct;
> -	unsigned int events;
> +	unsigned long cache;	/* bitops want long */
> +};

Sorry, I noticed that this should be outside of the #ifdef;
otherwise it breaks compilation of configurations with events
disabled. I'm going to review this.

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers


* Re: [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure
  2009-06-04 11:08 ` [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure Pablo Neira Ayuso
  2009-06-04 12:16   ` Pablo Neira Ayuso
@ 2009-06-05 11:04   ` Patrick McHardy
  2009-06-05 13:06     ` Pablo Neira Ayuso
  1 sibling, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-05 11:04 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel

Pablo Neira Ayuso wrote:
> -/* Connection tracking event bits */
> +/* Connection tracking event types */
>  enum ip_conntrack_events
>  {
> -	/* New conntrack */
> -	IPCT_NEW_BIT = 0,
> -	IPCT_NEW = (1 << IPCT_NEW_BIT),
> -
> -...
> +	IPCT_NEW		= 0,	/* new conntrack */

Why this change? Further down, you change the code to use
(1 << IPCT_*), which isn't really an improvement.

>  static inline void
>  nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
>  {
> -	struct net *net = nf_ct_net(ct);
> -	struct nf_conntrack_ecache *ecache;
> -
> -	local_bh_disable();
> -	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
> -	if (ct != ecache->ct)
> -		__nf_ct_event_cache_init(ct);
> -	ecache->events |= event;
> -	local_bh_enable();
> +	struct nf_conntrack_ecache *e;
> +	struct nf_ct_event_notifier *notify;
> +
> +	rcu_read_lock();
> +	notify = rcu_dereference(nf_conntrack_event_cb);
> +	if (notify == NULL) {
> +		rcu_read_unlock();
> +		return;
> +	}
> +	rcu_read_unlock();

Why the rcu handling? It's not dereferenced, so it's not necessary.

> +
> +	e = nf_ct_ecache_find(ct);
> +	if (e == NULL)
> +		return;
> +
> +	set_bit(event, &e->cache);

This looks quite expensive, given how often this operation is performed.
Did you benchmark this?

> diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
> index da8ee52..7f8fc5d 100644
> --- a/include/net/netfilter/nf_conntrack_extend.h
> +++ b/include/net/netfilter/nf_conntrack_extend.h
> @@ -8,12 +8,14 @@ enum nf_ct_ext_id
>  	NF_CT_EXT_HELPER,
>  	NF_CT_EXT_NAT,
>  	NF_CT_EXT_ACCT,
> +	NF_CT_EXT_ECACHE,
>  	NF_CT_EXT_NUM,

Quoting nf_conntrack_extend.c:

/* This assumes that extended areas in conntrack for the types
    whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */

Is that actually the case here? It might be beneficial to move
this before accounting if possible, I guess it's used more often.


> @@ -738,6 +739,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
>  
>  	NF_CT_ASSERT(skb->nfct);
>  
> +	/* We may have pending events, deliver them and clear the cache */
> +	nf_ct_deliver_cached_events(ct);

How does this guarantee that an event will be delivered in time?
As far as I can see, it might never be delivered, or at least not
until a timeout occurs.

> +void nf_ct_deliver_cached_events_report(struct nf_conn *ct, u32 pid, int report)
>  {
>  	struct nf_ct_event_notifier *notify;
> +	struct nf_conntrack_ecache *e;
>  
>  	rcu_read_lock();
>  	notify = rcu_dereference(nf_conntrack_event_cb);
>  	if (notify == NULL)
>  		goto out_unlock;
>  
> -	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
> -	    && ecache->events) {
> +	e = nf_ct_ecache_find(ct);
> +	if (e == NULL)
> +		goto out_unlock;
> +
> +	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && e->cache) {
>  		struct nf_ct_event item = {
> -			.ct 	= ecache->ct,
> -			.pid	= 0,
> -			.report	= 0
> +			.ct	= ct,
> +			.pid	= pid,
> +			.report	= report
>  		};
>  
> -		notify->fcn(ecache->events, &item);
> +		notify->fcn(e->cache, &item);
>  	}
> -
> -	ecache->events = 0;
> -	nf_ct_put(ecache->ct);
> -	ecache->ct = NULL;
> +	xchg(&e->cache, 0);

This races with delivery. There's no guarantee that it didn't cache
another event between delivery and clearing the bits.

> +static int nf_ct_events_switch __read_mostly = NF_CT_EVENTS_DEFAULT;
> +
> +module_param_named(event, nf_ct_events_switch, bool, 0644);
> +MODULE_PARM_DESC(event, "Enable connection tracking event delivery");

I think it's actually possible to use a bool type for the variable
nowadays. But I don't think we need the _switch suffix. Actually
I don't really see the need for a module parameter at all, we already
have the sysctl.

> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 4448b06..2dd2910 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -468,10 +468,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
>  	if (ct == &nf_conntrack_untracked)
>  		return 0;
>  
> -	if (events & IPCT_DESTROY) {
> +	if (events & (1 << IPCT_DESTROY)) {

This is really no improvement :)

> @@ -1123,6 +1123,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[])
>  		err = ctnetlink_change_helper(ct, cda);
>  		if (err < 0)
>  			return err;
> +
> +		nf_conntrack_event_cache(IPCT_HELPER, ct);

Why are we suddenly caching a lot more events manually?


* Re: [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure
  2009-06-05 11:04   ` Patrick McHardy
@ 2009-06-05 13:06     ` Pablo Neira Ayuso
  2009-06-05 14:13       ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-05 13:06 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel

Patrick McHardy wrote:
> Pablo Neira Ayuso wrote:
>> -/* Connection tracking event bits */
>> +/* Connection tracking event types */
>>  enum ip_conntrack_events
>>  {
>> -    /* New conntrack */
>> -    IPCT_NEW_BIT = 0,
>> -    IPCT_NEW = (1 << IPCT_NEW_BIT),
>> -
>> -...
>> +    IPCT_NEW        = 0,    /* new conntrack */
> 
> Why this change? Further down, you change the code to use
> (1 << IPCT_*), which isn't really an improvement.

Oh, this is not intended to be an improvement. I needed to change this
to use bitops operations, that's all. Or I could use IPCT_*_BIT and
change every reference to it in the whole conntrack code, but the
patch would be much bigger and noisier :).

>>  static inline void
>>  nf_conntrack_event_cache(enum ip_conntrack_events event, struct 
>> nf_conn *ct)
>>  {
>> -    struct net *net = nf_ct_net(ct);
>> -    struct nf_conntrack_ecache *ecache;
>> -
>> -    local_bh_disable();
>> -    ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
>> -    if (ct != ecache->ct)
>> -        __nf_ct_event_cache_init(ct);
>> -    ecache->events |= event;
>> -    local_bh_enable();
>> +    struct nf_conntrack_ecache *e;
>> +    struct nf_ct_event_notifier *notify;
>> +
>> +    rcu_read_lock();
>> +    notify = rcu_dereference(nf_conntrack_event_cb);
>> +    if (notify == NULL) {
>> +        rcu_read_unlock();
>> +        return;
>> +    }
>> +    rcu_read_unlock();
> 
> Why the rcu handling? It's not dereferenced, so it's not necessary.

I'll remove this.

>> +
>> +    e = nf_ct_ecache_find(ct);
>> +    if (e == NULL)
>> +        return;
>> +
>> +    set_bit(event, &e->cache);
> 
> This looks quite expensive, given how often this operation is performed.
> Did you benchmark this?

I'll do that benchmark. I was initially using nf_conntrack_lock to
protect the per-conntrack event cache; I think that using bitops is
a bit better?

>> diff --git a/include/net/netfilter/nf_conntrack_extend.h 
>> b/include/net/netfilter/nf_conntrack_extend.h
>> index da8ee52..7f8fc5d 100644
>> --- a/include/net/netfilter/nf_conntrack_extend.h
>> +++ b/include/net/netfilter/nf_conntrack_extend.h
>> @@ -8,12 +8,14 @@ enum nf_ct_ext_id
>>      NF_CT_EXT_HELPER,
>>      NF_CT_EXT_NAT,
>>      NF_CT_EXT_ACCT,
>> +    NF_CT_EXT_ECACHE,
>>      NF_CT_EXT_NUM,
> 
> Quoting nf_conntrack_extend.c:
> 
> /* This assumes that extended areas in conntrack for the types
>    whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
> 
> Is that actually the case here? It might be beneficial to move
> this before accounting if possible, I guess it's used more often.

I think that accounting information is updated more often. Events
are only updated for very few packets, specifically the setup and
tear-down packets of a flow.

>> @@ -738,6 +739,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, 
>> unsigned int hooknum,
>>  
>>      NF_CT_ASSERT(skb->nfct);
>>  
>> +    /* We may have pending events, deliver them and clear the cache */
>> +    nf_ct_deliver_cached_events(ct);
> 
> How does this guarantee that an event will be delivered in time?
> As far as I can see, it might never be delivered, or at least not
> until a timeout occurs.

Yes, that's the idea. In short, if we have cached events, we trigger
a delivery via ctnetlink. If the delivery fails, we keep the cached
events and the next packet will trigger a new delivery. We may lose
events, but in the worst case, the destroy event will be delivered.

If we add a new conntrack extension to store the creation and
destruction time of the conntrack entries, we can have reliable
flow-accounting since, at least, the destroy event will be delivered.
In the case of state synchronization, we aim to ensure that
long-standing flows survive failures; thus, under event loss, the
backup nodes would get the state after some tries, especially for
long-standing flows, since every packet would trigger another
delivery in case of problems.

BTW, I have removed that line locally. So the next delivery attempt
for pending events is done only in nf_conntrack_confirm(), not twice,
once in nf_conntrack_in() and once in nf_conntrack_confirm() as
happens in this patch.
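
I.e. something like this in nf_conntrack_confirm() (untested sketch,
not yet in this patch):

	static inline int nf_conntrack_confirm(struct sk_buff *skb)
	{
		struct nf_conn *ct = (struct nf_conn *)skb->nfct;
		int ret = NF_ACCEPT;

		if (ct && ct != &nf_conntrack_untracked) {
			if (!nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct))
				ret = __nf_conntrack_confirm(skb);
			/* deliver pending events once the packet made it */
			if (likely(ret == NF_ACCEPT))
				nf_ct_deliver_cached_events(ct);
		}
		return ret;
	}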

>> +void nf_ct_deliver_cached_events_report(struct nf_conn *ct, u32 pid, 
>> int report)
>>  {
>>      struct nf_ct_event_notifier *notify;
>> +    struct nf_conntrack_ecache *e;
>>  
>>      rcu_read_lock();
>>      notify = rcu_dereference(nf_conntrack_event_cb);
>>      if (notify == NULL)
>>          goto out_unlock;
>>  
>> -    if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
>> -        && ecache->events) {
>> +    e = nf_ct_ecache_find(ct);
>> +    if (e == NULL)
>> +        goto out_unlock;
>> +
>> +    if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && e->cache) {
>>          struct nf_ct_event item = {
>> -            .ct     = ecache->ct,
>> -            .pid    = 0,
>> -            .report    = 0
>> +            .ct    = ct,
>> +            .pid    = pid,
>> +            .report    = report
>>          };
>>  
>> -        notify->fcn(ecache->events, &item);
>> +        notify->fcn(e->cache, &item);
>>      }
>> -
>> -    ecache->events = 0;
>> -    nf_ct_put(ecache->ct);
>> -    ecache->ct = NULL;
>> +    xchg(&e->cache, 0);
> 
> This races with delivery. There's no guarantee that it didn't cache
> another event between delivery and clearing the bits.

Yes, I can do something like:

unsigned long events;
events = xchg(&e->cache, 0);

at the very beginning of this function, and then use "events" instead
of e->cache for the rest of the function.
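
IOW, the delivery block would then look roughly like this (untested
sketch):

	events = xchg(&e->cache, 0);
	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) {
		struct nf_ct_event item = {
			.ct	= ct,
			.pid	= pid,
			.report	= report
		};

		notify->fcn(events, &item);
	}

Events cached by another CPU after the xchg() simply stay in
e->cache until the next delivery.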

>> +static int nf_ct_events_switch __read_mostly = NF_CT_EVENTS_DEFAULT;
>> +
>> +module_param_named(event, nf_ct_events_switch, bool, 0644);
>> +MODULE_PARM_DESC(event, "Enable connection tracking event delivery");
> 
>> I think it's actually possible to use a bool type for the variable
> nowadays. But I don't think we need the _switch suffix.

OK, I'll remove that suffix and I'll use bool (after first making
sure that it's possible :).

> Actually
> I don't really see the need for a module parameter at all, we already
> have the sysctl.

OK, I'll remove it.

>> diff --git a/net/netfilter/nf_conntrack_netlink.c 
>> b/net/netfilter/nf_conntrack_netlink.c
>> index 4448b06..2dd2910 100644
>> --- a/net/netfilter/nf_conntrack_netlink.c
>> +++ b/net/netfilter/nf_conntrack_netlink.c
>> @@ -468,10 +468,10 @@ ctnetlink_conntrack_event(unsigned int events, 
>> struct nf_ct_event *item)
>>      if (ct == &nf_conntrack_untracked)
>>          return 0;
>>  
>> -    if (events & IPCT_DESTROY) {
>> +    if (events & (1 << IPCT_DESTROY)) {
> 
> This is really no improvement :)

The other choice is to use test_bit().

>> @@ -1123,6 +1123,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, 
>> struct nlattr *cda[])
>>          err = ctnetlink_change_helper(ct, cda);
>>          if (err < 0)
>>              return err;
>> +
>> +        nf_conntrack_event_cache(IPCT_HELPER, ct);
> 
> Why are we suddenly caching a lot more events manually?

Currently, in user-space triggered events, we are including in the
event message some fields that may not have been updated. Now we can
provide more accurate events by notifying only the conntrack object
fields that have been updated.

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers


* Re: [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure
  2009-06-05 13:06     ` Pablo Neira Ayuso
@ 2009-06-05 14:13       ` Patrick McHardy
  2009-06-06  6:24         ` Pablo Neira Ayuso
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-05 14:13 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel

Pablo Neira Ayuso wrote:
> Patrick McHardy wrote:
>>>
>>> -    /* New conntrack */
>>> -    IPCT_NEW_BIT = 0,
>>> -    IPCT_NEW = (1 << IPCT_NEW_BIT),
>>> -
>>> -...
>>> +    IPCT_NEW        = 0,    /* new conntrack */
>>
>> Why this change? Further down, you change the code to use
>> (1 << IPCT_*), which isn't really an improvement.
>
> Oh, this is not intended to be an improvement. I needed to change this
> to use bitops operations, that's all. Or I could use IPCT_*_BIT and
> change every reference to it in the whole conntrack code, but the
> patch would be much bigger and noisier :).

The major part is in ctnetlink I think, and you do seem to touch every
one of them :) But OK, using _BIT for all the events doesn't seem too
appealing either.

>>> +
>>> +    e = nf_ct_ecache_find(ct);
>>> +    if (e == NULL)
>>> +        return;
>>> +
>>> +    set_bit(event, &e->cache);
>>
>> This looks quite expensive, given how often this operation is performed.
>> Did you benchmark this?
>
> I'll do that benchmark. I was initially using nf_conntrack_lock to 
> protect the per-conntrack event cache; I think that using bitops is a 
> bit better?

I actually meant the lookup done potentially multiple times per packet.
But I incorrectly thought that it was more expensive; that seems fine.
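
(The lookup in question is just a conntrack extension find, roughly:)

	static inline struct nf_conntrack_ecache *
	nf_ct_ecache_find(const struct nf_conn *ct)
	{
		return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE);
	}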

>>> @@ -8,12 +8,14 @@ enum nf_ct_ext_id
>>>      NF_CT_EXT_HELPER,
>>>      NF_CT_EXT_NAT,
>>>      NF_CT_EXT_ACCT,
>>> +    NF_CT_EXT_ECACHE,
>>>      NF_CT_EXT_NUM,
>>
>> Quoting nf_conntrack_extend.c:
>>
>> /* This assumes that extended areas in conntrack for the types
>>    whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
>>
>> Is that actually the case here? It might be beneficial to move
>> this before accounting if possible, I guess its used more often.
>
> I think that accounting information is updated more often. Events are 
> only updated for very few packets, specifically the setup and the 
> tear-down packets of a flow.

No, events are only seldom sent to userspace. But f.i. TCP
conntrack generates at least one event per packet.

But what I actually meant was that it's used more often, I think.
Never mind, also forget about the PREALLOC question, I should
have read what I pasted :) Of course you could add the PREALLOC
flag; when events are enabled you add the extension for every
conntrack anyway.
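
(That would just be a matter of setting the flag in the extension type
when registering it, roughly:)

	static struct nf_ct_ext_type event_extend __read_mostly = {
		.len	= sizeof(struct nf_conntrack_ecache),
		.align	= __alignof__(struct nf_conntrack_ecache),
		.id	= NF_CT_EXT_ECACHE,
		.flags	= NF_CT_EXT_F_PREALLOC,
	};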

>>> @@ -738,6 +739,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, 
>>> unsigned int hooknum,
>>>  
>>>      NF_CT_ASSERT(skb->nfct);
>>>  
>>> +    /* We may have pending events, deliver them and clear the cache */
>>> +    nf_ct_deliver_cached_events(ct);
>>
>> How does this guarantee that an event will be delivered in time?
>> As far as I can see, it might never be delivered, or at least not
>> until a timeout occurs.
>
> Yes, that's the idea. In short, if we have cached events, we trigger a 
> delivery via ctnetlink. If the delivery fails, we keep the cached 
> events and the next packet will trigger a new delivery. We can lose 
> events, but in the worst case, the destroy event will be delivered.

OK, so this is essentially replacing the delivery we did previously when
beginning to cache events for a different conntrack. That's fine and
necessary in case a packet triggering an event didn't make it past
POST_ROUTING.

>
> If we add a new conntrack extension to store the creation and 
> destruction time of the conntrack entries, we can have reliable 
> flow-accounting since, at least, the destroy event will be delivered. 
> In the case of state synchronization, we aim to ensure that 
> long-standing flows survive failures; thus, under event loss, the 
> backup nodes would get the state after some retries, especially for 
> long-standing flows, since every packet would trigger another delivery 
> in case of problems.

*conntrackd* aims at that, I think :) Anyways, on second (or third) thought, I
agree that this is fine; the previous guarantees weren't any stronger.

>
> BTW, I have removed that line locally. So the next delivery try for 
> pending events is done only in nf_conntrack_confirm(), not twice, once 
> in nf_conntrack_in() and once in nf_conntrack_confirm() as it happens 
> in this patch.

That means we're only delivering events if a packet actually made
it through. I guess this is fine too; ideally we wouldn't even have
state transitions.

>>> @@ -1123,6 +1123,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, 
>>> struct nlattr *cda[])
>>>          err = ctnetlink_change_helper(ct, cda);
>>>          if (err < 0)
>>>              return err;
>>> +
>>> +        nf_conntrack_event_cache(IPCT_HELPER, ct);
>>
>> Why are we suddenly caching a lot more events manually?
>
> Currently, for user-space triggered events, we include in the 
> event message some fields that may not have been updated. Now we can 
> provide more accurate events by notifying only the conntrack object 
> fields that have actually been updated.
>
The patch is already pretty large, please separate that part if it
doesn't have to be in this patch to make it work.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-04 11:08 ` [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery Pablo Neira Ayuso
@ 2009-06-05 14:37   ` Patrick McHardy
  2009-06-06  6:34     ` Pablo Neira Ayuso
  2009-06-09 22:36     ` Pablo Neira Ayuso
  0 siblings, 2 replies; 31+ messages in thread
From: Patrick McHardy @ 2009-06-05 14:37 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel

Pablo Neira Ayuso wrote:
> This patch improves ctnetlink event reliability if one broadcast
> listener has set the NETLINK_BROADCAST_ERROR socket option.
> 
> The logic is the following: if the event delivery fails, we keep
> the undelivered events in the per-conntrack event cache. Thus, once
> the next packet arrives, we trigger another event delivery in
> nf_conntrack_in(). If things don't go well in this second try,
> we accumulate the pending events in the cache but we try to
> deliver the current state as soon as possible. Therefore, we may
> lose state transitions but the userspace process gets in sync at
> some point.
> 
> In the worst case, if no events were delivered to userspace, we
> make sure that destroy events are successfully delivered. This
> happens because if ctnetlink fails to deliver the destroy event,
> we remove the conntrack entry from the hashes and insert it into
> the dying list, which contains inactive entries. Then, the
> conntrack timer is added with an extra grace timeout of
> random32() % 15 seconds to trigger the event again (this grace
> timeout is tunable via /proc). The use of a limited random timeout
> value allows distributing the "destroy" resends, thus avoiding the
> accumulation of lots of "destroy" events at the same time.
> 
> The maximum number of conntrack entries (active or inactive) is
> still handled by nf_conntrack_max. Thus, we may start dropping
> packets at some point if we accumulate a lot of inactive conntrack
> entries waiting to deliver the destroy event to userspace.
> During my stress tests, consisting of setting a very small buffer
> of 2048 bytes for conntrackd plus the NETLINK_BROADCAST_ERROR socket
> flag, and generating lots of very small connections, I noticed
> very few destroy entries on the fly waiting to be resent.

Conceptually, I think this all makes sense and is an improvement.

> @@ -240,6 +225,60 @@ static void death_by_timeout(unsigned long ul_conntrack)
>  	NF_CT_STAT_INC(net, delete_list);
>  	clean_from_lists(ct);
>  	spin_unlock_bh(&nf_conntrack_lock);
> +}
> +
> +static void death_by_event(unsigned long ul_conntrack)
> +{
> +	struct nf_conn *ct = (void *)ul_conntrack;
> +	struct net *net = nf_ct_net(ct);
> +
> +	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
> +		/* bad luck, let's retry again */
> +		ct->timeout.expires = jiffies +
> +			(random32() % net->ct.sysctl_events_retry_timeout);
> +		add_timer(&ct->timeout);
> +		return;
> +	}
> +	/* we've got the event delivered, now it's dying */
> +	set_bit(IPS_DYING_BIT, &ct->status);
> +	spin_lock_bh(&nf_conntrack_lock);

_bh is not needed here, the timer is already running in BH context.

> +	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

Why is _rcu used? The lists are never used for a lookup.

> +	spin_unlock_bh(&nf_conntrack_lock);
> +	nf_ct_helper_destroy(ct);
> +	nf_ct_put(ct);
> +}
> +
> +void nf_ct_setup_event_timer(struct nf_conn *ct)
> +{
> +	struct net *net = nf_ct_net(ct);
> +
> +	nf_ct_delete_from_lists(ct);

This doesn't really belong here. I realize it's because of ctnetlink,
but I think I'd rather have an additional export.

> +	/* add this conntrack to the dying list */
> +	spin_lock_bh(&nf_conntrack_lock);
> +	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
> +				 &net->ct.dying);
> +	/* set a new timer to retry event delivery */
> +	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
> +	ct->timeout.expires = jiffies +
> +		(random32() % net->ct.sysctl_events_retry_timeout);
> +	add_timer(&ct->timeout);
> +	spin_unlock_bh(&nf_conntrack_lock);

It's not necessary to hold the lock around the timer setup. There's
only this single reference left to the conntrack - at least it should
be that way.

> +}
> +EXPORT_SYMBOL_GPL(nf_ct_setup_event_timer);
> +
> +static void death_by_timeout(unsigned long ul_conntrack)
> +{
> +	struct nf_conn *ct = (void *)ul_conntrack;
> +
> +	if (!test_bit(IPS_DYING_BIT, &ct->status) && 
> +	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
> +		/* destroy event was not delivered */
> +		nf_ct_setup_event_timer(ct);
> +		return;
> +	}
> +	set_bit(IPS_DYING_BIT, &ct->status);
> +	nf_ct_helper_destroy(ct);
> +	nf_ct_delete_from_lists(ct);

The helpers might keep global data themselves that's cleaned
up by ->destroy() (the gre keymap, for instance). The cleanup step
might need to be done immediately to avoid clashes with new data
or incorrectly returning data that's supposed to be gone.

>  	nf_ct_put(ct);
>  }
>  
> @@ -1030,6 +1069,22 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
>  }
>  EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
>  
> +static void nf_ct_release_dying_list(void)
> +{
> +	struct nf_conntrack_tuple_hash *h;
> +	struct nf_conn *ct;
> +	struct hlist_nulls_node *n;
> +
> +	spin_lock_bh(&nf_conntrack_lock);
> +	hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
> +		ct = nf_ct_tuplehash_to_ctrack(h);
> +		/* never fails to remove them, no listeners at this point */
> +		if (del_timer(&ct->timeout))
> +			ct->timeout.function((unsigned long)ct);

nf_ct_kill()?

> +	}
> +	spin_unlock_bh(&nf_conntrack_lock);
> +}

> diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
> index 0fa5a42..5fc1fe7 100644
> --- a/net/netfilter/nf_conntrack_helper.c
> +++ b/net/netfilter/nf_conntrack_helper.c
> @@ -136,6 +136,21 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
>  	return 0;
>  }
>  
> +void nf_ct_helper_destroy(struct nf_conn *ct)
> +{
> +	struct nf_conn_help *help = nfct_help(ct);
> +	struct nf_conntrack_helper *helper;
> +
> +	if (help) {
> +		rcu_read_lock();
> +		helper = rcu_dereference(help->helper);
> +		if (helper && helper->destroy)
> +			helper->destroy(ct);
> +		rcu_read_unlock();
> +	}
> +}
> +EXPORT_SYMBOL_GPL(nf_ct_helper_destroy);

The export looks unnecessary, it's only used in nf_conntrack_core.c,
unless I've missed something.

> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 2dd2910..6695e4b 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -558,7 +559,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
>  	rcu_read_unlock();
>  
>  	nlmsg_end(skb, nlh);
> -	nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
> +	err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
> +	if ((err == -ENOBUFS) || (err == -EAGAIN))

minor nit: unnecessary parens

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure
  2009-06-05 14:13       ` Patrick McHardy
@ 2009-06-06  6:24         ` Pablo Neira Ayuso
  0 siblings, 0 replies; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-06  6:24 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel

Patrick McHardy wrote:
> Pablo Neira Ayuso wrote:
>> Patrick McHardy wrote:
>>>> @@ -8,12 +8,14 @@ enum nf_ct_ext_id
>>>>      NF_CT_EXT_HELPER,
>>>>      NF_CT_EXT_NAT,
>>>>      NF_CT_EXT_ACCT,
>>>> +    NF_CT_EXT_ECACHE,
>>>>      NF_CT_EXT_NUM,
>>>
>>> Quoting nf_conntrack_extend.c:
>>>
>>> /* This assumes that extended areas in conntrack for the types
>>>    whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
>>>
>>> Is that actually the case here? It might be beneficial to move
>>> this before accounting if possible, I guess its used more often.
>>
>> I think that accounting information is updated more often. Events are
>> only updated for very few packets, specifically the setup and the
>> tear-down packets of a flow.
> 
> No, events are only seldom sent to userspace. But f.i. TCP
> conntrack generates at least one event per packet.

Yes, that's true for small TCP connections, but not for long TCP ones.

> But what I actually meant was that it's used more often, I think.
> Never mind, also forget about the PREALLOC question, I should
> have read what I pasted :) Of course you could add the PREALLOC
> flag; when events are enabled you add the extension for every
> conntrack anyway.

Indeed, I'll add the PREALLOC flag.

[...]
>>> Why are we suddenly caching a lot more events manually?
>>
>> Currently, for user-space triggered events, we include in the
>> event message some fields that may not have been updated. Now we can
>> provide more accurate events by notifying only the conntrack object
>> fields that have actually been updated.
>>
> The patch is already pretty large, please separate that part if it
> doesn't have to be in this patch to make it work.

I'll try to split this into another patch. Thanks for your comments!

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-05 14:37   ` Patrick McHardy
@ 2009-06-06  6:34     ` Pablo Neira Ayuso
  2009-06-08 13:49       ` Patrick McHardy
  2009-06-09 22:36     ` Pablo Neira Ayuso
  1 sibling, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-06  6:34 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel

Patrick McHardy wrote:
> Pablo Neira Ayuso wrote:
>> @@ -240,6 +225,60 @@ static void death_by_timeout(unsigned long
>> ul_conntrack)
>>      NF_CT_STAT_INC(net, delete_list);
>>      clean_from_lists(ct);
>>      spin_unlock_bh(&nf_conntrack_lock);
>> +}
>> +
>> +static void death_by_event(unsigned long ul_conntrack)
>> +{
>> +    struct nf_conn *ct = (void *)ul_conntrack;
>> +    struct net *net = nf_ct_net(ct);
>> +
>> +    if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
>> +        /* bad luck, let's retry again */
>> +        ct->timeout.expires = jiffies +
>> +            (random32() % net->ct.sysctl_events_retry_timeout);
>> +        add_timer(&ct->timeout);
>> +        return;
>> +    }
>> +    /* we've got the event delivered, now it's dying */
>> +    set_bit(IPS_DYING_BIT, &ct->status);
>> +    spin_lock_bh(&nf_conntrack_lock);
> 
> _bh is not needed here, the timer is already running in BH context.
> 
>> +    hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> 
> Why is _rcu used? The lists are never used for a lookup.

I'll fix these two.

>> +    spin_unlock_bh(&nf_conntrack_lock);
>> +    nf_ct_helper_destroy(ct);
>> +    nf_ct_put(ct);
>> +}
>> +
>> +void nf_ct_setup_event_timer(struct nf_conn *ct)
>> +{
>> +    struct net *net = nf_ct_net(ct);
>> +
>> +    nf_ct_delete_from_lists(ct);
> 
> This doesn't really belong here. I realize its because of ctnetlink,
> but I think I'd rather have an additional export.

Will do.

>> +    /* add this conntrack to the dying list */
>> +    spin_lock_bh(&nf_conntrack_lock);
>> +    hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
>> +                 &net->ct.dying);
>> +    /* set a new timer to retry event delivery */
>> +    setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
>> +    ct->timeout.expires = jiffies +
>> +        (random32() % net->ct.sysctl_events_retry_timeout);
>> +    add_timer(&ct->timeout);
>> +    spin_unlock_bh(&nf_conntrack_lock);
> 
> Its not necessary to hold the lock around the timer setup. There's
> only this single reference left to the conntrack - at least it should
> be that way.

Indeed.

>> +}
>> +EXPORT_SYMBOL_GPL(nf_ct_setup_event_timer);
>> +
>> +static void death_by_timeout(unsigned long ul_conntrack)
>> +{
>> +    struct nf_conn *ct = (void *)ul_conntrack;
>> +
>> +    if (!test_bit(IPS_DYING_BIT, &ct->status) &&
>> +        unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
>> +        /* destroy event was not delivered */
>> +        nf_ct_setup_event_timer(ct);
>> +        return;
>> +    }
>> +    set_bit(IPS_DYING_BIT, &ct->status);
>> +    nf_ct_helper_destroy(ct);
>> +    nf_ct_delete_from_lists(ct);
> 
> The helpers might keep global data themselves that's cleaned
> up by ->destroy() (the gre keymap, for instance). The cleanup step
> might need to be done immediately to avoid clashes with new data
> or incorrectly returning data that's supposed to be gone.

This is a good catch that I had missed :-). I'll move this before the
event delivery, since the internal protocol helper data is not used by
ctnetlink. I can include a comment here to warn about this in case
that changes in the future.
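
A sketch of the reordering, with the warning comment included
(death_by_event() would then drop its own nf_ct_helper_destroy() call):

	static void death_by_timeout(unsigned long ul_conntrack)
	{
		struct nf_conn *ct = (void *)ul_conntrack;

		/* Release the helper's internal data first: helpers may
		 * keep global state (the gre keymap, for instance) that
		 * must go away immediately. Note that ctnetlink must not
		 * rely on this data when delivering the destroy event. */
		nf_ct_helper_destroy(ct);

		if (!test_bit(IPS_DYING_BIT, &ct->status) &&
		    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
			/* destroy event was not delivered, retry later */
			nf_ct_setup_event_timer(ct);
			return;
		}
		set_bit(IPS_DYING_BIT, &ct->status);
		nf_ct_delete_from_lists(ct);
		nf_ct_put(ct);
	}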

>>      nf_ct_put(ct);
>>  }
>>  
>> @@ -1030,6 +1069,22 @@ void nf_conntrack_flush_report(struct net *net,
>> u32 pid, int report)
>>  }
>>  EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
>>  
>> +static void nf_ct_release_dying_list(void)
>> +{
>> +    struct nf_conntrack_tuple_hash *h;
>> +    struct nf_conn *ct;
>> +    struct hlist_nulls_node *n;
>> +
>> +    spin_lock_bh(&nf_conntrack_lock);
>> +    hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
>> +        ct = nf_ct_tuplehash_to_ctrack(h);
>> +        /* never fails to remove them, no listeners at this point */
>> +        if (del_timer(&ct->timeout))
>> +            ct->timeout.function((unsigned long)ct);
> 
> nf_ct_kill()?

Will do. I have a patch here that replaces a couple of similar
invocations with nf_ct_kill(); I'll send it to you once we're done
with this patchset.
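
(The replacement would look roughly like this:)

	/* instead of open-coding the timer removal plus handler call: */
	if (del_timer(&ct->timeout))
		ct->timeout.function((unsigned long)ct);

	/* ... simply: */
	nf_ct_kill(ct);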

>> +    }
>> +    spin_unlock_bh(&nf_conntrack_lock);
>> +}
> 
>> diff --git a/net/netfilter/nf_conntrack_helper.c
>> b/net/netfilter/nf_conntrack_helper.c
>> index 0fa5a42..5fc1fe7 100644
>> --- a/net/netfilter/nf_conntrack_helper.c
>> +++ b/net/netfilter/nf_conntrack_helper.c
>> @@ -136,6 +136,21 @@ static inline int unhelp(struct
>> nf_conntrack_tuple_hash *i,
>>      return 0;
>>  }
>>  
>> +void nf_ct_helper_destroy(struct nf_conn *ct)
>> +{
>> +    struct nf_conn_help *help = nfct_help(ct);
>> +    struct nf_conntrack_helper *helper;
>> +
>> +    if (help) {
>> +        rcu_read_lock();
>> +        helper = rcu_dereference(help->helper);
>> +        if (helper && helper->destroy)
>> +            helper->destroy(ct);
>> +        rcu_read_unlock();
>> +    }
>> +}
>> +EXPORT_SYMBOL_GPL(nf_ct_helper_destroy);
> 
> The export looks unnecessary, it's only used in nf_conntrack_core.c,
> unless I've missed something.

Indeed, the only client in nf_conntrack_core.c

>> diff --git a/net/netfilter/nf_conntrack_netlink.c
>> b/net/netfilter/nf_conntrack_netlink.c
>> index 2dd2910..6695e4b 100644
>> --- a/net/netfilter/nf_conntrack_netlink.c
>> +++ b/net/netfilter/nf_conntrack_netlink.c
>> @@ -558,7 +559,10 @@ ctnetlink_conntrack_event(unsigned int events,
>> struct nf_ct_event *item)
>>      rcu_read_unlock();
>>  
>>      nlmsg_end(skb, nlh);
>> -    nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
>> +    err = nfnetlink_send(skb, item->pid, group, item->report,
>> GFP_ATOMIC);
>> +    if ((err == -ENOBUFS) || (err == -EAGAIN))
> 
> minor nit: unnecessary parens

I'll remove them. Thanks.

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-06  6:34     ` Pablo Neira Ayuso
@ 2009-06-08 13:49       ` Patrick McHardy
  0 siblings, 0 replies; 31+ messages in thread
From: Patrick McHardy @ 2009-06-08 13:49 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel

Pablo Neira Ayuso wrote:
> Patrick McHardy wrote:
>>> +static void death_by_timeout(unsigned long ul_conntrack)
>>> +{
>>> +    struct nf_conn *ct = (void *)ul_conntrack;
>>> +
>>> +    if (!test_bit(IPS_DYING_BIT, &ct->status) &&
>>> +        unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
>>> +        /* destroy event was not delivered */
>>> +        nf_ct_setup_event_timer(ct);
>>> +        return;
>>> +    }
>>> +    set_bit(IPS_DYING_BIT, &ct->status);
>>> +    nf_ct_helper_destroy(ct);
>>> +    nf_ct_delete_from_lists(ct);
>> The helpers might keep global data themselves that's cleaned
>> up by ->destroy() (the gre keymap, for instance). The cleanup step
>> might need to be done immediately to avoid clashes with new data
>> or incorrectly returning data that's supposed to be gone.
> 
> This is a good catch that I had missed :-). I'll move this before the
> event delivery, since the internal protocol helper data is not used by
> ctnetlink. I can include a comment here to warn about this in case
> that changes in the future.

Sounds good enough for now.

>>> +static void nf_ct_release_dying_list(void)
>>> +{
>>> +    struct nf_conntrack_tuple_hash *h;
>>> +    struct nf_conn *ct;
>>> +    struct hlist_nulls_node *n;
>>> +
>>> +    spin_lock_bh(&nf_conntrack_lock);
>>> +    hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
>>> +        ct = nf_ct_tuplehash_to_ctrack(h);
>>> +        /* never fails to remove them, no listeners at this point */
>>> +        if (del_timer(&ct->timeout))
>>> +            ct->timeout.function((unsigned long)ct);
>> nf_ct_kill()?
> 
> Will do. I have a patch here that replaces a couple of similar
> invocations with nf_ct_kill(); I'll send it to you once we're done
> with this patchset.

OK, thanks.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-05 14:37   ` Patrick McHardy
  2009-06-06  6:34     ` Pablo Neira Ayuso
@ 2009-06-09 22:36     ` Pablo Neira Ayuso
  2009-06-09 22:43       ` Patrick McHardy
  1 sibling, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-09 22:36 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel

Hi Patrick,

A couple of minor issues related to this patch.

Patrick McHardy wrote:
>> +    hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> 
> Why is _rcu used? The lists are never used for a lookup.

There's no hlist_nulls_del() operation without rcu. I have a patch here
to add hlist_nulls_add_head() and hlist_nulls_del() although I guess
that you are going to tell me that you cannot apply that patch? So,
where to go?

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-09 22:36     ` Pablo Neira Ayuso
@ 2009-06-09 22:43       ` Patrick McHardy
  2009-06-09 22:45         ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-09 22:43 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, Eric Dumazet

Pablo Neira Ayuso wrote:
> Hi Patrick,
> 
> A couple of minor issues related to this patch.
> 
> Patrick McHardy wrote:
>>> +    hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
>> Why is _rcu used? The lists are never used for a lookup.
> 
> There's no hlist_nulls_del() operation without rcu. I have a patch here
> to add hlist_nulls_add_head() and hlist_nulls_del() although I guess
> that you are going to tell me that you cannot apply that patch? So,
> where to go?

Either way is fine I guess. Adding non-_rcu functions makes sense
I think (Eric CCed to correct me :)). But there's also no harm
in using the RCU functions as long as you add a comment stating
that you don't in fact rely on the RCU properties to avoid confusing
people.
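
Something like this, for example:

	/* The dying list is never used for lookups; the _rcu list
	 * helpers are used only because no plain hlist_nulls_del()
	 * exists - we do not rely on any RCU properties here. */
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);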

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-09 22:43       ` Patrick McHardy
@ 2009-06-09 22:45         ` Patrick McHardy
  2009-06-09 22:58           ` Pablo Neira Ayuso
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-09 22:45 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, Eric Dumazet

Patrick McHardy wrote:
> Pablo Neira Ayuso wrote:
>> Hi Patrick,
>>
>> A couple of minor issues related to this patch.
>>
>> Patrick McHardy wrote:
>>>> +    hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
>>> Why is _rcu used? The lists are never used for a lookup.
>>
>> There's no hlist_nulls_del() operation without rcu. I have a patch here
>> to add hlist_nulls_add_head() and hlist_nulls_del() although I guess
>> that you are going to tell me that you cannot apply that patch? So,
>> where to go?
> 
> Either way is fine I guess. Adding non-_rcu functions makes sense
> I think (Eric CCed to correct me :)). But there's also no harm
> in using the RCU functions as long as you add a comment stating
> that you don't in fact rely on the RCU properties to avoid confusing
> people.

But to clarify: the non-RCU functions would be preferred :)

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-09 22:45         ` Patrick McHardy
@ 2009-06-09 22:58           ` Pablo Neira Ayuso
  2009-06-10  1:18             ` Eric Dumazet
  0 siblings, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-09 22:58 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, Eric Dumazet

[-- Attachment #1: Type: text/plain, Size: 1057 bytes --]

Patrick McHardy wrote:
> Patrick McHardy wrote:
>> Pablo Neira Ayuso wrote:
>>> Hi Patrick,
>>>
>>> A couple of minor issues related to this patch.
>>>
>>> Patrick McHardy wrote:
>>>>> +    hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
>>>> Why is _rcu used? The lists are never used for a lookup.
>>>
>>> There's no hlist_nulls_del() operation without rcu. I have a patch here
>>> to add hlist_nulls_add_head() and hlist_nulls_del() although I guess
>>> that you are going to tell me that you cannot apply that patch? So,
>>> where to go?
>>
>> Either way is fine I guess. Adding non-_rcu functions makes sense
>> I think (Eric CCed to correct me :)). But there's also no harm
>> in using the RCU functions as long as you add a comment stating
>> that you don't in fact rely on the RCU properties to avoid confusing
>> people.
> 
> But to clarify: the non-RCU functions would be preferred :)

OK :). Eric, could you tell me whether this patch is OK? It's based on
the RCU version.

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

[-- Attachment #2: hlist-nulls-add-head.patch --]
[-- Type: text/x-diff, Size: 1470 bytes --]

list_nulls: add hlist_nulls_add_head and hlist_nulls_del

This patch adds the hlist_nulls_add_head() function, which is
based on hlist_nulls_add_head_rcu() but without the use of
rcu_assign_pointer(). It also adds hlist_nulls_del(), which is
exactly the same as hlist_nulls_del_rcu().

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---

 include/linux/list_nulls.h |   18 ++++++++++++++++++
 1 files changed, 18 insertions(+), 0 deletions(-)


diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index 93150ec..5d10ae3 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -56,6 +56,18 @@ static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
 	return is_a_nulls(h->first);
 }
 
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+					struct hlist_nulls_head *h)
+{
+	struct hlist_nulls_node *first = h->first;
+
+	n->next = first;
+	n->pprev = &h->first;
+	h->first = n;
+	if (!is_a_nulls(first))
+		first->pprev = &n->next;
+}
+
 static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
 {
 	struct hlist_nulls_node *next = n->next;
@@ -65,6 +77,12 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
 		next->pprev = pprev;
 }
 
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+	__hlist_nulls_del(n);
+	n->pprev = LIST_POISON2;
+}
+
 /**
  * hlist_nulls_for_each_entry	- iterate over list of given type
  * @tpos:	the type * to use as a loop cursor.
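
With these helpers, the dying-list handling from the conntrack patch
can then drop the _rcu variants, roughly:

	/* add to the dying list (under nf_conntrack_lock) */
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &net->ct.dying);

	/* ... and once the destroy event has been delivered: */
	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);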

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-09 22:58           ` Pablo Neira Ayuso
@ 2009-06-10  1:18             ` Eric Dumazet
  2009-06-10  9:55               ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2009-06-10  1:18 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Patrick McHardy, netfilter-devel

Pablo Neira Ayuso wrote:
> Patrick McHardy wrote:
>> Patrick McHardy wrote:
>>> Pablo Neira Ayuso wrote:
>>>> Hi Patrick,
>>>>
>>>> A couple of minor issues related to this patch.
>>>>
>>>> Patrick McHardy wrote:
>>>>>> +    hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
>>>>> Why is _rcu used? The lists are never used for a lookup.
>>>> There's no hlist_nulls_del() operation without rcu. I have a patch here
>>>> to add hlist_nulls_add_head() and hlist_nulls_del() although I guess
>>>> that you are going to tell me that you cannot apply that patch? So,
>>>> where to go?
>>> Either way is fine I guess. Adding non-_rcu functions makes sense
>>> I think (Eric CCed to correct me :)). But there's also no harm
>>> in using the RCU functions as long as you add a comment stating
>>> that you don't in fact rely on the RCU properties to avoid confusing
>>> people.
>> But to clarify: the non-RCU functions would be preferred :)
> 
> OK :). Eric, could you tell me whether this patch is OK? It's based on
> the RCU version.
> 
> 

Sure, this patch is fine!

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Thank you

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10  1:18             ` Eric Dumazet
@ 2009-06-10  9:55               ` Patrick McHardy
  2009-06-10 10:36                 ` Pablo Neira Ayuso
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10  9:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Pablo Neira Ayuso, netfilter-devel

Eric Dumazet wrote:
> Pablo Neira Ayuso wrote:
>> OK :). Eric, could you tell me whether this patch is OK? It's based on
>> the RCU version.
>>
> Sure, this patch is fine!
> 
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Thanks Eric. Pablo, please send me those patches as soon as possible,
I plan to send my tree upstream today. Thanks!


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10  9:55               ` Patrick McHardy
@ 2009-06-10 10:36                 ` Pablo Neira Ayuso
  2009-06-10 10:55                   ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-10 10:36 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, netfilter-devel

Patrick McHardy wrote:
> Eric Dumazet wrote:
>> Pablo Neira Ayuso wrote:
>>> OK :). Eric, could you tell me whether this patch is OK? It's based on
>>> the RCU version.
>>>
>> Sure, this patch is fine!
>>
>> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Thanks Eric. Pablo, please send me those patches as soon as possible,
> I plan to send my tree upstream today. Thanks!

I'll do that in a couple of minutes. There's another issue that you're
going to notice in the new patchset; I'm putting it here before posting,
in case you have time to look at it. I'd like to know if you're OK with
this.

events = xchg(&e->cache, 0);
[...]
-               notify->fcn(events, &item);
+               ret = notify->fcn(events, &item);
+               if (likely(ret == 0))
+                       delivered = 1;
+       }
+       if (unlikely(!delivered)) {
+               unsigned int old = 0;
+retry:
+               /* restore undelivered events to the cache */
+               ret = cmpxchg(&e->cache, old, events);
+               /* ... but we've got new events during delivery */
+               if (unlikely(ret != old)) {
+                       old = ret;
+                       events |= ret;
+                       goto retry;
+               }
        }

 out_unlock:

To avoid races between the cache clearing and event delivery:

1) I retrieve the event cache and clear it with xchg.
2) Then, if I fail to deliver the events, I restore the cache. However,
if the event cache is not zero anymore, then some new events have been
cached during the delivery, so I use cmpxchg to conditionally restore
the events without losing the new ones.

Can you see any problem with this optimistic approach? I know, it can
potentially keep retrying to restore the cache forever, but this is
very unlikely to happen?

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 10:36                 ` Pablo Neira Ayuso
@ 2009-06-10 10:55                   ` Patrick McHardy
  2009-06-10 11:01                     ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10 10:55 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Eric Dumazet, netfilter-devel

Pablo Neira Ayuso wrote:
> I'll do that in a couple of minutes. There's another issue that you're
> going to notice in the new patchset; I'm putting it here before posting,
> in case you have time to look at it. I'd like to know if you're OK with
> this.
> 
> events = xchg(&e->cache, 0);
> [...]
> -               notify->fcn(events, &item);
> +               ret = notify->fcn(events, &item);
> +               if (likely(ret == 0))
> +                       delivered = 1;
> +       }
> +       if (unlikely(!delivered)) {
> +               unsigned int old = 0;
> +retry:
> +               /* restore undelivered events to the cache */
> +               ret = cmpxchg(&e->cache, old, events);
> +               /* ... but we've got new events during delivery */
> +               if (unlikely(ret != old)) {
> +                       old = ret;
> +                       events |= ret;
> +                       goto retry;
> +               }
>         }
> 
>  out_unlock:
> 
> To avoid races between the cache clearing and event delivery:
> 
> 1) I retrieve the event cache and clear it with xchg.
> 2) Then, if I fail to deliver the events, I restore the cache. However,
> if the event cache is not zero anymore, then some new events have been
> cached during the delivery, so I use cmpxchg to conditionally restore
> the events without losing the new ones.
> 
> Can you see any problem with this optimistic approach? I know, it can
> potentially keep retrying to restore the cache forever, but this is
> very unlikely to happen?

It's probably quite unlikely, so not a big issue. OTOH this is
effectively doing something quite similar to a spinlock. Maybe
it's finally time to add per-conntrack locking.

Eric even had a patch for this IIRC :)


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 10:55                   ` Patrick McHardy
@ 2009-06-10 11:01                     ` Patrick McHardy
  2009-06-10 11:40                       ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10 11:01 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Eric Dumazet, netfilter-devel

Patrick McHardy wrote:
> Pablo Neira Ayuso wrote:
>> Can you see any problem with this optimistic approach? I know, it can
>> potentially keep retrying to restore the cache forever, but this is
>> very unlikely to happen?
> 
> It's probably quite unlikely, so not a big issue. OTOH this is
> effectively doing something quite similar to a spinlock. Maybe
> it's finally time to add per-conntrack locking.
> 
> Eric even had a patch for this IIRC :)

I'll take a quick stab at this, will let you know in 30-60 minutes.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 11:01                     ` Patrick McHardy
@ 2009-06-10 11:40                       ` Patrick McHardy
  2009-06-10 12:22                         ` Pablo Neira Ayuso
  2009-06-10 12:26                         ` Jozsef Kadlecsik
  0 siblings, 2 replies; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10 11:40 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Eric Dumazet, netfilter-devel

[-- Attachment #1: Type: text/plain, Size: 1586 bytes --]

Patrick McHardy wrote:
> Patrick McHardy wrote:
>> Pablo Neira Ayuso wrote:
>>> Can you see any problem with this optimistic approach? I know, it can
>>> potentially keep retrying to restore the cache forever, but this is
>>> very unlikely to happen?
>>
>> It's probably quite unlikely, so not a big issue. OTOH this is
>> effectively doing something quite similar to a spinlock. Maybe
>> it's finally time to add per-conntrack locking.
>>
>> Eric even had a patch for this IIRC :)
> 
> I'll take a quick stab at this, will let you know in 30-60 minutes.

This is a first attempt to replace some global locks by private
per conntrack locks. On 64 bit, it fits into a hole and doesn't
enlarge struct nf_conn.

Wrt. the event cache, we certainly don't want to take and release
the lock for every event. I was thinking about something like this:

- add a new member to the event structure to hold undelivered events
   (named "missed" below)
- cache events in the existing member as you're doing currently
- on delivery, do something like this:

	events = xchg(&e->cache, 0);
	missed = e->missed;

	ret = notify->fcn(events | missed, &item);
	if (ret < 0 || missed) {
		spin_lock_bh(&ct->lock);
		if (ret < 0)
			e->missed |= events;
		else
			e->missed &= ~missed;
		spin_unlock_bh(&ct->lock);
	}

so if we failed to deliver the events, we add them to the missed
events for the next delivery attempt. Once we've delivered the
missed events, we clear them from the cache.

Now is that really better - I'm not sure myself :) The per-conntrack
locking would be an improvement though. What do you think?



[-- Attachment #2: lock.diff --]
[-- Type: text/x-patch, Size: 15304 bytes --]

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 2b87737..ecc79f9 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -93,6 +93,8 @@ struct nf_conn {
            plus 1 for any connection(s) we are `master' for */
 	struct nf_conntrack ct_general;
 
+	spinlock_t lock;
+
 	/* XXX should I move this to the tail ? - Y.K */
 	/* These are my tuples; original and reply */
 	struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index ba32ed7..3767fb4 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -59,11 +59,11 @@ struct nf_conntrack_l4proto
 			   const struct nf_conntrack_tuple *);
 
 	/* Print out the private part of the conntrack. */
-	int (*print_conntrack)(struct seq_file *s, const struct nf_conn *);
+	int (*print_conntrack)(struct seq_file *s, struct nf_conn *);
 
 	/* convert protoinfo to nfnetink attributes */
 	int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
-			 const struct nf_conn *ct);
+			 struct nf_conn *ct);
 	/* Calculate protoinfo nlattr size */
 	int (*nlattr_size)(void);
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index b54c234..edf9569 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -519,6 +519,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	spin_lock_init(&ct->lock);
 	atomic_set(&ct->ct_general.use, 1);
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4448b06..4e503ad 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -143,7 +143,7 @@ nla_put_failure:
 }
 
 static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
 {
 	struct nf_conntrack_l4proto *l4proto;
 	struct nlattr *nest_proto;
@@ -347,7 +347,7 @@ nla_put_failure:
 
 static int
 ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
-		    int event, const struct nf_conn *ct)
+		    int event, struct nf_conn *ct)
 {
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 11801c4..6b08d32 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -24,8 +24,6 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_log.h>
 
-static DEFINE_RWLOCK(dccp_lock);
-
 /* Timeouts are based on values from RFC4340:
  *
  * - REQUEST:
@@ -491,7 +489,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 		return NF_ACCEPT;
 	}
 
-	write_lock_bh(&dccp_lock);
+	spin_lock_bh(&ct->lock);
 
 	role = ct->proto.dccp.role[dir];
 	old_state = ct->proto.dccp.state;
@@ -535,13 +533,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 		ct->proto.dccp.last_dir = dir;
 		ct->proto.dccp.last_pkt = type;
 
-		write_unlock_bh(&dccp_lock);
+		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_DCCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid packet ignored ");
 		return NF_ACCEPT;
 	case CT_DCCP_INVALID:
-		write_unlock_bh(&dccp_lock);
+		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_DCCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid state transition ");
@@ -551,7 +549,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 	ct->proto.dccp.last_dir = dir;
 	ct->proto.dccp.last_pkt = type;
 	ct->proto.dccp.state = new_state;
-	write_unlock_bh(&dccp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	dn = dccp_pernet(net);
 	nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]);
@@ -617,18 +615,18 @@ static int dccp_print_tuple(struct seq_file *s,
 			  ntohs(tuple->dst.u.dccp.port));
 }
 
-static int dccp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
+static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
 	return seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
 }
 
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
 static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-			  const struct nf_conn *ct)
+			  struct nf_conn *ct)
 {
 	struct nlattr *nest_parms;
 
-	read_lock_bh(&dccp_lock);
+	spin_lock_bh(&ct->lock);
 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
@@ -638,11 +636,11 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
 	NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
 		     cpu_to_be64(ct->proto.dccp.handshake_seq));
 	nla_nest_end(skb, nest_parms);
-	read_unlock_bh(&dccp_lock);
+	spin_unlock_bh(&ct->lock);
 	return 0;
 
 nla_put_failure:
-	read_unlock_bh(&dccp_lock);
+	spin_unlock_bh(&ct->lock);
 	return -1;
 }
 
@@ -673,7 +671,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
 		return -EINVAL;
 	}
 
-	write_lock_bh(&dccp_lock);
+	spin_lock_bh(&ct->lock);
 	ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
 	if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
 		ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
@@ -686,7 +684,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
 		ct->proto.dccp.handshake_seq =
 		be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
 	}
-	write_unlock_bh(&dccp_lock);
+	spin_unlock_bh(&ct->lock);
 	return 0;
 }
 
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 117b801..175a28c 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -219,8 +219,7 @@ static int gre_print_tuple(struct seq_file *s,
 }
 
 /* print private data for conntrack */
-static int gre_print_conntrack(struct seq_file *s,
-			       const struct nf_conn *ct)
+static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
 	return seq_printf(s, "timeout=%u, stream_timeout=%u ",
 			  (ct->proto.gre.timeout / HZ),
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 101b4ad..c10e6f3 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -25,9 +25,6 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 
-/* Protects ct->proto.sctp */
-static DEFINE_RWLOCK(sctp_lock);
-
 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
    closely.  They're more complex. --RR
 
@@ -164,13 +161,13 @@ static int sctp_print_tuple(struct seq_file *s,
 }
 
 /* Print out the private part of the conntrack. */
-static int sctp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
+static int sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
 	enum sctp_conntrack state;
 
-	read_lock_bh(&sctp_lock);
+	spin_lock_bh(&ct->lock);
 	state = ct->proto.sctp.state;
-	read_unlock_bh(&sctp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	return seq_printf(s, "%s ", sctp_conntrack_names[state]);
 }
@@ -318,7 +315,7 @@ static int sctp_packet(struct nf_conn *ct,
 	}
 
 	old_state = new_state = SCTP_CONNTRACK_NONE;
-	write_lock_bh(&sctp_lock);
+	spin_lock_bh(&ct->lock);
 	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
 		/* Special cases of Verification tag check (Sec 8.5.1) */
 		if (sch->type == SCTP_CID_INIT) {
@@ -371,7 +368,7 @@ static int sctp_packet(struct nf_conn *ct,
 		if (old_state != new_state)
 			nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 	}
-	write_unlock_bh(&sctp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]);
 
@@ -386,7 +383,7 @@ static int sctp_packet(struct nf_conn *ct,
 	return NF_ACCEPT;
 
 out_unlock:
-	write_unlock_bh(&sctp_lock);
+	spin_unlock_bh(&ct->lock);
 out:
 	return -NF_ACCEPT;
 }
@@ -469,11 +466,11 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
 static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-			  const struct nf_conn *ct)
+			  struct nf_conn *ct)
 {
 	struct nlattr *nest_parms;
 
-	read_lock_bh(&sctp_lock);
+	spin_lock_bh(&ct->lock);
 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
@@ -488,14 +485,14 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
 		     CTA_PROTOINFO_SCTP_VTAG_REPLY,
 		     ct->proto.sctp.vtag[IP_CT_DIR_REPLY]);
 
-	read_unlock_bh(&sctp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	nla_nest_end(skb, nest_parms);
 
 	return 0;
 
 nla_put_failure:
-	read_unlock_bh(&sctp_lock);
+	spin_unlock_bh(&ct->lock);
 	return -1;
 }
 
@@ -527,13 +524,13 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
 	    !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY])
 		return -EINVAL;
 
-	write_lock_bh(&sctp_lock);
+	spin_lock_bh(&ct->lock);
 	ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]);
 	ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] =
 		nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]);
 	ct->proto.sctp.vtag[IP_CT_DIR_REPLY] =
 		nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]);
-	write_unlock_bh(&sctp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	return 0;
 }
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b7e8a82..5c5739c 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -29,9 +29,6 @@
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 
-/* Protects ct->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
 /* "Be conservative in what you do,
     be liberal in what you accept from others."
     If it's non-zero, we mark only out of window RST segments as INVALID. */
@@ -309,13 +306,13 @@ static int tcp_print_tuple(struct seq_file *s,
 }
 
 /* Print out the private part of the conntrack. */
-static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
+static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
 	enum tcp_conntrack state;
 
-	read_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->lock);
 	state = ct->proto.tcp.state;
-	read_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
@@ -725,14 +722,14 @@ void nf_conntrack_tcp_update(const struct sk_buff *skb,
 
 	end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->lock);
 	/*
 	 * We have to worry for the ack in the reply packet only...
 	 */
 	if (after(end, ct->proto.tcp.seen[dir].td_end))
 		ct->proto.tcp.seen[dir].td_end = end;
 	ct->proto.tcp.last_end = end;
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->lock);
 	pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -841,7 +838,7 @@ static int tcp_packet(struct nf_conn *ct,
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	BUG_ON(th == NULL);
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->lock);
 	old_state = ct->proto.tcp.state;
 	dir = CTINFO2DIR(ctinfo);
 	index = get_conntrack_index(th);
@@ -871,7 +868,7 @@ static int tcp_packet(struct nf_conn *ct,
 		        && ct->proto.tcp.last_index == TCP_RST_SET)) {
 			/* Attempt to reopen a closed/aborted connection.
 			 * Delete this connection and look up again. */
-			write_unlock_bh(&tcp_lock);
+			spin_unlock_bh(&ct->lock);
 
 			/* Only repeat if we can actually remove the timer.
 			 * Destruction may already be in progress in process
@@ -907,7 +904,7 @@ static int tcp_packet(struct nf_conn *ct,
 			 * that the client cannot but retransmit its SYN and
 			 * thus initiate a clean new session.
 			 */
-			write_unlock_bh(&tcp_lock);
+			spin_unlock_bh(&ct->lock);
 			if (LOG_INVALID(net, IPPROTO_TCP))
 				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 					  "nf_ct_tcp: killing out of sync session ");
@@ -920,7 +917,7 @@ static int tcp_packet(struct nf_conn *ct,
 		ct->proto.tcp.last_end =
 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
 
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid packet ignored ");
@@ -929,7 +926,7 @@ static int tcp_packet(struct nf_conn *ct,
 		/* Invalid packet */
 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
 			 dir, get_conntrack_index(th), old_state);
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid state ");
@@ -960,7 +957,7 @@ static int tcp_packet(struct nf_conn *ct,
 
 	if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
 			   skb, dataoff, th, pf)) {
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(&ct->lock);
 		return -NF_ACCEPT;
 	}
      in_window:
@@ -989,7 +986,7 @@ static int tcp_packet(struct nf_conn *ct,
 		timeout = nf_ct_tcp_timeout_unacknowledged;
 	else
 		timeout = tcp_timeouts[new_state];
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	if (new_state != old_state)
 		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
@@ -1106,12 +1103,12 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-			 const struct nf_conn *ct)
+			 struct nf_conn *ct)
 {
 	struct nlattr *nest_parms;
 	struct nf_ct_tcp_flags tmp = {};
 
-	read_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->lock);
 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
@@ -1131,14 +1128,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
 	tmp.flags = ct->proto.tcp.seen[1].flags;
 	NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
 		sizeof(struct nf_ct_tcp_flags), &tmp);
-	read_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	nla_nest_end(skb, nest_parms);
 
 	return 0;
 
 nla_put_failure:
-	read_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->lock);
 	return -1;
 }
 
@@ -1169,7 +1166,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
 	    nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
 		return -EINVAL;
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->lock);
 	if (tb[CTA_PROTOINFO_TCP_STATE])
 		ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
 
@@ -1196,7 +1193,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
 		ct->proto.tcp.seen[1].td_scale =
 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
 	}
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->lock);
 
 	return 0;
 }

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 11:40                       ` Patrick McHardy
@ 2009-06-10 12:22                         ` Pablo Neira Ayuso
  2009-06-10 12:27                           ` Patrick McHardy
  2009-06-10 12:26                         ` Jozsef Kadlecsik
  1 sibling, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-10 12:22 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, netfilter-devel

Patrick McHardy wrote:
> This is a first attempt to replace some global locks by private
> per conntrack locks. On 64 bit, it fits into a hole and doesn't
> enlarge struct nf_conn.
> 
> Wrt. the event cache, we certainly don't want to take and release
> the lock for every event. I was thinking about something like this:
> 
> - add a new member to the event structure to hold undelivered events
>   (named "missed" below)
> - cache events in the existing member as you're doing currently
> - on delivery, do something like this:
> 
>     events = xchg(&e->cache, 0);
>     missed = e->missed;
               ^^^
I think that we need to take the lock since we read e->missed; I see
this possible issue:

CPU0 gets a copy of the missed events (without taking the lock).
CPU1 has already delivered the missed events and clears them.
CPU0 delivers missed events that were already delivered by CPU1.
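
A sketch of what I mean, taking the lock only around the snapshot:

	/* snapshot and clear under ct->lock, so two CPUs cannot pick
	 * up and then clear the same missed events concurrently */
	spin_lock_bh(&ct->lock);
	events = e->cache;
	e->cache = 0;
	missed = e->missed;
	spin_unlock_bh(&ct->lock);

	ret = notify->fcn(events | missed, &item);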

>     ret = notify->fcn(events | missed, &item);
>     if (ret < 0 || missed) {
>         spin_lock_bh(&ct->lock);
>         if (ret < 0)
>             e->missed |= events;
>         else
>             e->missed &= ~missed;
>         spin_unlock_bh(&ct->lock);
>     }
> 
> so if we failed to deliver the events, we add them to the missed
> events for the next delivery attempt. Once we've delivered the
> missed events, we clear them from the cache.
> 
> Now is that really better - I'm not sure myself :) The per-conntrack
> locking would be an improvement though. What do you think?

Indeed, I also think that the per-conntrack locking would be an
improvement for the protocol helpers.

Wrt. the event cache, the missed field can save us from taking the
lock on every event caching, at the cost of consuming a bit more
memory. I think this is more conservative but safer than my approach
(no risk of looping on cmpxchg forever, even if that's unlikely).
Still, we would need to take the spinlock for the event delivery. Let
me know what you think.

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

^ permalink raw reply	[flat|nested] 31+ messages in thread
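
For reference, a minimal sketch in C of the missed-events bookkeeping
discussed above. It assumes the proposed "missed" field sits next to
"cache" in the per-conntrack event extension and reuses the names from
the thread (ct->lock, notify->fcn); it illustrates the scheme under
discussion, not the patch as merged:

	struct nf_conntrack_ecache {
		unsigned long cache;	/* pending, not yet delivered events */
		unsigned long missed;	/* events whose delivery failed */
	};

	static int deliver_cached_events(struct nf_conn *ct,
					 struct nf_conntrack_ecache *e,
					 struct nf_ct_event_notifier *notify,
					 struct nf_ct_event *item)
	{
		/* fetch and clear the pending events atomically, lockless */
		unsigned long events = xchg(&e->cache, 0);
		/* lockless read: may race with another CPU clearing missed
		 * bits, which at worst causes a harmless duplicate delivery */
		unsigned long missed = e->missed;
		int ret;

		ret = notify->fcn(events | missed, item);
		if (ret < 0 || missed) {
			spin_lock_bh(&ct->lock);
			if (ret < 0)
				e->missed |= events;	/* retry these next time */
			else
				e->missed &= ~missed;	/* delivered, clear backlog */
			spin_unlock_bh(&ct->lock);
		}
		return ret;
	}

The common case (successful delivery, no backlog) stays lock-free;
ct->lock is only taken on failure or while a backlog exists.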

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 11:40                       ` Patrick McHardy
  2009-06-10 12:22                         ` Pablo Neira Ayuso
@ 2009-06-10 12:26                         ` Jozsef Kadlecsik
  2009-06-10 12:30                           ` Patrick McHardy
  1 sibling, 1 reply; 31+ messages in thread
From: Jozsef Kadlecsik @ 2009-06-10 12:26 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Pablo Neira Ayuso, Eric Dumazet, netfilter-devel

On Wed, 10 Jun 2009, Patrick McHardy wrote:

> Patrick McHardy wrote:
> > Patrick McHardy wrote:
> > > Pablo Neira Ayuso wrote:
> > > > Can you see any problem with this optimistic approach? I know it can
> > > > potentially try to restore the cache, but this is very unlikely to
> > > > happen?
> > > 
> > > It's probably quite unlikely, so not a big issue. OTOH this is
> > > effectively doing something quite similar to a spinlock. Maybe
> > > it's finally time to add per-conntrack locking.
> > > 
> > > Eric even had a patch for this IIRC :)
> > 
> > I'll take a quick stab at this, will let you know in 30-60 minutes.
> 
> This is a first attempt to replace some global locks with private
> per-conntrack locks. On 64 bit, it fits into a hole and doesn't
> enlarge struct nf_conn.
[...] 
> Now is that really better - I'm not sure myself :) The per-conntrack
> locking would be an improvement though. What do you think?

I think it's cool and much needed as a step towards per-hash-bucket
locks too :-). And hash bucket locking can improve performance even
more. (We only need the ordered locking of the buckets for both
directions when entries are added/deleted; otherwise a single
read-lock of the given bucket is enough.)

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@mail.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 31+ messages in thread
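
A minimal sketch of the ordered double-bucket locking Jozsef describes
above, assuming a hypothetical per-bucket spinlock array: the two
buckets of a conntrack (original and reply direction) are always taken
in index order, so concurrent add/delete paths cannot deadlock. This
is illustrative only, not an actual patch:

	/* hypothetical lock array, one spinlock per hash bucket */
	static spinlock_t nf_ct_bucket_locks[16384];

	static void nf_ct_lock_two_buckets(unsigned int h1, unsigned int h2)
	{
		if (h1 == h2) {
			spin_lock(&nf_ct_bucket_locks[h1]);
		} else if (h1 < h2) {
			spin_lock(&nf_ct_bucket_locks[h1]);
			spin_lock_nested(&nf_ct_bucket_locks[h2],
					 SINGLE_DEPTH_NESTING);
		} else {
			spin_lock(&nf_ct_bucket_locks[h2]);
			spin_lock_nested(&nf_ct_bucket_locks[h1],
					 SINGLE_DEPTH_NESTING);
		}
	}

	static void nf_ct_unlock_two_buckets(unsigned int h1, unsigned int h2)
	{
		spin_unlock(&nf_ct_bucket_locks[h1]);
		if (h1 != h2)
			spin_unlock(&nf_ct_bucket_locks[h2]);
	}

Plain lookups would then take only the single lock of the bucket they
hash into, as noted above.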

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 12:22                         ` Pablo Neira Ayuso
@ 2009-06-10 12:27                           ` Patrick McHardy
  2009-06-10 12:43                             ` Pablo Neira Ayuso
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10 12:27 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Eric Dumazet, netfilter-devel

Pablo Neira Ayuso wrote:
> Patrick McHardy wrote:
>> This is a first attempt to replace some global locks with private
>> per-conntrack locks. On 64 bit, it fits into a hole and doesn't
>> enlarge struct nf_conn.
>>
>> Wrt. the event cache, we certainly don't want to take and release
>> the lock for every event. I was thinking about something like this:
>>
>> - add a new member to the event structure to hold undelivered events
>>   (named "missed" below)
>> - cache events in the existing member as you're doing currently
>> - on delivery, do something like this:
>>
>>     events = xchg(&e->cache, 0);
>>     missed = e->missed;
>                ^^^
> I think we need to take the lock when we read e->missed; I see the
> following possible issue:
>
> CPU0 gets a copy of the missed events (without taking the lock)
> CPU1 has already delivered the missed events and clears them
> CPU0 delivers missed events that were already delivered by CPU1.

Indeed, I forgot to mention that. It's harmless though, no?

>>     ret = notify->fcn(events | missed, &item);
>>     if (!success || missed) {
>>         spin_lock_bh(&ct->lock);
>>         if (!success)
>>             e->missed |= events;
>>         else
>>             e->missed &= ~missed;
>>         spin_unlock_bh(&ct->lock);
>>     }
>>
>> so if we failed to deliver the events, we add them to the missed
>> events for the next delivery attempt. Once we've delivered the
>> missed events, we clear them from the cache.
>>
>> Now is that really better - I'm not sure myself :) The per-conntrack
>> locking would be an improvement though. What do you think?
> 
> Indeed, I also think that the per-conntrack locking would be an
> improvement for the protocol helpers.
> 
> wrt. the event cache, the missed field can save us from taking the
> lock on every event caching operation, at the cost of consuming a bit
> more memory. I think this is more conservative but safer than my
> approach (no risk of deferring forever by retrying cmpxchg, even if
> that's unlikely). Still, we would need to take the spin lock for the
> event delivery. Let me know what you think.

Would we really have to? The events are incremental anyway, so
it shouldn't matter if we very rarely deliver an event twice.



^ permalink raw reply	[flat|nested] 31+ messages in thread
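
In code terms, the incremental-delivery argument above is that a
listener folds every received event bitmask into its view with a
bitwise OR, so a rare duplicate is idempotent. A hedged sketch of
hypothetical userspace bookkeeping:

	/* hypothetical per-connection state kept by a userspace listener */
	struct ct_view {
		unsigned long seen_events;	/* accumulated event bits */
	};

	static void ct_view_update(struct ct_view *v, unsigned long events)
	{
		/* OR is idempotent: receiving the same bits twice leaves
		 * the accumulated view unchanged */
		v->seen_events |= events;
	}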

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 12:26                         ` Jozsef Kadlecsik
@ 2009-06-10 12:30                           ` Patrick McHardy
  2009-06-10 12:41                             ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10 12:30 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: Pablo Neira Ayuso, Eric Dumazet, netfilter-devel

Jozsef Kadlecsik wrote:
> On Wed, 10 Jun 2009, Patrick McHardy wrote:
> 
>> This is a first attempt to replace some global locks with private
>> per-conntrack locks. On 64 bit, it fits into a hole and doesn't
>> enlarge struct nf_conn.
> [...] 
>> Now is that really better - I'm not sure myself :) The per-conntrack
>> locking would be an improvement though. What do you think?
> 
> I think it's cool and much needed as a step towards per-hash-bucket
> locks too :-). And hash bucket locking can improve performance even
> more. (We only need the ordered locking of the buckets for both
> directions when entries are added/deleted; otherwise a single
> read-lock of the given bucket is enough.)

Great, I'll go over it once more and will commit it to my tree.

I'll post a quick note once it's committed so Pablo can rebase
his patches.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 12:30                           ` Patrick McHardy
@ 2009-06-10 12:41                             ` Patrick McHardy
  0 siblings, 0 replies; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10 12:41 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: Pablo Neira Ayuso, Eric Dumazet, netfilter-devel

Patrick McHardy wrote:
> Jozsef Kadlecsik wrote:
>> On Wed, 10 Jun 2009, Patrick McHardy wrote:
>>
>>> This is a first attempt to replace some global locks with private
>>> per-conntrack locks. On 64 bit, it fits into a hole and doesn't
>>> enlarge struct nf_conn.
>> [...]
>>> Now is that really better - I'm not sure myself :) The per-conntrack
>>> locking would be an improvement though. What do you think?
>>
>> I think it's cool and much needed as a step towards per-hash-bucket
>> locks too :-). And hash bucket locking can improve performance even
>> more. (We only need the ordered locking of the buckets for both
>> directions when entries are added/deleted; otherwise a single
>> read-lock of the given bucket is enough.)
> 
> Great, I'll go over it once more and will commit it to my tree.
> 
> I'll post a quick note once it's committed so Pablo can rebase
> his patches.

I've committed the patch and pushed it out.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 12:27                           ` Patrick McHardy
@ 2009-06-10 12:43                             ` Pablo Neira Ayuso
  2009-06-10 12:56                               ` Patrick McHardy
  0 siblings, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-06-10 12:43 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, netfilter-devel

Patrick McHardy wrote:
> Pablo Neira Ayuso wrote:
>> wrt. the event cache, the missed field can save us from taking the
>> lock on every event caching operation, at the cost of consuming a bit
>> more memory. I think this is more conservative but safer than my
>> approach (no risk of deferring forever by retrying cmpxchg, even if
>> that's unlikely). Still, we would need to take the spin lock for the
>> event delivery. Let me know what you think.
> 
> Would we really have to? The events are incremental anyway, so
> it shouldn't matter if we very rarely deliver an event twice.

No problem. I'll add a comment noting this; we can revisit the issue
later if it becomes a problem.

Please let me know once you are done with your patch so I can rebase mine ;).

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-06-10 12:43                             ` Pablo Neira Ayuso
@ 2009-06-10 12:56                               ` Patrick McHardy
  0 siblings, 0 replies; 31+ messages in thread
From: Patrick McHardy @ 2009-06-10 12:56 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Eric Dumazet, netfilter-devel

Pablo Neira Ayuso wrote:
> Patrick McHardy wrote:
>> Pablo Neira Ayuso wrote:
>>> wrt. the event cache, the missed field can save us from taking the
>>> lock on every event caching operation, at the cost of consuming a bit
>>> more memory. I think this is more conservative but safer than my
>>> approach (no risk of deferring forever by retrying cmpxchg, even if
>>> that's unlikely). Still, we would need to take the spin lock for the
>>> event delivery. Let me know what you think.
>> Would we really have to? The events are incremental anyway, so
>> it shouldn't matter if we very rarely deliver an event twice.
> 
> No problem. I'll add a comment noting this; we can revisit the issue
> later if it becomes a problem.

Agreed on the comment - I have to insist, though, that duplicate
delivery can't cause problems if userspace is using the API
correctly :) We can already have partial deliveries, so userspace
needs to incrementally accumulate the information in any case.

> Please let me know once you are done with your patch so I can rebase mine ;).

I'm done.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-05-04 13:53 ` [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery Pablo Neira Ayuso
@ 2009-05-04 14:02   ` Pablo Neira Ayuso
  0 siblings, 0 replies; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-05-04 14:02 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, kaber

Pablo Neira Ayuso wrote:
> This patch improves ctnetlink event reliability if one broadcast
> listener has set the NETLINK_BROADCAST_ERROR socket option.

I have just detected an inconsistency in the handling of
IPCT_DELIVERY_FAILED; I'll resend this patch on its own.

-- 
"Los honestos son inadaptados sociales" -- Les Luthiers

^ permalink raw reply	[flat|nested] 31+ messages in thread
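
For context, the NETLINK_BROADCAST_ERROR option referenced above is
set by the listening socket. Below is a hedged userspace sketch of a
listener opting into reliable destroy-event delivery; the guarded
defines cover older headers, and this is illustrative, not part of
the patch:

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/netfilter/nfnetlink.h>

	#ifndef SOL_NETLINK
	#define SOL_NETLINK 270
	#endif
	#ifndef NETLINK_BROADCAST_ERROR
	#define NETLINK_BROADCAST_ERROR 4
	#endif

	static int open_reliable_ct_listener(void)
	{
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
		int on = 1, grp = NFNLGRP_CONNTRACK_DESTROY;
		struct sockaddr_nl addr;

		if (fd < 0)
			return -1;
		/* report overruns back to the sender, which is what lets
		 * ctnetlink detect a failed delivery and retry it */
		setsockopt(fd, SOL_NETLINK, NETLINK_BROADCAST_ERROR,
			   &on, sizeof(on));
		setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
			   &grp, sizeof(grp));
		memset(&addr, 0, sizeof(addr));
		addr.nl_family = AF_NETLINK;
		if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
			close(fd);
			return -1;
		}
		return fd;
	}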

* [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery
  2009-05-04 13:53 [PATCH 0/2] conntrack event subsystem updates for 2.6.31 (part 2) Pablo Neira Ayuso
@ 2009-05-04 13:53 ` Pablo Neira Ayuso
  2009-05-04 14:02   ` Pablo Neira Ayuso
  0 siblings, 1 reply; 31+ messages in thread
From: Pablo Neira Ayuso @ 2009-05-04 13:53 UTC (permalink / raw)
  To: netfilter-devel; +Cc: kaber

This patch improves ctnetlink event reliability if one broadcast
listener has set the NETLINK_BROADCAST_ERROR socket option.

The logic is the following: if the event delivery fails, ctnetlink
sets the IPCT_DELIVERY_FAILED event bit and keeps the undelivered
events in the conntrack event cache. Thus, once the next packet
arrives, we trigger another event delivery in nf_conntrack_in(). If
things don't go well on this second try, we keep accumulating the
pending events in the cache, but we try to deliver the current state
as soon as possible. Therefore, we may lose state transitions, but
the userspace process gets back in sync at some point.

In the worst case, if no events were delivered to userspace, we make
sure that at least the destroy events are successfully delivered.
This works because, if ctnetlink fails to deliver the destroy event,
we remove the conntrack entry from the hashes and insert it into the
dying list, which contains inactive entries. Then, the conntrack
timer is re-added with an extra grace timeout of 15 seconds to
trigger the event again (this grace timeout is tunable via /proc).

The maximum number of conntrack entries (active or inactive) is
still limited by nf_conntrack_max. Thus, we may start dropping
packets at some point if we accumulate a lot of inactive conntrack
entries that are waiting to deliver the destroy event to userspace.
During my stress tests, which consisted of setting a very small
buffer of 2048 bytes for conntrackd, setting the
NETLINK_BROADCAST_ERROR socket flag, and generating lots of very
small connections, we hit "table full, dropping packet" at some
point.

For expectations, no changes are introduced in this patch.
Currently, event delivery is only done for new expectations (there
are no events on expectation removal and confirmation) and, apart
from the conntrack command line tool, I don't see any client that
would benefit from reliable expectation event delivery; we would
have to introduce confirm and destroy events first.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---

 include/net/netfilter/nf_conntrack.h        |    2 +
 include/net/netfilter/nf_conntrack_core.h   |    6 +-
 include/net/netfilter/nf_conntrack_ecache.h |   19 ++++--
 include/net/netfilter/nf_conntrack_helper.h |    2 +
 include/net/netns/conntrack.h               |    2 +
 net/netfilter/nf_conntrack_core.c           |   89 ++++++++++++++++++++++-----
 net/netfilter/nf_conntrack_ecache.c         |   24 ++++++-
 net/netfilter/nf_conntrack_helper.c         |   15 +++++
 net/netfilter/nf_conntrack_netlink.c        |   58 ++++++++++++------
 9 files changed, 170 insertions(+), 47 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index f34d596..ceacd5b 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -291,6 +291,8 @@ extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
 extern unsigned int nf_conntrack_htable_size;
 extern unsigned int nf_conntrack_max;
 
+extern void nf_ct_setup_event_timer(struct nf_conn *ct);
+
 #define NF_CT_STAT_INC(net, count)	\
 	(per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++)
 #define NF_CT_STAT_INC_ATOMIC(net, count)		\
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 5a449b4..1be51ba 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -62,8 +62,10 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
 	if (ct && ct != &nf_conntrack_untracked) {
 		if (!nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct))
 			ret = __nf_conntrack_confirm(skb);
-		if (likely(ret == NF_ACCEPT))
-			nf_ct_deliver_cached_events(ct);
+		if (unlikely(ret == NF_DROP))
+			return NF_DROP;
+		if (unlikely(nf_ct_deliver_cached_events(ct) < 0))
+			nf_conntrack_event_cache(IPCT_DELIVERY_FAILED, ct);
 	}
 	return ret;
 }
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 10244f5..c7d7b5e 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -24,6 +24,7 @@ enum ip_conntrack_events
 	IPCT_MARK		= 6,	/* new mark has been set */
 	IPCT_NATSEQADJ		= 7,	/* NAT is doing sequence adjustment */
 	IPCT_SECMARK		= 8,	/* new security mark has been set */
+	IPCT_DELIVERY_FAILED	= 31,	/* previous event delivery has failed */
 };
 
 enum ip_conntrack_expect_events {
@@ -67,7 +68,7 @@ extern struct nf_ct_event_notifier *nf_conntrack_event_cb;
 extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb);
 extern int nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb);
 
-extern void nf_ct_deliver_cached_events(struct nf_conn *ct);
+extern int nf_ct_deliver_cached_events(struct nf_conn *ct);
 
 static inline void
 nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
@@ -90,10 +91,11 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
 	set_bit(event, &e->cache);
 }
 
-static inline void
+static inline int
 nf_conntrack_event_bitmask_report(unsigned int bitmask,
 				  struct nf_conn *ct, u32 pid, int report)
 {
+	int ret = 0;
 	struct net *net = nf_ct_net(ct);
 	struct nf_ct_event_notifier *notify;
 
@@ -111,18 +113,20 @@ nf_conntrack_event_bitmask_report(unsigned int bitmask,
 			.pid	= pid,
 			.report = report
 		};
-		notify->fcn(bitmask, &item);
+		ret = notify->fcn(bitmask, &item);
 	}
 out_unlock:
 	rcu_read_unlock();
+	return ret;
 }
 
-static inline void
+static inline int
 nf_conntrack_event_report(enum ip_conntrack_events event,
 			  struct nf_conn *ct,
 			  u32 pid,
 			  int report)
 {
+	int ret = 0;
 	struct net *net = nf_ct_net(ct);
 	struct nf_ct_event_notifier *notify;
 
@@ -140,16 +144,17 @@ nf_conntrack_event_report(enum ip_conntrack_events event,
 			.pid	= pid,
 			.report = report
 		};
-		notify->fcn((1 << event), &item);
+		ret = notify->fcn((1 << event), &item);
 	}
 out_unlock:
 	rcu_read_unlock();
+	return ret;
 }
 
-static inline void
+static inline int
 nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
 {
-	nf_conntrack_event_report(event, ct, 0, 0);
+	return nf_conntrack_event_report(event, ct, 0, 0);
 }
 
 struct nf_exp_event {
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ee2a4b3..1b70680 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -50,6 +50,8 @@ extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp);
 
 extern int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags);
 
+extern void nf_ct_helper_destroy(struct nf_conn *ct);
+
 static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct)
 {
 	return nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 505a51c..ba1ba0c 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -14,8 +14,10 @@ struct netns_ct {
 	struct hlist_nulls_head	*hash;
 	struct hlist_head	*expect_hash;
 	struct hlist_nulls_head	unconfirmed;
+	struct hlist_nulls_head	dying;
 	struct ip_conntrack_stat *stat;
 	int			sysctl_events;
+	unsigned int		sysctl_events_retry_timeout;
 	int			sysctl_acct;
 	int			sysctl_checksum;
 	unsigned int		sysctl_log_invalid; /* Log invalid packets */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e8905a9..b314541 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -182,10 +182,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
-	if (!test_bit(IPS_DYING_BIT, &ct->status))
-		nf_conntrack_event(IPCT_DESTROY, ct);
-	set_bit(IPS_DYING_BIT, &ct->status);
-
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to nf_conntrack_lock!!! -HW */
@@ -219,20 +215,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	nf_conntrack_free(ct);
 }
 
-static void death_by_timeout(unsigned long ul_conntrack)
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
-	struct nf_conn *ct = (void *)ul_conntrack;
 	struct net *net = nf_ct_net(ct);
-	struct nf_conn_help *help = nfct_help(ct);
-	struct nf_conntrack_helper *helper;
-
-	if (help) {
-		rcu_read_lock();
-		helper = rcu_dereference(help->helper);
-		if (helper && helper->destroy)
-			helper->destroy(ct);
-		rcu_read_unlock();
-	}
 
 	spin_lock_bh(&nf_conntrack_lock);
 	/* Inside lock so preempt is disabled on module removal path.
@@ -240,6 +225,60 @@ static void death_by_timeout(unsigned long ul_conntrack)
 	NF_CT_STAT_INC(net, delete_list);
 	clean_from_lists(ct);
 	spin_unlock_bh(&nf_conntrack_lock);
+}
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;
+	struct net *net = nf_ct_net(ct);
+
+	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+		/* bad luck, let's retry again */
+		ct->timeout.expires =
+			jiffies + net->ct.sysctl_events_retry_timeout;
+		add_timer(&ct->timeout);
+		return;
+	}
+	/* we've got the event delivered, now it's dying */
+	set_bit(IPS_DYING_BIT, &ct->status);
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+	spin_unlock_bh(&nf_conntrack_lock);
+	nf_ct_helper_destroy(ct);
+	nf_ct_put(ct);
+}
+
+void nf_ct_setup_event_timer(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+
+	nf_ct_delete_from_lists(ct);
+	/* add this conntrack to the dying list */
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+				 &net->ct.dying);
+	/* set a new timer to retry event delivery */
+	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+	ct->timeout.expires =
+		jiffies + net->ct.sysctl_events_retry_timeout;
+	add_timer(&ct->timeout);
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_setup_event_timer);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;
+
+	if (!test_bit(IPS_DYING_BIT, &ct->status) && 
+	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+		/* destroy event was not delivered */
+		nf_ct_setup_event_timer(ct);
+		return;
+	}
+	set_bit(IPS_DYING_BIT, &ct->status);
+	nf_ct_helper_destroy(ct);
+	nf_ct_delete_from_lists(ct);
 	nf_ct_put(ct);
 }
 
@@ -1030,6 +1069,22 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
+static void nf_ct_release_dying_list(void)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct hlist_nulls_node *n;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		/* never fails to remove them, no listeners at this point */
+		if (del_timer(&ct->timeout))
+			ct->timeout.function((unsigned long)ct);
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+
 static void nf_conntrack_cleanup_init_net(void)
 {
 	nf_conntrack_helper_fini();
@@ -1041,6 +1096,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
 {
  i_see_dead_people:
 	nf_ct_iterate_cleanup(net, kill_all, NULL);
+	nf_ct_release_dying_list();
 	if (atomic_read(&net->ct.count) != 0) {
 		schedule();
 		goto i_see_dead_people;
@@ -1222,6 +1278,7 @@ static int nf_conntrack_init_net(struct net *net)
 
 	atomic_set(&net->ct.count, 0);
 	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0);
+	INIT_HLIST_NULLS_HEAD(&net->ct.dying, 0);
 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
 	if (!net->ct.stat) {
 		ret = -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 04dde1a..e97f6dc 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -33,10 +33,11 @@ EXPORT_SYMBOL_GPL(nf_expect_event_cb);
 
 /* deliver cached events and clear cache entry - must be called with locally
  * disabled softirqs */
-void nf_ct_deliver_cached_events(struct nf_conn *ct)
+int nf_ct_deliver_cached_events(struct nf_conn *ct)
 {
 	struct nf_ct_event_notifier *notify;
 	struct nf_conntrack_ecache *e;
+	int ret = 0, delivered = 0;
 
 	rcu_read_lock_bh();
 	notify = rcu_dereference(nf_conntrack_event_cb);
@@ -54,12 +55,16 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
 			.report	= 0
 		};
 
-		notify->fcn(e->cache, &item);
+		ret = notify->fcn(e->cache, &item);
+		if (ret == 0)
+			delivered = 1;
 	}
-	xchg(&e->cache, 0);
+	if (delivered)
+		xchg(&e->cache, 0);
 
 out_unlock:
 	rcu_read_unlock_bh();
+	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 
@@ -154,9 +159,12 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
 #endif
 
 static int nf_ct_events_switch __read_mostly = NF_CT_EVENTS_DEFAULT;
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
 
 module_param_named(event, nf_ct_events_switch, bool, 0644);
 MODULE_PARM_DESC(event, "Enable connection tracking event delivery");
+module_param_named(retry_timeout, nf_ct_events_retry_timeout, int, 0644);
+MODULE_PARM_DESC(retry_timeout, "Event delivery retry timeout");
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table event_sysctl_table[] = {
@@ -168,6 +176,14 @@ static struct ctl_table event_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nf_conntrack_events_retry_timeout",
+		.data		= &init_net.ct.sysctl_events_retry_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{}
 };
 #endif /* CONFIG_SYSCTL */
@@ -189,6 +205,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net)
 		goto out;
 
 	table[0].data = &net->ct.sysctl_events;
+	table[1].data = &net->ct.sysctl_events_retry_timeout;
 
 	net->ct.event_sysctl_header =
 		register_net_sysctl_table(net,
@@ -229,6 +246,7 @@ int nf_conntrack_ecache_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_events = nf_ct_events_switch;
+	net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
 
 	if (net_eq(net, &init_net)) {
 		ret = nf_ct_extend_register(&event_extend);
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 0fa5a42..5fc1fe7 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -136,6 +136,21 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
 	return 0;
 }
 
+void nf_ct_helper_destroy(struct nf_conn *ct)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_helper *helper;
+
+	if (help) {
+		rcu_read_lock();
+		helper = rcu_dereference(help->helper);
+		if (helper && helper->destroy)
+			helper->destroy(ct);
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_destroy);
+
 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 {
 	unsigned int h = helper_hash(&me->tuple);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index ee9e1bc..c7d2a65 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -487,6 +487,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	unsigned int type;
 	sk_buff_data_t b;
 	unsigned int flags = 0, group;
+	int err;
 
 	/* ignore our fake conntrack entry */
 	if (ct == &nf_conntrack_untracked)
@@ -583,7 +584,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	rcu_read_unlock();
 
 	nlh->nlmsg_len = skb->tail - b;
-	nfnetlink_send(skb, item->pid, group, item->report);
+	err = nfnetlink_send(skb, item->pid, group, item->report);
+	if ((err == -ENOBUFS) || (err == -EAGAIN))
+		return -ENOBUFS;
+
 	return 0;
 
 nla_put_failure:
@@ -591,7 +595,7 @@ nla_put_failure:
 nlmsg_failure:
 	kfree_skb(skb);
 errout:
-	nfnetlink_set_err(0, group, -ENOBUFS);
+	nfnetlink_set_err(item->pid, group, -ENOBUFS);
 	return 0;
 }
 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
@@ -823,10 +827,14 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		}
 	}
 
-	nf_conntrack_event_report(IPCT_DESTROY,
-				  ct,
-				  NETLINK_CB(skb).pid,
-				  nlmsg_report(nlh));
+	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+				      NETLINK_CB(skb).pid,
+				      nlmsg_report(nlh)) < 0) {
+		/* we failed to report the event, try later */
+		nf_ct_setup_event_timer(ct);
+		nf_ct_put(ct);
+		return 0;
+	}
 
 	/* death_by_timeout would report the event again */
 	set_bit(IPS_DYING_BIT, &ct->status);
@@ -1184,7 +1192,7 @@ ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[])
 	return 0;
 }
 
-static inline void
+static inline int
 ctnetlink_event_report(struct nf_conn *ct, u32 pid, int report)
 {
 	unsigned int events = 0;
@@ -1194,12 +1202,12 @@ ctnetlink_event_report(struct nf_conn *ct, u32 pid, int report)
 	else
 		events |= (1 << IPCT_NEW);
 
-	nf_conntrack_event_bitmask_report((1 << IPCT_STATUS) |
-					  (1 << IPCT_HELPER) |
-					  (1 << IPCT_PROTOINFO) |
-					  (1 << IPCT_NATSEQADJ) |
-					  (1 << IPCT_MARK) | events,
-					  ct, pid, report);
+	return nf_conntrack_event_bitmask_report((1 << IPCT_STATUS) |
+						 (1 << IPCT_HELPER) |
+						 (1 << IPCT_PROTOINFO) |
+						 (1 << IPCT_NATSEQADJ) |
+						 (1 << IPCT_MARK) | events,
+						 ct, pid, report);
 }
 
 static struct nf_conn *
@@ -1378,9 +1386,16 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 			err = 0;
 			nf_conntrack_get(&ct->ct_general);
 			spin_unlock_bh(&nf_conntrack_lock);
-			ctnetlink_event_report(ct,
-					       NETLINK_CB(skb).pid,
-					       nlmsg_report(nlh));
+			if (ctnetlink_event_report(ct,
+						   NETLINK_CB(skb).pid,
+						   nlmsg_report(nlh)) < 0) {
+				/* first packet matching this entry will
+				 * trigger a new event delivery. */
+				nf_conntrack_event_cache(IPCT_DELIVERY_FAILED,
+							 ct);
+				nf_ct_put(ct);
+				return 0;
+			}
 			nf_ct_put(ct);
 		} else
 			spin_unlock_bh(&nf_conntrack_lock);
@@ -1399,9 +1414,14 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		if (err == 0) {
 			nf_conntrack_get(&ct->ct_general);
 			spin_unlock_bh(&nf_conntrack_lock);
-			ctnetlink_event_report(ct,
-					       NETLINK_CB(skb).pid,
-					       nlmsg_report(nlh));
+			if (ctnetlink_event_report(ct,
+						   NETLINK_CB(skb).pid,
+						   nlmsg_report(nlh)) < 0) {
+				nf_conntrack_event_cache(IPCT_DELIVERY_FAILED,
+							 ct);
+				nf_ct_put(ct);
+				return 0;
+			}
 			nf_ct_put(ct);
 		} else
 			spin_unlock_bh(&nf_conntrack_lock);


^ permalink raw reply related	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2009-06-10 12:56 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-04 11:07 [PATCH 0/2] Pablo Neira Ayuso
2009-06-04 11:08 ` [PATCH 1/2] netfilter: conntrack: move event cache to conntrack extension infrastructure Pablo Neira Ayuso
2009-06-04 12:16   ` Pablo Neira Ayuso
2009-06-05 11:04   ` Patrick McHardy
2009-06-05 13:06     ` Pablo Neira Ayuso
2009-06-05 14:13       ` Patrick McHardy
2009-06-06  6:24         ` Pablo Neira Ayuso
2009-06-04 11:08 ` [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery Pablo Neira Ayuso
2009-06-05 14:37   ` Patrick McHardy
2009-06-06  6:34     ` Pablo Neira Ayuso
2009-06-08 13:49       ` Patrick McHardy
2009-06-09 22:36     ` Pablo Neira Ayuso
2009-06-09 22:43       ` Patrick McHardy
2009-06-09 22:45         ` Patrick McHardy
2009-06-09 22:58           ` Pablo Neira Ayuso
2009-06-10  1:18             ` Eric Dumazet
2009-06-10  9:55               ` Patrick McHardy
2009-06-10 10:36                 ` Pablo Neira Ayuso
2009-06-10 10:55                   ` Patrick McHardy
2009-06-10 11:01                     ` Patrick McHardy
2009-06-10 11:40                       ` Patrick McHardy
2009-06-10 12:22                         ` Pablo Neira Ayuso
2009-06-10 12:27                           ` Patrick McHardy
2009-06-10 12:43                             ` Pablo Neira Ayuso
2009-06-10 12:56                               ` Patrick McHardy
2009-06-10 12:26                         ` Jozsef Kadlecsik
2009-06-10 12:30                           ` Patrick McHardy
2009-06-10 12:41                             ` Patrick McHardy
2009-06-04 11:17 ` [PATCH 0/2] reliable per-conntrack event cache Pablo Neira Ayuso
  -- strict thread matches above, loose matches on Subject: below --
2009-05-04 13:53 [PATCH 0/2] conntrack event subsystem updates for 2.6.31 (part 2) Pablo Neira Ayuso
2009-05-04 13:53 ` [PATCH 2/2] netfilter: conntrack: optional reliable conntrack event delivery Pablo Neira Ayuso
2009-05-04 14:02   ` Pablo Neira Ayuso
