All of lore.kernel.org
 help / color / mirror / Atom feed
From: SaRaVanAn <saravanan.nagarajan87@gmail.com>
To: netfilter-devel@vger.kernel.org
Subject: Regarding Nat-reservation and tproxy feature in latest netfilter code
Date: Thu, 18 Jul 2013 12:44:13 +0530	[thread overview]
Message-ID: <CA+86yMig+_kNGno8jxrumLd5U+pfQeQvsJLDnP54Sj-uj+oMqQ@mail.gmail.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 627 bytes --]

Hi Team,
   We are using NAT reservation and Tproxy patch for Netfilter code in
2.6.x Kernel series. Recently we have migrated from 2.6.x to 3.x
kernel series. I want to clarify whether the NAT reservation and Tproxy
features are available in the netfilter code of the 3.x kernel series.
If they are not available, can NAT reservation & Tproxy functionality be
achieved in the 3.x kernel series with any recent netfilter patch?

I could not find any patch related to NAT reservation for the latest netfilter code.
Please help us in this regard.

I have also attached the old patches for your reference.

Thanks in advance,
Regards,
Saravanan N

[-- Attachment #2: 01-nat_reservations.patch --]
[-- Type: application/octet-stream, Size: 25530 bytes --]

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 51dbec1..440e444 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -164,6 +164,11 @@ #ifdef CONFIG_IP_NF_NAT_NEEDED
 	/* Direction relative to the master connection. */
 	enum ip_conntrack_dir dir;
 #endif
+
+#ifdef CONFIG_IP_NF_NAT_NRES
+	/* List of registered reservations */
+	struct list_head reserved_list;
+#endif
 };
 
 #define IP_CT_EXPECT_PERMANENT	0x1
@@ -239,6 +244,10 @@ extern void ip_conntrack_tcp_update(stru
 
 /* Call me when a conntrack is destroyed. */
 extern void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack);
+#ifdef CONFIG_IP_NF_NAT_NRES
+/* Call when an expectation is destroyed. */
+extern void (*ip_conntrack_expect_destroyed)(struct ip_conntrack_expect *exp);
+#endif
 
 /* Fake conntrack entry for untracked connections */
 extern struct ip_conntrack ip_conntrack_untracked;
diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h
index e9f5ed1..0337a85 100644
--- a/include/linux/netfilter_ipv4/ip_nat.h
+++ b/include/linux/netfilter_ipv4/ip_nat.h
@@ -17,6 +17,10 @@ #define HOOK2MANIP(hooknum) ((hooknum) !
 #define IP_NAT_RANGE_MAP_IPS 1
 #define IP_NAT_RANGE_PROTO_SPECIFIED 2
 
+#ifdef CONFIG_IP_NF_NAT_NRES
+#define IP_NAT_RANGE_USE_RESERVED 8
+#endif
+
 /* NAT sequence number modifications */
 struct ip_nat_seq {
 	/* position of the last TCP sequence number 
@@ -63,6 +67,18 @@ struct ip_nat_info
 
 struct ip_conntrack;
 
+#ifdef CONFIG_IP_NF_NAT_NRES
+/* Structure to store reserved manips */
+struct ip_nat_reserved {
+	struct list_head hash;			/* Hash chain */
+	struct list_head exp;			/* Per-expectation list */
+	atomic_t use;				/* Reference count */
+	struct ip_conntrack_manip manip;	/* Reserved manip */
+	struct ip_conntrack_manip peer;		/* Peer (optional) */
+	u_int16_t proto;			/* Protocol number of reserved manip */
+};
+#endif
+
 /* Set up the info structure to map into this range. */
 extern unsigned int ip_nat_setup_info(struct ip_conntrack *conntrack,
 				      const struct ip_nat_range *range,
@@ -70,12 +86,45 @@ extern unsigned int ip_nat_setup_info(st
 
 /* Is this tuple already taken? (not by us)*/
 extern int ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
-			     const struct ip_conntrack *ignored_conntrack);
+			     const struct ip_conntrack *ignored_conntrack,
+                             const enum ip_nat_manip_type maniptype,
+                             const unsigned int flags);
 
 /* Calculate relative checksum. */
 extern u_int16_t ip_nat_cheat_check(u_int32_t oldvalinv,
 				    u_int32_t newval,
 				    u_int16_t oldcheck);
+
+#ifdef CONFIG_IP_NF_NAT_NRES
+struct ip_conntrack_expect;
+
+/* NAT port reservation: allocate and hash a new entry */
+extern struct ip_nat_reserved *__ip_nat_reserved_new_hash(const struct ip_conntrack_manip *manip,
+					 const u_int16_t proto, const struct ip_conntrack_manip *peer);
+
+/* NAT port reservation: unhash an entry */
+extern struct ip_nat_reserved *__ip_nat_reserved_unhash(const struct ip_conntrack_manip *manip,
+				       const u_int16_t proto, const struct ip_conntrack_manip *peer);
+
+/* NAT port reservation: free a reservation */
+extern void __ip_nat_reserved_free(struct ip_nat_reserved *res);
+
+/* NAT port reservation: register a new reservation */
+extern int ip_nat_reserved_register(struct ip_conntrack_expect *exp,
+				    const struct ip_conntrack_manip *manip,
+				    const u_int16_t proto,
+				    const struct ip_conntrack_manip *peer);
+
+/* NAT port reservation: unregister a reservation */
+extern int ip_nat_reserved_unregister(struct ip_conntrack_expect *exp,
+				      const struct ip_conntrack_manip *manip,
+				      const u_int16_t proto,
+				      const struct ip_conntrack_manip *peer);
+
+/* NAT port reservation: unregister all reservations for a given expectation */
+extern void ip_nat_reserved_unregister_all(struct ip_conntrack_expect *exp);
+#endif /*CONFIG_IP_NF_NAT_NRES*/
+
 #else  /* !__KERNEL__: iptables wants this to compile. */
 #define ip_nat_multi_range ip_nat_multi_range_compat
 #endif /*__KERNEL__*/
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ef0b5aa..ac751c7 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -474,6 +474,16 @@ config IP_NF_TARGET_SAME
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_NAT_NRES
+	bool "NAT reservations support"
+	depends on IP_NF_NAT
+	help
+	  This option enables support for NAT reservations. This makes
+	  transparent proxying more reliable, but unneeded if you don't
+	  need TProxy support.
+
+	  If unsure, say 'N'.
+
 config IP_NF_NAT_SNMP_BASIC
 	tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && IP_NF_NAT
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index aa45917..f9349bf 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -63,6 +63,9 @@ DEFINE_RWLOCK(ip_conntrack_lock);
 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
 
 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
+#ifdef CONFIG_IP_NF_NAT_NRES
+void (*ip_conntrack_expect_destroyed)(struct ip_conntrack_expect *expect) = NULL;
+#endif
 LIST_HEAD(ip_conntrack_expect_list);
 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
 static LIST_HEAD(helpers);
@@ -207,6 +210,12 @@ void ip_ct_unlink_expect(struct ip_connt
 	list_del(&exp->list);
 	CONNTRACK_STAT_INC(expect_delete);
 	exp->master->expecting--;
+
+#ifdef CONFIG_IP_NF_NAT_NRES
+	if (ip_conntrack_expect_destroyed)
+		ip_conntrack_expect_destroyed(exp);
+#endif
+
 	ip_conntrack_expect_put(exp);
 }
 
@@ -952,6 +961,9 @@ struct ip_conntrack_expect *ip_conntrack
 	}
 	new->master = me;
 	atomic_set(&new->use, 1);
+#ifdef CONFIG_IP_NF_NAT_NRES
+	INIT_LIST_HEAD(&new->reserved_list);
+#endif
 	return new;
 }
 
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 7a9fa04..c050399 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -930,6 +930,9 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_expect_
 EXPORT_SYMBOL_GPL(ip_conntrack_expect_find);
 EXPORT_SYMBOL(ip_conntrack_expect_related);
 EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+#ifdef CONFIG_IP_NF_NAT_NRES
+EXPORT_SYMBOL(ip_conntrack_expect_destroyed);
+#endif
 EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
 EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
 
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 1741d55..d40df7e 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -14,6 +14,7 @@ #include <linux/timer.h>
 #include <linux/skbuff.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include <net/checksum.h>
 #include <net/icmp.h>
 #include <net/ip.h>
@@ -46,6 +47,12 @@ DEFINE_RWLOCK(ip_nat_lock);
 /* Calculated at init based on memory size */
 static unsigned int ip_nat_htable_size;
 
+#ifdef CONFIG_IP_NF_NAT_NRES
+static kmem_cache_t *ip_nat_reserved_cachep;
+static atomic_t ip_nat_reserved_count;
+static struct list_head *natreserved;
+#endif
+
 static struct list_head *bysource;
 
 #define MAX_IP_NAT_PROTO 256
@@ -90,6 +97,19 @@ hash_by_src(const struct ip_conntrack_tu
 			    tuple->dst.protonum, 0) % ip_nat_htable_size;
 }
 
+#ifdef CONFIG_IP_NF_NAT_NRES
+static inline unsigned int
+hash_nat_reserved(const struct ip_conntrack_manip *foreign,
+		  const struct ip_conntrack_manip *peer,
+		  const u_int16_t proto)
+{
+	return jhash_3words(foreign->ip,
+			    (proto << 16) + foreign->u.all,
+			    (peer ? (peer->ip + peer->u.all) : 0),
+			    0) % ip_nat_htable_size;
+}
+#endif
+
 /* Noone using conntrack by the time this called. */
 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
 {
@@ -113,10 +133,362 @@ ip_nat_cheat_check(u_int32_t oldvalinv,
 }
 EXPORT_SYMBOL(ip_nat_cheat_check);
 
+#ifdef CONFIG_IP_NF_NAT_NRES
+static inline int
+reserved_manip_cmp(const struct ip_nat_reserved *i,
+		   const struct ip_conntrack_manip *manip,
+		   const u_int16_t proto)
+{
+	DEBUGP("reserved_manip_cmp: manip proto %u %u.%u.%u.%u:%u, "
+	       "reservation proto %u %u.%u.%u.%u:%u\n peer %u.%u.%u.%u:%u\n",
+			proto, NIPQUAD(manip->ip), ntohs(manip->u.all),
+			i->proto, NIPQUAD(i->manip.ip), ntohs(i->manip.u.all),
+			NIPQUAD(i->peer.ip), ntohs(i->peer.u.all));
+	return (i->proto == proto &&
+		i->manip.ip == manip->ip && i->manip.u.all == manip->u.all);
+}
+
+static inline int
+reserved_manip_cmp_peer(const struct ip_nat_reserved *i,
+			const struct ip_conntrack_manip *manip,
+			const u_int16_t proto,
+			const struct ip_conntrack_manip *peer)
+{
+	DEBUGP("reserved_manip_cmp_peer: manip proto %u %u.%u.%u.%u:%u peer %u.%u.%u.%u:%u, "
+	       "reservation proto %u %u.%u.%u.%u:%u peer %u.%u.%u.%u:%u\n",
+	       proto, NIPQUAD(manip->ip), ntohs(manip->u.all),
+	       NIPQUAD(peer->ip), ntohs(peer->u.all),
+	       i->proto, NIPQUAD(i->manip.ip), ntohs(i->manip.u.all),
+	       NIPQUAD(i->peer.ip), ntohs(i->peer.u.all));
+
+	return (i->proto == proto &&
+		i->manip.ip == manip->ip && i->manip.u.all == manip->u.all &&
+		((i->peer.ip == 0) || (i->peer.ip == peer->ip && i->peer.u.all == peer->u.all)));
+}
+
+static inline int
+reserved_manip_cmp_peer_exact(const struct ip_nat_reserved *i,
+			      const struct ip_conntrack_manip *manip,
+			      const u_int16_t proto,
+			      const struct ip_conntrack_manip *peer)
+{
+	DEBUGP("reserved_manip_cmp_peer_exact: manip proto %u %u.%u.%u.%u:%u peer %u.%u.%u.%u:%u, "
+	       "reservation proto %u %u.%u.%u.%u:%u peer %u.%u.%u.%u:%u\n",
+	       proto, NIPQUAD(manip->ip), ntohs(manip->u.all),
+	       NIPQUAD(peer->ip), ntohs(peer->u.all),
+	       i->proto, NIPQUAD(i->manip.ip), ntohs(i->manip.u.all),
+	       NIPQUAD(i->peer.ip), ntohs(i->peer.u.all));
+
+	return (i->proto == proto &&
+		i->manip.ip == manip->ip && i->manip.u.all == manip->u.all &&
+		i->peer.ip == peer->ip && i->peer.u.all == peer->u.all);
+}
+
+/* Is this manip reserved?
+ * exact means full peer match is required, used for reservation deletion */
+static struct ip_nat_reserved *
+__ip_nat_reserved_find_manip(const struct ip_conntrack_manip *manip,
+			     const u_int16_t proto,
+			     const struct ip_conntrack_manip *peer,
+			     const int exact)
+{
+	struct ip_nat_reserved *i;
+	unsigned int h = hash_nat_reserved(manip, peer, proto);
+
+	DEBUGP("__ip_nat_reserved_find_manip: find proto %u %u.%u.%u.%u:%u\n",
+			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
+
+	if (peer) {
+		if (exact)
+			i = LIST_FIND(&natreserved[h], reserved_manip_cmp_peer_exact,
+				      struct ip_nat_reserved *, manip, proto, peer);
+		else
+			i = LIST_FIND(&natreserved[h], reserved_manip_cmp_peer,
+				      struct ip_nat_reserved *, manip, proto, peer);
+	} else
+		i = LIST_FIND(&natreserved[h], reserved_manip_cmp,
+			      struct ip_nat_reserved *, manip, proto);
+
+	return i;
+}
+
+/* Is this tuple clashing with a reserved manip? */
+static struct ip_nat_reserved *
+__ip_nat_reserved_find_tuple(const struct ip_conntrack_tuple *tuple,
+			     enum ip_nat_manip_type maniptype)
+{
+	struct ip_conntrack_manip m = {.ip = tuple->dst.ip, .u = {.all = tuple->dst.u.all}};
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		DEBUGP("__ip_nat_reserved_find_tuple: IP_NAT_MANIP_SRC search\n");
+		return __ip_nat_reserved_find_manip(&tuple->src, tuple->dst.protonum, &m, 0);
+	} else {
+		DEBUGP("__ip_nat_reserved_find_tuple: IP_NAT_MANIP_DST search\n");
+		return __ip_nat_reserved_find_manip(&m, tuple->dst.protonum, &tuple->src, 0);
+	}
+}
+
+static inline int
+clashing_ct_cmp(const struct ip_conntrack_tuple_hash *i, const void *data)
+{
+	const struct ip_conntrack_manip *m = (struct ip_conntrack_manip *) data;
+	const struct ip_conntrack_tuple *t = &i->tuple;
+
+	/* FIXME: every connection has two entries, we should check only the REPLY direction */
+
+	DEBUGP("clashing_ct_cmp: manip %u.%u.%u.%u:%u ct reply src %u.%u.%u.%u:%u dst %u.%u.%u.%u:%u\n",
+			NIPQUAD(m->ip), ntohs(m->u.all), NIPQUAD(t->src.ip), ntohs(t->src.u.all),
+			NIPQUAD(t->dst.ip), ntohs(t->dst.u.all));
+	return (((t->src.ip == m->ip) && (t->src.u.all == m->u.all)) ||
+		((t->dst.ip == m->ip) && (t->dst.u.all == m->u.all)));
+}
+
+/* Create a new reservation */
+struct ip_nat_reserved *
+__ip_nat_reserved_new_hash(const struct ip_conntrack_manip *manip,
+			   const u_int16_t proto,
+			   const struct ip_conntrack_manip *peer)
+{
+	struct ip_nat_reserved *res;
+	unsigned int h;
+
+	DEBUGP("__ip_nat_reserved_new_hash: manip proto %u %u.%u.%u.%u:%u\n",
+			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
+
+	/* check if it's already reserved */
+	if (__ip_nat_reserved_find_manip(manip, proto, peer, 1)) {
+		DEBUGP("__ip_nat_reserved_new_hash: already reserved\n");
+		return NULL;
+	}
+
+	/* FIXME: check if a clashing connection exists... This is problematic,
+	 * since the final decision in ip_nat_used_tuple() is based on a full
+	 * tuple, but we only have a manip... =(:< */
+
+	/* Current solution: we provide two methods for checking:
+	 *   - Strong check: in this case, the conntrack table is scanned if an
+	 *     already existing connection uses the manip in its REPLY direction.
+	 *     if such a conntrack entry is found, the mapping fails. This check is
+	 *     extremely pessimistic, since it fails to register reservations which could
+	 *     happily coexist with current conntracks if the other side of the tuple is
+	 *     different...
+	 *   - Exact check: if the callee provides a peer manip, then an exact lookup
+	 *     can be made in the conntrack hash. This is a more fine-grained check.
+	 */
+
+	if (peer) {
+		/* Exact check */
+		struct ip_conntrack_tuple t = {.src = *peer,
+					       .dst = {.protonum = proto,
+						       .ip = manip->ip,
+						       .u = {.all = manip->u.all}}};
+
+		if (ip_conntrack_tuple_taken(&t, NULL)) {
+			DEBUGP("__ip_nat_reserved_new_hash: manip clashes with an already existing connection\n");
+			return NULL;
+		}
+	} else {
+		/* Strong check: we have only a manip, unfortunately we scan the whole conntrack
+		 * hash for possible clashing connections... */
+		struct ip_conntrack_tuple_hash *h = NULL;
+		unsigned int i;
+
+		read_lock_bh(&ip_conntrack_lock);
+		for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
+			h = LIST_FIND(&ip_conntrack_hash[i], clashing_ct_cmp,
+				      struct ip_conntrack_tuple_hash *, manip);
+			if (h)
+				break;
+		}
+		read_unlock_bh(&ip_conntrack_lock);
+		if (h) {
+			DEBUGP("__ip_nat_reserved_new_hash: manip clashes with an already existing connection\n");
+			return NULL;
+		}
+	}
+
+	/* else allocate a new structure */
+	res = kmem_cache_alloc(ip_nat_reserved_cachep, GFP_ATOMIC);
+	if (!res)
+		return NULL;
+
+	memset(res, 0, sizeof(*res));
+	res->proto = proto;
+	res->manip = *manip;
+	if (peer)
+		res->peer = *peer;
+
+	/* put it into the hash */
+	h = hash_nat_reserved(manip, peer, proto);
+	atomic_inc(&ip_nat_reserved_count);
+	list_prepend(&natreserved[h], &res->hash);
+	DEBUGP("__ip_nat_reserved_new_hash: hashed manip proto %u %u.%u.%u.%u:%u\n",
+			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(__ip_nat_reserved_new_hash);
+
+/* Register a new reservation */
+static int
+__ip_nat_reserved_register(struct ip_conntrack_expect *expect,
+			   const struct ip_conntrack_manip *manip,
+			   const u_int16_t proto,
+			   const struct ip_conntrack_manip *peer)
+{
+	struct ip_nat_reserved *res;
+
+	DEBUGP("__ip_nat_reserved_register: registering proto %u %u.%u.%u.%u:%u\n",
+			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
+
+	/* allocate and put into the hash */
+	res = __ip_nat_reserved_new_hash(manip, proto, peer);
+	if (!res)
+		return 0;
+
+	/* append to the per-expectation reserved list */
+	list_append(&expect->reserved_list, &res->exp);
+
+	return 1;
+}
+
+int
+ip_nat_reserved_register(struct ip_conntrack_expect *expect,
+			 const struct ip_conntrack_manip *manip,
+			 const u_int16_t proto,
+			 const struct ip_conntrack_manip *peer)
+{
+	int ret;
+
+	write_lock_bh(&ip_nat_lock);
+
+	ret = __ip_nat_reserved_register(expect, manip, proto, peer);
+
+	write_unlock_bh(&ip_nat_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_nat_reserved_register);
+
+/* Unhash a reservation */
+struct ip_nat_reserved *
+__ip_nat_reserved_unhash(const struct ip_conntrack_manip *manip,
+		         const u_int16_t proto,
+			 const struct ip_conntrack_manip *peer)
+{
+	struct ip_nat_reserved *res;
+
+	DEBUGP("__ip_nat_reserved_unhash: unhashing proto %u %u.%u.%u.%u:%u\n",
+			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
+
+	/* check if it's really reserved */
+	if (!(res = __ip_nat_reserved_find_manip(manip, proto, peer, 1))) {
+		DEBUGP("__ip_nat_reserved_unhash: trying to unreg a nonexisting reservation\n");
+		return NULL;
+	}
+
+	/* delete from the hash table */
+	list_del(&res->hash);
+
+	atomic_dec(&ip_nat_reserved_count);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(__ip_nat_reserved_unhash);
+
+/* Return a reservation structure into the slab cache */
+void
+__ip_nat_reserved_free(struct ip_nat_reserved *res)
+{
+	kmem_cache_free(ip_nat_reserved_cachep, res);
+}
+EXPORT_SYMBOL_GPL(__ip_nat_reserved_free);
+
+/* Unregister a reservation */
+static int
+__ip_nat_reserved_unregister(struct ip_conntrack_expect *expect,
+			     const struct ip_conntrack_manip *manip,
+			     const u_int16_t proto,
+			     const struct ip_conntrack_manip *peer)
+{
+	struct ip_nat_reserved *res;
+
+	DEBUGP("__ip_nat_reserved_unregister: unregistering proto %u %u.%u.%u.%u:%u\n",
+			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
+
+	/* look up and unhash */
+	res = __ip_nat_reserved_unhash(manip, proto, peer);
+	if (!res)
+		return 0;
+
+	/* delete from the per-expectation list */
+	list_del(&res->exp);
+
+	/* free the structure */
+	__ip_nat_reserved_free(res);
+
+	return 1;
+}
+
+int
+ip_nat_reserved_unregister(struct ip_conntrack_expect *expect,
+			   const struct ip_conntrack_manip *manip,
+			   const u_int16_t proto,
+			   const struct ip_conntrack_manip *peer)
+{
+	int ret;
+
+	write_lock_bh(&ip_nat_lock);
+
+	ret = __ip_nat_reserved_unregister(expect, manip, proto, peer);
+
+	write_unlock_bh(&ip_nat_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_nat_reserved_unregister);
+
+/* Unregister all reservations for a given expectation */
+void
+ip_nat_reserved_unregister_all(struct ip_conntrack_expect *expect)
+{
+	struct list_head *i;
+	struct ip_nat_reserved *res;
+
+	DEBUGP("ip_nat_reserved_unregister_all: deleting all reservations for expectation %p\n",
+			expect);
+
+	write_lock_bh(&ip_nat_lock);
+
+	i = expect->reserved_list.next;
+	while (i != &expect->reserved_list) {
+		res = list_entry(i, struct ip_nat_reserved, exp);
+		i = i->next;
+
+		/* clear from lists */
+		list_del(&res->hash);
+		list_del(&res->exp);
+
+		kmem_cache_free(ip_nat_reserved_cachep, res);
+	}
+
+	write_unlock_bh(&ip_nat_lock);
+}
+EXPORT_SYMBOL_GPL(ip_nat_reserved_unregister_all);
+
+static void
+ip_nat_reserved_cleanup_expect(struct ip_conntrack_expect *expect)
+{
+	ip_nat_reserved_unregister_all(expect);
+}
+#endif /* CONFIG_IP_NF_NAT_NRES */
+
 /* Is this tuple already taken? (not by us) */
 int
 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
-		  const struct ip_conntrack *ignored_conntrack)
+		  const struct ip_conntrack *ignored_conntrack,
+		  const enum ip_nat_manip_type maniptype,
+		  const unsigned int flags)
 {
 	/* Conntrack tracking doesn't keep track of outgoing tuples; only
 	   incoming ones.  NAT means they don't have a fixed mapping,
@@ -124,6 +496,20 @@ ip_nat_used_tuple(const struct ip_conntr
 
 	   We could keep a separate hash if this proves too slow. */
 	struct ip_conntrack_tuple reply;
+#ifdef CONFIG_IP_NF_NAT_NRES
+	struct ip_nat_reserved *res;
+
+	/* check if the tuple is reserved if there are any reservations */
+	if (atomic_read(&ip_nat_reserved_count)) {
+		read_lock_bh(&ip_nat_lock);
+		res = __ip_nat_reserved_find_tuple(tuple, maniptype);
+		read_unlock_bh(&ip_nat_lock);
+
+		/* If we may not allocate reserved ports, return */
+		if (!(flags & IP_NAT_RANGE_USE_RESERVED) && res)
+			return 1;
+	}
+#endif
 
 	invert_tuplepr(&reply, tuple);
 	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
@@ -262,7 +648,7 @@ get_unique_tuple(struct ip_conntrack_tup
 	if (maniptype == IP_NAT_MANIP_SRC) {
 		if (find_appropriate_src(orig_tuple, tuple, range)) {
 			DEBUGP("get_unique_tuple: Found current src map\n");
-			if (!ip_nat_used_tuple(tuple, conntrack))
+			if (!ip_nat_used_tuple(tuple, conntrack, maniptype, range->flags))
 				return;
 		}
 	}
@@ -280,7 +666,7 @@ get_unique_tuple(struct ip_conntrack_tup
 	/* Only bother mapping if it's not already in range and unique */
 	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
 	     || proto->in_range(tuple, maniptype, &range->min, &range->max))
-	    && !ip_nat_used_tuple(tuple, conntrack)) {
+	    && !ip_nat_used_tuple(tuple, conntrack, maniptype, range->flags)) {
 		ip_nat_proto_put(proto);
 		return;
 	}
@@ -596,10 +982,29 @@ static int __init ip_nat_init(void)
 	/* Leave them the same for the moment. */
 	ip_nat_htable_size = ip_conntrack_htable_size;
 
+#ifdef CONFIG_IP_NF_NAT_NRES
+	/* Create nat_reserved slab cache */
+	ip_nat_reserved_cachep = kmem_cache_create("ip_nat_reserved",
+						   sizeof(struct ip_nat_reserved), 0,
+						   SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ip_nat_reserved_cachep) {
+		printk(KERN_ERR "Unable to create ip_nat_reserved slab cache\n");
+		return -ENOMEM;
+	}
+#endif
+
 	/* One vmalloc for both hash tables */
+#ifndef CONFIG_IP_NF_NAT_NRES
 	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
+#else
+        bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size * 2);
+#endif
 	if (!bysource)
-		return -ENOMEM;
+		goto free_reserved_slab;
+
+#ifdef CONFIG_IP_NF_NAT_NRES
+	natreserved = bysource + ip_nat_htable_size;
+#endif
 
 	/* Sew in builtin protocols. */
 	write_lock_bh(&ip_nat_lock);
@@ -612,15 +1017,28 @@ static int __init ip_nat_init(void)
 
 	for (i = 0; i < ip_nat_htable_size; i++) {
 		INIT_LIST_HEAD(&bysource[i]);
+#ifdef CONFIG_IP_NF_NAT_NRES
+		INIT_LIST_HEAD(&natreserved[i]);
+#endif
 	}
 
 	/* FIXME: Man, this is a hack.  <SIGH> */
 	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
 	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+#ifdef CONFIG_IP_NF_NAT_NRES
+	IP_NF_ASSERT(ip_conntrack_expect_destroyed == NULL);
+	ip_conntrack_expect_destroyed = &ip_nat_reserved_cleanup_expect;
+#endif
 
 	/* Initialize fake conntrack so that NAT will skip it */
 	ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
 	return 0;
+
+free_reserved_slab:
+#ifdef CONFIG_IP_NF_NAT_NRES
+	kmem_cache_destroy(ip_nat_reserved_cachep);
+#endif
+	return -ENOMEM;
 }
 
 /* Clear NAT section of all conntracks, in case we're loaded again. */
@@ -635,6 +1053,10 @@ static void __exit ip_nat_cleanup(void)
 {
 	ip_ct_iterate_cleanup(&clean_nat, NULL);
 	ip_conntrack_destroyed = NULL;
+#ifdef CONFIG_IP_NF_NAT_NRES
+	ip_conntrack_expect_destroyed = NULL;
+	kmem_cache_destroy(ip_nat_reserved_cachep);
+#endif
 	vfree(bysource);
 }
 
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index 38acfdf..9d6211c 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -88,7 +88,7 @@ gre_unique_tuple(struct ip_conntrack_tup
 
 	for (i = 0; i < range_size; i++, key++) {
 		*keyptr = htons(min + key % range_size);
-		if (!ip_nat_used_tuple(tuple, conntrack))
+		if (!ip_nat_used_tuple(tuple, conntrack, maniptype, range->flags))
 			return 1;
 	}
 
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 31a3f4c..de48dd9 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -46,7 +46,7 @@ icmp_unique_tuple(struct ip_conntrack_tu
 	for (i = 0; i < range_size; i++, id++) {
 		tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
 		                             (id % range_size));
-		if (!ip_nat_used_tuple(tuple, conntrack))
+		if (!ip_nat_used_tuple(tuple, conntrack, maniptype, range->flags))
 			return 1;
 	}
 	return 0;
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a3d1407..7f325ca 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -77,7 +77,7 @@ tcp_unique_tuple(struct ip_conntrack_tup
 
 	for (i = 0; i < range_size; i++, port++) {
 		*portptr = htons(min + port % range_size);
-		if (!ip_nat_used_tuple(tuple, conntrack)) {
+		if (!ip_nat_used_tuple(tuple, conntrack, maniptype, range->flags)) {
 			return 1;
 		}
 	}
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index ec6053f..54b1fe8 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -76,7 +76,7 @@ udp_unique_tuple(struct ip_conntrack_tup
 
 	for (i = 0; i < range_size; i++, port++) {
 		*portptr = htons(min + port % range_size);
-		if (!ip_nat_used_tuple(tuple, conntrack))
+		if (!ip_nat_used_tuple(tuple, conntrack, maniptype, range->flags))
 			return 1;
 	}
 	return 0;

[-- Attachment #3: 03-nat_delete.patch --]
[-- Type: application/octet-stream, Size: 11047 bytes --]

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 81392e5..b727243 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -81,6 +81,9 @@ #if defined(CONFIG_IP_NF_TPROXY) || defi
 
 	IPS_TPROXY_RELATED_BIT = 12,
 	IPS_TPROXY_RELATED = (1 << IPS_TPROXY_RELATED_BIT),
+
+	IPS_MAY_DELETE_BIT = 13,
+	IPS_MAY_DELETE = (1 << IPS_MAY_DELETE_BIT),
 #endif
 };
 
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 4369150..44c19ad 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -244,6 +244,8 @@ static inline void ip_ct_refresh(struct
 	__ip_ct_refresh_acct(ct, 0, skb, extra_jiffies, 0);
 }
 
+extern void __death_by_timeout(unsigned long ul_conntrack);
+
 /* These are for NAT.  Icky. */
 /* Update TCP window tracking data when NAT mangles the packet */
 extern void ip_conntrack_tcp_update(struct sk_buff *skb,
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index b96427b..4ee301a 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -366,6 +366,50 @@ destroy_conntrack(struct nf_conntrack *n
 	ip_conntrack_free(ct);
 }
 
+static void
+__destroy_conntrack(struct nf_conntrack *nfct)
+{
+	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
+	struct ip_conntrack_protocol *proto;
+
+	DEBUGP("destroy_conntrack(%p)\n", ct);
+	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
+	IP_NF_ASSERT(!timer_pending(&ct->timeout));
+
+	ip_conntrack_event(IPCT_DESTROY, ct);
+	set_bit(IPS_DYING_BIT, &ct->status);
+
+	/* To make sure we don't get any weird locking issues here:
+	 * destroy_conntrack() MUST NOT be called with a write lock
+	 * to ip_conntrack_lock!!! -HW */
+	proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	if (proto && proto->destroy)
+		proto->destroy(ct);
+
+	if (ip_conntrack_destroyed)
+		ip_conntrack_destroyed(ct);
+
+	/* Expectations will have been removed in clean_from_lists,
+	 * except TFTP can create an expectation on the first packet,
+	 * before connection is in the list, so we need to clean here,
+	 * too. */
+	ip_ct_remove_expectations(ct);
+
+	/* We overload first tuple to link into unconfirmed list. */
+	if (!is_confirmed(ct)) {
+		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	}
+
+	CONNTRACK_STAT_INC(delete);
+
+	if (ct->master)
+		ip_conntrack_put(ct->master);
+
+	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+	ip_conntrack_free(ct);
+}
+
 static void death_by_timeout(unsigned long ul_conntrack)
 {
 	struct ip_conntrack *ct = (void *)ul_conntrack;
@@ -379,6 +423,19 @@ static void death_by_timeout(unsigned lo
 	ip_conntrack_put(ct);
 }
 
+void __death_by_timeout(unsigned long ul_conntrack)
+{
+	struct ip_conntrack *ct = (void *)ul_conntrack;
+
+	/* Inside lock so preempt is disabled on module removal path.
+	 * Otherwise we can get spurious warnings. */
+	CONNTRACK_STAT_INC(delete_list);
+	clean_from_lists(ct);
+
+        if (atomic_dec_and_test(&ct->ct_general.use))
+		__destroy_conntrack((struct nf_conntrack *)ct);
+}
+
 static inline int
 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
 		    const struct ip_conntrack_tuple *tuple,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index fb920e7..fd51229 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -31,6 +31,7 @@ #include <linux/spinlock.h>
 
 #include <net/tcp.h>
 
+#include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -983,6 +984,15 @@ static int tcp_packet(struct ip_conntrac
 					      NULL, "ip_ct_tcp: invalid SYN");
 			return -NF_ACCEPT;
 		}
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+	case TCP_CONNTRACK_TIME_WAIT:
+		/* Set MAY_DELETE if NAT subsystem may drop connection when it is clashing */
+		if (test_bit(IPS_TPROXY_BIT, &conntrack->status)) {
+			DEBUGP(KERN_DEBUG "Marking TPROXY-related TIME_WAIT conntrack entry MAY_DELETE\n");
+			set_bit(IPS_MAY_DELETE_BIT, &conntrack->status);
+		}
+		break;
+#endif
 	case TCP_CONNTRACK_CLOSE:
 		if (index == TCP_RST_SET
 		    && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 260c281..6f7c072 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -972,3 +972,6 @@ #if defined(CONFIG_IP_NF_CONNTRACK_NETLI
 EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
 EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
 #endif
+#if defined(CONFIG_IP_NF_TPROXY) || defined(CONFIG_IP_NF_TPROXY_MODULE)
+EXPORT_SYMBOL_GPL(__death_by_timeout);
+#endif
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 262f36e..47bbe94 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -26,6 +26,7 @@ #include <linux/jhash.h>
 #define ASSERT_READ_LOCK(x)
 #define ASSERT_WRITE_LOCK(x)
 
+#include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -122,6 +123,15 @@ static void ip_nat_cleanup_conntrack(str
 	write_unlock_bh(&ip_nat_lock);
 }
 
+static void __ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
+{
+	if (!(conn->status & IPS_NAT_DONE_MASK))
+		return;
+
+	list_del(&conn->nat.info.bysource);
+}
+
+
 /* We do checksum mangling, so if they were wrong before they're still
  * wrong.  Also works for incomplete packets (eg. ICMP dest
  * unreachables.) */
@@ -251,7 +261,8 @@ __ip_nat_reserved_new_hash(const struct
 			   const struct ip_conntrack_manip *peer)
 {
 	struct ip_nat_reserved *res;
-	unsigned int h;
+	struct ip_conntrack_tuple_hash *h = NULL;
+	unsigned int hash;
 
 	DEBUGP("__ip_nat_reserved_new_hash: manip proto %u %u.%u.%u.%u:%u\n",
 			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
@@ -283,25 +294,61 @@ __ip_nat_reserved_new_hash(const struct
 					       .dst = {.protonum = proto,
 						       .ip = manip->ip,
 						       .u = {.all = manip->u.all}}};
+		struct ip_conntrack *ctrack;
+
+		h = ip_conntrack_find_get(&t, NULL);
+
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+		if ((h != NULL) &&
+		    (ctrack = tuplehash_to_ctrack(h)) &&
+		    test_bit(IPS_MAY_DELETE_BIT, &ctrack->status)) {
+			DEBUGP("Deleting old conntrack entry for NAT\n");
+			__ip_nat_cleanup_conntrack(ctrack);
+			ctrack->status &= ~IPS_NAT_DONE_MASK;
+			if (del_timer(&ctrack->timeout))
+				ctrack->timeout.function((unsigned long)ctrack);
+			ip_conntrack_put(ctrack);
+			h = NULL;
+		}
+#endif
 
-		if (ip_conntrack_tuple_taken(&t, NULL)) {
+		if (h) {
 			DEBUGP("__ip_nat_reserved_new_hash: manip clashes with an already existing connection\n");
+			ip_conntrack_put(tuplehash_to_ctrack(h));
 			return NULL;
 		}
 	} else {
 		/* Strong check: we have only a manip, unfortunately we scan the whole conntrack
 		 * hash for possible clashing connections... */
-		struct ip_conntrack_tuple_hash *h = NULL;
 		unsigned int i;
+		int repeat;
+		struct ip_conntrack *ctrack;
 
-		read_lock_bh(&ip_conntrack_lock);
+		write_lock_bh(&ip_conntrack_lock);
 		for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
-			h = LIST_FIND(&ip_conntrack_hash[i], clashing_ct_cmp,
-				      struct ip_conntrack_tuple_hash *, manip);
+			do {
+				repeat = 0;
+				h = LIST_FIND(&ip_conntrack_hash[i], clashing_ct_cmp,
+					      struct ip_conntrack_tuple_hash *, manip);
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+				if ((h != NULL) &&
+				    (ctrack = tuplehash_to_ctrack(h)) &&
+				    test_bit(IPS_MAY_DELETE_BIT, &ctrack->status)) {
+					DEBUGP("Deleting old conntrack entry for NAT\n");
+					__ip_nat_cleanup_conntrack(ctrack);
+					ctrack->status &= ~IPS_NAT_DONE_MASK;
+					if (del_timer(&ctrack->timeout))
+						__death_by_timeout((unsigned long)ctrack);
+					h = NULL;
+					repeat = 1;
+				}
+#endif
+			} while (repeat);
+			/* there's a clashing connection, break */
 			if (h)
 				break;
 		}
-		read_unlock_bh(&ip_conntrack_lock);
+		write_unlock_bh(&ip_conntrack_lock);
 		if (h) {
 			DEBUGP("__ip_nat_reserved_new_hash: manip clashes with an already existing connection\n");
 			return NULL;
@@ -320,9 +367,9 @@ __ip_nat_reserved_new_hash(const struct
 		res->peer = *peer;
 
 	/* put it into the hash */
-	h = hash_nat_reserved(manip, peer, proto);
+	hash = hash_nat_reserved(manip, peer, proto);
 	atomic_inc(&ip_nat_reserved_count);
-	list_prepend(&natreserved[h], &res->hash);
+	list_prepend(&natreserved[hash], &res->hash);
 	DEBUGP("__ip_nat_reserved_new_hash: hashed manip proto %u %u.%u.%u.%u:%u\n",
 			proto, NIPQUAD(manip->ip), ntohs(manip->u.all));
 
@@ -497,6 +544,8 @@ ip_nat_used_tuple(const struct ip_conntr
 
 	   We could keep a separate hash if this proves too slow. */
 	struct ip_conntrack_tuple reply;
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ctrack;
 #ifdef CONFIG_IP_NF_NAT_NRES
 	struct ip_nat_reserved *res;
 
@@ -512,8 +561,29 @@ #ifdef CONFIG_IP_NF_NAT_NRES
 	}
 #endif
 
+	/* check if it's taken by an existing connection */
 	invert_tuplepr(&reply, tuple);
-	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
+	h = ip_conntrack_find_get(&reply, ignored_conntrack);
+
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+	/* check if that conntrack is marked MAY_DELETE, if so, get rid of it... */
+	if ((h != NULL) &&
+	    (ctrack = tuplehash_to_ctrack(h)) &&
+	    test_bit(IPS_MAY_DELETE_BIT, &ctrack->status)) {
+		DEBUGP("Deleting old conntrack entry for NAT\n");
+		__ip_nat_cleanup_conntrack(ctrack);
+		ctrack->status &= ~IPS_NAT_DONE_MASK;
+		if (del_timer(&ctrack->timeout))
+			ctrack->timeout.function((unsigned long)ctrack);
+		ip_conntrack_put(ctrack);
+		h = NULL;
+	}
+#endif
+
+	if (h)
+		ip_conntrack_put(tuplehash_to_ctrack(h));
+
+	return h != NULL;
 }
 EXPORT_SYMBOL(ip_nat_used_tuple);
 
diff --git a/net/ipv4/netfilter/iptable_tproxy.c b/net/ipv4/netfilter/iptable_tproxy.c
index ea35da7..4b46463 100644
--- a/net/ipv4/netfilter/iptable_tproxy.c
+++ b/net/ipv4/netfilter/iptable_tproxy.c
@@ -1356,7 +1356,7 @@ ip_tproxy_setsockopt_unassign(struct soc
 	if (!sr) {
 		DEBUGP(KERN_DEBUG "IP_TPROXY: IP_TPROXY_UNASSIGN not unhashing socket, "
 		       "%08x:%04x, proto=%d, sk->state=%d\n",
-		       saddr, sport, proto, sk->sk_socket ? sk->sk_socket->state : -1);
+		       saddr, sport, proto, sk->sk_state);
 		res = -ENOENT;
 		goto write_unlk;
 	}

[-- Attachment #4: linux_tproxy2_qos.patch --]
[-- Type: application/octet-stream, Size: 9939 bytes --]

diff -Naur linux-2.6.18.1/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.18.1-new/include/linux/netfilter_ipv4/ip_conntrack.h
--- linux-2.6.18.1/include/linux/netfilter_ipv4/ip_conntrack.h	2007-11-06 21:42:38.000000000 +0530
+++ linux-2.6.18.1-new/include/linux/netfilter_ipv4/ip_conntrack.h	2007-11-06 21:36:53.000000000 +0530
@@ -120,6 +120,8 @@
 		void *sockref;
 		struct list_head related;
 	} tproxy;
+	unsigned long tproxyBit;
+        u32 tproxyFaddr;
 #endif /* CONFIG_IP_NF_TPROXY */
 
 #endif /* CONFIG_IP_NF_NAT_NEEDED */
diff -Naur linux-2.6.18.1/include/linux/netfilter_ipv4/ip_tproxy.h linux-2.6.18.1-new/include/linux/netfilter_ipv4/ip_tproxy.h
--- linux-2.6.18.1/include/linux/netfilter_ipv4/ip_tproxy.h	2007-11-06 21:42:28.000000000 +0530
+++ linux-2.6.18.1-new/include/linux/netfilter_ipv4/ip_tproxy.h	2007-11-06 21:36:53.000000000 +0530
@@ -74,5 +74,60 @@
 	} v;
 };
 
+#ifdef __KERNEL__
+struct ip_tproxy_sockref;
+
+struct ip_tproxy_hash
+{
+	struct list_head list;
+	struct ip_tproxy_sockref *sockref;
+};
+
+struct ip_tproxy_sockref
+{
+	int flags;
+	atomic_t references;
+
+	u8 proto;
+
+	/* foreign address associated with a local socket */
+	u32 faddr;
+	u16 fport;
+
+	/* local socket address */
+	u32 laddr;
+	u16 lport;
+
+	/* remote addresses, needed for datagram protocols when the peer
+	 * sends the packet triggering the NAT translation. (as there might
+	 * be multiple sockrefs on the same foreign address).
+	 */
+	u32 raddr;
+	u16 rport;
+
+	/* hash chains indexed by local and foreign addresses */
+	struct ip_tproxy_hash bylocal, byforeign;
+
+	/* lock protecting access to related list */
+	spinlock_t relatedlock;
+	/* number of related connections */
+	atomic_t related;
+	/* list of related connections */
+	struct list_head relatedct;
+
+	/* socket which we were assigned to */
+	struct sock *assigned_to;
+
+	/* How many sockets use this sockref? Used for mark-only sockrefs,
+	 * which can be shared between multiple sockets bound to the same local
+	 * address */
+	atomic_t socket_count;
+
+	/* when was this entry inserted in hash */
+	struct timespec tv_hashed;
+};
+#endif //__KERNEL__
+
+
 #endif
 
diff -Naur linux-2.6.18.1/net/ipv4/netfilter/ip_tables.c linux-2.6.18.1-new/net/ipv4/netfilter/ip_tables.c
--- linux-2.6.18.1/net/ipv4/netfilter/ip_tables.c	2006-10-14 09:04:03.000000000 +0530
+++ linux-2.6.18.1-new/net/ipv4/netfilter/ip_tables.c	2007-11-06 21:36:53.000000000 +0530
@@ -33,6 +33,9 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tproxy.h>
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("IPv4 packet filter");
@@ -47,12 +50,16 @@
 #define dprintf(format, args...)
 #endif
 
+
+DEFINE_RWLOCK(ip_conntrack_lock);
+
 #ifdef DEBUG_IP_FIREWALL_USER
 #define duprintf(format, args...) printk(format , ## args)
 #else
 #define duprintf(format, args...)
 #endif
 
+#if 0
 #ifdef CONFIG_NETFILTER_DEBUG
 #define IP_NF_ASSERT(x)						\
 do {								\
@@ -63,6 +70,7 @@
 #else
 #define IP_NF_ASSERT(x)
 #endif
+#endif // #if 0
 
 #if 0
 /* All the better to debug you with... */
@@ -70,6 +78,7 @@
 #define inline
 #endif
 
+
 /*
    We keep a set of rules for each CPU, so we can avoid write-locking
    them in the softirq when updating the counters and therefore
@@ -85,7 +94,11 @@
 		const char *indev,
 		const char *outdev,
 		const struct ipt_ip *ipinfo,
-		int isfrag)
+		int isfrag
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+                , char TproxyBitSet, u32 tproxyFaddr
+#endif /* CONFIG_IP_NF_TPROXY */
+                )
 {
 	size_t i;
 	unsigned long ret;
@@ -108,7 +121,15 @@
 			NIPQUAD(ipinfo->dmsk.s_addr),
 			NIPQUAD(ipinfo->dst.s_addr),
 			ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
-		return 0;
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+                if (TproxyBitSet && ((tproxyFaddr == ipinfo->src.s_addr) ||
+                                     ((tproxyFaddr & ipinfo->smsk.s_addr) == ipinfo->src.s_addr)))
+                {
+                     // Do nothing
+                }
+                else
+#endif /* CONFIG_IP_NF_TPROXY */
+                     return 0;
 	}
 
 	/* Look for ifname matches; this should unroll nicely. */
@@ -232,11 +253,52 @@
 	struct ipt_entry *e, *back;
 	struct xt_table_info *private;
 
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+        char TproxyBitSet;
+        u32 tproxyFaddr;
+#endif /* CONFIG_IP_NF_TPROXY */
+
 	/* Initialization */
 	ip = (*pskb)->nh.iph;
 	datalen = (*pskb)->len - ip->ihl * 4;
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
+
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+	read_lock_bh(&ip_conntrack_lock);
+	TproxyBitSet = 0;
+	tproxyFaddr = 0;
+        if (NF_IP_POST_ROUTING == hook)
+        {
+             ct = ip_conntrack_get (*pskb, &ctinfo);
+
+             if (ct)
+             {
+                  if (IPS_TPROXY_BIT & ct->tproxyBit)
+                  {
+                       tproxyFaddr = ct->tproxyFaddr;
+                       TproxyBitSet = 1;
+
+                  }
+                  else
+                  {
+                       TproxyBitSet = 0;
+                       tproxyFaddr = 0;
+                  }
+             }
+             else
+             {
+                  TproxyBitSet = 0;
+                  tproxyFaddr = 0;
+             }
+	}
+	read_unlock_bh(&ip_conntrack_lock);
+#endif /* CONFIG_IP_NF_TPROXY */
+	
+
 	/* We handle fragments by dealing with the first fragment as
 	 * if it was a normal packet.  All other fragments are treated
 	 * normally, except that they will NEVER match rules that ask
@@ -257,7 +319,12 @@
 	do {
 		IP_NF_ASSERT(e);
 		IP_NF_ASSERT(back);
-		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
+		if (ip_packet_match(ip, indev, outdev, &e->ip, offset
+#if defined(CONFIG_IP_NF_TPROXY) || defined (CONFIG_IP_NF_TPROXY_MODULE)
+                                    ,TproxyBitSet, tproxyFaddr
+#endif /* CONFIG_IP_NF_TPROXY */
+                                   ))
+                {
 			struct ipt_entry_target *t;
 
 			if (IPT_MATCH_ITERATE(e, do_match,
diff -Naur linux-2.6.18.1/net/ipv4/netfilter/iptable_tproxy.c linux-2.6.18.1-new/net/ipv4/netfilter/iptable_tproxy.c
--- linux-2.6.18.1/net/ipv4/netfilter/iptable_tproxy.c	2007-11-06 21:42:38.000000000 +0530
+++ linux-2.6.18.1-new/net/ipv4/netfilter/iptable_tproxy.c	2007-11-06 21:41:00.000000000 +0530
@@ -45,7 +45,7 @@
 
 #include <linux/netfilter_ipv4/listhelp.h>
 
-#define TPROXY_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
+#define TPROXY_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) 
 
 #if 0
 #define DEBUGP printk
@@ -147,6 +147,7 @@
 
 #define TF_STATE_MASK	  0xffff0000
 
+#if 0
 struct ip_tproxy_sockref;
 
 struct ip_tproxy_hash
@@ -198,6 +199,7 @@
 	/* when was this entry inserted in hash */
 	struct timespec tv_hashed;
 };
+#endif
 
 static int hashsize = 0;
 module_param(hashsize, uint, 0600);
@@ -724,6 +726,13 @@
 		       hooknum, sr->laddr, sr->lport, newip, newport);
 	}
 	else {
+		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
+		{
+			test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
+                        ct->tproxyBit = IPS_TPROXY_BIT;
+                        ct->tproxyFaddr = newip;
+		}
+
 		/* we store a reference to the sockref in the conntrack */
 		if (!test_and_set_bit(IPS_TPROXY_BIT, &ct->status)) {
 			if (flags & TN_STOREREF) {
@@ -828,6 +837,7 @@
 	enum ip_conntrack_info ctinfo;
 	unsigned int verdict = NF_ACCEPT;
 
+
 	ct = ip_conntrack_get(*pskb, &ctinfo);
 
 	if (ct && ctinfo == IP_CT_NEW) {
@@ -851,6 +861,7 @@
 		read_lock_bh(&ip_tproxy_lock);
 		if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) {
 
+
 			/*
 			 * We either received a connection from the network (PREROUTING case)
 			 * or a local process generated one (LOCAL_OUT case).
@@ -869,6 +880,7 @@
 			sr = ip_tproxy_sockref_find_foreign(iph->daddr, tports[1],
 						iph->protocol, iph->saddr, tports[0]);
 
+
 			if (sr && sr->flags & TF_ORPHAN) {
 				/* This sockref is orphaned, the listening socket is already unassigned,
 				 * so it should not be used for setting up NAT for a new connection. */
@@ -891,6 +903,7 @@
 				if ((sr->flags & TF_MARK_ONLY) == 0)
 					sr = NULL;
 			}
+
 		}
 		else if (hooknum == NF_IP_POST_ROUTING) {
 
@@ -906,6 +919,7 @@
 
 			/* source address is interesting */
 
+			
 			sr = ip_tproxy_sockref_find_local(iph->saddr, tports[0], iph->protocol,
 					1, iph->daddr, tports[1]);
 			if (sr && (sr->flags & (TF_CONNECT|TF_MARK_ONLY)) == 0) {
@@ -984,9 +998,9 @@
 			/* there was no matching sockref, so we consult the
 			 * TPROXY table
 			 */
-
 			ui.changed = 0;
 			verdict = ipt_do_table(pskb, hooknum, in, out, &tproxy_table, &ui);
+
 			if (ui.changed && verdict == NF_ACCEPT) {
 				struct ip_tproxy_sockref sr;
 				u32 laddr;
@@ -1014,9 +1028,17 @@
 				if (!ip_tproxy_setup_nat(pskb, hooknum, &sr, 0))
 					verdict = NF_DROP;
 			}
+                        else
+                        {
+                             // 
+                             // Letz make these values zero so that no junk values are
+                             // present.
+                             //
+                                ct->tproxyBit = 0;
+                                ct->tproxyFaddr = 0;
+                        }
 		}
 	}
-
 	return verdict;
 }
 
@@ -1826,7 +1848,8 @@
 	.owner		= THIS_MODULE,
 	.pf		= PF_INET,
 	.hooknum	= NF_IP_POST_ROUTING,
-	.priority	= -130
+        //.priority	= -130
+	.priority	= -155
 };
 
 static struct nf_hook_ops ip_tproxy_local_out_ops = {

                 reply	other threads:[~2013-07-18  7:14 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CA+86yMig+_kNGno8jxrumLd5U+pfQeQvsJLDnP54Sj-uj+oMqQ@mail.gmail.com \
    --to=saravanan.nagarajan87@gmail.com \
    --cc=netfilter-devel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.