All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition.
@ 2015-10-20 22:20 Jarno Rajahalme
  2015-10-20 22:20 ` [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info() Jarno Rajahalme
                   ` (5 more replies)
  0 siblings, 6 replies; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-20 22:20 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA; +Cc: dev-yBygre7rU0TnMu66kgdUjQ

Remove the definition of IP_CT_NEW_REPLY as it does not make sense.
This allows the definition of IP_CT_NUMBER to be simplified as well.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 7 ++++---
 net/openvswitch/conntrack.c                        | 2 --
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 319f471..e0aebc8 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -20,9 +20,10 @@ enum ip_conntrack_info {
 
 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
-	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
-	/* Number of distinct IP_CT types (no NEW in reply dirn). */
-	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
+	/* No IP_CT_NEW_REPLY */
+
+	/* Number of distinct IP_CT types. */
+	IP_CT_NUMBER
 };
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ad61426..097ace4 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -73,7 +73,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 	switch (ctinfo) {
 	case IP_CT_ESTABLISHED_REPLY:
 	case IP_CT_RELATED_REPLY:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_REPLY_DIR;
 		break;
 	default:
@@ -90,7 +89,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 		ct_state |= OVS_CS_F_RELATED;
 		break;
 	case IP_CT_NEW:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_NEW;
 		break;
 	default:
-- 
2.1.4

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info().
  2015-10-20 22:20 [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
@ 2015-10-20 22:20 ` Jarno Rajahalme
       [not found]   ` <1445379629-112880-2-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
  2015-10-21 10:45   ` Pablo Neira Ayuso
  2015-10-20 22:20 ` [RFC PATCH 3/5] netfilter: Allow calling into nat helper without skb_dst Jarno Rajahalme
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-20 22:20 UTC (permalink / raw)
  To: netdev; +Cc: dev, jrajahalme

Define a new inline function to map conntrack status to enum
ip_conntrack_info.  This removes the need to otherwise duplicate this
code in a later patch.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
---
 include/net/netfilter/nf_conntrack.h | 15 +++++++++++++++
 net/netfilter/nf_conntrack_core.c    | 22 +++-------------------
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index fde4068..b3de10e 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -125,6 +125,21 @@ nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
 			    tuplehash[hash->tuple.dst.dir]);
 }
 
+static inline enum ip_conntrack_info
+nf_ct_get_info(const struct nf_conntrack_tuple_hash *h)
+{
+	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+		return IP_CT_ESTABLISHED_REPLY;
+	/* Once we've had two way comms, always ESTABLISHED. */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+		return IP_CT_ESTABLISHED;
+	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+		return IP_CT_RELATED;
+	return IP_CT_NEW;
+}
+
 static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct)
 {
 	return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3cb3cb8..70ddbd8 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1056,25 +1056,9 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	ct = nf_ct_tuplehash_to_ctrack(h);
 
 	/* It exists; we have (non-exclusive) reference. */
-	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
-		*ctinfo = IP_CT_ESTABLISHED_REPLY;
-		/* Please set reply bit if this packet OK */
-		*set_reply = 1;
-	} else {
-		/* Once we've had two way comms, always ESTABLISHED. */
-		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
-			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
-			*ctinfo = IP_CT_ESTABLISHED;
-		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
-			pr_debug("nf_conntrack_in: related packet for %p\n",
-				 ct);
-			*ctinfo = IP_CT_RELATED;
-		} else {
-			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
-			*ctinfo = IP_CT_NEW;
-		}
-		*set_reply = 0;
-	}
+	*ctinfo = nf_ct_get_info(h);
+	*set_reply = NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY;
+
 	skb->nfct = &ct->ct_general;
 	skb->nfctinfo = *ctinfo;
 	return ct;
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH 3/5] netfilter: Allow calling into nat helper without skb_dst.
  2015-10-20 22:20 [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
  2015-10-20 22:20 ` [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info() Jarno Rajahalme
@ 2015-10-20 22:20 ` Jarno Rajahalme
       [not found]   ` <1445379629-112880-3-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
       [not found] ` <1445379629-112880-1-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-20 22:20 UTC (permalink / raw)
  To: netdev; +Cc: dev, jrajahalme

NAT checksum recalculation code assumed existence of skb_dst, which
becomes a problem for a later patch in the series.  Simplify this by
removing the checks, as the checksum will be dealt with later in the
stack.

Suggested-by: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
---
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 29 ++++++++---------------------
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 29 ++++++++---------------------
 2 files changed, 16 insertions(+), 42 deletions(-)

diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 5075b7e..f8aad03 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -127,28 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
 				    u8 proto, void *data, __sum16 *check,
 				    int datalen, int oldlen)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	struct rtable *rt = skb_rtable(skb);
-
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  ip_hdrlen(skb);
-			skb->csum_offset = (void *)check - data;
-			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-						    datalen, proto, 0);
-		} else {
-			*check = 0;
-			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-						   datalen, proto,
-						   csum_partial(data, datalen,
-								0));
-			if (proto == IPPROTO_UDP && !*check)
-				*check = CSUM_MANGLED_0;
-		}
+		const struct iphdr *iph = ip_hdr(skb);
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+			ip_hdrlen(skb);
+		skb->csum_offset = (void *)check - data;
+		*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+					    proto, 0);
 	} else
 		inet_proto_csum_replace2(check, skb,
 					 htons(oldlen), htons(datalen), true);
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 238e70c..e0be97e 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -131,28 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
 				    u8 proto, void *data, __sum16 *check,
 				    int datalen, int oldlen)
 {
-	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt6i_flags & RTF_LOCAL) &&
-		    (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  (data - (void *)skb->data);
-			skb->csum_offset = (void *)check - data;
-			*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-						  datalen, proto, 0);
-		} else {
-			*check = 0;
-			*check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-						 datalen, proto,
-						 csum_partial(data, datalen,
-							      0));
-			if (proto == IPPROTO_UDP && !*check)
-				*check = CSUM_MANGLED_0;
-		}
+		const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+			(data - (void *)skb->data);
+		skb->csum_offset = (void *)check - data;
+		*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+					  datalen, proto, 0);
 	} else
 		inet_proto_csum_replace2(check, skb,
 					 htons(oldlen), htons(datalen), true);
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH 4/5] openvswitch: conntrack netlink API updates.
       [not found] ` <1445379629-112880-1-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
@ 2015-10-20 22:20   ` Jarno Rajahalme
  2015-10-21 10:41     ` Pablo Neira Ayuso
  0 siblings, 1 reply; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-20 22:20 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA; +Cc: dev-yBygre7rU0TnMu66kgdUjQ

This patch makes changes already done in the net repo, so that the
rest of the code compiles on top of the current net-next.  Eventually,
this patch will not be needed and should be removed before merging.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
---
 include/uapi/linux/openvswitch.h | 18 +++++-------------
 net/openvswitch/conntrack.c      | 15 +++++++++------
 net/openvswitch/flow_netlink.c   |  6 +++---
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 4036e1b..098d8b5 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -323,7 +323,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_MPLS,      /* array of struct ovs_key_mpls.
 				 * The implementation may restrict
 				 * the accepted length of the array. */
-	OVS_KEY_ATTR_CT_STATE,	/* u8 bitmask of OVS_CS_F_* */
+	OVS_KEY_ATTR_CT_STATE,	/* u32 bitmask of OVS_CS_F_* */
 	OVS_KEY_ATTR_CT_ZONE,	/* u16 connection tracking zone. */
 	OVS_KEY_ATTR_CT_MARK,	/* u32 connection tracking mark */
 	OVS_KEY_ATTR_CT_LABEL,	/* 16-octet connection tracking label */
@@ -451,9 +451,9 @@ struct ovs_key_ct_label {
 #define OVS_CS_F_ESTABLISHED       0x02 /* Part of an existing connection. */
 #define OVS_CS_F_RELATED           0x04 /* Related to an established
 					 * connection. */
-#define OVS_CS_F_INVALID           0x20 /* Could not track connection. */
-#define OVS_CS_F_REPLY_DIR         0x40 /* Flow is in the reply direction. */
-#define OVS_CS_F_TRACKED           0x80 /* Conntrack has occurred. */
+#define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
+#define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
+#define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
 
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -632,7 +632,7 @@ struct ovs_action_hash {
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
-	OVS_CT_ATTR_FLAGS,      /* u8 bitmask of OVS_CT_F_*. */
+	OVS_CT_ATTR_COMMIT,     /* No argument, commits connection. */
 	OVS_CT_ATTR_ZONE,       /* u16 zone id. */
 	OVS_CT_ATTR_MARK,       /* mark to associate with this connection. */
 	OVS_CT_ATTR_LABEL,      /* label to associate with this connection. */
@@ -643,14 +643,6 @@ enum ovs_ct_attr {
 
 #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
 
-/*
- * OVS_CT_ATTR_FLAGS flags - bitmask of %OVS_CT_F_*
- * @OVS_CT_F_COMMIT: Commits the flow to the conntrack table. This allows
- * future packets for the same connection to be identified as 'established'
- * or 'related'.
- */
-#define OVS_CT_F_COMMIT		0x01
-
 /**
  * enum ovs_action_attr - Action types.
  *
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 097ace4..6997107 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -42,6 +42,8 @@ struct md_label {
 	struct ovs_key_ct_label mask;
 };
 
+#define OVS_CT_F_COMMIT		0x01
+
 /* Conntrack action context for execution. */
 struct ovs_conntrack_info {
 	struct nf_conntrack_helper *helper;
@@ -165,7 +167,7 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 
 int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
 {
-	if (nla_put_u8(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
+	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
@@ -537,8 +539,8 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 }
 
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
-	[OVS_CT_ATTR_FLAGS]	= { .minlen = sizeof(u32),
-				    .maxlen = sizeof(u32) },
+	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0,
+				    .maxlen = 0 },
 	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
 				    .maxlen = sizeof(u16) },
 	[OVS_CT_ATTR_MARK]	= { .minlen = sizeof(struct md_mark),
@@ -574,8 +576,8 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 		}
 
 		switch (type) {
-		case OVS_CT_ATTR_FLAGS:
-			info->flags = nla_get_u32(a);
+		case OVS_CT_ATTR_COMMIT:
+			info->flags |= OVS_CT_F_COMMIT;
 			break;
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 		case OVS_CT_ATTR_ZONE:
@@ -699,7 +701,8 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 	if (!start)
 		return -EMSGSIZE;
 
-	if (nla_put_u32(skb, OVS_CT_ATTR_FLAGS, ct_info->flags))
+	if (ct_info->flags & OVS_CT_F_COMMIT &&
+	    nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
 		return -EMSGSIZE;
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
 	    nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 77850f1..1e6c2ac 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -291,7 +291,7 @@ size_t ovs_key_attr_size(void)
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_DP_HASH */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_RECIRC_ID */
-		+ nla_total_size(1)   /* OVS_KEY_ATTR_CT_STATE */
+		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_STATE */
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_CT_ZONE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
 		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABEL */
@@ -351,7 +351,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_TUNNEL]	 = { .len = OVS_ATTR_NESTED,
 				     .next = ovs_tunnel_key_lens, },
 	[OVS_KEY_ATTR_MPLS]	 = { .len = sizeof(struct ovs_key_mpls) },
-	[OVS_KEY_ATTR_CT_STATE]	 = { .len = sizeof(u8) },
+	[OVS_KEY_ATTR_CT_STATE]	 = { .len = sizeof(u32) },
 	[OVS_KEY_ATTR_CT_ZONE]	 = { .len = sizeof(u16) },
 	[OVS_KEY_ATTR_CT_MARK]	 = { .len = sizeof(u32) },
 	[OVS_KEY_ATTR_CT_LABEL]	 = { .len = sizeof(struct ovs_key_ct_label) },
@@ -864,7 +864,7 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 
 	if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) &&
 	    ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) {
-		u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]);
+		u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]);
 
 		SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
-- 
2.1.4

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH 5/5] openvswitch: Interface with NAT.
  2015-10-20 22:20 [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
                   ` (2 preceding siblings ...)
       [not found] ` <1445379629-112880-1-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
@ 2015-10-20 22:20 ` Jarno Rajahalme
       [not found]   ` <1445379629-112880-5-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
  2015-10-21 10:59   ` Thomas Graf
  2015-10-20 22:28 ` [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
  2015-10-21  8:33 ` [ovs-dev] " Thomas Graf
  5 siblings, 2 replies; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-20 22:20 UTC (permalink / raw)
  To: netdev; +Cc: dev, jrajahalme

Extend OVS conntrack interface to cover NAT.  New nested
OVS_CT_ATTR_NAT may be used to include NAT with a CT action.  A bare
OVS_CT_ATTR_NAT only mangles existing connections.  If
OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
attributes, new (non-committed/non-confirmed) connections are mangled
according to the rest of the nested attributes.

This work extends on a branch by Thomas Graf at
https://github.com/tgraf/ovs/tree/nat.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
---
 include/uapi/linux/openvswitch.h |  48 +++-
 net/openvswitch/actions.c        |  25 +-
 net/openvswitch/conntrack.c      | 543 ++++++++++++++++++++++++++++++++++++---
 net/openvswitch/conntrack.h      |   2 +
 net/openvswitch/flow.h           |  11 +-
 5 files changed, 580 insertions(+), 49 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 098d8b5..9d63472 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -454,6 +454,12 @@ struct ovs_key_ct_label {
 #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
 #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
 #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
+					   mangled by NAT. */
+#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
+					   was mangled by NAT. */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -629,6 +635,8 @@ struct ovs_action_hash {
  * mask. For each bit set in the mask, the corresponding bit in the value is
  * copied to the connection tracking label field in the connection.
  * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -638,13 +646,50 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_LABEL,      /* label to associate with this connection. */
 	OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
 				   related connections. */
+	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
 	__OVS_CT_ATTR_MAX
 };
 
 #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
 
 /**
- * enum ovs_action_attr - Action types.
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port).  Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified.  Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified.  As a corollary, a
+ * NAT action without a NAT type flag will only mangle packets of committed
+ * connections.  The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+	OVS_NAT_ATTR_UNSPEC,
+	OVS_NAT_ATTR_SRC,
+	OVS_NAT_ATTR_DST,
+	OVS_NAT_ATTR_IP_MIN,
+	OVS_NAT_ATTR_IP_MAX,
+	OVS_NAT_ATTR_PROTO_MIN,
+	OVS_NAT_ATTR_PROTO_MAX,
+	OVS_NAT_ATTR_PERSISTENT,
+	OVS_NAT_ATTR_PROTO_HASH,
+	OVS_NAT_ATTR_PROTO_RANDOM,
+	__OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
+/** * enum ovs_action_attr - Action types.
  *
  * @OVS_ACTION_ATTR_OUTPUT: Output packet to port.
  * @OVS_ACTION_ATTR_USERSPACE: Send packet to userspace according to nested
@@ -700,7 +745,6 @@ enum ovs_action_attr {
 				       * The data must be zero for the unmasked
 				       * bits. */
 	OVS_ACTION_ATTR_CT,           /* One nested OVS_CT_ATTR_* . */
-
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1d21ab9..e31cc55 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -127,16 +127,6 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
 	return da;
 }
 
-static void invalidate_flow_key(struct sw_flow_key *key)
-{
-	key->eth.type = htons(0);
-}
-
-static bool is_flow_key_valid(const struct sw_flow_key *key)
-{
-	return !!key->eth.type;
-}
-
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_mpls *mpls)
 {
@@ -169,7 +159,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		skb_set_inner_protocol(skb, skb->protocol);
 	skb->protocol = mpls->mpls_ethertype;
 
-	invalidate_flow_key(key);
+	ovs_invalidate_flow_key(key);
 	return 0;
 }
 
@@ -199,7 +189,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 	if (eth_p_mpls(skb->protocol))
 		skb->protocol = ethertype;
 
-	invalidate_flow_key(key);
+	ovs_invalidate_flow_key(key);
 	return 0;
 }
 
@@ -234,7 +224,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 
 	err = skb_vlan_pop(skb);
 	if (skb_vlan_tag_present(skb))
-		invalidate_flow_key(key);
+		ovs_invalidate_flow_key(key);
 	else
 		key->eth.tci = 0;
 	return err;
@@ -244,7 +234,7 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_vlan *vlan)
 {
 	if (skb_vlan_tag_present(skb))
-		invalidate_flow_key(key);
+		ovs_invalidate_flow_key(key);
 	else
 		key->eth.tci = vlan->vlan_tci;
 	return skb_vlan_push(skb, vlan->vlan_tpid,
@@ -746,7 +736,7 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
 			struct net *net = read_pnet(&dp->net);
 			__be16 ethertype = key->eth.type;
 
-			if (!is_flow_key_valid(key)) {
+			if (!ovs_is_flow_key_valid(key)) {
 				if (eth_p_mpls(skb->protocol))
 					ethertype = skb->inner_protocol;
 				else
@@ -983,14 +973,14 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
 {
 	struct deferred_action *da;
 
-	if (!is_flow_key_valid(key)) {
+	if (!ovs_is_flow_key_valid(key)) {
 		int err;
 
 		err = ovs_flow_key_update(skb, key);
 		if (err)
 			return err;
 	}
-	BUG_ON(!is_flow_key_valid(key));
+	BUG_ON(!ovs_is_flow_key_valid(key));
 
 	if (!nla_is_last(a, rem)) {
 		/* Recirc action is the not the last action
@@ -1100,6 +1090,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			break;
 
 		case OVS_ACTION_ATTR_CT:
+			/* XXX: 'key' must be valid. */
 			err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
 					     nla_data(a));
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 6997107..4a1b56a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+
 #include <linux/openvswitch.h>
 #include <net/ip.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -20,6 +21,13 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#endif
+
 #include "datapath.h"
 #include "conntrack.h"
 #include "flow.h"
@@ -42,7 +50,16 @@ struct md_label {
 	struct ovs_key_ct_label mask;
 };
 
-#define OVS_CT_F_COMMIT		0x01
+#define OVS_CT_F_COMMIT		0x01   /* Commit the connection. */
+#ifdef CONFIG_NF_NAT_NEEDED
+#define OVS_CT_F_NAT		0x02   /* NAT for committed connections
+					  only. */
+#define OVS_CT_F_SRC_NAT	0x04   /* Source NAT for NEW connections. */
+#define OVS_CT_F_DST_NAT	0x08   /* Destination NAT for NEW
+					  connections. */
+
+#define OVS_CT_F_NAT_MASK (OVS_CT_F_NAT | OVS_CT_F_SRC_NAT | OVS_CT_F_DST_NAT)
+#endif
 
 /* Conntrack action context for execution. */
 struct ovs_conntrack_info {
@@ -53,6 +70,9 @@ struct ovs_conntrack_info {
 	u16 family;
 	struct md_mark mark;
 	struct md_label label;
+#ifdef CONFIG_NF_NAT_NEEDED
+	struct nf_nat_range range;   /* Only present for SNAT and DNAT. */
+#endif
 };
 
 static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -87,6 +107,8 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 		ct_state |= OVS_CS_F_ESTABLISHED;
 		break;
 	case IP_CT_RELATED:
+		ct_state |= OVS_CS_F_NEW;
+		/* Fall through. */
 	case IP_CT_RELATED_REPLY:
 		ct_state |= OVS_CS_F_RELATED;
 		break;
@@ -137,11 +159,17 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 	ovs_ct_get_label(ct, &key->ct.label);
 }
 
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action.  If
+ * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
+ * initialized from the connection status.
  */
 static void ovs_ct_update_key(const struct sk_buff *skb,
-			      struct sw_flow_key *key, bool post_ct)
+			      struct sw_flow_key *key, bool post_ct
+#ifdef CONFIG_NF_NAT_NEEDED
+			      , bool keep_nat_flags
+#endif
+			      )
 {
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
@@ -151,8 +179,20 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct) {
 		state = ovs_ct_get_state(ctinfo);
+		/* OVS persists the related flag for the duration of the
+		 * connection. */
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
+#ifdef CONFIG_NF_NAT_NEEDED
+		if (keep_nat_flags)
+			state |= key->ct.state & OVS_CS_F_NAT_MASK;
+		else {
+			if (ct->status & IPS_SRC_NAT)
+				state |= OVS_CS_F_SRC_NAT;
+			if (ct->status & IPS_DST_NAT)
+				state |= OVS_CS_F_DST_NAT;
+		}
+#endif
 		zone = nf_ct_zone(ct);
 	} else if (post_ct) {
 		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -160,9 +200,15 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 	__ovs_ct_update_key(key, state, zone, ct);
 }
 
+/* This is called to initialize CT key fields possibly coming in from the local
+ * stack. */
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 {
-	ovs_ct_update_key(skb, key, false);
+	ovs_ct_update_key(skb, key, false
+#ifdef CONFIG_NF_NAT_NEEDED
+			  , false
+#endif
+			  );
 }
 
 int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -291,7 +337,16 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 		return NF_DROP;
 	}
 
-	return helper->help(skb, protoff, ct, ctinfo);
+	if (helper->help(skb, protoff, ct, ctinfo) != NF_ACCEPT)
+		return NF_DROP;
+
+#ifdef CONFIG_NF_NAT_NEEDED
+	/* Adjust seqs after helper. */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
+	    && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+		return NF_DROP;
+#endif
+	return NF_ACCEPT;
 }
 
 static int handle_fragments(struct net *net, struct sw_flow_key *key,
@@ -377,7 +432,211 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
 	return true;
 }
 
-static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modeled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP. */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo,
+			      const struct nf_nat_range *range,
+			      enum nf_nat_manip_type maniptype)
+{
+	int hooknum, nh_off, err = NF_ACCEPT;
+
+	nh_off = skb_network_offset(skb);
+	skb_pull(skb, nh_off);
+
+	/* See HOOK2MANIP(). */
+	if (maniptype == NF_NAT_MANIP_SRC)
+		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+	else
+		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (skb->protocol == htons(ETH_P_IP)
+		    && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				err = NF_DROP;
+			goto push;
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			__be16 frag_off;
+			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+			int hdrlen = ipv6_skip_exthdr(skb,
+						      sizeof(struct ipv6hdr),
+						      &nexthdr, &frag_off);
+
+			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+				if (!nf_nat_icmpv6_reply_translation(skb, ct,
+								     ctinfo,
+								     hooknum,
+								     hdrlen))
+					err = NF_DROP;
+				goto push;
+			}
+		}
+		/* Non-ICMP, fall thru to initialize if needed. */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			/* Initialize according to the NAT action. */
+			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+				/* Action is set up to establish a new
+				 * mapping */
+				? nf_nat_setup_info(ct, range, maniptype)
+				: nf_nat_alloc_null_binding(ct, hooknum);
+		}
+		break;
+
+	case IP_CT_ESTABLISHED:
+	case IP_CT_ESTABLISHED_REPLY:
+		break;
+
+	default:
+		err = NF_DROP;
+		goto push;
+	}
+
+	if (err == NF_ACCEPT)
+		err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+	skb_push(skb, nh_off);
+
+	return err;
+}
+
+/* Find an existing conntrack entry for which this packet was already applied
+ * to.  This is only called when there is evidence that the packet was already
+ * tracked and commited, but we lost the ct reference due to an userspace
+ * upcall. This means that on entry skb->nfct is NULL.
+ * On success, returns conntrack ptr, sets skb->nfct and ctinfo.
+ * Must be called rcu_read_lock()ed. */
+static struct nf_conn *
+ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
+		     u_int8_t l3num, struct sk_buff *skb,
+		     enum ip_conntrack_info *ctinfo)
+{
+	struct nf_conntrack_l3proto *l3proto;
+	struct nf_conntrack_l4proto *l4proto;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	unsigned int dataoff;
+	u_int8_t protonum;
+
+	BUG_ON(skb->nfct != NULL);
+
+	l3proto = __nf_ct_l3proto_find(l3num);
+	if (!l3proto)
+		return NULL;
+	if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+				 &protonum) <= 0) {
+		pr_warn("ovs_ct_find_existing: Can't get l4proto\n");
+		return NULL;
+	}
+	l4proto = __nf_ct_l4proto_find(l3num, protonum);
+	if (!l4proto)
+		return NULL;
+
+	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+			     protonum, net, &tuple, l3proto, l4proto)) {
+		pr_warn("ovs_ct_find_existing: Can't get tuple\n");
+		return NULL;
+	}
+
+	/* look for tuple match */
+	h = nf_conntrack_find_get(net, zone, &tuple);
+	if (!h) {
+		pr_warn("ovs_ct_find_existing: Can't find tuple\n");
+		return NULL;
+	}
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	*ctinfo = nf_ct_get_info(h);
+	if (*ctinfo == IP_CT_NEW) {
+		/* This should not happen. */
+		pr_warn("ovs_ct_find_existing: new packet for %p\n", ct);
+	}
+	skb->nfct = &ct->ct_general;
+	skb->nfctinfo = *ctinfo;
+	return ct;
+}
+
+/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise.
+ * This action can be used to both NAT and reverse NAT, however, reverse NAT
+ * can also be done with the conntrack action. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb)
+{
+	enum nf_nat_manip_type maniptype;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	int err;
+
+	/* No NAT action or already NATed? */
+	if (!(info->flags & OVS_CT_F_NAT_MASK)
+	    || key->ct.state & OVS_CS_F_NAT_MASK)
+		return NF_ACCEPT;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	/* Check if an existing conntrack entry may be found for this skb.
+	 * This happens when we lose the ct entry pointer due to an upcall.
+	 * Don't lookup invalid connections. */
+	if (!ct && key->ct.state & OVS_CS_F_TRACKED
+	    && !(key->ct.state & OVS_CS_F_INVALID))
+		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
+					  &ctinfo);
+	if (!ct || nf_ct_is_untracked(ct))
+		/* A NAT action may only be performed on tracked packets. */
+		return NF_ACCEPT;
+
+	/* Add NAT extension if not commited yet. */
+	if (!nf_ct_is_confirmed(ct)) {
+		if (!nf_ct_nat_ext_add(ct))
+			return NF_ACCEPT;   /* Can't NAT. */
+	}
+
+	/* Determine NAT type.
+	 * Check if the NAT type can be deduced from the tracked connection.
+	 * Make sure expected traffic is NATted only when commiting. */
+	if (info->flags & OVS_CT_F_NAT && ctinfo != IP_CT_NEW
+	    && ct->status & IPS_NAT_MASK
+	    && (!(ct->status & IPS_EXPECTED_BIT)
+		|| info->flags & OVS_CT_F_COMMIT)) {
+		/* NAT an established or related connection like before. */
+		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+			/* This is the REPLY direction for a connection
+			 * for which NAT was applied in the forward
+			 * direction.  Do the reverse NAT. */
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+		else
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+	}
+	else if (info->flags & OVS_CT_F_SRC_NAT)
+		maniptype = NF_NAT_MANIP_SRC;
+	else if (info->flags & OVS_CT_F_DST_NAT)
+		maniptype = NF_NAT_MANIP_DST;
+	else
+		return NF_ACCEPT; /* Connection is not NATed. */
+
+	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+	/* Mark NAT done if successful. */
+	if (err == NF_ACCEPT)
+		key->ct.state |= (maniptype == NF_NAT_MANIP_SRC)
+			? OVS_CS_F_SRC_NAT : OVS_CS_F_DST_NAT;
+	return err;
+}
+#endif
+
+static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			   const struct ovs_conntrack_info *info,
 			   struct sk_buff *skb)
 {
@@ -386,7 +645,9 @@ static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,
 	 * actually run the packet through conntrack twice unless it's for a
 	 * different zone.
 	 */
-	if (!skb_nfct_cached(net, skb, info)) {
+	bool cached = skb_nfct_cached(net, skb, info);
+
+	if (!cached) {
 		struct nf_conn *tmpl = info->ct;
 
 		/* Associate skb with specified zone. */
@@ -402,10 +663,38 @@ static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,
 				    skb) != NF_ACCEPT)
 			return -ENOENT;
 
-		if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
-			WARN_ONCE(1, "helper rejected packet");
-			return -EINVAL;
-		}
+		/* Clear CT state NAT flags to mark that we have not yet done
+		 * NAT after the nf_conntrack_in() call.  We can actually clear
+		 * the whole state, as it will be re-initialized below. */
+		key->ct.state = 0;
+
+		/* Update the key, but keep the NAT flags. */
+		ovs_ct_update_key(skb, key, true
+#ifdef CONFIG_NF_NAT_NEEDED
+				  , true);
+#endif
+	}
+
+	/* Never NAT or Help NEW connections without commit. */
+	if ((key->ct.state & OVS_CS_F_NEW) && !(info->flags & OVS_CT_F_COMMIT))
+		return 0;
+
+#ifdef CONFIG_NF_NAT_NEEDED
+	/* NAT action must be executed once on every packet, and before the
+	 * helper, if any. */
+	if (ovs_ct_nat(net, key, info, skb) != NF_ACCEPT) {
+		WARN_ONCE(1, "NAT rejected packet");
+		return -EINVAL;
+	}
+#endif
+	/* Call helper after the first time nf_conntrack_in is called,
+	 * and for new connections that are being commited. */
+	if ((!cached ||
+	     ((key->ct.state & OVS_CS_F_NEW)
+	      && (info->flags & OVS_CT_F_COMMIT)))
+	    && ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+		WARN_ONCE(1, "helper rejected packet");
+		return -EINVAL;
 	}
 
 	return 0;
@@ -422,18 +711,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	if (exp) {
 		u8 state;
 
+		/* NOTE: New connections are NATted and Helped only when
+		 * commited, so we are not calling into NAT here. */
 		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
 		__ovs_ct_update_key(key, state, &info->zone, exp->master);
 	} else {
-		int err;
-
-		err = __ovs_ct_lookup(net, key, info, skb);
-		if (err)
-			return err;
-
-		ovs_ct_update_key(skb, key, true);
+		return __ovs_ct_lookup(net, key, info, skb);
 	}
-
 	return 0;
 }
 
@@ -448,9 +732,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 	state = key->ct.state;
 	if (key->ct.zone == info->zone.id &&
 	    ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
-		/* Previous lookup has shown that this connection is already
-		 * tracked and committed. Skip committing.
-		 */
+		/* Previous lookup has shown that this packet is already
+		 * tracked and the connection is committed. Skip committing. */
 		return 0;
 	}
 
@@ -459,9 +742,6 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 		return err;
 	if (nf_conntrack_confirm(skb) != NF_ACCEPT)
 		return -EINVAL;
-
-	ovs_ct_update_key(skb, key, true);
-
 	return 0;
 }
 
@@ -495,8 +775,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 
 	if (info->flags & OVS_CT_F_COMMIT)
 		err = ovs_ct_commit(net, key, info, skb);
-	else
+	else {
 		err = ovs_ct_lookup(net, key, info, skb);
+	}
 	if (err)
 		goto err;
 
@@ -538,6 +819,131 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 	return 0;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static int parse_nat(const struct nlattr *attr,
+		     struct ovs_conntrack_info *info, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool have_ip_max = false;
+	bool have_proto_max = false;
+	bool ip_vers = (info->family == NFPROTO_IPV6);
+
+	nla_for_each_nested(a, attr, rem) {
+		static const u16 ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+			[OVS_NAT_ATTR_SRC] = {0, 0},
+			[OVS_NAT_ATTR_DST] = {0, 0},
+			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16),sizeof(u16)},
+			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16),sizeof(u16)},
+			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+		};
+		int type = nla_type(a);
+
+		if (type > OVS_NAT_ATTR_MAX) {
+			OVS_NLERR(log, "Unknown nat attribute (type=%d, max=%d).\n",
+			type, OVS_NAT_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (ovs_nat_attr_lens[type][ip_vers] != nla_len(a)) {
+			OVS_NLERR(log, "NAT attribute type has unexpected "
+				  " length (type=%d, length=%d, expected=%d).\n",
+				  type, nla_len(a),
+				  ovs_nat_attr_lens[type][ip_vers]);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NAT_ATTR_SRC:
+		case OVS_NAT_ATTR_DST:
+			if (info->flags & OVS_CT_F_NAT) {
+				OVS_NLERR(log, "Only one type of NAT may be "
+					  "specified.\n");
+				return -ERANGE;
+			}
+			info->flags |= OVS_CT_F_NAT;
+			info->flags |= ((type == OVS_NAT_ATTR_SRC)
+					? OVS_CT_F_SRC_NAT : OVS_CT_F_DST_NAT);
+			break;
+
+		case OVS_NAT_ATTR_IP_MIN:
+			nla_memcpy(&info->range.min_addr, a, nla_len(a));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_IP_MAX:
+			have_ip_max = true;
+			nla_memcpy(&info->range.max_addr, a,
+				   sizeof(info->range.max_addr));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MIN:
+			info->range.min_proto.all = nla_get_u16(a);
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MAX:
+			have_proto_max = true;
+			info->range.max_proto.all = nla_get_u16(a);
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PERSISTENT:
+			info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_HASH:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_RANDOM:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+			break;
+
+		default:
+			OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+		return -EINVAL;
+	}
+	if (!(info->flags & OVS_CT_F_NAT)) {
+		/* Do not allow range if no type is given. */
+		if (info->range.flags) {
+			OVS_NLERR(log, "NAT flags may be given only when NAT "
+				  "range (SRC or DST) is also specified.\n");
+			return -EINVAL;
+		}
+		info->flags |= OVS_CT_F_NAT;   /* NAT existing connections. */
+	} else if (!(info->flags & OVS_CT_F_COMMIT)) {
+		OVS_NLERR(log, "NAT attributes may be specified only "
+			  "when CT COMMIT flag is also specified.\n");
+		return -EINVAL;
+	}
+	/* Allow missing IP_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+		memcpy(&info->range.max_addr, &info->range.min_addr,
+		       sizeof(info->range.max_addr));
+	}
+	/* Allow missing PROTO_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED
+	    && !have_proto_max) {
+		info->range.max_proto.all = info->range.min_proto.all;
+	}
+	return 0;
+}
+#endif
+
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0,
 				    .maxlen = 0 },
@@ -548,7 +954,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_LABEL]	= { .minlen = sizeof(struct md_label),
 				    .maxlen = sizeof(struct md_label) },
 	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
-				    .maxlen = NF_CT_HELPER_NAME_LEN }
+				    .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+	[OVS_CT_ATTR_NAT]	= { .minlen = 0,
+				    .maxlen = 96 }
+#endif
 };
 
 static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -607,6 +1017,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 				return -EINVAL;
 			}
 			break;
+#ifdef CONFIG_NF_NAT_NEEDED
+		case OVS_CT_ATTR_NAT: {
+			int err = parse_nat(a, info, log);
+			if (err)
+				return err;
+			break;
+		}
+#endif
 		default:
 			OVS_NLERR(log, "Unknown conntrack attr (%d)",
 				  type);
@@ -692,6 +1110,69 @@ err_free_ct:
 	return err;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+			       struct sk_buff *skb)
+{
+	struct nlattr *start;
+
+	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+	if (!start)
+		return false;
+
+	if (info->flags & OVS_CT_F_SRC_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+			return false;
+	} else if (info->flags & OVS_CT_F_DST_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+			return false;
+	} else {
+		goto out;
+	}
+
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+		if (info->family == NFPROTO_IPV4) {
+			if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					info->range.min_addr.ip) ||
+			    (info->range.max_addr.ip != info->range.min_addr.ip
+			     && (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+						 info->range.max_addr.ip))))
+				return false;
+		} else { /* IPv6 */
+			if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					     &info->range.min_addr.in6) ||
+			    (memcmp(&info->range.max_addr.in6,
+				    &info->range.min_addr.in6,
+				    sizeof info->range.max_addr.in6)
+			     && (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+						  &info->range.max_addr.in6))))
+				return false;
+		}
+	}
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+			 info->range.min_proto.all)
+	     || (info->range.max_proto.all != info->range.min_proto.all
+		 && nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+				info->range.max_proto.all))))
+		return false;
+
+	if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+		return false;
+out:
+	nla_nest_end(skb, start);
+
+	return true;
+}
+#endif
+
 int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 			  struct sk_buff *skb)
 {
@@ -720,7 +1201,11 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 				   ct_info->helper->name))
 			return -EMSGSIZE;
 	}
-
+#ifdef CONFIG_NF_NAT_NEEDED
+	if (ct_info->flags & OVS_CT_F_NAT_MASK &&
+	    !ovs_ct_nat_to_attr(ct_info, skb))
+		return -EMSGSIZE;
+#endif
 	nla_nest_end(skb, start);
 
 	return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 43f5dd7..2b8ff65 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -34,6 +34,7 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
 int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
 void ovs_ct_free_action(const struct nlattr *a);
+
 #else
 #include <linux/errno.h>
 
@@ -82,5 +83,6 @@ static inline int ovs_ct_put_key(const struct sw_flow_key *key,
 }
 
 static inline void ovs_ct_free_action(const struct nlattr *a) { }
+
 #endif /* CONFIG_NF_CONNTRACK */
 #endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 5688e33..2837690 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -119,7 +119,6 @@ struct sw_flow_key {
 		u8 state;
 		struct ovs_key_ct_label label;
 	} ct;
-
 } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
 
 struct sw_flow_key_range {
@@ -210,6 +209,16 @@ static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid)
 	return !ovs_identifier_is_ufid(sfid);
 }
 
+static inline void ovs_invalidate_flow_key(struct sw_flow_key *key)
+{
+	key->eth.type = htons(0);
+}
+
+static inline bool ovs_is_flow_key_valid(const struct sw_flow_key *key)
+{
+	return !!key->eth.type;
+}
+
 void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags,
 			   const struct sk_buff *);
 void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *,
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition.
  2015-10-20 22:20 [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
                   ` (3 preceding siblings ...)
  2015-10-20 22:20 ` [RFC PATCH 5/5] openvswitch: Interface with NAT Jarno Rajahalme
@ 2015-10-20 22:28 ` Jarno Rajahalme
  2015-10-21  8:33 ` [ovs-dev] " Thomas Graf
  5 siblings, 0 replies; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-20 22:28 UTC (permalink / raw)
  To: netdev; +Cc: dev

I missed the “net-next” label from the title, sorry for that.

  Jarno

> On Oct 20, 2015, at 3:20 PM, Jarno Rajahalme <jrajahalme@nicira.com> wrote:
> 
> Remove the definition of IP_CT_NEW_REPLY as it does not make sense.
> This allows the definition of IP_CT_NUMBER to be simplified as well.
> 
> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
> ---
> include/uapi/linux/netfilter/nf_conntrack_common.h | 7 ++++---
> net/openvswitch/conntrack.c                        | 2 --
> 2 files changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
> index 319f471..e0aebc8 100644
> --- a/include/uapi/linux/netfilter/nf_conntrack_common.h
> +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
> @@ -20,9 +20,10 @@ enum ip_conntrack_info {
> 
> 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
> 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
> -	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
> -	/* Number of distinct IP_CT types (no NEW in reply dirn). */
> -	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
> +	/* No IP_CT_NEW_REPLY */
> +
> +	/* Number of distinct IP_CT types. */
> +	IP_CT_NUMBER
> };
> 
> #define NF_CT_STATE_INVALID_BIT			(1 << 0)
> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> index ad61426..097ace4 100644
> --- a/net/openvswitch/conntrack.c
> +++ b/net/openvswitch/conntrack.c
> @@ -73,7 +73,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
> 	switch (ctinfo) {
> 	case IP_CT_ESTABLISHED_REPLY:
> 	case IP_CT_RELATED_REPLY:
> -	case IP_CT_NEW_REPLY:
> 		ct_state |= OVS_CS_F_REPLY_DIR;
> 		break;
> 	default:
> @@ -90,7 +89,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
> 		ct_state |= OVS_CS_F_RELATED;
> 		break;
> 	case IP_CT_NEW:
> -	case IP_CT_NEW_REPLY:
> 		ct_state |= OVS_CS_F_NEW;
> 		break;
> 	default:
> -- 
> 2.1.4
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [ovs-dev] [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition.
  2015-10-20 22:20 [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
                   ` (4 preceding siblings ...)
  2015-10-20 22:28 ` [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
@ 2015-10-21  8:33 ` Thomas Graf
       [not found]   ` <20151021083323.GB15539-4EA/1caXOu0mYvmMESoHnA@public.gmane.org>
  5 siblings, 1 reply; 23+ messages in thread
From: Thomas Graf @ 2015-10-21  8:33 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: netdev, dev, netfilter-devel

[Copying netfilter-devel]

On 10/20/15 at 03:20pm, Jarno Rajahalme wrote:
> Remove the definition of IP_CT_NEW_REPLY as it does not make sense.
> This allows the definition of IP_CT_NUMBER to be simplified as well.
> 
> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
> ---
>  include/uapi/linux/netfilter/nf_conntrack_common.h | 7 ++++---
>  net/openvswitch/conntrack.c                        | 2 --
>  2 files changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
> index 319f471..e0aebc8 100644
> --- a/include/uapi/linux/netfilter/nf_conntrack_common.h
> +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
> @@ -20,9 +20,10 @@ enum ip_conntrack_info {
>  
>  	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
>  	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
> -	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
> -	/* Number of distinct IP_CT types (no NEW in reply dirn). */
> -	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
> +	/* No IP_CT_NEW_REPLY */
> +
> +	/* Number of distinct IP_CT types. */
> +	IP_CT_NUMBER
>  };

I understand what you are doing here but this is part of the published
UAPI and removing this might break compilation of a user application
even if the definition is not used right now. It's probably safer to
leave the definition and obsolete it with a comment.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info().
       [not found]   ` <1445379629-112880-2-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
@ 2015-10-21  8:50     ` Thomas Graf
  0 siblings, 0 replies; 23+ messages in thread
From: Thomas Graf @ 2015-10-21  8:50 UTC (permalink / raw)
  To: Jarno Rajahalme
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA

On 10/20/15 at 03:20pm, Jarno Rajahalme wrote:
> Define a new inline function to map conntrack status to enum
> ip_conntrack_info.  This removes the need to otherwise duplicate this
> code in a later patch.
> 
> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>

LGTM

> ---
>  include/net/netfilter/nf_conntrack.h | 15 +++++++++++++++
>  net/netfilter/nf_conntrack_core.c    | 22 +++-------------------
>  2 files changed, 18 insertions(+), 19 deletions(-)
> 
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index fde4068..b3de10e 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -125,6 +125,21 @@ nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
>  			    tuplehash[hash->tuple.dst.dir]);
>  }
>  
> +static inline enum ip_conntrack_info
> +nf_ct_get_info(const struct nf_conntrack_tuple_hash *h)
> +{
> +	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
> +
> +	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
> +		return IP_CT_ESTABLISHED_REPLY;
> +	/* Once we've had two way comms, always ESTABLISHED. */
> +	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
> +		return IP_CT_ESTABLISHED;
> +	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
> +		return IP_CT_RELATED;
> +	return IP_CT_NEW;
> +}
> +
>  static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct)
>  {
>  	return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 3cb3cb8..70ddbd8 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -1056,25 +1056,9 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
>  	ct = nf_ct_tuplehash_to_ctrack(h);
>  
>  	/* It exists; we have (non-exclusive) reference. */
> -	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
> -		*ctinfo = IP_CT_ESTABLISHED_REPLY;
> -		/* Please set reply bit if this packet OK */
> -		*set_reply = 1;
> -	} else {
> -		/* Once we've had two way comms, always ESTABLISHED. */
> -		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
> -			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
> -			*ctinfo = IP_CT_ESTABLISHED;
> -		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
> -			pr_debug("nf_conntrack_in: related packet for %p\n",
> -				 ct);
> -			*ctinfo = IP_CT_RELATED;
> -		} else {
> -			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
> -			*ctinfo = IP_CT_NEW;
> -		}
> -		*set_reply = 0;
> -	}
> +	*ctinfo = nf_ct_get_info(h);
> +	*set_reply = NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY;
> +
>  	skb->nfct = &ct->ct_general;
>  	skb->nfctinfo = *ctinfo;
>  	return ct;
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 5/5] openvswitch: Interface with NAT.
       [not found]   ` <1445379629-112880-5-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
@ 2015-10-21  9:34     ` Florian Westphal
  2015-10-21 11:30       ` Thomas Graf
  0 siblings, 1 reply; 23+ messages in thread
From: Florian Westphal @ 2015-10-21  9:34 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA

Jarno Rajahalme <jrajahalme@nicira.com> wrote:
> Extend OVS conntrack interface to cover NAT.  New nested
> OVS_CT_ATTR_NAT may be used to include NAT with a CT action.  A bare
> OVS_CT_ATTR_NAT only mangles existing connections.  If
> OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
> attributes, new (non-committed/non-confirmed) connections are mangled
> according to the rest of the nested attributes.
> 
> This work extends on a branch by Thomas Graf at
> https://github.com/tgraf/ovs/tree/nat.
> 
> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
> ---
>  include/uapi/linux/openvswitch.h |  48 +++-
>  net/openvswitch/actions.c        |  25 +-
>  net/openvswitch/conntrack.c      | 543 ++++++++++++++++++++++++++++++++++++---
>  net/openvswitch/conntrack.h      |   2 +
>  net/openvswitch/flow.h           |  11 +-
>  5 files changed, 580 insertions(+), 49 deletions(-)
> 
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 098d8b5..9d63472 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -454,6 +454,12 @@ struct ovs_key_ct_label {
>  #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
>  #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
>  #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
> +#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
> +					   mangled by NAT. */
> +#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
> +					   was mangled by NAT. */

I'm blind -- how does ovs deal with change of output device and the
ether dst mac as result of a l3 dst translation?
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 4/5] openvswitch: conntrack netlink API updates.
  2015-10-20 22:20   ` [RFC PATCH 4/5] openvswitch: conntrack netlink API updates Jarno Rajahalme
@ 2015-10-21 10:41     ` Pablo Neira Ayuso
  2015-10-21 11:18       ` Thomas Graf
  0 siblings, 1 reply; 23+ messages in thread
From: Pablo Neira Ayuso @ 2015-10-21 10:41 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: netdev, dev, netfilter-devel

On Tue, Oct 20, 2015 at 03:20:28PM -0700, Jarno Rajahalme wrote:
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index 77850f1..1e6c2ac 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> @@ -291,7 +291,7 @@ size_t ovs_key_attr_size(void)
>  		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
>  		+ nla_total_size(4)   /* OVS_KEY_ATTR_DP_HASH */
>  		+ nla_total_size(4)   /* OVS_KEY_ATTR_RECIRC_ID */
> -		+ nla_total_size(1)   /* OVS_KEY_ATTR_CT_STATE */
> +		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_STATE */
>  		+ nla_total_size(2)   /* OVS_KEY_ATTR_CT_ZONE */
>  		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
>  		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABEL */

This is very strange.

You're changing the size of the netlink attribute, this will break
existing userspace applications.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 3/5] netfilter: Allow calling into nat helper without skb_dst.
       [not found]   ` <1445379629-112880-3-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
@ 2015-10-21 10:44     ` Pablo Neira Ayuso
  2015-10-21 20:44       ` Jarno Rajahalme
  0 siblings, 1 reply; 23+ messages in thread
From: Pablo Neira Ayuso @ 2015-10-21 10:44 UTC (permalink / raw)
  To: Jarno Rajahalme
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA

On Tue, Oct 20, 2015 at 03:20:27PM -0700, Jarno Rajahalme wrote:
> NAT checksum recalculation code assumed existence of skb_dst, which
> becomes a problem for a later patch in the series.  Simplify this by
> removing the checks, as the checksum will be dealt with later in the
> stack.

Please, resubmit netfilter patches to netfilter-devel@vger.kernel.org
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info().
  2015-10-20 22:20 ` [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info() Jarno Rajahalme
       [not found]   ` <1445379629-112880-2-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
@ 2015-10-21 10:45   ` Pablo Neira Ayuso
  2015-10-21 20:43     ` Jarno Rajahalme
  2015-10-21 21:38     ` Jarno Rajahalme
  1 sibling, 2 replies; 23+ messages in thread
From: Pablo Neira Ayuso @ 2015-10-21 10:45 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: netdev, dev, netfilter-devel

On Tue, Oct 20, 2015 at 03:20:26PM -0700, Jarno Rajahalme wrote:
> Define a new inline function to map conntrack status to enum
> ip_conntrack_info.  This removes the need to otherwise duplicate this
> code in a later patch.

Where is that later patch that justifies this update?

> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
> ---
>  include/net/netfilter/nf_conntrack.h | 15 +++++++++++++++
>  net/netfilter/nf_conntrack_core.c    | 22 +++-------------------
>  2 files changed, 18 insertions(+), 19 deletions(-)
> 
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index fde4068..b3de10e 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -125,6 +125,21 @@ nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
>  			    tuplehash[hash->tuple.dst.dir]);
>  }
>  
> +static inline enum ip_conntrack_info
> +nf_ct_get_info(const struct nf_conntrack_tuple_hash *h)
> +{
> +	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
> +
> +	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
> +		return IP_CT_ESTABLISHED_REPLY;
> +	/* Once we've had two way comms, always ESTABLISHED. */
> +	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
> +		return IP_CT_ESTABLISHED;
> +	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
> +		return IP_CT_RELATED;
> +	return IP_CT_NEW;
> +}

I would really like to see how you want to use this.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 5/5] openvswitch: Interface with NAT.
  2015-10-20 22:20 ` [RFC PATCH 5/5] openvswitch: Interface with NAT Jarno Rajahalme
       [not found]   ` <1445379629-112880-5-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
@ 2015-10-21 10:59   ` Thomas Graf
  2015-10-21 21:04     ` Jarno Rajahalme
  1 sibling, 1 reply; 23+ messages in thread
From: Thomas Graf @ 2015-10-21 10:59 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: netdev, dev, netfilter-devel

On 10/20/15 at 03:20pm, Jarno Rajahalme wrote:
> Extend OVS conntrack interface to cover NAT.  New nested
> OVS_CT_ATTR_NAT may be used to include NAT with a CT action.  A bare
> OVS_CT_ATTR_NAT only mangles existing connections.  If
> OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
> attributes, new (non-committed/non-confirmed) connections are mangled
> according to the rest of the nested attributes.
> 
> This work extends on a branch by Thomas Graf at
> https://github.com/tgraf/ovs/tree/nat.
> 
> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>

Awesome work. This is a great start.

There are some, probably unintended, unrelated style changes. More
comments below.

> +enum ovs_nat_attr {
> +	OVS_NAT_ATTR_UNSPEC,
> +	OVS_NAT_ATTR_SRC,
> +	OVS_NAT_ATTR_DST,
> +	OVS_NAT_ATTR_IP_MIN,
> +	OVS_NAT_ATTR_IP_MAX,
> +	OVS_NAT_ATTR_PROTO_MIN,
> +	OVS_NAT_ATTR_PROTO_MAX,
> +	OVS_NAT_ATTR_PERSISTENT,
> +	OVS_NAT_ATTR_PROTO_HASH,
> +	OVS_NAT_ATTR_PROTO_RANDOM,

Simplify this with an OVS_NAT_ATTR_FLAGS?

> @@ -137,11 +159,17 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
>  	ovs_ct_get_label(ct, &key->ct.label);
>  }
>  
> -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
> - * previously sent the packet to conntrack via the ct action.
> +/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
> + * previously sent the packet to conntrack via the ct action.  If
> + * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
> + * initialized from the connection status.
>   */
>  static void ovs_ct_update_key(const struct sk_buff *skb,
> -			      struct sw_flow_key *key, bool post_ct)
> +			      struct sw_flow_key *key, bool post_ct
> +#ifdef CONFIG_NF_NAT_NEEDED
> +			      , bool keep_nat_flags
> +#endif
> +			      )

I suggest keeping the argument even for !CONFIG_NF_NAT_NEEDED. This
unclutters the call sites of this function. An ifdef inside the
keep_nat_flags branch should be enough. The compiler will optimize
the code away and it's much prettier to read.

>  {
>  	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
>  	enum ip_conntrack_info ctinfo;
> @@ -151,8 +179,20 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
>  	ct = nf_ct_get(skb, &ctinfo);
>  	if (ct) {
>  		state = ovs_ct_get_state(ctinfo);
> +		/* OVS persists the related flag for the duration of the
> +		 * connection. */
>  		if (ct->master)
>  			state |= OVS_CS_F_RELATED;
> +#ifdef CONFIG_NF_NAT_NEEDED
> +		if (keep_nat_flags)
> +			state |= key->ct.state & OVS_CS_F_NAT_MASK;
> +		else {
> +			if (ct->status & IPS_SRC_NAT)
> +				state |= OVS_CS_F_SRC_NAT;
> +			if (ct->status & IPS_DST_NAT)
> +				state |= OVS_CS_F_DST_NAT;
> +		}
> +#endif
>  		zone = nf_ct_zone(ct);
>  	} else if (post_ct) {
>  		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
> @@ -291,7 +337,16 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
>  		return NF_DROP;
>  	}
>  
> -	return helper->help(skb, protoff, ct, ctinfo);
> +	if (helper->help(skb, protoff, ct, ctinfo) != NF_ACCEPT)
> +		return NF_DROP;

Return the returned value here instead of hardcoding NF_DROP?

> +#ifdef CONFIG_NF_NAT_NEEDED
> +	/* Adjust seqs after helper. */

A comment on why this is needed would be helpful.

> +	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
> +	    && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
> +		return NF_DROP;
> +#endif
> +	return NF_ACCEPT;

> @@ -377,7 +432,211 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
>  	return true;
>  }
>  
> -static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,
> +#ifdef CONFIG_NF_NAT_NEEDED
> +/* Modeled after nf_nat_ipv[46]_fn().
> + * range is only used for new, uninitialized NAT state.
> + * Returns either NF_ACCEPT or NF_DROP. */
> +static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
> +			      enum ip_conntrack_info ctinfo,
> +			      const struct nf_nat_range *range,
> +			      enum nf_nat_manip_type maniptype)
> +{
> +	int hooknum, nh_off, err = NF_ACCEPT;
> +
> +	nh_off = skb_network_offset(skb);
> +	skb_pull(skb, nh_off);
> +
> +	/* See HOOK2MANIP(). */
> +	if (maniptype == NF_NAT_MANIP_SRC)
> +		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
> +	else
> +		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
> +
> +	switch (ctinfo) {
> +	case IP_CT_RELATED:
> +	case IP_CT_RELATED_REPLY:
> +		if (skb->protocol == htons(ETH_P_IP)
> +		    && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
> +			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
> +							   hooknum))
> +				err = NF_DROP;
> +			goto push;
> +		} else if (skb->protocol == htons(ETH_P_IPV6)) {
> +			__be16 frag_off;
> +			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
> +			int hdrlen = ipv6_skip_exthdr(skb,
> +						      sizeof(struct ipv6hdr),
> +						      &nexthdr, &frag_off);
> +
> +			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
> +				if (!nf_nat_icmpv6_reply_translation(skb, ct,
> +								     ctinfo,
> +								     hooknum,
> +								     hdrlen))
> +					err = NF_DROP;
> +				goto push;
> +			}
> +		}
> +		/* Non-ICMP, fall thru to initialize if needed. */
> +	case IP_CT_NEW:
> +		/* Seen it before?  This can happen for loopback, retrans,
> +		 * or local packets.
> +		 */
> +		if (!nf_nat_initialized(ct, maniptype)) {

Explicit unlikely()?

> +			/* Initialize according to the NAT action. */
> +			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
> +				/* Action is set up to establish a new
> +				 * mapping */
> +				? nf_nat_setup_info(ct, range, maniptype)
> +				: nf_nat_alloc_null_binding(ct, hooknum);
> +		}
> +		break;
> +
> +	case IP_CT_ESTABLISHED:
> +	case IP_CT_ESTABLISHED_REPLY:
> +		break;
> +
> +	default:
> +		err = NF_DROP;
> +		goto push;
> +	}
> +
> +	if (err == NF_ACCEPT)
> +		err = nf_nat_packet(ct, ctinfo, hooknum, skb);

If you goto push on init failure (IP_CT_NEW branch), then this
conditional is no longer needed and a more straight forward exception
handling is seen.

> +push:
> +	skb_push(skb, nh_off);
> +
> +	return err;
> +}

> +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise.
> + * This action can be used to both NAT and reverse NAT, however, reverse NAT
> + * can also be done with the conntrack action. */
> +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
> +		      const struct ovs_conntrack_info *info,
> +		      struct sk_buff *skb)
> +{
> +	enum nf_nat_manip_type maniptype;
> +	enum ip_conntrack_info ctinfo;
> +	struct nf_conn *ct;
> +	int err;
> +
> +	/* No NAT action or already NATed? */
> +	if (!(info->flags & OVS_CT_F_NAT_MASK)
> +	    || key->ct.state & OVS_CS_F_NAT_MASK)
> +		return NF_ACCEPT;
> +
> +	ct = nf_ct_get(skb, &ctinfo);
> +	/* Check if an existing conntrack entry may be found for this skb.
> +	 * This happens when we lose the ct entry pointer due to an upcall.
> +	 * Don't lookup invalid connections. */
> +	if (!ct && key->ct.state & OVS_CS_F_TRACKED
> +	    && !(key->ct.state & OVS_CS_F_INVALID))
> +		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
> +					  &ctinfo);
> +	if (!ct || nf_ct_is_untracked(ct))
> +		/* A NAT action may only be performed on tracked packets. */
> +		return NF_ACCEPT;

Braces

> +	/* Add NAT extension if not commited yet. */
> +	if (!nf_ct_is_confirmed(ct)) {
> +		if (!nf_ct_nat_ext_add(ct))
> +			return NF_ACCEPT;   /* Can't NAT. */
> +	}

&&

> +	/* Determine NAT type.
> +	 * Check if the NAT type can be deduced from the tracked connection.
> +	 * Make sure expected traffic is NATted only when commiting. */
> +	if (info->flags & OVS_CT_F_NAT && ctinfo != IP_CT_NEW
> +	    && ct->status & IPS_NAT_MASK
> +	    && (!(ct->status & IPS_EXPECTED_BIT)
> +		|| info->flags & OVS_CT_F_COMMIT)) {
> +		/* NAT an established or related connection like before. */
> +		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
> +			/* This is the REPLY direction for a connection
> +			 * for which NAT was applied in the forward
> +			 * direction.  Do the reverse NAT. */
> +			maniptype = ct->status & IPS_SRC_NAT
> +				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
> +		else
> +			maniptype = ct->status & IPS_SRC_NAT
> +				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
> +	}
> +	else if (info->flags & OVS_CT_F_SRC_NAT)
> +		maniptype = NF_NAT_MANIP_SRC;
> +	else if (info->flags & OVS_CT_F_DST_NAT)
> +		maniptype = NF_NAT_MANIP_DST;
> +	else
> +		return NF_ACCEPT; /* Connection is not NATed. */
> +
> +	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
> +
> +	/* Mark NAT done if successful. */
> +	if (err == NF_ACCEPT)
> +		key->ct.state |= (maniptype == NF_NAT_MANIP_SRC)
> +			? OVS_CS_F_SRC_NAT : OVS_CS_F_DST_NAT;
> +	return err;
> +}
> +#endif
> +
> +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>  			   const struct ovs_conntrack_info *info,
>  			   struct sk_buff *skb)
>  {


> @@ -538,6 +819,131 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_NF_NAT_NEEDED
> +static int parse_nat(const struct nlattr *attr,
> +		     struct ovs_conntrack_info *info, bool log)
> +{
> +	struct nlattr *a;
> +	int rem;
> +	bool have_ip_max = false;
> +	bool have_proto_max = false;
> +	bool ip_vers = (info->family == NFPROTO_IPV6);
> +
> +	nla_for_each_nested(a, attr, rem) {
> +		static const u16 ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
> +			[OVS_NAT_ATTR_SRC] = {0, 0},
> +			[OVS_NAT_ATTR_DST] = {0, 0},
> +			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
> +						 sizeof(struct in6_addr)},
> +			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
> +						 sizeof(struct in6_addr)},
> +			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16),sizeof(u16)},
> +			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16),sizeof(u16)},
> +			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
> +			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
> +			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
> +		};
> +		int type = nla_type(a);
> +
> +		if (type > OVS_NAT_ATTR_MAX) {
> +			OVS_NLERR(log, "Unknown nat attribute (type=%d, max=%d).\n",
> +			type, OVS_NAT_ATTR_MAX);

Formatting

> +			return -EINVAL;
> +		}
> +
> +		case OVS_NAT_ATTR_IP_MIN:
> +			nla_memcpy(&info->range.min_addr, a, nla_len(a));

The length attribute should be sizeof of min_addr like for max_addr
below.

> +			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
> +			break;
> +
> +		case OVS_NAT_ATTR_IP_MAX:
> +			have_ip_max = true;
> +			nla_memcpy(&info->range.max_addr, a,
> +				   sizeof(info->range.max_addr));
> +			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
> +			break;
> +
> +	}

>  static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
>  	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0,
>  				    .maxlen = 0 },
> @@ -548,7 +954,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
>  	[OVS_CT_ATTR_LABEL]	= { .minlen = sizeof(struct md_label),
>  				    .maxlen = sizeof(struct md_label) },
>  	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
> -				    .maxlen = NF_CT_HELPER_NAME_LEN }
> +				    .maxlen = NF_CT_HELPER_NAME_LEN },
> +#ifdef CONFIG_NF_NAT_NEEDED
> +	[OVS_CT_ATTR_NAT]	= { .minlen = 0,
> +				    .maxlen = 96 }
> +#endif

Is the 96 a temporary hack here?

> @@ -607,6 +1017,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
>  				return -EINVAL;
>  			}
>  			break;
> +#ifdef CONFIG_NF_NAT_NEEDED
> +		case OVS_CT_ATTR_NAT: {
> +			int err = parse_nat(a, info, log);
> +			if (err)
> +				return err;
> +			break;
> +		}
> +#endif

We should probably bark if user space provides a OVS_CT_ATTR_NAT but the
kernel is compiled without support for it.

> +#ifdef CONFIG_NF_NAT_NEEDED
> +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
> +			       struct sk_buff *skb)
> +{
> +	struct nlattr *start;
> +
> +	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
> +	if (!start)
> +		return false;
> +
> +	if (info->flags & OVS_CT_F_SRC_NAT) {
> +		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
> +			return false;
> +	} else if (info->flags & OVS_CT_F_DST_NAT) {
> +		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
> +			return false;
> +	} else {
> +		goto out;

Is the empty nested attribute intended here?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 4/5] openvswitch: conntrack netlink API updates.
  2015-10-21 10:41     ` Pablo Neira Ayuso
@ 2015-10-21 11:18       ` Thomas Graf
  0 siblings, 0 replies; 23+ messages in thread
From: Thomas Graf @ 2015-10-21 11:18 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Jarno Rajahalme, netdev, dev, netfilter-devel

On 10/21/15 at 12:41pm, Pablo Neira Ayuso wrote:
> On Tue, Oct 20, 2015 at 03:20:28PM -0700, Jarno Rajahalme wrote:
> > diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> > index 77850f1..1e6c2ac 100644
> > --- a/net/openvswitch/flow_netlink.c
> > +++ b/net/openvswitch/flow_netlink.c
> > @@ -291,7 +291,7 @@ size_t ovs_key_attr_size(void)
> >  		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
> >  		+ nla_total_size(4)   /* OVS_KEY_ATTR_DP_HASH */
> >  		+ nla_total_size(4)   /* OVS_KEY_ATTR_RECIRC_ID */
> > -		+ nla_total_size(1)   /* OVS_KEY_ATTR_CT_STATE */
> > +		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_STATE */
> >  		+ nla_total_size(2)   /* OVS_KEY_ATTR_CT_ZONE */
> >  		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
> >  		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABEL */
> 
> This is very strange.
> 
> You're changing the size of the netlink attribute, this will break
> existing userspace applications.

This seems to be a temporary forward port of the following commit. The
change is backwards compatible.

commit fbccce5965a58d56aaed9e9acd1bec75d8a66e87
Author: Joe Stringer <joestringer@nicira.com>
Date:   Tue Oct 6 11:00:00 2015 -0700

    openvswitch: Extend ct_state match field to 32 bits

    The ct_state field was initially added as an 8-bit field, however six of
    the bits are already being used and use cases are already starting to
    appear that may push the limits of this field. This patch extends the
    field to 32 bits while retaining the internal representation of 8 bits.
    This should cover forward compatibility of the ABI for the foreseeable
    future.

    This patch also reorders the OVS_CS_F_* bits to be sequential.

    Suggested-by: Jarno Rajahalme <jrajahalme@nicira.com>
    Signed-off-by: Joe Stringer <joestringer@nicira.com>
    Acked-by: Pravin B Shelar <pshelar@nicira.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 5/5] openvswitch: Interface with NAT.
  2015-10-21  9:34     ` Florian Westphal
@ 2015-10-21 11:30       ` Thomas Graf
  2015-10-21 14:42         ` Florian Westphal
  0 siblings, 1 reply; 23+ messages in thread
From: Thomas Graf @ 2015-10-21 11:30 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Jarno Rajahalme, netdev, dev

On 10/21/15 at 11:34am, Florian Westphal wrote:
> Jarno Rajahalme <jrajahalme@nicira.com> wrote:
> >  #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
> >  #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
> >  #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
> > +#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
> > +					   mangled by NAT. */
> > +#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
> > +					   was mangled by NAT. */
> 
> I'm blind -- how does ovs deal with change of output device and the
> ether dst mac as result of a l3 dst translation?

I assume you are referring to rewriting of L2 and the forwarding decision
after NAT. As NAT is performed in combination with conntrack, the packet
is recirculated and hits the flow table again after NAT. That 2nd
stage flow must take are of performing L3 by rewriting L2, decrementing
TTL, etc.

Is this what you are referring to?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 5/5] openvswitch: Interface with NAT.
  2015-10-21 11:30       ` Thomas Graf
@ 2015-10-21 14:42         ` Florian Westphal
  0 siblings, 0 replies; 23+ messages in thread
From: Florian Westphal @ 2015-10-21 14:42 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Florian Westphal, Jarno Rajahalme, netdev, dev

Thomas Graf <tgraf@suug.ch> wrote:
> On 10/21/15 at 11:34am, Florian Westphal wrote:
> > Jarno Rajahalme <jrajahalme@nicira.com> wrote:
> > >  #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
> > >  #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
> > >  #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
> > > +#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
> > > +					   mangled by NAT. */
> > > +#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
> > > +					   was mangled by NAT. */
> > 
> > I'm blind -- how does ovs deal with change of output device and the
> > ether dst mac as result of a l3 dst translation?
> 
> I assume you are referring to rewriting of L2 and the forwarding decision
> after NAT. As NAT is performed in combination with conntrack, the packet
> is recirculated and hits the flow table again after NAT. That 2nd
> stage flow must take are of performing L3 by rewriting L2, decrementing
> TTL, etc.

> Is this what you are referring to?

Yes, exactly, thanks for answering my question.

[ in classic bridge netfilter this requires route lookup & neigh stunts
  to deal with the consequences of dnat, i.e.

- route says dst is reachable via some other interface not part of
bridge
- route says that dst is localhost
- route says its on same bridge, but neigh has no idea what the new
dst mac address is,etc.

I was kinda disappointed to not see similar tur^W hacks ;)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition.
       [not found]   ` <20151021083323.GB15539-4EA/1caXOu0mYvmMESoHnA@public.gmane.org>
@ 2015-10-21 20:15     ` Jarno Rajahalme
  2015-10-21 23:32       ` [ovs-dev] " Thomas Graf
  0 siblings, 1 reply; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-21 20:15 UTC (permalink / raw)
  To: Thomas Graf
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA


> On Oct 21, 2015, at 1:33 AM, Thomas Graf <tgraf@suug.ch> wrote:
> 
> [Copying netfilter-devel]
> 
> On 10/20/15 at 03:20pm, Jarno Rajahalme wrote:
>> Remove the definition of IP_CT_NEW_REPLY as it does not make sense.
>> This allows the definition of IP_CT_NUMBER to be simplified as well.
>> 
>> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
>> ---
>> include/uapi/linux/netfilter/nf_conntrack_common.h | 7 ++++---
>> net/openvswitch/conntrack.c                        | 2 --
>> 2 files changed, 4 insertions(+), 5 deletions(-)
>> 
>> diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
>> index 319f471..e0aebc8 100644
>> --- a/include/uapi/linux/netfilter/nf_conntrack_common.h
>> +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
>> @@ -20,9 +20,10 @@ enum ip_conntrack_info {
>> 
>> 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
>> 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
>> -	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
>> -	/* Number of distinct IP_CT types (no NEW in reply dirn). */
>> -	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
>> +	/* No IP_CT_NEW_REPLY */
>> +
>> +	/* Number of distinct IP_CT types. */
>> +	IP_CT_NUMBER
>> };
> 
> I understand what you are doing here but this is part of the published
> UAPI and removing this might break compilation of a user application
> even if the definition is not used right now. It's probably safer to
> leave the definition and obsolete it with a comment.

OK. I should probably separate the netlink and openvswitch changes to separate patches as well?

  Jarno

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info().
  2015-10-21 10:45   ` Pablo Neira Ayuso
@ 2015-10-21 20:43     ` Jarno Rajahalme
  2015-10-21 21:38     ` Jarno Rajahalme
  1 sibling, 0 replies; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-21 20:43 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA


> On Oct 21, 2015, at 3:45 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> 
> On Tue, Oct 20, 2015 at 03:20:26PM -0700, Jarno Rajahalme wrote:
>> Define a new inline function to map conntrack status to enum
>> ip_conntrack_info.  This removes the need to otherwise duplicate this
>> code in a later patch.
> 
> Where is that later patch that justifies this update?
> 

It is in patch 5/5, ovs_ct_find_existing(). The intent it to locate an existing conntrack entry without modifying it, if such an entry exists. This is needed in a case where we first go through the conntrack (calling nf_conntrack_in), then recirculate (to match on the state bits), but do not find a matching flow entry in the flow table. In this case we issue an upcall and let the OVS userspace compute a new flow entry. When the packet gets back to kernel we have lost the skb (as we do not buffer it in the kernel after issuing the upcall). ovs_ct_find_existing() can then be used to populate the nfct and nfctinfo fields of the new skb, as if this skb had gone through the nf_conntrack_in() call. This allows us to pass the packet through NAT after such an upcall without passing the packet through nf_conntrack_in() twice.

ovs_ct_find_existing() is intended to be used only when we have evidence in the OVS flow key that the packet actually went through nf_conntrack_in() prior to the upcall. In the current version of the patches I do not have a test case for this, and I intend to separate out this change into a separate patch before the NAT patch for the next (non-RFC) version.

I could implement ovs_ct_find_existing() using exported conntrack APIs apart from this mapping from conntrack status to enum ip_conntrack_info. ovs_ct_find_existing() is in no way OVS specific, though, so it could be added to the conntrack proper as well, if so desired.

  Jarno

>> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
>> ---
>> include/net/netfilter/nf_conntrack.h | 15 +++++++++++++++
>> net/netfilter/nf_conntrack_core.c    | 22 +++-------------------
>> 2 files changed, 18 insertions(+), 19 deletions(-)
>> 
>> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
>> index fde4068..b3de10e 100644
>> --- a/include/net/netfilter/nf_conntrack.h
>> +++ b/include/net/netfilter/nf_conntrack.h
>> @@ -125,6 +125,21 @@ nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
>> 			    tuplehash[hash->tuple.dst.dir]);
>> }
>> 
>> +static inline enum ip_conntrack_info
>> +nf_ct_get_info(const struct nf_conntrack_tuple_hash *h)
>> +{
>> +	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>> +
>> +	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
>> +		return IP_CT_ESTABLISHED_REPLY;
>> +	/* Once we've had two way comms, always ESTABLISHED. */
>> +	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
>> +		return IP_CT_ESTABLISHED;
>> +	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
>> +		return IP_CT_RELATED;
>> +	return IP_CT_NEW;
>> +}
> 
> I would really like to see how you want to use this.

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 3/5] netfilter: Allow calling into nat helper without skb_dst.
  2015-10-21 10:44     ` Pablo Neira Ayuso
@ 2015-10-21 20:44       ` Jarno Rajahalme
  0 siblings, 0 replies; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-21 20:44 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netdev, dev, netfilter-devel


> On Oct 21, 2015, at 3:44 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> 
> On Tue, Oct 20, 2015 at 03:20:27PM -0700, Jarno Rajahalme wrote:
>> NAT checksum recalculation code assumed existence of skb_dst, which
>> becomes a problem for a later patch in the series.  Simplify this by
>> removing the checks, as the checksum will be dealt with later in the
>> stack.
> 
> Please, resubmit netfilter patches to netfilter-devel@vger.kernel.org

Will do.

  Jarno

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 5/5] openvswitch: Interface with NAT.
  2015-10-21 10:59   ` Thomas Graf
@ 2015-10-21 21:04     ` Jarno Rajahalme
  2015-10-21 23:30       ` Thomas Graf
  0 siblings, 1 reply; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-21 21:04 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, dev, netfilter-devel


> On Oct 21, 2015, at 3:59 AM, Thomas Graf <tgraf@suug.ch> wrote:
> 
> On 10/20/15 at 03:20pm, Jarno Rajahalme wrote:
>> Extend OVS conntrack interface to cover NAT.  New nested
>> OVS_CT_ATTR_NAT may be used to include NAT with a CT action.  A bare
>> OVS_CT_ATTR_NAT only mangles existing connections.  If
>> OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
>> attributes, new (non-committed/non-confirmed) connections are mangled
>> according to the rest of the nested attributes.
>> 
>> This work extends on a branch by Thomas Graf at
>> https://github.com/tgraf/ovs/tree/nat.
>> 
>> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
> 
> Awesome work. This is a great start.
> 
> There are some, probably unintended, unrelated style changes. More
> comments below.
> 

Thanks for the review!

>> +enum ovs_nat_attr {
>> +	OVS_NAT_ATTR_UNSPEC,
>> +	OVS_NAT_ATTR_SRC,
>> +	OVS_NAT_ATTR_DST,
>> +	OVS_NAT_ATTR_IP_MIN,
>> +	OVS_NAT_ATTR_IP_MAX,
>> +	OVS_NAT_ATTR_PROTO_MIN,
>> +	OVS_NAT_ATTR_PROTO_MAX,
>> +	OVS_NAT_ATTR_PERSISTENT,
>> +	OVS_NAT_ATTR_PROTO_HASH,
>> +	OVS_NAT_ATTR_PROTO_RANDOM,
> 
> Simplify this with an OVS_NAT_ATTR_FLAGS?

The use of separate flag attributes was actually intentional, as it makes the interface easier to understand, and code also easier to read.

> 
>> @@ -137,11 +159,17 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
>> 	ovs_ct_get_label(ct, &key->ct.label);
>> }
>> 
>> -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
>> - * previously sent the packet to conntrack via the ct action.
>> +/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
>> + * previously sent the packet to conntrack via the ct action.  If
>> + * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
>> + * initialized from the connection status.
>>  */
>> static void ovs_ct_update_key(const struct sk_buff *skb,
>> -			      struct sw_flow_key *key, bool post_ct)
>> +			      struct sw_flow_key *key, bool post_ct
>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +			      , bool keep_nat_flags
>> +#endif
>> +			      )
> 
> I suggest keeping the argument even for !CONFIG_NF_NAT_NEEDED. This
> unclutters the call sites of this function. An ifdef inside the
> keep_nat_flags branch should be enough. The compiler will optimize
> the code away and it's much prettier to read.
> 

OK.

>> {
>> 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
>> 	enum ip_conntrack_info ctinfo;
>> @@ -151,8 +179,20 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
>> 	ct = nf_ct_get(skb, &ctinfo);
>> 	if (ct) {
>> 		state = ovs_ct_get_state(ctinfo);
>> +		/* OVS persists the related flag for the duration of the
>> +		 * connection. */
>> 		if (ct->master)
>> 			state |= OVS_CS_F_RELATED;
>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +		if (keep_nat_flags)
>> +			state |= key->ct.state & OVS_CS_F_NAT_MASK;
>> +		else {
>> +			if (ct->status & IPS_SRC_NAT)
>> +				state |= OVS_CS_F_SRC_NAT;
>> +			if (ct->status & IPS_DST_NAT)
>> +				state |= OVS_CS_F_DST_NAT;
>> +		}
>> +#endif
>> 		zone = nf_ct_zone(ct);
>> 	} else if (post_ct) {
>> 		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
>> @@ -291,7 +337,16 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
>> 		return NF_DROP;
>> 	}
>> 
>> -	return helper->help(skb, protoff, ct, ctinfo);
>> +	if (helper->help(skb, protoff, ct, ctinfo) != NF_ACCEPT)
>> +		return NF_DROP;
> 
> Return the returned value here instead of hardcoding NF_DROP?
> 

OK

>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +	/* Adjust seqs after helper. */
> 
> A comment on why this is needed would be helpful.
> 

Will add a comment like: /* Needed when a helper adjusts payload size (e.g., FTP PORT command). */

>> +	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
>> +	    && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
>> +		return NF_DROP;
>> +#endif
>> +	return NF_ACCEPT;
> 
>> @@ -377,7 +432,211 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
>> 	return true;
>> }
>> 
>> -static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,
>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +/* Modeled after nf_nat_ipv[46]_fn().
>> + * range is only used for new, uninitialized NAT state.
>> + * Returns either NF_ACCEPT or NF_DROP. */
>> +static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
>> +			      enum ip_conntrack_info ctinfo,
>> +			      const struct nf_nat_range *range,
>> +			      enum nf_nat_manip_type maniptype)
>> +{
>> +	int hooknum, nh_off, err = NF_ACCEPT;
>> +
>> +	nh_off = skb_network_offset(skb);
>> +	skb_pull(skb, nh_off);
>> +
>> +	/* See HOOK2MANIP(). */
>> +	if (maniptype == NF_NAT_MANIP_SRC)
>> +		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
>> +	else
>> +		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
>> +
>> +	switch (ctinfo) {
>> +	case IP_CT_RELATED:
>> +	case IP_CT_RELATED_REPLY:
>> +		if (skb->protocol == htons(ETH_P_IP)
>> +		    && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
>> +			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
>> +							   hooknum))
>> +				err = NF_DROP;
>> +			goto push;
>> +		} else if (skb->protocol == htons(ETH_P_IPV6)) {
>> +			__be16 frag_off;
>> +			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
>> +			int hdrlen = ipv6_skip_exthdr(skb,
>> +						      sizeof(struct ipv6hdr),
>> +						      &nexthdr, &frag_off);
>> +
>> +			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
>> +				if (!nf_nat_icmpv6_reply_translation(skb, ct,
>> +								     ctinfo,
>> +								     hooknum,
>> +								     hdrlen))
>> +					err = NF_DROP;
>> +				goto push;
>> +			}
>> +		}
>> +		/* Non-ICMP, fall thru to initialize if needed. */
>> +	case IP_CT_NEW:
>> +		/* Seen it before?  This can happen for loopback, retrans,
>> +		 * or local packets.
>> +		 */
>> +		if (!nf_nat_initialized(ct, maniptype)) {
> 
> Explicit unlikely()?
> 

Normally initialization is needed, but for IP_CT_RELATED the expectation handling may have already initialized NAT. As such, I do not see a strong case for if (likely(!nf_nat_initialized(ct, maniptype)). I’ll see if I can improve the comment, though, as the current one is copied from nf_nat_ipv[46]_fn().

>> +			/* Initialize according to the NAT action. */
>> +			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
>> +				/* Action is set up to establish a new
>> +				 * mapping */
>> +				? nf_nat_setup_info(ct, range, maniptype)
>> +				: nf_nat_alloc_null_binding(ct, hooknum);
>> +		}
>> +		break;
>> +
>> +	case IP_CT_ESTABLISHED:
>> +	case IP_CT_ESTABLISHED_REPLY:
>> +		break;
>> +
>> +	default:
>> +		err = NF_DROP;
>> +		goto push;
>> +	}
>> +
>> +	if (err == NF_ACCEPT)
>> +		err = nf_nat_packet(ct, ctinfo, hooknum, skb);
> 
> If you goto push on init failure (IP_CT_NEW branch), then this
> conditional is no longer needed and a more straight forward exception
> handling is seen.
> 

OK.

>> +push:
>> +	skb_push(skb, nh_off);
>> +
>> +	return err;
>> +}
> 
>> +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise.
>> + * This action can be used to both NAT and reverse NAT, however, reverse NAT
>> + * can also be done with the conntrack action. */
>> +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
>> +		      const struct ovs_conntrack_info *info,
>> +		      struct sk_buff *skb)
>> +{
>> +	enum nf_nat_manip_type maniptype;
>> +	enum ip_conntrack_info ctinfo;
>> +	struct nf_conn *ct;
>> +	int err;
>> +
>> +	/* No NAT action or already NATed? */
>> +	if (!(info->flags & OVS_CT_F_NAT_MASK)
>> +	    || key->ct.state & OVS_CS_F_NAT_MASK)
>> +		return NF_ACCEPT;
>> +
>> +	ct = nf_ct_get(skb, &ctinfo);
>> +	/* Check if an existing conntrack entry may be found for this skb.
>> +	 * This happens when we lose the ct entry pointer due to an upcall.
>> +	 * Don't lookup invalid connections. */
>> +	if (!ct && key->ct.state & OVS_CS_F_TRACKED
>> +	    && !(key->ct.state & OVS_CS_F_INVALID))
>> +		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
>> +					  &ctinfo);
>> +	if (!ct || nf_ct_is_untracked(ct))
>> +		/* A NAT action may only be performed on tracked packets. */
>> +		return NF_ACCEPT;
> 
> Braces
> 

Needed due to the comment?

>> +	/* Add NAT extension if not commited yet. */
>> +	if (!nf_ct_is_confirmed(ct)) {
>> +		if (!nf_ct_nat_ext_add(ct))
>> +			return NF_ACCEPT;   /* Can't NAT. */
>> +	}
> 
> &&
> 

Sure.

>> +	/* Determine NAT type.
>> +	 * Check if the NAT type can be deduced from the tracked connection.
>> +	 * Make sure expected traffic is NATted only when commiting. */
>> +	if (info->flags & OVS_CT_F_NAT && ctinfo != IP_CT_NEW
>> +	    && ct->status & IPS_NAT_MASK
>> +	    && (!(ct->status & IPS_EXPECTED_BIT)
>> +		|| info->flags & OVS_CT_F_COMMIT)) {
>> +		/* NAT an established or related connection like before. */
>> +		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
>> +			/* This is the REPLY direction for a connection
>> +			 * for which NAT was applied in the forward
>> +			 * direction.  Do the reverse NAT. */
>> +			maniptype = ct->status & IPS_SRC_NAT
>> +				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
>> +		else
>> +			maniptype = ct->status & IPS_SRC_NAT
>> +				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
>> +	}
>> +	else if (info->flags & OVS_CT_F_SRC_NAT)
>> +		maniptype = NF_NAT_MANIP_SRC;
>> +	else if (info->flags & OVS_CT_F_DST_NAT)
>> +		maniptype = NF_NAT_MANIP_DST;
>> +	else
>> +		return NF_ACCEPT; /* Connection is not NATed. */
>> +
>> +	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
>> +
>> +	/* Mark NAT done if successful. */
>> +	if (err == NF_ACCEPT)
>> +		key->ct.state |= (maniptype == NF_NAT_MANIP_SRC)
>> +			? OVS_CS_F_SRC_NAT : OVS_CS_F_DST_NAT;
>> +	return err;
>> +}
>> +#endif
>> +
>> +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>> 			   const struct ovs_conntrack_info *info,
>> 			   struct sk_buff *skb)
>> {
> 
> 
>> @@ -538,6 +819,131 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
>> 	return 0;
>> }
>> 
>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +static int parse_nat(const struct nlattr *attr,
>> +		     struct ovs_conntrack_info *info, bool log)
>> +{
>> +	struct nlattr *a;
>> +	int rem;
>> +	bool have_ip_max = false;
>> +	bool have_proto_max = false;
>> +	bool ip_vers = (info->family == NFPROTO_IPV6);
>> +
>> +	nla_for_each_nested(a, attr, rem) {
>> +		static const u16 ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
>> +			[OVS_NAT_ATTR_SRC] = {0, 0},
>> +			[OVS_NAT_ATTR_DST] = {0, 0},
>> +			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
>> +						 sizeof(struct in6_addr)},
>> +			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
>> +						 sizeof(struct in6_addr)},
>> +			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16),sizeof(u16)},
>> +			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16),sizeof(u16)},
>> +			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
>> +			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
>> +			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
>> +		};
>> +		int type = nla_type(a);
>> +
>> +		if (type > OVS_NAT_ATTR_MAX) {
>> +			OVS_NLERR(log, "Unknown nat attribute (type=%d, max=%d).\n",
>> +			type, OVS_NAT_ATTR_MAX);
> 
> Formatting

Not readily apparent what you mean here, care to elaborate?

> 
>> +			return -EINVAL;
>> +		}
>> +
>> +		case OVS_NAT_ATTR_IP_MIN:
>> +			nla_memcpy(&info->range.min_addr, a, nla_len(a));
> 
> The length attribute should be sizeof of min_addr like for max_addr
> below.
> 

Right.

>> +			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
>> +			break;
>> +
>> +		case OVS_NAT_ATTR_IP_MAX:
>> +			have_ip_max = true;
>> +			nla_memcpy(&info->range.max_addr, a,
>> +				   sizeof(info->range.max_addr));
>> +			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
>> +			break;
>> +
>> +	}
> 
>> static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
>> 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0,
>> 				    .maxlen = 0 },
>> @@ -548,7 +954,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
>> 	[OVS_CT_ATTR_LABEL]	= { .minlen = sizeof(struct md_label),
>> 				    .maxlen = sizeof(struct md_label) },
>> 	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
>> -				    .maxlen = NF_CT_HELPER_NAME_LEN }
>> +				    .maxlen = NF_CT_HELPER_NAME_LEN },
>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +	[OVS_CT_ATTR_NAT]	= { .minlen = 0,
>> +				    .maxlen = 96 }
>> +#endif
> 
> Is the 96 a temporary hack here?
> 

It is not an exact value. It is much better than my temporary hack of 512 was. As trailing garbage is checked for, I’m not sure if this should be very accurately calculated? Maybe it would be better to disable the length checks for this altogether?

>> @@ -607,6 +1017,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
>> 				return -EINVAL;
>> 			}
>> 			break;
>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +		case OVS_CT_ATTR_NAT: {
>> +			int err = parse_nat(a, info, log);
>> +			if (err)
>> +				return err;
>> +			break;
>> +		}
>> +#endif
> 
> We should probably bark if user space provides a OVS_CT_ATTR_NAT but the
> kernel is compiled without support for it.
> 

We do issue -EINVAL and log “Unknown conntrack attr” in that case.

>> +#ifdef CONFIG_NF_NAT_NEEDED
>> +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
>> +			       struct sk_buff *skb)
>> +{
>> +	struct nlattr *start;
>> +
>> +	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
>> +	if (!start)
>> +		return false;
>> +
>> +	if (info->flags & OVS_CT_F_SRC_NAT) {
>> +		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
>> +			return false;
>> +	} else if (info->flags & OVS_CT_F_DST_NAT) {
>> +		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
>> +			return false;
>> +	} else {
>> +		goto out;
> 
> Is the empty nested attribute intended here?

Yes. On netlink interface empty nested attribute (NAT without any arguments) signifies to NAT packets of previously NATted connections (only).

  Jarno


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info().
  2015-10-21 10:45   ` Pablo Neira Ayuso
  2015-10-21 20:43     ` Jarno Rajahalme
@ 2015-10-21 21:38     ` Jarno Rajahalme
  1 sibling, 0 replies; 23+ messages in thread
From: Jarno Rajahalme @ 2015-10-21 21:38 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netdev, dev, netfilter-devel


> On Oct 21, 2015, at 3:45 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> 
> On Tue, Oct 20, 2015 at 03:20:26PM -0700, Jarno Rajahalme wrote:
>> Define a new inline function to map conntrack status to enum
>> ip_conntrack_info.  This removes the need to otherwise duplicate this
>> code in a later patch.
> 
> Where is that later patch that justifies this update?
> 

It is in patch 5/5, ovs_ct_find_existing(). The intent it to locate an existing conntrack entry without modifying it, if such an entry exists. This is needed in a case where we first go through the conntrack (calling nf_conntrack_in), then recirculate (to match on the state bits), but do not find a matching flow entry in the flow table. In this case we issue an upcall and let the OVS userspace compute a new flow entry. When the packet gets back to kernel we have lost the skb (as we do not buffer it in the kernel after issuing the upcall). ovs_ct_find_existing() can then be used to populate the nfct and nfctinfo fields of the new skb, as if this skb had gone through the nf_conntrack_in() call. This allows us to pass the packet through NAT after such an upcall without passing the packet thro
 ugh nf_conntrack_in() twice.

ovs_ct_find_existing() is intended to be used only when we have evidence in the OVS flow key that the packet actually went through nf_conntrack_in() prior to the upcall. In the current version of the patches I do not have a test case for this, and I intend to separate out this change into a separate patch before the NAT patch for the next (non-RFC) version.

I could implement ovs_ct_find_existing() using exported conntrack APIs apart from this mapping from conntrack status to enum ip_conntrack_info. ovs_ct_find_existing() is in no way OVS specific, though, so it could be added to the conntrack proper as well, if so desired.

  Jarno

>> Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
>> ---
>> include/net/netfilter/nf_conntrack.h | 15 +++++++++++++++
>> net/netfilter/nf_conntrack_core.c    | 22 +++-------------------
>> 2 files changed, 18 insertions(+), 19 deletions(-)
>> 
>> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
>> index fde4068..b3de10e 100644
>> --- a/include/net/netfilter/nf_conntrack.h
>> +++ b/include/net/netfilter/nf_conntrack.h
>> @@ -125,6 +125,21 @@ nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
>> 			    tuplehash[hash->tuple.dst.dir]);
>> }
>> 
>> +static inline enum ip_conntrack_info
>> +nf_ct_get_info(const struct nf_conntrack_tuple_hash *h)
>> +{
>> +	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>> +
>> +	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
>> +		return IP_CT_ESTABLISHED_REPLY;
>> +	/* Once we've had two way comms, always ESTABLISHED. */
>> +	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
>> +		return IP_CT_ESTABLISHED;
>> +	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
>> +		return IP_CT_RELATED;
>> +	return IP_CT_NEW;
>> +}
> 
> I would really like to see how you want to use this.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH 5/5] openvswitch: Interface with NAT.
  2015-10-21 21:04     ` Jarno Rajahalme
@ 2015-10-21 23:30       ` Thomas Graf
  0 siblings, 0 replies; 23+ messages in thread
From: Thomas Graf @ 2015-10-21 23:30 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: netdev, dev, netfilter-devel

On 10/21/15 at 02:04pm, Jarno Rajahalme wrote:
> 
> > On Oct 21, 2015, at 3:59 AM, Thomas Graf <tgraf@suug.ch> wrote:
> > Simplify this with an OVS_NAT_ATTR_FLAGS?
> 
> The use of separate flag attributes was actually intentional, as it makes the interface easier to understand, and code also easier to read.

OK, either is fine with me.

> >> +					  &ctinfo);
> >> +	if (!ct || nf_ct_is_untracked(ct))
> >> +		/* A NAT action may only be performed on tracked packets. */
> >> +		return NF_ACCEPT;
> > 
> > Braces
> > 
> 
> Needed due to the comment?

The compiler would be fine but most other places like this seem to
put braces on for this single comment + single statement case.

> >> +		if (type > OVS_NAT_ATTR_MAX) {
> >> +			OVS_NLERR(log, "Unknown nat attribute (type=%d, max=%d).\n",
> >> +			type, OVS_NAT_ATTR_MAX);
> > 
> > Formatting
> 
> Not readily apparent what you mean here, care to elaborate?

The argument list should be indented up to the (.

> >> +#ifdef CONFIG_NF_NAT_NEEDED
> >> +	[OVS_CT_ATTR_NAT]	= { .minlen = 0,
> >> +				    .maxlen = 96 }
> >> +#endif
> > 
> > Is the 96 a temporary hack here?
> > 
> 
> It is not an exact value. It is much better than my temporary hack of 512 was. As trailing garbage is checked for, I???m not sure if this should be very accurately calculated? Maybe it would be better to disable the length checks for this altogether?

I would just drop the maxlen then. The nested content should be
verified separately anyway later on.

> > We should probably bark if user space provides a OVS_CT_ATTR_NAT but the
> > kernel is compiled without support for it.
> > 
> 
> We do issue -EINVAL and log ???Unknown conntrack attr??? in that case.

Misread then, sorry.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [ovs-dev] [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition.
  2015-10-21 20:15     ` Jarno Rajahalme
@ 2015-10-21 23:32       ` Thomas Graf
  0 siblings, 0 replies; 23+ messages in thread
From: Thomas Graf @ 2015-10-21 23:32 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: netdev, dev, netfilter-devel

On 10/21/15 at 01:15pm, Jarno Rajahalme wrote:
> 
> > On Oct 21, 2015, at 1:33 AM, Thomas Graf <tgraf@suug.ch> wrote:
> > I understand what you are doing here but this is part of the published
> > UAPI and removing this might break compilation of a user application
> > even if the definition is not used right now. It's probably safer to
> > leave the definition and obsolete it with a comment.
> 
> OK. I should probably separate the netlink and openvswitch changes to separate patches as well?

Given that at that point the changes become unrelated, I'd split
it into separate patches, yes.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2015-10-21 23:32 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-20 22:20 [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
2015-10-20 22:20 ` [RFC PATCH 2/5] netfilter: Factor out nf_ct_get_info() Jarno Rajahalme
     [not found]   ` <1445379629-112880-2-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
2015-10-21  8:50     ` Thomas Graf
2015-10-21 10:45   ` Pablo Neira Ayuso
2015-10-21 20:43     ` Jarno Rajahalme
2015-10-21 21:38     ` Jarno Rajahalme
2015-10-20 22:20 ` [RFC PATCH 3/5] netfilter: Allow calling into nat helper without skb_dst Jarno Rajahalme
     [not found]   ` <1445379629-112880-3-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
2015-10-21 10:44     ` Pablo Neira Ayuso
2015-10-21 20:44       ` Jarno Rajahalme
     [not found] ` <1445379629-112880-1-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
2015-10-20 22:20   ` [RFC PATCH 4/5] openvswitch: conntrack netlink API updates Jarno Rajahalme
2015-10-21 10:41     ` Pablo Neira Ayuso
2015-10-21 11:18       ` Thomas Graf
2015-10-20 22:20 ` [RFC PATCH 5/5] openvswitch: Interface with NAT Jarno Rajahalme
     [not found]   ` <1445379629-112880-5-git-send-email-jrajahalme-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
2015-10-21  9:34     ` Florian Westphal
2015-10-21 11:30       ` Thomas Graf
2015-10-21 14:42         ` Florian Westphal
2015-10-21 10:59   ` Thomas Graf
2015-10-21 21:04     ` Jarno Rajahalme
2015-10-21 23:30       ` Thomas Graf
2015-10-20 22:28 ` [RFC PATCH 1/5] netfilter: Remove IP_CT_NEW_REPLY definition Jarno Rajahalme
2015-10-21  8:33 ` [ovs-dev] " Thomas Graf
     [not found]   ` <20151021083323.GB15539-4EA/1caXOu0mYvmMESoHnA@public.gmane.org>
2015-10-21 20:15     ` Jarno Rajahalme
2015-10-21 23:32       ` [ovs-dev] " Thomas Graf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.