netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v7] openvswitch: enable NSH support
@ 2017-08-30 12:39 Yi Yang
       [not found] ` <1504096752-102003-1-git-send-email-yi.y.yang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 21+ messages in thread
From: Yi Yang @ 2017-08-30 12:39 UTC (permalink / raw)
  To: netdev; +Cc: davem, dev, jbenc, e, blp, jan.scheurich, Yi Yang

v6->v7
 - Remove NSH GSO patches in v6 because Jiri Benc
   reworked it as another patch series and they have
   been merged.
 - Change it to adapt to nsh kernel module added by NSH
   GSO patch series

v5->v6
 - Fix the rest comments for v4.
 - Add NSH GSO support for VxLAN-gpe + NSH and
   Eth + NSH.

v4->v5
 - Fix many comments by Jiri Benc and Eric Garver
   for v4.

v3->v4
 - Add new NSH match field ttl
 - Update NSH header to the latest format
   which will be final format and won't change
   per its author's confirmation.
 - Fix comments for v3.

v2->v3
 - Change OVS_KEY_ATTR_NSH to nested key to handle
   length-fixed attributes and length-variable
   attriubte more flexibly.
 - Remove struct ovs_action_push_nsh completely
 - Add code to handle nested attribute for SET_MASKED
 - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
   to transfer NSH header data.
 - Fix comments and coding style issues by Jiri and Eric

v1->v2
 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
 - Dynamically allocate struct ovs_action_push_nsh for
   length-variable metadata.

OVS master and 2.8 branch has merged NSH userspace
patch series, this patch is to enable NSH support
in kernel data path in order that OVS can support
NSH in compat mode by porting this.

Signed-off-by: Yi Yang <yi.y.yang@intel.com>
---
 include/net/nsh.h                |   3 +
 include/uapi/linux/openvswitch.h |  28 +++
 net/nsh/nsh.c                    |  41 ++++
 net/openvswitch/actions.c        | 141 ++++++++++++++
 net/openvswitch/flow.c           |  55 ++++++
 net/openvswitch/flow.h           |  11 ++
 net/openvswitch/flow_netlink.c   | 406 ++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/flow_netlink.h   |   4 +
 8 files changed, 688 insertions(+), 1 deletion(-)

diff --git a/include/net/nsh.h b/include/net/nsh.h
index a1eaea2..6c0cd57 100644
--- a/include/net/nsh.h
+++ b/include/net/nsh.h
@@ -304,4 +304,7 @@ static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags,
 			NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK);
 }
 
+int skb_push_nsh(struct sk_buff *skb, const struct nshhdr *nsh_src,
+		 bool is_eth);
+
 #endif /* __NET_NSH_H */
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 156ee4c..91dee5b 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -333,6 +333,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_CT_LABELS,	/* 16-octet connection tracking label */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,   /* struct ovs_key_ct_tuple_ipv4 */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,   /* struct ovs_key_ct_tuple_ipv6 */
+	OVS_KEY_ATTR_NSH,       /* Nested set of ovs_nsh_key_* */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -491,6 +492,29 @@ struct ovs_key_ct_tuple_ipv6 {
 	__u8   ipv6_proto;
 };
 
+enum ovs_nsh_key_attr {
+	OVS_NSH_KEY_ATTR_BASE,  /* struct ovs_nsh_key_base. */
+	OVS_NSH_KEY_ATTR_MD1,   /* struct ovs_nsh_key_md1. */
+	OVS_NSH_KEY_ATTR_MD2,   /* variable-length octets for MD type 2. */
+	__OVS_NSH_KEY_ATTR_MAX
+};
+
+#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1)
+
+struct ovs_nsh_key_base {
+	__u8 flags;
+	__u8 ttl;
+	__u8 mdtype;
+	__u8 np;
+	__be32 path_hdr;
+};
+
+#define NSH_MD1_CONTEXT_SIZE 4
+
+struct ovs_nsh_key_md1 {
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -806,6 +830,8 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
+ * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -835,6 +861,8 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
+	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index 58fb827..ad689b5 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -14,6 +14,47 @@
 #include <net/nsh.h>
 #include <net/tun_proto.h>
 
+int skb_push_nsh(struct sk_buff *skb, const struct nshhdr *nsh_src, bool is_eth)
+{
+	struct nshhdr *nsh;
+	size_t length = nsh_hdr_len(nsh_src);
+	u8 next_proto;
+
+	if (is_eth) {
+		next_proto = TUN_P_ETHERNET;
+	} else {
+		next_proto = tun_p_from_eth_p(skb->protocol);
+		if (!next_proto)
+			return -ENOTSUPP;
+	}
+
+	/* Add the NSH header */
+	if (skb_cow_head(skb, length) < 0)
+		return -ENOMEM;
+
+	/* Add the NSH header */
+	if (skb_cow_head(skb, length) < 0)
+		return -ENOMEM;
+
+	if (!skb->inner_protocol) {
+		skb_set_inner_network_header(skb, skb->mac_len);
+		skb_set_inner_protocol(skb, skb->protocol);
+	}
+
+	skb_push(skb, length);
+	nsh = (struct nshhdr *)(skb->data);
+	memcpy(nsh, nsh_src, length);
+	nsh->np = next_proto;
+	nsh->mdtype &= NSH_MDTYPE_MASK;
+
+	skb->protocol = htons(ETH_P_NSH);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_push_nsh);
+
 static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
 				       netdev_features_t features)
 {
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a54a556..e969fad 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -38,11 +38,13 @@
 #include <net/dsfield.h>
 #include <net/mpls.h>
 #include <net/sctp/checksum.h>
+#include <net/tun_proto.h>
 
 #include "datapath.h"
 #include "flow.h"
 #include "conntrack.h"
 #include "vport.h"
+#include "flow_netlink.h"
 
 struct deferred_action {
 	struct sk_buff *skb;
@@ -380,6 +382,57 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
+		    const struct nshhdr *nsh_src)
+{
+	bool is_eth = false;
+	int ret;
+
+	if (key->mac_proto == MAC_PROTO_ETHERNET)
+		is_eth = true;
+
+	ret = skb_push_nsh(skb, nsh_src, is_eth);
+	if (ret != 0)
+		return ret;
+
+	key->eth.type = htons(ETH_P_NSH);
+
+	/* safe right before invalidate_flow_key */
+	key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
+static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	struct nshhdr *nsh = (struct nshhdr *)(skb->data);
+	size_t length;
+	u16 inner_proto;
+
+	if (ovs_key_mac_proto(key) != MAC_PROTO_NONE ||
+	    skb->protocol != htons(ETH_P_NSH)) {
+		return -EINVAL;
+	}
+
+	inner_proto = tun_p_to_eth_p(nsh->np);
+	if (!inner_proto)
+		return -ENOTSUPP;
+
+	length = nsh_hdr_len(nsh);
+	skb_pull(skb, length);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+	skb->protocol = inner_proto;
+
+	/* safe right before invalidate_flow_key */
+	if (inner_proto == htons(ETH_P_TEB))
+		key->mac_proto = MAC_PROTO_ETHERNET;
+	else
+		key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
 static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
 				  __be32 addr, __be32 new_addr)
 {
@@ -602,6 +655,53 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 	return 0;
 }
 
+static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
+		   const struct ovs_key_nsh *key,
+		   const struct ovs_key_nsh *mask)
+{
+	struct nshhdr *nsh;
+	int err;
+	u8 flags;
+	u8 ttl;
+	int i;
+
+	err = skb_ensure_writable(skb, skb_network_offset(skb) +
+				  sizeof(struct nshhdr));
+	if (unlikely(err))
+		return err;
+
+	nsh = (struct nshhdr *)skb_network_header(skb);
+
+	flags = nsh_get_flags(nsh);
+	flags = OVS_MASKED(flags, key->flags, mask->flags);
+	flow_key->nsh.flags = flags;
+	ttl = nsh_get_ttl(nsh);
+	ttl = OVS_MASKED(ttl, key->ttl, mask->ttl);
+	flow_key->nsh.ttl = ttl;
+	nsh_set_flags_and_ttl(nsh, flags, ttl);
+	nsh->path_hdr = OVS_MASKED(nsh->path_hdr, key->path_hdr,
+				   mask->path_hdr);
+	flow_key->nsh.path_hdr = nsh->path_hdr;
+	switch (nsh->mdtype) {
+	case NSH_M_TYPE1:
+		for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) {
+			nsh->md1.context[i] =
+			    OVS_MASKED(nsh->md1.context[i], key->context[i],
+				       mask->context[i]);
+		}
+		memcpy(flow_key->nsh.context, nsh->md1.context,
+		       sizeof(nsh->md1.context));
+		break;
+	case NSH_M_TYPE2:
+		memset(flow_key->nsh.context, 0,
+		       sizeof(flow_key->nsh.context));
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /* Must follow skb_ensure_writable() since that can move the skb data. */
 static void set_tp_port(struct sk_buff *skb, __be16 *port,
 			__be16 new_port, __sum16 *check)
@@ -1024,6 +1124,32 @@ static int execute_masked_set_action(struct sk_buff *skb,
 				   get_mask(a, struct ovs_key_ethernet *));
 		break;
 
+	case OVS_KEY_ATTR_NSH: {
+		struct ovs_key_nsh nsh;
+		struct ovs_key_nsh nsh_mask;
+		size_t size = nla_len(a) / 2;
+		struct {
+			struct nlattr nla;
+			u8 data[size];
+		} attr, mask;
+
+		attr.nla.nla_type = nla_type(a);
+		attr.nla.nla_len = NLA_HDRLEN + size;
+		memcpy(attr.data, (char *)(a + 1), size);
+
+		mask.nla = attr.nla;
+		memcpy(mask.data, (char *)(a + 1) + size, size);
+
+		err = nsh_key_from_nlattr(&attr.nla, &nsh);
+		if (err)
+			break;
+		err = nsh_key_from_nlattr(&mask.nla, &nsh_mask);
+		if (err)
+			break;
+		err = set_nsh(skb, flow_key, &nsh, &nsh_mask);
+		break;
+	}
+
 	case OVS_KEY_ATTR_IPV4:
 		err = set_ipv4(skb, flow_key, nla_data(a),
 			       get_mask(a, struct ovs_key_ipv4 *));
@@ -1210,6 +1336,21 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		case OVS_ACTION_ATTR_POP_ETH:
 			err = pop_eth(skb, key);
 			break;
+
+		case OVS_ACTION_ATTR_PUSH_NSH: {
+			u8 buffer[NSH_HDR_MAX_LEN];
+			struct nshhdr *nsh_hdr = (struct nshhdr *)buffer;
+			const struct nshhdr *nsh_src = nsh_hdr;
+
+			nsh_hdr_from_nlattr(nla_data(a), nsh_hdr,
+					    NSH_HDR_MAX_LEN);
+			err = push_nsh(skb, key, nsh_src);
+			break;
+		}
+
+		case OVS_ACTION_ATTR_POP_NSH:
+			err = pop_nsh(skb, key);
+			break;
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 8c94cef..7a178d1 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -46,6 +46,7 @@
 #include <net/ipv6.h>
 #include <net/mpls.h>
 #include <net/ndisc.h>
+#include <net/nsh.h>
 
 #include "conntrack.h"
 #include "datapath.h"
@@ -490,6 +491,56 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	struct nshhdr *nsh;
+	unsigned int nh_ofs = skb_network_offset(skb);
+	u8 version, length;
+	int err;
+
+	err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN);
+	if (unlikely(err))
+		return err;
+
+	nsh = (struct nshhdr *)skb_network_header(skb);
+	version = nsh_get_ver(nsh);
+	length = nsh_hdr_len(nsh);
+
+	if (version != 0)
+		return -EINVAL;
+
+	err = check_header(skb, nh_ofs + length);
+	if (unlikely(err))
+		return err;
+
+	nsh = (struct nshhdr *)skb_network_header(skb);
+	key->nsh.flags = nsh_get_flags(nsh);
+	key->nsh.ttl = nsh_get_ttl(nsh);
+	key->nsh.mdtype = nsh->mdtype;
+	key->nsh.np = nsh->np;
+	key->nsh.path_hdr = nsh->path_hdr;
+	switch (key->nsh.mdtype) {
+	case NSH_M_TYPE1:
+		if (length != NSH_M_TYPE1_LEN)
+			return -EINVAL;
+		memcpy(key->nsh.context, nsh->md1.context,
+		       sizeof(nsh->md1));
+		break;
+	case NSH_M_TYPE2:
+		/* Don't support MD type 2 metedata parsing yet */
+		if (length < NSH_BASE_HDR_LEN)
+			return -EINVAL;
+
+		memset(key->nsh.context, 0,
+		       sizeof(nsh->md1));
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * key_extract - extracts a flow key from an Ethernet frame.
  * @skb: sk_buff that contains the frame, with skb->data pointing to the
@@ -735,6 +786,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
+	} else if (key->eth.type == htons(ETH_P_NSH)) {
+		error = parse_nsh(skb, key);
+		if (error)
+			return error;
 	}
 	return 0;
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1875bba..6a3cd9c 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -35,6 +35,7 @@
 #include <net/inet_ecn.h>
 #include <net/ip_tunnels.h>
 #include <net/dst_metadata.h>
+#include <net/nsh.h>
 
 struct sk_buff;
 
@@ -66,6 +67,15 @@ struct vlan_head {
 	(offsetof(struct sw_flow_key, recirc_id) +	\
 	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
 
+struct ovs_key_nsh {
+	u8 flags;
+	u8 ttl;
+	u8 mdtype;
+	u8 np;
+	__be32 path_hdr;
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 struct sw_flow_key {
 	u8 tun_opts[IP_TUNNEL_OPTS_MAX];
 	u8 tun_opts_len;
@@ -144,6 +154,7 @@ struct sw_flow_key {
 			};
 		} ipv6;
 	};
+	struct ovs_key_nsh nsh;         /* network service header */
 	struct {
 		/* Connection tracking fields not packed above. */
 		struct {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e8eb427..17df00a 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -48,6 +48,7 @@
 #include <net/ndisc.h>
 #include <net/mpls.h>
 #include <net/vxlan.h>
+#include <net/tun_proto.h>
 
 #include "flow_netlink.h"
 
@@ -78,9 +79,11 @@ static bool actions_may_change_flow(const struct nlattr *actions)
 		case OVS_ACTION_ATTR_HASH:
 		case OVS_ACTION_ATTR_POP_ETH:
 		case OVS_ACTION_ATTR_POP_MPLS:
+		case OVS_ACTION_ATTR_POP_NSH:
 		case OVS_ACTION_ATTR_POP_VLAN:
 		case OVS_ACTION_ATTR_PUSH_ETH:
 		case OVS_ACTION_ATTR_PUSH_MPLS:
+		case OVS_ACTION_ATTR_PUSH_NSH:
 		case OVS_ACTION_ATTR_PUSH_VLAN:
 		case OVS_ACTION_ATTR_SAMPLE:
 		case OVS_ACTION_ATTR_SET:
@@ -322,12 +325,27 @@ size_t ovs_tun_key_attr_size(void)
 		+ nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
 }
 
+size_t ovs_nsh_key_attr_size(void)
+{
+	/* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider
+	 * updating this function.
+	 */
+	return  nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */
+		/* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are
+		 * mutually exclusive, so the bigger one can cover
+		 * the small one.
+		 *
+		 * OVS_NSH_KEY_ATTR_MD2
+		 */
+		+ nla_total_size(NSH_CTX_HDRS_MAX_LEN);
+}
+
 size_t ovs_key_attr_size(void)
 {
 	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
 	 * updating this function.
 	 */
-	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
+	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29);
 
 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
 		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
@@ -341,6 +359,8 @@ size_t ovs_key_attr_size(void)
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
 		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */
 		+ nla_total_size(40)  /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
+		+ nla_total_size(0)   /* OVS_KEY_ATTR_NSH */
+		  + ovs_nsh_key_attr_size()
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
@@ -373,6 +393,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
 };
 
+static const struct ovs_len_tbl
+ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = {
+	[OVS_NSH_KEY_ATTR_BASE]     = { .len = 8 },
+	[OVS_NSH_KEY_ATTR_MD1]      = { .len = 16 },
+	[OVS_NSH_KEY_ATTR_MD2]      = { .len = OVS_ATTR_VARIABLE },
+};
+
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
 static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_ENCAP]	 = { .len = OVS_ATTR_NESTED },
@@ -405,6 +432,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
 	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
+	[OVS_KEY_ATTR_NSH]       = { .len = OVS_ATTR_NESTED,
+				     .next = ovs_nsh_key_attr_lens, },
 };
 
 static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -1179,6 +1208,304 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 	return 0;
 }
 
+int nsh_hdr_from_nlattr(const struct nlattr *attr,
+			struct nshhdr *nsh, size_t size)
+{
+	struct nlattr *a;
+	int rem;
+	u8 flags = 0;
+	u8 ttl = 0;
+	int mdlen = 0;
+	bool has_md1 = false;
+	bool has_md2 = false;
+	u8 len;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(1, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    1,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+			flags = base->flags;
+			ttl = base->ttl;
+			nsh->np = base->np;
+			nsh->mdtype = base->mdtype;
+			nsh->path_hdr = base->path_hdr;
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+			struct nsh_md1_ctx *md1_dst = &nsh->md1;
+
+			has_md1 = true;
+			mdlen = nla_len(a);
+			if (((mdlen + NSH_BASE_HDR_LEN) != NSH_M_TYPE1_LEN) ||
+			    ((mdlen + NSH_BASE_HDR_LEN) > size) ||
+			    (mdlen <= 0)) {
+				OVS_NLERR(
+				    1,
+				    "length %d of nsh attr %d is invalid",
+				    mdlen,
+				    type
+				);
+				return -EINVAL;
+			}
+			memcpy(md1_dst, md1, mdlen);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2: {
+			const struct u8 *md2 = nla_data(a);
+			struct nsh_md2_tlv *md2_dst = &nsh->md2;
+
+			has_md2 = true;
+			mdlen = nla_len(a);
+			if (((mdlen + NSH_BASE_HDR_LEN) > size) ||
+			    (mdlen <= 0)) {
+				OVS_NLERR(
+				    1,
+				    "length %d of nsh attr %d is invalid",
+				    mdlen,
+				    type
+				);
+				return -EINVAL;
+			}
+			memcpy(md2_dst, md2, mdlen);
+			break;
+		}
+		default:
+			OVS_NLERR(1, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(1, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if ((has_md1 && nsh->mdtype != NSH_M_TYPE1) ||
+	    (has_md2 && nsh->mdtype != NSH_M_TYPE2)) {
+		OVS_NLERR(1,
+			  "nsh attribute has unmatched MD type %d.",
+			  nsh->mdtype);
+		return -EINVAL;
+	}
+
+	if (unlikely(has_md1 && has_md2)) {
+		OVS_NLERR(1, "both nsh md1 and md2 attribute are there");
+		return -EINVAL;
+	}
+
+	if (unlikely(!has_md1 && !has_md2)) {
+		OVS_NLERR(1, "neither nsh md1 nor md2 attribute is there");
+		return -EINVAL;
+	}
+
+	/* nsh header length  = NSH_BASE_HDR_LEN + mdlen */
+	nsh->ver_flags_ttl_len = 0;
+	len = NSH_BASE_HDR_LEN + mdlen;
+	nsh_set_flags_ttl_len(nsh, flags, ttl, len);
+
+	return 0;
+}
+
+int nsh_key_from_nlattr(const struct nlattr *attr,
+			struct ovs_key_nsh *nsh)
+{
+	struct nlattr *a;
+	int rem;
+	bool has_md1 = false;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(1, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    1,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+
+			memcpy(nsh, base, sizeof(*base));
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+
+			has_md1 = true;
+			memcpy(nsh->context, md1->context, sizeof(*md1));
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			/* Not supported yet */
+			return -ENOTSUPP;
+		default:
+			OVS_NLERR(1, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(1, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if ((has_md1 && nsh->mdtype != NSH_M_TYPE1)) {
+		OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
+			  nsh->mdtype);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nsh_key_put_from_nlattr(const struct nlattr *attr,
+				   struct sw_flow_match *match, bool is_mask,
+				   bool is_push_nsh, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool has_md1 = false;
+	bool has_md2 = false;
+	u8 mdtype = 0;
+	int mdlen = 0;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+		int i;
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(log, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    log,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+
+			mdtype = base->mdtype;
+			SW_FLOW_KEY_PUT(match, nsh.flags,
+					base->flags, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.ttl,
+					base->ttl, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.mdtype,
+					base->mdtype, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.np,
+					base->np, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.path_hdr,
+					base->path_hdr, is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+
+			has_md1 = true;
+			for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++)
+				SW_FLOW_KEY_PUT(match, nsh.context[i],
+						md1->context[i], is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			if (!is_push_nsh) /* Not supported MD type 2 yet */
+				return -ENOTSUPP;
+
+			has_md2 = true;
+			mdlen = nla_len(a);
+			if ((mdlen > NSH_CTX_HDRS_MAX_LEN) ||
+			    (mdlen <= 0))
+				return -EINVAL;
+			break;
+		default:
+			OVS_NLERR(log, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if (!is_mask) {
+		if ((has_md1 && mdtype != NSH_M_TYPE1)) {
+			OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
+				  mdtype);
+			return -EINVAL;
+		}
+	}
+
+	if (is_push_nsh & !is_mask) {
+		if ((has_md1 && mdtype != NSH_M_TYPE1) ||
+		    (has_md2 && mdtype != NSH_M_TYPE2) ||
+		    (has_md1 && has_md2) ||
+		    (!has_md1 && !has_md2)) {
+			OVS_NLERR(
+			    1,
+			    "push nsh attributes are invalid for type %d.",
+			    mdtype
+			);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 				u64 attrs, const struct nlattr **a,
 				bool is_mask, bool log)
@@ -1306,6 +1633,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_NSH)) {
+		if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match,
+					    is_mask, false, log) < 0)
+			return -EINVAL;
+		attrs &= ~(1 << OVS_KEY_ATTR_NSH);
+	}
+
 	if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
 		const struct ovs_key_mpls *mpls_key;
 
@@ -1622,6 +1956,40 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
 	return 0;
 }
 
+static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask,
+			     struct sk_buff *skb)
+{
+	struct nlattr *start;
+	struct ovs_nsh_key_base base;
+	struct ovs_nsh_key_md1 md1;
+
+	memcpy(&base, nsh, sizeof(base));
+
+	if (is_mask || nsh->mdtype == NSH_M_TYPE1)
+		memcpy(md1.context, nsh->context, sizeof(md1));
+
+	start = nla_nest_start(skb, OVS_KEY_ATTR_NSH);
+	if (!start)
+		return -EMSGSIZE;
+
+	if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(base), &base))
+		goto nla_put_failure;
+
+	if (is_mask || nsh->mdtype == NSH_M_TYPE1) {
+		if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, sizeof(md1), &md1))
+			goto nla_put_failure;
+	}
+
+	/* Don't support MD type 2 yet */
+
+	nla_nest_end(skb, start);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 			     const struct sw_flow_key *output, bool is_mask,
 			     struct sk_buff *skb)
@@ -1750,6 +2118,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 		ipv6_key->ipv6_tclass = output->ip.tos;
 		ipv6_key->ipv6_hlimit = output->ip.ttl;
 		ipv6_key->ipv6_frag = output->ip.frag;
+	} else if (swkey->eth.type == htons(ETH_P_NSH)) {
+		if (nsh_key_to_nlattr(&output->nsh, is_mask, skb))
+			goto nla_put_failure;
 	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
 		   swkey->eth.type == htons(ETH_P_RARP)) {
 		struct ovs_key_arp *arp_key;
@@ -2242,6 +2613,19 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	return err;
 }
 
+static bool validate_nsh(const struct nlattr *attr, bool is_mask,
+			 bool is_push_nsh, bool log)
+{
+	struct sw_flow_match match;
+	struct sw_flow_key key;
+	int ret = 0;
+
+	ovs_match_init(&match, &key, true, NULL);
+	ret = nsh_key_put_from_nlattr(attr, &match, is_mask,
+				      is_push_nsh, log);
+	return ((ret != 0) ? false : true);
+}
+
 /* Return false if there are any non-masked bits set.
  * Mask follows data immediately, before any netlink padding.
  */
@@ -2384,6 +2768,11 @@ static int validate_set(const struct nlattr *a,
 
 		break;
 
+	case OVS_KEY_ATTR_NSH:
+		if (!validate_nsh(nla_data(a), masked, false, log))
+			return -EINVAL;
+		break;
+
 	default:
 		return -EINVAL;
 	}
@@ -2482,6 +2871,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
 			[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
 			[OVS_ACTION_ATTR_POP_ETH] = 0,
+			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
+			[OVS_ACTION_ATTR_POP_NSH] = 0,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -2636,6 +3027,19 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			mac_proto = MAC_PROTO_ETHERNET;
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_NSH:
+			mac_proto = MAC_PROTO_NONE;
+			if (!validate_nsh(nla_data(a), false, true, true))
+				return -EINVAL;
+			break;
+
+		case OVS_ACTION_ATTR_POP_NSH:
+			if (key->nsh.np == TUN_P_ETHERNET)
+				mac_proto = MAC_PROTO_ETHERNET;
+			else
+				mac_proto = MAC_PROTO_NONE;
+			break;
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 929c665..7be6750 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -79,4 +79,8 @@ int ovs_nla_put_actions(const struct nlattr *attr,
 void ovs_nla_free_flow_actions(struct sw_flow_actions *);
 void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
 
+int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh);
+int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nsh_src,
+			size_t size);
+
 #endif /* flow_netlink.h */
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
       [not found] ` <1504096752-102003-1-git-send-email-yi.y.yang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
@ 2017-08-31 10:45   ` Jiri Benc
  2017-09-04  8:00     ` Yang, Yi
  2017-09-08  6:13     ` Yang, Yi
  2017-09-04 13:10   ` Jiri Benc
  1 sibling, 2 replies; 21+ messages in thread
From: Jiri Benc @ 2017-08-31 10:45 UTC (permalink / raw)
  To: Yi Yang
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Wed, 30 Aug 2017 20:39:12 +0800, Yi Yang wrote:
> --- a/net/nsh/nsh.c
> +++ b/net/nsh/nsh.c
> @@ -14,6 +14,47 @@
>  #include <net/nsh.h>
>  #include <net/tun_proto.h>
>  
> +int skb_push_nsh(struct sk_buff *skb, const struct nshhdr *nsh_src, bool is_eth)
> +{
> +	struct nshhdr *nsh;
> +	size_t length = nsh_hdr_len(nsh_src);
> +	u8 next_proto;
> +
> +	if (is_eth) {
> +		next_proto = TUN_P_ETHERNET;

I don't like the is_eth parameter. It should be clear from the skb being
passed to the function, specifically from skb->mac_len.

> +	} else {
> +		next_proto = tun_p_from_eth_p(skb->protocol);
> +		if (!next_proto)
> +			return -ENOTSUPP;

EAFNOSUPPORT is more suitable here.

> +	}
> +
> +	/* Add the NSH header */
> +	if (skb_cow_head(skb, length) < 0)
> +		return -ENOMEM;
> +
> +	/* Add the NSH header */
> +	if (skb_cow_head(skb, length) < 0)
> +		return -ENOMEM;

Duplicate statement.

> +
> +	if (!skb->inner_protocol) {
> +		skb_set_inner_network_header(skb, skb->mac_len);
> +		skb_set_inner_protocol(skb, skb->protocol);

It doesn't make sense to set either of those. Nothing uses the fields for
NSH.

> +	}
> +
> +	skb_push(skb, length);
> +	nsh = (struct nshhdr *)(skb->data);

Use nsh_hdr.

> +	memcpy(nsh, nsh_src, length);
> +	nsh->np = next_proto;
> +	nsh->mdtype &= NSH_MDTYPE_MASK;

The "& NSH_MDTYPE_MASK" operation does not make sense. Just trust the caller
to provide a meaningful value. It's its job to verify the parameters.

> +	skb->protocol = htons(ETH_P_NSH);
> +	skb_reset_mac_header(skb);
> +	skb_reset_mac_len(skb);

You have to reset also the network header.

> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(skb_push_nsh);
> +
>  static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
>  				       netdev_features_t features)
>  {
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index a54a556..e969fad 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -38,11 +38,13 @@
>  #include <net/dsfield.h>
>  #include <net/mpls.h>
>  #include <net/sctp/checksum.h>
> +#include <net/tun_proto.h>
>  
>  #include "datapath.h"
>  #include "flow.h"
>  #include "conntrack.h"
>  #include "vport.h"
> +#include "flow_netlink.h"
>  
>  struct deferred_action {
>  	struct sk_buff *skb;
> @@ -380,6 +382,57 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
>  	return 0;
>  }
>  
> +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
> +		    const struct nshhdr *nsh_src)
> +{
> +	bool is_eth = false;
> +	int ret;
> +
> +	if (key->mac_proto == MAC_PROTO_ETHERNET)
> +		is_eth = true;
> +
> +	ret = skb_push_nsh(skb, nsh_src, is_eth);
> +	if (ret != 0)

The usual idiom is "if (ret)". And for consistency with the rest of the
file, the name of the variable should be "err".

> +		return ret;
> +
> +	key->eth.type = htons(ETH_P_NSH);
> +
> +	/* safe right before invalidate_flow_key */
> +	key->mac_proto = MAC_PROTO_NONE;
> +	invalidate_flow_key(key);
> +	return 0;
> +}
> +
> +static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> +	struct nshhdr *nsh = (struct nshhdr *)(skb->data);
> +	size_t length;
> +	u16 inner_proto;
> +
> +	if (ovs_key_mac_proto(key) != MAC_PROTO_NONE ||
> +	    skb->protocol != htons(ETH_P_NSH)) {
> +		return -EINVAL;
> +	}
> +
> +	inner_proto = tun_p_to_eth_p(nsh->np);
> +	if (!inner_proto)
> +		return -ENOTSUPP;
> +
> +	length = nsh_hdr_len(nsh);
> +	skb_pull(skb, length);
> +	skb_reset_mac_header(skb);
> +	skb_reset_mac_len(skb);
> +	skb->protocol = inner_proto;

Please factor this out to skb_pop_nsh, similarly to what you did with
skb_push_nsh.

> +
> +	/* safe right before invalidate_flow_key */
> +	if (inner_proto == htons(ETH_P_TEB))
> +		key->mac_proto = MAC_PROTO_ETHERNET;
> +	else
> +		key->mac_proto = MAC_PROTO_NONE;
> +	invalidate_flow_key(key);
> +	return 0;
> +}
> +
>  static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
>  				  __be32 addr, __be32 new_addr)
>  {
> @@ -602,6 +655,53 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
>  	return 0;
>  }
>  
> +static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
> +		   const struct ovs_key_nsh *key,
> +		   const struct ovs_key_nsh *mask)
> +{
> +	struct nshhdr *nsh;
> +	int err;
> +	u8 flags;
> +	u8 ttl;
> +	int i;
> +
> +	err = skb_ensure_writable(skb, skb_network_offset(skb) +
> +				  sizeof(struct nshhdr));
> +	if (unlikely(err))
> +		return err;
> +
> +	nsh = (struct nshhdr *)skb_network_header(skb);

nsh_hdr

> +
> +	flags = nsh_get_flags(nsh);
> +	flags = OVS_MASKED(flags, key->flags, mask->flags);
> +	flow_key->nsh.flags = flags;
> +	ttl = nsh_get_ttl(nsh);
> +	ttl = OVS_MASKED(ttl, key->ttl, mask->ttl);
> +	flow_key->nsh.ttl = ttl;
> +	nsh_set_flags_and_ttl(nsh, flags, ttl);
> +	nsh->path_hdr = OVS_MASKED(nsh->path_hdr, key->path_hdr,
> +				   mask->path_hdr);
> +	flow_key->nsh.path_hdr = nsh->path_hdr;
> +	switch (nsh->mdtype) {
> +	case NSH_M_TYPE1:
> +		for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) {
> +			nsh->md1.context[i] =
> +			    OVS_MASKED(nsh->md1.context[i], key->context[i],
> +				       mask->context[i]);
> +		}
> +		memcpy(flow_key->nsh.context, nsh->md1.context,
> +		       sizeof(nsh->md1.context));

Do you follow the discussion that Hannes Sowa started on the ovs list
regarding matching on the context fields? It would be better to hold on this
until there's a conclusion reached.

> +		break;
> +	case NSH_M_TYPE2:
> +		memset(flow_key->nsh.context, 0,
> +		       sizeof(flow_key->nsh.context));
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +	return 0;
> +}
> +
>  /* Must follow skb_ensure_writable() since that can move the skb data. */
>  static void set_tp_port(struct sk_buff *skb, __be16 *port,
>  			__be16 new_port, __sum16 *check)
> @@ -1024,6 +1124,32 @@ static int execute_masked_set_action(struct sk_buff *skb,
>  				   get_mask(a, struct ovs_key_ethernet *));
>  		break;
>  
> +	case OVS_KEY_ATTR_NSH: {
> +		struct ovs_key_nsh nsh;
> +		struct ovs_key_nsh nsh_mask;
> +		size_t size = nla_len(a) / 2;
> +		struct {
> +			struct nlattr nla;
> +			u8 data[size];
> +		} attr, mask;
> +
> +		attr.nla.nla_type = nla_type(a);
> +		attr.nla.nla_len = NLA_HDRLEN + size;
> +		memcpy(attr.data, (char *)(a + 1), size);
> +
> +		mask.nla = attr.nla;
> +		memcpy(mask.data, (char *)(a + 1) + size, size);

This is much cleaner than before. Thank you for doing the change. It now
allows me to understand what's actually going on here.

> +		err = nsh_key_from_nlattr(&attr.nla, &nsh);
> +		if (err)
> +			break;
> +		err = nsh_key_from_nlattr(&mask.nla, &nsh_mask);
> +		if (err)
> +			break;

This is a lot of copying to just be able to use nla_for_each_nested. While
I'm not a fan of how ovs uses the attributes to store both value and mask,
it's not completely irrational. However, I don't think that the intent was
to store a copy of the whole nested group. It's quite obvious that it does
not fit the pattern from the gymnastics you need to do.

Instead, store the mask in each of the nested attributes individually. There
will be no need for the copying above and the code will get cleaner and
faster. It's not that the nsh_key_from_nlattr function needs to be able to
work separately for the key and mask. Nothing else than the line above uses
this function. Just move the logic inside.

Actually, it seems it can be moved directly to set_nsh.

> +		err = set_nsh(skb, flow_key, &nsh, &nsh_mask);
> +		break;
> +	}
> +
>  	case OVS_KEY_ATTR_IPV4:
>  		err = set_ipv4(skb, flow_key, nla_data(a),
>  			       get_mask(a, struct ovs_key_ipv4 *));
> @@ -1210,6 +1336,21 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>  		case OVS_ACTION_ATTR_POP_ETH:
>  			err = pop_eth(skb, key);
>  			break;
> +
> +		case OVS_ACTION_ATTR_PUSH_NSH: {
> +			u8 buffer[NSH_HDR_MAX_LEN];
> +			struct nshhdr *nsh_hdr = (struct nshhdr *)buffer;
> +			const struct nshhdr *nsh_src = nsh_hdr;
> +
> +			nsh_hdr_from_nlattr(nla_data(a), nsh_hdr,
> +					    NSH_HDR_MAX_LEN);

This is costly. This is called for every packet in the fast path. We should
precompute and store the header instead.

I know I had comments to this part before and it would be ideal if I raised
this earlier but since you had trivial bugs like not taking the buffer size
into account, it was hard to focus on the design side of things. You can
prevent this churn by paying more attention to details before submission.
It's really hard to focus on high level picture when the first thing one
encounters is a trivial bug.

> +			err = push_nsh(skb, key, nsh_src);
> +			break;
> +		}
> +
> +		case OVS_ACTION_ATTR_POP_NSH:
> +			err = pop_nsh(skb, key);
> +			break;
>  		}
>  
>  		if (unlikely(err)) {
> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> index 8c94cef..7a178d1 100644
> --- a/net/openvswitch/flow.c
> +++ b/net/openvswitch/flow.c
> @@ -46,6 +46,7 @@
>  #include <net/ipv6.h>
>  #include <net/mpls.h>
>  #include <net/ndisc.h>
> +#include <net/nsh.h>
>  
>  #include "conntrack.h"
>  #include "datapath.h"
> @@ -490,6 +491,56 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
>  	return 0;
>  }
>  
> +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> +	struct nshhdr *nsh;
> +	unsigned int nh_ofs = skb_network_offset(skb);
> +	u8 version, length;
> +	int err;
> +
> +	err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN);
> +	if (unlikely(err))
> +		return err;
> +
> +	nsh = (struct nshhdr *)skb_network_header(skb);

nsh_hdr

> +	version = nsh_get_ver(nsh);
> +	length = nsh_hdr_len(nsh);
> +
> +	if (version != 0)
> +		return -EINVAL;
> +
> +	err = check_header(skb, nh_ofs + length);
> +	if (unlikely(err))
> +		return err;
> +
> +	nsh = (struct nshhdr *)skb_network_header(skb);

nsh_hdr

> +	key->nsh.flags = nsh_get_flags(nsh);
> +	key->nsh.ttl = nsh_get_ttl(nsh);
> +	key->nsh.mdtype = nsh->mdtype;
> +	key->nsh.np = nsh->np;
> +	key->nsh.path_hdr = nsh->path_hdr;
> +	switch (key->nsh.mdtype) {
> +	case NSH_M_TYPE1:
> +		if (length != NSH_M_TYPE1_LEN)
> +			return -EINVAL;
> +		memcpy(key->nsh.context, nsh->md1.context,
> +		       sizeof(nsh->md1));
> +		break;
> +	case NSH_M_TYPE2:
> +		/* Don't support MD type 2 metedata parsing yet */
> +		if (length < NSH_BASE_HDR_LEN)
> +			return -EINVAL;

Shouldn't we reject the packet, then? Pretending that something was parsed
correctly while in fact it was not, does not look correct.

> +
> +		memset(key->nsh.context, 0,
> +		       sizeof(nsh->md1));
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
>  /**
>   * key_extract - extracts a flow key from an Ethernet frame.
>   * @skb: sk_buff that contains the frame, with skb->data pointing to the
> @@ -735,6 +786,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
>  				memset(&key->tp, 0, sizeof(key->tp));
>  			}
>  		}
> +	} else if (key->eth.type == htons(ETH_P_NSH)) {
> +		error = parse_nsh(skb, key);
> +		if (error)
> +			return error;
>  	}
>  	return 0;
>  }
> diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
> index 1875bba..6a3cd9c 100644
> --- a/net/openvswitch/flow.h
> +++ b/net/openvswitch/flow.h
> @@ -35,6 +35,7 @@
>  #include <net/inet_ecn.h>
>  #include <net/ip_tunnels.h>
>  #include <net/dst_metadata.h>
> +#include <net/nsh.h>
>  
>  struct sk_buff;
>  
> @@ -66,6 +67,15 @@ struct vlan_head {
>  	(offsetof(struct sw_flow_key, recirc_id) +	\
>  	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
>  
> +struct ovs_key_nsh {
> +	u8 flags;
> +	u8 ttl;
> +	u8 mdtype;
> +	u8 np;
> +	__be32 path_hdr;
> +	__be32 context[NSH_MD1_CONTEXT_SIZE];
> +};
> +
>  struct sw_flow_key {
>  	u8 tun_opts[IP_TUNNEL_OPTS_MAX];
>  	u8 tun_opts_len;
> @@ -144,6 +154,7 @@ struct sw_flow_key {
>  			};
>  		} ipv6;
>  	};
> +	struct ovs_key_nsh nsh;         /* network service header */
>  	struct {
>  		/* Connection tracking fields not packed above. */
>  		struct {
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index e8eb427..17df00a 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> @@ -48,6 +48,7 @@
>  #include <net/ndisc.h>
>  #include <net/mpls.h>
>  #include <net/vxlan.h>
> +#include <net/tun_proto.h>
>  
>  #include "flow_netlink.h"
>  
> @@ -78,9 +79,11 @@ static bool actions_may_change_flow(const struct nlattr *actions)
>  		case OVS_ACTION_ATTR_HASH:
>  		case OVS_ACTION_ATTR_POP_ETH:
>  		case OVS_ACTION_ATTR_POP_MPLS:
> +		case OVS_ACTION_ATTR_POP_NSH:
>  		case OVS_ACTION_ATTR_POP_VLAN:
>  		case OVS_ACTION_ATTR_PUSH_ETH:
>  		case OVS_ACTION_ATTR_PUSH_MPLS:
> +		case OVS_ACTION_ATTR_PUSH_NSH:
>  		case OVS_ACTION_ATTR_PUSH_VLAN:
>  		case OVS_ACTION_ATTR_SAMPLE:
>  		case OVS_ACTION_ATTR_SET:
> @@ -322,12 +325,27 @@ size_t ovs_tun_key_attr_size(void)
>  		+ nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
>  }
>  
> +size_t ovs_nsh_key_attr_size(void)
> +{
> +	/* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider
> +	 * updating this function.
> +	 */
> +	return  nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */
> +		/* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are
> +		 * mutually exclusive, so the bigger one can cover
> +		 * the small one.
> +		 *
> +		 * OVS_NSH_KEY_ATTR_MD2
> +		 */
> +		+ nla_total_size(NSH_CTX_HDRS_MAX_LEN);
> +}
> +
>  size_t ovs_key_attr_size(void)
>  {
>  	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
>  	 * updating this function.
>  	 */
> -	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
> +	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29);
>  
>  	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
>  		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
> @@ -341,6 +359,8 @@ size_t ovs_key_attr_size(void)
>  		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
>  		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */
>  		+ nla_total_size(40)  /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
> +		+ nla_total_size(0)   /* OVS_KEY_ATTR_NSH */
> +		  + ovs_nsh_key_attr_size()
>  		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
>  		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
>  		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
> @@ -373,6 +393,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
>  	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
>  };
>  
> +static const struct ovs_len_tbl
> +ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = {
> +	[OVS_NSH_KEY_ATTR_BASE]     = { .len = 8 },

sizeof(struct ovs_nsh_key_base)

> +	[OVS_NSH_KEY_ATTR_MD1]      = { .len = 16 },

sizeof(struct ovs_nsh_key_md1)

> +	[OVS_NSH_KEY_ATTR_MD2]      = { .len = OVS_ATTR_VARIABLE },
> +};
> +
>  /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
>  static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
>  	[OVS_KEY_ATTR_ENCAP]	 = { .len = OVS_ATTR_NESTED },
> @@ -405,6 +432,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
>  		.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
>  	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
>  		.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
> +	[OVS_KEY_ATTR_NSH]       = { .len = OVS_ATTR_NESTED,
> +				     .next = ovs_nsh_key_attr_lens, },
>  };
>  
>  static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
> @@ -1179,6 +1208,304 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
>  	return 0;
>  }
>  
> +int nsh_hdr_from_nlattr(const struct nlattr *attr,
> +			struct nshhdr *nsh, size_t size)
> +{
> +	struct nlattr *a;
> +	int rem;
> +	u8 flags = 0;
> +	u8 ttl = 0;
> +	int mdlen = 0;
> +	bool has_md1 = false;
> +	bool has_md2 = false;
> +	u8 len;
> +
> +	nla_for_each_nested(a, attr, rem) {
> +		int type = nla_type(a);
> +
> +		if (type > OVS_NSH_KEY_ATTR_MAX) {
> +			OVS_NLERR(1, "nsh attr %d is out of range max %d",
> +				  type, OVS_NSH_KEY_ATTR_MAX);
> +			return -EINVAL;
> +		}
> +
> +		if (!check_attr_len(nla_len(a),
> +				    ovs_nsh_key_attr_lens[type].len)) {
> +			OVS_NLERR(
> +			    1,
> +			    "nsh attr %d has unexpected len %d expected %d",
> +			    type,
> +			    nla_len(a),
> +			    ovs_nsh_key_attr_lens[type].len
> +			);
> +			return -EINVAL;
> +		}

These checks should be done only once when the action is configured, not for
each packet.

> +
> +		switch (type) {
> +		case OVS_NSH_KEY_ATTR_BASE: {
> +			const struct ovs_nsh_key_base *base =
> +				(struct ovs_nsh_key_base *)nla_data(a);
> +			flags = base->flags;
> +			ttl = base->ttl;
> +			nsh->np = base->np;
> +			nsh->mdtype = base->mdtype;
> +			nsh->path_hdr = base->path_hdr;
> +			break;
> +		}
> +		case OVS_NSH_KEY_ATTR_MD1: {
> +			const struct ovs_nsh_key_md1 *md1 =
> +				(struct ovs_nsh_key_md1 *)nla_data(a);
> +			struct nsh_md1_ctx *md1_dst = &nsh->md1;
> +
> +			has_md1 = true;
> +			mdlen = nla_len(a);
> +			if (((mdlen + NSH_BASE_HDR_LEN) != NSH_M_TYPE1_LEN) ||
> +			    ((mdlen + NSH_BASE_HDR_LEN) > size) ||
> +			    (mdlen <= 0)) {
> +				OVS_NLERR(
> +				    1,
> +				    "length %d of nsh attr %d is invalid",
> +				    mdlen,
> +				    type
> +				);
> +				return -EINVAL;
> +			}

Ditto for most parts of the check. Plus the maximum size should be validated
elsewhere. In this function, there should be a WARN_ON_ONCE if the computed
length > size.

> +			memcpy(md1_dst, md1, mdlen);
> +			break;
> +		}
> +		case OVS_NSH_KEY_ATTR_MD2: {
> +			const struct u8 *md2 = nla_data(a);
> +			struct nsh_md2_tlv *md2_dst = &nsh->md2;
> +
> +			has_md2 = true;
> +			mdlen = nla_len(a);
> +			if (((mdlen + NSH_BASE_HDR_LEN) > size) ||
> +			    (mdlen <= 0)) {
> +				OVS_NLERR(
> +				    1,
> +				    "length %d of nsh attr %d is invalid",
> +				    mdlen,
> +				    type
> +				);
> +				return -EINVAL;
> +			}

Ditto.

> +			memcpy(md2_dst, md2, mdlen);
> +			break;
> +		}
> +		default:
> +			OVS_NLERR(1, "Unknown nsh attribute %d",
> +				  type);
> +			return -EINVAL;

Ditto.

> +		}
> +	}
> +
> +	if (rem > 0) {
> +		OVS_NLERR(1, "nsh attribute has %d unknown bytes.", rem);
> +		return -EINVAL;
> +	}
> +
> +	if ((has_md1 && nsh->mdtype != NSH_M_TYPE1) ||
> +	    (has_md2 && nsh->mdtype != NSH_M_TYPE2)) {
> +		OVS_NLERR(1,
> +			  "nsh attribute has unmatched MD type %d.",
> +			  nsh->mdtype);
> +		return -EINVAL;
> +	}
> +
> +	if (unlikely(has_md1 && has_md2)) {
> +		OVS_NLERR(1, "both nsh md1 and md2 attribute are there");
> +		return -EINVAL;
> +	}
> +
> +	if (unlikely(!has_md1 && !has_md2)) {
> +		OVS_NLERR(1, "neither nsh md1 nor md2 attribute is there");
> +		return -EINVAL;
> +	}

Ditto. Plus I don't see a check that the OVS_NSH_KEY_ATTR_BASE attribute is
present. Seems that the compiler warned you about possibly unitialized flags
and ttl variables and you just silenced the warning without considering
whether it's not actually justified.

> +
> +	/* nsh header length  = NSH_BASE_HDR_LEN + mdlen */
> +	nsh->ver_flags_ttl_len = 0;
> +	len = NSH_BASE_HDR_LEN + mdlen;

In the whole function you open code NSH_BASE_HDR_LEN + mdlen and suddenly,
at the very end, you store it to a variable to be used just once? Call me
confused.

> +	nsh_set_flags_ttl_len(nsh, flags, ttl, len);
> +
> +	return 0;
> +}

So, this whole function does a lot of processing and copying. Why don't we
just parse the attributes once, cache the resulting header and just memcpy
it in the hot path?

> +
> +int nsh_key_from_nlattr(const struct nlattr *attr,
> +			struct ovs_key_nsh *nsh)

See above plus what I wrote for the previous function, it applies here as
well.

I stop here. I suspect there are similar things in the rest of the patch.
Please think about what I wrote and apply it to all similar situations. Not
as you do with nsh_push where you apparenly ignored nsh_pop just because
I mentioned nsh_push only.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-08-31 10:45   ` Jiri Benc
@ 2017-09-04  8:00     ` Yang, Yi
       [not found]       ` <20170904080005.GA70767-re2EX8HDrk21gSHoDXDV2kEOCMrvLtNR@public.gmane.org>
  2017-09-08  6:13     ` Yang, Yi
  1 sibling, 1 reply; 21+ messages in thread
From: Yang, Yi @ 2017-09-04  8:00 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev, davem, dev, e, blp, jan.scheurich

On Thu, Aug 31, 2017 at 06:45:16PM +0800, Jiri Benc wrote:
> > +				       mask->context[i]);
> > +		}
> > +		memcpy(flow_key->nsh.context, nsh->md1.context,
> > +		       sizeof(nsh->md1.context));
> 
> Do you follow the discussion that Hannes Sowa started on the ovs list
> regarding matching on the context fields? It would be better to hold on this
> until there's a conclusion reached.
>

I think we have had similiar discussion about this, the issue is we have
no way to handle both MD type 1 and MD type 2 by using a common flow key struct.

So we have to do so, there is miniflow to handle such issue in
userspace, but kernel data path didn't provide such mechanism. I think
every newly-added key will result in the same space-wasting issue for
kernel data path, isn't it?

> > +		err = nsh_key_from_nlattr(&attr.nla, &nsh);
> > +		if (err)
> > +			break;
> > +		err = nsh_key_from_nlattr(&mask.nla, &nsh_mask);
> > +		if (err)
> > +			break;
> 
> This is a lot of copying to just be able to use nla_for_each_nested. While
> I'm not a fan of how ovs uses the attributes to store both value and mask,
> it's not completely irrational. However, I don't think that the intent was
> to store a copy of the whole nested group. It's quite obvious that it does
> not fit the pattern from the gymnastics you need to do.
> 
> Instead, store the mask in each of the nested attributes individually. There
> will be no need for the copying above and the code will get cleaner and
> faster. It's not that the nsh_key_from_nlattr function needs to be able to
> work separately for the key and mask. Nothing else than the line above uses
> this function. Just move the logic inside.
> 
> Actually, it seems it can be moved directly to set_nsh.

OVS_ATTR_KEY_NSH is the first case to use nested attributes for set
action, so no reference code for this, set action has two use cases, one
has mask but the other hasn't, so it will be easier an nested attribute is
handled as a whole, in user space, nsh_key_from_nlattr is used in
several places. If we change it per your proposal, there will be a lot
of rework, we have to provide two functions to handle this, one is for
the case with mask, the other is for the case without mask.

> 
> > +		err = set_nsh(skb, flow_key, &nsh, &nsh_mask);
> > +		break;
> > +	}
> > +
> >  	case OVS_KEY_ATTR_IPV4:
> >  		err = set_ipv4(skb, flow_key, nla_data(a),
> >  			       get_mask(a, struct ovs_key_ipv4 *));
> > @@ -1210,6 +1336,21 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> >  		case OVS_ACTION_ATTR_POP_ETH:
> >  			err = pop_eth(skb, key);
> >  			break;
> > +
> > +		case OVS_ACTION_ATTR_PUSH_NSH: {
> > +			u8 buffer[NSH_HDR_MAX_LEN];
> > +			struct nshhdr *nsh_hdr = (struct nshhdr *)buffer;
> > +			const struct nshhdr *nsh_src = nsh_hdr;
> > +
> > +			nsh_hdr_from_nlattr(nla_data(a), nsh_hdr,
> > +					    NSH_HDR_MAX_LEN);
> 
> This is costly. This is called for every packet in the fast path. We should
> precompute and store the header instead.

How can we do so? I see batch process code in user space implementation,
but kernel data path didn't such infrastructure, how can we know next
push_nsh uses the same nsh header as previous one?

> > +	case NSH_M_TYPE2:
> > +		/* Don't support MD type 2 metedata parsing yet */
> > +		if (length < NSH_BASE_HDR_LEN)
> > +			return -EINVAL;
> 
> Shouldn't we reject the packet, then? Pretending that something was parsed
> correctly while in fact it was not, does not look correct.

For MD type 2, we don't implement metadata parsing, but it can still
work. I'm not sure what you mean here, how do we reject a packet here?

> 
> These checks should be done only once when the action is configured, not for
> each packet.

I don't know how to implement a batch processing in kernel data path, it
seems there isn't such code for reference.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
       [not found]       ` <20170904080005.GA70767-re2EX8HDrk21gSHoDXDV2kEOCMrvLtNR@public.gmane.org>
@ 2017-09-04 10:42         ` Jiri Benc
  2017-09-04 12:09           ` Yang, Yi
                             ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Jiri Benc @ 2017-09-04 10:42 UTC (permalink / raw)
  To: Yang, Yi
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Mon, 4 Sep 2017 16:00:05 +0800, Yang, Yi wrote:
> I think we have had similiar discussion about this, the issue is we have
> no way to handle both MD type 1 and MD type 2 by using a common flow key struct.
> 
> So we have to do so, there is miniflow to handle such issue in
> userspace, but kernel data path didn't provide such mechanism. I think
> every newly-added key will result in the same space-wasting issue for
> kernel data path, isn't it?

Yes. And it would be surprising if it didn't have an effect on
performance.

I don't care that much as this is a result of openvswitch module design
decisions but it still would be good to reach a consensus before
implementing uAPI that might not be needed in the end.

> OVS_ATTR_KEY_NSH is the first case to use nested attributes for set
> action, so no reference code for this, set action has two use cases, one
> has mask but the other hasn't, so it will be easier an nested attribute is
> handled as a whole, in user space, nsh_key_from_nlattr is used in
> several places.

I very much disagree. What you're doing is very poor design as can be
seen from the code where you have to do weird gymnastics to implement
that. Compare that to a much simple case where each attribute carries
its own value and mask.

> If we change it per your proposal, there will be a lot
> of rework,

That's not an argument. We care about doing things right, we don't do
things hastily and with as little effort as possible.

> we have to provide two functions to handle this, one is for
> the case with mask, the other is for the case without mask.

I don't see that. The function is called from a single place only.

> How can we do so? I see batch process code in user space implementation,
> but kernel data path didn't such infrastructure,

You're right that there's no infrastructure. I somewhat agree that it
might be out of scope of this patch and it can be optimized later.
That's why I also included other suggestions to speed this up until we
implement better parsing: namely, verify correctness once (at the time
it is set from user space) and just expect things to be correct in the
fast path.

> how can we know next push_nsh uses the same nsh header as previous
> one?

We store the prepopulated header together with the action.

> > Shouldn't we reject the packet, then? Pretending that something was parsed
> > correctly while in fact it was not, does not look correct.
> 
> For MD type 2, we don't implement metadata parsing, but it can still
> work. I'm not sure what you mean here, how do we reject a packet here?

Okay, we can match on mdtype and thus can find out about this type of
packets. No problem, then.

> > These checks should be done only once when the action is configured, not for
> > each packet.
> 
> I don't know how to implement a batch processing in kernel data path, it
> seems there isn't such code for reference.

The checks should be done somewhere in __ovs_nla_copy_action. Which
seems to be done even in your patch by nsh_key_put_from_nlattr
(I didn't get that far during the review last week. The names of
those nsh_(key|hdr)_*_nlattr are so similar that it's impossible to
tell what they do without looking at the call sites - something you
should also consider to improve). That makes it even more wrong: you
appear to check everything twice, first on configuration and then for
every packet.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 10:42         ` Jiri Benc
@ 2017-09-04 12:09           ` Yang, Yi
  2017-09-04 12:57             ` Jiri Benc
  2017-09-04 12:57           ` Jan Scheurich
  2017-09-05  6:37           ` Yang, Yi
  2 siblings, 1 reply; 21+ messages in thread
From: Yang, Yi @ 2017-09-04 12:09 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev, davem, dev, e, blp, jan.scheurich

On Mon, Sep 04, 2017 at 12:42:16PM +0200, Jiri Benc wrote:
> On Mon, 4 Sep 2017 16:00:05 +0800, Yang, Yi wrote:
> > I think we have had similiar discussion about this, the issue is we have
> > no way to handle both MD type 1 and MD type 2 by using a common flow key struct.
> > 
> > So we have to do so, there is miniflow to handle such issue in
> > userspace, but kernel data path didn't provide such mechanism. I think
> > every newly-added key will result in the same space-wasting issue for
> > kernel data path, isn't it?
> 
> Yes. And it would be surprising if it didn't have an effect on
> performance.
> 
> I don't care that much as this is a result of openvswitch module design
> decisions but it still would be good to reach a consensus before
> implementing uAPI that might not be needed in the end.
> 
> > OVS_ATTR_KEY_NSH is the first case to use nested attributes for set
> > action, so no reference code for this, set action has two use cases, one
> > has mask but the other hasn't, so it will be easier an nested attribute is
> > handled as a whole, in user space, nsh_key_from_nlattr is used in
> > several places.
> 
> I very much disagree. What you're doing is very poor design as can be
> seen from the code where you have to do weird gymnastics to implement
> that. Compare that to a much simple case where each attribute carries
> its own value and mask.
> 
> > If we change it per your proposal, there will be a lot
> > of rework,
> 
> That's not an argument. We care about doing things right, we don't do
> things hastily and with as little effort as possible.

Jiri, I check OVS source code carefully, if you check OVS code in
format_odp_action in lib/odp-util.c, it has a strong assumption for set
action, mask is following value no matter it is simple attribute or
nested attribute, I copy source code here for your reference,

    case OVS_ACTION_ATTR_SET_MASKED:
        a = nl_attr_get(a);
        size = nl_attr_get_size(a) / 2;
        ds_put_cstr(ds, "set(");

        /* Masked set action not supported for tunnel key, which is
 * bigger. */
        if (size <= sizeof(struct ovs_key_ipv6)) {
            struct nlattr attr[1 + DIV_ROUND_UP(sizeof(struct
ovs_key_ipv6),
                                                sizeof(struct nlattr))];
            struct nlattr mask[1 + DIV_ROUND_UP(sizeof(struct
ovs_key_ipv6),
                                                sizeof(struct nlattr))];

            mask->nla_type = attr->nla_type = nl_attr_type(a);
            mask->nla_len = attr->nla_len = NLA_HDRLEN + size;
            memcpy(attr + 1, (char *)(a + 1), size);
            memcpy(mask + 1, (char *)(a + 1) + size, size);
            format_odp_key_attr(attr, mask, NULL, ds, false);
        } else {
            format_odp_key_attr(a, NULL, NULL, ds, false);
        }
        ds_put_cstr(ds, ")");
        break;

So we must do many changes if we want to break this assumption.

> 
> > we have to provide two functions to handle this, one is for
> > the case with mask, the other is for the case without mask.
> 
> I don't see that. The function is called from a single place only.
> 
> > How can we do so? I see batch process code in user space implementation,
> > but kernel data path didn't such infrastructure,
> 
> You're right that there's no infrastructure. I somewhat agree that it
> might be out of scope of this patch and it can be optimized later.
> That's why I also included other suggestions to speed this up until we
> implement better parsing: namely, verify correctness once (at the time
> it is set from user space) and just expect things to be correct in the
> fast path.
> 
> > how can we know next push_nsh uses the same nsh header as previous
> > one?
> 
> We store the prepopulated header together with the action.
> 
> > > Shouldn't we reject the packet, then? Pretending that something was parsed
> > > correctly while in fact it was not, does not look correct.
> > 
> > For MD type 2, we don't implement metadata parsing, but it can still
> > work. I'm not sure what you mean here, how do we reject a packet here?
> 
> Okay, we can match on mdtype and thus can find out about this type of
> packets. No problem, then.
> 
> > > These checks should be done only once when the action is configured, not for
> > > each packet.
> > 
> > I don't know how to implement a batch processing in kernel data path, it
> > seems there isn't such code for reference.
> 
> The checks should be done somewhere in __ovs_nla_copy_action. Which
> seems to be done even in your patch by nsh_key_put_from_nlattr
> (I didn't get that far during the review last week. The names of
> those nsh_(key|hdr)_*_nlattr are so similar that it's impossible to
> tell what they do without looking at the call sites - something you
> should also consider to improve). That makes it even more wrong: you
> appear to check everything twice, first on configuration and then for
> every packet.

Ok, I'll carefully remove unnecessary checks in next version.

> 
>  Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 10:42         ` Jiri Benc
  2017-09-04 12:09           ` Yang, Yi
@ 2017-09-04 12:57           ` Jan Scheurich
       [not found]             ` <CFF8EF42F1132E4CBE2BF0AB6C21C58D787F4E84-hqolJogE5njKJFWPz4pdheaU1rCVNFv4@public.gmane.org>
  2017-09-05  6:37           ` Yang, Yi
  2 siblings, 1 reply; 21+ messages in thread
From: Jan Scheurich @ 2017-09-04 12:57 UTC (permalink / raw)
  To: Jiri Benc, Yang, Yi; +Cc: netdev, davem, dev, e, blp

> On Mon, 4 Sep 2017 16:00:05 +0800, Yang, Yi wrote:
> > I think we have had similiar discussion about this, the issue is we have
> > no way to handle both MD type 1 and MD type 2 by using a common flow key struct.
> >
> > So we have to do so, there is miniflow to handle such issue in
> > userspace, but kernel data path didn't provide such mechanism. I think
> > every newly-added key will result in the same space-wasting issue for
> > kernel data path, isn't it?
> 
> Yes. And it would be surprising if it didn't have an effect on
> performance.
> 
> I don't care that much as this is a result of openvswitch module design
> decisions but it still would be good to reach a consensus before
> implementing uAPI that might not be needed in the end.

Matching on NSH MD1 context headers is definitely required. So these must
be part of the flow.

> 
> > OVS_ATTR_KEY_NSH is the first case to use nested attributes for set
> > action, so no reference code for this, set action has two use cases, one
> > has mask but the other hasn't, so it will be easier an nested attribute is
> > handled as a whole, in user space, nsh_key_from_nlattr is used in
> > several places.
> 
> I very much disagree. What you're doing is very poor design as can be
> seen from the code where you have to do weird gymnastics to implement
> that. Compare that to a much simple case where each attribute carries
> its own value and mask.

I have no principle objections against splitting NSH base header and MD1 context
headers into independent key attributes on the uAPI if that simplifies the code.
But we need to full picture here:

So is what you are suggesting the following?

For matching:
OVS_KEY_ATTR_NSH_BASE_HEADER		mandatory
OVS_KEY_ATTR_NSH_MD1_CONTEXT		optional, in case MDTYPE == MD1

For setting:
OVS_ACTION_ATTR_SET_MASKED
-    OVS_KEY_ATTR_NSH_BASE_HEADER		when setting any base header fields
OVS_ACTION_ATTR_SET_MASKED
-    OVS_KEY_ATTR_NSH_MD1_CONTEXT	when setting any MD1 context data fields

For push_nsh:
OVS_ACTION_ATTR_PUSH_NSH
-    OVS_KEY_ATTR_NSH_BASE_HEADER		mandatory
-    OVS_NSH_ATTR_CONTEXT_HEADERS		optional (opaque metadata octet stream) 

So push_nsh would still require nested attributes. Is that what we want (see below).

> > How can we do so? I see batch process code in user space implementation,
> > but kernel data path didn't such infrastructure,
> 
> You're right that there's no infrastructure. I somewhat agree that it
> might be out of scope of this patch and it can be optimized later.
> That's why I also included other suggestions to speed this up until we
> implement better parsing: namely, verify correctness once (at the time
> it is set from user space) and just expect things to be correct in the
> fast path.
> 
> > how can we know next push_nsh uses the same nsh header as previous
> > one?
> 
> We store the prepopulated header together with the action.

I agree.

In our original modelling of OVS_ACTION_ATTR_PUSH_NSH in OVS 2.8 we 
introduced a monolithic push_nsh action struct including the NSH base header
and variable length opaque context headers. That avoids any logic in the 
kernel datapath to translate a nested action attribute into an internal format
with pre-populated header. The user space does that work.

Should we consider to stick to that simple and efficient approach? As far 
As I can see it will be generic for the foreseeable future.

> > > These checks should be done only once when the action is configured, not for
> > > each packet.
> >
> > I don't know how to implement a batch processing in kernel data path, it
> > seems there isn't such code for reference.
> 
> The checks should be done somewhere in __ovs_nla_copy_action. Which
> seems to be done even in your patch by nsh_key_put_from_nlattr
> (I didn't get that far during the review last week. The names of
> those nsh_(key|hdr)_*_nlattr are so similar that it's impossible to
> tell what they do without looking at the call sites - something you
> should also consider to improve). That makes it even more wrong: you
> appear to check everything twice, first on configuration and then for
> every packet.

Agree. Validation should only happen at action configuration time.

BR, Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 12:09           ` Yang, Yi
@ 2017-09-04 12:57             ` Jiri Benc
  2017-09-05  5:51               ` Yang, Yi
  0 siblings, 1 reply; 21+ messages in thread
From: Jiri Benc @ 2017-09-04 12:57 UTC (permalink / raw)
  To: Yang, Yi; +Cc: netdev, davem, dev, e, blp, jan.scheurich

On Mon, 4 Sep 2017 20:09:07 +0800, Yang, Yi wrote:
> So we must do many changes if we want to break this assumption.

We may do as many changes as we want to. This is uAPI we're talking
about and we need to get it right since the beginning. Sure, it may
mean that some user space programs need some changes in order to make
use of the new features. That happens every day.

I also don't understand where's the problem. It's very easy to check
for NLA_F_NESTED and generically act based on that in the function you
quoted. Just call a different function than format_odp_key_attr to
handle ovs_nsh_key_attr attributes in case the nested flag is set and
the attribute is OVS_KEY_ATTR_NSH and you're done. You'll need such
function anyway, it's not much different code size wise to call it from
format_odp_key_attr or from format_odp_action.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
       [not found] ` <1504096752-102003-1-git-send-email-yi.y.yang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  2017-08-31 10:45   ` Jiri Benc
@ 2017-09-04 13:10   ` Jiri Benc
  1 sibling, 0 replies; 21+ messages in thread
From: Jiri Benc @ 2017-09-04 13:10 UTC (permalink / raw)
  To: Yi Yang
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Wed, 30 Aug 2017 20:39:12 +0800, Yi Yang wrote:
> +enum ovs_nsh_key_attr {
> +	OVS_NSH_KEY_ATTR_BASE,  /* struct ovs_nsh_key_base. */
> +	OVS_NSH_KEY_ATTR_MD1,   /* struct ovs_nsh_key_md1. */
> +	OVS_NSH_KEY_ATTR_MD2,   /* variable-length octets for MD type 2. */
> +	__OVS_NSH_KEY_ATTR_MAX
> +};

One more thing: 0 is reserved, the first attr must be
OVS_NSH_KEY_ATTR_UNSPEC.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
       [not found]             ` <CFF8EF42F1132E4CBE2BF0AB6C21C58D787F4E84-hqolJogE5njKJFWPz4pdheaU1rCVNFv4@public.gmane.org>
@ 2017-09-04 13:16               ` Jiri Benc
  2017-09-04 14:07                 ` Jan Scheurich
  0 siblings, 1 reply; 21+ messages in thread
From: Jiri Benc @ 2017-09-04 13:16 UTC (permalink / raw)
  To: Jan Scheurich
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Mon, 4 Sep 2017 12:57:15 +0000, Jan Scheurich wrote:
> So is what you are suggesting the following?
> 
> For matching:
> OVS_KEY_ATTR_NSH_BASE_HEADER		mandatory
> OVS_KEY_ATTR_NSH_MD1_CONTEXT		optional, in case MDTYPE == MD1

This needs to be:

OVS_KEY_ATTR_NSH
	OVS_KEY_ATTR_NSH_BASE_HEADER
	OVS_KEY_ATTR_NSH_MD1_CONTEXT

> For setting:
> OVS_ACTION_ATTR_SET_MASKED
> -    OVS_KEY_ATTR_NSH_BASE_HEADER		when setting any base header fields
> OVS_ACTION_ATTR_SET_MASKED
> -    OVS_KEY_ATTR_NSH_MD1_CONTEXT	when setting any MD1 context data fields

This needs to be:

OVS_ACTION_ATTR_SET_MASKED
	OVS_KEY_ATTR_NSH
		OVS_KEY_ATTR_NSH_BASE_HEADER
		OVS_KEY_ATTR_NSH_MD1_CONTEXT

In your example, there's no way to distinguish OVS_KEY_ATTR_ENCAP and
OVS_KEY_ATTR_NSH_BASE_HEADER, they are both "1".

> Should we consider to stick to that simple and efficient approach? As far 
> As I can see it will be generic for the foreseeable future.

I'm not sure. From the kernel point of view, we want the same
functionality offered by the openvswitch module and by a tc action.

In theory, it can be the tc tool that constructs the header from the
command line but there's no precedent. Dunno.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 13:16               ` Jiri Benc
@ 2017-09-04 14:07                 ` Jan Scheurich
       [not found]                   ` <CFF8EF42F1132E4CBE2BF0AB6C21C58D787F4EEB-hqolJogE5njKJFWPz4pdheaU1rCVNFv4@public.gmane.org>
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Scheurich @ 2017-09-04 14:07 UTC (permalink / raw)
  To: Jiri Benc; +Cc: Yang, Yi, netdev, davem, dev, e, blp

> > So is what you are suggesting the following?
> >
> > For matching:
> > OVS_KEY_ATTR_NSH_BASE_HEADER		mandatory
> > OVS_KEY_ATTR_NSH_MD1_CONTEXT		optional, in case MDTYPE == MD1
> 
> This needs to be:
> 
> OVS_KEY_ATTR_NSH
> 	OVS_KEY_ATTR_NSH_BASE_HEADER
> 	OVS_KEY_ATTR_NSH_MD1_CONTEXT
> 
> > For setting:
> > OVS_ACTION_ATTR_SET_MASKED
> > -    OVS_KEY_ATTR_NSH_BASE_HEADER		when setting any base header fields
> > OVS_ACTION_ATTR_SET_MASKED
> > -    OVS_KEY_ATTR_NSH_MD1_CONTEXT	when setting any MD1 context data fields
> 
> This needs to be:
> 
> OVS_ACTION_ATTR_SET_MASKED
> 	OVS_KEY_ATTR_NSH
> 		OVS_KEY_ATTR_NSH_BASE_HEADER
> 		OVS_KEY_ATTR_NSH_MD1_CONTEXT
> 

Then perhaps I misunderstood your comment. I thought you didn't like that the 
SET_MASKED action wrapped OVS_KEY_ATTR_NSH which in itself was nested.
I was aiming to avoid this by lifting the two components of the NSH header 
components to the top level:
OVS_NSH_ATTR_BASE_HEADER --> OVS_KEY_ATTR_NSH_BASE_HEADER
OVS_NSH_ATTR_MD1_CONTEXT --> OVS_KEY_ATTR_NSH_MD1_CONTEXT

> In your example, there's no way to distinguish OVS_KEY_ATTR_ENCAP and
> OVS_KEY_ATTR_NSH_BASE_HEADER, they are both "1".

See above. The two would be separate values in the same enum, i.e. distinct.

> > Should we consider to stick to that simple and efficient approach? As far
> > As I can see it will be generic for the foreseeable future.
> 
> I'm not sure. From the kernel point of view, we want the same
> functionality offered by the openvswitch module and by a tc action.
> 
> In theory, it can be the tc tool that constructs the header from the
> command line but there's no precedent. Dunno.

Not sure I have the full picture here. Are you saying that the tc tool uses the same
Netlink API to the kernel as OVS? What would be the back-end for tc? Also the
openvswitch kernel module? 

BR, Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
       [not found]                   ` <CFF8EF42F1132E4CBE2BF0AB6C21C58D787F4EEB-hqolJogE5njKJFWPz4pdheaU1rCVNFv4@public.gmane.org>
@ 2017-09-04 14:13                     ` Jiri Benc
  2017-09-04 14:45                       ` Jan Scheurich
  0 siblings, 1 reply; 21+ messages in thread
From: Jiri Benc @ 2017-09-04 14:13 UTC (permalink / raw)
  To: Jan Scheurich
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Mon, 4 Sep 2017 14:07:45 +0000, Jan Scheurich wrote:
> Then perhaps I misunderstood your comment. I thought you didn't like that the 
> SET_MASKED action wrapped OVS_KEY_ATTR_NSH which in itself was nested.
> I was aiming to avoid this by lifting the two components of the NSH header 
> components to the top level:
> OVS_NSH_ATTR_BASE_HEADER --> OVS_KEY_ATTR_NSH_BASE_HEADER
> OVS_NSH_ATTR_MD1_CONTEXT --> OVS_KEY_ATTR_NSH_MD1_CONTEXT

No, this should be a nested attr.

I objected to the way value+mask combo is handled.

> See above. The two would be separate values in the same enum, i.e. distinct.

This is not how netlink attrs should be designed.

> Not sure I have the full picture here. Are you saying that the tc tool uses the same
> Netlink API to the kernel as OVS? What would be the back-end for tc? Also the
> openvswitch kernel module? 

It does not use the same API but it makes sense for these two to share
common code. Plus hw offloading in ovs is done using tc.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 14:13                     ` Jiri Benc
@ 2017-09-04 14:45                       ` Jan Scheurich
  2017-09-05  9:49                         ` Jiri Benc
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Scheurich @ 2017-09-04 14:45 UTC (permalink / raw)
  To: Jiri Benc
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

> On Mon, 4 Sep 2017 14:07:45 +0000, Jan Scheurich wrote:
> > Then perhaps I misunderstood your comment. I thought you didn't like that the
> > SET_MASKED action wrapped OVS_KEY_ATTR_NSH which in itself was nested.
> > I was aiming to avoid this by lifting the two components of the NSH header
> > components to the top level:
> > OVS_NSH_ATTR_BASE_HEADER --> OVS_KEY_ATTR_NSH_BASE_HEADER
> > OVS_NSH_ATTR_MD1_CONTEXT --> OVS_KEY_ATTR_NSH_MD1_CONTEXT
> 
> No, this should be a nested attr.
> 
> I objected to the way value+mask combo is handled.

OK, sorry for the confusion. 

So what is the correct layout for MASKED_SET action with nested fields?
1. All nested values, followed by all nested masks, or
2. For each nested field value followed by mask?

I guess alternative 1, but just to be sure.

Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 12:57             ` Jiri Benc
@ 2017-09-05  5:51               ` Yang, Yi
       [not found]                 ` <20170905055144.GA88800-re2EX8HDrk21gSHoDXDV2kEOCMrvLtNR@public.gmane.org>
  0 siblings, 1 reply; 21+ messages in thread
From: Yang, Yi @ 2017-09-05  5:51 UTC (permalink / raw)
  To: Jiri Benc
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Mon, Sep 04, 2017 at 08:57:44PM +0800, Jiri Benc wrote:
> On Mon, 4 Sep 2017 20:09:07 +0800, Yang, Yi wrote:
> > So we must do many changes if we want to break this assumption.
> 
> We may do as many changes as we want to. This is uAPI we're talking
> about and we need to get it right since the beginning. Sure, it may
> mean that some user space programs need some changes in order to make
> use of the new features. That happens every day.
> 
> I also don't understand where's the problem. It's very easy to check
> for NLA_F_NESTED and generically act based on that in the function you
> quoted. Just call a different function than format_odp_key_attr to
> handle ovs_nsh_key_attr attributes in case the nested flag is set and
> the attribute is OVS_KEY_ATTR_NSH and you're done. You'll need such
> function anyway, it's not much different code size wise to call it from
> format_odp_key_attr or from format_odp_action.

But if we follow your way, how does nla_for_each_nested handle such
pattern?

attribute1
attribute1_mask
attribute2
attribute2_mask
attribute3
attribute3_mask

I don't think this can increase performance and readability.

In current way, we just call nla_for_each_nested twice

one is for

attribute1
attribute2
attribute3

another is for

attribute1_mask
attribute2_mask
attribute3_mask

if we use one function to handle both attributes and masks, I can't
see any substantial difference between two ways as far as the
performance is concerned.

So my proposal is we needn't introduce special handling case for
OVS_KEY_ATTR_NSH in OVS_ACTION_ATTR_SET_MASKED, that will open Pandora's
box.

If we consider OVS_KEY_ATTR_NSH as a big attribute, current way is
precisely following current design pattern

attribute
mask

> 
>  Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 10:42         ` Jiri Benc
  2017-09-04 12:09           ` Yang, Yi
  2017-09-04 12:57           ` Jan Scheurich
@ 2017-09-05  6:37           ` Yang, Yi
  2017-09-05  9:47             ` Jiri Benc
  2 siblings, 1 reply; 21+ messages in thread
From: Yang, Yi @ 2017-09-05  6:37 UTC (permalink / raw)
  To: Jiri Benc
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Mon, Sep 04, 2017 at 06:42:16PM +0800, Jiri Benc wrote:
> On Mon, 4 Sep 2017 16:00:05 +0800, Yang, Yi wrote:
> > how can we know next push_nsh uses the same nsh header as previous
> > one?
> 
> We store the prepopulated header together with the action.
>

I checked source code but can't find where we can prepopulate it and how
we can deliver the prepopulated header to push_nsh, can you expand it in
order that I can know how to do this in code level?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
       [not found]                 ` <20170905055144.GA88800-re2EX8HDrk21gSHoDXDV2kEOCMrvLtNR@public.gmane.org>
@ 2017-09-05  9:46                   ` Jiri Benc
  2017-09-05 11:03                     ` Yang, Yi
  0 siblings, 1 reply; 21+ messages in thread
From: Jiri Benc @ 2017-09-05  9:46 UTC (permalink / raw)
  To: Yang, Yi
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Tue, 5 Sep 2017 13:51:45 +0800, Yang, Yi wrote:
> But if we follow your way, how does nla_for_each_nested handle such
> pattern?
> 
> attribute1
> attribute1_mask
> attribute2
> attribute2_mask
> attribute3
> attribute3_mask

Uh? That will just work. Note that nla len contains the size of the
whole attribute, i.e. includes both fields.

> I don't think this can increase performance and readability.

Do you realize you're stating that not copying data around in hot path
can't increase performance? ;-)

> if we use one function to handle both attributes and masks, I can't
> see any substantial difference between two ways as far as the
> performance is concerned.

Except that what you did is so unexpected to netlink that you had to go
out of your way to parse it. Those two memcpys speak for themselves.

> If we consider OVS_KEY_ATTR_NSH as a big attribute, current way is
> precisely following current design pattern

Do you have an idea how nested netlink attributes work? It's very well
defined. What you're doing is breaking the definition. You can't do
that.

The (non-nested) attributes contain header followed by data. The data
is a single value or it is a struct. In ovs for masked actions,
attributes contain a struct of two fields (value and mask). The struct
access is open coded but it's still a struct.

Now, nested attributes are very well defined: it's a nla header
followed by a stream of attributes. There's no requirement on the order
of the attributes. Do you see how you're breaking this assumption? You
expect that each attribute is present exactly twice, once among first
half of attributes, once among second half of attributes. You don't
check that this weird assumption is adhered to. You don't even check
that the point where you split the data is between attributes! You may
very well be splitting an attribute in half and interpreting its data
as nla headers.

Consider your proposal NACKed from my side.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-05  6:37           ` Yang, Yi
@ 2017-09-05  9:47             ` Jiri Benc
  2017-09-05 10:57               ` Yang, Yi
  0 siblings, 1 reply; 21+ messages in thread
From: Jiri Benc @ 2017-09-05  9:47 UTC (permalink / raw)
  To: Yang, Yi; +Cc: netdev, davem, dev, e, blp, jan.scheurich

On Tue, 5 Sep 2017 14:37:05 +0800, Yang, Yi wrote:
> I checked source code but can't find where we can prepopulate it and how
> we can deliver the prepopulated header to push_nsh, can you expand it in
> order that I can know how to do this in code level?

I already said that it's not implemented yet and can be done as a
future performance enhancement.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-04 14:45                       ` Jan Scheurich
@ 2017-09-05  9:49                         ` Jiri Benc
  2017-09-05 10:36                           ` Jan Scheurich
  0 siblings, 1 reply; 21+ messages in thread
From: Jiri Benc @ 2017-09-05  9:49 UTC (permalink / raw)
  To: Jan Scheurich; +Cc: Yang, Yi, netdev, davem, dev, e, blp

On Mon, 4 Sep 2017 14:45:50 +0000, Jan Scheurich wrote:
> So what is the correct layout for MASKED_SET action with nested fields?
> 1. All nested values, followed by all nested masks, or
> 2. For each nested field value followed by mask?
> 
> I guess alternative 1, but just to be sure.

It's 2. Alternative 1 breaks netlink assumptions, is ugly to implement
and probably impossible to be properly validated.

 Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-05  9:49                         ` Jiri Benc
@ 2017-09-05 10:36                           ` Jan Scheurich
  0 siblings, 0 replies; 21+ messages in thread
From: Jan Scheurich @ 2017-09-05 10:36 UTC (permalink / raw)
  To: Jiri Benc
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

> From: Jiri Benc [mailto:jbenc-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org]

> > So what is the correct layout for MASKED_SET action with nested fields?
> > 1. All nested values, followed by all nested masks, or
> > 2. For each nested field value followed by mask?
> >
> > I guess alternative 1, but just to be sure.
> 
> It's 2. Alternative 1 breaks netlink assumptions, is ugly to implement
> and probably impossible to be properly validated.

OK. For outsiders this was far from obvious :-)

So, for OVS_ACTION_ATTR_SET_MASKED any nested attribute, no matter on which nesting level, must contain value directly followed by mask.

If that is the principle of handling masks in Netlink APIs, than we must adhere to it.

BR, Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-05  9:47             ` Jiri Benc
@ 2017-09-05 10:57               ` Yang, Yi
  0 siblings, 0 replies; 21+ messages in thread
From: Yang, Yi @ 2017-09-05 10:57 UTC (permalink / raw)
  To: Jiri Benc
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Tue, Sep 05, 2017 at 11:47:41AM +0200, Jiri Benc wrote:
> On Tue, 5 Sep 2017 14:37:05 +0800, Yang, Yi wrote:
> > I checked source code but can't find where we can prepopulate it and how
> > we can deliver the prepopulated header to push_nsh, can you expand it in
> > order that I can know how to do this in code level?
> 
> I already said that it's not implemented yet and can be done as a
> future performance enhancement.

Got it, thanks.

> 
>  Jiri

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-09-05  9:46                   ` Jiri Benc
@ 2017-09-05 11:03                     ` Yang, Yi
  0 siblings, 0 replies; 21+ messages in thread
From: Yang, Yi @ 2017-09-05 11:03 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev, davem, dev, e, blp, jan.scheurich

On Tue, Sep 05, 2017 at 11:46:45AM +0200, Jiri Benc wrote:
> On Tue, 5 Sep 2017 13:51:45 +0800, Yang, Yi wrote:
> > But if we follow your way, how does nla_for_each_nested handle such
> > pattern?
> > 
> > attribute1
> > attribute1_mask
> > attribute2
> > attribute2_mask
> > attribute3
> > attribute3_mask
> 
> Uh? That will just work. Note that nla len contains the size of the
> whole attribute, i.e. includes both fields.
> 
> > I don't think this can increase performance and readability.
>

I got the point, actually value and mask are data of one attribute, so
they are three attributes but not six. 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next v7] openvswitch: enable NSH support
  2017-08-31 10:45   ` Jiri Benc
  2017-09-04  8:00     ` Yang, Yi
@ 2017-09-08  6:13     ` Yang, Yi
  1 sibling, 0 replies; 21+ messages in thread
From: Yang, Yi @ 2017-09-08  6:13 UTC (permalink / raw)
  To: Jiri Benc
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q

On Thu, Aug 31, 2017 at 06:45:16PM +0800, Jiri Benc wrote:
> On Wed, 30 Aug 2017 20:39:12 +0800, Yi Yang wrote:
> > +			nsh->md1.context[i] =
> > +			    OVS_MASKED(nsh->md1.context[i], key->context[i],
> > +				       mask->context[i]);
> > +		}
> > +		memcpy(flow_key->nsh.context, nsh->md1.context,
> > +		       sizeof(nsh->md1.context));
> 
> Do you follow the discussion that Hannes Sowa started on the ovs list
> regarding matching on the context fields? It would be better to hold on this
> until there's a conclusion reached.

We had several back-and-forth discussions, Jan was also involved, the
conslusion is current way is still the best way we can have, we need
set and match for context fields.

> > +			    type,
> > +			    nla_len(a),
> > +			    ovs_nsh_key_attr_lens[type].len
> > +			);
> > +			return -EINVAL;
> > +		}
> 
> These checks should be done only once when the action is configured, not for
> each packet.

in v8, nsh_key_put_from_nlattr is only one check point, validate_nsh calls it
for set and push_nsh, nsh_hdr_from_nlattr and nsh_key_from_nlattr
haven't these checked anymore because validate_nsh has done them.

> > +	if (unlikely(!has_md1 && !has_md2)) {
> > +		OVS_NLERR(1, "neither nsh md1 nor md2 attribute is there");
> > +		return -EINVAL;
> > +	}
> 
> Ditto. Plus I don't see a check that the OVS_NSH_KEY_ATTR_BASE attribute is
> present. Seems that the compiler warned you about possibly unitialized flags
> and ttl variables and you just silenced the warning without considering
> whether it's not actually justified.

nsh_key_put_from_nlattr is used for both set and push_nsh, so
OVS_NSH_KEY_ATTR_BASE check is valid only for push_nsh, it is possible
for set not to have OVS_NSH_KEY_ATTR_BASE. v8 has been successfully built
without any warnings, it is also verified in sfc test environment.

I think v8 has fixed all the comments for v7, I have sent out v8 to
mailing list, please help review it, thanks a lot.

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2017-09-08  6:13 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-30 12:39 [PATCH net-next v7] openvswitch: enable NSH support Yi Yang
     [not found] ` <1504096752-102003-1-git-send-email-yi.y.yang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2017-08-31 10:45   ` Jiri Benc
2017-09-04  8:00     ` Yang, Yi
     [not found]       ` <20170904080005.GA70767-re2EX8HDrk21gSHoDXDV2kEOCMrvLtNR@public.gmane.org>
2017-09-04 10:42         ` Jiri Benc
2017-09-04 12:09           ` Yang, Yi
2017-09-04 12:57             ` Jiri Benc
2017-09-05  5:51               ` Yang, Yi
     [not found]                 ` <20170905055144.GA88800-re2EX8HDrk21gSHoDXDV2kEOCMrvLtNR@public.gmane.org>
2017-09-05  9:46                   ` Jiri Benc
2017-09-05 11:03                     ` Yang, Yi
2017-09-04 12:57           ` Jan Scheurich
     [not found]             ` <CFF8EF42F1132E4CBE2BF0AB6C21C58D787F4E84-hqolJogE5njKJFWPz4pdheaU1rCVNFv4@public.gmane.org>
2017-09-04 13:16               ` Jiri Benc
2017-09-04 14:07                 ` Jan Scheurich
     [not found]                   ` <CFF8EF42F1132E4CBE2BF0AB6C21C58D787F4EEB-hqolJogE5njKJFWPz4pdheaU1rCVNFv4@public.gmane.org>
2017-09-04 14:13                     ` Jiri Benc
2017-09-04 14:45                       ` Jan Scheurich
2017-09-05  9:49                         ` Jiri Benc
2017-09-05 10:36                           ` Jan Scheurich
2017-09-05  6:37           ` Yang, Yi
2017-09-05  9:47             ` Jiri Benc
2017-09-05 10:57               ` Yang, Yi
2017-09-08  6:13     ` Yang, Yi
2017-09-04 13:10   ` Jiri Benc

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).