* [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
From: Toke Høiland-Jørgensen @ 2019-02-21 11:56 UTC
  To: David Miller
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov

An XDP program can redirect packets between interfaces using either the
xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
flexibility of updating maps from userspace, the redirect_map() helper also
uses the map structure to batch packets, which results in a significant
(around 50%) performance boost. However, the xdp_redirect() API is simpler
if one just wants to redirect to another interface, which means people tend
to use it and then wonder why they get worse performance
than expected.
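
For illustration, a minimal sketch of the two patterns from the BPF
program side (the map, the section names and the target ifindex are made
up for the example; bpf_helpers.h is the selftests helper header):

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Hypothetical devmap, populated from userspace with target ifindexes */
struct bpf_map_def SEC("maps") tx_port = {
	.type = BPF_MAP_TYPE_DEVMAP,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 1,
};

/* The simple API: no map needed, but (before this patch) no batching */
SEC("xdp_redirect_simple")
int redirect_simple(struct xdp_md *ctx)
{
	return bpf_redirect(3 /* hypothetical target ifindex */, 0);
}

/* The map API: needs userspace setup, but batches transmits */
SEC("xdp_redirect_via_map")
int redirect_via_map(struct xdp_md *ctx)
{
	return bpf_redirect_map(&tx_port, 0 /* map key */, 0);
}

char _license[] SEC("license") = "GPL";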

This patch seeks to close this performance difference between the two APIs.
It achieves this by changing xdp_redirect() to use a hidden devmap for
looking up destination interfaces, thus gaining the batching benefit with
no visible difference from the user API point of view.

A hidden per-namespace map is allocated when an XDP program that uses the
non-map xdp_redirect() helper is first loaded. This map is populated with
all available interfaces in its namespace, and kept up to date as
interfaces come and go. Once allocated, the map is kept around until the
namespace is removed.

The hidden map uses the ifindex as the map key, which means redirects are
limited to interfaces with an ifindex smaller than the map size of 64. A
later patch introduces a new map type to lift this restriction.

Performance numbers:

Before patch:
xdp_redirect:     5426035 pkt/s
xdp_redirect_map: 8412754 pkt/s

After patch:
xdp_redirect:     8314702 pkt/s
xdp_redirect_map: 8411854 pkt/s

This corresponds to a 53% increase in xdp_redirect performance, or a
reduction in per-packet processing time by 64 nanoseconds.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/bpf.h         |   12 +++
 include/net/net_namespace.h |    2 -
 include/net/netns/xdp.h     |    5 +
 kernel/bpf/devmap.c         |  159 +++++++++++++++++++++++++++++++++++--------
 kernel/bpf/verifier.c       |    6 ++
 net/core/filter.c           |   58 +---------------
 6 files changed, 157 insertions(+), 85 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bd169a7bcc93..4f8f179df9fd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -606,6 +606,8 @@ struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
+int dev_map_alloc_default_map(void);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
@@ -687,6 +689,16 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
+static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
+{
+	return NULL;
+}
+
+static inline int dev_map_alloc_default_map(void)
+{
+	return 0;
+}
+
 static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 {
 }
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index a68ced28d8f4..6706ecc25d8f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -162,7 +162,7 @@ struct net {
 #if IS_ENABLED(CONFIG_CAN)
 	struct netns_can	can;
 #endif
-#ifdef CONFIG_XDP_SOCKETS
+#ifdef CONFIG_BPF_SYSCALL
 	struct netns_xdp	xdp;
 #endif
 	struct sock		*diag_nlsk;
diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
index e5734261ba0a..4e2d6394b45d 100644
--- a/include/net/netns/xdp.h
+++ b/include/net/netns/xdp.h
@@ -5,9 +5,14 @@
 #include <linux/rculist.h>
 #include <linux/mutex.h>
 
+struct bpf_dtab;
+
 struct netns_xdp {
+#ifdef CONFIG_XDP_SOCKETS
 	struct mutex		lock;
 	struct hlist_head	list;
+#endif
+	struct bpf_dtab	*default_map;
 };
 
 #endif /* __NETNS_XDP_H__ */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..425077664ac6 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -56,6 +56,7 @@
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
 #define DEV_MAP_BULK_SIZE 16
+#define DEV_MAP_DEFAULT_SIZE 64
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
 	struct net_device *dev_rx;
@@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
+			    bool check_memlock)
 {
-	struct bpf_dtab *dtab;
-	int err = -EINVAL;
 	u64 cost;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	dtab = kzalloc(sizeof(*dtab), GFP_USER);
-	if (!dtab)
-		return ERR_PTR(-ENOMEM);
+	int err;
 
 	bpf_map_init_from_attr(&dtab->map, attr);
 
@@ -109,39 +98,63 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_dtab;
+		return -EINVAL;
 
 	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.pages);
-	if (err)
-		goto free_dtab;
-
-	err = -ENOMEM;
+	if (check_memlock) {
+		/* if map size is larger than memlock limit, reject it early */
+		err = bpf_map_precharge_memlock(dtab->map.pages);
+		if (err)
+			return -EINVAL;
+	}
 
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
 						__alignof__(unsigned long),
 						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
-		goto free_dtab;
+		return -ENOMEM;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
-	if (!dtab->netdev_map)
-		goto free_dtab;
+	if (!dtab->netdev_map) {
+		free_percpu(dtab->flush_needed);
+		return -ENOMEM;
+	}
 
 	spin_lock(&dev_map_lock);
 	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
+	return 0;
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_dtab *dtab;
+	int err = -EINVAL;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	dtab = kzalloc(sizeof(*dtab), GFP_USER);
+	if (!dtab)
+		return ERR_PTR(-ENOMEM);
+
+	err = dev_map_init_map(dtab, attr, true);
+	if (err) {
+		kfree(dtab);
+		return ERR_PTR(err);
+	}
+
 	return &dtab->map;
-free_dtab:
-	free_percpu(dtab->flush_needed);
-	kfree(dtab);
-	return ERR_PTR(err);
 }
 
 static void dev_map_free(struct bpf_map *map)
@@ -311,6 +324,17 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return obj;
 }
 
+/* This is only being called from xdp_do_redirect() if the xdp_redirect helper
+ * is used; the default map is allocated on XDP program load if the helper is
+ * used, so will always be available at this point.
+ */
+struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+
+	return &net->xdp.default_map->map;
+}
+
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
@@ -496,10 +520,19 @@ static int dev_map_notification(struct notifier_block *notifier,
 				ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(netdev);
+	u32 idx = netdev->ifindex;
 	struct bpf_dtab *dtab;
 	int i;
 
 	switch (event) {
+	case NETDEV_REGISTER:
+		rcu_read_lock();
+		dtab = READ_ONCE(net->xdp.default_map);
+		if (dtab)
+			dev_map_update_elem(&dtab->map, &idx, &idx, 0);
+		rcu_read_unlock();
+		break;
 	case NETDEV_UNREGISTER:
 		/* This rcu_read_lock/unlock pair is needed because
 		 * dev_map_list is an RCU list AND to ensure a delete
@@ -528,16 +561,84 @@ static int dev_map_notification(struct notifier_block *notifier,
 	return NOTIFY_OK;
 }
 
+static void __net_exit dev_map_net_exit(struct net *net)
+{
+	struct bpf_dtab *dtab;
+
+	dtab = xchg(&net->xdp.default_map, NULL);
+	if (dtab)
+		dev_map_free(&dtab->map);
+}
+
 static struct notifier_block dev_map_notifier = {
 	.notifier_call = dev_map_notification,
 };
 
+static struct pernet_operations dev_map_net_ops = {
+	.exit = dev_map_net_exit,
+};
+
+int dev_map_alloc_default_map(void)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_dtab *dtab, *old_dtab;
+	struct net_device *netdev;
+	union bpf_attr attr = {};
+	u32 idx;
+	int err;
+
+	if (READ_ONCE(net->xdp.default_map))
+		return 0;
+
+	dtab = kzalloc(sizeof(*net->xdp.default_map), GFP_USER);
+	if (!dtab)
+		return -ENOMEM;
+
+	attr.max_entries = DEV_MAP_DEFAULT_SIZE;
+	attr.map_type = BPF_MAP_TYPE_DEVMAP;
+	attr.value_size = 4;
+	attr.key_size = 4;
+
+	err = dev_map_init_map(dtab, &attr, false);
+	if (err) {
+		kfree(dtab);
+		return err;
+	}
+
+	for_each_netdev(net, netdev) {
+		if (netdev->ifindex < DEV_MAP_DEFAULT_SIZE) {
+			idx = netdev->ifindex;
+			err = dev_map_update_elem(&dtab->map, &idx, &idx, 0);
+			if (err) {
+				dev_map_free(&dtab->map);
+				return err;
+			}
+		}
+	}
+
+	old_dtab = xchg(&net->xdp.default_map, dtab);
+	if (old_dtab)
+		dev_map_free(&old_dtab->map);
+
+	return 0;
+}
+
 static int __init dev_map_init(void)
 {
+	int err;
+
 	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
 	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
 		     offsetof(struct _bpf_dtab_netdev, dev));
+
 	register_netdevice_notifier(&dev_map_notifier);
+
+	err = register_pernet_subsys(&dev_map_net_ops);
+	if (err) {
+		unregister_netdevice_notifier(&dev_map_notifier);
+		return err;
+	}
+
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b63bc77af2d1..629661db36ee 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7527,6 +7527,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
+		if (insn->imm == BPF_FUNC_redirect) {
+			int err = dev_map_alloc_default_map();
+
+			if (err)
+				return err;
+		}
 		if (insn->imm == BPF_FUNC_override_return)
 			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
diff --git a/net/core/filter.c b/net/core/filter.c
index b5a002d7b263..c709b1468bb6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3326,58 +3326,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static int __bpf_tx_xdp(struct net_device *dev,
-			struct bpf_map *map,
-			struct xdp_buff *xdp,
-			u32 index)
-{
-	struct xdp_frame *xdpf;
-	int err, sent;
-
-	if (!dev->netdev_ops->ndo_xdp_xmit) {
-		return -EOPNOTSUPP;
-	}
-
-	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
-	if (unlikely(err))
-		return err;
-
-	xdpf = convert_to_xdp_frame(xdp);
-	if (unlikely(!xdpf))
-		return -EOVERFLOW;
-
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
-	if (sent <= 0)
-		return sent;
-	return 0;
-}
-
-static noinline int
-xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
-		     struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
-{
-	struct net_device *fwd;
-	u32 index = ri->ifindex;
-	int err;
-
-	fwd = dev_get_by_index_rcu(dev_net(dev), index);
-	ri->ifindex = 0;
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
-
-	err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
-	if (unlikely(err))
-		goto err;
-
-	_trace_xdp_redirect(dev, xdp_prog, index);
-	return 0;
-err:
-	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
-	return err;
-}
-
 static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 			    struct bpf_map *map,
 			    struct xdp_buff *xdp,
@@ -3508,10 +3456,10 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct bpf_map *map = READ_ONCE(ri->map);
 
-	if (likely(map))
-		return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
+	if (unlikely(!map))
+		map = __dev_map_get_default_map(dev);
 
-	return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
+	return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
 



* [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
From: Toke Høiland-Jørgensen @ 2019-02-21 11:56 UTC
  To: David Miller
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov

A common pattern when using xdp_redirect_map() is to create a device map
where the lookup key is simply ifindex. Because device maps are arrays,
this leaves holes in the map, and the map has to be sized to fit the
largest ifindex, regardless of how many devices are actually needed in
the map.

This patch adds a second type of device map where the key is interpreted as
an ifindex and looked up using a hashmap, instead of being used as an array
index. This leads to maps being densely packed, so they can be smaller.
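
As a userspace sketch of what the denser packing buys (hypothetical
ifindex 42; the libbpf calls are the same ones the selftest below uses,
and the new map type assumes this series is applied):

#include <stdio.h>
#include <bpf/bpf.h>

int main(void)
{
	__u32 key, val;
	int fd;

	/* Two entries are enough for two devices, whatever their ifindexes;
	 * a regular devmap would need max_entries > 42 to hold ifindex 42.
	 */
	fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_IDX, sizeof(key), sizeof(val),
			    2, 0);
	if (fd < 0) {
		perror("bpf_create_map");
		return 1;
	}

	/* For this map type the key is an ifindex and must equal the value */
	key = val = 42;
	if (bpf_map_update_elem(fd, &key, &val, 0))
		perror("bpf_map_update_elem");

	return 0;
}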

The default maps used by xdp_redirect() are changed to use the new map
type, which means that xdp_redirect() is no longer limited to ifindex < 64,
but instead to 64 total simultaneous interfaces per network namespace. This
also provides an easy way to compare the performance of devmap and
devmap_idx:

xdp_redirect_map (devmap): 8394560 pkt/s
xdp_redirect (devmap_idx): 8179480 pkt/s

Difference: 215080 pkt/s or 3.1 nanoseconds per packet.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/bpf.h                     |   12 +
 include/linux/bpf_types.h               |    1 
 include/trace/events/xdp.h              |    3 
 include/uapi/linux/bpf.h                |    1 
 kernel/bpf/devmap.c                     |  304 +++++++++++++++++++++++++++----
 kernel/bpf/verifier.c                   |    2 
 net/core/filter.c                       |   11 +
 tools/bpf/bpftool/map.c                 |    1 
 tools/include/uapi/linux/bpf.h          |    1 
 tools/lib/bpf/libbpf_probes.c           |    1 
 tools/testing/selftests/bpf/test_maps.c |   16 ++
 11 files changed, 310 insertions(+), 43 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4f8f179df9fd..e7308ff59071 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -606,9 +606,10 @@ struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct bpf_dtab_netdev *__dev_map_idx_lookup_elem(struct bpf_map *map, u32 key);
 struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
 int dev_map_alloc_default_map(void);
-void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
+void __dev_map_insert_ctx(struct bpf_map *map, struct bpf_dtab_netdev *dst);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
@@ -689,6 +690,12 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
+static inline struct net_device  *__dev_map_idx_lookup_elem(struct bpf_map *map,
+							    u32 key)
+{
+	return NULL;
+}
+
 static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
 {
 	return NULL;
@@ -699,7 +706,8 @@ static inline int dev_map_alloc_default_map(void)
 	return 0;
 }
 
-static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
+static inline void __dev_map_insert_ctx(struct bpf_map *map,
+					struct net_device *dst)
 {
 }
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 08bf2f1fe553..374c013ca243 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -59,6 +59,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_IDX, dev_map_idx_ops)
 #if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index e95cb86b65cf..fcf006d49f67 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -147,7 +147,8 @@ struct _bpf_dtab_netdev {
 
 #define devmap_ifindex(fwd, map)				\
 	(!fwd ? 0 :						\
-	 ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	 ((map->map_type == BPF_MAP_TYPE_DEVMAP ||              \
+	   map->map_type == BPF_MAP_TYPE_DEVMAP_IDX) ?		\
 	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1777fa0c61e4..a09243a38c2d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 	BPF_MAP_TYPE_QUEUE,
 	BPF_MAP_TYPE_STACK,
+	BPF_MAP_TYPE_DEVMAP_IDX,
 };
 
 /* Note that tracing related programs such as
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 425077664ac6..e9f7b0ad7ff3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -46,6 +46,12 @@
  * notifier hook walks the map we know that new dev references can not be
  * added by the user because core infrastructure ensures dev_get_by_index()
  * calls will fail at this point.
+ *
+ * The devmap_idx type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
 #include <net/xdp.h>
@@ -65,6 +71,8 @@ struct xdp_bulk_queue {
 
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
+	unsigned int ifindex;
+	struct hlist_node index_hlist;
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct xdp_bulk_queue __percpu *bulkq;
@@ -76,11 +84,29 @@ struct bpf_dtab {
 	struct bpf_dtab_netdev **netdev_map;
 	unsigned long __percpu *flush_needed;
 	struct list_head list;
+
+	/* these are only used for DEVMAP_IDX type maps */
+	unsigned long *bits_used;
+	struct hlist_head *dev_index_head;
+	spinlock_t index_lock;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
+static struct hlist_head *dev_map_create_hash(void)
+{
+	int i;
+	struct hlist_head *hash;
+
+	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
+	if (hash != NULL)
+		for (i = 0; i < NETDEV_HASHENTRIES; i++)
+			INIT_HLIST_HEAD(&hash[i]);
+
+	return hash;
+}
+
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
 	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
@@ -97,6 +123,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_IDX)
+		cost += dev_map_bitmap_size(attr) +
+			sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
+
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return -EINVAL;
 
@@ -119,9 +150,20 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
-	if (!dtab->netdev_map) {
-		free_percpu(dtab->flush_needed);
-		return -ENOMEM;
+	if (!dtab->netdev_map)
+		goto err_map;
+
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_IDX) {
+		dtab->bits_used = kzalloc(dev_map_bitmap_size(attr),
+					  GFP_KERNEL);
+		if (!dtab->bits_used)
+			goto err_bitmap;
+
+		dtab->dev_index_head = dev_map_create_hash();
+		if (!dtab->dev_index_head)
+			goto err_idx;
+
+		spin_lock_init(&dtab->index_lock);
 	}
 
 	spin_lock(&dev_map_lock);
@@ -129,6 +171,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 	spin_unlock(&dev_map_lock);
 
 	return 0;
+ err_idx:
+	kfree(dtab->bits_used);
+ err_bitmap:
+	bpf_map_area_free(dtab->netdev_map);
+ err_map:
+	free_percpu(dtab->flush_needed);
+	return -ENOMEM;
 }
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -200,6 +249,10 @@ static void dev_map_free(struct bpf_map *map)
 		kfree(dev);
 	}
 
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP_IDX) {
+		kfree(dtab->dev_index_head);
+		kfree(dtab->bits_used);
+	}
 	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
@@ -222,12 +275,76 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+						    int ifindex)
+{
+	return &dtab->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+}
+
+struct bpf_dtab_netdev *__dev_map_idx_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct hlist_head *head = dev_map_index_hash(dtab, key);
+	struct bpf_dtab_netdev *dev;
+
+	hlist_for_each_entry_rcu(dev, head, index_hlist)
+		if (dev->ifindex == key)
+			return dev;
+
+	return NULL;
+}
+
+static int dev_map_idx_get_next_key(struct bpf_map *map, void *key,
+				    void *next_key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 ifindex, *next = next_key;
+	struct bpf_dtab_netdev *dev, *next_dev;
+	struct hlist_head *head;
+	int i = 0;
+
+	if (!key)
+		goto find_first;
+
+	ifindex = *(u32 *)key;
+
+	dev = __dev_map_idx_lookup_elem(map, ifindex);
+	if (!dev)
+		goto find_first;
+
+	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+				    struct bpf_dtab_netdev, index_hlist);
+
+	if (next_dev) {
+		*next = next_dev->ifindex;
+		return 0;
+	}
+
+	i = ifindex & (NETDEV_HASHENTRIES - 1);
+	i++;
+
+ find_first:
+	for (; i < NETDEV_HASHENTRIES; i++) {
+		head = dev_map_index_hash(dtab, i);
+
+		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					    struct bpf_dtab_netdev,
+					    index_hlist);
+		if (next_dev) {
+			*next = next_dev->ifindex;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+void __dev_map_insert_ctx(struct bpf_map *map, struct bpf_dtab_netdev *dst)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
 
-	__set_bit(bit, bitmap);
+	__set_bit(dst->bit, bitmap);
 }
 
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
@@ -396,9 +513,16 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
-	struct net_device *dev = obj ? obj->dev : NULL;
 
-	return dev ? &dev->ifindex : NULL;
+	return obj ? &obj->ifindex : NULL;
+}
+
+static void *dev_map_idx_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab_netdev *obj = __dev_map_idx_lookup_elem(map,
+								*(u32 *)key);
+
+	return obj ? &obj->ifindex : NULL;
 }
 
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
@@ -453,12 +577,81 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static int dev_map_idx_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *old_dev;
+	int k = *(u32 *)key;
+
+	old_dev = __dev_map_idx_lookup_elem(map, k);
+	if (!old_dev)
+		return 0;
+
+	spin_lock(&dtab->index_lock);
+	hlist_del_rcu(&old_dev->index_hlist);
+	spin_unlock(&dtab->index_lock);
+
+	xchg(&dtab->netdev_map[old_dev->bit], NULL);
+	clear_bit_unlock(old_dev->bit, dtab->bits_used);
+	call_rcu(&old_dev->rcu, __dev_map_entry_free);
+	return 0;
+}
+
+
+static bool __dev_map_find_bit(struct bpf_dtab *dtab, unsigned int *bit)
+{
+	unsigned int b = 0;
+
+ retry:
+	b = find_next_zero_bit(dtab->bits_used, dtab->map.max_entries, b);
+
+	if (b >= dtab->map.max_entries)
+		return false;
+
+	if (test_and_set_bit_lock(b, dtab->bits_used))
+		goto retry;
+
+	*bit = b;
+	return true;
+}
+
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct bpf_dtab *dtab,
+						    u32 ifindex,
+						    unsigned int bit)
+{
 	struct net *net = current->nsproxy->net_ns;
 	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+	struct bpf_dtab_netdev *dev;
+
+	dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+					sizeof(void *), gfp);
+	if (!dev->bulkq) {
+		kfree(dev);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dev->dev = dev_get_by_index(net, ifindex);
+	if (!dev->dev) {
+		free_percpu(dev->bulkq);
+		kfree(dev);
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev->ifindex = dev->dev->ifindex;
+	dev->bit = bit;
+	dev->dtab = dtab;
+
+	return dev;
+}
+
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -473,26 +666,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
-		if (!dev)
-			return -ENOMEM;
-
-		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
-						sizeof(void *), gfp);
-		if (!dev->bulkq) {
-			kfree(dev);
-			return -ENOMEM;
-		}
-
-		dev->dev = dev_get_by_index(net, ifindex);
-		if (!dev->dev) {
-			free_percpu(dev->bulkq);
-			kfree(dev);
-			return -EINVAL;
-		}
-
-		dev->bit = i;
-		dev->dtab = dtab;
+		dev = __dev_map_alloc_node(dtab, ifindex, i);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
 	}
 
 	/* Use call_rcu() here to ensure rcu critical sections have completed
@@ -506,6 +682,52 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
+static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
+				   u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *dev, *old_dev;
+	u32 idx = *(u32 *)key;
+	u32 val = *(u32 *)value;
+	u32 bit;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+
+	old_dev = __dev_map_idx_lookup_elem(map, idx);
+	if (!val) {
+		if (!old_dev)
+			return 0;
+
+		xchg(&dtab->netdev_map[old_dev->bit], NULL);
+		spin_lock(&dtab->index_lock);
+		hlist_del_rcu(&old_dev->index_hlist);
+		spin_unlock(&dtab->index_lock);
+
+		clear_bit_unlock(old_dev->bit, dtab->bits_used);
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+	} else {
+		if (idx != val)
+			return -EINVAL;
+		if (old_dev)
+			return 0;
+		if (!__dev_map_find_bit(dtab, &bit))
+			return -E2BIG;
+		dev = __dev_map_alloc_node(dtab, idx, bit);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
+
+		xchg(&dtab->netdev_map[bit], dev);
+		spin_lock(&dtab->index_lock);
+		hlist_add_head_rcu(&dev->index_hlist,
+				   dev_map_index_hash(dtab, dev->ifindex));
+		spin_unlock(&dtab->index_lock);
+	}
+	return 0;
+}
+
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
@@ -516,6 +738,16 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_check_btf = map_check_no_btf,
 };
 
+const struct bpf_map_ops dev_map_idx_ops = {
+	.map_alloc = dev_map_alloc,
+	.map_free = dev_map_free,
+	.map_get_next_key = dev_map_idx_get_next_key,
+	.map_lookup_elem = dev_map_idx_lookup_elem,
+	.map_update_elem = dev_map_idx_update_elem,
+	.map_delete_elem = dev_map_idx_delete_elem,
+	.map_check_btf = map_check_no_btf,
+};
+
 static int dev_map_notification(struct notifier_block *notifier,
 				ulong event, void *ptr)
 {
@@ -595,7 +827,7 @@ int dev_map_alloc_default_map(void)
 		return -ENOMEM;
 
 	attr.max_entries = DEV_MAP_DEFAULT_SIZE;
-	attr.map_type = BPF_MAP_TYPE_DEVMAP;
+	attr.map_type = BPF_MAP_TYPE_DEVMAP_IDX;
 	attr.value_size = 4;
 	attr.key_size = 4;
 
@@ -606,13 +838,11 @@ int dev_map_alloc_default_map(void)
 	}
 
 	for_each_netdev(net, netdev) {
-		if (netdev->ifindex < DEV_MAP_DEFAULT_SIZE) {
-			idx = netdev->ifindex;
-			err = dev_map_update_elem(&dtab->map, &idx, &idx, 0);
-			if (err) {
-				dev_map_free(&dtab->map);
-				return err;
-			}
+		idx = netdev->ifindex;
+		err = dev_map_update_elem(&dtab->map, &idx, &idx, 0);
+		if (err) {
+			dev_map_free(&dtab->map);
+			return err;
 		}
 	}
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 629661db36ee..d6b9a7f3eb0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2528,6 +2528,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	 * for now.
 	 */
 	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_IDX:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -2600,6 +2601,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_DEVMAP_IDX &&
 		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
 		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
diff --git a/net/core/filter.c b/net/core/filter.c
index c709b1468bb6..305c9bbf1753 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3334,13 +3334,14 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	int err;
 
 	switch (map->map_type) {
-	case BPF_MAP_TYPE_DEVMAP: {
+	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_IDX: {
 		struct bpf_dtab_netdev *dst = fwd;
 
 		err = dev_map_enqueue(dst, xdp, dev_rx);
 		if (unlikely(err))
 			return err;
-		__dev_map_insert_ctx(map, index);
+		__dev_map_insert_ctx(map, dst);
 		break;
 	}
 	case BPF_MAP_TYPE_CPUMAP: {
@@ -3373,6 +3374,7 @@ void xdp_do_flush_map(void)
 	if (map) {
 		switch (map->map_type) {
 		case BPF_MAP_TYPE_DEVMAP:
+		case BPF_MAP_TYPE_DEVMAP_IDX:
 			__dev_map_flush(map);
 			break;
 		case BPF_MAP_TYPE_CPUMAP:
@@ -3393,6 +3395,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP:
 		return __dev_map_lookup_elem(map, index);
+	case BPF_MAP_TYPE_DEVMAP_IDX:
+		return __dev_map_idx_lookup_elem(map, index);
 	case BPF_MAP_TYPE_CPUMAP:
 		return __cpu_map_lookup_elem(map, index);
 	case BPF_MAP_TYPE_XSKMAP:
@@ -3483,7 +3487,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 		goto err;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
+	    map->map_type == BPF_MAP_TYPE_DEVMAP_IDX) {
 		struct bpf_dtab_netdev *dst = fwd;
 
 		err = dev_map_generic_redirect(dst, skb, xdp_prog);
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index e0c650d91784..0864ce33df94 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -37,6 +37,7 @@ const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_ARRAY_OF_MAPS]		= "array_of_maps",
 	[BPF_MAP_TYPE_HASH_OF_MAPS]		= "hash_of_maps",
 	[BPF_MAP_TYPE_DEVMAP]			= "devmap",
+	[BPF_MAP_TYPE_DEVMAP_IDX]		= "devmap_idx",
 	[BPF_MAP_TYPE_SOCKMAP]			= "sockmap",
 	[BPF_MAP_TYPE_CPUMAP]			= "cpumap",
 	[BPF_MAP_TYPE_XSKMAP]			= "xskmap",
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1777fa0c61e4..a09243a38c2d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 	BPF_MAP_TYPE_QUEUE,
 	BPF_MAP_TYPE_STACK,
+	BPF_MAP_TYPE_DEVMAP_IDX,
 };
 
 /* Note that tracing related programs such as
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 8c3a1c04dcb2..b87b760a1355 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -172,6 +172,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_IDX:
 	case BPF_MAP_TYPE_SOCKMAP:
 	case BPF_MAP_TYPE_CPUMAP:
 	case BPF_MAP_TYPE_XSKMAP:
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 3c627771f965..63681e4647f9 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -519,6 +519,21 @@ static void test_devmap(unsigned int task, void *data)
 	close(fd);
 }
 
+static void test_devmap_idx(unsigned int task, void *data)
+{
+	int fd;
+	__u32 key, value;
+
+	fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_IDX, sizeof(key), sizeof(value),
+			    2, 0);
+	if (fd < 0) {
+		printf("Failed to create devmap_idx '%s'!\n", strerror(errno));
+		exit(1);
+	}
+
+	close(fd);
+}
+
 static void test_queuemap(unsigned int task, void *data)
 {
 	const int MAP_SIZE = 32;
@@ -1686,6 +1701,7 @@ static void run_all_tests(void)
 	test_arraymap_percpu_many_keys();
 
 	test_devmap(0, NULL);
+	test_devmap_idx(0, NULL);
 	test_sockmap(0, NULL);
 
 	test_map_large();



* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
From: Jesper Dangaard Brouer @ 2019-02-21 15:19 UTC
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Daniel Borkmann, Alexei Starovoitov, brouer


You forgot a cover letter describing why we are doing this...
even though it should be obvious from the performance results ;-)


On Thu, 21 Feb 2019 12:56:54 +0100 Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> Before patch:
> xdp_redirect:     5426035 pkt/s
> xdp_redirect_map: 8412754 pkt/s
> 
> After patch:
> xdp_redirect:     8314702 pkt/s
> xdp_redirect_map: 8411854 pkt/s
> 
> This corresponds to a 53% increase in xdp_redirect performance, or a
> reduction in per-packet processing time by 64 nanoseconds.

(1/5426035-1/8314702)*10^9 = 64.0277 almost exactly 64 nanosec

(1/8412754-1/8411854)*10^9 = -0.012 => no regression for xdp_redirect_map

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
From: Jesper Dangaard Brouer @ 2019-02-21 15:23 UTC
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Daniel Borkmann, Alexei Starovoitov, brouer

On Thu, 21 Feb 2019 12:56:54 +0100
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> The default maps used by xdp_redirect() are changed to use the new map
> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
> but instead to 64 total simultaneous interfaces per network namespace. This
> also provides an easy way to compare the performance of devmap and
> devmap_idx:
> 
> xdp_redirect_map (devmap): 8394560 pkt/s
> xdp_redirect (devmap_idx): 8179480 pkt/s
> 
> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.

(1/8394560-1/8179480)*10^9 = -3.13239 ns

But was the xdp_redirect_map code-path affected by patch 1/2? 
(1/8412754-1/8394560)*10^9 = -0.2576 ns

It doesn't look like any performance regression to xdp_redirect_map
from these code changes :-)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
From: Toke Høiland-Jørgensen @ 2019-02-21 15:50 UTC
  To: Jesper Dangaard Brouer
  Cc: David Miller, netdev, Daniel Borkmann, Alexei Starovoitov, brouer

Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Thu, 21 Feb 2019 12:56:54 +0100
> Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
>> The default maps used by xdp_redirect() are changed to use the new map
>> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
>> but instead to 64 total simultaneous interfaces per network namespace. This
>> also provides an easy way to compare the performance of devmap and
>> devmap_idx:
>> 
>> xdp_redirect_map (devmap): 8394560 pkt/s
>> xdp_redirect (devmap_idx): 8179480 pkt/s
>> 
>> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.
>
> (1/8394560-1/8179480)*10^9 = -3.13239 ns
>
> But was the xdp_redirect_map code-path affected from patch 1/1? 
> (1/8412754-1/8394560)*10^9 = -0.2576 ns
>
> It doesn't look like any performance regression to xdp_redirect_map
> from these code changes :-)

Nope, the difference between the two patches is just random noise; the
numbers vary more between runs (or even between samples in the same
run), which is why I didn't mention that difference.

-Toke


* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
From: Toke Høiland-Jørgensen @ 2019-02-21 15:52 UTC
  To: Jesper Dangaard Brouer
  Cc: David Miller, netdev, Daniel Borkmann, Alexei Starovoitov, brouer

Jesper Dangaard Brouer <brouer@redhat.com> writes:

> You forgot at cover letter describing why we are doing this...
> even-though is should be obvious from the performance results ;-)

Well, I tried to put the motivation into the first paragraph of each
patch description instead of as a separate cover letter. I guess I could
have put it in a separate cover letter as well, but that was actually a
deliberate omission in this case ;)

-Toke


* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
From: Jakub Kicinski @ 2019-02-21 21:49 UTC
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:
> A common pattern when using xdp_redirect_map() is to create a device map
> where the lookup key is simply ifindex. Because device maps are arrays,
> this leaves holes in the map, and the map has to be sized to fit the
> largest ifindex, regardless of how many devices are actually needed in
> the map.
> 
> This patch adds a second type of device map where the key is interpreted as
> an ifindex and looked up using a hashmap, instead of being used as an array
> index. This leads to maps being densely packed, so they can be smaller.
> 
> The default maps used by xdp_redirect() are changed to use the new map
> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
> but instead to 64 total simultaneous interfaces per network namespace. This
> also provides an easy way to compare the performance of devmap and
> devmap_idx:
> 
> xdp_redirect_map (devmap): 8394560 pkt/s
> xdp_redirect (devmap_idx): 8179480 pkt/s
> 
> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.

Could you share what the ifindex mix was here, to arrive at these
numbers?  How does it compare to using an array but not keying with
ifindex?

> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>

> +static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
> +				   u64 map_flags)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +	struct bpf_dtab_netdev *dev, *old_dev;
> +	u32 idx = *(u32 *)key;
> +	u32 val = *(u32 *)value;
> +	u32 bit;
> +
> +	if (unlikely(map_flags > BPF_EXIST))
> +		return -EINVAL;
> +	if (unlikely(map_flags == BPF_NOEXIST))
> +		return -EEXIST;
> +
> +	old_dev = __dev_map_idx_lookup_elem(map, idx);
> +	if (!val) {
> +		if (!old_dev)
> +			return 0;

IMHO this is a fairly strange mix of array and hashmap semantics.  I
think you should stick to hashmap behaviour AFA flags and update/delete
goes.

> +		xchg(&dtab->netdev_map[old_dev->bit], NULL);
> +		spin_lock(&dtab->index_lock);
> +		hlist_del_rcu(&old_dev->index_hlist);
> +		spin_unlock(&dtab->index_lock);
> +
> +		clear_bit_unlock(old_dev->bit, dtab->bits_used);
> +		call_rcu(&old_dev->rcu, __dev_map_entry_free);
> +	} else {
> +		if (idx != val)
> +			return -EINVAL;
> +		if (old_dev)
> +			return 0;
> +		if (!__dev_map_find_bit(dtab, &bit))
> +			return -E2BIG;
> +		dev = __dev_map_alloc_node(dtab, idx, bit);
> +		if (IS_ERR(dev))
> +			return PTR_ERR(dev);
> +
> +		xchg(&dtab->netdev_map[bit], dev);
> +		spin_lock(&dtab->index_lock);
> +		hlist_add_head_rcu(&dev->index_hlist,
> +				   dev_map_index_hash(dtab, dev->ifindex));
> +		spin_unlock(&dtab->index_lock);
> +	}
> +	return 0;
> +}
> +
>  const struct bpf_map_ops dev_map_ops = {
>  	.map_alloc = dev_map_alloc,
>  	.map_free = dev_map_free,


* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
From: Toke Høiland-Jørgensen @ 2019-02-21 23:02 UTC
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:
>> A common pattern when using xdp_redirect_map() is to create a device map
>> where the lookup key is simply ifindex. Because device maps are arrays,
>> this leaves holes in the map, and the map has to be sized to fit the
>> largest ifindex, regardless of how many devices are actually needed in
>> the map.
>> 
>> This patch adds a second type of device map where the key is interpreted as
>> an ifindex and looked up using a hashmap, instead of being used as an array
>> index. This leads to maps being densely packed, so they can be smaller.
>> 
>> The default maps used by xdp_redirect() are changed to use the new map
>> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
>> but instead to 64 total simultaneous interfaces per network namespace. This
>> also provides an easy way to compare the performance of devmap and
>> devmap_idx:
>> 
>> xdp_redirect_map (devmap): 8394560 pkt/s
>> xdp_redirect (devmap_idx): 8179480 pkt/s
>> 
>> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.
>
> Could you share what the ifindex mix was here, to arrive at these
> numbers? How does it compare to using an array but not keying with
> ifindex?

Just the standard set on my test machine; ifindex 1 through 9, except 8
in this case. So certainly no more than 1 ifindex in each hash bucket
for those numbers.

>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>
>> +static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
>> +				   u64 map_flags)
>> +{
>> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> +	struct bpf_dtab_netdev *dev, *old_dev;
>> +	u32 idx = *(u32 *)key;
>> +	u32 val = *(u32 *)value;
>> +	u32 bit;
>> +
>> +	if (unlikely(map_flags > BPF_EXIST))
>> +		return -EINVAL;
>> +	if (unlikely(map_flags == BPF_NOEXIST))
>> +		return -EEXIST;
>> +
>> +	old_dev = __dev_map_idx_lookup_elem(map, idx);
>> +	if (!val) {
>> +		if (!old_dev)
>> +			return 0;
>
> IMHO this is a fairly strange mix of array and hashmap semantics. I
> think you should stick to hashmap behaviour AFA flags and
> update/delete goes.

Yeah, the double book-keeping is a bit strange, but it allows the actual
forwarding and flush code to be reused between both types of maps. I
think this is worth the slight semantic confusion :)

-Toke


* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
From: Jakub Kicinski @ 2019-02-22  0:32 UTC
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Fri, 22 Feb 2019 00:02:23 +0100, Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
> 
> > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:  
> >> A common pattern when using xdp_redirect_map() is to create a device map
> >> where the lookup key is simply ifindex. Because device maps are arrays,
> >> this leaves holes in the map, and the map has to be sized to fit the
> >> largest ifindex, regardless of how many devices are actually needed in
> >> the map.
> >> 
> >> This patch adds a second type of device map where the key is interpreted as
> >> an ifindex and looked up using a hashmap, instead of being used as an array
> >> index. This leads to maps being densely packed, so they can be smaller.
> >> 
> >> The default maps used by xdp_redirect() are changed to use the new map
> >> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
> >> but instead to 64 total simultaneous interfaces per network namespace. This
> >> also provides an easy way to compare the performance of devmap and
> >> devmap_idx:
> >> 
> >> xdp_redirect_map (devmap): 8394560 pkt/s
> >> xdp_redirect (devmap_idx): 8179480 pkt/s
> >> 
> >> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.  
> >
> > Could you share what the ifindex mix was here, to arrive at these
> > numbers? How does it compare to using an array but not keying with
> > ifindex?  
> 
> Just the standard set on my test machine; ifindex 1 through 9, except 8
> in this case. So certainly no more than 1 ifindex in each hash bucket
> for those numbers.

Oh, I clearly misread your numbers - it's still slower than the array,
you just don't need the size limit.

> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>  
> >  
> >> +static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
> >> +				   u64 map_flags)
> >> +{
> >> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> +	struct bpf_dtab_netdev *dev, *old_dev;
> >> +	u32 idx = *(u32 *)key;
> >> +	u32 val = *(u32 *)value;
> >> +	u32 bit;
> >> +
> >> +	if (unlikely(map_flags > BPF_EXIST))
> >> +		return -EINVAL;
> >> +	if (unlikely(map_flags == BPF_NOEXIST))
> >> +		return -EEXIST;
> >> +
> >> +	old_dev = __dev_map_idx_lookup_elem(map, idx);
> >> +	if (!val) {
> >> +		if (!old_dev)
> >> +			return 0;  
> >
> > IMHO this is a fairly strange mix of array and hashmap semantics. I
> > think you should stick to hashmap behaviour AFA flags and
> > update/delete goes.  
> 
> Yeah, the double book-keeping is a bit strange, but it allows the actual
> forwarding and flush code to be reused between both types of maps. I
> think this is worth the slight semantic confusion :)

I'm not sure I was clear, let me try again :)  Your get_next_key only
reports existing indexes if I read the code right, so that's not an
array - in an array indexes always exist.  It follows that inserting 0
should not be equivalent to a delete, and BPF_NOEXIST should be handled
appropriately.

Different maps behave differently, I think it's worth trying to limit
the divergence in how things behave to the basic array and a hashmap
models when possible.
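
For reference, the conventional flag handling for hash-type maps looks
roughly like this (a sketch paraphrasing check_flags() in
kernel/bpf/hashtab.c; not code from this patch):

	/* old_entry: existing element, or NULL; map_flags: from userspace */
	static int check_flags(void *old_entry, u64 map_flags)
	{
		if (map_flags > BPF_EXIST)
			return -EINVAL;	/* unknown flags */
		if (old_entry && map_flags == BPF_NOEXIST)
			return -EEXIST;	/* exists, caller wanted create-only */
		if (!old_entry && map_flags == BPF_EXIST)
			return -ENOENT;	/* missing, caller wanted update-only */
		/* ok to insert or replace; deletes only via delete_elem */
		return 0;
	}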


* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
From: Jakub Kicinski @ 2019-02-22  0:36 UTC
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index b63bc77af2d1..629661db36ee 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7527,6 +7527,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>  			prog->dst_needed = 1;
>  		if (insn->imm == BPF_FUNC_get_prandom_u32)
>  			bpf_user_rnd_init_once();
> +		if (insn->imm == BPF_FUNC_redirect) {
> +			int err = dev_map_alloc_default_map();
> +
> +			if (err)
> +				return err;
> +		}
>  		if (insn->imm == BPF_FUNC_override_return)
>  			prog->kprobe_override = 1;
>  		if (insn->imm == BPF_FUNC_tail_call) {

> +int dev_map_alloc_default_map(void)
> +{
> +	struct net *net = current->nsproxy->net_ns;
> +	struct bpf_dtab *dtab, *old_dtab;
> +	struct net_device *netdev;
> +	union bpf_attr attr = {};
> +	u32 idx;
> +	int err;

BPF programs don't abide by netns boundaries.  The fact the program is
verified in one ns doesn't mean this is the only ns it will be used in :(
Meaning if any program is using the redirect map you may need a secret
map in every ns.. no?


* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
From: Toke Høiland-Jørgensen @ 2019-02-22  9:47 UTC
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Fri, 22 Feb 2019 00:02:23 +0100, Toke Høiland-Jørgensen wrote:
>> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
>> 
>> > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:  
>> >> A common pattern when using xdp_redirect_map() is to create a device map
>> >> where the lookup key is simply ifindex. Because device maps are arrays,
>> >> this leaves holes in the map, and the map has to be sized to fit the
>> >> largest ifindex, regardless of how many devices are actually needed in
>> >> the map.
>> >> 
>> >> This patch adds a second type of device map where the key is interpreted as
>> >> an ifindex and looked up using a hashmap, instead of being used as an array
>> >> index. This leads to maps being densely packed, so they can be smaller.
>> >> 
>> >> The default maps used by xdp_redirect() are changed to use the new map
>> >> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
>> >> but instead to 64 total simultaneous interfaces per network namespace. This
>> >> also provides an easy way to compare the performance of devmap and
>> >> devmap_idx:
>> >> 
>> >> xdp_redirect_map (devmap): 8394560 pkt/s
>> >> xdp_redirect (devmap_idx): 8179480 pkt/s
>> >> 
>> >> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.  
>> >
>> > Could you share what the ifindex mix was here, to arrive at these
>> > numbers? How does it compare to using an array but not keying with
>> > ifindex?  
>> 
>> Just the standard set on my test machine; ifindex 1 through 9, except 8
>> in this case. So certainly no more than 1 ifindex in each hash bucket
>> for those numbers.
>
> Oh, I clearly misread your numbers - it's still slower than the array,
> you just don't need the size limit.

Yeah, this is not about speeding up devmap, it's about lifting the size
restriction.

>> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>  
>> >  
>> >> +static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
>> >> +				   u64 map_flags)
>> >> +{
>> >> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> >> +	struct bpf_dtab_netdev *dev, *old_dev;
>> >> +	u32 idx = *(u32 *)key;
>> >> +	u32 val = *(u32 *)value;
>> >> +	u32 bit;
>> >> +
>> >> +	if (unlikely(map_flags > BPF_EXIST))
>> >> +		return -EINVAL;
>> >> +	if (unlikely(map_flags == BPF_NOEXIST))
>> >> +		return -EEXIST;
>> >> +
>> >> +	old_dev = __dev_map_idx_lookup_elem(map, idx);
>> >> +	if (!val) {
>> >> +		if (!old_dev)
>> >> +			return 0;  
>> >
>> > IMHO this is a fairly strange mix of array and hashmap semantics. I
>> > think you should stick to hashmap behaviour AFA flags and
>> > update/delete goes.  
>> 
>> Yeah, the double book-keeping is a bit strange, but it allows the actual
>> forwarding and flush code to be reused between both types of maps. I
>> think this is worth the slight semantic confusion :)
>
> I'm not sure I was clear, let me try again :) Your get_next_key only
> reports existing indexes if I read the code right, so that's not an
> array - in an array indexes always exist. It follows that inserting 0
> should not be equivalent to a delete, and BPF_NOEXIST should be handled
> appropriately.

Ah, I see what you mean. Yeah, sure, I guess I can restrict deletion to
only working through explicit delete.

I could also add a fail on NOEXIST, but since each index is tied to a
particular value, you can't actually change the contents of each index,
only insert and remove. So why would you ever set that flag?

> Different maps behave differently, I think it's worth trying to limit
> the divergence in how things behave to the basic array and a hashmap
> models when possible.

So I don't actually think of this as a hashmap in the general sense;
after all, you can only store ifindexes in it, and key and value are
tied to one another. So it's an ifindex'ed devmap (which is also why I
named it devmap_idx and not devmap_hash); the fact that it's implemented
as a hashmap is just incidental.

So I guess it's a choice between being consistent with the other devmap
type, or with a general hashmap. I'm not actually sure that the latter
is less surprising? :)
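
To illustrate the coupling, a minimal userspace sketch using libbpf (the
device name and map fd are hypothetical):

#include <net/if.h>
#include <bpf/bpf.h>

static void add_and_remove(int map_fd)
{
	__u32 ifindex = if_nametoindex("eth1"); /* hypothetical device */

	/* Key and value are both the ifindex, so an entry can only be
	 * inserted or removed, never changed in place. */
	bpf_map_update_elem(map_fd, &ifindex, &ifindex, 0);
	bpf_map_delete_elem(map_fd, &ifindex);
}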

-Toke

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-02-22  0:36 ` Jakub Kicinski
@ 2019-02-22 10:13   ` Toke Høiland-Jørgensen
  2019-02-22 21:37     ` Jakub Kicinski
  0 siblings, 1 reply; 21+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-02-22 10:13 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index b63bc77af2d1..629661db36ee 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -7527,6 +7527,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>>  			prog->dst_needed = 1;
>>  		if (insn->imm == BPF_FUNC_get_prandom_u32)
>>  			bpf_user_rnd_init_once();
>> +		if (insn->imm == BPF_FUNC_redirect) {
>> +			int err = dev_map_alloc_default_map();
>> +
>> +			if (err)
>> +				return err;
>> +		}
>>  		if (insn->imm == BPF_FUNC_override_return)
>>  			prog->kprobe_override = 1;
>>  		if (insn->imm == BPF_FUNC_tail_call) {
>
>> +int dev_map_alloc_default_map(void)
>> +{
>> +	struct net *net = current->nsproxy->net_ns;
>> +	struct bpf_dtab *dtab, *old_dtab;
>> +	struct net_device *netdev;
>> +	union bpf_attr attr = {};
>> +	u32 idx;
>> +	int err;
>
> BPF programs don't abide by netns boundaries.  The fact the program is
> verified in one ns doesn't mean this is the only ns it will be used in :(
> Meaning if any program is using the redirect map you may need a secret
> map in every ns.. no?

Ah, yes, good point. Totally didn't think about the fact that load and
attach are decoupled. Hmm, guess I'll just have to move the call to
alloc_default_map() to the point where the program is attached to an
interface, then...

I trust it's safe to skip the allocation in case the program is
offloaded to hardware, right? I.e., an offloaded program will never need
to fall back to the kernel helper?

-Toke

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
  2019-02-22  9:47         ` Toke Høiland-Jørgensen
@ 2019-02-22 21:30           ` Jakub Kicinski
  2019-02-23 11:52             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 21+ messages in thread
From: Jakub Kicinski @ 2019-02-22 21:30 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Fri, 22 Feb 2019 10:47:10 +0100, Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
> 
> > On Fri, 22 Feb 2019 00:02:23 +0100, Toke Høiland-Jørgensen wrote:  
> >> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
> >>   
> >> > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:    
> >> >> A common pattern when using xdp_redirect_map() is to create a device map
> >> >> where the lookup key is simply ifindex. Because device maps are arrays,
> >> >> this leaves holes in the map, and the map has to be sized to fit the
> >> >> largest ifindex, regardless of how many devices are actually needed
> >> >> in the map.
> >> >> 
> >> >> This patch adds a second type of device map where the key is interpreted as
> >> >> an ifindex and looked up using a hashmap, instead of being used as an array
> >> >> index. This leads to maps being densely packed, so they can be smaller.
> >> >> 
> >> >> The default maps used by xdp_redirect() are changed to use the new map
> >> >> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
> >> >> but instead to 64 total simultaneous interfaces per network namespace. This
> >> >> also provides an easy way to compare the performance of devmap and
> >> >> devmap_idx:
> >> >> 
> >> >> xdp_redirect_map (devmap): 8394560 pkt/s
> >> >> xdp_redirect (devmap_idx): 8179480 pkt/s
> >> >> 
> >> >> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.    
> >> >
> >> > Could you share what the ifindex mix was here, to arrive at these
> >> > numbers? How does it compare to using an array but not keying with
> >> > ifindex?    
> >> 
> >> Just the standard set on my test machine; ifindex 1 through 9, except 8
> >> in this case. So certainly no more than 1 ifindex in each hash bucket
> >> for those numbers.  
> >
> > Oh, I clearly misread your numbers, it's still slower than array, you
> > just don't need the size limit.  
> 
> Yeah, this is not about speeding up devmap, it's about lifting the size
> restriction.
> 
> >> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>    
> >> >    
> >> >> +static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
> >> >> +				   u64 map_flags)
> >> >> +{
> >> >> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> >> +	struct bpf_dtab_netdev *dev, *old_dev;
> >> >> +	u32 idx = *(u32 *)key;
> >> >> +	u32 val = *(u32 *)value;
> >> >> +	u32 bit;
> >> >> +
> >> >> +	if (unlikely(map_flags > BPF_EXIST))
> >> >> +		return -EINVAL;
> >> >> +	if (unlikely(map_flags == BPF_NOEXIST))
> >> >> +		return -EEXIST;
> >> >> +
> >> >> +	old_dev = __dev_map_idx_lookup_elem(map, idx);
> >> >> +	if (!val) {
> >> >> +		if (!old_dev)
> >> >> +			return 0;    
> >> >
> >> > IMHO this is a fairly strange mix of array and hashmap semantics. I
> >> > think you should stick to hashmap behaviour as far as flags and
> >> > update/delete goes.    
> >> 
> >> Yeah, the double book-keeping is a bit strange, but it allows the actual
> >> forwarding and flush code to be reused between both types of maps. I
> >> think this is worth the slight semantic confusion :)  
> >
> > I'm not sure I was clear, let me try again :) Your get_next_key only
> > reports existing indexes if I read the code right, so that's not an
> > array - in an array indexes always exist. It follows that inserting 0
> > should not be equivalent to a delete, and BPF_NOEXIST should be handled
> > appropriately.
> 
> Ah, I see what you mean. Yeah, sure, I guess I can restrict deletion to
> only working through explicit delete.
> 
> I could also add a fail on NOEXIST, but since each index is tied to a
> particular value, you can't actually change the contents of each index,
> only insert and remove. So why would you ever set that flag?

The reason a user would have for setting the flag is not clear :)  But
if you want to reject it because it's unsupported/makes no sense, you
should do EINVAL, not EEXIST ;)

> > Different maps behave differently, I think it's worth trying to limit
> > the divergence in how things behave to the basic array and a hashmap
> > models when possible.  
> 
> So I don't actually think of this as a hashmap in the general sense;
> after all, you can only store ifindexes in it, and key and value are
> tied to one another. So it's an ifindex'ed devmap (which is also why I
> named it devmap_idx and not devmap_hash); the fact that it's implemented
> as a hashmap is just incidental.
> 
> So I guess it's a choice between being consistent with the other devmap
> type, or with a general hashmap. I'm not actually sure that the latter
> is less surprising? :)

The distinction is that if an entry is not in the map, get_next won't
return its key.  As you say, the construct is not really a hash map
(probably a set is the closest), but it most definitely is not an
array, so no hard EEXIST on the NOEXIST flag :)
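
Concretely, hashmap-consistent flag handling in the update function quoted
above might look like this (a sketch mirroring what hashmaps do, not the
final patch):

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;	/* unknown flag bits */

	old_dev = __dev_map_idx_lookup_elem(map, idx);
	if (unlikely(map_flags == BPF_NOEXIST && old_dev))
		return -EEXIST;	/* element exists, NOEXIST asked */
	if (unlikely(map_flags == BPF_EXIST && !old_dev))
		return -ENOENT;	/* element missing, EXIST asked */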

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-02-22 10:13   ` Toke Høiland-Jørgensen
@ 2019-02-22 21:37     ` Jakub Kicinski
  2019-02-23 10:43       ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 21+ messages in thread
From: Jakub Kicinski @ 2019-02-22 21:37 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Fri, 22 Feb 2019 11:13:50 +0100, Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
> > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:  
> >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >> index b63bc77af2d1..629661db36ee 100644
> >> --- a/kernel/bpf/verifier.c
> >> +++ b/kernel/bpf/verifier.c
> >> @@ -7527,6 +7527,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
> >>  			prog->dst_needed = 1;
> >>  		if (insn->imm == BPF_FUNC_get_prandom_u32)
> >>  			bpf_user_rnd_init_once();
> >> +		if (insn->imm == BPF_FUNC_redirect) {
> >> +			int err = dev_map_alloc_default_map();
> >> +
> >> +			if (err)
> >> +				return err;
> >> +		}
> >>  		if (insn->imm == BPF_FUNC_override_return)
> >>  			prog->kprobe_override = 1;
> >>  		if (insn->imm == BPF_FUNC_tail_call) {  
> >  
> >> +int dev_map_alloc_default_map(void)
> >> +{
> >> +	struct net *net = current->nsproxy->net_ns;
> >> +	struct bpf_dtab *dtab, *old_dtab;
> >> +	struct net_device *netdev;
> >> +	union bpf_attr attr = {};
> >> +	u32 idx;
> >> +	int err;  
> >
> > BPF programs don't abide by netns boundaries.  The fact the program is
> > verified in one ns doesn't mean this is the only ns it will be used in :(
> > Meaning if any program is using the redirect map you may need a secret
> > map in every ns.. no?  
> 
> Ah, yes, good point. Totally didn't think about the fact that load and
> attach are decoupled. Hmm, guess I'll just have to move the call to
> alloc_default_map() to the point where the program is attached to an
> interface, then...

Possibly.. and you also need to handle the case where an interface with a
program attached is moved, no?

> I trust it's safe to skip the allocation in case the program is
> offloaded to hardware, right? I.e., an offloaded program will never need
> to fall back to the kernel helper?

We will cross that bridge when we get there ;)  I'd definitely want the
ability to do a redirect to a non-offloaded netdev (e.g. redirect to a
veth) via some fallback, but the plan is to try to only add support for
the map version of redirect on offload, anyway.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-02-22 21:37     ` Jakub Kicinski
@ 2019-02-23 10:43       ` Jesper Dangaard Brouer
  2019-02-23 12:11         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2019-02-23 10:43 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Toke Høiland-Jørgensen, David Miller, netdev,
	Daniel Borkmann, Alexei Starovoitov, brouer


On Fri, 22 Feb 2019 13:37:34 -0800 Jakub Kicinski <jakub.kicinski@netronome.com> wrote:

> On Fri, 22 Feb 2019 11:13:50 +0100, Toke Høiland-Jørgensen wrote:
> > Jakub Kicinski <jakub.kicinski@netronome.com> writes:  
> > > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:    
[...]
> > >
> > > BPF programs don't abide by netns boundaries.  The fact the program is
> > > verified in one ns doesn't mean this is the only ns it will be used in :(
> > > Meaning if any program is using the redirect map you may need a secret
> > > map in every ns.. no?    
> > 
> > Ah, yes, good point. Totally didn't think about the fact that load and
> > attach are decoupled. Hmm, guess I'll just have to move the call to
> > alloc_default_map() to the point where the program is attached to an
> > interface, then...  
> 
> Possibly.. and you also need to handle the case where an interface with a
> program attached is moved, no?

True, we need to handle if e.g. a veth gets an XDP program attached and
then is moved into a network namespace (as I've already explained to
Toke in a meeting).

I'm still not sure how to handle this...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
  2019-02-22 21:30           ` Jakub Kicinski
@ 2019-02-23 11:52             ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 21+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-02-23 11:52 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Fri, 22 Feb 2019 10:47:10 +0100, Toke Høiland-Jørgensen wrote:
>> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
>> 
>> > On Fri, 22 Feb 2019 00:02:23 +0100, Toke Høiland-Jørgensen wrote:  
>> >> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
>> >>   
>> >> > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:    
>> >> >> A common pattern when using xdp_redirect_map() is to create a device map
>> >> >> where the lookup key is simply ifindex. Because device maps are arrays,
>> >> >> this leaves holes in the map, and the map has to be sized to fit the
>> >> >> largest ifindex, regardless of how many devices are actually needed
>> >> >> in the map.
>> >> >> 
>> >> >> This patch adds a second type of device map where the key is interpreted as
>> >> >> an ifindex and looked up using a hashmap, instead of being used as an array
>> >> >> index. This leads to maps being densely packed, so they can be smaller.
>> >> >> 
>> >> >> The default maps used by xdp_redirect() are changed to use the new map
>> >> >> type, which means that xdp_redirect() is no longer limited to ifindex < 64,
>> >> >> but instead to 64 total simultaneous interfaces per network namespace. This
>> >> >> also provides an easy way to compare the performance of devmap and
>> >> >> devmap_idx:
>> >> >> 
>> >> >> xdp_redirect_map (devmap): 8394560 pkt/s
>> >> >> xdp_redirect (devmap_idx): 8179480 pkt/s
>> >> >> 
>> >> >> Difference: 215080 pkt/s or 3.1 nanoseconds per packet.    
>> >> >
>> >> > Could you share what the ifindex mix was here, to arrive at these
>> >> > numbers? How does it compare to using an array but not keying with
>> >> > ifindex?    
>> >> 
>> >> Just the standard set on my test machine; ifindex 1 through 9, except 8
>> >> in this case. So certainly no more than 1 ifindex in each hash bucket
>> >> for those numbers.  
>> >
>> > Oh, I clearly misread your numbers, it's still slower than array, you
>> > just don't need the size limit.  
>> 
>> Yeah, this is not about speeding up devmap, it's about lifting the size
>> restriction.
>> 
>> >> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>    
>> >> >    
>> >> >> +static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
>> >> >> +				   u64 map_flags)
>> >> >> +{
>> >> >> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> >> >> +	struct bpf_dtab_netdev *dev, *old_dev;
>> >> >> +	u32 idx = *(u32 *)key;
>> >> >> +	u32 val = *(u32 *)value;
>> >> >> +	u32 bit;
>> >> >> +
>> >> >> +	if (unlikely(map_flags > BPF_EXIST))
>> >> >> +		return -EINVAL;
>> >> >> +	if (unlikely(map_flags == BPF_NOEXIST))
>> >> >> +		return -EEXIST;
>> >> >> +
>> >> >> +	old_dev = __dev_map_idx_lookup_elem(map, idx);
>> >> >> +	if (!val) {
>> >> >> +		if (!old_dev)
>> >> >> +			return 0;    
>> >> >
>> >> > IMHO this is a fairly strange mix of array and hashmap semantics. I
>> >> > think you should stick to hashmap behaviour as far as flags and
>> >> > update/delete goes.    
>> >> 
>> >> Yeah, the double book-keeping is a bit strange, but it allows the actual
>> >> forwarding and flush code to be reused between both types of maps. I
>> >> think this is worth the slight semantic confusion :)  
>> >
>> > I'm not sure I was clear, let me try again :) Your get_next_key only
>> > reports existing indexes if I read the code right, so that's not an
>> > array - in an array indexes always exist. It follows that inserting 0
>> > should not be equivalent to a delete, and BPF_NOEXIST should be handled
>> > appropriately.
>> 
>> Ah, I see what you mean. Yeah, sure, I guess I can restrict deletion to
>> only working through explicit delete.
>> 
>> I could also add a fail on NOEXIST, but since each index is tied to a
>> particular value, you can't actually change the contents of each index,
>> only insert and remove. So why would you ever set that flag?
>
> The reason a user would have for setting the flag is not clear :)  But
> if you want to reject it because it's unsupported/makes no sense, you
> should do EINVAL, not EEXIST ;)
>
>> > Different maps behave differently, I think it's worth trying to limit
>> > the divergence in how things behave to the basic array and a hashmap
>> > models when possible.  
>> 
>> So I don't actually think of this as a hashmap in the general sense;
>> after all, you can only store ifindexes in it, and key and value are
>> tied to one another. So it's an ifindex'ed devmap (which is also why I
>> named it devmap_idx and not devmap_hash); the fact that it's implemented
>> as a hashmap is just incidental.
>> 
>> So I guess it's a choice between being consistent with the other devmap
>> type, or with a general hashmap. I'm not actually sure that the latter
>> is less surprising? :)
>
> The distinction is that if an entry is not in the map, get_next won't
> return its key.  As you say, the construct is not really a hash map
> (probably a set is the closest), but it most definitely is not an
> array, so no hard EEXIST on the NOEXIST flag :)

Yeah, I thought about it some more and I agree it makes sense to change
the update semantics to be a bit more hashmap-like :)

-Toke

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-02-23 10:43       ` Jesper Dangaard Brouer
@ 2019-02-23 12:11         ` Toke Høiland-Jørgensen
  2019-02-25 18:47           ` Jakub Kicinski
  0 siblings, 1 reply; 21+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-02-23 12:11 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Jakub Kicinski
  Cc: David Miller, netdev, Daniel Borkmann, Alexei Starovoitov, brouer

Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Fri, 22 Feb 2019 13:37:34 -0800 Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
>
>> On Fri, 22 Feb 2019 11:13:50 +0100, Toke Høiland-Jørgensen wrote:
>> > Jakub Kicinski <jakub.kicinski@netronome.com> writes:  
>> > > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:    
> [...]
>> > >
>> > > BPF programs don't abide by netns boundaries.  The fact the program is
>> > > verified in one ns doesn't mean this is the only ns it will be used in :(
>> > > Meaning if any program is using the redirect map you may need a secret
>> > > map in every ns.. no?    
>> > 
>> > Ah, yes, good point. Totally didn't think about the fact that load and
>> > attach are decoupled. Hmm, guess I'll just have to move the call to
>> > alloc_default_map() to the point where the program is attached to an
>> > interface, then...  
>> 
>> Possibly.. and you also need to handle the case where an interface with a
>> program attached is moved, no?

Yup, alloc on attach was easy enough; the moving turns out to be the
tricky part :)

> True, we need to handle if e.g. a veth gets an XDP program attached and
> then is moved into a network namespace (as I've already explained to
> Toke in a meeting).

Yeah, I had somehow convinced myself that the XDP program was being
removed when the interface was being torn down before moving between
namespaces. Jesper pointed out that this was not in fact the case... :P

> I'm still not sure how to handle this...

There are a couple of options, I think. At least:

1. Maintain a flag on struct net_device indicating that this device
   needs the redirect map allocated, and react to that when interfaces
   are being moved.

2. Lookup the BPF program by ID (which we can get from the driver) on
   move, and react to the program flag.

3. Keep the allocation on program load, but allocate maps for all active
   namespaces (which would probably need a refcnt mechanism to
   deallocate things again).

I think I'm leaning towards #2; possibly combined with a refcnt so we
can actually deallocate the map in the root namespace when it's not
needed anymore.
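
A rough sketch of what #2 could look like, hooked into the netdev notifier
chain; the query helper and the program flag are hypothetical, only
dev_map_alloc_default_map() exists in patch 1:

static int xdp_default_map_notifier(struct notifier_block *nb,
				    unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	u32 prog_id;

	/* A device moving netns unregisters and re-registers itself. */
	if (event != NETDEV_REGISTER)
		return NOTIFY_DONE;

	/* Hypothetical: ask the driver for the attached XDP program ID. */
	if (xdp_attached_prog_id(dev, &prog_id) || !prog_id)
		return NOTIFY_DONE;

	/* Hypothetical flag, set at verification time when the program
	 * uses the non-map bpf_redirect() helper. */
	if (bpf_prog_id_uses_default_devmap(prog_id))
		return notifier_from_errno(dev_map_alloc_default_map());

	return NOTIFY_DONE;
}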

-Toke

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
  2019-02-21 11:56 ` [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
  2019-02-21 15:23   ` Jesper Dangaard Brouer
  2019-02-21 21:49   ` Jakub Kicinski
@ 2019-02-23 23:19   ` kbuild test robot
  2019-02-23 23:28   ` kbuild test robot
  3 siblings, 0 replies; 21+ messages in thread
From: kbuild test robot @ 2019-02-23 23:19 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: kbuild-all, David Miller, netdev, Jesper Dangaard Brouer,
	Daniel Borkmann, Alexei Starovoitov

[-- Attachment #1: Type: text/plain, Size: 2605 bytes --]

Hi Toke,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Toke-H-iland-J-rgensen/xdp-Always-use-a-devmap-for-XDP_REDIRECT-to-a-device/20190224-054907
config: i386-randconfig-a1-201907 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   net/core/filter.c: In function '__bpf_tx_xdp_map':
>> net/core/filter.c:3355:29: warning: passing argument 2 of '__dev_map_insert_ctx' from incompatible pointer type
      __dev_map_insert_ctx(map, dst);
                                ^
   In file included from include/linux/bpf-cgroup.h:5:0,
                    from include/linux/cgroup-defs.h:22,
                    from include/linux/cgroup.h:28,
                    from include/net/netprio_cgroup.h:17,
                    from include/linux/netdevice.h:46,
                    from include/net/sock.h:51,
                    from include/linux/sock_diag.h:8,
                    from net/core/filter.c:29:
   include/linux/bpf.h:715:20: note: expected 'struct net_device *' but argument is of type 'struct bpf_dtab_netdev *'
    static inline void __dev_map_insert_ctx(struct bpf_map *map,
                       ^

vim +/__dev_map_insert_ctx +3355 net/core/filter.c

  3339	
  3340	static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
  3341				    struct bpf_map *map,
  3342				    struct xdp_buff *xdp,
  3343				    u32 index)
  3344	{
  3345		int err;
  3346	
  3347		switch (map->map_type) {
  3348		case BPF_MAP_TYPE_DEVMAP:
  3349		case BPF_MAP_TYPE_DEVMAP_IDX: {
  3350			struct bpf_dtab_netdev *dst = fwd;
  3351	
  3352			err = dev_map_enqueue(dst, xdp, dev_rx);
  3353			if (unlikely(err))
  3354				return err;
> 3355			__dev_map_insert_ctx(map, dst);
  3356			break;
  3357		}
  3358		case BPF_MAP_TYPE_CPUMAP: {
  3359			struct bpf_cpu_map_entry *rcpu = fwd;
  3360	
  3361			err = cpu_map_enqueue(rcpu, xdp, dev_rx);
  3362			if (unlikely(err))
  3363				return err;
  3364			__cpu_map_insert_ctx(map, index);
  3365			break;
  3366		}
  3367		case BPF_MAP_TYPE_XSKMAP: {
  3368			struct xdp_sock *xs = fwd;
  3369	
  3370			err = __xsk_map_redirect(map, xdp, xs);
  3371			return err;
  3372		}
  3373		default:
  3374			break;
  3375		}
  3376		return 0;
  3377	}
  3378	
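
The mismatch is in the !CONFIG_BPF_SYSCALL stub in include/linux/bpf.h,
which still has the old signature; a likely fixup (sketch) keeps the stub
in sync with the patched callers:

struct bpf_dtab_netdev; /* forward declaration suffices for the stub */

static inline void __dev_map_insert_ctx(struct bpf_map *map,
					struct bpf_dtab_netdev *dst)
{
}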

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27631 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex
  2019-02-21 11:56 ` [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
                     ` (2 preceding siblings ...)
  2019-02-23 23:19   ` kbuild test robot
@ 2019-02-23 23:28   ` kbuild test robot
  3 siblings, 0 replies; 21+ messages in thread
From: kbuild test robot @ 2019-02-23 23:28 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: kbuild-all, David Miller, netdev, Jesper Dangaard Brouer,
	Daniel Borkmann, Alexei Starovoitov

[-- Attachment #1: Type: text/plain, Size: 2691 bytes --]

Hi Toke,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Toke-H-iland-J-rgensen/xdp-Always-use-a-devmap-for-XDP_REDIRECT-to-a-device/20190224-054907
config: i386-randconfig-l1-02240344 (attached as .config)
compiler: gcc-5 (Debian 5.5.0-3) 5.4.1 20171010
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   net//core/filter.c: In function '__bpf_tx_xdp_map':
>> net//core/filter.c:3355:29: error: passing argument 2 of '__dev_map_insert_ctx' from incompatible pointer type [-Werror=incompatible-pointer-types]
      __dev_map_insert_ctx(map, dst);
                                ^
   In file included from include/linux/bpf-cgroup.h:5:0,
                    from include/linux/cgroup-defs.h:22,
                    from include/linux/cgroup.h:28,
                    from include/net/netprio_cgroup.h:17,
                    from include/linux/netdevice.h:46,
                    from include/net/sock.h:51,
                    from include/linux/sock_diag.h:8,
                    from net//core/filter.c:29:
   include/linux/bpf.h:715:20: note: expected 'struct net_device *' but argument is of type 'struct bpf_dtab_netdev *'
    static inline void __dev_map_insert_ctx(struct bpf_map *map,
                       ^
   cc1: some warnings being treated as errors

vim +/__dev_map_insert_ctx +3355 net//core/filter.c

  3339	
  3340	static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
  3341				    struct bpf_map *map,
  3342				    struct xdp_buff *xdp,
  3343				    u32 index)
  3344	{
  3345		int err;
  3346	
  3347		switch (map->map_type) {
  3348		case BPF_MAP_TYPE_DEVMAP:
  3349		case BPF_MAP_TYPE_DEVMAP_IDX: {
  3350			struct bpf_dtab_netdev *dst = fwd;
  3351	
  3352			err = dev_map_enqueue(dst, xdp, dev_rx);
  3353			if (unlikely(err))
  3354				return err;
> 3355			__dev_map_insert_ctx(map, dst);
  3356			break;
  3357		}
  3358		case BPF_MAP_TYPE_CPUMAP: {
  3359			struct bpf_cpu_map_entry *rcpu = fwd;
  3360	
  3361			err = cpu_map_enqueue(rcpu, xdp, dev_rx);
  3362			if (unlikely(err))
  3363				return err;
  3364			__cpu_map_insert_ctx(map, index);
  3365			break;
  3366		}
  3367		case BPF_MAP_TYPE_XSKMAP: {
  3368			struct xdp_sock *xs = fwd;
  3369	
  3370			err = __xsk_map_redirect(map, xdp, xs);
  3371			return err;
  3372		}
  3373		default:
  3374			break;
  3375		}
  3376		return 0;
  3377	}
  3378	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29026 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-02-23 12:11         ` Toke Høiland-Jørgensen
@ 2019-02-25 18:47           ` Jakub Kicinski
  2019-02-26 11:00             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 21+ messages in thread
From: Jakub Kicinski @ 2019-02-25 18:47 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Jesper Dangaard Brouer, David Miller, netdev, Daniel Borkmann,
	Alexei Starovoitov

On Sat, 23 Feb 2019 13:11:02 +0100, Toke Høiland-Jørgensen wrote:
> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> > On Fri, 22 Feb 2019 13:37:34 -0800 Jakub Kicinski wrote:
> >> On Fri, 22 Feb 2019 11:13:50 +0100, Toke Høiland-Jørgensen wrote:  
> >> > Jakub Kicinski <jakub.kicinski@netronome.com> writes:    
> >> > > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:      
> > [...]  
> >> > >
> >> > > BPF programs don't abide by netns boundaries.  The fact the program is
> >> > > verified in one ns doesn't mean this is the only ns it will be used in :(
> >> > > Meaning if any program is using the redirect map you may need a secret
> >> > > map in every ns.. no?      
> >> > 
> >> > Ah, yes, good point. Totally didn't think about the fact that load and
> >> > attach are decoupled. Hmm, guess I'll just have to move the call to
> >> > alloc_default_map() to the point where the program is attached to an
> >> > interface, then...    
> >> 
> >> Possibly.. and you also need to handle the case where an interface with a
> >> program attached is moved, no?  
> 
> Yup, alloc on attach was easy enough; the moving turns out to be the
> tricky part :)
> 
> > True, we need to handle if e.g. a veth gets an XDP program attached and
> > then is moved into a network namespace (as I've already explained to
> > Toke in a meeting).  
> 
> Yeah, I had somehow convinced myself that the XDP program was being
> removed when the interface was being torn down before moving between
> namespaces. Jesper pointed out that this was not in fact the case... :P
> 
> > I'm still not sure how to handle this...  
> 
> There are a couple of options, I think. At least:
> 
> 1. Maintain a flag on struct net_device indicating that this device
>    needs the redirect map allocated, and react to that when interfaces
>    are being moved.
> 
> 2. Lookup the BPF program by ID (which we can get from the driver) on
>    move, and react to the program flag.
> 
> 3. Keep the allocation on program load, but allocate maps for all active
>    namespaces (which would probably need a refcnt mechanism to
>    deallocate things again).
> 
> I think I'm leaning towards #2; possibly combined with a refcnt so we
> can actually deallocate the map in the root namespace when it's not
> needed anymore.

Okay.. what about tail calls?  I think #3 is most reasonable
complexity-wise, or some mix of #2 and #3 - count the programs with
legacy redirects, and then allocate the resources if that count is
non-zero and the namespace has any XDP program attached.

Can users really not be told to just use the correct helper? ;)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-02-25 18:47           ` Jakub Kicinski
@ 2019-02-26 11:00             ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 21+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-02-26 11:00 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Jesper Dangaard Brouer, David Miller, netdev, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Sat, 23 Feb 2019 13:11:02 +0100, Toke Høiland-Jørgensen wrote:
>> Jesper Dangaard Brouer <brouer@redhat.com> writes:
>> > On Fri, 22 Feb 2019 13:37:34 -0800 Jakub Kicinski wrote:
>> >> On Fri, 22 Feb 2019 11:13:50 +0100, Toke Høiland-Jørgensen wrote:  
>> >> > Jakub Kicinski <jakub.kicinski@netronome.com> writes:    
>> >> > > On Thu, 21 Feb 2019 12:56:54 +0100, Toke Høiland-Jørgensen wrote:      
>> > [...]  
>> >> > >
>> >> > > BPF programs don't abide by netns boundaries.  The fact the program is
>> >> > > verified in one ns doesn't mean this is the only ns it will be used in :(
>> >> > > Meaning if any program is using the redirect map you may need a secret
>> >> > > map in every ns.. no?      
>> >> > 
>> >> > Ah, yes, good point. Totally didn't think about the fact that load and
>> >> > attach are decoupled. Hmm, guess I'll just have to move the call to
>> >> > alloc_default_map() to the point where the program is attached to an
>> >> > interface, then...    
>> >> 
>> >> Possibly.. and you also need to handle the case where an interface with a
>> >> program attached is moved, no?  
>> 
>> Yup, alloc on attach was easy enough; the moving turns out to be the
>> tricky part :)
>> 
>> > True, we need to handle if e.g. a veth gets an XDP program attached and
>> > then is moved into a network namespace (as I've already explained to
>> > Toke in a meeting).  
>> 
>> Yeah, I had somehow convinced myself that the XDP program was being
>> removed when the interface was being torn down before moving between
>> namespaces. Jesper pointed out that this was not in fact the case... :P
>> 
>> > I'm still not sure how to handle this...  
>> 
>> There are a couple of options, I think. At least:
>> 
>> 1. Maintain a flag on struct net_device indicating that this device
>>    needs the redirect map allocated, and react to that when interfaces
>>    are being moved.
>> 
>> 2. Lookup the BPF program by ID (which we can get from the driver) on
>>    move, and react to the program flag.
>> 
>> 3. Keep the allocation on program load, but allocate maps for all active
>>    namespaces (which would probably need a refcnt mechanism to
>>    deallocate things again).
>> 
>> I think I'm leaning towards #2; possibly combined with a refcnt so we
>> can actually deallocate the map in the root namespace when it's not
>> needed anymore.
>
> Okay.. what about tail calls? I think #3 is most reasonable
> complexity-wise, or some mix of #2 and #3 - count the programs with
> legacy redirects, and then allocate the resources if that count is
> non-zero and the namespace has any XDP program attached.

Yeah, I have that more or less working; except I forgot about tail
calls, but that should not be too difficult to fix.

> Can users really not be told to just use the correct helper? ;)

Experience would suggest not; users tend to use the simplest API that
gets their job done. And then wonder why they don't get the nice
performance numbers they were "promised". And, well, I tend to agree
that it's not terribly friendly to just go "use this other more
complicated API if you want proper performance". If we really mean that,
then we should formally deprecate xdp_redirect() as an API, IMO :)
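
For reference, the two helpers as seen from a program differ by a single
argument (sketch; tx_devices is an illustrative devmap and IFINDEX_OUT an
illustrative constant):

SEC("xdp")
int redirect_plain(struct xdp_md *ctx)
{
	/* Simple helper: just an ifindex, no map to manage. */
	return bpf_redirect(IFINDEX_OUT, 0);
}

SEC("xdp")
int redirect_mapped(struct xdp_md *ctx)
{
	/* Map-based helper: a devmap must be populated from userspace,
	 * but redirects go through the batched TX path. */
	return bpf_redirect_map(&tx_devices, IFINDEX_OUT, 0);
}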

-Toke

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2019-02-26 11:00 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-21 11:56 [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device Toke Høiland-Jørgensen
2019-02-21 11:56 ` [PATCH net-next 2/2] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
2019-02-21 15:23   ` Jesper Dangaard Brouer
2019-02-21 15:50     ` Toke Høiland-Jørgensen
2019-02-21 21:49   ` Jakub Kicinski
2019-02-21 23:02     ` Toke Høiland-Jørgensen
2019-02-22  0:32       ` Jakub Kicinski
2019-02-22  9:47         ` Toke Høiland-Jørgensen
2019-02-22 21:30           ` Jakub Kicinski
2019-02-23 11:52             ` Toke Høiland-Jørgensen
2019-02-23 23:19   ` kbuild test robot
2019-02-23 23:28   ` kbuild test robot
2019-02-21 15:19 ` [PATCH net-next 1/2] xdp: Always use a devmap for XDP_REDIRECT to a device Jesper Dangaard Brouer
2019-02-21 15:52   ` Toke Høiland-Jørgensen
2019-02-22  0:36 ` Jakub Kicinski
2019-02-22 10:13   ` Toke Høiland-Jørgensen
2019-02-22 21:37     ` Jakub Kicinski
2019-02-23 10:43       ` Jesper Dangaard Brouer
2019-02-23 12:11         ` Toke Høiland-Jørgensen
2019-02-25 18:47           ` Jakub Kicinski
2019-02-26 11:00             ` Toke Høiland-Jørgensen
