* [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions
  2019-03-01 14:12 [PATCH net-next v3 0/3] xdp: Use a default map for xdp_redirect helper Toke Høiland-Jørgensen
  2019-03-01 14:12 ` [PATCH net-next v3 3/3] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
@ 2019-03-01 14:12 ` Toke Høiland-Jørgensen
  2019-03-02  1:08   ` Jakub Kicinski
  2019-03-01 14:12 ` [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device Toke Høiland-Jørgensen
  2 siblings, 1 reply; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-01 14:12 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Jakub Kicinski

The subsequent commits introducing default maps and a hash-based ifindex
devmap require a bit of refactoring of the devmap code. Perform this first
so the subsequent commits become easier to read.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 kernel/bpf/devmap.c |  177 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 109 insertions(+), 68 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..1037fc08c504 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -75,6 +75,7 @@ struct bpf_dtab {
 	struct bpf_dtab_netdev **netdev_map;
 	unsigned long __percpu *flush_needed;
 	struct list_head list;
+	struct rcu_head rcu;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
@@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
+			    bool check_memlock)
 {
-	struct bpf_dtab *dtab;
-	int err = -EINVAL;
 	u64 cost;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	dtab = kzalloc(sizeof(*dtab), GFP_USER);
-	if (!dtab)
-		return ERR_PTR(-ENOMEM);
+	int err;
 
 	bpf_map_init_from_attr(&dtab->map, attr);
 
@@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_dtab;
+		return -EINVAL;
 
 	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.pages);
-	if (err)
-		goto free_dtab;
-
-	err = -ENOMEM;
+	if (check_memlock) {
+		/* if map size is larger than memlock limit, reject it early */
+		err = bpf_map_precharge_memlock(dtab->map.pages);
+		if (err)
+			return -EINVAL;
+	}
 
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
 						__alignof__(unsigned long),
 						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
-		goto free_dtab;
+		goto err_alloc;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
-		goto free_dtab;
+		goto err_map;
 
-	spin_lock(&dev_map_lock);
-	list_add_tail_rcu(&dtab->list, &dev_map_list);
-	spin_unlock(&dev_map_lock);
+	return 0;
 
-	return &dtab->map;
-free_dtab:
+ err_map:
 	free_percpu(dtab->flush_needed);
-	kfree(dtab);
-	return ERR_PTR(err);
+ err_alloc:
+	return -ENOMEM;
 }
 
-static void dev_map_free(struct bpf_map *map)
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	int i, cpu;
+	struct bpf_dtab *dtab;
+	int err;
 
-	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete. The rcu critical section only guarantees
-	 * no further reads against netdev_map. It does __not__ ensure pending
-	 * flush operations (if any) are complete.
-	 */
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	dtab = kzalloc(sizeof(*dtab), GFP_USER);
+	if (!dtab)
+		return ERR_PTR(-ENOMEM);
+
+	err = dev_map_init_map(dtab, attr, true);
+	if (err) {
+		kfree(dtab);
+		return ERR_PTR(err);
+	}
 
 	spin_lock(&dev_map_lock);
-	list_del_rcu(&dtab->list);
+	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
-	bpf_clear_redirect_map(map);
-	synchronize_rcu();
+	return &dtab->map;
+}
+
+static void __dev_map_free(struct rcu_head *rcu)
+{
+	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
+	int i, cpu;
 
 	/* To ensure all pending flush operations have completed wait for flush
 	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
@@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)
 	kfree(dtab);
 }
 
+static void dev_map_free(struct bpf_map *map)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further reads against netdev_map. It does __not__ ensure pending
+	 * flush operations (if any) are complete.
+	 */
+
+	spin_lock(&dev_map_lock);
+	list_del_rcu(&dtab->list);
+	spin_unlock(&dev_map_lock);
+
+	bpf_clear_redirect_map(map);
+	call_rcu(&dtab->rcu, __dev_map_free);
+}
+
 static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -429,12 +450,42 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+						    struct bpf_dtab *dtab,
+						    u32 ifindex,
+						    unsigned int bit)
 {
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct net *net = current->nsproxy->net_ns;
 	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+	struct bpf_dtab_netdev *dev;
+
+	dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+					sizeof(void *), gfp);
+	if (!dev->bulkq) {
+		kfree(dev);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dev->dev = dev_get_by_index(net, ifindex);
+	if (!dev->dev) {
+		free_percpu(dev->bulkq);
+		kfree(dev);
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev->bit = bit;
+	dev->dtab = dtab;
+
+	return dev;
+}
+
+static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
+				 void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -449,26 +500,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
-		if (!dev)
-			return -ENOMEM;
-
-		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
-						sizeof(void *), gfp);
-		if (!dev->bulkq) {
-			kfree(dev);
-			return -ENOMEM;
-		}
-
-		dev->dev = dev_get_by_index(net, ifindex);
-		if (!dev->dev) {
-			free_percpu(dev->bulkq);
-			kfree(dev);
-			return -EINVAL;
-		}
-
-		dev->bit = i;
-		dev->dtab = dtab;
+		dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
 	}
 
 	/* Use call_rcu() here to ensure rcu critical sections have completed
@@ -482,6 +516,13 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	return __dev_map_update_elem(current->nsproxy->net_ns,
+				     map, key, value, map_flags);
+}
+
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,


* [PATCH net-next v3 0/3] xdp: Use a default map for xdp_redirect helper
@ 2019-03-01 14:12 Toke Høiland-Jørgensen
  2019-03-01 14:12 ` [PATCH net-next v3 3/3] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-01 14:12 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Jakub Kicinski

This series changes the xdp_redirect helper to use a hidden default map. The
redirect_map() helper also uses the map structure to batch packets, which
results in a significant (around 50%) performance boost for the _map variant.
However, the xdp_redirect() API is simpler if one just wants to redirect to
another interface, which means people tend to use this interface and then wonder
why they get worse performance than expected.

This series seeks to close this performance difference between the two APIs. It
achieves this by changing xdp_redirect() to use a hidden devmap for looking up
destination interfaces, thus gaining the batching benefit with no visible
difference from the user API point of view.

Allocation of the default map is done dynamically as programs using the
xdp_redirect helper are loaded and attached to interfaces, and the maps are
freed again when no longer needed. Because of tail calls, this requires two
levels of refcounting: one global level that keeps track of whether any XDP
programs using the xdp_redirect() helper are loaded in the system at all, and
another that keeps track of whether any programs that could potentially result
in a call to xdp_redirect() (i.e., either programs using the helper, or
programs using tail calls) are loaded in a given namespace.
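
For orientation, here is a rough sketch of the helpers implementing the two
levels, condensed from patch 2 (locking and error handling elided; names as
in kernel/bpf/devmap.c):

    /* Global: counts verifier-loaded programs that call xdp_redirect() */
    int dev_map_inc_redirect_count(void);   /* called from fixup_bpf_calls() */
    void dev_map_dec_redirect_count(void);  /* called from __bpf_prog_put() */

    /* Per namespace: counts attached programs that may end up redirecting
     * (helper users or tail-call users). The first user triggers allocation
     * of net->xdp.default_map; the last one releases it.
     */
    int dev_map_ensure_default_map(struct net *net);  /* dev_change_xdp_fd() */
    void dev_map_put_default_map(struct net *net);    /* dev_xdp_uninstall() */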

The default maps are dynamically sized to the nearest power-of-two size that can
contain all interfaces present in each namespace. If allocation fails, the user
action that triggered the allocation (either attaching an XDP program, or moving
an interface with a program attached) is rejected. If new interfaces appear in
a namespace which causes the default map to become too small, a new one is
allocated with the correct size; this allocation is the only one that cannot
lead to a rejection of the userspace action, so if it fails a warning is emitted
instead.
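
As a concrete example of the sizing, using the loop added in patch 2 (patch 3
later switches this to counting interfaces instead of looking at ifindex
values): a namespace whose interfaces have ifindexes 1-11 ends up with a
16-entry default map:

    int size = DEV_MAP_DEFAULT_SIZE;        /* 8 */

    for_each_netdev(net, netdev)
        if (netdev->ifindex >= size)
            size <<= 1;                     /* ifindex 8 seen -> size 16 */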

The first patch in the series refactors devmap.c to prepare for the subsequent
patches. The second patch adds the default map handling using the existing
array-based devmap structure. The third patch adds a new map type (devmap_idx)
that hashes devices on ifindex.

Changelog:

v2 -> v3:
- Fix compile warnings when CONFIG_BPF_SYSCALL is unset (as pointed out by the
  kbuild test bot).

v1 -> v2:
- Add refcounting to only allocate default maps when needed
- Using said refcounting, also deallocate default maps
- Add dynamic sizing of default maps
- Handle moving of interfaces between namespaces
- Split out refactoring of devmap.c to separate patch
- Use hashmap semantics for update_elem of devmap_idx type maps

---

Toke Høiland-Jørgensen (3):
      xdp: Refactor devmap code in preparation for subsequent additions
      xdp: Always use a devmap for XDP_REDIRECT to a device
      xdp: Add devmap_idx map type for looking up devices by ifindex


 include/linux/bpf.h                     |   46 ++
 include/linux/bpf_types.h               |    1 
 include/linux/filter.h                  |    2 
 include/net/net_namespace.h             |    2 
 include/net/netns/xdp.h                 |   11 +
 include/trace/events/xdp.h              |    3 
 include/uapi/linux/bpf.h                |    1 
 kernel/bpf/devmap.c                     |  609 +++++++++++++++++++++++++++----
 kernel/bpf/syscall.c                    |   27 +
 kernel/bpf/verifier.c                   |   14 +
 net/core/dev.c                          |   59 +++
 net/core/filter.c                       |   69 +---
 tools/bpf/bpftool/map.c                 |    1 
 tools/include/uapi/linux/bpf.h          |    1 
 tools/lib/bpf/libbpf_probes.c           |    1 
 tools/testing/selftests/bpf/test_maps.c |   16 +
 16 files changed, 720 insertions(+), 143 deletions(-)


* [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-01 14:12 [PATCH net-next v3 0/3] xdp: Use a default map for xdp_redirect helper Toke Høiland-Jørgensen
  2019-03-01 14:12 ` [PATCH net-next v3 3/3] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
  2019-03-01 14:12 ` [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions Toke Høiland-Jørgensen
@ 2019-03-01 14:12 ` Toke Høiland-Jørgensen
  2019-03-02  2:09   ` Jakub Kicinski
  2 siblings, 1 reply; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-01 14:12 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Jakub Kicinski

An XDP program can redirect packets between interfaces using either the
xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
flexibility of updating maps from userspace, the redirect_map() helper also
uses the map structure to batch packets, which results in a significant
(around 50%) performance boost. However, the xdp_redirect() API is simpler
if one just wants to redirect to another interface, which means people tend
to use this interface and then wonder why they get worse performance
than expected.

This patch seeks to close this performance difference between the two APIs.
It achieves this by changing xdp_redirect() to use a hidden devmap for
looking up destination interfaces, thus gaining the batching benefit with
no visible difference from the user API point of view.

A hidden per-namespace map is allocated when an XDP program that uses the
non-map xdp_redirect() helper is first loaded. This map is populated with
all available interfaces in its namespace, and kept up to date as
interfaces come and go. Once allocated, the map is kept around until the
namespace is removed.

The hidden map uses the ifindex as map key, which means redirects are limited to
ifindexes smaller than the map size of 64. A later patch introduces a new
map type to lift this restriction.

Performance numbers:

Before patch:
xdp_redirect:     5426035 pkt/s
xdp_redirect_map: 8412754 pkt/s

After patch:
xdp_redirect:     8314702 pkt/s
xdp_redirect_map: 8411854 pkt/s

This corresponds to a 53% increase in xdp_redirect performance, or a
reduction in per-packet processing time by 64 nanoseconds.
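
For reference, the 53% and 64 ns figures follow directly from the packet
rates above:

    8314702 / 5426035   ~= 1.53                 (53% more packets per second)
    1 / 5426035 pkt/s   ~= 184.3 ns per packet  (before)
    1 / 8314702 pkt/s   ~= 120.3 ns per packet  (after)
    184.3 ns - 120.3 ns ~= 64 ns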

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/bpf.h         |   35 +++++++
 include/linux/filter.h      |    2 
 include/net/net_namespace.h |    2 
 include/net/netns/xdp.h     |   11 ++
 kernel/bpf/devmap.c         |  214 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c        |   27 ++++-
 kernel/bpf/verifier.c       |   12 ++
 net/core/dev.c              |   59 ++++++++++++
 net/core/filter.c           |   58 +-----------
 9 files changed, 354 insertions(+), 66 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de18227b3d95..060c5850af3b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -25,6 +25,7 @@ struct sock;
 struct seq_file;
 struct btf;
 struct btf_type;
+struct net;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
@@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
+struct bpf_prog *bpf_prog_get_by_id(u32 id);
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 				       bool attach_drv);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
@@ -612,6 +614,11 @@ struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
+int dev_map_ensure_default_map(struct net *net);
+void dev_map_put_default_map(struct net *net);
+int dev_map_inc_redirect_count(void);
+void dev_map_dec_redirect_count(void);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
@@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
 						     enum bpf_prog_type type,
 						     bool attach_drv)
@@ -693,6 +705,29 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
+static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
+{
+	return NULL;
+}
+
+static inline int dev_map_ensure_default_map(struct net *net)
+{
+	return 0;
+}
+
+static inline void dev_map_put_default_map(struct net *net)
+{
+}
+
+static inline int dev_map_inc_redirect_count(void)
+{
+	return 0;
+}
+
+static inline void dev_map_dec_redirect_count(void)
+{
+}
+
 static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 {
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 95e2d7ebdf21..dd6bbbab32f7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -507,6 +507,8 @@ struct bpf_prog {
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1,	/* Do we need dst entry? */
+				redirect_needed:1,	/* Does program need access to xdp_redirect? */
+				redirect_used:1,	/* Does program use xdp_redirect? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index a68ced28d8f4..6706ecc25d8f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -162,7 +162,7 @@ struct net {
 #if IS_ENABLED(CONFIG_CAN)
 	struct netns_can	can;
 #endif
-#ifdef CONFIG_XDP_SOCKETS
+#ifdef CONFIG_BPF_SYSCALL
 	struct netns_xdp	xdp;
 #endif
 	struct sock		*diag_nlsk;
diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
index e5734261ba0a..4935dfe1cf43 100644
--- a/include/net/netns/xdp.h
+++ b/include/net/netns/xdp.h
@@ -4,10 +4,21 @@
 
 #include <linux/rculist.h>
 #include <linux/mutex.h>
+#include <linux/atomic.h>
+
+struct bpf_dtab;
+
+struct bpf_dtab_container {
+	struct bpf_dtab __rcu *dtab;
+	atomic_t refcnt;
+};
 
 struct netns_xdp {
+#ifdef CONFIG_XDP_SOCKETS
 	struct mutex		lock;
 	struct hlist_head	list;
+#endif
+	struct bpf_dtab_container default_map;
 };
 
 #endif /* __NETNS_XDP_H__ */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1037fc08c504..e55707e62b60 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -56,6 +56,9 @@
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
 #define DEV_MAP_BULK_SIZE 16
+#define DEV_MAP_DEFAULT_SIZE 8
+#define BPF_MAX_REFCNT 32768
+
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
 	struct net_device *dev_rx;
@@ -80,6 +83,7 @@ struct bpf_dtab {
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
+static atomic_t global_redirect_use = {};
 
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
@@ -332,6 +336,18 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return obj;
 }
 
+/* This is only being called from xdp_do_redirect() if the xdp_redirect helper
+ * is used; the default map is allocated on XDP program load if the helper is
+ * used, so will always be available at this point.
+ */
+struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct bpf_dtab *dtab = rcu_dereference(net->xdp.default_map.dtab);
+
+	return dtab ? &dtab->map : NULL;
+}
+
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
@@ -533,14 +549,210 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_check_btf = map_check_no_btf,
 };
 
+static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container *cont)
+{
+	struct netns_xdp *xdp = container_of(cont, struct netns_xdp, default_map);
+
+	return container_of(xdp, struct net, xdp);
+}
+
+static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
+{
+	struct bpf_dtab *dtab = NULL;
+
+	lockdep_assert_held(&dev_map_lock);
+
+	dtab = rcu_dereference(cont->dtab);
+	if (dtab) {
+		list_del_rcu(&dtab->list);
+		rcu_assign_pointer(cont->dtab, NULL);
+		bpf_clear_redirect_map(&dtab->map);
+		call_rcu(&dtab->rcu, __dev_map_free);
+	}
+}
+
+void dev_map_put_default_map(struct net *net)
+{
+	if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
+		spin_lock(&dev_map_lock);
+		__dev_map_release_default_map(&net->xdp.default_map);
+		spin_unlock(&dev_map_lock);
+	}
+}
+
+static int __init_default_map(struct bpf_dtab_container *cont)
+{
+	struct net *net = bpf_default_map_to_net(cont);
+	struct bpf_dtab *dtab, *old_dtab;
+	int size = DEV_MAP_DEFAULT_SIZE;
+	struct net_device *netdev;
+	union bpf_attr attr = {};
+	u32 idx;
+	int err;
+
+	lockdep_assert_held(&dev_map_lock);
+
+	if (!atomic_read(&global_redirect_use))
+		return 0;
+
+	for_each_netdev(net, netdev)
+		if (netdev->ifindex >= size)
+			size <<= 1;
+
+	old_dtab = rcu_dereference(cont->dtab);
+	if (old_dtab && old_dtab->map.max_entries == size)
+		return 0;
+
+	dtab = kzalloc(sizeof(*dtab), GFP_USER);
+	if (!dtab)
+		return -ENOMEM;
+
+	attr.map_type = BPF_MAP_TYPE_DEVMAP;
+	attr.max_entries = size;
+	attr.value_size = 4;
+	attr.key_size = 4;
+
+	err = dev_map_init_map(dtab, &attr, false);
+	if (err) {
+		kfree(dtab);
+		return err;
+	}
+
+	for_each_netdev(net, netdev) {
+		idx = netdev->ifindex;
+		err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
+		if (err) {
+			__dev_map_free(&dtab->rcu);
+			return err;
+		}
+	}
+
+	rcu_assign_pointer(cont->dtab, dtab);
+	list_add_tail_rcu(&dtab->list, &dev_map_list);
+
+	if (old_dtab) {
+		list_del_rcu(&old_dtab->list);
+		bpf_clear_redirect_map(&old_dtab->map);
+		call_rcu(&old_dtab->rcu, __dev_map_free);
+	}
+
+	return 0;
+}
+
+static int maybe_inc_refcnt(atomic_t *v)
+{
+	int refcnt;
+
+	refcnt = atomic_inc_return(v);
+	if (refcnt > BPF_MAX_REFCNT) {
+		atomic_dec(v);
+		return -EBUSY;
+	}
+
+	return refcnt;
+}
+
+int dev_map_ensure_default_map(struct net *net)
+{
+	int refcnt, err = 0;
+
+	refcnt = maybe_inc_refcnt(&net->xdp.default_map.refcnt);
+	if (refcnt < 0)
+		return refcnt;
+
+	if (refcnt == 1) {
+		spin_lock(&dev_map_lock);
+		err = __init_default_map(&net->xdp.default_map);
+		spin_unlock(&dev_map_lock);
+	}
+
+	return err;
+}
+
+static void __dev_map_dec_redirect_count(void)
+{
+	struct net *net;
+
+	lockdep_assert_held(&dev_map_lock);
+
+	if (atomic_dec_and_test(&global_redirect_use))
+		for_each_net_rcu(net)
+			__dev_map_release_default_map(&net->xdp.default_map);
+}
+
+void dev_map_dec_redirect_count(void)
+{
+	spin_lock(&dev_map_lock);
+	__dev_map_dec_redirect_count();
+	spin_unlock(&dev_map_lock);
+}
+
+static int __dev_map_init_redirect_use(void)
+{
+	struct net *net;
+	int err;
+
+	lockdep_assert_held(&dev_map_lock);
+
+	for_each_net_rcu(net) {
+		if (atomic_read(&net->xdp.default_map.refcnt)) {
+			err = __init_default_map(&net->xdp.default_map);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+int dev_map_inc_redirect_count(void)
+{
+	int refcnt, err = 0;
+
+	spin_lock(&dev_map_lock);
+	refcnt = maybe_inc_refcnt(&global_redirect_use);
+	if (refcnt < 0) {
+		err = refcnt;
+		goto out;
+	}
+
+	if (refcnt == 1)
+		err = __dev_map_init_redirect_use();
+
+	if (err)
+		__dev_map_dec_redirect_count();
+
+ out:
+	spin_unlock(&dev_map_lock);
+	return err;
+}
+
 static int dev_map_notification(struct notifier_block *notifier,
 				ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(netdev);
+	u32 idx = netdev->ifindex;
 	struct bpf_dtab *dtab;
-	int i;
+	int i, err;
 
 	switch (event) {
+	case NETDEV_REGISTER:
+		rcu_read_lock();
+		dtab = rcu_dereference(net->xdp.default_map.dtab);
+		if (dtab) {
+			err = __dev_map_update_elem(net, &dtab->map,
+						    &idx, &idx, 0);
+			if (err == -E2BIG) {
+				spin_lock(&dev_map_lock);
+				err = __init_default_map(&net->xdp.default_map);
+				if (err)
+					net_warn_ratelimited("Unable to re-allocate default map, xdp_redirect() may fail on some ifindexes\n");
+				spin_unlock(&dev_map_lock);
+			}
+		}
+		rcu_read_unlock();
+		break;
 	case NETDEV_UNREGISTER:
 		/* This rcu_read_lock/unlock pair is needed because
 		 * dev_map_list is an RCU list AND to ensure a delete
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ec7c552af76b..fd1b76f5da2d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1265,6 +1265,9 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 		kvfree(prog->aux->func_info);
 		bpf_prog_free_linfo(prog);
 
+		if (prog->redirect_used)
+			dev_map_dec_redirect_count();
+
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
 }
@@ -1962,6 +1965,21 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 
 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
 
+struct bpf_prog *bpf_prog_get_by_id(u32 id)
+{
+	struct bpf_prog *prog;
+
+	spin_lock_bh(&prog_idr_lock);
+	prog = idr_find(&prog_idr, id);
+	if (prog)
+		prog = bpf_prog_inc_not_zero(prog);
+	else
+		prog = ERR_PTR(-ENOENT);
+	spin_unlock_bh(&prog_idr_lock);
+
+	return prog;
+}
+
 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 {
 	struct bpf_prog *prog;
@@ -1974,14 +1992,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	spin_lock_bh(&prog_idr_lock);
-	prog = idr_find(&prog_idr, id);
-	if (prog)
-		prog = bpf_prog_inc_not_zero(prog);
-	else
-		prog = ERR_PTR(-ENOENT);
-	spin_unlock_bh(&prog_idr_lock);
-
+	prog = bpf_prog_get_by_id(id);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1b9496c41383..f1b2f01e7ca1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
+		if (insn->imm == BPF_FUNC_redirect) {
+			prog->redirect_needed = true;
+			if (!prog->redirect_used) {
+				int err = dev_map_inc_redirect_count();
+
+				if (err)
+					return err;
+				prog->redirect_used = true;
+			}
+		}
+
 		if (insn->imm == BPF_FUNC_override_return)
 			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
@@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 * the program array.
 			 */
 			prog->cb_access = 1;
+			prog->redirect_needed = true;
 			env->prog->aux->stack_depth = MAX_BPF_STACK;
 			env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 2b67f2aa59dd..1df20d529026 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
 	return xdp.prog_id;
 }
 
+static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
+{
+	struct bpf_prog *prog;
+	u32 prog_id;
+
+	prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+	if (prog_id) {
+		prog = bpf_prog_get_by_id(prog_id);
+		if (!IS_ERR(prog))
+			return prog;
+	}
+
+	return NULL;
+}
+
 static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 			   struct netlink_ext_ack *extack, u32 flags,
 			   struct bpf_prog *prog)
@@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev)
 	memset(&xdp, 0, sizeof(xdp));
 	xdp.command = XDP_QUERY_PROG;
 	WARN_ON(ndo_bpf(dev, &xdp));
-	if (xdp.prog_id)
+	if (xdp.prog_id) {
+		struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id);
+
+		if (!IS_ERR(prog)) {
+			if (prog->redirect_needed)
+				dev_map_put_default_map(dev_net(dev));
+			bpf_prog_put(prog);
+		}
+
 		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
 					NULL));
+	}
 
 	/* Remove HW offload */
 	memset(&xdp, 0, sizeof(xdp));
@@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 			bpf_prog_put(prog);
 			return -EINVAL;
 		}
+
+		if (!offload && bpf_op == ops->ndo_bpf &&
+		    prog->redirect_needed) {
+			err = dev_map_ensure_default_map(dev_net(dev));
+			if (err) {
+				NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()");
+				return err;
+			}
+		}
+	} else {
+		struct bpf_prog *old_prog = dev_xdp_get_prog(dev);
+
+		if (old_prog) {
+			if (old_prog->redirect_needed)
+				dev_map_put_default_map(dev_net(dev));
+			bpf_prog_put(old_prog);
+		}
 	}
 
 	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
@@ -9333,6 +9374,7 @@ EXPORT_SYMBOL(unregister_netdev);
 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 {
 	int err, new_nsid, new_ifindex;
+	struct bpf_prog *prog = NULL;
 
 	ASSERT_RTNL();
 
@@ -9350,6 +9392,15 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	if (net_eq(dev_net(dev), net))
 		goto out;
 
+	prog = dev_xdp_get_prog(dev);
+	if (prog) {
+		if (prog->redirect_needed)
+			err = dev_map_ensure_default_map(net);
+
+		if (err)
+			goto out;
+	}
+
 	/* Pick the destination device name, and ensure
 	 * we can use it in the destination network namespace.
 	 */
@@ -9388,6 +9439,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 
+	if (prog && prog->redirect_needed)
+		dev_map_put_default_map(dev_net(dev));
+
 	new_nsid = peernet2id_alloc(dev_net(dev), net);
 	/* If there is an ifindex conflict assign a new one */
 	if (__dev_get_by_index(net, dev->ifindex))
@@ -9435,6 +9489,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	synchronize_net();
 	err = 0;
 out:
+	if (prog)
+		bpf_prog_put(prog);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
diff --git a/net/core/filter.c b/net/core/filter.c
index 5132c054c981..be02ea103d05 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3337,58 +3337,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static int __bpf_tx_xdp(struct net_device *dev,
-			struct bpf_map *map,
-			struct xdp_buff *xdp,
-			u32 index)
-{
-	struct xdp_frame *xdpf;
-	int err, sent;
-
-	if (!dev->netdev_ops->ndo_xdp_xmit) {
-		return -EOPNOTSUPP;
-	}
-
-	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
-	if (unlikely(err))
-		return err;
-
-	xdpf = convert_to_xdp_frame(xdp);
-	if (unlikely(!xdpf))
-		return -EOVERFLOW;
-
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
-	if (sent <= 0)
-		return sent;
-	return 0;
-}
-
-static noinline int
-xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
-		     struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
-{
-	struct net_device *fwd;
-	u32 index = ri->ifindex;
-	int err;
-
-	fwd = dev_get_by_index_rcu(dev_net(dev), index);
-	ri->ifindex = 0;
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
-
-	err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
-	if (unlikely(err))
-		goto err;
-
-	_trace_xdp_redirect(dev, xdp_prog, index);
-	return 0;
-err:
-	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
-	return err;
-}
-
 static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 			    struct bpf_map *map,
 			    struct xdp_buff *xdp,
@@ -3519,10 +3467,10 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct bpf_map *map = READ_ONCE(ri->map);
 
-	if (likely(map))
-		return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
+	if (unlikely(!map))
+		map = __dev_map_get_default_map(dev);
 
-	return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
+	return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
 


* [PATCH net-next v3 3/3] xdp: Add devmap_idx map type for looking up devices by ifindex
  2019-03-01 14:12 [PATCH net-next v3 0/3] xdp: Use a default map for xdp_redirect helper Toke Høiland-Jørgensen
@ 2019-03-01 14:12 ` Toke Høiland-Jørgensen
  2019-03-01 14:12 ` [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions Toke Høiland-Jørgensen
  2019-03-01 14:12 ` [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device Toke Høiland-Jørgensen
  2 siblings, 0 replies; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-01 14:12 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Jakub Kicinski

A common pattern when using xdp_redirect_map() is to create a device map
where the lookup key is simply ifindex. Because device maps are arrays,
this leaves holes in the map, and the map has to be sized to fit the
largest ifindex, regardless of how many devices are actually
needed in the map.

This patch adds a second type of device map where the key is interpreted as
an ifindex and looked up using a hashmap, instead of being used as an array
index. This leads to maps being densely packed, so they can be smaller.
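
To illustrate the intended usage (not part of this patch): with a devmap_idx
map populated from userspace with key == value == ifindex, an XDP program can
redirect directly on an ifindex without sizing the map for the largest
ifindex. A minimal sketch, assuming the usual libbpf bpf_helpers.h
definitions and a hypothetical egress ifindex:

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    struct bpf_map_def SEC("maps") tx_ports = {
        .type = BPF_MAP_TYPE_DEVMAP_IDX,
        .key_size = sizeof(__u32),
        .value_size = sizeof(__u32),
        .max_entries = 16,      /* number of devices, not highest ifindex */
    };

    SEC("xdp")
    int xdp_redirect_by_ifindex(struct xdp_md *ctx)
    {
        __u32 egress_ifindex = 42;      /* hypothetical egress device */

        return bpf_redirect_map(&tx_ports, egress_ifindex, 0);
    }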

The default maps used by xdp_redirect() are changed to use the new map
type, which means that xdp_redirect() is no longer limited to ifindex < 64,
but instead to 64 total simultaneous interfaces per network namespace. This
also provides an easy way to compare the performance of devmap and
devmap_idx:

xdp_redirect_map (devmap): 8394560 pkt/s
xdp_redirect (devmap_idx): 8179480 pkt/s

Difference: 215080 pkt/s or 3.1 nanoseconds per packet.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/bpf.h                     |   11 +
 include/linux/bpf_types.h               |    1 
 include/trace/events/xdp.h              |    3 
 include/uapi/linux/bpf.h                |    1 
 kernel/bpf/devmap.c                     |  232 ++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c                   |    2 
 net/core/filter.c                       |   11 +
 tools/bpf/bpftool/map.c                 |    1 
 tools/include/uapi/linux/bpf.h          |    1 
 tools/lib/bpf/libbpf_probes.c           |    1 
 tools/testing/selftests/bpf/test_maps.c |   16 ++
 11 files changed, 264 insertions(+), 16 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 060c5850af3b..d47d3160ff1a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -614,12 +614,13 @@ struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct bpf_dtab_netdev *__dev_map_idx_lookup_elem(struct bpf_map *map, u32 key);
 struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
 int dev_map_ensure_default_map(struct net *net);
 void dev_map_put_default_map(struct net *net);
 int dev_map_inc_redirect_count(void);
 void dev_map_dec_redirect_count(void);
-void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
+void __dev_map_insert_ctx(struct bpf_map *map, struct bpf_dtab_netdev *dst);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
@@ -705,6 +706,12 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
+static inline struct net_device  *__dev_map_idx_lookup_elem(struct bpf_map *map,
+							    u32 key)
+{
+	return NULL;
+}
+
 static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
 {
 	return NULL;
@@ -728,7 +735,7 @@ static inline void dev_map_dec_redirect_count(void)
 {
 }
 
-static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
+static inline void __dev_map_insert_ctx(struct bpf_map *map, void *dst)
 {
 }
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 08bf2f1fe553..374c013ca243 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -59,6 +59,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_IDX, dev_map_idx_ops)
 #if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index e95cb86b65cf..fcf006d49f67 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -147,7 +147,8 @@ struct _bpf_dtab_netdev {
 
 #define devmap_ifindex(fwd, map)				\
 	(!fwd ? 0 :						\
-	 ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	 ((map->map_type == BPF_MAP_TYPE_DEVMAP ||              \
+	   map->map_type == BPF_MAP_TYPE_DEVMAP_IDX) ?		\
 	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bcdd2474eee7..2cdd384f2cbc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 	BPF_MAP_TYPE_QUEUE,
 	BPF_MAP_TYPE_STACK,
+	BPF_MAP_TYPE_DEVMAP_IDX,
 };
 
 /* Note that tracing related programs such as
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e55707e62b60..95b276104dcd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -46,6 +46,12 @@
  * notifier hook walks the map we know that new dev references can not be
  * added by the user because core infrastructure ensures dev_get_by_index()
  * calls will fail at this point.
+ *
+ * The devmap_idx type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
 #include <net/xdp.h>
@@ -67,6 +73,8 @@ struct xdp_bulk_queue {
 
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
+	unsigned int ifindex;
+	struct hlist_node index_hlist;
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct xdp_bulk_queue __percpu *bulkq;
@@ -79,12 +87,30 @@ struct bpf_dtab {
 	unsigned long __percpu *flush_needed;
 	struct list_head list;
 	struct rcu_head rcu;
+
+	/* these are only used for DEVMAP_IDX type maps */
+	unsigned long *bits_used;
+	struct hlist_head *dev_index_head;
+	spinlock_t index_lock;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 static atomic_t global_redirect_use = {};
 
+static struct hlist_head *dev_map_create_hash(void)
+{
+	int i;
+	struct hlist_head *hash;
+
+	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
+	if (hash != NULL)
+		for (i = 0; i < NETDEV_HASHENTRIES; i++)
+			INIT_HLIST_HEAD(&hash[i]);
+
+	return hash;
+}
+
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
 	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
@@ -101,6 +127,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_IDX)
+		cost += dev_map_bitmap_size(attr) +
+			sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
+
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return -EINVAL;
 
@@ -126,8 +157,25 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 	if (!dtab->netdev_map)
 		goto err_map;
 
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_IDX) {
+		dtab->bits_used = kzalloc(dev_map_bitmap_size(attr),
+					  GFP_KERNEL);
+		if (!dtab->bits_used)
+			goto err_bitmap;
+
+		dtab->dev_index_head = dev_map_create_hash();
+		if (!dtab->dev_index_head)
+			goto err_idx;
+
+		spin_lock_init(&dtab->index_lock);
+	}
+
 	return 0;
 
+ err_idx:
+	kfree(dtab->bits_used);
+ err_bitmap:
+	bpf_map_area_free(dtab->netdev_map);
  err_map:
 	free_percpu(dtab->flush_needed);
  err_alloc:
@@ -192,6 +240,8 @@ static void __dev_map_free(struct rcu_head *rcu)
 		kfree(dev);
 	}
 
+	kfree(dtab->dev_index_head);
+	kfree(dtab->bits_used);
 	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
@@ -234,12 +284,76 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+						    int ifindex)
+{
+	return &dtab->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+}
+
+struct bpf_dtab_netdev *__dev_map_idx_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct hlist_head *head = dev_map_index_hash(dtab, key);
+	struct bpf_dtab_netdev *dev;
+
+	hlist_for_each_entry_rcu(dev, head, index_hlist)
+		if (dev->ifindex == key)
+			return dev;
+
+	return NULL;
+}
+
+static int dev_map_idx_get_next_key(struct bpf_map *map, void *key,
+				    void *next_key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 ifindex, *next = next_key;
+	struct bpf_dtab_netdev *dev, *next_dev;
+	struct hlist_head *head;
+	int i = 0;
+
+	if (!key)
+		goto find_first;
+
+	ifindex = *(u32 *)key;
+
+	dev = __dev_map_idx_lookup_elem(map, ifindex);
+	if (!dev)
+		goto find_first;
+
+	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+				    struct bpf_dtab_netdev, index_hlist);
+
+	if (next_dev) {
+		*next = next_dev->ifindex;
+		return 0;
+	}
+
+	i = ifindex & (NETDEV_HASHENTRIES - 1);
+	i++;
+
+ find_first:
+	for (; i < NETDEV_HASHENTRIES; i++) {
+		head = dev_map_index_hash(dtab, i);
+
+		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					    struct bpf_dtab_netdev,
+					    index_hlist);
+		if (next_dev) {
+			*next = next_dev->ifindex;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+void __dev_map_insert_ctx(struct bpf_map *map, struct bpf_dtab_netdev *dst)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
 
-	__set_bit(bit, bitmap);
+	__set_bit(dst->bit, bitmap);
 }
 
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
@@ -409,9 +523,16 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
-	struct net_device *dev = obj ? obj->dev : NULL;
 
-	return dev ? &dev->ifindex : NULL;
+	return obj ? &obj->ifindex : NULL;
+}
+
+static void *dev_map_idx_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab_netdev *obj = __dev_map_idx_lookup_elem(map,
+								*(u32 *)key);
+
+	return obj ? &obj->ifindex : NULL;
 }
 
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
@@ -466,6 +587,43 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
+static int dev_map_idx_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *old_dev;
+	int k = *(u32 *)key;
+
+	old_dev = __dev_map_idx_lookup_elem(map, k);
+	if (!old_dev)
+		return 0;
+
+	spin_lock(&dtab->index_lock);
+	hlist_del_rcu(&old_dev->index_hlist);
+	spin_unlock(&dtab->index_lock);
+
+	xchg(&dtab->netdev_map[old_dev->bit], NULL);
+	clear_bit_unlock(old_dev->bit, dtab->bits_used);
+	call_rcu(&old_dev->rcu, __dev_map_entry_free);
+	return 0;
+}
+
+static bool __dev_map_find_bit(struct bpf_dtab *dtab, unsigned int *bit)
+{
+	unsigned int b = 0;
+
+ retry:
+	b = find_next_zero_bit(dtab->bits_used, dtab->map.max_entries, b);
+
+	if (b >= dtab->map.max_entries)
+		return false;
+
+	if (test_and_set_bit_lock(b, dtab->bits_used))
+		goto retry;
+
+	*bit = b;
+	return true;
+}
+
 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 						    struct bpf_dtab *dtab,
 						    u32 ifindex,
@@ -492,6 +650,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 		return ERR_PTR(-EINVAL);
 	}
 
+	dev->ifindex = dev->dev->ifindex;
 	dev->bit = bit;
 	dev->dtab = dtab;
 
@@ -539,6 +698,49 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 				     map, key, value, map_flags);
 }
 
+static int __dev_map_idx_update_elem(struct net *net, struct bpf_map *map,
+				     void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *dev, *old_dev;
+	u32 idx = *(u32 *)key;
+	u32 val = *(u32 *)value;
+	u32 bit;
+
+	if (idx != val)
+		return -EINVAL;
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+
+	old_dev = __dev_map_idx_lookup_elem(map, idx);
+	if (old_dev) {
+		if (map_flags & BPF_NOEXIST)
+			return -EEXIST;
+		else
+			return 0;
+	}
+
+	if (!__dev_map_find_bit(dtab, &bit))
+		return -ENOSPC;
+	dev = __dev_map_alloc_node(net, dtab, idx, bit);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	xchg(&dtab->netdev_map[bit], dev);
+	spin_lock(&dtab->index_lock);
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_map_index_hash(dtab, dev->ifindex));
+	spin_unlock(&dtab->index_lock);
+	return 0;
+}
+
+static int dev_map_idx_update_elem(struct bpf_map *map, void *key, void *value,
+				   u64 map_flags)
+{
+	return __dev_map_idx_update_elem(current->nsproxy->net_ns,
+					 map, key, value, map_flags);
+}
+
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
@@ -549,6 +751,16 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_check_btf = map_check_no_btf,
 };
 
+const struct bpf_map_ops dev_map_idx_ops = {
+	.map_alloc = dev_map_alloc,
+	.map_free = dev_map_free,
+	.map_get_next_key = dev_map_idx_get_next_key,
+	.map_lookup_elem = dev_map_idx_lookup_elem,
+	.map_update_elem = dev_map_idx_update_elem,
+	.map_delete_elem = dev_map_idx_delete_elem,
+	.map_check_btf = map_check_no_btf,
+};
+
 static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container *cont)
 {
 	struct netns_xdp *xdp = container_of(cont, struct netns_xdp, default_map);
@@ -583,8 +795,8 @@ void dev_map_put_default_map(struct net *net)
 static int __init_default_map(struct bpf_dtab_container *cont)
 {
 	struct net *net = bpf_default_map_to_net(cont);
+	int size = DEV_MAP_DEFAULT_SIZE, i = 0;
 	struct bpf_dtab *dtab, *old_dtab;
-	int size = DEV_MAP_DEFAULT_SIZE;
 	struct net_device *netdev;
 	union bpf_attr attr = {};
 	u32 idx;
@@ -596,7 +808,7 @@ static int __init_default_map(struct bpf_dtab_container *cont)
 		return 0;
 
 	for_each_netdev(net, netdev)
-		if (netdev->ifindex >= size)
+		if (++i >= size)
 			size <<= 1;
 
 	old_dtab = rcu_dereference(cont->dtab);
@@ -607,7 +819,7 @@ static int __init_default_map(struct bpf_dtab_container *cont)
 	if (!dtab)
 		return -ENOMEM;
 
-	attr.map_type = BPF_MAP_TYPE_DEVMAP;
+	attr.map_type = BPF_MAP_TYPE_DEVMAP_IDX;
 	attr.max_entries = size;
 	attr.value_size = 4;
 	attr.key_size = 4;
@@ -620,7 +832,7 @@ static int __init_default_map(struct bpf_dtab_container *cont)
 
 	for_each_netdev(net, netdev) {
 		idx = netdev->ifindex;
-		err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
+		err = __dev_map_idx_update_elem(net, &dtab->map, &idx, &idx, 0);
 		if (err) {
 			__dev_map_free(&dtab->rcu);
 			return err;
@@ -741,8 +953,8 @@ static int dev_map_notification(struct notifier_block *notifier,
 		rcu_read_lock();
 		dtab = rcu_dereference(net->xdp.default_map.dtab);
 		if (dtab) {
-			err = __dev_map_update_elem(net, &dtab->map,
-						    &idx, &idx, 0);
+			err = __dev_map_idx_update_elem(net, &dtab->map,
+							&idx, &idx, 0);
 			if (err == -E2BIG) {
 				spin_lock(&dev_map_lock);
 				err = __init_default_map(&net->xdp.default_map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f1b2f01e7ca1..5bdcb3c23014 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2576,6 +2576,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	 * for now.
 	 */
 	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_IDX:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -2648,6 +2649,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_DEVMAP_IDX &&
 		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
 		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
diff --git a/net/core/filter.c b/net/core/filter.c
index be02ea103d05..59f2e188e5ed 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3345,13 +3345,14 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	int err;
 
 	switch (map->map_type) {
-	case BPF_MAP_TYPE_DEVMAP: {
+	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_IDX: {
 		struct bpf_dtab_netdev *dst = fwd;
 
 		err = dev_map_enqueue(dst, xdp, dev_rx);
 		if (unlikely(err))
 			return err;
-		__dev_map_insert_ctx(map, index);
+		__dev_map_insert_ctx(map, dst);
 		break;
 	}
 	case BPF_MAP_TYPE_CPUMAP: {
@@ -3384,6 +3385,7 @@ void xdp_do_flush_map(void)
 	if (map) {
 		switch (map->map_type) {
 		case BPF_MAP_TYPE_DEVMAP:
+		case BPF_MAP_TYPE_DEVMAP_IDX:
 			__dev_map_flush(map);
 			break;
 		case BPF_MAP_TYPE_CPUMAP:
@@ -3404,6 +3406,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP:
 		return __dev_map_lookup_elem(map, index);
+	case BPF_MAP_TYPE_DEVMAP_IDX:
+		return __dev_map_idx_lookup_elem(map, index);
 	case BPF_MAP_TYPE_CPUMAP:
 		return __cpu_map_lookup_elem(map, index);
 	case BPF_MAP_TYPE_XSKMAP:
@@ -3494,7 +3498,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 		goto err;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
+	    map->map_type == BPF_MAP_TYPE_DEVMAP_IDX) {
 		struct bpf_dtab_netdev *dst = fwd;
 
 		err = dev_map_generic_redirect(dst, skb, xdp_prog);
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index e0c650d91784..0864ce33df94 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -37,6 +37,7 @@ const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_ARRAY_OF_MAPS]		= "array_of_maps",
 	[BPF_MAP_TYPE_HASH_OF_MAPS]		= "hash_of_maps",
 	[BPF_MAP_TYPE_DEVMAP]			= "devmap",
+	[BPF_MAP_TYPE_DEVMAP_IDX]		= "devmap_idx",
 	[BPF_MAP_TYPE_SOCKMAP]			= "sockmap",
 	[BPF_MAP_TYPE_CPUMAP]			= "cpumap",
 	[BPF_MAP_TYPE_XSKMAP]			= "xskmap",
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index bcdd2474eee7..2cdd384f2cbc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 	BPF_MAP_TYPE_QUEUE,
 	BPF_MAP_TYPE_STACK,
+	BPF_MAP_TYPE_DEVMAP_IDX,
 };
 
 /* Note that tracing related programs such as
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 8c3a1c04dcb2..b87b760a1355 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -172,6 +172,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_IDX:
 	case BPF_MAP_TYPE_SOCKMAP:
 	case BPF_MAP_TYPE_CPUMAP:
 	case BPF_MAP_TYPE_XSKMAP:
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 3c627771f965..63681e4647f9 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -519,6 +519,21 @@ static void test_devmap(unsigned int task, void *data)
 	close(fd);
 }
 
+static void test_devmap_idx(unsigned int task, void *data)
+{
+	int fd;
+	__u32 key, value;
+
+	fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_IDX, sizeof(key), sizeof(value),
+			    2, 0);
+	if (fd < 0) {
+		printf("Failed to create devmap_idx '%s'!\n", strerror(errno));
+		exit(1);
+	}
+
+	close(fd);
+}
+
 static void test_queuemap(unsigned int task, void *data)
 {
 	const int MAP_SIZE = 32;
@@ -1686,6 +1701,7 @@ static void run_all_tests(void)
 	test_arraymap_percpu_many_keys();
 
 	test_devmap(0, NULL);
+	test_devmap_idx(0, NULL);
 	test_sockmap(0, NULL);
 
 	test_map_large();


* Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions
  2019-03-01 14:12 ` [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions Toke Høiland-Jørgensen
@ 2019-03-02  1:08   ` Jakub Kicinski
  2019-03-04 12:47     ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2019-03-02  1:08 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> The subsequent commits introducing default maps and a hash-based ifindex
> devmap require a bit of refactoring of the devmap code. Perform this first
> so the subsequent commits become easier to read.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>

> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 191b79948424..1037fc08c504 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -75,6 +75,7 @@ struct bpf_dtab {
>  	struct bpf_dtab_netdev **netdev_map;
>  	unsigned long __percpu *flush_needed;
>  	struct list_head list;
> +	struct rcu_head rcu;

I think the RCU change may warrant a separate commit or at least a
mention in the commit message ;)

>  };
>  
>  static DEFINE_SPINLOCK(dev_map_lock);
> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>  	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>  }
>  
> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> +			    bool check_memlock)
>  {
> -	struct bpf_dtab *dtab;
> -	int err = -EINVAL;
>  	u64 cost;
> -
> -	if (!capable(CAP_NET_ADMIN))
> -		return ERR_PTR(-EPERM);
> -
> -	/* check sanity of attributes */
> -	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> -		return ERR_PTR(-EINVAL);

perhaps consider moving these to a map_alloc_check callback?
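
For illustration only (not something from this series), such a callback wired
up via .map_alloc_check in dev_map_ops could look roughly like:

    static int dev_map_alloc_check(union bpf_attr *attr)
    {
        if (!capable(CAP_NET_ADMIN))
            return -EPERM;

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            attr->value_size != 4 ||
            attr->map_flags & ~DEV_CREATE_FLAG_MASK)
            return -EINVAL;

        return 0;
    }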

> -	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> -	if (!dtab)
> -		return ERR_PTR(-ENOMEM);
> +	int err;
>  
>  	bpf_map_init_from_attr(&dtab->map, attr);
>  
> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>  	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>  	if (cost >= U32_MAX - PAGE_SIZE)
> -		goto free_dtab;
> +		return -EINVAL;
>  
>  	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>  
> -	/* if map size is larger than memlock limit, reject it early */
> -	err = bpf_map_precharge_memlock(dtab->map.pages);
> -	if (err)
> -		goto free_dtab;
> -
> -	err = -ENOMEM;
> +	if (check_memlock) {
> +		/* if map size is larger than memlock limit, reject it early */
> +		err = bpf_map_precharge_memlock(dtab->map.pages);
> +		if (err)
> +			return -EINVAL;
> +	}
>  
>  	/* A per cpu bitfield with a bit per possible net device */
>  	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>  						__alignof__(unsigned long),
>  						GFP_KERNEL | __GFP_NOWARN);
>  	if (!dtab->flush_needed)
> -		goto free_dtab;
> +		goto err_alloc;
>  
>  	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>  					      sizeof(struct bpf_dtab_netdev *),
>  					      dtab->map.numa_node);
>  	if (!dtab->netdev_map)
> -		goto free_dtab;
> +		goto err_map;
>  
> -	spin_lock(&dev_map_lock);
> -	list_add_tail_rcu(&dtab->list, &dev_map_list);
> -	spin_unlock(&dev_map_lock);
> +	return 0;
>  
> -	return &dtab->map;
> -free_dtab:
> + err_map:

The label should name the first action.  So it was correct, you're
making it less good :)  Also why the space?

>  	free_percpu(dtab->flush_needed);
> -	kfree(dtab);
> -	return ERR_PTR(err);
> + err_alloc:

and no need for this one
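
I.e. return -ENOMEM directly on the first failure and keep a single label
named after what it frees, something like (untested):

	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
						__alignof__(unsigned long),
						GFP_KERNEL | __GFP_NOWARN);
	if (!dtab->flush_needed)
		return -ENOMEM;

	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
					      sizeof(struct bpf_dtab_netdev *),
					      dtab->map.numa_node);
	if (!dtab->netdev_map)
		goto free_percpu;

	return 0;

free_percpu:
	free_percpu(dtab->flush_needed);
	return -ENOMEM;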

> +	return -ENOMEM;
>  }
>  
> -static void dev_map_free(struct bpf_map *map)
> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  {
> -	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> -	int i, cpu;
> +	struct bpf_dtab *dtab;
> +	int err;
>  
> -	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> -	 * so the programs (can be more than one that used this map) were
> -	 * disconnected from events. Wait for outstanding critical sections in
> -	 * these programs to complete. The rcu critical section only guarantees
> -	 * no further reads against netdev_map. It does __not__ ensure pending
> -	 * flush operations (if any) are complete.
> -	 */
> +	if (!capable(CAP_NET_ADMIN))
> +		return ERR_PTR(-EPERM);
> +
> +	/* check sanity of attributes */
> +	if (attr->max_entries == 0 || attr->key_size != 4 ||
> +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> +		return ERR_PTR(-EINVAL);
> +
> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> +	if (!dtab)
> +		return ERR_PTR(-ENOMEM);
> +
> +	err = dev_map_init_map(dtab, attr, true);
> +	if (err) {
> +		kfree(dtab);
> +		return ERR_PTR(err);
> +	}
>  
>  	spin_lock(&dev_map_lock);
> -	list_del_rcu(&dtab->list);
> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>  	spin_unlock(&dev_map_lock);
>  
> -	bpf_clear_redirect_map(map);
> -	synchronize_rcu();
> +	return &dtab->map;
> +}
> +
> +static void __dev_map_free(struct rcu_head *rcu)
> +{
> +	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
> +	int i, cpu;
>  
>  	/* To ensure all pending flush operations have completed wait for flush
>  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)

There is a cond_resched() here, I don't think you can call
cond_resched() from an RCU callback.

>  	kfree(dtab);
>  }
>  
> +static void dev_map_free(struct bpf_map *map)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +
> +	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> +	 * so the programs (can be more than one that used this map) were
> +	 * disconnected from events. Wait for outstanding critical sections in
> +	 * these programs to complete. The rcu critical section only guarantees
> +	 * no further reads against netdev_map. It does __not__ ensure pending
> +	 * flush operations (if any) are complete.
> +	 */
> +
> +	spin_lock(&dev_map_lock);
> +	list_del_rcu(&dtab->list);
> +	spin_unlock(&dev_map_lock);
> +
> +	bpf_clear_redirect_map(map);
> +	call_rcu(&dtab->rcu, __dev_map_free);
> +}
> +
>  static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-01 14:12 ` [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device Toke Høiland-Jørgensen
@ 2019-03-02  2:09   ` Jakub Kicinski
  2019-03-04 11:58     ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2019-03-02  2:09 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> An XDP program can redirect packets between interfaces using either the
> xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
> flexibility of updating maps from userspace, the redirect_map() helper also
> uses the map structure to batch packets, which results in a significant
> (around 50%) performance boost. However, the xdp_redirect() API is simpler
> if one just wants to redirect to another interface, which means people tend
to use this interface and then wonder why they get worse performance
> than expected.
> 
> This patch seeks to close this performance difference between the two APIs.
> It achieves this by changing xdp_redirect() to use a hidden devmap for
> looking up destination interfaces, thus gaining the batching benefit with
> no visible difference from the user API point of view.
> 
> A hidden per-namespace map is allocated when an XDP program that uses the
> non-map xdp_redirect() helper is first loaded. This map is populated with
> all available interfaces in its namespace, and kept up to date as
> interfaces come and go. Once allocated, the map is kept around until the
> namespace is removed.
> 
> The hidden map uses the ifindex as map key, which means they are limited to
> ifindexes smaller than the map size of 64. A later patch introduces a new
> map type to lift this restriction.
> 
> Performance numbers:
> 
> Before patch:
> xdp_redirect:     5426035 pkt/s
> xdp_redirect_map: 8412754 pkt/s
> 
> After patch:
> xdp_redirect:     8314702 pkt/s
> xdp_redirect_map: 8411854 pkt/s
> 
> This corresponds to a 53% increase in xdp_redirect performance, or a
> reduction in per-packet processing time by 64 nanoseconds.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index de18227b3d95..060c5850af3b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -25,6 +25,7 @@ struct sock;
>  struct seq_file;
>  struct btf;
>  struct btf_type;
> +struct net;
>  
>  /* map is generic key/value storage optionally accesible by eBPF programs */
>  struct bpf_map_ops {
> @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
>  extern const struct bpf_verifier_ops xdp_analyzer_ops;
>  
>  struct bpf_prog *bpf_prog_get(u32 ufd);
> +struct bpf_prog *bpf_prog_get_by_id(u32 id);
>  struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
>  				       bool attach_drv);
>  struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
> @@ -612,6 +614,11 @@ struct xdp_buff;
>  struct sk_buff;
>  
>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
> +int dev_map_ensure_default_map(struct net *net);
> +void dev_map_put_default_map(struct net *net);
> +int dev_map_inc_redirect_count(void);
> +void dev_map_dec_redirect_count(void);
>  void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
>  void __dev_map_flush(struct bpf_map *map);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>  	return ERR_PTR(-EOPNOTSUPP);
>  }
>  
> +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
> +{
> +	return ERR_PTR(-EOPNOTSUPP);
> +}
> +
>  static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
>  						     enum bpf_prog_type type,
>  						     bool attach_drv)
> @@ -693,6 +705,29 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
>  	return NULL;
>  }
>  
> +static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> +	return NULL;
> +}
> +
> +static inline int dev_map_ensure_default_map(struct net *net)
> +{
> +	return 0;
> +}
> +
> +static inline void dev_map_put_default_map(struct net *net)
> +{
> +}
> +
> +static inline int dev_map_inc_redirect_count(void)
> +{
> +	return 0;
> +}
> +
> +static inline void dev_map_dec_redirect_count(void)
> +{
> +}
> +
>  static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
>  {
>  }
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 95e2d7ebdf21..dd6bbbab32f7 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -507,6 +507,8 @@ struct bpf_prog {
>  				gpl_compatible:1, /* Is filter GPL compatible? */
>  				cb_access:1,	/* Is control block accessed? */
>  				dst_needed:1,	/* Do we need dst entry? */
> +				redirect_needed:1,	/* Does program need access to xdp_redirect? */
> +				redirect_used:1,	/* Does program use xdp_redirect? */
>  				blinded:1,	/* Was blinded */
>  				is_func:1,	/* program is a bpf function */
>  				kprobe_override:1, /* Do we override a kprobe? */
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index a68ced28d8f4..6706ecc25d8f 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -162,7 +162,7 @@ struct net {
>  #if IS_ENABLED(CONFIG_CAN)
>  	struct netns_can	can;
>  #endif
> -#ifdef CONFIG_XDP_SOCKETS
> +#ifdef CONFIG_BPF_SYSCALL
>  	struct netns_xdp	xdp;
>  #endif
>  	struct sock		*diag_nlsk;
> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
> index e5734261ba0a..4935dfe1cf43 100644
> --- a/include/net/netns/xdp.h
> +++ b/include/net/netns/xdp.h
> @@ -4,10 +4,21 @@
>  
>  #include <linux/rculist.h>
>  #include <linux/mutex.h>
> +#include <linux/atomic.h>
> +
> +struct bpf_dtab;
> +
> +struct bpf_dtab_container {
> +	struct bpf_dtab __rcu *dtab;
> +	atomic_t refcnt;

refcount_t ?  I'm not sure what the rules are for non-performance
critical stuff..

> +};
>  
>  struct netns_xdp {
> +#ifdef CONFIG_XDP_SOCKETS
>  	struct mutex		lock;
>  	struct hlist_head	list;
> +#endif
> +	struct bpf_dtab_container default_map;
>  };
>  
>  #endif /* __NETNS_XDP_H__ */
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 1037fc08c504..e55707e62b60 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -56,6 +56,9 @@
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>  
>  #define DEV_MAP_BULK_SIZE 16
> +#define DEV_MAP_DEFAULT_SIZE 8
> +#define BPF_MAX_REFCNT 32768
> +
>  struct xdp_bulk_queue {
>  	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
>  	struct net_device *dev_rx;
> @@ -80,6 +83,7 @@ struct bpf_dtab {
>  
>  static DEFINE_SPINLOCK(dev_map_lock);
>  static LIST_HEAD(dev_map_list);
> +static atomic_t global_redirect_use = {};

REFCOUNT_INIT(0) or ATOMIC_INIT(0)

>  static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>  {
> @@ -332,6 +336,18 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
>  	return obj;
>  }
>  
> +/* This is only being called from xdp_do_redirect() if the xdp_redirect helper
> + * is used; the default map is allocated on XDP program load if the helper is
> + * used, so will always be available at this point.
> + */

Yet it does check for NULL..?

> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> +	struct net *net = dev_net(dev);
> +	struct bpf_dtab *dtab = rcu_dereference(net->xdp.default_map.dtab);

This init is long and breaks the reverse xmas tree rule; separate line perhaps?
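
I.e. keep the declarations sorted longest first and do the rcu_dereference()
as a separate statement:

	struct net *net = dev_net(dev);
	struct bpf_dtab *dtab;

	dtab = rcu_dereference(net->xdp.default_map.dtab);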

> +	return dtab ? &dtab->map : NULL;
> +}
> +
>  /* Runs under RCU-read-side, plus in softirq under NAPI protection.
>   * Thus, safe percpu variable access.
>   */
> @@ -533,14 +549,210 @@ const struct bpf_map_ops dev_map_ops = {
>  	.map_check_btf = map_check_no_btf,
>  };
>  
> +static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container *cont)
> +{
> +	struct netns_xdp *xdp = container_of(cont, struct netns_xdp, default_map);
> +
> +	return container_of(xdp, struct net, xdp);
> +}
> +
> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
> +{
> +	struct bpf_dtab *dtab = NULL;
> +
> +	lockdep_assert_held(&dev_map_lock);
> +
> +	dtab = rcu_dereference(cont->dtab);
> +	if (dtab) {

How can it be NULL if it's refcounted? hm..

> +		list_del_rcu(&dtab->list);
> +		rcu_assign_pointer(cont->dtab, NULL);
> +		bpf_clear_redirect_map(&dtab->map);
> +		call_rcu(&dtab->rcu, __dev_map_free);
> +	}
> +}
> +
> +void dev_map_put_default_map(struct net *net)
> +{
> +	if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
> +		spin_lock(&dev_map_lock);
> +		__dev_map_release_default_map(&net->xdp.default_map);
> +		spin_unlock(&dev_map_lock);
> +	}
> +}
> +
> +static int __init_default_map(struct bpf_dtab_container *cont)
> +{
> +	struct net *net = bpf_default_map_to_net(cont);
> +	struct bpf_dtab *dtab, *old_dtab;
> +	int size = DEV_MAP_DEFAULT_SIZE;
> +	struct net_device *netdev;
> +	union bpf_attr attr = {};
> +	u32 idx;
> +	int err;
> +
> +	lockdep_assert_held(&dev_map_lock);
> +
> +	if (!atomic_read(&global_redirect_use))
> +		return 0;
> +
> +	for_each_netdev(net, netdev)
> +		if (netdev->ifindex >= size)
> +			size <<= 1;
> +
> +	old_dtab = rcu_dereference(cont->dtab);
> +	if (old_dtab && old_dtab->map.max_entries == size)
> +		return 0;
> +
> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);

You are calling this with a spin lock held :(

> +	if (!dtab)
> +		return -ENOMEM;
> +
> +	attr.map_type = BPF_MAP_TYPE_DEVMAP;
> +	attr.max_entries = size;
> +	attr.value_size = 4;
> +	attr.key_size = 4;
> +
> +	err = dev_map_init_map(dtab, &attr, false);
> +	if (err) {
> +		kfree(dtab);
> +		return err;
> +	}
> +
> +	for_each_netdev(net, netdev) {
> +		idx = netdev->ifindex;
> +		err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
> +		if (err) {
> +			__dev_map_free(&dtab->rcu);

This map wasn't visible yet, so probably no need to go through RCU? 
Not that it matters much for an error path..

> +			return err;
> +		}
> +	}
> +
> +	rcu_assign_pointer(cont->dtab, dtab);
> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
> +
> +	if (old_dtab) {
> +		list_del_rcu(&old_dtab->list);
> +		bpf_clear_redirect_map(&old_dtab->map);
> +		call_rcu(&old_dtab->rcu, __dev_map_free);
> +	}
> +
> +	return 0;
> +}
> +

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1b9496c41383..f1b2f01e7ca1 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>  			prog->dst_needed = 1;
>  		if (insn->imm == BPF_FUNC_get_prandom_u32)
>  			bpf_user_rnd_init_once();
> +		if (insn->imm == BPF_FUNC_redirect) {
> +			prog->redirect_needed = true;

nit: this field is a bitfield not boolean

> +			if (!prog->redirect_used) {
> +				int err = dev_map_inc_redirect_count();

Don't call complex functions as variable init, please

> +				if (err)
> +					return err;
> +				prog->redirect_used = true;
> +			}
> +		}
> +
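
FWIW, addressing both nits above would look something like (untested):

	if (insn->imm == BPF_FUNC_redirect) {
		prog->redirect_needed = 1;

		if (!prog->redirect_used) {
			int err;

			err = dev_map_inc_redirect_count();
			if (err)
				return err;
			prog->redirect_used = 1;
		}
	}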
>  		if (insn->imm == BPF_FUNC_override_return)
>  			prog->kprobe_override = 1;
>  		if (insn->imm == BPF_FUNC_tail_call) {
> @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>  			 * the program array.
>  			 */
>  			prog->cb_access = 1;
> +			prog->redirect_needed = true;
>  			env->prog->aux->stack_depth = MAX_BPF_STACK;
>  			env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 2b67f2aa59dd..1df20d529026 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
>  	return xdp.prog_id;
>  }
>  
> +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
> +{
> +	struct bpf_prog *prog;
> +	u32 prog_id;
> +
> +	prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);

if (!prog_id)
	return NULL;

...

> +	if (prog_id) {
> +		prog = bpf_prog_get_by_id(prog_id);
> +		if (!IS_ERR(prog))
> +			return prog;
> +	}
> +
> +	return NULL;
> +}
> +
>  static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
>  			   struct netlink_ext_ack *extack, u32 flags,
>  			   struct bpf_prog *prog)
> @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev)
>  	memset(&xdp, 0, sizeof(xdp));
>  	xdp.command = XDP_QUERY_PROG;
>  	WARN_ON(ndo_bpf(dev, &xdp));
> -	if (xdp.prog_id)
> +	if (xdp.prog_id) {
> +		struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id);
> +
> +		if (!IS_ERR(prog)) {
> +			if (prog->redirect_needed)
> +				dev_map_put_default_map(dev_net(dev));
> +			bpf_prog_put(prog);
> +		}
> +
>  		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
>  					NULL));
> +	}
>  
>  	/* Remove HW offload */
>  	memset(&xdp, 0, sizeof(xdp));
> @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>  			bpf_prog_put(prog);
>  			return -EINVAL;
>  		}
> +
> +		if (!offload && bpf_op == ops->ndo_bpf &&
> +		    prog->redirect_needed) {
> +			err = dev_map_ensure_default_map(dev_net(dev));
> +			if (err) {
> +				NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()");
> +				return err;
> +			}
> +		}
> +	} else {
> +		struct bpf_prog *old_prog = dev_xdp_get_prog(dev);
> +
> +		if (old_prog) {
> +			if (old_prog->redirect_needed)
> +				dev_map_put_default_map(dev_net(dev));

I think you should hold the ref to this program and only release the
map _after_ new program was installed.

You also have to do this for replacing a program with a different one.
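
Something along these lines, so the default map can't go away while the old
program may still be running (rough sketch, reusing the helpers from this
patch):

	struct bpf_prog *old_prog = dev_xdp_get_prog(dev);

	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);

	/* drop the old program's hold on the default map only once the
	 * new program is in place (and only if the install succeeded)
	 */
	if (!err && old_prog && old_prog->redirect_needed)
		dev_map_put_default_map(dev_net(dev));
	if (old_prog)
		bpf_prog_put(old_prog);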

Could you please add tests for the replacement combinations?

> +			bpf_prog_put(old_prog);
> +		}
>  	}

>  	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
> @@ -9333,6 +9374,7 @@ EXPORT_SYMBOL(unregister_netdev);
>  int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
>  {
>  	int err, new_nsid, new_ifindex;
> +	struct bpf_prog *prog = NULL;
>  
>  	ASSERT_RTNL();
>  
> @@ -9350,6 +9392,15 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	if (net_eq(dev_net(dev), net))
>  		goto out;
>  
> +	prog = dev_xdp_get_prog(dev);
> +	if (prog) {
> +		if (prog->redirect_needed)
> +			err = dev_map_ensure_default_map(net);
> +
> +		if (err)
> +			goto out;
> +	}

prog = dev_xdp_get_prog(dev);
if (prog && prog->redirect_needed) {
	err = dev_map_ensure_default_map(net);
	if (err)
		goto out;
}

>  	/* Pick the destination device name, and ensure
>  	 * we can use it in the destination network namespace.
>  	 */
> @@ -9388,6 +9439,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
>  	rcu_barrier();
>  
> +	if (prog && prog->redirect_needed)
> +		dev_map_put_default_map(dev_net(dev));
> +
>  	new_nsid = peernet2id_alloc(dev_net(dev), net);
>  	/* If there is an ifindex conflict assign a new one */
>  	if (__dev_get_by_index(net, dev->ifindex))
> @@ -9435,6 +9489,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	synchronize_net();
>  	err = 0;
>  out:
> +	if (prog)
> +		bpf_prog_put(prog);
> +
>  	return err;
>  }
>  EXPORT_SYMBOL_GPL(dev_change_net_namespace);


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-02  2:09   ` Jakub Kicinski
@ 2019-03-04 11:58     ` Toke Høiland-Jørgensen
  2019-03-04 17:44       ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-04 11:58 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Hi Jakub

Thanks for the review! A few comments/questions below.

> On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
>> An XDP program can redirect packets between interfaces using either the
>> xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
>> flexibility of updating maps from userspace, the redirect_map() helper also
>> uses the map structure to batch packets, which results in a significant
>> (around 50%) performance boost. However, the xdp_redirect() API is simpler
>> if one just wants to redirect to another interface, which means people tend
>> to use this interface and then wonder why they get worse performance
>> than expected.
>> 
>> This patch seeks to close this performance difference between the two APIs.
>> It achieves this by changing xdp_redirect() to use a hidden devmap for
>> looking up destination interfaces, thus gaining the batching benefit with
>> no visible difference from the user API point of view.
>> 
>> A hidden per-namespace map is allocated when an XDP program that uses the
>> non-map xdp_redirect() helper is first loaded. This map is populated with
>> all available interfaces in its namespace, and kept up to date as
>> interfaces come and go. Once allocated, the map is kept around until the
>> namespace is removed.
>> 
>> The hidden map uses the ifindex as map key, which means they are limited to
>> ifindexes smaller than the map size of 64. A later patch introduces a new
>> map type to lift this restriction.
>> 
>> Performance numbers:
>> 
>> Before patch:
>> xdp_redirect:     5426035 pkt/s
>> xdp_redirect_map: 8412754 pkt/s
>> 
>> After patch:
>> xdp_redirect:     8314702 pkt/s
>> xdp_redirect_map: 8411854 pkt/s
>> 
>> This corresponds to a 53% increase in xdp_redirect performance, or a
>> reduction in per-packet processing time by 64 nanoseconds.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index de18227b3d95..060c5850af3b 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -25,6 +25,7 @@ struct sock;
>>  struct seq_file;
>>  struct btf;
>>  struct btf_type;
>> +struct net;
>>  
>>  /* map is generic key/value storage optionally accesible by eBPF programs */
>>  struct bpf_map_ops {
>> @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
>>  extern const struct bpf_verifier_ops xdp_analyzer_ops;
>>  
>>  struct bpf_prog *bpf_prog_get(u32 ufd);
>> +struct bpf_prog *bpf_prog_get_by_id(u32 id);
>>  struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
>>  				       bool attach_drv);
>>  struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
>> @@ -612,6 +614,11 @@ struct xdp_buff;
>>  struct sk_buff;
>>  
>>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
>> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
>> +int dev_map_ensure_default_map(struct net *net);
>> +void dev_map_put_default_map(struct net *net);
>> +int dev_map_inc_redirect_count(void);
>> +void dev_map_dec_redirect_count(void);
>>  void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
>>  void __dev_map_flush(struct bpf_map *map);
>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>> @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>>  	return ERR_PTR(-EOPNOTSUPP);
>>  }
>>  
>> +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
>> +{
>> +	return ERR_PTR(-EOPNOTSUPP);
>> +}
>> +
>>  static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
>>  						     enum bpf_prog_type type,
>>  						     bool attach_drv)
>> @@ -693,6 +705,29 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
>>  	return NULL;
>>  }
>>  
>> +static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
>> +{
>> +	return NULL;
>> +}
>> +
>> +static inline int dev_map_ensure_default_map(struct net *net)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void dev_map_put_default_map(struct net *net)
>> +{
>> +}
>> +
>> +static inline int dev_map_inc_redirect_count(void)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void dev_map_dec_redirect_count(void)
>> +{
>> +}
>> +
>>  static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
>>  {
>>  }
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 95e2d7ebdf21..dd6bbbab32f7 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -507,6 +507,8 @@ struct bpf_prog {
>>  				gpl_compatible:1, /* Is filter GPL compatible? */
>>  				cb_access:1,	/* Is control block accessed? */
>>  				dst_needed:1,	/* Do we need dst entry? */
>> +				redirect_needed:1,	/* Does program need access to xdp_redirect? */
>> +				redirect_used:1,	/* Does program use xdp_redirect? */
>>  				blinded:1,	/* Was blinded */
>>  				is_func:1,	/* program is a bpf function */
>>  				kprobe_override:1, /* Do we override a kprobe? */
>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
>> index a68ced28d8f4..6706ecc25d8f 100644
>> --- a/include/net/net_namespace.h
>> +++ b/include/net/net_namespace.h
>> @@ -162,7 +162,7 @@ struct net {
>>  #if IS_ENABLED(CONFIG_CAN)
>>  	struct netns_can	can;
>>  #endif
>> -#ifdef CONFIG_XDP_SOCKETS
>> +#ifdef CONFIG_BPF_SYSCALL
>>  	struct netns_xdp	xdp;
>>  #endif
>>  	struct sock		*diag_nlsk;
>> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
>> index e5734261ba0a..4935dfe1cf43 100644
>> --- a/include/net/netns/xdp.h
>> +++ b/include/net/netns/xdp.h
>> @@ -4,10 +4,21 @@
>>  
>>  #include <linux/rculist.h>
>>  #include <linux/mutex.h>
>> +#include <linux/atomic.h>
>> +
>> +struct bpf_dtab;
>> +
>> +struct bpf_dtab_container {
>> +	struct bpf_dtab __rcu *dtab;
>> +	atomic_t refcnt;
>
> refcount_t ?  I'm not sure what the rules are for non-performance
> critical stuff..

I based the use of atomic_t on what bpf/syscall.c is doing. The problem
with using refcount_t is that it's a bit of a weird refcount because
it's not strictly tied to the object lifetime (as the objects are only
allocated if *both* the globals and the per-namespace refcnt are >0).


[... snip things that I'll just fix ...]
>
>> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
>> +{
>> +	struct bpf_dtab *dtab = NULL;
>> +
>> +	lockdep_assert_held(&dev_map_lock);
>> +
>> +	dtab = rcu_dereference(cont->dtab);
>> +	if (dtab) {
>
> How can it be NULL if it's refcounted? hm..

See above; it's not actually the object that is refcounted, it's a count
of the number of active XDP programs in the namespace, which still needs
to be kept so we can allocate the devmaps when a REDIRECT program is
loaded.

>> +		list_del_rcu(&dtab->list);
>> +		rcu_assign_pointer(cont->dtab, NULL);
>> +		bpf_clear_redirect_map(&dtab->map);
>> +		call_rcu(&dtab->rcu, __dev_map_free);
>> +	}
>> +}
>> +
>> +void dev_map_put_default_map(struct net *net)
>> +{
>> +	if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
>> +		spin_lock(&dev_map_lock);
>> +		__dev_map_release_default_map(&net->xdp.default_map);
>> +		spin_unlock(&dev_map_lock);
>> +	}
>> +}
>> +
>> +static int __init_default_map(struct bpf_dtab_container *cont)
>> +{
>> +	struct net *net = bpf_default_map_to_net(cont);
>> +	struct bpf_dtab *dtab, *old_dtab;
>> +	int size = DEV_MAP_DEFAULT_SIZE;
>> +	struct net_device *netdev;
>> +	union bpf_attr attr = {};
>> +	u32 idx;
>> +	int err;
>> +
>> +	lockdep_assert_held(&dev_map_lock);
>> +
>> +	if (!atomic_read(&global_redirect_use))
>> +		return 0;
>> +
>> +	for_each_netdev(net, netdev)
>> +		if (netdev->ifindex >= size)
>> +			size <<= 1;
>> +
>> +	old_dtab = rcu_dereference(cont->dtab);
>> +	if (old_dtab && old_dtab->map.max_entries == size)
>> +		return 0;
>> +
>> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>
> You are calling this with a spin lock held :(

Yeah, not just this, there are more allocations in dev_map_init_map().
Hmm, I guess I need to think about the concurrency bits some more; it is
made a bit tricky by the refcounts being outside the objects.

Basically what I'm trying to protect against is the case where the
global refcount goes to zero and then immediately back to one (or vice
versa), which results in a deallocation immediately followed by an
allocation. I was worried about the allocation and deallocation racing
with each other (because there's a window after the refcnt is checked
before the (de)allocation begins). It's not quite clear to me whether
RCU is enough to protect against this... Thoughts?

>> +	if (!dtab)
>> +		return -ENOMEM;
>> +
>> +	attr.map_type = BPF_MAP_TYPE_DEVMAP;
>> +	attr.max_entries = size;
>> +	attr.value_size = 4;
>> +	attr.key_size = 4;
>> +
>> +	err = dev_map_init_map(dtab, &attr, false);
>> +	if (err) {
>> +		kfree(dtab);
>> +		return err;
>> +	}
>> +
>> +	for_each_netdev(net, netdev) {
>> +		idx = netdev->ifindex;
>> +		err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
>> +		if (err) {
>> +			__dev_map_free(&dtab->rcu);
>
> This map wasn't visible yet, so probably no need to go through RCU? 
> Not that it matters much for an error path..

It's not actually using call_rcu(), it's just calling a function that
expects a struct rcu_head as a parameter (to avoid another wrapper function) :)

>> +			return err;
>> +		}
>> +	}
>> +
>> +	rcu_assign_pointer(cont->dtab, dtab);
>> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>> +
>> +	if (old_dtab) {
>> +		list_del_rcu(&old_dtab->list);
>> +		bpf_clear_redirect_map(&old_dtab->map);
>> +		call_rcu(&old_dtab->rcu, __dev_map_free);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 1b9496c41383..f1b2f01e7ca1 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>>  			prog->dst_needed = 1;
>>  		if (insn->imm == BPF_FUNC_get_prandom_u32)
>>  			bpf_user_rnd_init_once();
>> +		if (insn->imm == BPF_FUNC_redirect) {
>> +			prog->redirect_needed = true;
>
> nit: this field is a bitfield not boolean
>
>> +			if (!prog->redirect_used) {
>> +				int err = dev_map_inc_redirect_count();
>
> Don't call complex functions as variable init, please
>
>> +				if (err)
>> +					return err;
>> +				prog->redirect_used = true;
>> +			}
>> +		}
>> +
>>  		if (insn->imm == BPF_FUNC_override_return)
>>  			prog->kprobe_override = 1;
>>  		if (insn->imm == BPF_FUNC_tail_call) {
>> @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>>  			 * the program array.
>>  			 */
>>  			prog->cb_access = 1;
>> +			prog->redirect_needed = true;
>>  			env->prog->aux->stack_depth = MAX_BPF_STACK;
>>  			env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
>>  
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 2b67f2aa59dd..1df20d529026 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
>>  	return xdp.prog_id;
>>  }
>>  
>> +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
>> +{
>> +	struct bpf_prog *prog;
>> +	u32 prog_id;
>> +
>> +	prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
>
> if (!prog_id)
> 	return NULL;
>
> ...
>
>> +	if (prog_id) {
>> +		prog = bpf_prog_get_by_id(prog_id);
>> +		if (!IS_ERR(prog))
>> +			return prog;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>>  static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
>>  			   struct netlink_ext_ack *extack, u32 flags,
>>  			   struct bpf_prog *prog)
>> @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev)
>>  	memset(&xdp, 0, sizeof(xdp));
>>  	xdp.command = XDP_QUERY_PROG;
>>  	WARN_ON(ndo_bpf(dev, &xdp));
>> -	if (xdp.prog_id)
>> +	if (xdp.prog_id) {
>> +		struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id);
>> +
>> +		if (!IS_ERR(prog)) {
>> +			if (prog->redirect_needed)
>> +				dev_map_put_default_map(dev_net(dev));
>> +			bpf_prog_put(prog);
>> +		}
>> +
>>  		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
>>  					NULL));
>> +	}
>>  
>>  	/* Remove HW offload */
>>  	memset(&xdp, 0, sizeof(xdp));
>> @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>>  			bpf_prog_put(prog);
>>  			return -EINVAL;
>>  		}
>> +
>> +		if (!offload && bpf_op == ops->ndo_bpf &&
>> +		    prog->redirect_needed) {
>> +			err = dev_map_ensure_default_map(dev_net(dev));
>> +			if (err) {
>> +				NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()");
>> +				return err;
>> +			}
>> +		}
>> +	} else {
>> +		struct bpf_prog *old_prog = dev_xdp_get_prog(dev);
>> +
>> +		if (old_prog) {
>> +			if (old_prog->redirect_needed)
>> +				dev_map_put_default_map(dev_net(dev));
>
> I think you should hold the ref to this program and only release the
> map _after_ new program was installed.
>
> You also have to do this for replacing a program with a different one.

Ah, right, good points.

> Could you please add tests for the replacement combinations?

Sure. But, erm, where? selftests? :)

[... snip a few more things that I'll just fix ...]

-Toke


* Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions
  2019-03-02  1:08   ` Jakub Kicinski
@ 2019-03-04 12:47     ` Toke Høiland-Jørgensen
  2019-03-04 17:08       ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-04 12:47 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
>> The subsequent commits introducing default maps and a hash-based ifindex
>> devmap require a bit of refactoring of the devmap code. Perform this first
>> so the subsequent commits become easier to read.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>
>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> index 191b79948424..1037fc08c504 100644
>> --- a/kernel/bpf/devmap.c
>> +++ b/kernel/bpf/devmap.c
>> @@ -75,6 +75,7 @@ struct bpf_dtab {
>>  	struct bpf_dtab_netdev **netdev_map;
>>  	unsigned long __percpu *flush_needed;
>>  	struct list_head list;
>> +	struct rcu_head rcu;
>
> I think the RCU change may warrant a separate commit or at least a
> mention in the commit message ;)

Heh, fair point.

>>  };
>>  
>>  static DEFINE_SPINLOCK(dev_map_lock);
>> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>>  	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>>  }
>>  
>> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>> +			    bool check_memlock)
>>  {
>> -	struct bpf_dtab *dtab;
>> -	int err = -EINVAL;
>>  	u64 cost;
>> -
>> -	if (!capable(CAP_NET_ADMIN))
>> -		return ERR_PTR(-EPERM);
>> -
>> -	/* check sanity of attributes */
>> -	if (attr->max_entries == 0 || attr->key_size != 4 ||
>> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>> -		return ERR_PTR(-EINVAL);
>
> perhaps consider moving these to a map_alloc_check callback?

The reason I kept them here is that these checks are only needed
strictly when the parameters come from userspace. In the other
allocation paths, the attrs are hard-coded to valid values, so the only
thing having the check would do is guard against future changes to one
part of the file being out of sync with the other part. I can certainly
add the check, it just seemed a bit excessive...

>> -	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>> -	if (!dtab)
>> -		return ERR_PTR(-ENOMEM);
>> +	int err;
>>  
>>  	bpf_map_init_from_attr(&dtab->map, attr);
>>  
>> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>>  	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>>  	if (cost >= U32_MAX - PAGE_SIZE)
>> -		goto free_dtab;
>> +		return -EINVAL;
>>  
>>  	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>>  
>> -	/* if map size is larger than memlock limit, reject it early */
>> -	err = bpf_map_precharge_memlock(dtab->map.pages);
>> -	if (err)
>> -		goto free_dtab;
>> -
>> -	err = -ENOMEM;
>> +	if (check_memlock) {
>> +		/* if map size is larger than memlock limit, reject it early */
>> +		err = bpf_map_precharge_memlock(dtab->map.pages);
>> +		if (err)
>> +			return -EINVAL;
>> +	}
>>  
>>  	/* A per cpu bitfield with a bit per possible net device */
>>  	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>>  						__alignof__(unsigned long),
>>  						GFP_KERNEL | __GFP_NOWARN);
>>  	if (!dtab->flush_needed)
>> -		goto free_dtab;
>> +		goto err_alloc;
>>  
>>  	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>>  					      sizeof(struct bpf_dtab_netdev *),
>>  					      dtab->map.numa_node);
>>  	if (!dtab->netdev_map)
>> -		goto free_dtab;
>> +		goto err_map;
>>  
>> -	spin_lock(&dev_map_lock);
>> -	list_add_tail_rcu(&dtab->list, &dev_map_list);
>> -	spin_unlock(&dev_map_lock);
>> +	return 0;
>>  
>> -	return &dtab->map;
>> -free_dtab:
>> + err_map:
>
> The label should name the first action.  So it was correct, you're
> making it less good :)

Heh, yeah, guess you're right.

> Also why the space?

Because the mighty GNU ordained it (i.e., Emacs added those). Will fix.

>>  	free_percpu(dtab->flush_needed);
>> -	kfree(dtab);
>> -	return ERR_PTR(err);
>> + err_alloc:
>
> and no need for this one

Right, guess I can just rely on the NULL check in the free functions;
would make things simpler in patch 3 as well :)

>> +	return -ENOMEM;
>>  }
>>  
>> -static void dev_map_free(struct bpf_map *map)
>> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>  {
>> -	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> -	int i, cpu;
>> +	struct bpf_dtab *dtab;
>> +	int err;
>>  
>> -	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
>> -	 * so the programs (can be more than one that used this map) were
>> -	 * disconnected from events. Wait for outstanding critical sections in
>> -	 * these programs to complete. The rcu critical section only guarantees
>> -	 * no further reads against netdev_map. It does __not__ ensure pending
>> -	 * flush operations (if any) are complete.
>> -	 */
>> +	if (!capable(CAP_NET_ADMIN))
>> +		return ERR_PTR(-EPERM);
>> +
>> +	/* check sanity of attributes */
>> +	if (attr->max_entries == 0 || attr->key_size != 4 ||
>> +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>> +	if (!dtab)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	err = dev_map_init_map(dtab, attr, true);
>> +	if (err) {
>> +		kfree(dtab);
>> +		return ERR_PTR(err);
>> +	}
>>  
>>  	spin_lock(&dev_map_lock);
>> -	list_del_rcu(&dtab->list);
>> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>>  	spin_unlock(&dev_map_lock);
>>  
>> -	bpf_clear_redirect_map(map);
>> -	synchronize_rcu();
>> +	return &dtab->map;
>> +}
>> +
>> +static void __dev_map_free(struct rcu_head *rcu)
>> +{
>> +	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
>> +	int i, cpu;
>>  
>>  	/* To ensure all pending flush operations have completed wait for flush
>>  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
>> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)
>
> There is a cond_resched() here, I don't think you can call
> cond_resched() from an RCU callback.

Hmm, bugger. I thought I could just use call_rcu() as semantically
equivalent (but slightly slower) to just calling synchronize_rcu()
inside the function.

In an earlier version I had a namespace enter/exit notifier in devmap.c
as well, to react to new namespaces. And that notifier has a comment
about avoiding calls to synchronize_rcu(). But since this version
doesn't actually need that, maybe I can just keep using direct calls and
synchronize_rcu() and avoid the callback? I'm a bit worried about adding
both synchronize_rcu() and cond_resched() as a possible side effect of
every call to bpf_prog_put(), though; so maybe it's better to move the
cleanup somewhere it's actually safe to call cond_resched(); what would
that be, a workqueue?

-Toke


* Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions
  2019-03-04 12:47     ` Toke Høiland-Jørgensen
@ 2019-03-04 17:08       ` Jakub Kicinski
  2019-03-04 17:37         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2019-03-04 17:08 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Mon, 04 Mar 2019 13:47:47 +0100, Toke Høiland-Jørgensen wrote:
> In an earlier version I had a namespace enter/exit notifier in devmap.c
> as well, to react to new namespaces. And that notifier has a comment
> about avoiding calls to synchronize_rcu(). But since this version
> doesn't actually need that, maybe I can just keep using direct calls and
> synchronize_rcu() and avoid the callback? I'm a bit worried about adding
> both synchronize_rcu() and cond_resched() as a possible side effect of
> every call to bpf_prog_put(), though; so maybe it's better to move the
> cleanup somewhere it's actually safe to call cond_resched(); what would
> that be, a workqueue?

Workqueue would be my go to.
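
Roughly (untested; assumes struct bpf_dtab grows a work_struct, e.g.
free_work, in place of the rcu_head):

static void __dev_map_free_work(struct work_struct *work)
{
	struct bpf_dtab *dtab = container_of(work, struct bpf_dtab,
					     free_work);

	/* safe to sleep here, so synchronize_rcu(), the flush-bitmap
	 * wait loop and cond_resched() can stay as they are
	 */
	synchronize_rcu();

	/* ... existing teardown: wait for flush bits, put netdevs,
	 * free_percpu(), bpf_map_area_free() ...
	 */

	kfree(dtab);
}

static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	bpf_clear_redirect_map(map);

	INIT_WORK(&dtab->free_work, __dev_map_free_work);
	schedule_work(&dtab->free_work);
}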


* Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions
  2019-03-04 17:08       ` Jakub Kicinski
@ 2019-03-04 17:37         ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-04 17:37 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Mon, 04 Mar 2019 13:47:47 +0100, Toke Høiland-Jørgensen wrote:
>> In an earlier version I had a namespace enter/exit notifier in devmap.c
>> as well, to react to new namespaces. And that notifier has a comment
>> about avoiding calls to synchronize_rcu(). But since this version
>> doesn't actually need that, maybe I can just keep using direct calls and
>> synchronize_rcu() and avoid the callback? I'm a bit worried about adding
>> both synchronize_rcu() and cond_resched() as a possible side effect of
>> every call to bpf_prog_put(), though; so maybe it's better to move the
>> cleanup somewhere it's actually safe to call cond_resched(); what would
>> that be, a workqueue?
>
> Workqueue would be my go to.

Gotcha! Thanks :)

-Toke


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-04 11:58     ` Toke Høiland-Jørgensen
@ 2019-03-04 17:44       ` Jakub Kicinski
  2019-03-04 19:05         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2019-03-04 17:44 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Mon, 04 Mar 2019 12:58:51 +0100, Toke Høiland-Jørgensen wrote:
> >> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
> >> index e5734261ba0a..4935dfe1cf43 100644
> >> --- a/include/net/netns/xdp.h
> >> +++ b/include/net/netns/xdp.h
> >> @@ -4,10 +4,21 @@
> >>  
> >>  #include <linux/rculist.h>
> >>  #include <linux/mutex.h>
> >> +#include <linux/atomic.h>
> >> +
> >> +struct bpf_dtab;
> >> +
> >> +struct bpf_dtab_container {
> >> +	struct bpf_dtab __rcu *dtab;
> >> +	atomic_t refcnt;  
> >
> > refcount_t ?  I'm not sure what the rules are for non-performance
> > critical stuff..
> 
> I based the use of atomic_t on what bpf/syscall.c is doing. The problem
> with using refcount_t is that it's a bit of a weird refcount because
> it's not strictly tied to the object lifetime (as the objects are only
> allocated if *both* the globals and the per-namespace refcnt are >0).
> 
> >> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
> >> +{
> >> +	struct bpf_dtab *dtab = NULL;
> >> +
> >> +	lockdep_assert_held(&dev_map_lock);
> >> +
> >> +	dtab = rcu_dereference(cont->dtab);
> >> +	if (dtab) {  
> >
> > How can it be NULL if it's refcounted? hm..  
> 
> See above; it's not actually the object that is refcounted, it's a count
> of the number of active XDP programs in the namespace, which still needs
> to be kept so we can allocate the devmaps when a REDIRECT program is
> loaded.

I think I got it now.  I wonder if anything could be done to improve
the readability of the dual-refcnting :S  Maybe use the "needed"/"used"
terms throughout?

> >> +		list_del_rcu(&dtab->list);
> >> +		rcu_assign_pointer(cont->dtab, NULL);
> >> +		bpf_clear_redirect_map(&dtab->map);
> >> +		call_rcu(&dtab->rcu, __dev_map_free);
> >> +	}
> >> +}
> >> +
> >> +void dev_map_put_default_map(struct net *net)
> >> +{
> >> +	if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
> >> +		spin_lock(&dev_map_lock);
> >> +		__dev_map_release_default_map(&net->xdp.default_map);
> >> +		spin_unlock(&dev_map_lock);
> >> +	}
> >> +}
> >> +
> >> +static int __init_default_map(struct bpf_dtab_container *cont)
> >> +{
> >> +	struct net *net = bpf_default_map_to_net(cont);
> >> +	struct bpf_dtab *dtab, *old_dtab;
> >> +	int size = DEV_MAP_DEFAULT_SIZE;
> >> +	struct net_device *netdev;
> >> +	union bpf_attr attr = {};
> >> +	u32 idx;
> >> +	int err;
> >> +
> >> +	lockdep_assert_held(&dev_map_lock);
> >> +
> >> +	if (!atomic_read(&global_redirect_use))
> >> +		return 0;
> >> +
> >> +	for_each_netdev(net, netdev)
> >> +		if (netdev->ifindex >= size)
> >> +			size <<= 1;
> >> +
> >> +	old_dtab = rcu_dereference(cont->dtab);
> >> +	if (old_dtab && old_dtab->map.max_entries == size)
> >> +		return 0;
> >> +
> >> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);  
> >
> > You are calling this with a spin lock held :(  
> 
> Yeah, not just this, there are more allocations in dev_map_init_map().
> Hmm, I guess I need to think about the concurrency bits some more; it is
> made a bit tricky by the refcounts being outside the objects.
> 
> Basically what I'm trying to protect against is the case where the
> global refcount goes to zero and then immediately back to one (or vice
> versa), which results in a deallocation immediately followed by an
> allocation. I was worried about the allocation and deallocation racing
> with each other (because there's a window after the refcnt is checked
> before the (de)allocation begins). It's not quite clear to me whether
> RCU is enough to protect against this... Thoughts?

Hm.  I think you'll still need a lock (mutex?) on the alloc path, but
the free path should be fine as long as you load the map pointer before
looking at the refcnt (atomic op ensuring the barrier there).

> > Could you please add tests for the replacement combinations?  
> 
> Sure. But, erm, where? selftests? :)

Yes :)  selftests/bpf/  Look around at the scripts, AFAIK we don't have
much in the way of standard infra for system interaction tests like
this :S


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-04 17:44       ` Jakub Kicinski
@ 2019-03-04 19:05         ` Toke Høiland-Jørgensen
  2019-03-04 22:15           ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-04 19:05 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Mon, 04 Mar 2019 12:58:51 +0100, Toke Høiland-Jørgensen wrote:
>> >> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
>> >> index e5734261ba0a..4935dfe1cf43 100644
>> >> --- a/include/net/netns/xdp.h
>> >> +++ b/include/net/netns/xdp.h
>> >> @@ -4,10 +4,21 @@
>> >>  
>> >>  #include <linux/rculist.h>
>> >>  #include <linux/mutex.h>
>> >> +#include <linux/atomic.h>
>> >> +
>> >> +struct bpf_dtab;
>> >> +
>> >> +struct bpf_dtab_container {
>> >> +	struct bpf_dtab __rcu *dtab;
>> >> +	atomic_t refcnt;  
>> >
>> > refcount_t ?  I'm not sure what the rules are for non-performance
>> > critical stuff..
>> 
>> I based the use of atomic_t on what bpf/syscall.c is doing. The problem
>> with using refcount_t is that it's a bit of a weird refcount because
>> it's not strictly tied to the object lifetime (as the objects are only
>> allocated if *both* the globals and the per-namespace refcnt are >0).
>> 
>> >> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
>> >> +{
>> >> +	struct bpf_dtab *dtab = NULL;
>> >> +
>> >> +	lockdep_assert_held(&dev_map_lock);
>> >> +
>> >> +	dtab = rcu_dereference(cont->dtab);
>> >> +	if (dtab) {  
>> >
>> > How can it be NULL if it's refcounted? hm..  
>> 
>> See above; it's not actually the object that is refcounted, it's a count
>> of the number of active XDP programs in the namespace, which still needs
>> to be kept so we can allocate the devmaps when a REDIRECT program is
>> loaded.
>
> I think I got it now. I wonder if anything could be done to improve
> the readability of the dual-refcnting :S Maybe use the "needed"/"used"
> terms throughout?

Yeah, good idea, I'll try something along those lines.

>> >> +		list_del_rcu(&dtab->list);
>> >> +		rcu_assign_pointer(cont->dtab, NULL);
>> >> +		bpf_clear_redirect_map(&dtab->map);
>> >> +		call_rcu(&dtab->rcu, __dev_map_free);
>> >> +	}
>> >> +}
>> >> +
>> >> +void dev_map_put_default_map(struct net *net)
>> >> +{
>> >> +	if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
>> >> +		spin_lock(&dev_map_lock);
>> >> +		__dev_map_release_default_map(&net->xdp.default_map);
>> >> +		spin_unlock(&dev_map_lock);
>> >> +	}
>> >> +}
>> >> +
>> >> +static int __init_default_map(struct bpf_dtab_container *cont)
>> >> +{
>> >> +	struct net *net = bpf_default_map_to_net(cont);
>> >> +	struct bpf_dtab *dtab, *old_dtab;
>> >> +	int size = DEV_MAP_DEFAULT_SIZE;
>> >> +	struct net_device *netdev;
>> >> +	union bpf_attr attr = {};
>> >> +	u32 idx;
>> >> +	int err;
>> >> +
>> >> +	lockdep_assert_held(&dev_map_lock);
>> >> +
>> >> +	if (!atomic_read(&global_redirect_use))
>> >> +		return 0;
>> >> +
>> >> +	for_each_netdev(net, netdev)
>> >> +		if (netdev->ifindex >= size)
>> >> +			size <<= 1;
>> >> +
>> >> +	old_dtab = rcu_dereference(cont->dtab);
>> >> +	if (old_dtab && old_dtab->map.max_entries == size)
>> >> +		return 0;
>> >> +
>> >> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);  
>> >
>> > You are calling this with a spin lock held :(  
>> 
>> Yeah, not just this, there are more allocations in dev_map_init_map().
>> Hmm, I guess I need to think about the concurrency bits some more; it is
>> made a bit tricky by the refcounts being outside the objects.
>> 
>> Basically what I'm trying to protect against is the case where the
>> global refcount goes to zero and then immediately back to one (or vice
>> versa), which results in a deallocation immediately followed by an
>> allocation. I was worried about the allocation and deallocation racing
>> with each other (because there's a window after the refcnt is checked
>> before the (de)allocation begins). It's not quite clear to me whether
>> RCU is enough to protect against this... Thoughts?
>
> Hm.  I think you'll still need a lock (mutex?) on the alloc path, but
> the free path should be fine as long as you load the map pointer before
> looking at the refcnt (atomic op ensuring the barrier there).

Yeah, for the per-namespace refcnt it's pretty straight forward, the
trouble is the global count that needs to iterate over all namespaces;
probably need to put that all behind a (non-spin)lock, right?

>> > Could you please add tests for the replacement combinations?  
>> 
>> Sure. But, erm, where? selftests? :)
>
> Yes :)  selftests/bpf/  Look around at the scripts, AFAIK we don't have
> much in the way of standard infra for system interaction tests like
> this :S

Right. I'll shamelessly copy from the tests already in there and see if
I can't get something decent working :)

-Toke


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-04 19:05         ` Toke Høiland-Jørgensen
@ 2019-03-04 22:15           ` Jakub Kicinski
  2019-03-04 22:28             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2019-03-04 22:15 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Mon, 04 Mar 2019 20:05:30 +0100, Toke Høiland-Jørgensen wrote:
> > Hm.  I think you'll still need a lock (mutex?) on the alloc path, but
> > the free path should be fine as long as you load the map pointer before
> > looking at the refcnt (atomic op ensuring the barrier there).  
> 
> Yeah, for the per-namespace refcnt it's pretty straight forward, the
> trouble is the global count that needs to iterate over all namespaces;
> probably need to put that all behind a (non-spin)lock, right?

Because net iteration is under RCU?  You can switch to taking
net_rwsem for that one, no?  I'm probably confused again ;)


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-04 22:15           ` Jakub Kicinski
@ 2019-03-04 22:28             ` Toke Høiland-Jørgensen
  2019-03-04 22:49               ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-04 22:28 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Mon, 04 Mar 2019 20:05:30 +0100, Toke Høiland-Jørgensen wrote:
>> > Hm.  I think you'll still need a lock (mutex?) on the alloc path, but
>> > the free path should be fine as long as you load the map pointer before
>> > looking at the refcnt (atomic op ensuring the barrier there).  
>> 
>> Yeah, for the per-namespace refcnt it's pretty straight forward, the
>> trouble is the global count that needs to iterate over all namespaces;
>> probably need to put that all behind a (non-spin)lock, right?
>
> Because net iteration is under RCU? You can switch to taking net_rwsem
> for that one, no? I'm probably confused again ;)

Because there's a single refcount that needs to trigger
creation/deletion of *all* the default maps. I.e.

if (atomic_dec_and_test(&global_refcnt))
  for_each_namespace(net)
    destroy_default_map(net);

which needs to not step on the toes of a subsequent

if (atomic_inc_return(&global_refcnt) == 1)
  for_each_namespace(net)
    create_default_map(net);

(or vice versa, of course).

Not sure there's a way to do that without wrapping both of those
constructs (including the refcnt inc/dec) in a mutex?
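
Which would look roughly like this (sketch only; create_default_map() and
destroy_default_map() stand in for the real helpers, and error unwinding on
partial failure is omitted):

static DEFINE_MUTEX(global_redirect_lock);
static int global_redirect_use; /* protected by global_redirect_lock */

int dev_map_inc_redirect_count(void)
{
	struct net *net;
	int err = 0;

	mutex_lock(&global_redirect_lock);
	if (global_redirect_use++ == 0) {
		down_read(&net_rwsem);
		for_each_net(net) {
			err = create_default_map(net);
			if (err)
				break;
		}
		up_read(&net_rwsem);
	}
	mutex_unlock(&global_redirect_lock);

	return err;
}

void dev_map_dec_redirect_count(void)
{
	struct net *net;

	mutex_lock(&global_redirect_lock);
	if (--global_redirect_use == 0) {
		down_read(&net_rwsem);
		for_each_net(net)
			destroy_default_map(net);
		up_read(&net_rwsem);
	}
	mutex_unlock(&global_redirect_lock);
}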

-Toke


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-04 22:28             ` Toke Høiland-Jørgensen
@ 2019-03-04 22:49               ` Jakub Kicinski
  2019-03-05  9:53                 ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2019-03-04 22:49 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

On Mon, 04 Mar 2019 23:28:14 +0100, Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
> > On Mon, 04 Mar 2019 20:05:30 +0100, Toke Høiland-Jørgensen wrote:  
> >> > Hm.  I think you'll still need a lock (mutex?) on the alloc path, but
> >> > the free path should be fine as long as you load the map pointer before
> >> > looking at the refcnt (atomic op ensuring the barrier there).    
> >> 
> >> Yeah, for the per-namespace refcnt it's pretty straight forward, the
> >> trouble is the global count that needs to iterate over all namespaces;
> >> probably need to put that all behind a (non-spin)lock, right?  
> >
> > Because net iteration is under RCU? You can switch to taking net_rwsem
> > for that one, no? I'm probably confused again ;)  
> 
> Because there's a single refcount that needs to trigger
> creation/deletion of *all* the default maps. I.e.
> 
> if (atomic_dec_return(&global_refcnt) == 0)
>   for_each_namespace(net)
>     destroy_default_map(net);
> 
> which needs to not step on the toes of a subsequent
> 
> if (atomic_inc_return(&global_refcnt) == 1)
>   for_each_namespace(net)
>     create_default_map(net);
> 
> (or vice versa, of course).
> 
> Not sure there's a way to do that without wrapping both of those
> constructs (including the refcnt inc/dec) in a mutex?

Seems like it :(


* Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device
  2019-03-04 22:49               ` Jakub Kicinski
@ 2019-03-05  9:53                 ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 16+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-05  9:53 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Mon, 04 Mar 2019 23:28:14 +0100, Toke Høiland-Jørgensen wrote:
>> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
>> > On Mon, 04 Mar 2019 20:05:30 +0100, Toke Høiland-Jørgensen wrote:  
>> >> > Hm.  I think you'll still need a lock (mutex?) on the alloc path, but
>> >> > the free path should be fine as long as you load the map pointer before
>> >> > looking at the refcnt (atomic op ensuring the barrier there).    
>> >> 
>> >> Yeah, for the per-namespace refcnt it's pretty straightforward, the
>> >> trouble is the global count that needs to iterate over all namespaces;
>> >> probably need to put that all behind a (non-spin)lock, right?  
>> >
>> > Because net iteration is under RCU? You can switch to taking net_rwsem
>> > for that one, no? I'm probably confused again ;)  
>> 
>> Because there's a single refcount that needs to trigger
>> creation/deletion of *all* the default maps. I.e.
>> 
>> if (atomic_dec_return(&global_refcnt) == 0)
>>   for_each_namespace(net)
>>     destroy_default_map(net);
>> 
>> which needs to not step on the toes of a subsequent
>> 
>> if (atomic_inc_return(&global_refcnt) == 1)
>>   for_each_namespace(net)
>>     create_default_map(net);
>> 
>> (or vice versa, of course).
>> 
>> Not sure there's a way to do that without wrapping both of those
>> constructs (including the refcnt inc/dec) in a mutex?
>
> Seems like it :(

Right. Well, I'm glad that we at least agree, so I can stop racking my
brain for other ways to do it ;)

-Toke


end of thread (newest message: 2019-03-05  9:53 UTC)

Thread overview: 16+ messages
2019-03-01 14:12 [PATCH net-next v3 0/3] xdp: Use a default map for xdp_redirect helper Toke Høiland-Jørgensen
2019-03-01 14:12 ` [PATCH net-next v3 3/3] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
2019-03-01 14:12 ` [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions Toke Høiland-Jørgensen
2019-03-02  1:08   ` Jakub Kicinski
2019-03-04 12:47     ` Toke Høiland-Jørgensen
2019-03-04 17:08       ` Jakub Kicinski
2019-03-04 17:37         ` Toke Høiland-Jørgensen
2019-03-01 14:12 ` [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device Toke Høiland-Jørgensen
2019-03-02  2:09   ` Jakub Kicinski
2019-03-04 11:58     ` Toke Høiland-Jørgensen
2019-03-04 17:44       ` Jakub Kicinski
2019-03-04 19:05         ` Toke Høiland-Jørgensen
2019-03-04 22:15           ` Jakub Kicinski
2019-03-04 22:28             ` Toke Høiland-Jørgensen
2019-03-04 22:49               ` Jakub Kicinski
2019-03-05  9:53                 ` Toke Høiland-Jørgensen
