* [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-11 15:44 [PATCH bpf-next v3 0/3] xdp: Allow lookup into devmaps before redirect Toke Høiland-Jørgensen
@ 2019-06-11 15:44 ` Toke Høiland-Jørgensen
  2019-06-11 18:00   ` Jesper Dangaard Brouer
  2019-06-11 21:48   ` Jakub Kicinski
  2019-06-11 15:44 ` [PATCH bpf-next v3 1/3] devmap/cpumap: Use flush list instead of bitmap Toke Høiland-Jørgensen
  2019-06-11 15:44 ` [PATCH bpf-next v3 3/3] devmap: Allow map lookups from eBPF Toke Høiland-Jørgensen
  2 siblings, 2 replies; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-11 15:44 UTC (permalink / raw)
  To: netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	David Miller, Jonathan Lemon

From: Toke Høiland-Jørgensen <toke@redhat.com>

The bpf_redirect_map() helper used by XDP programs doesn't return any
indication of whether it can successfully redirect to the map index it was
given. Instead, BPF programs have to track this themselves, leading to
programs using duplicate maps to track which entries are populated in the
devmap.

This patch fixes this by moving the map lookup into the bpf_redirect_map()
helper, which makes it possible to return failure to the eBPF program. The
lower bits of the flags argument are used as the return code, which means
that existing users who pass a '0' flag argument will get XDP_ABORTED.

With this, a BPF program can check the return code from the helper call and
react by, for instance, substituting a different redirect. This works for
any type of map used for redirect.
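
For illustration, usage from an XDP program would then look roughly like the
sketch below (samples/bpf style; the 'tx_port' map definition and the key are
made up for the example, not part of this patch):

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") tx_port = {
        .type = BPF_MAP_TYPE_DEVMAP,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 64,
};

SEC("xdp")
int xdp_redirect_example(struct xdp_md *ctx)
{
        int key = 0; /* hypothetical devmap index */
        int act;

        /* Request XDP_PASS as the return value if the map lookup fails */
        act = bpf_redirect_map(&tx_port, key, XDP_PASS);
        if (act != XDP_REDIRECT) {
                /* Lookup failed; the helper returned the XDP_PASS we asked
                 * for. A real program could substitute another redirect or
                 * count the miss here.
                 */
                return XDP_PASS;
        }
        return act;
}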

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/filter.h   |    1 +
 include/uapi/linux/bpf.h |    8 ++++++++
 net/core/filter.c        |   26 ++++++++++++--------------
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 43b45d6db36d..f31ae8b9035a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -580,6 +580,7 @@ struct bpf_skb_data_end {
 struct bpf_redirect_info {
 	u32 ifindex;
 	u32 flags;
+	void *item;
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	u32 kern_flags;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7c6aef253173..9931cf02de19 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3098,6 +3098,14 @@ enum xdp_action {
 	XDP_REDIRECT,
 };
 
+/* Flags for bpf_xdp_redirect_map helper */
+
+/* The lower flag bits will be the return code of bpf_xdp_redirect_map() helper
+ * if the map lookup fails.
+ */
+#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
+#define XDP_REDIRECT_ALL_FLAGS XDP_REDIRECT_INVALID_MASK
+
 /* user accessible metadata for XDP packet hook
  * new fields must be added to the end of this structure
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index 7a996887c500..dd43be497480 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3608,17 +3608,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 			       struct bpf_redirect_info *ri)
 {
 	u32 index = ri->ifindex;
-	void *fwd = NULL;
+	void *fwd = ri->item;
 	int err;
 
 	ri->ifindex = 0;
+	ri->item = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
-	fwd = __xdp_map_lookup_elem(map, index);
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
 	if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
 		xdp_do_flush_map();
 
@@ -3655,18 +3651,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	u32 index = ri->ifindex;
-	void *fwd = NULL;
+	void *fwd = ri->item;
 	int err = 0;
 
 	ri->ifindex = 0;
+	ri->item = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
-	fwd = __xdp_map_lookup_elem(map, index);
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
-
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		struct bpf_dtab_netdev *dst = fwd;
 
@@ -3735,6 +3726,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 
 	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->item = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
 	return XDP_REDIRECT;
@@ -3753,9 +3745,15 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
-	if (unlikely(flags))
+	if (unlikely(flags & ~XDP_REDIRECT_ALL_FLAGS))
 		return XDP_ABORTED;
 
+	ri->item = __xdp_map_lookup_elem(map, ifindex);
+	if (unlikely(!ri->item)) {
+		WRITE_ONCE(ri->map, NULL);
+		return (flags & XDP_REDIRECT_INVALID_MASK);
+	}
+
 	ri->ifindex = ifindex;
 	ri->flags = flags;
 	WRITE_ONCE(ri->map, map);


* [PATCH bpf-next v3 3/3] devmap: Allow map lookups from eBPF
  2019-06-11 15:44 [PATCH bpf-next v3 0/3] xdp: Allow lookup into devmaps before redirect Toke Høiland-Jørgensen
  2019-06-11 15:44 ` [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper Toke Høiland-Jørgensen
  2019-06-11 15:44 ` [PATCH bpf-next v3 1/3] devmap/cpumap: Use flush list instead of bitmap Toke Høiland-Jørgensen
@ 2019-06-11 15:44 ` Toke Høiland-Jørgensen
  2 siblings, 0 replies; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-11 15:44 UTC (permalink / raw)
  To: netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	David Miller, Jonathan Lemon

From: Toke Høiland-Jørgensen <toke@redhat.com>

We don't currently allow lookups into a devmap from eBPF, because the map
lookup returns a pointer directly to the dev->ifindex, which shouldn't be
modifiable from eBPF.

However, being able to do lookups in devmaps is useful, e.g. to know
whether forwarding to a specific interface is enabled. Currently, programs
work around this by keeping a shadow map of another type which indicates
whether a map index is valid.

Since we now have a flag to make maps read-only from the eBPF side, we can
simply lift the lookup restriction if we make sure this flag is always set.
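
With this in place, a program can check for the presence of an entry before
committing to a redirect, along the lines of the sketch below (map name and
key are made up; the value returned by the lookup is the read-only ifindex):

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") tx_port = {
        .type = BPF_MAP_TYPE_DEVMAP,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 64,
};

SEC("xdp")
int xdp_check_fwd(struct xdp_md *ctx)
{
        int key = 0; /* hypothetical devmap index */
        int *ifindex;

        /* Read-only pointer to the destination ifindex, or NULL */
        ifindex = bpf_map_lookup_elem(&tx_port, &key);
        if (!ifindex)
                return XDP_PASS; /* no forwarding configured for this key */

        return bpf_redirect_map(&tx_port, key, 0);
}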

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 kernel/bpf/devmap.c   |    5 +++++
 kernel/bpf/verifier.c |    7 ++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index c945518225f3..83c77f43030d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -99,6 +99,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
+	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
+	 * verifier prevents writes from the BPF side
+	 */
+	attr->map_flags |= BPF_F_RDONLY_PROG;
+
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
 	if (!dtab)
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c2cb5bd84ce..7128a9821481 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2893,12 +2893,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_get_local_storage)
 			goto error;
 		break;
-	/* devmap returns a pointer to a live net_device ifindex that we cannot
-	 * allow to be modified from bpf side. So do not allow lookup elements
-	 * for now.
-	 */
 	case BPF_MAP_TYPE_DEVMAP:
-		if (func_id != BPF_FUNC_redirect_map)
+		if (func_id != BPF_FUNC_redirect_map &&
+		    func_id != BPF_FUNC_map_lookup_elem)
 			goto error;
 		break;
 	/* Restrict bpf side of cpumap and xskmap, open when use-cases


* [PATCH bpf-next v3 0/3] xdp: Allow lookup into devmaps before redirect
@ 2019-06-11 15:44 Toke Høiland-Jørgensen
  2019-06-11 15:44 ` [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper Toke Høiland-Jørgensen
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-11 15:44 UTC (permalink / raw)
  To: netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	David Miller, Jonathan Lemon

When using the bpf_redirect_map() helper to redirect packets from XDP, the eBPF
program cannot currently know whether the redirect will succeed, which makes it
impossible to gracefully handle errors. To properly fix this will probably
require deeper changes to the way TX resources are allocated, but one thing that
is fairly straightforward to fix is to allow lookups into devmaps, so programs
can at least know when a redirect is *guaranteed* to fail because there is no
entry in the map. Currently, programs work around this by keeping a shadow map
of another type which indicates whether a map index is valid.
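
For reference, that shadow-map workaround looks roughly like the sketch below
(names are made up); with this series the extra map becomes unnecessary:

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Real devmap used for the redirect */
struct bpf_map_def SEC("maps") tx_port = {
        .type = BPF_MAP_TYPE_DEVMAP,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 64,
};

/* Shadow map kept in sync from userspace to mark populated entries */
struct bpf_map_def SEC("maps") tx_port_present = {
        .type = BPF_MAP_TYPE_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 64,
};

SEC("xdp")
int xdp_shadow_check(struct xdp_md *ctx)
{
        int key = 0; /* hypothetical index */
        int *present = bpf_map_lookup_elem(&tx_port_present, &key);

        if (!present || !*present)
                return XDP_PASS; /* redirect would fail; handle it here */

        return bpf_redirect_map(&tx_port, key, 0);
}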

This series contains two changes that are complementary ways to fix this issue:

- Moving the map lookup into the bpf_redirect_map() helper (and caching the
  result), so the helper can return an error if no value is found in the map.
  This includes a refactoring of the devmap and cpumap code to not care about
  the index on enqueue.

- Allowing regular lookups into devmaps from eBPF programs, using the read-only
  flag to make sure they don't change the values.

The performance impact of the series is negligible, in the sense that I cannot
measure it because the variance between test runs is higher than the difference
pre/post series.

Changelog:

v3:
  - Adopt Jonathan's idea of using the lower two bits of the flag value as the
    return code.
  - Always do the lookup, and cache the result for use in xdp_do_redirect(); to
    achieve this, refactor the devmap and cpumap code to get rid of the bitmap for
    selecting which devices to flush.
v2:
  - For patch 1, make it clear that the change works for any map type.
  - For patch 2, just use the new BPF_F_RDONLY_PROG flag to make the return
    value read-only.

---

Toke Høiland-Jørgensen (3):
      devmap/cpumap: Use flush list instead of bitmap
      bpf_xdp_redirect_map: Perform map lookup in eBPF helper
      devmap: Allow map lookups from eBPF


 include/linux/filter.h   |    1 
 include/uapi/linux/bpf.h |    8 ++++
 kernel/bpf/cpumap.c      |   97 +++++++++++++++++++++-------------------------
 kernel/bpf/devmap.c      |   98 ++++++++++++++++++++++------------------------
 kernel/bpf/verifier.c    |    7 +--
 net/core/filter.c        |   28 ++++++-------
 6 files changed, 115 insertions(+), 124 deletions(-)


* [PATCH bpf-next v3 1/3] devmap/cpumap: Use flush list instead of bitmap
  2019-06-11 15:44 [PATCH bpf-next v3 0/3] xdp: Allow lookup into devmaps before redirect Toke Høiland-Jørgensen
  2019-06-11 15:44 ` [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper Toke Høiland-Jørgensen
@ 2019-06-11 15:44 ` Toke Høiland-Jørgensen
  2019-06-13  5:51   ` Andrii Nakryiko
  2019-06-11 15:44 ` [PATCH bpf-next v3 3/3] devmap: Allow map lookups from eBPF Toke Høiland-Jørgensen
  2 siblings, 1 reply; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-11 15:44 UTC (permalink / raw)
  To: netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	David Miller, Jonathan Lemon

From: Toke Høiland-Jørgensen <toke@redhat.com>

The socket map uses a linked list instead of a bitmap to keep track of
which entries to flush. Do the same for devmap and cpumap, as this means we
don't have to care about the map index when enqueueing things into the
map (and so we can cache the map lookup).
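
The core of the pattern is roughly the following (a self-contained sketch with
made-up names, not the actual structures from the diff below):

#include <linux/list.h>

struct sketch_bulk_queue {
        void *q[8];
        unsigned int count;
        struct list_head flush_node;
};

static void sketch_enqueue(struct sketch_bulk_queue *bq,
                           struct list_head *flush_list, void *frame)
{
        /* Bounds/flush-on-full handling omitted for brevity */
        bq->q[bq->count++] = frame;

        /* flush_node.prev is NULL while the queue is not on any list, so
         * each active queue is added to the per-CPU flush list only once.
         */
        if (!bq->flush_node.prev)
                list_add(&bq->flush_node, flush_list);
}

static void sketch_flush(struct list_head *flush_list)
{
        struct sketch_bulk_queue *bq, *tmp;

        /* Walk only the queues that saw traffic; no map index needed */
        list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
                /* ... drain bq->q[0..count) to the real destination ... */
                bq->count = 0;
                __list_del(bq->flush_node.prev, bq->flush_node.next);
                bq->flush_node.prev = NULL;
        }
}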

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 kernel/bpf/cpumap.c |   97 ++++++++++++++++++++++++---------------------------
 kernel/bpf/devmap.c |   93 ++++++++++++++++++++++---------------------------
 net/core/filter.c   |    2 -
 3 files changed, 87 insertions(+), 105 deletions(-)

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b31a71909307..02dc3503ef4d 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -38,8 +38,13 @@
  */
 
 #define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
 struct xdp_bulk_queue {
 	void *q[CPU_MAP_BULK_SIZE];
+	struct list_head flush_node;
+	struct bpf_cpu_map_entry *obj;
 	unsigned int count;
 };
 
@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
 	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
 	struct xdp_bulk_queue __percpu *bulkq;
 
+	struct bpf_cpu_map *cmap;
+
 	/* Queue with potential multi-producers, and single-consumer kthread */
 	struct ptr_ring *queue;
 	struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
 	struct bpf_map map;
 	/* Below members specific for map type */
 	struct bpf_cpu_map_entry **cpu_map;
-	unsigned long __percpu *flush_needed;
+	struct list_head __percpu *flush_list;
 };
 
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
-			     struct xdp_bulk_queue *bq, bool in_napi_ctx);
-
-static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
-{
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
-}
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
 
 static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_cpu_map *cmap;
 	int err = -ENOMEM;
+	int ret, cpu;
 	u64 cost;
-	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
-	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+	cost += sizeof(struct list_head) * num_possible_cpus();
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
 	ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -115,11 +116,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	}
 
 	/* A per cpu bitfield with a bit per possible CPU in map  */
-	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
-	if (!cmap->flush_needed)
+	cmap->flush_list = alloc_percpu(struct list_head);
+	if (!cmap->flush_list)
 		goto free_charge;
 
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
+
 	/* Alloc array for possible remote "destination" CPUs */
 	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
 					   sizeof(struct bpf_cpu_map_entry *),
@@ -129,7 +132,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 
 	return &cmap->map;
 free_percpu:
-	free_percpu(cmap->flush_needed);
+	free_percpu(cmap->flush_list);
 free_charge:
 	bpf_map_charge_finish(&cmap->map.memory);
 free_cmap:
@@ -331,7 +334,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
 {
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct bpf_cpu_map_entry *rcpu;
-	int numa, err;
+	struct xdp_bulk_queue *bq;
+	int numa, err, i;
 
 	/* Have map->numa_node, but choose node of redirect target CPU */
 	numa = cpu_to_node(cpu);
@@ -346,6 +350,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
 	if (!rcpu->bulkq)
 		goto free_rcu;
 
+	for_each_possible_cpu(i) {
+		bq = per_cpu_ptr(rcpu->bulkq, i);
+		bq->obj = rcpu;
+	}
+
 	/* Alloc queue */
 	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
 	if (!rcpu->queue)
@@ -402,7 +411,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
 		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
 
 		/* No concurrent bq_enqueue can run at this point */
-		bq_flush_to_queue(rcpu, bq, false);
+		bq_flush_to_queue(bq, false);
 	}
 	free_percpu(rcpu->bulkq);
 	/* Cannot kthread_stop() here, last put free rcpu resources */
@@ -485,6 +494,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
 		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
 		if (!rcpu)
 			return -ENOMEM;
+		rcpu->cmap = cmap;
 	}
 	rcu_read_lock();
 	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -516,9 +526,9 @@ static void cpu_map_free(struct bpf_map *map)
 	 * from the program we can assume no new bits will be set.
 	 */
 	for_each_online_cpu(cpu) {
-		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+		struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
 
-		while (!bitmap_empty(bitmap, cmap->map.max_entries))
+		while (!list_empty(flush_list))
 			cond_resched();
 	}
 
@@ -535,7 +545,7 @@ static void cpu_map_free(struct bpf_map *map)
 		/* bq flush and cleanup happens after RCU graze-period */
 		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
 	}
-	free_percpu(cmap->flush_needed);
+	free_percpu(cmap->flush_list);
 	bpf_map_area_free(cmap->cpu_map);
 	kfree(cmap);
 }
@@ -587,9 +597,9 @@ const struct bpf_map_ops cpu_map_ops = {
 	.map_check_btf		= map_check_no_btf,
 };
 
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
-			     struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
 {
+	struct bpf_cpu_map_entry *rcpu = bq->obj;
 	unsigned int processed = 0, drops = 0;
 	const int to_cpu = rcpu->cpu;
 	struct ptr_ring *q;
@@ -618,6 +628,9 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	bq->count = 0;
 	spin_unlock(&q->producer_lock);
 
+	__list_del(bq->flush_node.prev, bq->flush_node.next);
+	bq->flush_node.prev = NULL;
+
 	/* Feedback loop via tracepoints */
 	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
 	return 0;
@@ -628,10 +641,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
  */
 static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
+	struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
 	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
-		bq_flush_to_queue(rcpu, bq, true);
+		bq_flush_to_queue(bq, true);
 
 	/* Notice, xdp_buff/page MUST be queued here, long enough for
 	 * driver to code invoking us to finished, due to driver
@@ -643,6 +657,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 	 * operation, when completing napi->poll call.
 	 */
 	bq->q[bq->count++] = xdpf;
+
+	if (!bq->flush_node.prev)
+		list_add(&bq->flush_node, flush_list);
+
 	return 0;
 }
 
@@ -662,41 +680,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 	return 0;
 }
 
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
-	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
-	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-
-	__set_bit(bit, bitmap);
-}
-
 void __cpu_map_flush(struct bpf_map *map)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
-	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-	u32 bit;
-
-	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
-	 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
-	 * bitmap indicate which percpu bulkq have packets.
-	 */
-	for_each_set_bit(bit, bitmap, map->max_entries) {
-		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
-		struct xdp_bulk_queue *bq;
-
-		/* This is possible if entry is removed by user space
-		 * between xdp redirect and flush op.
-		 */
-		if (unlikely(!rcpu))
-			continue;
-
-		__clear_bit(bit, bitmap);
+	struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+	struct xdp_bulk_queue *bq, *tmp;
 
-		/* Flush all frames in bulkq to real queue */
-		bq = this_cpu_ptr(rcpu->bulkq);
-		bq_flush_to_queue(rcpu, bq, true);
+	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		bq_flush_to_queue(bq, true);
 
 		/* If already running, costs spin_lock_irqsave + smb_mb */
-		wake_up_process(rcpu->kthread);
+		wake_up_process(bq->obj->kthread);
 	}
 }
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 5ae7cce5ef16..c945518225f3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -56,9 +56,13 @@
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
 #define DEV_MAP_BULK_SIZE 16
+struct bpf_dtab_netdev;
+
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct list_head flush_node;
 	struct net_device *dev_rx;
+	struct bpf_dtab_netdev *obj;
 	unsigned int count;
 };
 
@@ -73,23 +77,19 @@ struct bpf_dtab_netdev {
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
-	unsigned long __percpu *flush_needed;
+	struct list_head __percpu *flush_list;
 	struct list_head list;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
-static u64 dev_map_bitmap_size(const union bpf_attr *attr)
-{
-	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
-}
-
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_dtab *dtab;
 	int err = -EINVAL;
 	u64 cost;
+	int cpu;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -107,7 +107,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
-	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+	cost += sizeof(struct list_head) * num_possible_cpus();
 
 	/* if map size is larger than memlock limit, reject it */
 	err = bpf_map_charge_init(&dtab->map.memory, cost);
@@ -116,28 +116,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 
-	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
-						__alignof__(unsigned long),
-						GFP_KERNEL | __GFP_NOWARN);
-	if (!dtab->flush_needed)
+	dtab->flush_list = alloc_percpu(struct list_head);
+	if (!dtab->flush_list)
 		goto free_charge;
 
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
+
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
-		goto free_charge;
+		goto free_percpu;
 
 	spin_lock(&dev_map_lock);
 	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
 	return &dtab->map;
+
+free_percpu:
+	free_percpu(dtab->flush_list);
 free_charge:
 	bpf_map_charge_finish(&dtab->map.memory);
 free_dtab:
-	free_percpu(dtab->flush_needed);
 	kfree(dtab);
 	return ERR_PTR(err);
 }
@@ -171,9 +173,9 @@ static void dev_map_free(struct bpf_map *map)
 	 * from the program we can assume no new bits will be set.
 	 */
 	for_each_online_cpu(cpu) {
-		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+		struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
 
-		while (!bitmap_empty(bitmap, dtab->map.max_entries))
+		while (!list_empty(flush_list))
 			cond_resched();
 	}
 
@@ -188,7 +190,7 @@ static void dev_map_free(struct bpf_map *map)
 		kfree(dev);
 	}
 
-	free_percpu(dtab->flush_needed);
+	free_percpu(dtab->flush_list);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
 }
@@ -210,18 +212,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-
-	__set_bit(bit, bitmap);
-}
-
-static int bq_xmit_all(struct bpf_dtab_netdev *obj,
-		       struct xdp_bulk_queue *bq, u32 flags,
+static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
 		       bool in_napi_ctx)
 {
+
+	struct bpf_dtab_netdev *obj = bq->obj;
 	struct net_device *dev = obj->dev;
 	int sent = 0, drops = 0, err = 0;
 	int i;
@@ -248,6 +243,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
 			      sent, drops, bq->dev_rx, dev, err);
 	bq->dev_rx = NULL;
+	__list_del(bq->flush_node.prev, bq->flush_node.next);
+	bq->flush_node.prev = NULL;
 	return 0;
 error:
 	/* If ndo_xdp_xmit fails with an errno, no frames have been
@@ -276,24 +273,11 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 void __dev_map_flush(struct bpf_map *map)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-	u32 bit;
+	struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
+	struct xdp_bulk_queue *bq, *tmp;
 
-	for_each_set_bit(bit, bitmap, map->max_entries) {
-		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
-		struct xdp_bulk_queue *bq;
-
-		/* This is possible if the dev entry is removed by user space
-		 * between xdp redirect and flush op.
-		 */
-		if (unlikely(!dev))
-			continue;
-
-		__clear_bit(bit, bitmap);
-
-		bq = this_cpu_ptr(dev->bulkq);
-		bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
-	}
+	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+		bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
 }
 
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -319,10 +303,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
 		      struct net_device *dev_rx)
 
 {
+	struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-		bq_xmit_all(obj, bq, 0, true);
+		bq_xmit_all(bq, 0, true);
 
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
@@ -332,6 +317,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
 		bq->dev_rx = dev_rx;
 
 	bq->q[bq->count++] = xdpf;
+
+	if (!bq->flush_node.prev)
+		list_add(&bq->flush_node, flush_list);
+
 	return 0;
 }
 
@@ -382,16 +371,11 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
 		struct xdp_bulk_queue *bq;
-		unsigned long *bitmap;
-
 		int cpu;
 
 		for_each_online_cpu(cpu) {
-			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
-			__clear_bit(dev->bit, bitmap);
-
 			bq = per_cpu_ptr(dev->bulkq, cpu);
-			bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
+			bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
 		}
 	}
 }
@@ -439,6 +423,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
+	struct xdp_bulk_queue *bq;
+	int cpu;
 
 	if (unlikely(map_flags > BPF_EXIST))
 		return -EINVAL;
@@ -461,6 +447,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 			return -ENOMEM;
 		}
 
+		for_each_possible_cpu(cpu) {
+			bq = per_cpu_ptr(dev->bulkq, cpu);
+			bq->obj = dev;
+		}
+
 		dev->dev = dev_get_by_index(net, ifindex);
 		if (!dev->dev) {
 			free_percpu(dev->bulkq);
diff --git a/net/core/filter.c b/net/core/filter.c
index 55bfc941d17a..7a996887c500 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3526,7 +3526,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 		err = dev_map_enqueue(dst, xdp, dev_rx);
 		if (unlikely(err))
 			return err;
-		__dev_map_insert_ctx(map, index);
 		break;
 	}
 	case BPF_MAP_TYPE_CPUMAP: {
@@ -3535,7 +3534,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 		err = cpu_map_enqueue(rcpu, xdp, dev_rx);
 		if (unlikely(err))
 			return err;
-		__cpu_map_insert_ctx(map, index);
 		break;
 	}
 	case BPF_MAP_TYPE_XSKMAP: {


* Re: [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-11 15:44 ` [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper Toke Høiland-Jørgensen
@ 2019-06-11 18:00   ` Jesper Dangaard Brouer
  2019-06-11 18:17     ` Toke Høiland-Jørgensen
  2019-06-11 21:48   ` Jakub Kicinski
  1 sibling, 1 reply; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2019-06-11 18:00 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, David Miller,
	Jonathan Lemon, brouer

On Tue, 11 Jun 2019 17:44:00 +0200
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> From: Toke Høiland-Jørgensen <toke@redhat.com>
> 
> The bpf_redirect_map() helper used by XDP programs doesn't return any
> indication of whether it can successfully redirect to the map index it was
> given. Instead, BPF programs have to track this themselves, leading to
> programs using duplicate maps to track which entries are populated in the
> devmap.
> 
> This patch fixes this by moving the map lookup into the bpf_redirect_map()
> helper, which makes it possible to return failure to the eBPF program. The
> lower bits of the flags argument is used as the return code, which means
> that existing users who pass a '0' flag argument will get XDP_ABORTED.
> 
> With this, a BPF program can check the return code from the helper call and
> react by, for instance, substituting a different redirect. This works for
> any type of map used for redirect.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---
>  include/linux/filter.h   |    1 +
>  include/uapi/linux/bpf.h |    8 ++++++++
>  net/core/filter.c        |   26 ++++++++++++--------------
>  3 files changed, 21 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 43b45d6db36d..f31ae8b9035a 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -580,6 +580,7 @@ struct bpf_skb_data_end {
>  struct bpf_redirect_info {
>  	u32 ifindex;
>  	u32 flags;
> +	void *item;
>  	struct bpf_map *map;
>  	struct bpf_map *map_to_flush;
>  	u32 kern_flags;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 7c6aef253173..9931cf02de19 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3098,6 +3098,14 @@ enum xdp_action {
>  	XDP_REDIRECT,
>  };
>  
> +/* Flags for bpf_xdp_redirect_map helper */
> +
> +/* The lower flag bits will be the return code of bpf_xdp_redirect_map() helper
> + * if the map lookup fails.
> + */
> +#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
> +#define XDP_REDIRECT_ALL_FLAGS XDP_REDIRECT_INVALID_MASK
> +

Slightly confused about the naming of the define, see later.

>  /* user accessible metadata for XDP packet hook
>   * new fields must be added to the end of this structure
>   */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 7a996887c500..dd43be497480 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3608,17 +3608,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
>  			       struct bpf_redirect_info *ri)
>  {
>  	u32 index = ri->ifindex;
> -	void *fwd = NULL;
> +	void *fwd = ri->item;
>  	int err;
>  
>  	ri->ifindex = 0;
> +	ri->item = NULL;
>  	WRITE_ONCE(ri->map, NULL);
>  
> -	fwd = __xdp_map_lookup_elem(map, index);
> -	if (unlikely(!fwd)) {
> -		err = -EINVAL;
> -		goto err;
> -	}
>  	if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
>  		xdp_do_flush_map();
>  
> @@ -3655,18 +3651,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
>  {
>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>  	u32 index = ri->ifindex;
> -	void *fwd = NULL;
> +	void *fwd = ri->item;
>  	int err = 0;
>  
>  	ri->ifindex = 0;
> +	ri->item = NULL;
>  	WRITE_ONCE(ri->map, NULL);
>  
> -	fwd = __xdp_map_lookup_elem(map, index);
> -	if (unlikely(!fwd)) {
> -		err = -EINVAL;
> -		goto err;
> -	}
> -
>  	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
>  		struct bpf_dtab_netdev *dst = fwd;
>  
> @@ -3735,6 +3726,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
>  
>  	ri->ifindex = ifindex;
>  	ri->flags = flags;
> +	ri->item = NULL;
>  	WRITE_ONCE(ri->map, NULL);
>  
>  	return XDP_REDIRECT;
> @@ -3753,9 +3745,15 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
>  {
>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>  
> -	if (unlikely(flags))
> +	if (unlikely(flags & ~XDP_REDIRECT_ALL_FLAGS))
>  		return XDP_ABORTED;
>  
> +	ri->item = __xdp_map_lookup_elem(map, ifindex);
> +	if (unlikely(!ri->item)) {
> +		WRITE_ONCE(ri->map, NULL);
> +		return (flags & XDP_REDIRECT_INVALID_MASK);

Maybe I'm reading it wrong, but shouldn't the mask be called the "valid" mask?

> +	}
> +
>  	ri->ifindex = ifindex;
>  	ri->flags = flags;
>  	WRITE_ONCE(ri->map, map);
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

* Re: [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-11 18:00   ` Jesper Dangaard Brouer
@ 2019-06-11 18:17     ` Toke Høiland-Jørgensen
  2019-06-12 20:01       ` Maciej Fijalkowski
  0 siblings, 1 reply; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-11 18:17 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, David Miller,
	Jonathan Lemon, brouer

Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Tue, 11 Jun 2019 17:44:00 +0200
> Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
>> From: Toke Høiland-Jørgensen <toke@redhat.com>
>> 
>> The bpf_redirect_map() helper used by XDP programs doesn't return any
>> indication of whether it can successfully redirect to the map index it was
>> given. Instead, BPF programs have to track this themselves, leading to
>> programs using duplicate maps to track which entries are populated in the
>> devmap.
>> 
>> This patch fixes this by moving the map lookup into the bpf_redirect_map()
>> helper, which makes it possible to return failure to the eBPF program. The
>> lower bits of the flags argument is used as the return code, which means
>> that existing users who pass a '0' flag argument will get XDP_ABORTED.
>> 
>> With this, a BPF program can check the return code from the helper call and
>> react by, for instance, substituting a different redirect. This works for
>> any type of map used for redirect.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> ---
>>  include/linux/filter.h   |    1 +
>>  include/uapi/linux/bpf.h |    8 ++++++++
>>  net/core/filter.c        |   26 ++++++++++++--------------
>>  3 files changed, 21 insertions(+), 14 deletions(-)
>> 
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 43b45d6db36d..f31ae8b9035a 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -580,6 +580,7 @@ struct bpf_skb_data_end {
>>  struct bpf_redirect_info {
>>  	u32 ifindex;
>>  	u32 flags;
>> +	void *item;
>>  	struct bpf_map *map;
>>  	struct bpf_map *map_to_flush;
>>  	u32 kern_flags;
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 7c6aef253173..9931cf02de19 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3098,6 +3098,14 @@ enum xdp_action {
>>  	XDP_REDIRECT,
>>  };
>>  
>> +/* Flags for bpf_xdp_redirect_map helper */
>> +
>> +/* The lower flag bits will be the return code of bpf_xdp_redirect_map() helper
>> + * if the map lookup fails.
>> + */
>> +#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
>> +#define XDP_REDIRECT_ALL_FLAGS XDP_REDIRECT_INVALID_MASK
>> +
>
> Slightly confused about the naming of the define, see later.
>
>>  /* user accessible metadata for XDP packet hook
>>   * new fields must be added to the end of this structure
>>   */
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 7a996887c500..dd43be497480 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -3608,17 +3608,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
>>  			       struct bpf_redirect_info *ri)
>>  {
>>  	u32 index = ri->ifindex;
>> -	void *fwd = NULL;
>> +	void *fwd = ri->item;
>>  	int err;
>>  
>>  	ri->ifindex = 0;
>> +	ri->item = NULL;
>>  	WRITE_ONCE(ri->map, NULL);
>>  
>> -	fwd = __xdp_map_lookup_elem(map, index);
>> -	if (unlikely(!fwd)) {
>> -		err = -EINVAL;
>> -		goto err;
>> -	}
>>  	if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
>>  		xdp_do_flush_map();
>>  
>> @@ -3655,18 +3651,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
>>  {
>>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>>  	u32 index = ri->ifindex;
>> -	void *fwd = NULL;
>> +	void *fwd = ri->item;
>>  	int err = 0;
>>  
>>  	ri->ifindex = 0;
>> +	ri->item = NULL;
>>  	WRITE_ONCE(ri->map, NULL);
>>  
>> -	fwd = __xdp_map_lookup_elem(map, index);
>> -	if (unlikely(!fwd)) {
>> -		err = -EINVAL;
>> -		goto err;
>> -	}
>> -
>>  	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
>>  		struct bpf_dtab_netdev *dst = fwd;
>>  
>> @@ -3735,6 +3726,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
>>  
>>  	ri->ifindex = ifindex;
>>  	ri->flags = flags;
>> +	ri->item = NULL;
>>  	WRITE_ONCE(ri->map, NULL);
>>  
>>  	return XDP_REDIRECT;
>> @@ -3753,9 +3745,15 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
>>  {
>>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>>  
>> -	if (unlikely(flags))
>> +	if (unlikely(flags & ~XDP_REDIRECT_ALL_FLAGS))
>>  		return XDP_ABORTED;
>>  
>> +	ri->item = __xdp_map_lookup_elem(map, ifindex);
>> +	if (unlikely(!ri->item)) {
>> +		WRITE_ONCE(ri->map, NULL);
>> +		return (flags & XDP_REDIRECT_INVALID_MASK);
>
> Maybe I'm reading it wrong, but shouldn't the mask be called the "valid" mask?

It's the mask that is applied when the index looked up is invalid (i.e.,
the entry doesn't exist)? But yeah, I can see how the name can be
confusing; maybe it should just be "RETURN_MASK" or something like that?

-Toke

* Re: [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-11 15:44 ` [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper Toke Høiland-Jørgensen
  2019-06-11 18:00   ` Jesper Dangaard Brouer
@ 2019-06-11 21:48   ` Jakub Kicinski
  2019-06-12  9:49     ` Toke Høiland-Jørgensen
  1 sibling, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2019-06-11 21:48 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, David Miller, Jonathan Lemon

On Tue, 11 Jun 2019 17:44:00 +0200, Toke Høiland-Jørgensen wrote:
> +#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)

It feels a little strange to OR in values which are not bits, even if
it happens to work today (since those are values of 0, 1, 2, 3)...

* Re: [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-11 21:48   ` Jakub Kicinski
@ 2019-06-12  9:49     ` Toke Høiland-Jørgensen
  2019-06-12 19:45       ` Jakub Kicinski
  0 siblings, 1 reply; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-12  9:49 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, David Miller, Jonathan Lemon

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Tue, 11 Jun 2019 17:44:00 +0200, Toke Høiland-Jørgensen wrote:
>> +#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
>
> It feels a little strange to OR in values which are not bits, even if
> it happens to work today (since those are values of 0, 1, 2, 3)...

Yeah, I agree. But it also nicely expresses the extent in code.
Otherwise that would need to be in a comment, like

// we allow return codes of ABORTED/DROP/PASS/TX
#define XDP_REDIRECT_INVALID_MASK 3


Or do you have a better idea?

-Toke

* Re: [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-12  9:49     ` Toke Høiland-Jørgensen
@ 2019-06-12 19:45       ` Jakub Kicinski
  0 siblings, 0 replies; 13+ messages in thread
From: Jakub Kicinski @ 2019-06-12 19:45 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: netdev, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, David Miller, Jonathan Lemon

On Wed, 12 Jun 2019 11:49:17 +0200, Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
> 
> > On Tue, 11 Jun 2019 17:44:00 +0200, Toke Høiland-Jørgensen wrote:  
> >> +#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)  
> >
> > It feels a little strange to OR in values which are not bits, even if
> > it happens to work today (since those are values of 0, 1, 2, 3)...  
> 
> Yeah, I agree. But it also nicely expresses the extent in code.
> Otherwise that would need to be in a comment, like
> 
> // we allow return codes of ABORTED/DROP/PASS/TX
> #define XDP_REDIRECT_INVALID_MASK 3
> 
> 
> Or do you have a better idea?

flags > XDP_TX

In the future when we add more fields in flags:

if (flags & ~XDP_REDIRECT_FLAGS_MASK)
	return -EBLA;
if ((flags & XDP_REDIRECT_RETCODE_MASK) > XDP_TX))
	return -EFOO;

?

* Re: [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-11 18:17     ` Toke Høiland-Jørgensen
@ 2019-06-12 20:01       ` Maciej Fijalkowski
  2019-06-12 21:33         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 13+ messages in thread
From: Maciej Fijalkowski @ 2019-06-12 20:01 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Jesper Dangaard Brouer, netdev, Daniel Borkmann,
	Alexei Starovoitov, David Miller, Jonathan Lemon

On Tue, 11 Jun 2019 20:17:02 +0200
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> 
> > On Tue, 11 Jun 2019 17:44:00 +0200
> > Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >  
> >> From: Toke Høiland-Jørgensen <toke@redhat.com>
> >> 
> >> The bpf_redirect_map() helper used by XDP programs doesn't return any
> >> indication of whether it can successfully redirect to the map index it was
> >> given. Instead, BPF programs have to track this themselves, leading to
> >> programs using duplicate maps to track which entries are populated in the
> >> devmap.
> >> 
> >> This patch fixes this by moving the map lookup into the bpf_redirect_map()
> >> helper, which makes it possible to return failure to the eBPF program. The
> >> lower bits of the flags argument is used as the return code, which means
> >> that existing users who pass a '0' flag argument will get XDP_ABORTED.
> >> 
> >> With this, a BPF program can check the return code from the helper call and
> >> react by, for instance, substituting a different redirect. This works for
> >> any type of map used for redirect.
> >> 
> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> >> ---
> >>  include/linux/filter.h   |    1 +
> >>  include/uapi/linux/bpf.h |    8 ++++++++
> >>  net/core/filter.c        |   26 ++++++++++++--------------
> >>  3 files changed, 21 insertions(+), 14 deletions(-)
> >> 
> >> diff --git a/include/linux/filter.h b/include/linux/filter.h
> >> index 43b45d6db36d..f31ae8b9035a 100644
> >> --- a/include/linux/filter.h
> >> +++ b/include/linux/filter.h
> >> @@ -580,6 +580,7 @@ struct bpf_skb_data_end {
> >>  struct bpf_redirect_info {
> >>  	u32 ifindex;
> >>  	u32 flags;
> >> +	void *item;
> >>  	struct bpf_map *map;
> >>  	struct bpf_map *map_to_flush;
> >>  	u32 kern_flags;
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 7c6aef253173..9931cf02de19 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -3098,6 +3098,14 @@ enum xdp_action {
> >>  	XDP_REDIRECT,
> >>  };
> >>  
> >> +/* Flags for bpf_xdp_redirect_map helper */
> >> +
> >> +/* The lower flag bits will be the return code of bpf_xdp_redirect_map() helper
> >> + * if the map lookup fails.
> >> + */
> >> +#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
> >> +#define XDP_REDIRECT_ALL_FLAGS XDP_REDIRECT_INVALID_MASK
> >> +  
> >
> > Slightly confused about the naming of the define, see later.
> >  
> >>  /* user accessible metadata for XDP packet hook
> >>   * new fields must be added to the end of this structure
> >>   */
> >> diff --git a/net/core/filter.c b/net/core/filter.c
> >> index 7a996887c500..dd43be497480 100644
> >> --- a/net/core/filter.c
> >> +++ b/net/core/filter.c
> >> @@ -3608,17 +3608,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
> >>  			       struct bpf_redirect_info *ri)
> >>  {
> >>  	u32 index = ri->ifindex;
> >> -	void *fwd = NULL;
> >> +	void *fwd = ri->item;
> >>  	int err;
> >>  
> >>  	ri->ifindex = 0;
> >> +	ri->item = NULL;
> >>  	WRITE_ONCE(ri->map, NULL);
> >>  
> >> -	fwd = __xdp_map_lookup_elem(map, index);
> >> -	if (unlikely(!fwd)) {
> >> -		err = -EINVAL;
> >> -		goto err;
> >> -	}
> >>  	if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
> >>  		xdp_do_flush_map();
> >>  
> >> @@ -3655,18 +3651,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
> >>  {
> >>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
> >>  	u32 index = ri->ifindex;
> >> -	void *fwd = NULL;
> >> +	void *fwd = ri->item;
> >>  	int err = 0;
> >>  
> >>  	ri->ifindex = 0;
> >> +	ri->item = NULL;
> >>  	WRITE_ONCE(ri->map, NULL);
> >>  
> >> -	fwd = __xdp_map_lookup_elem(map, index);
> >> -	if (unlikely(!fwd)) {
> >> -		err = -EINVAL;
> >> -		goto err;
> >> -	}
> >> -
> >>  	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
> >>  		struct bpf_dtab_netdev *dst = fwd;
> >>  
> >> @@ -3735,6 +3726,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
> >>  
> >>  	ri->ifindex = ifindex;
> >>  	ri->flags = flags;
> >> +	ri->item = NULL;
> >>  	WRITE_ONCE(ri->map, NULL);
> >>  
> >>  	return XDP_REDIRECT;
> >> @@ -3753,9 +3745,15 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
> >>  {
> >>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
> >>  
> >> -	if (unlikely(flags))
> >> +	if (unlikely(flags & ~XDP_REDIRECT_ALL_FLAGS))
> >>  		return XDP_ABORTED;
> >>  

Here you don't allow the flags to have any bits set outside
XDP_REDIRECT_ALL_FLAGS.

> >> +	ri->item = __xdp_map_lookup_elem(map, ifindex);
> >> +	if (unlikely(!ri->item)) {
> >> +		WRITE_ONCE(ri->map, NULL);
> >> +		return (flags & XDP_REDIRECT_INVALID_MASK);  

So here you could just return flags? Don't we know that the flags value is
legit here? Am I missing something? TBH the v2 was more clear to me.

> >
> > Maybe I'm reading it wrong, but shouldn't the mask be called the "valid" mask?  
> 
> It's the mask that is applied when the index looked up is invalid (i.e.,
> the entry doesn't exist)? But yeah, can see how the name can be
> confusing; maybe it should just be "RETURN_MASK" or something like that?

Maybe something along ALLOWED_RETVAL_MASK?

> 
> -Toke


* Re: [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper
  2019-06-12 20:01       ` Maciej Fijalkowski
@ 2019-06-12 21:33         ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-12 21:33 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: Jesper Dangaard Brouer, netdev, Daniel Borkmann,
	Alexei Starovoitov, David Miller, Jonathan Lemon

Maciej Fijalkowski <maciejromanfijalkowski@gmail.com> writes:

> On Tue, 11 Jun 2019 20:17:02 +0200
> Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
>> Jesper Dangaard Brouer <brouer@redhat.com> writes:
>> 
>> > On Tue, 11 Jun 2019 17:44:00 +0200
>> > Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> >  
>> >> From: Toke Høiland-Jørgensen <toke@redhat.com>
>> >> 
>> >> The bpf_redirect_map() helper used by XDP programs doesn't return any
>> >> indication of whether it can successfully redirect to the map index it was
>> >> given. Instead, BPF programs have to track this themselves, leading to
>> >> programs using duplicate maps to track which entries are populated in the
>> >> devmap.
>> >> 
>> >> This patch fixes this by moving the map lookup into the bpf_redirect_map()
>> >> helper, which makes it possible to return failure to the eBPF program. The
>> >> lower bits of the flags argument is used as the return code, which means
>> >> that existing users who pass a '0' flag argument will get XDP_ABORTED.
>> >> 
>> >> With this, a BPF program can check the return code from the helper call and
>> >> react by, for instance, substituting a different redirect. This works for
>> >> any type of map used for redirect.
>> >> 
>> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> >> ---
>> >>  include/linux/filter.h   |    1 +
>> >>  include/uapi/linux/bpf.h |    8 ++++++++
>> >>  net/core/filter.c        |   26 ++++++++++++--------------
>> >>  3 files changed, 21 insertions(+), 14 deletions(-)
>> >> 
>> >> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> >> index 43b45d6db36d..f31ae8b9035a 100644
>> >> --- a/include/linux/filter.h
>> >> +++ b/include/linux/filter.h
>> >> @@ -580,6 +580,7 @@ struct bpf_skb_data_end {
>> >>  struct bpf_redirect_info {
>> >>  	u32 ifindex;
>> >>  	u32 flags;
>> >> +	void *item;
>> >>  	struct bpf_map *map;
>> >>  	struct bpf_map *map_to_flush;
>> >>  	u32 kern_flags;
>> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> >> index 7c6aef253173..9931cf02de19 100644
>> >> --- a/include/uapi/linux/bpf.h
>> >> +++ b/include/uapi/linux/bpf.h
>> >> @@ -3098,6 +3098,14 @@ enum xdp_action {
>> >>  	XDP_REDIRECT,
>> >>  };
>> >>  
>> >> +/* Flags for bpf_xdp_redirect_map helper */
>> >> +
>> >> +/* The lower flag bits will be the return code of bpf_xdp_redirect_map() helper
>> >> + * if the map lookup fails.
>> >> + */
>> >> +#define XDP_REDIRECT_INVALID_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
>> >> +#define XDP_REDIRECT_ALL_FLAGS XDP_REDIRECT_INVALID_MASK
>> >> +  
>> >
>> > Slightly confused about the naming of the define, see later.
>> >  
>> >>  /* user accessible metadata for XDP packet hook
>> >>   * new fields must be added to the end of this structure
>> >>   */
>> >> diff --git a/net/core/filter.c b/net/core/filter.c
>> >> index 7a996887c500..dd43be497480 100644
>> >> --- a/net/core/filter.c
>> >> +++ b/net/core/filter.c
>> >> @@ -3608,17 +3608,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
>> >>  			       struct bpf_redirect_info *ri)
>> >>  {
>> >>  	u32 index = ri->ifindex;
>> >> -	void *fwd = NULL;
>> >> +	void *fwd = ri->item;
>> >>  	int err;
>> >>  
>> >>  	ri->ifindex = 0;
>> >> +	ri->item = NULL;
>> >>  	WRITE_ONCE(ri->map, NULL);
>> >>  
>> >> -	fwd = __xdp_map_lookup_elem(map, index);
>> >> -	if (unlikely(!fwd)) {
>> >> -		err = -EINVAL;
>> >> -		goto err;
>> >> -	}
>> >>  	if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
>> >>  		xdp_do_flush_map();
>> >>  
>> >> @@ -3655,18 +3651,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
>> >>  {
>> >>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>> >>  	u32 index = ri->ifindex;
>> >> -	void *fwd = NULL;
>> >> +	void *fwd = ri->item;
>> >>  	int err = 0;
>> >>  
>> >>  	ri->ifindex = 0;
>> >> +	ri->item = NULL;
>> >>  	WRITE_ONCE(ri->map, NULL);
>> >>  
>> >> -	fwd = __xdp_map_lookup_elem(map, index);
>> >> -	if (unlikely(!fwd)) {
>> >> -		err = -EINVAL;
>> >> -		goto err;
>> >> -	}
>> >> -
>> >>  	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
>> >>  		struct bpf_dtab_netdev *dst = fwd;
>> >>  
>> >> @@ -3735,6 +3726,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
>> >>  
>> >>  	ri->ifindex = ifindex;
>> >>  	ri->flags = flags;
>> >> +	ri->item = NULL;
>> >>  	WRITE_ONCE(ri->map, NULL);
>> >>  
>> >>  	return XDP_REDIRECT;
>> >> @@ -3753,9 +3745,15 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
>> >>  {
>> >>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>> >>  
>> >> -	if (unlikely(flags))
>> >> +	if (unlikely(flags & ~XDP_REDIRECT_ALL_FLAGS))
>> >>  		return XDP_ABORTED;
>> >>  
>
> Here you don't allow the flags to get different value than
> XDP_REDIRECT_ALL_FLAGS.
>
>> >> +	ri->item = __xdp_map_lookup_elem(map, ifindex);
>> >> +	if (unlikely(!ri->item)) {
>> >> +		WRITE_ONCE(ri->map, NULL);
>> >> +		return (flags & XDP_REDIRECT_INVALID_MASK);  
>
> So here you could just return flags? Don't we know that the flags
> value is legit here? Am I missing something?

No, you're right. I was just worried that we would forget to change this
when we add another flag later on, so I thought I'd mask it from the
get-go. Can't the compiler figure out that the second mask is unnecessary?

> TBH the v2 was more clear to me.

Clearer how? As in, you prefer just always returning XDP_PASS on error?

-Toke

>> > Maybe I'm reading it wrong, but shouldn't the mask be called the "valid" mask?  
>> 
>> It's the mask that is applied when the index looked up is invalid (i.e.,
>> the entry doesn't exist)? But yeah, can see how the name can be
>> confusing; maybe it should just be "RETURN_MASK" or something like that?
>
> Maybe something along ALLOWED_RETVAL_MASK?

Yeah, good idea! :)

-Toke

* Re: [PATCH bpf-next v3 1/3] devmap/cpumap: Use flush list instead of bitmap
  2019-06-11 15:44 ` [PATCH bpf-next v3 1/3] devmap/cpumap: Use flush list instead of bitmap Toke Høiland-Jørgensen
@ 2019-06-13  5:51   ` Andrii Nakryiko
  2019-06-13 11:02     ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 13+ messages in thread
From: Andrii Nakryiko @ 2019-06-13  5:51 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Networking, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, David Miller, Jonathan Lemon

On Tue, Jun 11, 2019 at 9:42 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> From: Toke Høiland-Jørgensen <toke@redhat.com>
>
> The socket map uses a linked list instead of a bitmap to keep track of
> which entries to flush. Do the same for devmap and cpumap, as this means we
> don't have to care about the map index when enqueueing things into the
> map (and so we can cache the map lookup).
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---
>  kernel/bpf/cpumap.c |   97 ++++++++++++++++++++++++---------------------------
>  kernel/bpf/devmap.c |   93 ++++++++++++++++++++++---------------------------
>  net/core/filter.c   |    2 -
>  3 files changed, 87 insertions(+), 105 deletions(-)
>
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index b31a71909307..02dc3503ef4d 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -38,8 +38,13 @@
>   */
>
>  #define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
> +struct bpf_cpu_map_entry;
> +struct bpf_cpu_map;
> +
>  struct xdp_bulk_queue {
>         void *q[CPU_MAP_BULK_SIZE];
> +       struct list_head flush_node;
> +       struct bpf_cpu_map_entry *obj;
>         unsigned int count;
>  };
>
> @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
>         /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
>         struct xdp_bulk_queue __percpu *bulkq;
>
> +       struct bpf_cpu_map *cmap;
> +
>         /* Queue with potential multi-producers, and single-consumer kthread */
>         struct ptr_ring *queue;
>         struct task_struct *kthread;
> @@ -65,23 +72,17 @@ struct bpf_cpu_map {
>         struct bpf_map map;
>         /* Below members specific for map type */
>         struct bpf_cpu_map_entry **cpu_map;
> -       unsigned long __percpu *flush_needed;
> +       struct list_head __percpu *flush_list;
>  };
>
> -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
> -                            struct xdp_bulk_queue *bq, bool in_napi_ctx);
> -
> -static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
> -{
> -       return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
> -}
> +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
>
>  static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>  {
>         struct bpf_cpu_map *cmap;
>         int err = -ENOMEM;
> +       int ret, cpu;
>         u64 cost;
> -       int ret;
>
>         if (!capable(CAP_SYS_ADMIN))
>                 return ERR_PTR(-EPERM);
> @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>
>         /* make sure page count doesn't overflow */
>         cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
> -       cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
> +       cost += sizeof(struct list_head) * num_possible_cpus();
>
>         /* Notice returns -EPERM on if map size is larger than memlock limit */
>         ret = bpf_map_charge_init(&cmap->map.memory, cost);
> @@ -115,11 +116,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>         }
>
>         /* A per cpu bitfield with a bit per possible CPU in map  */

Comment is wrong now.

> -       cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
> -                                           __alignof__(unsigned long));
> -       if (!cmap->flush_needed)
> +       cmap->flush_list = alloc_percpu(struct list_head);
> +       if (!cmap->flush_list)
>                 goto free_charge;
>
> +       for_each_possible_cpu(cpu)
> +               INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
> +
>         /* Alloc array for possible remote "destination" CPUs */
>         cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
>                                            sizeof(struct bpf_cpu_map_entry *),
> @@ -129,7 +132,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>
>         return &cmap->map;
>  free_percpu:
> -       free_percpu(cmap->flush_needed);
> +       free_percpu(cmap->flush_list);
>  free_charge:
>         bpf_map_charge_finish(&cmap->map.memory);
>  free_cmap:
> @@ -331,7 +334,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
>  {
>         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
>         struct bpf_cpu_map_entry *rcpu;
> -       int numa, err;
> +       struct xdp_bulk_queue *bq;
> +       int numa, err, i;
>
>         /* Have map->numa_node, but choose node of redirect target CPU */
>         numa = cpu_to_node(cpu);
> @@ -346,6 +350,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
>         if (!rcpu->bulkq)
>                 goto free_rcu;
>
> +       for_each_possible_cpu(i) {
> +               bq = per_cpu_ptr(rcpu->bulkq, i);
> +               bq->obj = rcpu;
> +       }
> +
>         /* Alloc queue */
>         rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
>         if (!rcpu->queue)
> @@ -402,7 +411,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
>                 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
>
>                 /* No concurrent bq_enqueue can run at this point */
> -               bq_flush_to_queue(rcpu, bq, false);
> +               bq_flush_to_queue(bq, false);
>         }
>         free_percpu(rcpu->bulkq);
>         /* Cannot kthread_stop() here, last put free rcpu resources */
> @@ -485,6 +494,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
>                 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
>                 if (!rcpu)
>                         return -ENOMEM;
> +               rcpu->cmap = cmap;
>         }
>         rcu_read_lock();
>         __cpu_map_entry_replace(cmap, key_cpu, rcpu);
> @@ -516,9 +526,9 @@ static void cpu_map_free(struct bpf_map *map)
>          * from the program we can assume no new bits will be set.
>          */
>         for_each_online_cpu(cpu) {
> -               unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
> +               struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
>
> -               while (!bitmap_empty(bitmap, cmap->map.max_entries))
> +               while (!list_empty(flush_list))
>                         cond_resched();
>         }
>
> @@ -535,7 +545,7 @@ static void cpu_map_free(struct bpf_map *map)
>                 /* bq flush and cleanup happens after RCU graze-period */
>                 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
>         }
> -       free_percpu(cmap->flush_needed);
> +       free_percpu(cmap->flush_list);
>         bpf_map_area_free(cmap->cpu_map);
>         kfree(cmap);
>  }
> @@ -587,9 +597,9 @@ const struct bpf_map_ops cpu_map_ops = {
>         .map_check_btf          = map_check_no_btf,
>  };
>
> -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
> -                            struct xdp_bulk_queue *bq, bool in_napi_ctx)
> +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
>  {
> +       struct bpf_cpu_map_entry *rcpu = bq->obj;
>         unsigned int processed = 0, drops = 0;
>         const int to_cpu = rcpu->cpu;
>         struct ptr_ring *q;
> @@ -618,6 +628,9 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
>         bq->count = 0;
>         spin_unlock(&q->producer_lock);
>
> +       __list_del(bq->flush_node.prev, bq->flush_node.next);
> +       bq->flush_node.prev = NULL;
> +
>         /* Feedback loop via tracepoints */
>         trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
>         return 0;
> @@ -628,10 +641,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
>   */
>  static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
>  {
> +       struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
>         struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
>
>         if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
> -               bq_flush_to_queue(rcpu, bq, true);
> +               bq_flush_to_queue(bq, true);
>
>         /* Notice, xdp_buff/page MUST be queued here, long enough for
>          * driver to code invoking us to finished, due to driver
> @@ -643,6 +657,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
>          * operation, when completing napi->poll call.
>          */
>         bq->q[bq->count++] = xdpf;
> +
> +       if (!bq->flush_node.prev)
> +               list_add(&bq->flush_node, flush_list);
> +
>         return 0;
>  }
>
> @@ -662,41 +680,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
>         return 0;
>  }
>
> -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
> -{
> -       struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
> -       unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
> -
> -       __set_bit(bit, bitmap);
> -}
> -
>  void __cpu_map_flush(struct bpf_map *map)
>  {
>         struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
> -       unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
> -       u32 bit;
> -
> -       /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
> -        * and __cpu_map_flush() happen on same CPU. Thus, the percpu
> -        * bitmap indicate which percpu bulkq have packets.
> -        */
> -       for_each_set_bit(bit, bitmap, map->max_entries) {
> -               struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
> -               struct xdp_bulk_queue *bq;
> -
> -               /* This is possible if entry is removed by user space
> -                * between xdp redirect and flush op.
> -                */
> -               if (unlikely(!rcpu))
> -                       continue;
> -
> -               __clear_bit(bit, bitmap);
> +       struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
> +       struct xdp_bulk_queue *bq, *tmp;
>
> -               /* Flush all frames in bulkq to real queue */
> -               bq = this_cpu_ptr(rcpu->bulkq);
> -               bq_flush_to_queue(rcpu, bq, true);
> +       list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
> +               bq_flush_to_queue(bq, true);
>
>                 /* If already running, costs spin_lock_irqsave + smb_mb */
> -               wake_up_process(rcpu->kthread);
> +               wake_up_process(bq->obj->kthread);
>         }
>  }
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 5ae7cce5ef16..c945518225f3 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -56,9 +56,13 @@
>         (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>
>  #define DEV_MAP_BULK_SIZE 16
> +struct bpf_dtab_netdev;
> +
>  struct xdp_bulk_queue {
>         struct xdp_frame *q[DEV_MAP_BULK_SIZE];
> +       struct list_head flush_node;
>         struct net_device *dev_rx;
> +       struct bpf_dtab_netdev *obj;
>         unsigned int count;
>  };
>
> @@ -73,23 +77,19 @@ struct bpf_dtab_netdev {
>  struct bpf_dtab {
>         struct bpf_map map;
>         struct bpf_dtab_netdev **netdev_map;
> -       unsigned long __percpu *flush_needed;
> +       struct list_head __percpu *flush_list;
>         struct list_head list;
>  };
>
>  static DEFINE_SPINLOCK(dev_map_lock);
>  static LIST_HEAD(dev_map_list);
>
> -static u64 dev_map_bitmap_size(const union bpf_attr *attr)
> -{
> -       return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
> -}
> -
>  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  {
>         struct bpf_dtab *dtab;
>         int err = -EINVAL;
>         u64 cost;
> +       int cpu;
>
>         if (!capable(CAP_NET_ADMIN))
>                 return ERR_PTR(-EPERM);
> @@ -107,7 +107,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>
>         /* make sure page count doesn't overflow */
>         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
> -       cost += dev_map_bitmap_size(attr) * num_possible_cpus();
> +       cost += sizeof(struct list_head) * num_possible_cpus();
>
>         /* if map size is larger than memlock limit, reject it */
>         err = bpf_map_charge_init(&dtab->map.memory, cost);
> @@ -116,28 +116,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>
>         err = -ENOMEM;
>
> -       /* A per cpu bitfield with a bit per possible net device */
> -       dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
> -                                               __alignof__(unsigned long),
> -                                               GFP_KERNEL | __GFP_NOWARN);
> -       if (!dtab->flush_needed)
> +       dtab->flush_list = alloc_percpu(struct list_head);

Is it ok to lose __GFP_NOWARN bit, which was previously provided?

> +       if (!dtab->flush_list)
>                 goto free_charge;
>
> +       for_each_possible_cpu(cpu)
> +               INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
> +
>         dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>                                               sizeof(struct bpf_dtab_netdev *),
>                                               dtab->map.numa_node);
>         if (!dtab->netdev_map)
> -               goto free_charge;
> +               goto free_percpu;
>
>         spin_lock(&dev_map_lock);
>         list_add_tail_rcu(&dtab->list, &dev_map_list);
>         spin_unlock(&dev_map_lock);
>
>         return &dtab->map;
> +
> +free_percpu:
> +       free_percpu(dtab->flush_list);
>  free_charge:
>         bpf_map_charge_finish(&dtab->map.memory);
>  free_dtab:
> -       free_percpu(dtab->flush_needed);
>         kfree(dtab);
>         return ERR_PTR(err);
>  }
> @@ -171,9 +173,9 @@ static void dev_map_free(struct bpf_map *map)
>          * from the program we can assume no new bits will be set.
>          */
>         for_each_online_cpu(cpu) {
> -               unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
> +               struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
>
> -               while (!bitmap_empty(bitmap, dtab->map.max_entries))
> +               while (!list_empty(flush_list))
>                         cond_resched();
>         }
>
> @@ -188,7 +190,7 @@ static void dev_map_free(struct bpf_map *map)
>                 kfree(dev);
>         }
>
> -       free_percpu(dtab->flush_needed);
> +       free_percpu(dtab->flush_list);
>         bpf_map_area_free(dtab->netdev_map);
>         kfree(dtab);
>  }
> @@ -210,18 +212,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>         return 0;
>  }
>
> -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
> -{
> -       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> -       unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
> -
> -       __set_bit(bit, bitmap);
> -}
> -
> -static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> -                      struct xdp_bulk_queue *bq, u32 flags,
> +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
>                        bool in_napi_ctx)
>  {
> +

extra line?

> +       struct bpf_dtab_netdev *obj = bq->obj;
>         struct net_device *dev = obj->dev;
>         int sent = 0, drops = 0, err = 0;
>         int i;
> @@ -248,6 +243,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>         trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
>                               sent, drops, bq->dev_rx, dev, err);
>         bq->dev_rx = NULL;
> +       __list_del(bq->flush_node.prev, bq->flush_node.next);
> +       bq->flush_node.prev = NULL;
>         return 0;
>  error:
>         /* If ndo_xdp_xmit fails with an errno, no frames have been
> @@ -276,24 +273,11 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>  void __dev_map_flush(struct bpf_map *map)
>  {
>         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> -       unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
> -       u32 bit;
> +       struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
> +       struct xdp_bulk_queue *bq, *tmp;
>
> -       for_each_set_bit(bit, bitmap, map->max_entries) {
> -               struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
> -               struct xdp_bulk_queue *bq;
> -
> -               /* This is possible if the dev entry is removed by user space
> -                * between xdp redirect and flush op.
> -                */
> -               if (unlikely(!dev))
> -                       continue;
> -
> -               __clear_bit(bit, bitmap);
> -
> -               bq = this_cpu_ptr(dev->bulkq);
> -               bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
> -       }
> +       list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
> +               bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
>  }
>
>  /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
> @@ -319,10 +303,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
>                       struct net_device *dev_rx)
>
>  {
> +       struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
>         struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
>
>         if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> -               bq_xmit_all(obj, bq, 0, true);
> +               bq_xmit_all(bq, 0, true);
>
>         /* Ingress dev_rx will be the same for all xdp_frame's in
>          * bulk_queue, because bq stored per-CPU and must be flushed
> @@ -332,6 +317,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
>                 bq->dev_rx = dev_rx;
>
>         bq->q[bq->count++] = xdpf;
> +
> +       if (!bq->flush_node.prev)
> +               list_add(&bq->flush_node, flush_list);
> +
>         return 0;
>  }
>
> @@ -382,16 +371,11 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>  {
>         if (dev->dev->netdev_ops->ndo_xdp_xmit) {
>                 struct xdp_bulk_queue *bq;
> -               unsigned long *bitmap;
> -
>                 int cpu;
>
>                 for_each_online_cpu(cpu) {
> -                       bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
> -                       __clear_bit(dev->bit, bitmap);
> -
>                         bq = per_cpu_ptr(dev->bulkq, cpu);
> -                       bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
> +                       bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
>                 }
>         }
>  }
> @@ -439,6 +423,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>         struct bpf_dtab_netdev *dev, *old_dev;
>         u32 i = *(u32 *)key;
>         u32 ifindex = *(u32 *)value;

can you please fix reverse Christmas tree order, while you are here?

> +       struct xdp_bulk_queue *bq;
> +       int cpu;
>
>         if (unlikely(map_flags > BPF_EXIST))
>                 return -EINVAL;
> @@ -461,6 +447,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>                         return -ENOMEM;
>                 }
>
> +               for_each_possible_cpu(cpu) {
> +                       bq = per_cpu_ptr(dev->bulkq, cpu);
> +                       bq->obj = dev;
> +               }
> +
>                 dev->dev = dev_get_by_index(net, ifindex);
>                 if (!dev->dev) {
>                         free_percpu(dev->bulkq);
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 55bfc941d17a..7a996887c500 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3526,7 +3526,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>                 err = dev_map_enqueue(dst, xdp, dev_rx);
>                 if (unlikely(err))
>                         return err;
> -               __dev_map_insert_ctx(map, index);
>                 break;
>         }
>         case BPF_MAP_TYPE_CPUMAP: {
> @@ -3535,7 +3534,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>                 err = cpu_map_enqueue(rcpu, xdp, dev_rx);
>                 if (unlikely(err))
>                         return err;
> -               __cpu_map_insert_ctx(map, index);
>                 break;
>         }
>         case BPF_MAP_TYPE_XSKMAP: {
>


* Re: [PATCH bpf-next v3 1/3] devmap/cpumap: Use flush list instead of bitmap
  2019-06-13  5:51   ` Andrii Nakryiko
@ 2019-06-13 11:02     ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 13+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-06-13 11:02 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Networking, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, David Miller, Jonathan Lemon

Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Tue, Jun 11, 2019 at 9:42 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> From: Toke Høiland-Jørgensen <toke@redhat.com>
>>
>> The socket map uses a linked list instead of a bitmap to keep track of
>> which entries to flush. Do the same for devmap and cpumap, as this means we
>> don't have to care about the map index when enqueueing things into the
>> map (and so we can cache the map lookup).
>>
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> ---
>>  kernel/bpf/cpumap.c |   97 ++++++++++++++++++++++++---------------------------
>>  kernel/bpf/devmap.c |   93 ++++++++++++++++++++++---------------------------
>>  net/core/filter.c   |    2 -
>>  3 files changed, 87 insertions(+), 105 deletions(-)
>>
>> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
>> index b31a71909307..02dc3503ef4d 100644
>> --- a/kernel/bpf/cpumap.c
>> +++ b/kernel/bpf/cpumap.c
>> @@ -38,8 +38,13 @@
>>   */
>>
>>  #define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
>> +struct bpf_cpu_map_entry;
>> +struct bpf_cpu_map;
>> +
>>  struct xdp_bulk_queue {
>>         void *q[CPU_MAP_BULK_SIZE];
>> +       struct list_head flush_node;
>> +       struct bpf_cpu_map_entry *obj;
>>         unsigned int count;
>>  };
>>
>> @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
>>         /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
>>         struct xdp_bulk_queue __percpu *bulkq;
>>
>> +       struct bpf_cpu_map *cmap;
>> +
>>         /* Queue with potential multi-producers, and single-consumer kthread */
>>         struct ptr_ring *queue;
>>         struct task_struct *kthread;
>> @@ -65,23 +72,17 @@ struct bpf_cpu_map {
>>         struct bpf_map map;
>>         /* Below members specific for map type */
>>         struct bpf_cpu_map_entry **cpu_map;
>> -       unsigned long __percpu *flush_needed;
>> +       struct list_head __percpu *flush_list;
>>  };
>>
>> -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
>> -                            struct xdp_bulk_queue *bq, bool in_napi_ctx);
>> -
>> -static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
>> -{
>> -       return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
>> -}
>> +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
>>
>>  static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>>  {
>>         struct bpf_cpu_map *cmap;
>>         int err = -ENOMEM;
>> +       int ret, cpu;
>>         u64 cost;
>> -       int ret;
>>
>>         if (!capable(CAP_SYS_ADMIN))
>>                 return ERR_PTR(-EPERM);
>> @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>>
>>         /* make sure page count doesn't overflow */
>>         cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
>> -       cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
>> +       cost += sizeof(struct list_head) * num_possible_cpus();
>>
>>         /* Notice returns -EPERM on if map size is larger than memlock limit */
>>         ret = bpf_map_charge_init(&cmap->map.memory, cost);
>> @@ -115,11 +116,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>>         }
>>
>>         /* A per cpu bitfield with a bit per possible CPU in map  */
>
> Comment is wrong now.

Good catch! And not the only comment that is wrong; will fix them all.
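
One possible wording, just as a sketch of what the corrected comment could
say (based on how the list is used elsewhere in this patch, not on any
final version):

	/* Alloc the per-CPU flush lists: bq_enqueue() links a bulk queue
	 * into the current CPU's list the first time a frame is queued on
	 * it, and bq_flush_to_queue() unlinks it again once it is flushed.
	 */
	cmap->flush_list = alloc_percpu(struct list_head);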

>> -       cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
>> -                                           __alignof__(unsigned long));
>> -       if (!cmap->flush_needed)
>> +       cmap->flush_list = alloc_percpu(struct list_head);
>> +       if (!cmap->flush_list)
>>                 goto free_charge;
>>
>> +       for_each_possible_cpu(cpu)
>> +               INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
>> +
>>         /* Alloc array for possible remote "destination" CPUs */
>>         cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
>>                                            sizeof(struct bpf_cpu_map_entry *),
>> @@ -129,7 +132,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>>
>>         return &cmap->map;
>>  free_percpu:
>> -       free_percpu(cmap->flush_needed);
>> +       free_percpu(cmap->flush_list);
>>  free_charge:
>>         bpf_map_charge_finish(&cmap->map.memory);
>>  free_cmap:
>> @@ -331,7 +334,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
>>  {
>>         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
>>         struct bpf_cpu_map_entry *rcpu;
>> -       int numa, err;
>> +       struct xdp_bulk_queue *bq;
>> +       int numa, err, i;
>>
>>         /* Have map->numa_node, but choose node of redirect target CPU */
>>         numa = cpu_to_node(cpu);
>> @@ -346,6 +350,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
>>         if (!rcpu->bulkq)
>>                 goto free_rcu;
>>
>> +       for_each_possible_cpu(i) {
>> +               bq = per_cpu_ptr(rcpu->bulkq, i);
>> +               bq->obj = rcpu;
>> +       }
>> +
>>         /* Alloc queue */
>>         rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
>>         if (!rcpu->queue)
>> @@ -402,7 +411,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
>>                 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
>>
>>                 /* No concurrent bq_enqueue can run at this point */
>> -               bq_flush_to_queue(rcpu, bq, false);
>> +               bq_flush_to_queue(bq, false);
>>         }
>>         free_percpu(rcpu->bulkq);
>>         /* Cannot kthread_stop() here, last put free rcpu resources */
>> @@ -485,6 +494,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
>>                 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
>>                 if (!rcpu)
>>                         return -ENOMEM;
>> +               rcpu->cmap = cmap;
>>         }
>>         rcu_read_lock();
>>         __cpu_map_entry_replace(cmap, key_cpu, rcpu);
>> @@ -516,9 +526,9 @@ static void cpu_map_free(struct bpf_map *map)
>>          * from the program we can assume no new bits will be set.
>>          */
>>         for_each_online_cpu(cpu) {
>> -               unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
>> +               struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
>>
>> -               while (!bitmap_empty(bitmap, cmap->map.max_entries))
>> +               while (!list_empty(flush_list))
>>                         cond_resched();
>>         }
>>
>> @@ -535,7 +545,7 @@ static void cpu_map_free(struct bpf_map *map)
>>                 /* bq flush and cleanup happens after RCU graze-period */
>>                 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
>>         }
>> -       free_percpu(cmap->flush_needed);
>> +       free_percpu(cmap->flush_list);
>>         bpf_map_area_free(cmap->cpu_map);
>>         kfree(cmap);
>>  }
>> @@ -587,9 +597,9 @@ const struct bpf_map_ops cpu_map_ops = {
>>         .map_check_btf          = map_check_no_btf,
>>  };
>>
>> -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
>> -                            struct xdp_bulk_queue *bq, bool in_napi_ctx)
>> +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
>>  {
>> +       struct bpf_cpu_map_entry *rcpu = bq->obj;
>>         unsigned int processed = 0, drops = 0;
>>         const int to_cpu = rcpu->cpu;
>>         struct ptr_ring *q;
>> @@ -618,6 +628,9 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
>>         bq->count = 0;
>>         spin_unlock(&q->producer_lock);
>>
>> +       __list_del(bq->flush_node.prev, bq->flush_node.next);
>> +       bq->flush_node.prev = NULL;
>> +
>>         /* Feedback loop via tracepoints */
>>         trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
>>         return 0;
>> @@ -628,10 +641,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
>>   */
>>  static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
>>  {
>> +       struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
>>         struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
>>
>>         if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
>> -               bq_flush_to_queue(rcpu, bq, true);
>> +               bq_flush_to_queue(bq, true);
>>
>>         /* Notice, xdp_buff/page MUST be queued here, long enough for
>>          * driver to code invoking us to finished, due to driver
>> @@ -643,6 +657,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
>>          * operation, when completing napi->poll call.
>>          */
>>         bq->q[bq->count++] = xdpf;
>> +
>> +       if (!bq->flush_node.prev)
>> +               list_add(&bq->flush_node, flush_list);
>> +
>>         return 0;
>>  }
>>
>> @@ -662,41 +680,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
>>         return 0;
>>  }
>>
>> -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
>> -{
>> -       struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
>> -       unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
>> -
>> -       __set_bit(bit, bitmap);
>> -}
>> -
>>  void __cpu_map_flush(struct bpf_map *map)
>>  {
>>         struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
>> -       unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
>> -       u32 bit;
>> -
>> -       /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
>> -        * and __cpu_map_flush() happen on same CPU. Thus, the percpu
>> -        * bitmap indicate which percpu bulkq have packets.
>> -        */
>> -       for_each_set_bit(bit, bitmap, map->max_entries) {
>> -               struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
>> -               struct xdp_bulk_queue *bq;
>> -
>> -               /* This is possible if entry is removed by user space
>> -                * between xdp redirect and flush op.
>> -                */
>> -               if (unlikely(!rcpu))
>> -                       continue;
>> -
>> -               __clear_bit(bit, bitmap);
>> +       struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
>> +       struct xdp_bulk_queue *bq, *tmp;
>>
>> -               /* Flush all frames in bulkq to real queue */
>> -               bq = this_cpu_ptr(rcpu->bulkq);
>> -               bq_flush_to_queue(rcpu, bq, true);
>> +       list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>> +               bq_flush_to_queue(bq, true);
>>
>>                 /* If already running, costs spin_lock_irqsave + smb_mb */
>> -               wake_up_process(rcpu->kthread);
>> +               wake_up_process(bq->obj->kthread);
>>         }
>>  }
>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> index 5ae7cce5ef16..c945518225f3 100644
>> --- a/kernel/bpf/devmap.c
>> +++ b/kernel/bpf/devmap.c
>> @@ -56,9 +56,13 @@
>>         (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>>
>>  #define DEV_MAP_BULK_SIZE 16
>> +struct bpf_dtab_netdev;
>> +
>>  struct xdp_bulk_queue {
>>         struct xdp_frame *q[DEV_MAP_BULK_SIZE];
>> +       struct list_head flush_node;
>>         struct net_device *dev_rx;
>> +       struct bpf_dtab_netdev *obj;
>>         unsigned int count;
>>  };
>>
>> @@ -73,23 +77,19 @@ struct bpf_dtab_netdev {
>>  struct bpf_dtab {
>>         struct bpf_map map;
>>         struct bpf_dtab_netdev **netdev_map;
>> -       unsigned long __percpu *flush_needed;
>> +       struct list_head __percpu *flush_list;
>>         struct list_head list;
>>  };
>>
>>  static DEFINE_SPINLOCK(dev_map_lock);
>>  static LIST_HEAD(dev_map_list);
>>
>> -static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>> -{
>> -       return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>> -}
>> -
>>  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>  {
>>         struct bpf_dtab *dtab;
>>         int err = -EINVAL;
>>         u64 cost;
>> +       int cpu;
>>
>>         if (!capable(CAP_NET_ADMIN))
>>                 return ERR_PTR(-EPERM);
>> @@ -107,7 +107,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>
>>         /* make sure page count doesn't overflow */
>>         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>> -       cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>> +       cost += sizeof(struct list_head) * num_possible_cpus();
>>
>>         /* if map size is larger than memlock limit, reject it */
>>         err = bpf_map_charge_init(&dtab->map.memory, cost);
>> @@ -116,28 +116,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>
>>         err = -ENOMEM;
>>
>> -       /* A per cpu bitfield with a bit per possible net device */
>> -       dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>> -                                               __alignof__(unsigned long),
>> -                                               GFP_KERNEL | __GFP_NOWARN);
>> -       if (!dtab->flush_needed)
>> +       dtab->flush_list = alloc_percpu(struct list_head);
>
> Is it ok to lose __GFP_NOWARN bit, which was previously provided?

Yeah, I believe so. This is a pretty standard allocation failure, so no
real reason to mask the warning. The devmap is the only place that sets
the NOWARN bit anyway (neither cpumap nor xskmap do), so I think it's
probably just a coincidental setting here...
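
For comparison, keeping the flag with the new fixed-size allocation would
look roughly like this (a sketch only, not a suggestion that it is needed):

	dtab->flush_list = __alloc_percpu_gfp(sizeof(struct list_head),
					      __alignof__(struct list_head),
					      GFP_KERNEL | __GFP_NOWARN);

i.e. dropping the flag is just a side effect of switching to the plain
alloc_percpu() convenience macro, which uses GFP_KERNEL.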

>> +       if (!dtab->flush_list)
>>                 goto free_charge;
>>
>> +       for_each_possible_cpu(cpu)
>> +               INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
>> +
>>         dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>>                                               sizeof(struct bpf_dtab_netdev *),
>>                                               dtab->map.numa_node);
>>         if (!dtab->netdev_map)
>> -               goto free_charge;
>> +               goto free_percpu;
>>
>>         spin_lock(&dev_map_lock);
>>         list_add_tail_rcu(&dtab->list, &dev_map_list);
>>         spin_unlock(&dev_map_lock);
>>
>>         return &dtab->map;
>> +
>> +free_percpu:
>> +       free_percpu(dtab->flush_list);
>>  free_charge:
>>         bpf_map_charge_finish(&dtab->map.memory);
>>  free_dtab:
>> -       free_percpu(dtab->flush_needed);
>>         kfree(dtab);
>>         return ERR_PTR(err);
>>  }
>> @@ -171,9 +173,9 @@ static void dev_map_free(struct bpf_map *map)
>>          * from the program we can assume no new bits will be set.
>>          */
>>         for_each_online_cpu(cpu) {
>> -               unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
>> +               struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
>>
>> -               while (!bitmap_empty(bitmap, dtab->map.max_entries))
>> +               while (!list_empty(flush_list))
>>                         cond_resched();
>>         }
>>
>> @@ -188,7 +190,7 @@ static void dev_map_free(struct bpf_map *map)
>>                 kfree(dev);
>>         }
>>
>> -       free_percpu(dtab->flush_needed);
>> +       free_percpu(dtab->flush_list);
>>         bpf_map_area_free(dtab->netdev_map);
>>         kfree(dtab);
>>  }
>> @@ -210,18 +212,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>>         return 0;
>>  }
>>
>> -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
>> -{
>> -       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> -       unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
>> -
>> -       __set_bit(bit, bitmap);
>> -}
>> -
>> -static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>> -                      struct xdp_bulk_queue *bq, u32 flags,
>> +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
>>                        bool in_napi_ctx)
>>  {
>> +
>
> extra line?

Right, will remove.

>> +       struct bpf_dtab_netdev *obj = bq->obj;
>>         struct net_device *dev = obj->dev;
>>         int sent = 0, drops = 0, err = 0;
>>         int i;
>> @@ -248,6 +243,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>>         trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
>>                               sent, drops, bq->dev_rx, dev, err);
>>         bq->dev_rx = NULL;
>> +       __list_del(bq->flush_node.prev, bq->flush_node.next);
>> +       bq->flush_node.prev = NULL;
>>         return 0;
>>  error:
>>         /* If ndo_xdp_xmit fails with an errno, no frames have been
>> @@ -276,24 +273,11 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>>  void __dev_map_flush(struct bpf_map *map)
>>  {
>>         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> -       unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
>> -       u32 bit;
>> +       struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
>> +       struct xdp_bulk_queue *bq, *tmp;
>>
>> -       for_each_set_bit(bit, bitmap, map->max_entries) {
>> -               struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
>> -               struct xdp_bulk_queue *bq;
>> -
>> -               /* This is possible if the dev entry is removed by user space
>> -                * between xdp redirect and flush op.
>> -                */
>> -               if (unlikely(!dev))
>> -                       continue;
>> -
>> -               __clear_bit(bit, bitmap);
>> -
>> -               bq = this_cpu_ptr(dev->bulkq);
>> -               bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
>> -       }
>> +       list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
>> +               bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
>>  }
>>
>>  /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
>> @@ -319,10 +303,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
>>                       struct net_device *dev_rx)
>>
>>  {
>> +       struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
>>         struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
>>
>>         if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
>> -               bq_xmit_all(obj, bq, 0, true);
>> +               bq_xmit_all(bq, 0, true);
>>
>>         /* Ingress dev_rx will be the same for all xdp_frame's in
>>          * bulk_queue, because bq stored per-CPU and must be flushed
>> @@ -332,6 +317,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
>>                 bq->dev_rx = dev_rx;
>>
>>         bq->q[bq->count++] = xdpf;
>> +
>> +       if (!bq->flush_node.prev)
>> +               list_add(&bq->flush_node, flush_list);
>> +
>>         return 0;
>>  }
>>
>> @@ -382,16 +371,11 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>>  {
>>         if (dev->dev->netdev_ops->ndo_xdp_xmit) {
>>                 struct xdp_bulk_queue *bq;
>> -               unsigned long *bitmap;
>> -
>>                 int cpu;
>>
>>                 for_each_online_cpu(cpu) {
>> -                       bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
>> -                       __clear_bit(dev->bit, bitmap);
>> -
>>                         bq = per_cpu_ptr(dev->bulkq, cpu);
>> -                       bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
>> +                       bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
>>                 }
>>         }
>>  }
>> @@ -439,6 +423,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>>         struct bpf_dtab_netdev *dev, *old_dev;
>>         u32 i = *(u32 *)key;
>>         u32 ifindex = *(u32 *)value;
>
> can you please fix reverse Christmas tree order, while you are here?

Sure!
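
For the record, reordering just the locals in this hunk would look
something like this (line lengths judged by eye; the other declarations in
the function are untouched):

	struct bpf_dtab_netdev *dev, *old_dev;
	u32 ifindex = *(u32 *)value;
	struct xdp_bulk_queue *bq;
	u32 i = *(u32 *)key;
	int cpu;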

>> +       struct xdp_bulk_queue *bq;
>> +       int cpu;
>>
>>         if (unlikely(map_flags > BPF_EXIST))
>>                 return -EINVAL;
>> @@ -461,6 +447,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>>                         return -ENOMEM;
>>                 }
>>
>> +               for_each_possible_cpu(cpu) {
>> +                       bq = per_cpu_ptr(dev->bulkq, cpu);
>> +                       bq->obj = dev;
>> +               }
>> +
>>                 dev->dev = dev_get_by_index(net, ifindex);
>>                 if (!dev->dev) {
>>                         free_percpu(dev->bulkq);
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 55bfc941d17a..7a996887c500 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -3526,7 +3526,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>>                 err = dev_map_enqueue(dst, xdp, dev_rx);
>>                 if (unlikely(err))
>>                         return err;
>> -               __dev_map_insert_ctx(map, index);
>>                 break;
>>         }
>>         case BPF_MAP_TYPE_CPUMAP: {
>> @@ -3535,7 +3534,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>>                 err = cpu_map_enqueue(rcpu, xdp, dev_rx);
>>                 if (unlikely(err))
>>                         return err;
>> -               __cpu_map_insert_ctx(map, index);
>>                 break;
>>         }
>>         case BPF_MAP_TYPE_XSKMAP: {
>>


-Toke


Thread overview: 13+ messages
2019-06-11 15:44 [PATCH bpf-next v3 0/3] xdp: Allow lookup into devmaps before redirect Toke Høiland-Jørgensen
2019-06-11 15:44 ` [PATCH bpf-next v3 2/3] bpf_xdp_redirect_map: Perform map lookup in eBPF helper Toke Høiland-Jørgensen
2019-06-11 18:00   ` Jesper Dangaard Brouer
2019-06-11 18:17     ` Toke Høiland-Jørgensen
2019-06-12 20:01       ` Maciej Fijalkowski
2019-06-12 21:33         ` Toke Høiland-Jørgensen
2019-06-11 21:48   ` Jakub Kicinski
2019-06-12  9:49     ` Toke Høiland-Jørgensen
2019-06-12 19:45       ` Jakub Kicinski
2019-06-11 15:44 ` [PATCH bpf-next v3 1/3] devmap/cpumap: Use flush list instead of bitmap Toke Høiland-Jørgensen
2019-06-13  5:51   ` Andrii Nakryiko
2019-06-13 11:02     ` Toke Høiland-Jørgensen
2019-06-11 15:44 ` [PATCH bpf-next v3 3/3] devmap: Allow map lookups from eBPF Toke Høiland-Jørgensen
