bpf.vger.kernel.org archive mirror
* [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bpf_map_copy_value helper functions
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29 22:04   ` Song Liu
  2019-08-29  6:45 ` [PATCH bpf-next 02/13] bpf: refactor map_update_elem() Yonghong Song
                   ` (12 subsequent siblings)
  13 siblings, 1 reply; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

From: Brian Vazquez <brianvv@google.com>

Move reusable code from map_lookup_elem() into the helper functions
bpf_map_value_size() and bpf_map_copy_value() to avoid code duplication
in kernel/bpf/syscall.c.

Suggested-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Brian Vazquez <brianvv@google.com>
---
 kernel/bpf/syscall.c | 134 +++++++++++++++++++++++--------------------
 1 file changed, 73 insertions(+), 61 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca60eafa6922..211e0bc667bd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -126,6 +126,76 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
+static u32 bpf_map_value_size(struct bpf_map *map)
+{
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+		return round_up(map->value_size, 8) * num_possible_cpus();
+	else if (IS_FD_MAP(map))
+		return sizeof(u32);
+	else
+		return  map->value_size;
+}
+
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+			      __u64 flags)
+{
+	void *ptr;
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return  bpf_map_offload_lookup_elem(map, key, value);
+
+	preempt_disable();
+	this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		err = bpf_percpu_hash_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+		err = bpf_stackmap_copy(map, key, value);
+	} else if (IS_FD_ARRAY(map)) {
+		err = bpf_fd_array_map_lookup_elem(map, key, value);
+	} else if (IS_FD_HASH(map)) {
+		err = bpf_fd_htab_map_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_peek_elem(map, value);
+	} else {
+		rcu_read_lock();
+		if (map->ops->map_lookup_elem_sys_only)
+			ptr = map->ops->map_lookup_elem_sys_only(map, key);
+		else
+			ptr = map->ops->map_lookup_elem(map, key);
+		if (IS_ERR(ptr)) {
+			err = PTR_ERR(ptr);
+		} else if (!ptr) {
+			err = -ENOENT;
+		} else {
+			err = 0;
+			if (flags & BPF_F_LOCK)
+				/* lock 'ptr' and copy everything but lock */
+				copy_map_value_locked(map, value, ptr, true);
+			else
+				copy_map_value(map, value, ptr);
+			/* mask lock, since value wasn't zero inited */
+			check_and_init_map_lock(map, value);
+		}
+		rcu_read_unlock();
+	}
+	this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+
+	return err;
+}
+
 void *bpf_map_area_alloc(size_t size, int numa_node)
 {
 	/* We really just want to fail instead of triggering OOM killer
@@ -739,7 +809,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	void __user *uvalue = u64_to_user_ptr(attr->value);
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
-	void *key, *value, *ptr;
+	void *key, *value;
 	u32 value_size;
 	struct fd f;
 	int err;
@@ -771,72 +841,14 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
-		value_size = round_up(map->value_size, 8) * num_possible_cpus();
-	else if (IS_FD_MAP(map))
-		value_size = sizeof(u32);
-	else
-		value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_lookup_elem(map, key, value);
-		goto done;
-	}
-
-	preempt_disable();
-	this_cpu_inc(bpf_prog_active);
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
-		err = bpf_percpu_hash_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
-		err = bpf_percpu_array_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
-		err = bpf_percpu_cgroup_storage_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
-		err = bpf_stackmap_copy(map, key, value);
-	} else if (IS_FD_ARRAY(map)) {
-		err = bpf_fd_array_map_lookup_elem(map, key, value);
-	} else if (IS_FD_HASH(map)) {
-		err = bpf_fd_htab_map_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
-		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-		   map->map_type == BPF_MAP_TYPE_STACK) {
-		err = map->ops->map_peek_elem(map, value);
-	} else {
-		rcu_read_lock();
-		if (map->ops->map_lookup_elem_sys_only)
-			ptr = map->ops->map_lookup_elem_sys_only(map, key);
-		else
-			ptr = map->ops->map_lookup_elem(map, key);
-		if (IS_ERR(ptr)) {
-			err = PTR_ERR(ptr);
-		} else if (!ptr) {
-			err = -ENOENT;
-		} else {
-			err = 0;
-			if (attr->flags & BPF_F_LOCK)
-				/* lock 'ptr' and copy everything but lock */
-				copy_map_value_locked(map, value, ptr, true);
-			else
-				copy_map_value(map, value, ptr);
-			/* mask lock, since value wasn't zero inited */
-			check_and_init_map_lock(map, value);
-		}
-		rcu_read_unlock();
-	}
-	this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-
-done:
+	err = bpf_map_copy_value(map, key, value, attr->flags);
 	if (err)
 		goto free_value;
 
-- 
2.17.1



* [PATCH bpf-next 00/13] bpf: adding map batch processing support
@ 2019-08-29  6:45 Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bpf_map_copy_value helper functions Yonghong Song
                   ` (13 more replies)
  0 siblings, 14 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Brian Vazquez has proposed a BPF_MAP_DUMP command to look up more than one
map entry per syscall.
  https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t

During the discussion, we found that more use cases can be supported in a
similar map-operation batching framework, for example, batched map lookup and
delete, which can be really helpful for bcc.
  https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
  https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
    
Also, in bcc, we have API to delete all entries in a map.
  https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264

For map update, batched operations are also useful, as applications sometimes
need to populate initial maps with more than one entry. For example, the code
below is from samples/bpf/xdp_redirect_cpu_user.c:
  https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550

This patch set addresses all the above use cases. To keep the uapi stable, it
also covers other potential use cases. Four bpf syscall subcommands are introduced:
    BPF_MAP_LOOKUP_BATCH
    BPF_MAP_LOOKUP_AND_DELETE_BATCH
    BPF_MAP_UPDATE_BATCH
    BPF_MAP_DELETE_BATCH

In userspace, an application can iterate through the whole map one batch
at a time, e.g., with bpf_map_lookup_batch() as below:
    p_key = NULL;
    p_next_key = &key;
    while (true) {
       err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
                                  &batch_size, elem_flags, flags);
       if (err) ...
       if (!p_next_key) break; // no next start key, done
       p_key = p_next_key;
    }
Please look at the individual patches for details of the new syscall
subcommands and examples of user code.

The testing is also done in a qemu VM environment:
      measure_lookup: max_entries 1000000, batch 10, time 342ms
      measure_lookup: max_entries 1000000, batch 1000, time 295ms
      measure_lookup: max_entries 1000000, batch 1000000, time 270ms
      measure_lookup: max_entries 1000000, no batching, time 1346ms
      measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
      measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
      measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
      measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
      measure_delete: max_entries 1000000, batch, time 220ms
      measure_delete: max_entries 1000000, not batch, time 1289ms
For a 1M-entry hash table, a batch size of 10 can reduce cpu time
by 70%. Please see patch "tools/bpf: measure map batching perf"
for details of the test code.

Brian Vazquez (1):
  bpf: add bpf_map_value_size and bpf_map_copy_value helper functions

Yonghong Song (12):
  bpf: refactor map_update_elem()
  bpf: refactor map_delete_elem()
  bpf: refactor map_get_next_key()
  bpf: adding map batch processing support
  tools/bpf: sync uapi header bpf.h
  tools/bpf: implement libbpf API functions for map batch operations
  tools/bpf: add test for bpf_map_update_batch()
  tools/bpf: add test for bpf_map_lookup_batch()
  tools/bpf: add test for bpf_map_lookup_and_delete_batch()
  tools/bpf: add test for bpf_map_delete_batch()
  tools/bpf: add a multithreaded test for map batch operations
  tools/bpf: measure map batching perf

 include/uapi/linux/bpf.h                      |  27 +
 kernel/bpf/syscall.c                          | 752 ++++++++++++++----
 tools/include/uapi/linux/bpf.h                |  27 +
 tools/lib/bpf/bpf.c                           |  67 ++
 tools/lib/bpf/bpf.h                           |  17 +
 tools/lib/bpf/libbpf.map                      |   4 +
 .../selftests/bpf/map_tests/map_batch_mt.c    | 126 +++
 .../selftests/bpf/map_tests/map_batch_perf.c  | 242 ++++++
 .../bpf/map_tests/map_delete_batch.c          | 139 ++++
 .../map_tests/map_lookup_and_delete_batch.c   | 164 ++++
 .../bpf/map_tests/map_lookup_batch.c          | 166 ++++
 .../bpf/map_tests/map_update_batch.c          | 115 +++
 12 files changed, 1707 insertions(+), 139 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_mt.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_perf.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_delete_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_update_batch.c

-- 
2.17.1



* [PATCH bpf-next 02/13] bpf: refactor map_update_elem()
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bpf_map_copy_value helper functions Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29 23:37   ` Song Liu
  2019-08-29  6:45 ` [PATCH bpf-next 03/13] bpf: refactor map_delete_elem() Yonghong Song
                   ` (11 subsequent siblings)
  13 siblings, 1 reply; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Refactor the function map_update_elem() by creating a
helper function bpf_map_update_elem(), which will be
used later by the batched map update operation.

Also reuse the function bpf_map_value_size()
in map_update_elem().

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 113 ++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 211e0bc667bd..3caa0ab3d30d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -878,6 +878,61 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
 		synchronize_rcu();
 }
 
+static int bpf_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       struct fd *f, __u64 flags) {
+	int err;
+
+	/* Need to create a kthread, thus must support schedule */
+	if (bpf_map_is_dev_bound(map)) {
+		return bpf_map_offload_update_elem(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		return map->ops->map_update_elem(map, key, value, flags);
+	}
+
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+	 * inside bpf map update or delete otherwise deadlocks are possible
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		err = bpf_percpu_hash_update(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_update(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_update(map, key, value,
+						       flags);
+	} else if (IS_FD_ARRAY(map)) {
+		rcu_read_lock();
+		err = bpf_fd_array_map_update_elem(map, f->file, key, value,
+						   flags);
+		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+		rcu_read_lock();
+		err = bpf_fd_htab_map_update_elem(map, f->file, key, value,
+						  flags);
+		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		/* rcu_read_lock() is not needed */
+		err = bpf_fd_reuseport_array_update_elem(map, key, value,
+							 flags);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_push_elem(map, value, flags);
+	} else {
+		rcu_read_lock();
+		err = map->ops->map_update_elem(map, key, value, flags);
+		rcu_read_unlock();
+	}
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	maybe_wait_bpf_programs(map);
+
+	return err;
+}
+
 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
@@ -915,13 +970,7 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
-		value_size = round_up(map->value_size, 8) * num_possible_cpus();
-	else
-		value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -932,56 +981,8 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
-	/* Need to create a kthread, thus must support schedule */
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
-		goto out;
-	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
-		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
-		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		err = map->ops->map_update_elem(map, key, value, attr->flags);
-		goto out;
-	}
+	err = bpf_map_update_elem(map, key, value, &f, attr->flags);
 
-	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
-	 * inside bpf map update or delete otherwise deadlocks are possible
-	 */
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
-		err = bpf_percpu_hash_update(map, key, value, attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
-		err = bpf_percpu_array_update(map, key, value, attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
-		err = bpf_percpu_cgroup_storage_update(map, key, value,
-						       attr->flags);
-	} else if (IS_FD_ARRAY(map)) {
-		rcu_read_lock();
-		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
-						   attr->flags);
-		rcu_read_unlock();
-	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-		rcu_read_lock();
-		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
-						  attr->flags);
-		rcu_read_unlock();
-	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
-		/* rcu_read_lock() is not needed */
-		err = bpf_fd_reuseport_array_update_elem(map, key, value,
-							 attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-		   map->map_type == BPF_MAP_TYPE_STACK) {
-		err = map->ops->map_push_elem(map, value, attr->flags);
-	} else {
-		rcu_read_lock();
-		err = map->ops->map_update_elem(map, key, value, attr->flags);
-		rcu_read_unlock();
-	}
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-	maybe_wait_bpf_programs(map);
-out:
 free_value:
 	kfree(value);
 free_key:
-- 
2.17.1



* [PATCH bpf-next 03/13] bpf: refactor map_delete_elem()
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bpf_map_copy_value helper functions Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 02/13] bpf: refactor map_update_elem() Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29 23:39   ` Song Liu
  2019-08-29  6:45 ` [PATCH bpf-next 04/13] bpf: refactor map_get_next_key() Yonghong Song
                   ` (10 subsequent siblings)
  13 siblings, 1 reply; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Refactor function map_delete_elem() with a new helper
bpf_map_delete_elem(), which will be used later
for batched lookup_and_delete and delete operations.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3caa0ab3d30d..b8bba499df11 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -992,6 +992,25 @@ static int map_update_elem(union bpf_attr *attr)
 	return err;
 }
 
+static int bpf_map_delete_elem(struct bpf_map *map, void *key)
+{
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return bpf_map_offload_delete_elem(map, key);
+
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	rcu_read_lock();
+	err = map->ops->map_delete_elem(map, key);
+	rcu_read_unlock();
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	maybe_wait_bpf_programs(map);
+
+	return err;
+}
+
 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
 
 static int map_delete_elem(union bpf_attr *attr)
@@ -1021,20 +1040,8 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_delete_elem(map, key);
-		goto out;
-	}
+	err = bpf_map_delete_elem(map, key);
 
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
-	rcu_read_lock();
-	err = map->ops->map_delete_elem(map, key);
-	rcu_read_unlock();
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-	maybe_wait_bpf_programs(map);
-out:
 	kfree(key);
 err_put:
 	fdput(f);
-- 
2.17.1



* [PATCH bpf-next 04/13] bpf: refactor map_get_next_key()
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (2 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 03/13] bpf: refactor map_delete_elem() Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29 23:39   ` Song Liu
  2019-08-29  6:45 ` [PATCH bpf-next 05/13] bpf: adding map batch processing support Yonghong Song
                   ` (9 subsequent siblings)
  13 siblings, 1 reply; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Refactor function map_get_next_key() with a new helper
bpf_map_get_next_key(), which will be used later
for batched map lookup/lookup_and_delete/delete operations.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b8bba499df11..06308f0206e5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1048,6 +1048,20 @@ static int map_delete_elem(union bpf_attr *attr)
 	return err;
 }
 
+static int bpf_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return bpf_map_offload_get_next_key(map, key, next_key);
+
+	rcu_read_lock();
+	err = map->ops->map_get_next_key(map, key, next_key);
+	rcu_read_unlock();
+
+	return err;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
 
@@ -1088,15 +1102,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (!next_key)
 		goto free_key;
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_get_next_key(map, key, next_key);
-		goto out;
-	}
-
-	rcu_read_lock();
-	err = map->ops->map_get_next_key(map, key, next_key);
-	rcu_read_unlock();
-out:
+	err = bpf_map_get_next_key(map, key, next_key);
 	if (err)
 		goto free_next_key;
 
-- 
2.17.1



* [PATCH bpf-next 05/13] bpf: adding map batch processing support
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (3 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 04/13] bpf: refactor map_get_next_key() Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29 23:01   ` Brian Vazquez
  2019-08-29  6:45 ` [PATCH bpf-next 06/13] tools/bpf: sync uapi header bpf.h Yonghong Song
                   ` (8 subsequent siblings)
  13 siblings, 1 reply; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Brian Vazquez has proposed a BPF_MAP_DUMP command to look up more than one
map entry per syscall.
  https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t

During the discussion, we found that more use cases can be supported in a
similar map-operation batching framework, for example, batched map lookup and
delete, which can be really helpful for bcc.
  https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
  https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138

Also, in bcc, we have API to delete all entries in a map.
  https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264

For map update, batched operations are also useful, as applications sometimes
need to populate initial maps with more than one entry. For example, the code
below is from samples/bpf/xdp_redirect_cpu_user.c:
  https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550

This patch addresses all the above use cases. To keep the uapi stable, it also
covers other potential use cases. Four bpf syscall subcommands are introduced:
	BPF_MAP_LOOKUP_BATCH
	BPF_MAP_LOOKUP_AND_DELETE_BATCH
	BPF_MAP_UPDATE_BATCH
	BPF_MAP_DELETE_BATCH

The UAPI attribute structure looks like:
	struct { /* struct used by BPF_MAP_*_BATCH commands */
		__aligned_u64   start_key;      /* input: storing start key,
						 * if NULL, starting from the beginning.
						 */
		__aligned_u64   next_start_key; /* output: storing next batch start_key,
						 * if NULL, no next key.
						 */
		__aligned_u64	keys;		/* input/output: key buffer */
		__aligned_u64	values;		/* input/output: value buffer */
		__u32		count;		/* input: # of keys/values in
						 *   or fits in keys[]/values[].
						 * output: how many successful
						 *   lookup/lookup_and_delete
						 *   /delete/update operations.
						 */
		__u32		map_fd;
		__u64		elem_flags;	/* BPF_MAP_{UPDATE,LOOKUP}_ELEM flags */
		__u64		flags;		/* flags for batch operation */
	} batch;

The 'start_key' and 'next_start_key' are used by BPF_MAP_LOOKUP_BATCH,
BPF_MAP_LOOKUP_AND_DELETE_BATCH and BPF_MAP_DELETE_BATCH
to start the operation at 'start_key' and to return the
next batch's start key in 'next_start_key'.

If 'count' is greater than 0 and the return code is 0, one of the
following holds:
  . 'count' is updated to a smaller value if there are fewer
    elements than 'count' for the operation. In that case,
    'next_start_key' will be set to NULL.
  . 'count' remains the same. 'next_start_key' could be NULL
    or could point to the next start_key for batch processing.

If 'count' is greater than 0 and the return code is an error
other than -EFAULT, the kernel will try to overwrite 'count'
with the number of successful element-level (lookup,
lookup_and_delete, update and delete) operations. The following
can then be relied upon:
  . if the 'count' value is modified, the new value will be
    the number of successful element-level operations.
  . if the 'count' value is modified, keys[]/values[] will
    contain correct results for the new 'count' number of
    operations for lookup[_and_delete] and update.
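
For illustration, below is a minimal user-space sketch of these 'count'
semantics. It is a sketch only: it assumes the libbpf batch wrappers
added later in this series, an int-keyed/int-valued hash map, and
caller-provided keys[]/values[] buffers; the helper name is made up.

	#include <stdio.h>
	#include <bpf/bpf.h>	/* batch wrappers added later in this series */

	/* Try to add 'max_count' fresh entries in one syscall. On a partial
	 * failure other than -EFAULT, the kernel rewrites 'count' with the
	 * number of updates that actually succeeded.
	 */
	static int update_and_report(int map_fd, int *keys, int *values,
				     __u32 max_count)
	{
		__u32 count = max_count;
		int err;

		err = bpf_map_update_batch(map_fd, keys, values, &count,
					   BPF_NOEXIST, 0);
		if (err && count < max_count)
			fprintf(stderr, "batch update stopped after %u of %u elements\n",
				count, max_count);
		return err;
	}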

The implementation in this patch mimics what user space
would do, e.g., for lookup_and_delete,
    while(bpf_get_next_key(map, keyp, next_keyp) == 0) {
       bpf_map_delete_elem(map, keyp);
       bpf_map_lookup_elem(map, next_keyp, &value, flags);
       keyp, next_keyp = next_keyp, keyp;
    }
A similar loop is implemented in the kernel, and
each operation, bpf_get_next_key(), bpf_map_delete_elem()
and bpf_map_lookup_elem(), uses existing kernel functions,
each of which has its own rcu_read_lock region, bpf_prog_active
protection, etc.
Therefore, it is entirely possible that after bpf_get_next_key(),
the bpf_map_delete_elem() or bpf_map_lookup_elem() may fail
because the key may have been deleted concurrently by the kernel
or by other user space processes/threads.
By default, the current implementation permits the loop
to continue, just like typical user space handling. But
if the flag BPF_F_ENFORCE_ENOENT is specified, the kernel
will return an error if bpf_map_delete_elem() or
bpf_map_lookup_elem() fails.

The high-level algorithm for BPF_MAP_LOOKUP_BATCH and
BPF_MAP_LOOKUP_AND_DELETE_BATCH:
	if (start_key == NULL && next_start_key == NULL) {
		lookup up to 'count' keys in keys[] and fill
		corresponding values[], and delete those
		keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
	} else if (start_key == NULL && next_start_key != NULL) {
		lookup up to 'count' keys from the beginning
		of the map and fill keys[]/values[], delete
		those keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
		Set 'next_start_key' for the next batch operation.
	} else if (start_key != NULL && next_start_key != NULL) {
		lookup up to 'count' keys from 'start_key', inclusive,
		and fill keys[]/values[], delete those keys if
		BPF_MAP_LOOKUP_AND_DELETE_BATCH.
		Set 'next_start_key' for the next batch operation.
	}
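
For illustration, the loop below sketches the second and third cases
from user space. It is a sketch only, assuming the libbpf batch
wrappers added later in this series and an int-keyed/int-valued hash
map; error handling and buffer sizing are simplified.

	#include <bpf/bpf.h>	/* batch wrappers added later in this series */

	/* Drain an int->int hash map one batch at a time (sketch only). */
	static int drain_map(int map_fd, int *keys, int *values, __u32 batch_size)
	{
		int start;		/* kernel writes the next start key here */
		void *in_key = NULL;	/* NULL: begin at the start of the map */
		void *out_key = &start;
		__u32 count;
		int err;

		for (;;) {
			count = batch_size;
			err = bpf_map_lookup_and_delete_batch(map_fd, in_key, &out_key,
							      keys, values, &count, 0, 0);
			if (err)
				return err;
			/* consume 'count' entries of keys[]/values[] here */
			if (!out_key)		/* no next start key: map is drained */
				return 0;
			in_key = &start;	/* continue from the key handed back */
			out_key = &start;
		}
	}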

The high-level algorithm for BPF_MAP_UPDATE_BATCH:
	if (count != 0) {
		do 'count' number of updates on keys[]/values[].
	}

The high-level algorithm for BPF_MAP_DELETE_BATCH:
	if (count == 0) {
		if (start_key == NULL) {
			delete all elements from the map.
		} else {
			delete all elements from start_key to the end of the map.
		}
	} else {
		if (start_key == NULL && next_start_key == NULL) {
			delete 'count' number of keys in keys[].
		} else if (start_key == NULL && next_start_key != NULL) {
			delete 'count' number of keys from the
			beginning of the map and set 'next_start_key'
			properly.
		} else if (start_key != NULL && next_start_key != NULL) {
			delete 'count' number of keys from 'start_key',
			and set 'next_start_key' properly.
		}
	}
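
As an illustration of the 'delete all elements' case above, here is a
minimal sketch assuming the libbpf batch wrapper added later in this
series; the helper name is made up.

	#include <bpf/bpf.h>	/* batch wrappers added later in this series */

	/* Delete every element in the map (count == 0, no start_key, no
	 * next_start_key). On success the kernel reports the number of
	 * deleted elements back through 'count'. Sketch only.
	 */
	static int delete_all(int map_fd)
	{
		__u32 count = 0;

		return bpf_map_delete_batch(map_fd, NULL /* start_key */,
					    NULL /* next_start_key */,
					    NULL /* keys */, &count,
					    0 /* elem_flags */, 0 /* flags */);
	}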

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h |  27 +++
 kernel/bpf/syscall.c     | 448 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 475 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5d2fb183ee2d..576688f13e8c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
+	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+	BPF_MAP_UPDATE_BATCH,
+	BPF_MAP_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -347,6 +351,9 @@ enum bpf_attach_type {
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* flags for BPF_MAP_*_BATCH */
+#define BPF_F_ENFORCE_ENOENT	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -396,6 +403,26 @@ union bpf_attr {
 		__u64		flags;
 	};
 
+	struct { /* struct used by BPF_MAP_*_BATCH commands */
+		__aligned_u64	start_key;	/* input: storing start key,
+						 * if NULL, starting from the beginning.
+						 */
+		__aligned_u64	next_start_key;	/* output: storing next batch start_key,
+						 * if NULL, no next key.
+						 */
+		__aligned_u64	keys;		/* input/output: key buffer */
+		__aligned_u64	values;		/* input/output: value buffer */
+		__u32		count;		/* input: # of keys/values in
+						 *   or fits in keys[]/values[].
+						 * output: how many successful
+						 *   lookup/lookup_and_delete
+						 *   update/delete operations.
+						 */
+		__u32		map_fd;
+		__u64		elem_flags;	/* BPF_MAP_*_ELEM flags */
+		__u64		flags;		/* flags for batch operation */
+	} batch;
+
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
 		__u32		prog_type;	/* one of enum bpf_prog_type */
 		__u32		insn_cnt;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 06308f0206e5..8746b55405f9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -33,6 +33,17 @@
 
 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define BPF_MAP_BATCH_SWAP_KEYS(key1, key2, buf1, buf2)	\
+	do {						\
+		if (key1 == (buf1)) {			\
+			key1 = buf2;			\
+			key2 = buf1;			\
+		} else {				\
+			key1 = buf1;			\
+			key2 = buf2;			\
+		}					\
+	} while (0)					\
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -1183,6 +1194,431 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	return err;
 }
 
+static void __map_batch_get_attrs(const union bpf_attr *attr,
+				  void __user **skey, void __user **nskey,
+				  void __user **keys, void __user **values,
+				  u32 *max_count, u64 *elem_flags, u64 *flags)
+{
+	*skey = u64_to_user_ptr(attr->batch.start_key);
+	*nskey = u64_to_user_ptr(attr->batch.next_start_key);
+	*keys = u64_to_user_ptr(attr->batch.keys);
+	*values = u64_to_user_ptr(attr->batch.values);
+	*max_count = attr->batch.count;
+	*elem_flags = attr->batch.elem_flags;
+	*flags = attr->batch.flags;
+}
+
+static int
+__map_lookup_delete_batch_key_in_keys(struct bpf_map *map, void *key, void *value,
+				      u32 max_count, u32 key_size, u32 value_size,
+				      u64 elem_flags, void __user *keys,
+				      void __user *values,
+				      union bpf_attr __user *uattr,
+				      bool ignore_enoent)
+{
+	u32 count, missed = 0;
+	int ret = 0, err;
+
+	for (count = 0; count < max_count; count++) {
+		if (copy_from_user(key, keys + count * key_size, key_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_copy_value(map, key, value, elem_flags);
+		if (ret) {
+			if (ret != -ENOENT || !ignore_enoent)
+				break;
+
+			missed++;
+			continue;
+		}
+
+
+		if (copy_to_user(values + count * value_size, value, value_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_delete_elem(map, key);
+		if (ret) {
+			if (ret != -ENOENT || !ignore_enoent)
+				break;
+
+			missed++;
+		}
+	}
+
+	count -= missed;
+	if ((!ret && missed) || (ret && ret != -EFAULT)) {
+		err = put_user(count, &uattr->batch.count);
+		ret = err ? : ret;
+	}
+
+	return ret;
+}
+
+static int map_lookup_and_delete_batch(struct bpf_map *map,
+				       const union bpf_attr *attr,
+				       union bpf_attr __user *uattr,
+				       bool do_delete)
+{
+	u32 max_count, count = 0, key_size, roundup_key_size, value_size;
+	bool ignore_enoent, nextkey_is_null, copied;
+	void *buf = NULL, *key, *value, *next_key;
+	void __user *skey, *nskey, *keys, *values;
+	u64 elem_flags, flags, zero = 0;
+	int err, ret = 0;
+
+	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	    map->map_type == BPF_MAP_TYPE_STACK)
+		return -ENOTSUPP;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (elem_flags & ~BPF_F_LOCK || flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	if (!max_count)
+		return 0;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count != 0 && !skey && !nskey: loop/delete max_count elements in keys[]/values[].
+	 * max_count != 0 && !skey && nskey: loop/delete max_count elements starting from map start.
+	 * max_count != 0 && skey && nskey: loop/delete max_count elements starting from skey.
+	 */
+	if (skey && !nskey)
+		return -EINVAL;
+
+	/* allocate space for two keys and one value. */
+	key_size = map->key_size;
+	roundup_key_size = round_up(map->key_size, 8);
+	value_size = bpf_map_value_size(map);
+	buf = kmalloc(roundup_key_size * 2 + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	next_key = buf + roundup_key_size;
+	value = buf + roundup_key_size * 2;
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+
+	if (!skey && !nskey) {
+		/* handle cases where keys in keys[] */
+		ret = __map_lookup_delete_batch_key_in_keys(map, key, value, max_count,
+							    key_size, value_size,
+							    elem_flags, keys, values,
+							    uattr, ignore_enoent);
+		goto out;
+	}
+
+	/* Get the first key. */
+	if (!skey) {
+		ret = bpf_map_get_next_key(map, NULL, key);
+		if (ret) {
+			nextkey_is_null = true;
+			goto after_loop;
+		}
+	} else if (copy_from_user(key, skey, key_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Copy the first key/value pair */
+	ret = bpf_map_copy_value(map, key, value, elem_flags);
+	if (ret) {
+		if (skey)
+			goto out;
+
+		nextkey_is_null = true;
+		goto after_loop;
+	}
+
+	if (copy_to_user(keys, key, key_size) ||
+	    copy_to_user(values, value, value_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* We will always try to get next_key first
+	 * and then delete the current key.
+	 */
+	ret = bpf_map_get_next_key(map, key, next_key);
+	count = 0;
+	nextkey_is_null = false;
+	copied = true;
+
+again:
+	/* delete key */
+	if (do_delete) {
+		err = bpf_map_delete_elem(map, key);
+		if (err && (err != -ENOENT || !ignore_enoent)) {
+			/* failed to delete "key".
+			 * lookup_and_delete will be considered failed.
+			 */
+			ret = err;
+			goto after_loop;
+		}
+	}
+
+	/* deletion in lookup_and_delete is successful. */
+	/* check bpf_map_get_next_key() result. */
+	count += !!copied;
+	nextkey_is_null = ret && ret == -ENOENT;
+	if (ret || count >= max_count)
+		goto after_loop;
+
+	/* copy value corresponding to next_key */
+	ret = bpf_map_copy_value(map, next_key, value, elem_flags);
+	copied = true;
+	if (ret) {
+		copied = false;
+		if (ret != -ENOENT || !ignore_enoent)
+			goto after_loop;
+	} else {
+		if (copy_to_user(keys + count * key_size, next_key, key_size) ||
+		    copy_to_user(values + count * value_size, value, value_size)) {
+			ret = -EFAULT;
+			goto after_loop;
+		}
+	}
+
+	BPF_MAP_BATCH_SWAP_KEYS(key, next_key, buf, buf + roundup_key_size);
+	ret = bpf_map_get_next_key(map, key, next_key);
+	goto again;
+
+after_loop:
+	if (!ret) {
+		if (put_user(count, &uattr->batch.count) ||
+		    copy_to_user(nskey, next_key, key_size))
+			ret = -EFAULT;
+	} else if (nextkey_is_null) {
+		ret = 0;
+		if (put_user(count, &uattr->batch.count) ||
+                    copy_to_user(&uattr->batch.next_start_key, &zero, sizeof(u64)))
+			ret = -EFAULT;
+	} else if (ret != -EFAULT) {
+		if (put_user(count, &uattr->batch.count))
+			ret = -EFAULT;
+	}
+
+out:
+	kfree(buf);
+	return ret;
+}
+
+static int map_update_batch(struct bpf_map *map,
+			    const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    struct fd *f)
+{
+	u32 max_count, count, missed, key_size, roundup_key_size, value_size;
+	void __user *skey, *nskey, *keys, *values;
+	void *buf = NULL, *key, *value;
+	u64 elem_flags, flags;
+	bool ignore_enoent;
+	int ret, err;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	if (!max_count)
+		return 0;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count != 0 && !skey && !nskey: update max_count elements in keys[]/values[].
+	 */
+	if (nskey || skey)
+		return -EINVAL;
+
+	if ((elem_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))
+		return -EINVAL;
+
+	key_size = map->key_size;
+	roundup_key_size = round_up(key_size, 8);
+	value_size = bpf_map_value_size(map);
+	buf = kmalloc(roundup_key_size + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	value = buf + roundup_key_size;
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+
+	missed = 0;
+	for (count = 0; count < max_count; count++) {
+		if (copy_from_user(key, keys + count * key_size, key_size) ||
+		    copy_from_user(value, values + count * value_size, value_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_update_elem(map, key, value, f, elem_flags);
+		if (ret && (ret != -ENOENT || !ignore_enoent))
+			break;
+
+		missed += !!ret;
+		ret = 0;
+	}
+	count -= missed;
+	if ((!ret && missed) || (ret && ret != -EFAULT)) {
+		err = put_user(count, &uattr->batch.count);
+		/* Reset the error code if the above put_user failed. */
+		ret = err ? : ret;
+	}
+
+	kfree(buf);
+	return ret;
+}
+
+static int map_delete_batch(struct bpf_map *map,
+			    const union bpf_attr *attr,
+			    union bpf_attr __user *uattr)
+{
+	u32 max_count, count, key_size, roundup_key_size;
+	void __user *skey, *nskey, *keys, *values;
+	bool ignore_enoent, nextkey_is_null;
+	void *buf = NULL, *key, *next_key;
+	u64 elem_flags, flags, zero = 0;
+	int ret, err;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (elem_flags || flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count == 0 && !skey && !nskey: delete all elements in the map
+	 * max_count == 0 && skey && !nskey: delete all elements starting from skey.
+	 * max_count != 0 && !skey && nskey: delete max_count elements starting from map start.
+	 * max_count != 0 && skey && nskey: delete max_count elements starting from skey.
+	 * max_count != 0 && !skey && !nskey: delete max_count elements in keys[].
+	 */
+	if ((max_count == 0 && nskey) || (max_count != 0 && skey && !nskey))
+		return -EINVAL;
+
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+	key_size = map->key_size;
+	roundup_key_size = round_up(map->key_size, 8);
+	buf = kmalloc(roundup_key_size * 2, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	next_key = buf + roundup_key_size;
+
+	nextkey_is_null = false;
+
+	if (max_count == 0 || nskey) {
+		count = 0;
+		if (skey) {
+			if (copy_from_user(key, skey, key_size)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		} else {
+			ret = bpf_map_get_next_key(map, NULL, key);
+			if (ret) {
+				nextkey_is_null = true;
+				goto after_loop;
+			}
+		}
+
+		while (max_count == 0 || count < max_count) {
+			ret = bpf_map_get_next_key(map, key, next_key);
+
+			err = bpf_map_delete_elem(map, key);
+			if (err && (err != -ENOENT || !ignore_enoent)) {
+				ret = err;
+				break;
+			}
+
+			count += !err;
+
+			/* check bpf_map_get_next_key() result */
+			if (ret) {
+				nextkey_is_null = ret == -ENOENT;
+				break;
+			}
+
+			BPF_MAP_BATCH_SWAP_KEYS(key, next_key, buf, buf + roundup_key_size);
+		}
+
+after_loop:
+		if (!ret) {
+			if (copy_to_user(nskey, key, key_size))
+				ret = -EFAULT;
+		} else if (nextkey_is_null) {
+			ret = 0;
+			if (copy_to_user(&uattr->batch.next_start_key, &zero, sizeof(u64)))
+				ret = -EFAULT;
+		}
+	} else {
+		for (count = 0; count < max_count;) {
+			if (copy_from_user(key, keys + count * key_size, key_size)) {
+				ret = -EFAULT;
+				break;
+			}
+
+			ret = bpf_map_delete_elem(map, key);
+			if (ret && (ret != -ENOENT || !ignore_enoent))
+				break;
+
+			count += !ret;
+		}
+	}
+
+	if (ret != -EFAULT && put_user(count, &uattr->batch.count))
+		ret = -EFAULT;
+
+out:
+	kfree(buf);
+	return ret;
+}
+
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    int cmd)
+{
+	struct bpf_map *map;
+	int err, ufd;
+	struct fd f;
+
+	if (CHECK_ATTR(BPF_MAP_BATCH))
+		return -EINVAL;
+
+	ufd = attr->batch.map_fd;
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	if (cmd == BPF_MAP_LOOKUP_BATCH)
+		err = map_lookup_and_delete_batch(map, attr, uattr, false);
+	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+		err = map_lookup_and_delete_batch(map, attr, uattr, true);
+	else if (cmd == BPF_MAP_UPDATE_BATCH)
+		err = map_update_batch(map, attr, uattr, &f);
+	else /* BPF_MAP_DELETE_BATCH */
+		err = map_delete_batch(map, attr, uattr);
+
+err_put:
+	fdput(f);
+	return err;
+
+}
+
 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
 
 static int map_freeze(const union bpf_attr *attr)
@@ -2939,6 +3375,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
 		err = map_lookup_and_delete_elem(&attr);
 		break;
+	case BPF_MAP_LOOKUP_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+		break;
+	case BPF_MAP_UPDATE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+		break;
+	case BPF_MAP_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
2.17.1



* [PATCH bpf-next 06/13] tools/bpf: sync uapi header bpf.h
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (4 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 05/13] bpf: adding map batch processing support Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 07/13] tools/bpf: implement libbpf API functions for map batch operations Yonghong Song
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Sync the uapi header include/uapi/linux/bpf.h to
tools/include/uapi/linux/bpf.h.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5d2fb183ee2d..576688f13e8c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
+	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+	BPF_MAP_UPDATE_BATCH,
+	BPF_MAP_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -347,6 +351,9 @@ enum bpf_attach_type {
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* flags for BPF_MAP_*_BATCH */
+#define BPF_F_ENFORCE_ENOENT	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -396,6 +403,26 @@ union bpf_attr {
 		__u64		flags;
 	};
 
+	struct { /* struct used by BPF_MAP_*_BATCH commands */
+		__aligned_u64	start_key;	/* input: storing start key,
+						 * if NULL, starting from the beginning.
+						 */
+		__aligned_u64	next_start_key;	/* output: storing next batch start_key,
+						 * if NULL, no next key.
+						 */
+		__aligned_u64	keys;		/* input/output: key buffer */
+		__aligned_u64	values;		/* input/output: value buffer */
+		__u32		count;		/* input: # of keys/values in
+						 *   or fits in keys[]/values[].
+						 * output: how many successful
+						 *   lookup/lookup_and_delete
+						 *   update/delete operations.
+						 */
+		__u32		map_fd;
+		__u64		elem_flags;	/* BPF_MAP_*_ELEM flags */
+		__u64		flags;		/* flags for batch operation */
+	} batch;
+
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
 		__u32		prog_type;	/* one of enum bpf_prog_type */
 		__u32		insn_cnt;
-- 
2.17.1



* [PATCH bpf-next 07/13] tools/bpf: implement libbpf API functions for map batch operations
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (5 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 06/13] tools/bpf: sync uapi header bpf.h Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 08/13] tools/bpf: add test for bpf_map_update_batch() Yonghong Song
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Added four libbpf API functions to support map batch operations:
  . int bpf_map_delete_batch( ... )
  . int bpf_map_lookup_batch( ... )
  . int bpf_map_lookup_and_delete_batch( ... )
  . int bpf_map_update_batch( ... )
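
Below is a minimal usage sketch of one of them, bpf_map_lookup_batch().
It is a sketch only: an int-keyed/int-valued map and caller-provided
keys[]/values[] buffers are assumed, and the helper name is made up.

	#include <bpf/bpf.h>

	/* Read up to 'max_entries' key/value pairs in a single syscall. */
	static int dump_once(int map_fd, int *keys, int *values, __u32 max_entries)
	{
		int next_start;			/* next batch's start key, if any */
		void *p_next = &next_start;
		__u32 count = max_entries;
		int err;

		err = bpf_map_lookup_batch(map_fd, NULL /* start from the beginning */,
					   &p_next, keys, values, &count, 0, 0);
		if (err)
			return err;
		/* 'count' pairs are valid; p_next == NULL means no further entries */
		return 0;
	}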

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/bpf.c      | 67 ++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/bpf.h      | 17 ++++++++++
 tools/lib/bpf/libbpf.map |  4 +++
 3 files changed, 88 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index cbb933532981..bdbd660aa2b8 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -438,6 +438,73 @@ int bpf_map_freeze(int fd)
 	return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
 }
 
+static int bpf_map_batch_common(int cmd, int fd, const void *start_key,
+				void **next_start_key,
+				void *keys, void *values,
+				__u32 *count, __u64 elem_flags,
+				__u64 flags)
+{
+	union bpf_attr attr = {};
+	int ret;
+
+	attr.batch.map_fd = fd;
+	attr.batch.start_key = ptr_to_u64(start_key);
+	if (next_start_key)
+		attr.batch.next_start_key = ptr_to_u64(*next_start_key);
+	attr.batch.keys = ptr_to_u64(keys);
+	attr.batch.values = ptr_to_u64(values);
+	if (count)
+		attr.batch.count = *count;
+	attr.batch.elem_flags = elem_flags;
+	attr.batch.flags = flags;
+
+	ret = sys_bpf(cmd, &attr, sizeof(attr));
+	if (next_start_key)
+		*next_start_key = (void *)attr.batch.next_start_key;
+	if (count)
+		*count = attr.batch.count;
+
+	return ret;
+}
+
+int bpf_map_delete_batch(int fd, const void *start_key, void **next_start_key,
+			 void *keys, __u32 *count, __u64 elem_flags,
+			 __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, start_key,
+				    next_start_key, keys, (void *)NULL,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_lookup_batch(int fd, const void *start_key, void **next_start_key,
+			 void *keys, void *values, __u32 *count,
+			 __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, start_key,
+				    next_start_key, keys, values,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_lookup_and_delete_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    void *values, __u32 *count,
+				    __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+				    fd, start_key,
+				    next_start_key, keys, values,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count,
+			 __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH,
+				    fd, (void *)NULL, (void *)NULL,
+				    keys, values,
+				    count, elem_flags, flags);
+}
+
 int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 0db01334740f..a36bfcbfe04f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -120,6 +120,23 @@ LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
 LIBBPF_API int bpf_map_delete_elem(int fd, const void *key);
 LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key);
 LIBBPF_API int bpf_map_freeze(int fd);
+LIBBPF_API int bpf_map_delete_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    __u32 *count, __u64 elem_flags,
+				    __u64 flags);
+LIBBPF_API int bpf_map_lookup_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    void *values, __u32 *count,
+				    __u64 elem_flags, __u64 flags);
+LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, const void *start_key,
+					       void **next_start_key,
+					       void *keys, void *values,
+					       __u32 *count, __u64 elem_flags,
+					       __u64 flags);
+LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values,
+				    __u32 *count, __u64 elem_flags,
+				    __u64 flags);
+
 LIBBPF_API int bpf_obj_pin(int fd, const char *pathname);
 LIBBPF_API int bpf_obj_get(const char *pathname);
 LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 664ce8e7a60e..7a00630f0ff1 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -188,4 +188,8 @@ LIBBPF_0.0.4 {
 LIBBPF_0.0.5 {
 	global:
 		bpf_btf_get_next_id;
+		bpf_map_delete_batch;
+		bpf_map_lookup_and_delete_batch;
+		bpf_map_lookup_batch;
+		bpf_map_update_batch;
 } LIBBPF_0.0.4;
-- 
2.17.1



* [PATCH bpf-next 08/13] tools/bpf: add test for bpf_map_update_batch()
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (6 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 07/13] tools/bpf: implement libbpf API functions for map batch operations Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 09/13] tools/bpf: add test for bpf_map_lookup_batch() Yonghong Song
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Add tests for bpf_map_update_batch().
  $ ./test_maps
  ...
  test_map_update_batch:PASS
  ...
---
 .../bpf/map_tests/map_update_batch.c          | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_update_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_update_batch.c b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
new file mode 100644
index 000000000000..67c1e11fc911
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+void test_map_update_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, key, value;
+	const int max_entries = 10;
+	__u32 count, max_count;
+	int err, i;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values, "malloc()", "error:%s\n", strerror(errno));
+
+	/* do not fill in the whole hash table, so we could test
+	 * update with new elements.
+	 */
+	max_count = max_entries - 2;
+
+	for (i = 0; i < max_count; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	/* test 1: count == 0, expect success. */
+	count = 0;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+	CHECK(err, "count = 0", "error:%s\n", strerror(errno));
+
+	/* test 2: update initial map with BPF_NOEXIST, expect success. */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(err, "elem_flags = BPF_NOEXIST",
+	      "error:%s\n", strerror(errno));
+
+	/* use bpf_map_get_next_key to ensure all keys/values are indeed
+	 * covered.
+	 */
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	CHECK(err, "bpf_map_lookup_elem()", "error: %s\n", strerror(errno));
+	CHECK(key + 1 != value, "key/value checking",
+	      "error: key %d value %d\n", key, value);
+	i = 1;
+	while (!bpf_map_get_next_key(map_fd, &key, &key)) {
+		err = bpf_map_lookup_elem(map_fd, &key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+		CHECK(key + 1 != value,
+		      "key/value checking", "error: key %d value %d\n",
+		      key, value);
+		i++;
+	}
+	CHECK(i != max_count, "checking number of entries",
+	      "err: i %u max_count %u\n", i, max_count);
+
+	/* test 3: elem_flags = BPF_NOEXIST, already exists, expect failure */
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	/* failure due to flag BPF_NOEXIST, count is set to 0 */
+	CHECK(!err || count, "elem_flags = BPF_NOEXIST again",
+	      "unexpected success\n");
+
+	/* test 4: elem_flags = 0, expect success */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "elem_flags = 0", "error %s\n", strerror(errno));
+
+	/* test 5: keys = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, NULL, values,
+				   &count, 0, 0);
+	CHECK(!err, "keys = NULL", "unexpected success\n");
+
+	/* test 6: values = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, NULL, &count, 0, 0);
+	CHECK(!err, "values = NULL", "unexpected success\n");
+
+	/* test 7: modify the first key to be max_count + 10,
+	 * elem_flags = BPF_NOEXIST,
+	 * expect failure, the return count = 1.
+	 */
+	count = max_count;
+	keys[0] = max_count + 10;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(!err, "keys[0] = max_count + 10", "unexpected success\n");
+	CHECK(count != 1, "keys[0] = max_count + 10",
+	      "error: %s, incorrect count %u\n", strerror(errno), count);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1



* [PATCH bpf-next 09/13] tools/bpf: add test for bpf_map_lookup_batch()
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (7 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 08/13] tools/bpf: add test for bpf_map_update_batch() Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 10/13] tools/bpf: add test for bpf_map_lookup_and_delete_batch() Yonghong Song
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Test bpf_map_lookup_batch() functionality.
  $ ./test_maps
  ...
  test_map_lookup_batch:PASS
  ...

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../bpf/map_tests/map_lookup_batch.c          | 166 ++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c b/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
new file mode 100644
index 000000000000..9c88e8adc556
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, int *values)
+{
+	int i;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+	for (i = 0; i < max_entries; i++) {
+		CHECK(keys[i] + 1 != values[i], "key/value checking",
+		      "error: i %d key %d value %d\n", i, keys[i], values[i]);
+		visited[i] = 1;
+	}
+	for (i = 0; i < max_entries; i++) {
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+	}
+}
+
+void test_map_lookup_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: lookup an empty hash table, success */
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "empty map", "error: %s\n", strerror(errno));
+	CHECK(p_next_skey || count, "empty map",
+	      "p_next_skey = %p, count = %u\n", p_next_skey, count);
+
+	/* populate elements to the map */
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	count = max_entries;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+
+	/* test 2: lookup with count = 0, success */
+	count = 0;
+	err = bpf_map_lookup_batch(map_fd, NULL, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+	/* test 3: lookup with count = max_entries, skey && !nskey, failure */
+	count = max_entries;
+	key = 1;
+	err = bpf_map_lookup_batch(map_fd, &key, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(!err, "skey && !nskey", "unexpected success\n");
+
+	/* test 4: lookup with count = max_entries, success */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+
+
+	/* test 5: repeat the lookup with count = max_entries; it should
+	 * again return success with count = max_entries.
+	 */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+
+	/* test 6: lookup with an invalid start key, failure */
+	count = max_entries;
+	key = max_entries + 10;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, &key, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(!err, "invalid start_key", "unexpected success\n");
+
+	/* test 7: lookup in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively lookup elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_lookup_batch(map_fd, p_skey, &p_next_skey,
+						   keys + i * step,
+						   values + i * step,
+						   &count, 0, 0);
+			CHECK(err, "lookup with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "lookup with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "lookup with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+	}
+
+	/* test 8: lookup with keys in keys[]. */
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_lookup_batch(map_fd, NULL, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "lookup key in keys[]", "error: %s\n", strerror(errno));
+	map_batch_verify(visited, max_entries, keys, values);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH bpf-next 10/13] tools/bpf: add test for bpf_map_lookup_and_delete_batch()
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (8 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 09/13] tools/bpf: add test for bpf_map_lookup_batch() Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 11/13] tools/bpf: add test for bpf_map_delete_batch() Yonghong Song
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Test bpf_map_lookup_and_delete_batch() functionality.
  $ ./test_maps
  ...
  test_map_lookup_and_delete_batch:PASS
  ...

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../map_tests/map_lookup_and_delete_batch.c   | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
new file mode 100644
index 000000000000..b151513a0ab2
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, int *values)
+{
+	int i;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+	for (i = 0; i < max_entries; i++) {
+		CHECK(keys[i] + 1 != values[i], "key/value checking",
+		      "error: i %d key %d value %d\n", i, keys[i], values[i]);
+		visited[i] = 1;
+	}
+	for (i = 0; i < max_entries; i++) {
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+	}
+}
+
+void test_map_lookup_and_delete_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: lookup/delete an empty hash table, success */
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_next_skey, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "empty map", "error: %s\n", strerror(errno));
+	CHECK(p_next_skey || count, "empty map",
+	      "p_next_skey = %p, count = %u\n", p_next_skey, count);
+
+	/* populate elements to the map */
+	map_batch_update(map_fd, max_entries, keys, values);
+
+	/* test 2: lookup/delete with count = 0, success */
+	count = 0;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+	/* test 3: lookup/delete with count = max_entries, skey && !nskey,
+	 * failure.
+	 */
+	count = max_entries;
+	key = 1;
+	err = bpf_map_lookup_and_delete_batch(map_fd, &key, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(!err, "skey && !nskey", "unexpected success\n");
+
+	/* test 4: lookup/delete with count = max_entries, success */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_next_skey, keys,
+					      values, &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+	map_batch_verify(visited, max_entries, keys, values);
+
+	/* bpf_map_get_next_key() should return -ENOENT for an empty map. */
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "unexpected success\n");
+
+	/* test 5: lookup/delete in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively lookup/delete elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_lookup_and_delete_batch(map_fd, p_skey,
+							      &p_next_skey,
+							      keys + i * step,
+							      values + i * step,
+							      &count, 0, 0);
+			CHECK(err, "lookup/delete with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "lookup/delete with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "lookup/delete with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "unexpected success\n");
+	}
+
+	/* test 6: lookup/delete with keys in keys[]. */
+	map_batch_update(map_fd, max_entries, keys, values);
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "lookup/delete key in keys[]", "error: %s\n", strerror(errno));
+	map_batch_verify(visited, max_entries, keys, values);
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "unexpected success\n");
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH bpf-next 11/13] tools/bpf: add test for bpf_map_delete_batch()
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (9 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 10/13] tools/bpf: add test for bpf_map_lookup_and_delete_batch() Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 12/13] tools/bpf: add a multithreaded test for map batch operations Yonghong Song
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Test bpf_map_delete_batch() functionality.
  $ ./test_maps
  ...
  test_map_delete_batch:PASS
  ...
---
 .../bpf/map_tests/map_delete_batch.c          | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_delete_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_delete_batch.c b/tools/testing/selftests/bpf/map_tests/map_delete_batch.c
new file mode 100644
index 000000000000..459495a6d9fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_delete_batch.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+void test_map_delete_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: delete an empty hash table, success */
+	for (i = 0; i < 2; i++) {
+		if (i == 0) {
+			count = 0;
+			p_next_skey = NULL;
+		} else {
+			count = max_entries;
+			p_next_skey = &key;
+		}
+		err = bpf_map_delete_batch(map_fd, NULL, &p_next_skey, keys,
+					   &count, 0, 0);
+		CHECK(err, "empty map", "error: %s\n", strerror(errno));
+		CHECK(p_next_skey || count, "empty map",
+		      "i = %d, p_next_skey = %p, count = %u\n", i, p_next_skey,
+		      count);
+	}
+
+	/* test 2: delete with count = 0, success */
+	for (i = 0; i < 2; i++) {
+		/* populate elements to the map */
+		map_batch_update(map_fd, max_entries, keys, values);
+
+		count = 0;
+		if (i == 0) {
+			/* all elements in the map */
+			p_skey = NULL;
+		} else {
+			/* all elements starting from p_skey */
+			p_skey = &key;
+			err = bpf_map_get_next_key(map_fd, NULL, p_skey);
+			CHECK(err, "bpf_map_get_next_key()", "error: %s\n",
+			      strerror(errno));
+		}
+		err = bpf_map_delete_batch(map_fd, p_skey, NULL, NULL,
+				    &count, 0, 0);
+		CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+		/* all elements should be gone. */
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "unexpected success\n");
+	}
+
+	/* test 3: delete in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively delete elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_delete_batch(map_fd, p_skey, &p_next_skey,
+					      keys + i * step, &count, 0, 0);
+			CHECK(err, "delete with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "delete with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "delete with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()",
+		      "unexpected success\n");
+	}
+
+	/* test 4: delete with keys in keys[]. */
+	map_batch_update(map_fd, max_entries, keys, values);
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_delete_batch(map_fd, NULL, NULL, keys, &count, 0, 0);
+	CHECK(err, "delete key in keys[]", "error: %s\n", strerror(errno));
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "unexpected success\n");
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH bpf-next 12/13] tools/bpf: add a multithreaded test for map batch operations
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (10 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 11/13] tools/bpf: add test for bpf_map_delete_batch() Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29  6:45 ` [PATCH bpf-next 13/13] tools/bpf: measure map batching perf Yonghong Song
  2019-08-29 18:39 ` [PATCH bpf-next 00/13] bpf: adding map batch processing support Jakub Kicinski
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

A multithreaded test is added. Three threads repeatedly perform:
  - batch update
  - batch lookup_and_delete
  - batch delete

It is entirely possible that an individual element operation in
the kernel finds that a key, retrieved from bpf_map_get_next_key(),
fails the subsequent lookup and/or delete because other threads
operate on the same map in parallel.

The default mode for the new batch APIs is to ignore -ENOENT errors
for lookup and delete and move on to the next element.
The test would otherwise fail if the kernel treated -ENOENT
as a real error and propagated it back to user space.
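
A rough user-space sketch (not part of this patch) of how the strict
behavior would be requested; the bpf_map_lookup_and_delete_batch()
signature below is the one exercised by this selftest, map_fd is
assumed to be the fd of an int-keyed/int-valued hash map, and
BPF_F_ENFORCE_ENOENT goes into the batch 'flags' argument:

  int key, keys[10], values[10];
  void *p_next_key = &key;
  __u32 count = 10;
  int err;

  /* With the flag set, an element removed by another thread between
   * the internal get_next_key and the lookup/delete is reported as an
   * error instead of being silently skipped.
   */
  err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_next_key,
                                        keys, values, &count, 0,
                                        BPF_F_ENFORCE_ENOENT);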

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/map_tests/map_batch_mt.c    | 126 ++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_mt.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_batch_mt.c b/tools/testing/selftests/bpf/map_tests/map_batch_mt.c
new file mode 100644
index 000000000000..a0e2591d0079
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_batch_mt.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+/* Create three threads. Each thread will iteratively do:
+ *   . update constantly
+ *   . lookup and delete constantly
+ *   . delete constantly
+ * As a result, lookup and delete operations may fail
+ * because elements may have been deleted concurrently by
+ * another thread.
+ *
+ * By default, we should not see a problem, since -ENOENT
+ * from bpf_map_delete_elem() and bpf_map_lookup_elem() is
+ * ignored. But with the BPF_F_ENFORCE_ENOENT flag, we may
+ * see errors.
+ */
+
+static int map_fd;
+static const __u32 max_entries = 10;
+static volatile bool stop = false;
+
+static void do_batch_update(void)
+{
+	int i, err, keys[max_entries], values[max_entries];
+	__u32 count;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	while (!stop) {
+		count = max_entries;
+		err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+		CHECK(err, "bpf_map_update_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void do_batch_delete(void)
+{
+	__u32 count;
+	int err;
+
+	while (!stop) {
+		count = 0;
+		err = bpf_map_delete_batch(map_fd, NULL, NULL, NULL, &count,
+					   0, 0);
+		CHECK(err, "bpf_map_delete_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void do_batch_lookup_and_delete(void)
+{
+	int err, key, keys[max_entries], values[max_entries];
+	__u32 count;
+	void *p_key;
+
+	while (!stop) {
+		p_key = &key;
+		count = max_entries;
+		err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_key,
+						      keys, values, &count,
+						      0, 0);
+		CHECK(err, "bpf_map_lookup_and_delete_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void *do_work(void *arg)
+{
+	int work_index = (int)(long)arg;
+
+	if (work_index == 0)
+		do_batch_update();
+	else if (work_index == 1)
+		do_batch_delete();
+	else
+		do_batch_lookup_and_delete();
+
+	return NULL;
+}
+
+void test_map_batch_mt(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	const int nr_threads = 3;
+	pthread_t threads[nr_threads];
+	int i, err;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	for (i = 0; i < nr_threads; i++) {
+		err = pthread_create(&threads[i], NULL, do_work,
+				     (void *)(long)i);
+		CHECK(err, "pthread_create", "error: %s\n", strerror(errno));
+	}
+
+	sleep(1);
+	stop = true;
+
+	for (i = 0; i < nr_threads; i++)
+		pthread_join(threads[i], NULL);
+
+	close(map_fd);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH bpf-next 13/13] tools/bpf: measure map batching perf
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (11 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 12/13] tools/bpf: add a multithreaded test for map batch operations Yonghong Song
@ 2019-08-29  6:45 ` Yonghong Song
  2019-08-29 18:39 ` [PATCH bpf-next 00/13] bpf: adding map batch processing support Jakub Kicinski
  13 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

The test program run result:
  $ ./test_maps
  ...
  measure_lookup: max_entries 1000000, batch 10, time 342
  measure_lookup: max_entries 1000000, batch 1000, time 295
  measure_lookup: max_entries 1000000, batch 1000000, time 270
  measure_lookup: max_entries 1000000, no batching, time 1346
  measure_lookup_delete: max_entries 1000000, batch 10, time 433
  measure_lookup_delete: max_entries 1000000, batch 1000, time 363
  measure_lookup_delete: max_entries 1000000, batch 1000000, time 357
  measure_lookup_delete: max_entries 1000000, not batch, time 1894
  measure_delete: max_entries 1000000, batch, time 220
  measure_delete: max_entries 1000000, not batch, time 1289
  test_map_batch_perf:PASS
  ...

  The test runs in a qemu VM environment. The time
  unit is milliseconds. A simple hash table with 1M elements
  is created.

  For lookup and lookup_and_delete, since buffer allocation
  is needed to hold the lookup results, three different
  batch sizes (10, 1000, and 1M) are tried. The performance
  without batching is also measured. A batch size of 10
  already improves performance dramatically, by more than 70%,
  and increasing the batch size continues to improve performance,
  but with diminishing returns.
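
  The batched lookup measurement boils down to the following
  iteration pattern (a sketch; 'batch' is the chosen chunk size,
  map_fd an int-keyed/int-valued hash map, and keys[]/values[]
  buffers with room for at least 'batch' elements):

    int next_key, err;
    void *p_key = NULL, *p_next_key = &next_key;
    __u32 count;

    while (true) {
            count = batch;
            err = bpf_map_lookup_batch(map_fd, p_key, &p_next_key,
                                       keys, values, &count, 0, 0);
            if (err)
                    break;
            /* consume 'count' keys/values here */
            if (!p_next_key)        /* no more elements */
                    break;
            if (!p_key)
                    p_key = p_next_key;
    }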

  For delete, the batch API provides a mechanism to delete all elements
  in the map, which is used here. Deleting the whole map in one batch
  improves performance by 80% compared to the non-batch mechanism.
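
  For reference, the batched delete-all measured here is a single call
  (a sketch mirroring measure_delete() below; 'count' is an in/out
  argument that reports how many elements were removed):

    __u32 count = 0;  /* count == 0 and no start key: flush whole map */
    int err;

    err = bpf_map_delete_batch(map_fd, NULL, NULL, NULL, &count, 0, 0);
    if (!err)
            printf("deleted %u elements\n", count);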

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/map_tests/map_batch_perf.c  | 242 ++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_perf.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_batch_perf.c b/tools/testing/selftests/bpf/map_tests/map_batch_perf.c
new file mode 100644
index 000000000000..42d95651e1ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_batch_perf.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+/* Test map batch performance.
+ * Test three common scenarios:
+ *    - batched lookup
+ *    - batched lookup and delete
+ *    - batched deletion
+ */
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static unsigned long util_gettime(void)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static void measure_lookup(int map_fd, __u32 max_entries, int *keys,
+			   int *values)
+{
+	__u32 batches[3] = {10, 1000};
+	int err, key, value, option;
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	__u32 count, total;
+
+	map_batch_update(map_fd, max_entries, keys, values);
+
+	/* batched */
+	batches[2] = max_entries;
+	for (option = 0; option < 3; option++) {
+		p_key = NULL;
+		p_next_key = &key;
+		count = batches[option];
+		start = util_gettime();
+		total = 0;
+
+		while (true) {
+			err = bpf_map_lookup_batch(map_fd, p_key, &p_next_key,
+						   keys, values, &count, 0, 0);
+			CHECK(err, "bpf_map_lookup_batch()", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_key)
+				break;
+
+			if (!p_key)
+				p_key = p_next_key;
+		}
+
+		end = util_gettime();
+		CHECK(total != max_entries,
+		      "checking total", "total %u, max_entries %u\n",
+		      total, max_entries);
+		printf("%s: max_entries %u, batch %u, time %ld\n", __func__,
+		       max_entries, batches[option], end - start);
+	}
+
+	/* not batched */
+	start = util_gettime();
+	p_key = NULL;
+	p_next_key = &key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+		p_key = p_next_key;
+	}
+	end = util_gettime();
+	printf("%s: max_entries %u, no batching, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+static void measure_lookup_delete(int map_fd, __u32 max_entries, int *keys,
+				  int *values)
+{
+	int err, key, next_key, value, option;
+	__u32 batches[3] = {10, 1000};
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	__u32 count, total;
+
+	/* batched */
+	batches[2] = max_entries;
+	for (option = 0; option < 3; option++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		p_key = NULL;
+		p_next_key = &key;
+		count = batches[option];
+		start = util_gettime();
+		total = 0;
+
+		while (true) {
+			err = bpf_map_lookup_and_delete_batch(map_fd, p_key,
+				&p_next_key, keys, values, &count, 0, 0);
+			CHECK(err, "bpf_map_lookup_and_delete_batch()",
+			      "error: %s\n", strerror(errno));
+
+			total += count;
+			if (!p_next_key)
+				break;
+
+			if (!p_key)
+				p_key = p_next_key;
+		}
+
+		end = util_gettime();
+		CHECK(total != max_entries,
+		      "checking total", "total %u, max_entries %u\n",
+		      total, max_entries);
+		printf("%s: max_entries %u, batch %u, time %ld\n", __func__,
+		       max_entries, batches[option], end - start);
+	}
+
+	/* not batched */
+	map_batch_update(map_fd, max_entries, keys, values);
+	start = util_gettime();
+	p_key = NULL;
+	p_next_key = &key;
+	err = bpf_map_get_next_key(map_fd, p_key, p_next_key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+	CHECK(err, "bpf_map_lookup_elem()", "error: %s\n", strerror(errno));
+
+	p_key = p_next_key;
+	p_next_key = &next_key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_delete_elem(map_fd, p_key);
+		CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+		      strerror(errno));
+
+		err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+
+		p_key = p_next_key;
+		p_next_key = (p_next_key == &key) ? &next_key : &key;
+	}
+	err = bpf_map_delete_elem(map_fd, p_key);
+	CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+	      strerror(errno));
+	end = util_gettime();
+	printf("%s: max_entries %u, not batch, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+static void measure_delete(int map_fd, __u32 max_entries, int *keys,
+			   int *values)
+{
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	int err, key, next_key;
+	__u32 count;
+
+	/* batched */
+	map_batch_update(map_fd, max_entries, keys, values);
+	count = 0;
+	p_next_key = &key;
+	start = util_gettime();
+	err = bpf_map_delete_batch(map_fd, NULL, NULL, NULL, &count, 0, 0);
+	end = util_gettime();
+	CHECK(err, "bpf_map_delete_batch()", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries, "bpf_map_delete_batch()",
+	      "count = %u, max_entries = %u\n", count, max_entries);
+
+	printf("%s: max_entries %u, batch, time %ld\n", __func__,
+	       max_entries, end - start);
+
+	map_batch_update(map_fd, max_entries, keys, values);
+	p_key = NULL;
+	p_next_key = &key;
+	start = util_gettime();
+	err = bpf_map_get_next_key(map_fd, p_key, p_next_key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	p_key = p_next_key;
+	p_next_key = &next_key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_delete_elem(map_fd, p_key);
+		CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+		      strerror(errno));
+		p_key = p_next_key;
+		p_next_key = (p_next_key == &key) ? &next_key : &key;
+	}
+	err = bpf_map_delete_elem(map_fd, p_key);
+	CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+	      strerror(errno));
+	end = util_gettime();
+	printf("%s: max_entries %u, not batch, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+void test_map_batch_perf(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	const __u32 max_entries = 1000000;  /* 1M entries for the hash table */
+	int map_fd, *keys, *values;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values, "malloc()", "error:%s\n", strerror(errno));
+
+	measure_lookup(map_fd, max_entries, keys, values);
+	measure_lookup_delete(map_fd, max_entries, keys, values);
+	measure_delete(map_fd, max_entries, keys, values);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
                   ` (12 preceding siblings ...)
  2019-08-29  6:45 ` [PATCH bpf-next 13/13] tools/bpf: measure map batching perf Yonghong Song
@ 2019-08-29 18:39 ` Jakub Kicinski
  2019-08-29 23:13   ` Brian Vazquez
  2019-08-30  7:25   ` Yonghong Song
  13 siblings, 2 replies; 38+ messages in thread
From: Jakub Kicinski @ 2019-08-29 18:39 UTC (permalink / raw)
  To: Yonghong Song, Alexei Starovoitov
  Cc: bpf, netdev, Brian Vazquez, Daniel Borkmann, kernel-team

On Wed, 28 Aug 2019 23:45:02 -0700, Yonghong Song wrote:
> Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
> map entries per syscall.
>   https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
> 
> During discussion, we found more use cases can be supported in a similar
> map operation batching framework. For example, batched map lookup and delete,
> which can be really helpful for bcc.
>   https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
>   https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
>     
> Also, in bcc, we have API to delete all entries in a map.
>   https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
> 
> For map update, batched operations also useful as sometimes applications need
> to populate initial maps with more than one entry. For example, the below
> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
>   https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
> 
> This patch addresses all the above use cases. To make uapi stable, it also
> covers other potential use cases. Four bpf syscall subcommands are introduced:
>     BPF_MAP_LOOKUP_BATCH
>     BPF_MAP_LOOKUP_AND_DELETE_BATCH
>     BPF_MAP_UPDATE_BATCH
>     BPF_MAP_DELETE_BATCH
> 
> In userspace, application can iterate through the whole map one batch
> as a time, e.g., bpf_map_lookup_batch() in the below:
>     p_key = NULL;
>     p_next_key = &key;
>     while (true) {
>        err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
>                                   &batch_size, elem_flags, flags);
>        if (err) ...
>        if (p_next_key) break; // done
>        if (!p_key) p_key = p_next_key;
>     }
> Please look at individual patches for details of new syscall subcommands
> and examples of user codes.
> 
> The testing is also done in a qemu VM environment:
>       measure_lookup: max_entries 1000000, batch 10, time 342ms
>       measure_lookup: max_entries 1000000, batch 1000, time 295ms
>       measure_lookup: max_entries 1000000, batch 1000000, time 270ms
>       measure_lookup: max_entries 1000000, no batching, time 1346ms
>       measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
>       measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
>       measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
>       measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
>       measure_delete: max_entries 1000000, batch, time 220ms
>       measure_delete: max_entries 1000000, not batch, time 1289ms
> For a 1M entry hash table, batch size of 10 can reduce cpu time
> by 70%. Please see patch "tools/bpf: measure map batching perf"
> for details of test codes.

Hi Yonghong!

great to see this, we have been looking at implementing some way to
speed up map walks as well.

The direction we were looking in, after previous discussions [1],
however, was to provide a BPF program which can run the logic entirely
within the kernel.

We have a rough PoC on the FW side (we can offload the program which
walks the map, which is pretty neat), but the kernel verifier side
hasn't really progressed. It will soon.

The rough idea is that the user space provides two programs, "filter"
and "dumper":

	bpftool map exec id XYZ filter pinned /some/prog \
				dumper pinned /some/other_prog

Both programs get this context:

struct map_op_ctx {
	u64 key;
	u64 value;
}

We need a per-map implementation of the exec side, but roughly maps
would do:

	LIST_HEAD(deleted);

	for entry in map {
		struct map_op_ctx {
			.key	= entry->key,
			.value	= entry->value,
		};

		act = BPF_PROG_RUN(filter, &map_op_ctx);
		if (act & ~ACT_BITS)
			return -EINVAL;

		if (act & DELETE) {
			map_unlink(entry);
			list_add(entry, &deleted);
		}
		if (act & STOP)
			break;
	}

	synchronize_rcu();

	for entry in deleted {
		struct map_op_ctx {
			.key	= entry->key,
			.value	= entry->value,
		};
		
		BPF_PROG_RUN(dumper, &map_op_ctx);
		map_free(entry);
	}

The filter program can't perform any map operations other than lookup,
otherwise we won't be able to guarantee that we'll walk the entire map
(if the filter program deletes some entries in an unfortunate order).

If user space just wants a pure dump it can simply load a program which
dumps the entries into a perf ring.
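
As an illustration, a minimal dumper along these lines might look as
follows (sketch only: the map_op_ctx layout and the "map_dumper"
section are the hypothetical pieces described above, while
bpf_perf_event_output() and BPF_MAP_TYPE_PERF_EVENT_ARRAY already
exist):

	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	struct map_op_ctx {
		__u64 key;
		__u64 value;
	};

	struct bpf_map_def SEC("maps") dump_events = {
		.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
		.key_size	= sizeof(int),
		.value_size	= sizeof(__u32),
		.max_entries	= 64,	/* >= number of possible CPUs */
	};

	SEC("map_dumper")	/* hypothetical section name */
	int dump_entry(struct map_op_ctx *ctx)
	{
		struct map_op_ctx copy = {
			.key	= ctx->key,
			.value	= ctx->value,
		};

		/* forward the entry to user space via the perf ring */
		bpf_perf_event_output(ctx, &dump_events, BPF_F_CURRENT_CPU,
				      &copy, sizeof(copy));
		return 0;
	}

	char _license[] SEC("license") = "GPL";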

I'm bringing this up because that mechanism should cover what is
achieved with this patch set and much more. 

In particular for networking workloads where old flows have to be
pruned from the map periodically it's far more efficient to communicate
to user space only the flows which timed out (the delete batching from
this set won't help at all).
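
To make the pruning case concrete, a filter along these lines could
mark stale entries for deletion (again just a sketch reusing the same
headers and license section as the dumper above: the DELETE/STOP
action bits and map_op_ctx are the hypothetical pieces of the
proposal, and treating ctx->value as the flow's last-seen timestamp is
only an assumption for illustration):

	#define ACT_DELETE	(1 << 0)	/* placeholder action bits */
	#define ACT_STOP	(1 << 1)
	/* arbitrary 10s timeout for the example */
	#define FLOW_TIMEOUT_NS	(10ULL * 1000 * 1000 * 1000)

	SEC("map_filter")	/* hypothetical section name */
	int prune_old_flows(struct map_op_ctx *ctx)
	{
		__u64 now = bpf_ktime_get_ns();

		/* assume the stored value is the flow's last-seen ktime */
		if (now - ctx->value > FLOW_TIMEOUT_NS)
			return ACT_DELETE;

		return 0;
	}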

With a 2M entry map and this patch set we still won't be able to prune
once a second on one core.

[1]
https://lore.kernel.org/netdev/20190813130921.10704-4-quentin.monnet@netronome.com/

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bp_map_copy_value helper functions
  2019-08-29  6:45 ` [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bp_map_copy_value helper functions Yonghong Song
@ 2019-08-29 22:04   ` Song Liu
  2019-08-30  6:40     ` Yonghong Song
  0 siblings, 1 reply; 38+ messages in thread
From: Song Liu @ 2019-08-29 22:04 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Networking, Alexei Starovoitov, Brian Vazquez,
	Daniel Borkmann, Kernel Team



> On Aug 28, 2019, at 11:45 PM, Yonghong Song <yhs@fb.com> wrote:
> 
> From: Brian Vazquez <brianvv@google.com>
> 
> Move reusable code from map_lookup_elem to helper functions to avoid code
> duplication in kernel/bpf/syscall.c
> 
> Suggested-by: Stanislav Fomichev <sdf@google.com>
> Signed-off-by: Brian Vazquez <brianvv@google.com>


Acked-by: Song Liu <songliubraving@fb.com>

Yonghong, we also need your SoB. 



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 05/13] bpf: adding map batch processing support
  2019-08-29  6:45 ` [PATCH bpf-next 05/13] bpf: adding map batch processing support Yonghong Song
@ 2019-08-29 23:01   ` Brian Vazquez
  2019-08-30  6:39     ` Yonghong Song
  0 siblings, 1 reply; 38+ messages in thread
From: Brian Vazquez @ 2019-08-29 23:01 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team

Hi Yonghong!

Thanks for sending this series of patches and starting the discussion.

On Wed, Aug 28, 2019 at 11:45 PM Yonghong Song <yhs@fb.com> wrote:
>
> Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
> map entries per syscall.
>   https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
>
> During discussion, we found more use cases can be supported in a similar
> map operation batching framework. For example, batched map lookup and delete,
> which can be really helpful for bcc.
>   https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
>   https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
>
> Also, in bcc, we have API to delete all entries in a map.
>   https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
>
> For map update, batched operations also useful as sometimes applications need
> to populate initial maps with more than one entry. For example, the below
> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
>   https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
>
> This patch addresses all the above use cases. To make uapi stable, it also
> covers other potential use cases. Four bpf syscall subcommands are introduced:
>         BPF_MAP_LOOKUP_BATCH
>         BPF_MAP_LOOKUP_AND_DELETE_BATCH
>         BPF_MAP_UPDATE_BATCH
>         BPF_MAP_DELETE_BATCH
>
> The UAPI attribute structure looks like:
>         struct { /* struct used by BPF_MAP_*_BATCH commands */
>                 __aligned_u64   start_key;      /* input: storing start key,
>                                                  * if NULL, starting from the beginning.
>                                                  */
>                 __aligned_u64   next_start_key; /* output: storing next batch start_key,
>                                                  * if NULL, no next key.
>                                                  */
>                 __aligned_u64   keys;           /* input/output: key buffer */
>                 __aligned_u64   values;         /* input/output: value buffer */
>                 __u32           count;          /* input: # of keys/values in
>                                                  *   or fits in keys[]/values[].
>                                                  * output: how many successful
>                                                  *   lookup/lookup_and_delete
>                                                  *   /delete/update operations.
>                                                  */
>                 __u32           map_fd;
>                 __u64           elem_flags;     /* BPF_MAP_{UPDATE,LOOKUP}_ELEM flags */
>                 __u64           flags;          /* flags for batch operation */
>         } batch;
>
> The 'start_key' and 'next_start_key' are used to BPF_MAP_LOOKUP_BATCH,
> BPF_MAP_LOOKUP_AND_DELETE_BATCH and BPF_MAP_DELETE_BATCH
> to start the operation on 'start_key' and also set the
> next batch start key in 'next_start_key'.
>
> If 'count' is greater than 0 and the return code is 0,
>   . the 'count' may be updated to be smaller if there is less
>     elements than 'count' for the operation. In such cases,
>     'next_start_key' will be set to NULL.
>   . the 'count' remains the same. 'next_start_key' could be NULL
>     or could point to next start_key for batch processing.
>
> If 'count' is greater than 0 and the return code is an error
> other than -EFAULT, the kernel will try to overwrite 'count'
> to contain the number of successful element-level (lookup,
> lookup_and_delete, update and delete) operations. The following
> attributes can be checked:
>   . if 'count' value is modified, the new value will be
>     the number of successful element-level operations.
>   . if 'count' value is modified, the keys[]/values[] will
>     contain correct results for new 'count' number of
>     operations for lookup[_and_delete] and update.
>
> The implementation in this patch mimics what user space
> did, e.g., for lookup_and_delete,
>     while(bpf_get_next_key(map, keyp, next_keyp) == 0) {
>        bpf_map_delete_elem(map, keyp);
>        bpf_map_lookup_elem(map, next_keyp, &value, flags);
>        keyp, next_keyp = next_keyp, keyp;
>     }
> The similar loop is implemented in the kernel, and
> each operation, bpf_get_next_key(), bpf_map_delete_elem()
> and bpf_map_lookup_elem(), uses existing kernel functions
> each of which has its own rcu_read_lock region, bpf_prog_active
> protection, etc.
> Therefore, it is totally possible that after bpf_get_next_key(),
> the bpf_map_delete_elem() or bpf_map_lookup_elem() may fail
> as the key may be deleted concurrently by kernel or
> other user space processes/threads.
> By default, the current implementation permits the loop
> to continue, just like typical user space handling. But
> a flag, BPF_F_ENFORCE_ENOENT, can be specified, so kernel
> will return an error if bpf_map_delete_elem() or
> bpf_map_lookup_elem() failed.
>
> The high-level algorithm for BPF_MAP_LOOKUP_BATCH and
> BPF_MAP_LOOKUP_AND_DELETE_BATCH:
>         if (start_key == NULL and next_start_key == NULL) {
>                 lookup up to 'count' keys in keys[] and fill
>                 corresponding values[], and delete those
>                 keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
>         } else if (start_key == NULL && next_start_key != NULL) {
>                 lookup up to 'count' keys from the beginning
>                 of the map and fill keys[]/values[], delete
>                 those keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
>                 Set 'next_start_key' for next batch operation.
>         } else if (start_key != NULL && next_start_key != NULL) {
>                 lookup to 'count' keys from 'start_key', inclusive,
>                 and fill keys[]/values[], delete those keys if
>                 BPF_MAP_LOOKUP_AND_DELETE_BATCH.
>                 Set 'next_start_key' for next batch operation.
>         }
>
> The high-level algorithm for BPF_MAP_UPDATE_BATCH:
>         if (count != 0) {
>                 do 'count' number of updates on keys[]/values[].
>         }
>
> The high-level algorithm for BPF_MAP_DELETE_BATCH:
>         if (count == 0) {
>                 if (start_key == NULL) {
>                         delete all elements from map.
>                 } else {
>                         delete all elements from start_key to the end of map.
>                 }
>         } else {
>                 if (start_key == NULL and next_start_key == NULL) {
>                         delete 'count' number of keys in keys[].
>                 } else if (start_key == NULL and next_start_key != NULL) {
>                         delete 'count' number of keys from the
>                         beginning of the map and set 'next_start_key'
>                         properly.
>                 } else if (start_key != NULL and next_start_keeey != NULL) {
>                         delete 'count' number of keys from 'start_key',
>                         and set 'next_start_key' properly.
>                 }
>         }
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/uapi/linux/bpf.h |  27 +++
>  kernel/bpf/syscall.c     | 448 +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 475 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 5d2fb183ee2d..576688f13e8c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -107,6 +107,10 @@ enum bpf_cmd {
>         BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>         BPF_MAP_FREEZE,
>         BPF_BTF_GET_NEXT_ID,
> +       BPF_MAP_LOOKUP_BATCH,
> +       BPF_MAP_LOOKUP_AND_DELETE_BATCH,
> +       BPF_MAP_UPDATE_BATCH,
> +       BPF_MAP_DELETE_BATCH,
>  };
>
>  enum bpf_map_type {
> @@ -347,6 +351,9 @@ enum bpf_attach_type {
>  /* flags for BPF_PROG_QUERY */
>  #define BPF_F_QUERY_EFFECTIVE  (1U << 0)
>
> +/* flags for BPF_MAP_*_BATCH */
> +#define BPF_F_ENFORCE_ENOENT   (1U << 0)
> +
>  enum bpf_stack_build_id_status {
>         /* user space need an empty entry to identify end of a trace */
>         BPF_STACK_BUILD_ID_EMPTY = 0,
> @@ -396,6 +403,26 @@ union bpf_attr {
>                 __u64           flags;
>         };
>
> +       struct { /* struct used by BPF_MAP_*_BATCH commands */
> +               __aligned_u64   start_key;      /* input: storing start key,
> +                                                * if NULL, starting from the beginning.
> +                                                */
> +               __aligned_u64   next_start_key; /* output: storing next batch start_key,
> +                                                * if NULL, no next key.
> +                                                */
> +               __aligned_u64   keys;           /* input/output: key buffer */
> +               __aligned_u64   values;         /* input/output: value buffer */
> +               __u32           count;          /* input: # of keys/values in
> +                                                *   or fits in keys[]/values[].
> +                                                * output: how many successful
> +                                                *   lookup/lookup_and_delete
> +                                                *   update/delete operations.
> +                                                */
> +               __u32           map_fd;
> +               __u64           elem_flags;     /* BPF_MAP_*_ELEM flags */
> +               __u64           flags;          /* flags for batch operation */
> +       } batch;
> +
>         struct { /* anonymous struct used by BPF_PROG_LOAD command */
>                 __u32           prog_type;      /* one of enum bpf_prog_type */
>                 __u32           insn_cnt;
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 06308f0206e5..8746b55405f9 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -33,6 +33,17 @@
>
>  #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
>
> +#define BPF_MAP_BATCH_SWAP_KEYS(key1, key2, buf1, buf2)        \
> +       do {                                            \
> +               if (key1 == (buf1)) {                   \
> +                       key1 = buf2;                    \
> +                       key2 = buf1;                    \
> +               } else {                                \
> +                       key1 = buf1;                    \
> +                       key2 = buf2;                    \
> +               }                                       \
> +       } while (0)                                     \
> +
>  DEFINE_PER_CPU(int, bpf_prog_active);
>  static DEFINE_IDR(prog_idr);
>  static DEFINE_SPINLOCK(prog_idr_lock);
> @@ -1183,6 +1194,431 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
>         return err;
>  }
>
> +static void __map_batch_get_attrs(const union bpf_attr *attr,
> +                                 void __user **skey, void __user **nskey,
> +                                 void __user **keys, void __user **values,
> +                                 u32 *max_count, u64 *elem_flags, u64 *flags)
> +{
> +       *skey = u64_to_user_ptr(attr->batch.start_key);
> +       *nskey = u64_to_user_ptr(attr->batch.next_start_key);
> +       *keys = u64_to_user_ptr(attr->batch.keys);
> +       *values = u64_to_user_ptr(attr->batch.values);
> +       *max_count = attr->batch.count;
> +       *elem_flags = attr->batch.elem_flags;
> +       *flags = attr->batch.flags;
> +}
> +
> +static int
> +__map_lookup_delete_batch_key_in_keys(struct bpf_map *map, void *key, void *value,
> +                                     u32 max_count, u32 key_size, u32 value_size,
> +                                     u64 elem_flags, void __user *keys,
> +                                     void __user *values,
> +                                     union bpf_attr __user *uattr,
> +                                     bool ignore_enoent)
> +{
> +       u32 count, missed = 0;
> +       int ret = 0, err;
> +
> +       for (count = 0; count < max_count; count++) {
> +               if (copy_from_user(key, keys + count * key_size, key_size)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               ret = bpf_map_copy_value(map, key, value, elem_flags);
> +               if (ret) {
> +                       if (ret != -ENOENT || !ignore_enoent)
> +                               break;
> +
> +                       missed++;
> +                       continue;
> +               }
> +
> +
> +               if (copy_to_user(values + count * value_size, value, value_size)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               ret = bpf_map_delete_elem(map, key);
> +               if (ret) {
> +                       if (ret != -ENOENT || !ignore_enoent)
> +                               break;
> +
> +                       missed++;
> +               }
> +       }
> +
> +       count -= missed;
> +       if ((!ret && missed) || (ret && ret != -EFAULT)) {
> +               err = put_user(count, &uattr->batch.count);
> +               ret = err ? : ret;
> +       }
> +
> +       return ret;
> +}
> +
> +static int map_lookup_and_delete_batch(struct bpf_map *map,
> +                                      const union bpf_attr *attr,
> +                                      union bpf_attr __user *uattr,
> +                                      bool do_delete)
> +{
> +       u32 max_count, count = 0, key_size, roundup_key_size, value_size;
> +       bool ignore_enoent, nextkey_is_null, copied;
> +       void *buf = NULL, *key, *value, *next_key;
> +       void __user *skey, *nskey, *keys, *values;
> +       u64 elem_flags, flags, zero = 0;
> +       int err, ret = 0;
> +
> +       if (map->map_type == BPF_MAP_TYPE_QUEUE ||
> +           map->map_type == BPF_MAP_TYPE_STACK)
> +               return -ENOTSUPP;
> +
> +       __map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
> +                             &elem_flags, &flags);
> +
> +       if (elem_flags & ~BPF_F_LOCK || flags & ~BPF_F_ENFORCE_ENOENT)
> +               return -EINVAL;
> +
> +       if (!max_count)
> +               return 0;
> +
> +       /* The following max_count/skey/nskey combinations are supported:
> +        * max_count != 0 && !skey && !nskey: loop/delete max_count elements in keys[]/values[].
> +        * max_count != 0 && !skey && nskey: loop/delete max_count elements starting from map start.
> +        * max_count != 0 && skey && nskey: loop/delete max_count elements starting from skey.
> +        */
> +       if (skey && !nskey)
> +               return -EINVAL;
> +
> +       /* allocate space for two keys and one value. */
> +       key_size = map->key_size;
> +       roundup_key_size = round_up(map->key_size, 8);
> +       value_size = bpf_map_value_size(map);
> +       buf = kmalloc(roundup_key_size * 2 + value_size, GFP_USER | __GFP_NOWARN);
> +       if (!buf)
> +               return -ENOMEM;
> +
> +       key = buf;
> +       next_key = buf + roundup_key_size;
> +       value = buf + roundup_key_size * 2;
> +       ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
> +
> +       if (!skey && !nskey) {
> +               /* handle cases where keys in keys[] */
> +               ret = __map_lookup_delete_batch_key_in_keys(map, key, value, max_count,
> +                                                           key_size, value_size,
> +                                                           elem_flags, keys, values,
> +                                                           uattr, ignore_enoent);
> +               goto out;
> +       }
> +
> +       /* Get the first key. */
> +       if (!skey) {
> +               ret = bpf_map_get_next_key(map, NULL, key);
> +               if (ret) {
> +                       nextkey_is_null = true;
> +                       goto after_loop;
> +               }
> +       } else if (copy_from_user(key, skey, key_size)) {
> +               ret = -EFAULT;
> +               goto out;
> +       }
> +
> +       /* Copy the first key/value pair */
> +       ret = bpf_map_copy_value(map, key, value, elem_flags);
> +       if (ret) {
> +               if (skey)
> +                       goto out;
> +
> +               nextkey_is_null = true;
> +               goto after_loop;
> +       }
> +
> +       if (copy_to_user(keys, key, key_size) ||
> +           copy_to_user(values, value, value_size)) {
> +               ret = -EFAULT;
> +               goto out;
> +       }
> +
> +       /* We will always try to get next_key first
> +        * and then delete the current key.
> +        */
> +       ret = bpf_map_get_next_key(map, key, next_key);

One of the issues I see in this implementation is that it is still
relying on the existing functions, so it has the same consistency
problems that my attempt had.

The problem happens when you are doing a batched lookup on a hashmap
and the key is removed while bpf_map_get_next_key(map, key, next_key)
is executing: that call will then return the first key, so you start
iterating the map from the beginning again and retrieve duplicate
information.

Note that you can also end up restarting from the same bucket when a
key is updated while you are dumping it, because the updated element
can be reinserted at the head of the bucket, so you could revisit
elements that you had already visited.

From the previous discussion, my understanding was that we wanted to
pursue 'atomic' compounded operations first and only after that try to
batch them. That said, I don't think there's an easy way of batching
and being consistent at the same time.
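
To make the duplicate problem concrete, here is a minimal user-space
sketch (not taken from the patch set; it assumes a plain hash map with
4-byte keys, 8-byte values and the libbpf syscall wrappers) of a walk
that goes wrong when another thread deletes the key currently used as
the cursor:

    #include <bpf/bpf.h>
    #include <stdio.h>

    /* Walk a hash map with the existing get_next_key/lookup primitives. */
    static void dump_map(int map_fd)
    {
            __u32 key, next_key;
            __u32 *cur = NULL;      /* NULL means "start from the first key" */
            __u64 value;

            while (bpf_map_get_next_key(map_fd, cur, &next_key) == 0) {
                    /* If *cur was deleted between iterations, next_key is
                     * the first key of the map again rather than the
                     * successor of *cur, so entries that were already
                     * printed show up a second time.
                     */
                    if (bpf_map_lookup_elem(map_fd, &next_key, &value) == 0)
                            printf("key %u value %llu\n", next_key,
                                   (unsigned long long)value);

                    key = next_key;
                    cur = &key;
            }
    }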

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-29 18:39 ` [PATCH bpf-next 00/13] bpf: adding map batch processing support Jakub Kicinski
@ 2019-08-29 23:13   ` Brian Vazquez
  2019-08-30  0:15     ` Jakub Kicinski
  2019-08-30  7:25   ` Yonghong Song
  1 sibling, 1 reply; 38+ messages in thread
From: Brian Vazquez @ 2019-08-29 23:13 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Yonghong Song, Alexei Starovoitov, bpf, netdev, Daniel Borkmann,
	kernel-team

On Thu, Aug 29, 2019 at 11:40 AM Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Wed, 28 Aug 2019 23:45:02 -0700, Yonghong Song wrote:
> > Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
> > map entries per syscall.
> >   https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
> >
> > During discussion, we found more use cases can be supported in a similar
> > map operation batching framework. For example, batched map lookup and delete,
> > which can be really helpful for bcc.
> >   https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
> >   https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
> >
> > Also, in bcc, we have API to delete all entries in a map.
> >   https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
> >
> > For map update, batched operations also useful as sometimes applications need
> > to populate initial maps with more than one entry. For example, the below
> > example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
> >   https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
> >
> > This patch addresses all the above use cases. To make uapi stable, it also
> > covers other potential use cases. Four bpf syscall subcommands are introduced:
> >     BPF_MAP_LOOKUP_BATCH
> >     BPF_MAP_LOOKUP_AND_DELETE_BATCH
> >     BPF_MAP_UPDATE_BATCH
> >     BPF_MAP_DELETE_BATCH
> >
> > In userspace, application can iterate through the whole map one batch
> > as a time, e.g., bpf_map_lookup_batch() in the below:
> >     p_key = NULL;
> >     p_next_key = &key;
> >     while (true) {
> >        err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
> >                                   &batch_size, elem_flags, flags);
> >        if (err) ...
> >        if (p_next_key) break; // done
> >        if (!p_key) p_key = p_next_key;
> >     }
> > Please look at individual patches for details of new syscall subcommands
> > and examples of user codes.
> >
> > The testing is also done in a qemu VM environment:
> >       measure_lookup: max_entries 1000000, batch 10, time 342ms
> >       measure_lookup: max_entries 1000000, batch 1000, time 295ms
> >       measure_lookup: max_entries 1000000, batch 1000000, time 270ms
> >       measure_lookup: max_entries 1000000, no batching, time 1346ms
> >       measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
> >       measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
> >       measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
> >       measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
> >       measure_delete: max_entries 1000000, batch, time 220ms
> >       measure_delete: max_entries 1000000, not batch, time 1289ms
> > For a 1M entry hash table, batch size of 10 can reduce cpu time
> > by 70%. Please see patch "tools/bpf: measure map batching perf"
> > for details of test codes.
>
> Hi Yonghong!
>
> great to see this, we have been looking at implementing some way to
> speed up map walks as well.
>
> The direction we were looking in, after previous discussions [1],
> however, was to provide a BPF program which can run the logic entirely
> within the kernel.
>
> We have a rough PoC on the FW side (we can offload the program which
> walks the map, which is pretty neat), but the kernel verifier side
> hasn't really progressed. It will soon.
>
> The rough idea is that the user space provides two programs, "filter"
> and "dumper":
>
>         bpftool map exec id XYZ filter pinned /some/prog \
>                                 dumper pinned /some/other_prog
>
> Both programs get this context:
>
> struct map_op_ctx {
>         u64 key;
>         u64 value;
> }
>
> We need a per-map implementation of the exec side, but roughly maps
> would do:
>
>         LIST_HEAD(deleted);
>
>         for entry in map {
>                 struct map_op_ctx {
>                         .key    = entry->key,
>                         .value  = entry->value,
>                 };
>
>                 act = BPF_PROG_RUN(filter, &map_op_ctx);
>                 if (act & ~ACT_BITS)
>                         return -EINVAL;
>
>                 if (act & DELETE) {
>                         map_unlink(entry);
>                         list_add(entry, &deleted);
>                 }
>                 if (act & STOP)
>                         break;
>         }
>
>         synchronize_rcu();
>
>         for entry in deleted {
>                 struct map_op_ctx {
>                         .key    = entry->key,
>                         .value  = entry->value,
>                 };
>
>                 BPF_PROG_RUN(dumper, &map_op_ctx);
>                 map_free(entry);
>         }
>
Hi Jakub,

how would that approach support percpu maps?

I'm thinking of a scenario where you want to do some calculations on
percpu maps and you are interested in the info on all the cpus, not
just the one that is running the bpf program. Currently, on a percpu
map the bpf_map_lookup_elem helper only returns a pointer to the data
for the executing cpu.
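
For instance, a rough user-space sketch (assuming a
BPF_MAP_TYPE_PERCPU_HASH with u32 keys and u64 counters, and that
libbpf_num_possible_cpus() is available) of what a single lookup from
user space already returns, one value slot per possible cpu:

    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void print_percpu_value(int map_fd, __u32 key)
    {
            int cpu, nr_cpus = libbpf_num_possible_cpus();
            __u64 *values;

            if (nr_cpus <= 0)
                    return;

            /* one 8-byte slot per possible cpu comes back in one call */
            values = calloc(nr_cpus, sizeof(*values));
            if (!values)
                    return;

            if (bpf_map_lookup_elem(map_fd, &key, values) == 0)
                    for (cpu = 0; cpu < nr_cpus; cpu++)
                            printf("cpu %d: %llu\n", cpu,
                                   (unsigned long long)values[cpu]);

            free(values);
    }

A bpf program doing the same lookup only sees the slot of the cpu it
runs on, so the in-kernel walker would have to expose the other slots
some other way.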

> The filter program can't perform any map operations other than lookup,
> otherwise we won't be able to guarantee that we'll walk the entire map
> (if the filter program deletes some entries in a unfortunate order).
>
> If user space just wants a pure dump it can simply load a program which
> dumps the entries into a perf ring.
>
> I'm bringing this up because that mechanism should cover what is
> achieved with this patch set and much more.
>
> In particular for networking workloads where old flows have to be
> pruned from the map periodically it's far more efficient to communicate
> to user space only the flows which timed out (the delete batching from
> this set won't help at all).
>
> With a 2M entry map and this patch set we still won't be able to prune
> once a second on one core.
>
> [1]
> https://lore.kernel.org/netdev/20190813130921.10704-4-quentin.monnet@netronome.com/
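
Just to check my understanding of the proposal, a hypothetical filter
program against that context might look roughly like the below (the
section name, the context layout and the ACT_* return bits are all
taken from the sketch above, none of this is existing UAPI):

    #include <linux/types.h>

    struct map_op_ctx {
            __u64 key;
            __u64 value;
    };

    #define ACT_KEEP        0
    #define ACT_DELETE      (1 << 0)
    #define ACT_STOP        (1 << 1)

    /* Prune entries whose value (treated here as a packet counter)
     * stayed at zero since the last sweep; a pure dump would instead
     * load a dumper that pushes ctx->key/ctx->value into a perf ring.
     */
    __attribute__((section("map_filter"), used))
    int drop_idle(struct map_op_ctx *ctx)
    {
            if (ctx->value == 0)
                    return ACT_DELETE;

            return ACT_KEEP;
    }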

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 02/13] bpf: refactor map_update_elem()
  2019-08-29  6:45 ` [PATCH bpf-next 02/13] bpf: refactor map_update_elem() Yonghong Song
@ 2019-08-29 23:37   ` Song Liu
  0 siblings, 0 replies; 38+ messages in thread
From: Song Liu @ 2019-08-29 23:37 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Networking, Alexei Starovoitov, Brian Vazquez,
	Daniel Borkmann, Kernel Team



> On Aug 28, 2019, at 11:45 PM, Yonghong Song <yhs@fb.com> wrote:
> 
> Refactor function map_update_elem() by creating a
> helper function bpf_map_update_elem() which will be
> used later by batched map update operation.
> 
> Also reuse function bpf_map_value_size()
> in map_update_elem().
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>

Acked-by: Song Liu <songliubraving@fb.com>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 03/13] bpf: refactor map_delete_elem()
  2019-08-29  6:45 ` [PATCH bpf-next 03/13] bpf: refactor map_delete_elem() Yonghong Song
@ 2019-08-29 23:39   ` Song Liu
  0 siblings, 0 replies; 38+ messages in thread
From: Song Liu @ 2019-08-29 23:39 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, netdev, Alexei Starovoitov, Brian Vazquez, Daniel Borkmann,
	Kernel Team



> On Aug 28, 2019, at 11:45 PM, Yonghong Song <yhs@fb.com> wrote:
> 
> Refactor function map_delete_elem() with a new helper
> bpf_map_delete_elem(), which will be used later
> for batched lookup_and_delete and delete operations.
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>

Acked-by: Song Liu <songliubraving@fb.com>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 04/13] bpf: refactor map_get_next_key()
  2019-08-29  6:45 ` [PATCH bpf-next 04/13] bpf: refactor map_get_next_key() Yonghong Song
@ 2019-08-29 23:39   ` Song Liu
  0 siblings, 0 replies; 38+ messages in thread
From: Song Liu @ 2019-08-29 23:39 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, netdev, Alexei Starovoitov, Brian Vazquez, Daniel Borkmann,
	Kernel Team



> On Aug 28, 2019, at 11:45 PM, Yonghong Song <yhs@fb.com> wrote:
> 
> Refactor function map_get_next_key() with a new helper
> bpf_map_get_next_key(), which will be used later
> for batched map lookup/lookup_and_delete/delete operations.
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>

Acked-by: Song Liu <songliubraving@fb.com>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-29 23:13   ` Brian Vazquez
@ 2019-08-30  0:15     ` Jakub Kicinski
  2019-08-30 20:15       ` Stanislav Fomichev
  0 siblings, 1 reply; 38+ messages in thread
From: Jakub Kicinski @ 2019-08-30  0:15 UTC (permalink / raw)
  To: Brian Vazquez
  Cc: Yonghong Song, Alexei Starovoitov, bpf, netdev, Daniel Borkmann,
	kernel-team

On Thu, 29 Aug 2019 16:13:59 -0700, Brian Vazquez wrote:
> > We need a per-map implementation of the exec side, but roughly maps
> > would do:
> >
> >         LIST_HEAD(deleted);
> >
> >         for entry in map {
> >                 struct map_op_ctx {
> >                         .key    = entry->key,
> >                         .value  = entry->value,
> >                 };
> >
> >                 act = BPF_PROG_RUN(filter, &map_op_ctx);
> >                 if (act & ~ACT_BITS)
> >                         return -EINVAL;
> >
> >                 if (act & DELETE) {
> >                         map_unlink(entry);
> >                         list_add(entry, &deleted);
> >                 }
> >                 if (act & STOP)
> >                         break;
> >         }
> >
> >         synchronize_rcu();
> >
> >         for entry in deleted {
> >                 struct map_op_ctx {
> >                         .key    = entry->key,
> >                         .value  = entry->value,
> >                 };
> >
> >                 BPF_PROG_RUN(dumper, &map_op_ctx);
> >                 map_free(entry);
> >         }
> >  
> Hi Jakub,
> 
> how would that approach support percpu maps?
> 
> I'm thinking of a scenario where you want to do some calculations on
> percpu maps and you are interested on the info on all the cpus not
> just the one that is running the bpf program. Currently on a pcpu map
> the bpf_map_lookup_elem helper only returns the pointer to the data of
> the executing cpu.

Right, we need to have the iteration outside of the bpf program itself,
and pass the element in through the context. That way we can feed each
per cpu entry into the program separately.
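
Something along these lines, purely as pseudocode in the same style as
above (the 'cpu' field and the per_cpu_value() accessor are not part
of the earlier sketch, just an assumption of how it could be extended):

    for entry in map {
            for_each_possible_cpu(cpu) {
                    struct map_op_ctx ctx = {
                            .key    = entry->key,
                            .value  = per_cpu_value(entry, cpu),
                            .cpu    = cpu,
                    };

                    act = BPF_PROG_RUN(filter, &ctx);
                    /* same ACT_* handling as in the non-percpu loop */
            }
    }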

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 05/13] bpf: adding map batch processing support
  2019-08-29 23:01   ` Brian Vazquez
@ 2019-08-30  6:39     ` Yonghong Song
  2019-08-30  6:58       ` Alexei Starovoitov
  0 siblings, 1 reply; 38+ messages in thread
From: Yonghong Song @ 2019-08-30  6:39 UTC (permalink / raw)
  To: Brian Vazquez
  Cc: bpf, netdev, Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 8/29/19 4:01 PM, Brian Vazquez wrote:
> Hi Yonghong!
> 
> Thanks for sending this series of patches and starting the discussion.
> 
> On Wed, Aug 28, 2019 at 11:45 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
>> map entries per syscall.
>>    https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
>>
>> During discussion, we found more use cases can be supported in a similar
>> map operation batching framework. For example, batched map lookup and delete,
>> which can be really helpful for bcc.
>>    https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
>>    https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
>>
>> Also, in bcc, we have API to delete all entries in a map.
>>    https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
>>
>> For map update, batched operations also useful as sometimes applications need
>> to populate initial maps with more than one entry. For example, the below
>> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
>>    https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
>>
>> This patch addresses all the above use cases. To make uapi stable, it also
>> covers other potential use cases. Four bpf syscall subcommands are introduced:
>>          BPF_MAP_LOOKUP_BATCH
>>          BPF_MAP_LOOKUP_AND_DELETE_BATCH
>>          BPF_MAP_UPDATE_BATCH
>>          BPF_MAP_DELETE_BATCH
>>
>> The UAPI attribute structure looks like:
>>          struct { /* struct used by BPF_MAP_*_BATCH commands */
>>                  __aligned_u64   start_key;      /* input: storing start key,
>>                                                   * if NULL, starting from the beginning.
>>                                                   */
>>                  __aligned_u64   next_start_key; /* output: storing next batch start_key,
>>                                                   * if NULL, no next key.
>>                                                   */
>>                  __aligned_u64   keys;           /* input/output: key buffer */
>>                  __aligned_u64   values;         /* input/output: value buffer */
>>                  __u32           count;          /* input: # of keys/values in
>>                                                   *   or fits in keys[]/values[].
>>                                                   * output: how many successful
>>                                                   *   lookup/lookup_and_delete
>>                                                   *   /delete/update operations.
>>                                                   */
>>                  __u32           map_fd;
>>                  __u64           elem_flags;     /* BPF_MAP_{UPDATE,LOOKUP}_ELEM flags */
>>                  __u64           flags;          /* flags for batch operation */
>>          } batch;
>>
>> The 'start_key' and 'next_start_key' are used to BPF_MAP_LOOKUP_BATCH,
>> BPF_MAP_LOOKUP_AND_DELETE_BATCH and BPF_MAP_DELETE_BATCH
>> to start the operation on 'start_key' and also set the
>> next batch start key in 'next_start_key'.
>>
>> If 'count' is greater than 0 and the return code is 0,
>>    . the 'count' may be updated to be smaller if there is less
>>      elements than 'count' for the operation. In such cases,
>>      'next_start_key' will be set to NULL.
>>    . the 'count' remains the same. 'next_start_key' could be NULL
>>      or could point to next start_key for batch processing.
>>
>> If 'count' is greater than 0 and the return code is an error
>> other than -EFAULT, the kernel will try to overwrite 'count'
>> to contain the number of successful element-level (lookup,
>> lookup_and_delete, update and delete) operations. The following
>> attributes can be checked:
>>    . if 'count' value is modified, the new value will be
>>      the number of successful element-level operations.
>>    . if 'count' value is modified, the keys[]/values[] will
>>      contain correct results for new 'count' number of
>>      operations for lookup[_and_delete] and update.
>>
>> The implementation in this patch mimics what user space
>> did, e.g., for lookup_and_delete,
>>      while(bpf_get_next_key(map, keyp, next_keyp) == 0) {
>>         bpf_map_delete_elem(map, keyp);
>>         bpf_map_lookup_elem(map, next_keyp, &value, flags);
>>         keyp, next_keyp = next_keyp, keyp;
>>      }
>> The similar loop is implemented in the kernel, and
>> each operation, bpf_get_next_key(), bpf_map_delete_elem()
>> and bpf_map_lookup_elem(), uses existing kernel functions
>> each of which has its own rcu_read_lock region, bpf_prog_active
>> protection, etc.
>> Therefore, it is totally possible that after bpf_get_next_key(),
>> the bpf_map_delete_elem() or bpf_map_lookup_elem() may fail
>> as the key may be deleted concurrently by kernel or
>> other user space processes/threads.
>> By default, the current implementation permits the loop
>> to continue, just like typical user space handling. But
>> a flag, BPF_F_ENFORCE_ENOENT, can be specified, so kernel
>> will return an error if bpf_map_delete_elem() or
>> bpf_map_lookup_elem() failed.
>>
>> The high-level algorithm for BPF_MAP_LOOKUP_BATCH and
>> BPF_MAP_LOOKUP_AND_DELETE_BATCH:
>>          if (start_key == NULL and next_start_key == NULL) {
>>                  lookup up to 'count' keys in keys[] and fill
>>                  corresponding values[], and delete those
>>                  keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
>>          } else if (start_key == NULL && next_start_key != NULL) {
>>                  lookup up to 'count' keys from the beginning
>>                  of the map and fill keys[]/values[], delete
>>                  those keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
>>                  Set 'next_start_key' for next batch operation.
>>          } else if (start_key != NULL && next_start_key != NULL) {
>>                  lookup to 'count' keys from 'start_key', inclusive,
>>                  and fill keys[]/values[], delete those keys if
>>                  BPF_MAP_LOOKUP_AND_DELETE_BATCH.
>>                  Set 'next_start_key' for next batch operation.
>>          }
>>
>> The high-level algorithm for BPF_MAP_UPDATE_BATCH:
>>          if (count != 0) {
>>                  do 'count' number of updates on keys[]/values[].
>>          }
>>
>> The high-level algorithm for BPF_MAP_DELETE_BATCH:
>>          if (count == 0) {
>>                  if (start_key == NULL) {
>>                          delete all elements from map.
>>                  } else {
>>                          delete all elements from start_key to the end of map.
>>                  }
>>          } else {
>>                  if (start_key == NULL and next_start_key == NULL) {
>>                          delete 'count' number of keys in keys[].
>>                  } else if (start_key == NULL and next_start_key != NULL) {
>>                          delete 'count' number of keys from the
>>                          beginning of the map and set 'next_start_key'
>>                          properly.
>>                  } else if (start_key != NULL and next_start_keeey != NULL) {
>>                          delete 'count' number of keys from 'start_key',
>>                          and set 'next_start_key' properly.
>>                  }
>>          }
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/uapi/linux/bpf.h |  27 +++
>>   kernel/bpf/syscall.c     | 448 +++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 475 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 5d2fb183ee2d..576688f13e8c 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -107,6 +107,10 @@ enum bpf_cmd {
>>          BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>>          BPF_MAP_FREEZE,
>>          BPF_BTF_GET_NEXT_ID,
>> +       BPF_MAP_LOOKUP_BATCH,
>> +       BPF_MAP_LOOKUP_AND_DELETE_BATCH,
>> +       BPF_MAP_UPDATE_BATCH,
>> +       BPF_MAP_DELETE_BATCH,
>>   };
>>
>>   enum bpf_map_type {
>> @@ -347,6 +351,9 @@ enum bpf_attach_type {
>>   /* flags for BPF_PROG_QUERY */
>>   #define BPF_F_QUERY_EFFECTIVE  (1U << 0)
>>
>> +/* flags for BPF_MAP_*_BATCH */
>> +#define BPF_F_ENFORCE_ENOENT   (1U << 0)
>> +
>>   enum bpf_stack_build_id_status {
>>          /* user space need an empty entry to identify end of a trace */
>>          BPF_STACK_BUILD_ID_EMPTY = 0,
>> @@ -396,6 +403,26 @@ union bpf_attr {
>>                  __u64           flags;
>>          };
>>
>> +       struct { /* struct used by BPF_MAP_*_BATCH commands */
>> +               __aligned_u64   start_key;      /* input: storing start key,
>> +                                                * if NULL, starting from the beginning.
>> +                                                */
>> +               __aligned_u64   next_start_key; /* output: storing next batch start_key,
>> +                                                * if NULL, no next key.
>> +                                                */
>> +               __aligned_u64   keys;           /* input/output: key buffer */
>> +               __aligned_u64   values;         /* input/output: value buffer */
>> +               __u32           count;          /* input: # of keys/values in
>> +                                                *   or fits in keys[]/values[].
>> +                                                * output: how many successful
>> +                                                *   lookup/lookup_and_delete
>> +                                                *   update/delete operations.
>> +                                                */
>> +               __u32           map_fd;
>> +               __u64           elem_flags;     /* BPF_MAP_*_ELEM flags */
>> +               __u64           flags;          /* flags for batch operation */
>> +       } batch;
>> +
>>          struct { /* anonymous struct used by BPF_PROG_LOAD command */
>>                  __u32           prog_type;      /* one of enum bpf_prog_type */
>>                  __u32           insn_cnt;
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 06308f0206e5..8746b55405f9 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -33,6 +33,17 @@
>>
>>   #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
>>
>> +#define BPF_MAP_BATCH_SWAP_KEYS(key1, key2, buf1, buf2)        \
>> +       do {                                            \
>> +               if (key1 == (buf1)) {                   \
>> +                       key1 = buf2;                    \
>> +                       key2 = buf1;                    \
>> +               } else {                                \
>> +                       key1 = buf1;                    \
>> +                       key2 = buf2;                    \
>> +               }                                       \
>> +       } while (0)                                     \
>> +
>>   DEFINE_PER_CPU(int, bpf_prog_active);
>>   static DEFINE_IDR(prog_idr);
>>   static DEFINE_SPINLOCK(prog_idr_lock);
>> @@ -1183,6 +1194,431 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
>>          return err;
>>   }
>>
>> +static void __map_batch_get_attrs(const union bpf_attr *attr,
>> +                                 void __user **skey, void __user **nskey,
>> +                                 void __user **keys, void __user **values,
>> +                                 u32 *max_count, u64 *elem_flags, u64 *flags)
>> +{
>> +       *skey = u64_to_user_ptr(attr->batch.start_key);
>> +       *nskey = u64_to_user_ptr(attr->batch.next_start_key);
>> +       *keys = u64_to_user_ptr(attr->batch.keys);
>> +       *values = u64_to_user_ptr(attr->batch.values);
>> +       *max_count = attr->batch.count;
>> +       *elem_flags = attr->batch.elem_flags;
>> +       *flags = attr->batch.flags;
>> +}
>> +
>> +static int
>> +__map_lookup_delete_batch_key_in_keys(struct bpf_map *map, void *key, void *value,
>> +                                     u32 max_count, u32 key_size, u32 value_size,
>> +                                     u64 elem_flags, void __user *keys,
>> +                                     void __user *values,
>> +                                     union bpf_attr __user *uattr,
>> +                                     bool ignore_enoent)
>> +{
>> +       u32 count, missed = 0;
>> +       int ret = 0, err;
>> +
>> +       for (count = 0; count < max_count; count++) {
>> +               if (copy_from_user(key, keys + count * key_size, key_size)) {
>> +                       ret = -EFAULT;
>> +                       break;
>> +               }
>> +
>> +               ret = bpf_map_copy_value(map, key, value, elem_flags);
>> +               if (ret) {
>> +                       if (ret != -ENOENT || !ignore_enoent)
>> +                               break;
>> +
>> +                       missed++;
>> +                       continue;
>> +               }
>> +
>> +
>> +               if (copy_to_user(values + count * value_size, value, value_size)) {
>> +                       ret = -EFAULT;
>> +                       break;
>> +               }
>> +
>> +               ret = bpf_map_delete_elem(map, key);
>> +               if (ret) {
>> +                       if (ret != -ENOENT || !ignore_enoent)
>> +                               break;
>> +
>> +                       missed++;
>> +               }
>> +       }
>> +
>> +       count -= missed;
>> +       if ((!ret && missed) || (ret && ret != -EFAULT)) {
>> +               err = put_user(count, &uattr->batch.count);
>> +               ret = err ? : ret;
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>> +static int map_lookup_and_delete_batch(struct bpf_map *map,
>> +                                      const union bpf_attr *attr,
>> +                                      union bpf_attr __user *uattr,
>> +                                      bool do_delete)
>> +{
>> +       u32 max_count, count = 0, key_size, roundup_key_size, value_size;
>> +       bool ignore_enoent, nextkey_is_null, copied;
>> +       void *buf = NULL, *key, *value, *next_key;
>> +       void __user *skey, *nskey, *keys, *values;
>> +       u64 elem_flags, flags, zero = 0;
>> +       int err, ret = 0;
>> +
>> +       if (map->map_type == BPF_MAP_TYPE_QUEUE ||
>> +           map->map_type == BPF_MAP_TYPE_STACK)
>> +               return -ENOTSUPP;
>> +
>> +       __map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
>> +                             &elem_flags, &flags);
>> +
>> +       if (elem_flags & ~BPF_F_LOCK || flags & ~BPF_F_ENFORCE_ENOENT)
>> +               return -EINVAL;
>> +
>> +       if (!max_count)
>> +               return 0;
>> +
>> +       /* The following max_count/skey/nskey combinations are supported:
>> +        * max_count != 0 && !skey && !nskey: loop/delete max_count elements in keys[]/values[].
>> +        * max_count != 0 && !skey && nskey: loop/delete max_count elements starting from map start.
>> +        * max_count != 0 && skey && nskey: loop/delete max_count elements starting from skey.
>> +        */
>> +       if (skey && !nskey)
>> +               return -EINVAL;
>> +
>> +       /* allocate space for two keys and one value. */
>> +       key_size = map->key_size;
>> +       roundup_key_size = round_up(map->key_size, 8);
>> +       value_size = bpf_map_value_size(map);
>> +       buf = kmalloc(roundup_key_size * 2 + value_size, GFP_USER | __GFP_NOWARN);
>> +       if (!buf)
>> +               return -ENOMEM;
>> +
>> +       key = buf;
>> +       next_key = buf + roundup_key_size;
>> +       value = buf + roundup_key_size * 2;
>> +       ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
>> +
>> +       if (!skey && !nskey) {
>> +               /* handle cases where keys in keys[] */
>> +               ret = __map_lookup_delete_batch_key_in_keys(map, key, value, max_count,
>> +                                                           key_size, value_size,
>> +                                                           elem_flags, keys, values,
>> +                                                           uattr, ignore_enoent);
>> +               goto out;
>> +       }
>> +
>> +       /* Get the first key. */
>> +       if (!skey) {
>> +               ret = bpf_map_get_next_key(map, NULL, key);
>> +               if (ret) {
>> +                       nextkey_is_null = true;
>> +                       goto after_loop;
>> +               }
>> +       } else if (copy_from_user(key, skey, key_size)) {
>> +               ret = -EFAULT;
>> +               goto out;
>> +       }
>> +
>> +       /* Copy the first key/value pair */
>> +       ret = bpf_map_copy_value(map, key, value, elem_flags);
>> +       if (ret) {
>> +               if (skey)
>> +                       goto out;
>> +
>> +               nextkey_is_null = true;
>> +               goto after_loop;
>> +       }
>> +
>> +       if (copy_to_user(keys, key, key_size) ||
>> +           copy_to_user(values, value, value_size)) {
>> +               ret = -EFAULT;
>> +               goto out;
>> +       }
>> +
>> +       /* We will always try to get next_key first
>> +        * and then delete the current key.
>> +        */
>> +       ret = bpf_map_get_next_key(map, key, next_key);
> 
> One of the issues I see in this implementation is that is still
> relying on the existing functions and has the same consistency
> problems that my attempt had.

Right. The approach is very similar to what you proposed earlier.

> 
> The problem happens when you are trying to do batch lookup on a
> hashmap and when executing bpf_map_get_next_key(map, key, next_key)
> the key is removed, then that call will return the first key and you'd
> start iterating the map from the beginning again and retrieve
> duplicate information.

Right. Maybe we can have another bpf_map_ops callback function
like 'map_batch_get_next_key' which won't fall back to the
first key if the 'key' is not available in the hash table?

If 'map_batch_get_next_key' fails because the 'key' is gone, we just
return to user space with whatever results we have collected so far.

We can have a flag to control this behavior. Some users may not
care about duplication.
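
A hypothetical fragment of how the batch loop could use it (neither
the callback nor the flag below exists today; both just follow the
suggestion above, with -ENOENT standing for "the start key is gone"):

            if (map->ops->map_batch_get_next_key)
                    ret = map->ops->map_batch_get_next_key(map, key, next_key);
            else
                    ret = bpf_map_get_next_key(map, key, next_key);

            if (ret == -ENOENT && !(flags & BPF_F_ALLOW_RESTART)) {
                    /* The current key was removed concurrently: stop here
                     * and report the partial batch instead of restarting
                     * from the first key and returning duplicates.
                     */
                    nextkey_is_null = true;
                    goto after_loop;
            }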

> 
> Note that sometimes you can also start from the same bucket when the
> key is updated while dumping it because it can be inserted on the head
> of the bucket so you could potentially revisit elements that you had
> already visited.

Do you think 'map_batch_get_next_key' approach could solve this as well?

> 
>  From previous discussion my understanding was that what we wanted was
> to pursue 'atomic' compounded operations first and after that, try to
> batch them. Although I don't think there's an easy way of batching and
> being consistent at the same time.

Yes, I thought about this 'atomic' compounded approach. First, take
batched deletion as an example: after some of the elements have been
deleted, we may find that one key is gone, and we cannot really roll
back to the state before any deletion happened.

Second, even if we permit partial work, concurrency between batched
update/delete and element-level update/delete (from user space or a
bpf program) is still hard to manage. We cannot put a giant batched
update/delete inside a spin lock (bpf programs use spin locks for many
update/delete operations).

I agree with you: I also did not find an easy way to have both
batching and good consistency.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bp_map_copy_value helper functions
  2019-08-29 22:04   ` Song Liu
@ 2019-08-30  6:40     ` Yonghong Song
  0 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-30  6:40 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Networking, Alexei Starovoitov, Brian Vazquez,
	Daniel Borkmann, Kernel Team



On 8/29/19 3:04 PM, Song Liu wrote:
> 
> 
>> On Aug 28, 2019, at 11:45 PM, Yonghong Song <yhs@fb.com> wrote:
>>
>> From: Brian Vazquez <brianvv@google.com>
>>
>> Move reusable code from map_lookup_elem to helper functions to avoid code
>> duplication in kernel/bpf/syscall.c
>>
>> Suggested-by: Stanislav Fomichev <sdf@google.com>
>> Signed-off-by: Brian Vazquez <brianvv@google.com>
> 
> 
> Acked-by: Song Liu <songliubraving@fb.com>
> 
> Yonghong, we also need your SoB.

Thanks for reminding me of this. Will add my SoB in next revision.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 05/13] bpf: adding map batch processing support
  2019-08-30  6:39     ` Yonghong Song
@ 2019-08-30  6:58       ` Alexei Starovoitov
  0 siblings, 0 replies; 38+ messages in thread
From: Alexei Starovoitov @ 2019-08-30  6:58 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Brian Vazquez, bpf, netdev, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team

On Fri, Aug 30, 2019 at 06:39:48AM +0000, Yonghong Song wrote:
> > 
> > The problem happens when you are trying to do batch lookup on a
> > hashmap and when executing bpf_map_get_next_key(map, key, next_key)
> > the key is removed, then that call will return the first key and you'd
> > start iterating the map from the beginning again and retrieve
> > duplicate information.
> 
> Right. Maybe we can have another bpf_map_ops callback function
> like 'map_batch_get_next_key' which won't fall back to the
> first key if the 'key' is not available in the hash table?

The reason I picked this get_next_key behavior long ago is that
I couldn't come up with a way to pick the next key consistently.
In a hash table the elements are not sorted.
If there is more than one element in a hash table bucket,
they are added to the linked list in a more or less random order.
If one out of N elems in the bucket is deleted, which one should be
picked next?
select_bucket() picks the bucket.
If lookup_nulls_elem_raw() cannot find the element, which one in
the linked list is the "right one" to continue from?
Iterating over a hash table without duplicates while elements
are being added and removed in parallel is a hard problem to solve.
I think "best effort" is the right answer.
When users care about consistency they should use map-in-map.
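
Roughly, as a simplified sketch paraphrasing kernel/bpf/hashtab.c (the
two *_elem_in_bucket() helpers below stand in for the hlist_nulls
iteration, and details are omitted):

    static int htab_get_next_key_sketch(struct bpf_htab *htab, void *key,
                                        void *next_key, u32 key_size)
    {
            struct hlist_nulls_head *head;
            struct htab_elem *l, *next_l;
            u32 hash, i = 0;

            if (!key)
                    goto find_first_elem;

            hash = htab_map_hash(key, key_size, htab->hashrnd);
            head = select_bucket(htab, hash);

            l = lookup_nulls_elem_raw(head, hash, key, key_size,
                                      htab->n_buckets);
            if (!l)         /* key is gone: restart the walk from bucket 0 */
                    goto find_first_elem;

            /* key found: return its successor in the same bucket, if any */
            next_l = next_elem_in_bucket(l);
            if (next_l) {
                    memcpy(next_key, next_l->key, key_size);
                    return 0;
            }

            /* bucket exhausted: continue with the following bucket */
            i = (hash & (htab->n_buckets - 1)) + 1;

    find_first_elem:
            for (; i < htab->n_buckets; i++) {
                    head = select_bucket(htab, i);
                    next_l = first_elem_in_bucket(head);
                    if (next_l) {
                            memcpy(next_key, next_l->key, key_size);
                            return 0;
                    }
            }

            return -ENOENT; /* walked past the last element */
    }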


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-29 18:39 ` [PATCH bpf-next 00/13] bpf: adding map batch processing support Jakub Kicinski
  2019-08-29 23:13   ` Brian Vazquez
@ 2019-08-30  7:25   ` Yonghong Song
  2019-08-30 21:35     ` Jakub Kicinski
  1 sibling, 1 reply; 38+ messages in thread
From: Yonghong Song @ 2019-08-30  7:25 UTC (permalink / raw)
  To: Jakub Kicinski, Alexei Starovoitov
  Cc: bpf, netdev, Brian Vazquez, Daniel Borkmann, Kernel Team



On 8/29/19 11:39 AM, Jakub Kicinski wrote:
> On Wed, 28 Aug 2019 23:45:02 -0700, Yonghong Song wrote:
>> Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
>> map entries per syscall.
>>    https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
>>
>> During discussion, we found more use cases can be supported in a similar
>> map operation batching framework. For example, batched map lookup and delete,
>> which can be really helpful for bcc.
>>    https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
>>    https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
>>      
>> Also, in bcc, we have API to delete all entries in a map.
>>    https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
>>
>> For map update, batched operations also useful as sometimes applications need
>> to populate initial maps with more than one entry. For example, the below
>> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
>>    https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
>>
>> This patch addresses all the above use cases. To make uapi stable, it also
>> covers other potential use cases. Four bpf syscall subcommands are introduced:
>>      BPF_MAP_LOOKUP_BATCH
>>      BPF_MAP_LOOKUP_AND_DELETE_BATCH
>>      BPF_MAP_UPDATE_BATCH
>>      BPF_MAP_DELETE_BATCH
>>
>> In userspace, application can iterate through the whole map one batch
>> as a time, e.g., bpf_map_lookup_batch() in the below:
>>      p_key = NULL;
>>      p_next_key = &key;
>>      while (true) {
>>         err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
>>                                    &batch_size, elem_flags, flags);
>>         if (err) ...
>>         if (p_next_key) break; // done
>>         if (!p_key) p_key = p_next_key;
>>      }
>> Please look at individual patches for details of new syscall subcommands
>> and examples of user codes.
>>
>> The testing is also done in a qemu VM environment:
>>        measure_lookup: max_entries 1000000, batch 10, time 342ms
>>        measure_lookup: max_entries 1000000, batch 1000, time 295ms
>>        measure_lookup: max_entries 1000000, batch 1000000, time 270ms
>>        measure_lookup: max_entries 1000000, no batching, time 1346ms
>>        measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
>>        measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
>>        measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
>>        measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
>>        measure_delete: max_entries 1000000, batch, time 220ms
>>        measure_delete: max_entries 1000000, not batch, time 1289ms
>> For a 1M entry hash table, batch size of 10 can reduce cpu time
>> by 70%. Please see patch "tools/bpf: measure map batching perf"
>> for details of test codes.
> 
> Hi Yonghong!
> 
> great to see this, we have been looking at implementing some way to
> speed up map walks as well.
> 
> The direction we were looking in, after previous discussions [1],
> however, was to provide a BPF program which can run the logic entirely
> within the kernel.
> 
> We have a rough PoC on the FW side (we can offload the program which
> walks the map, which is pretty neat), but the kernel verifier side
> hasn't really progressed. It will soon.
> 
> The rough idea is that the user space provides two programs, "filter"
> and "dumper":
> 
> 	bpftool map exec id XYZ filter pinned /some/prog \
> 				dumper pinned /some/other_prog
> 
> Both programs get this context:
> 
> struct map_op_ctx {
> 	u64 key;
> 	u64 value;
> }
> 
> We need a per-map implementation of the exec side, but roughly maps
> would do:
> 
> 	LIST_HEAD(deleted);
> 
> 	for entry in map {
> 		struct map_op_ctx {
> 			.key	= entry->key,
> 			.value	= entry->value,
> 		};
> 
> 		act = BPF_PROG_RUN(filter, &map_op_ctx);
> 		if (act & ~ACT_BITS)
> 			return -EINVAL;
> 
> 		if (act & DELETE) {
> 			map_unlink(entry);
> 			list_add(entry, &deleted);
> 		}
> 		if (act & STOP)
> 			break;
> 	}
> 
> 	synchronize_rcu();
> 
> 	for entry in deleted {
> 		struct map_op_ctx {
> 			.key	= entry->key,
> 			.value	= entry->value,
> 		};
> 		
> 		BPF_PROG_RUN(dumper, &map_op_ctx);
> 		map_free(entry);
> 	}
> 
> The filter program can't perform any map operations other than lookup,
> otherwise we won't be able to guarantee that we'll walk the entire map
> (if the filter program deletes some entries in a unfortunate order).

Looks like you will provide a new program type and a per-map
implementation of the above code. My patch set indeed avoids a per-map
implementation for all of lookup/delete/get-next-key...

> 
> If user space just wants a pure dump it can simply load a program which
> dumps the entries into a perf ring.

A percpu perf ring is not really ideal for user space that simply
wants to get some key/value pairs back. Some kind of generic
non-percpu ring buffer might be better for such cases.

> 
> I'm bringing this up because that mechanism should cover what is
> achieved with this patch set and much more.

The only case it did not cover is batched update. But that may not
be super critical.

Your approach gives each element an action choice through another bpf
program, which is indeed powerful. My use cases are simpler than your
use case below, hence the simpler implementation.

> 
> In particular for networking workloads where old flows have to be
> pruned from the map periodically it's far more efficient to communicate
> to user space only the flows which timed out (the delete batching from
> this set won't help at all).

Maybe an LRU map would help in this case? It is designed for such
use cases.
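
E.g., something along the lines of the below (sizes are illustrative,
and bpf_map_def/SEC() are the helpers from samples/selftests): stale
flows simply get evicted when new ones are inserted, without any
user-space pruning pass.

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    struct bpf_map_def SEC("maps") flow_table = {
            .type           = BPF_MAP_TYPE_LRU_HASH,
            .key_size       = sizeof(__u64),        /* flow hash */
            .value_size     = sizeof(__u64),        /* packet counter */
            .max_entries    = 2 * 1024 * 1024,
    };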

> 
> With a 2M entry map and this patch set we still won't be able to prune
> once a second on one core.
> 
> [1]
> https://lore.kernel.org/netdev/20190813130921.10704-4-quentin.monnet@netronome.com/
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30  0:15     ` Jakub Kicinski
@ 2019-08-30 20:15       ` Stanislav Fomichev
  2019-08-30 20:55         ` Yonghong Song
  0 siblings, 1 reply; 38+ messages in thread
From: Stanislav Fomichev @ 2019-08-30 20:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Brian Vazquez, Yonghong Song, Alexei Starovoitov, bpf, netdev,
	Daniel Borkmann, kernel-team

On 08/29, Jakub Kicinski wrote:
> On Thu, 29 Aug 2019 16:13:59 -0700, Brian Vazquez wrote:
> > > We need a per-map implementation of the exec side, but roughly maps
> > > would do:
> > >
> > >         LIST_HEAD(deleted);
> > >
> > >         for entry in map {
> > >                 struct map_op_ctx {
> > >                         .key    = entry->key,
> > >                         .value  = entry->value,
> > >                 };
> > >
> > >                 act = BPF_PROG_RUN(filter, &map_op_ctx);
> > >                 if (act & ~ACT_BITS)
> > >                         return -EINVAL;
> > >
> > >                 if (act & DELETE) {
> > >                         map_unlink(entry);
> > >                         list_add(entry, &deleted);
> > >                 }
> > >                 if (act & STOP)
> > >                         break;
> > >         }
> > >
> > >         synchronize_rcu();
> > >
> > >         for entry in deleted {
> > >                 struct map_op_ctx {
> > >                         .key    = entry->key,
> > >                         .value  = entry->value,
> > >                 };
> > >
> > >                 BPF_PROG_RUN(dumper, &map_op_ctx);
> > >                 map_free(entry);
> > >         }
> > >  
> > Hi Jakub,
> > 
> > how would that approach support percpu maps?
> > 
> > I'm thinking of a scenario where you want to do some calculations on
> > percpu maps and you are interested on the info on all the cpus not
> > just the one that is running the bpf program. Currently on a pcpu map
> > the bpf_map_lookup_elem helper only returns the pointer to the data of
> > the executing cpu.
> 
> Right, we need to have the iteration outside of the bpf program itself,
> and pass the element in through the context. That way we can feed each
> per cpu entry into the program separately.
My 2 cents:

I personally like Jakub's/Quentin's proposal more. So if I get to choose
between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
(pending per-cpu issue which we actually care about).

But if we can have both, I don't have any objections; this patch
series looks to me a lot like what Brian did, just extended to more
commands. If we are fine with the shortcomings raised about the
original series, then let's go with this version. Maybe we can also
look into addressing these independently.

But if I pretend that we live in an ideal world, I'd just go with
whatever Jakub and Quentin are doing so we don't have to support
two APIs that essentially do the same (minus batching update, but
it looks like there is no clear use case for that yet; maybe).

I guess you can hold off this series a bit and discuss it at LPC,
you have a talk dedicated to that :-) (and afaiu, you are all going)

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30 20:15       ` Stanislav Fomichev
@ 2019-08-30 20:55         ` Yonghong Song
  2019-08-30 21:10           ` Jakub Kicinski
  2019-08-30 21:18           ` Stanislav Fomichev
  0 siblings, 2 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-30 20:55 UTC (permalink / raw)
  To: Stanislav Fomichev, Jakub Kicinski
  Cc: Brian Vazquez, Alexei Starovoitov, bpf, netdev, Daniel Borkmann,
	Kernel Team



On 8/30/19 1:15 PM, Stanislav Fomichev wrote:
> On 08/29, Jakub Kicinski wrote:
>> On Thu, 29 Aug 2019 16:13:59 -0700, Brian Vazquez wrote:
>>>> We need a per-map implementation of the exec side, but roughly maps
>>>> would do:
>>>>
>>>>          LIST_HEAD(deleted);
>>>>
>>>>          for entry in map {
>>>>                  struct map_op_ctx {
>>>>                          .key    = entry->key,
>>>>                          .value  = entry->value,
>>>>                  };
>>>>
>>>>                  act = BPF_PROG_RUN(filter, &map_op_ctx);
>>>>                  if (act & ~ACT_BITS)
>>>>                          return -EINVAL;
>>>>
>>>>                  if (act & DELETE) {
>>>>                          map_unlink(entry);
>>>>                          list_add(entry, &deleted);
>>>>                  }
>>>>                  if (act & STOP)
>>>>                          break;
>>>>          }
>>>>
>>>>          synchronize_rcu();
>>>>
>>>>          for entry in deleted {
>>>>                  struct map_op_ctx {
>>>>                          .key    = entry->key,
>>>>                          .value  = entry->value,
>>>>                  };
>>>>
>>>>                  BPF_PROG_RUN(dumper, &map_op_ctx);
>>>>                  map_free(entry);
>>>>          }
>>>>   
>>> Hi Jakub,
>>>
>>> how would that approach support percpu maps?
>>>
>>> I'm thinking of a scenario where you want to do some calculations on
>>> percpu maps and you are interested on the info on all the cpus not
>>> just the one that is running the bpf program. Currently on a pcpu map
>>> the bpf_map_lookup_elem helper only returns the pointer to the data of
>>> the executing cpu.
>>
>> Right, we need to have the iteration outside of the bpf program itself,
>> and pass the element in through the context. That way we can feed each
>> per cpu entry into the program separately.
> My 2 cents:
> 
> I personally like Jakub's/Quentin's proposal more. So if I get to choose
> between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
> (pending per-cpu issue which we actually care about).
> 
> But if we can have both, I don't have any objections; this patch
> series looks to me a lot like what Brian did, just extended to more
> commands. If we are fine with the shortcomings raised about the
> original series, then let's go with this version. Maybe we can also
> look into addressing these independently.
> 
> But if I pretend that we live in an ideal world, I'd just go with
> whatever Jakub and Quentin are doing so we don't have to support
> two APIs that essentially do the same (minus batching update, but
> it looks like there is no clear use case for that yet; maybe).
> 
> I guess you can hold off this series a bit and discuss it at LPC,
> you have a talk dedicated to that :-) (and afaiu, you are all going)

Absolutely. We will have a discussion on map batching and I signed
up for that :-). One of the goals for this patch set is for me to
explore what uapi (attr and bpf subcommands) we should expose to
users. Hopefully by then we will have more clarity on Jakub's
approach and we can discuss how to proceed.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30 20:55         ` Yonghong Song
@ 2019-08-30 21:10           ` Jakub Kicinski
  2019-08-30 22:24             ` Yonghong Song
  2019-08-30 21:18           ` Stanislav Fomichev
  1 sibling, 1 reply; 38+ messages in thread
From: Jakub Kicinski @ 2019-08-30 21:10 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Stanislav Fomichev, Brian Vazquez, Alexei Starovoitov, bpf,
	netdev, Daniel Borkmann, Kernel Team, Quentin Monnet

On Fri, 30 Aug 2019 20:55:33 +0000, Yonghong Song wrote:
> > I guess you can hold off this series a bit and discuss it at LPC,
> > you have a talk dedicated to that :-) (and afaiu, you are all going)  
> 
> Absolutely. We will have a discussion on map batching and I signed
> on with that :-)

FWIW, unfortunately neither Quentin nor I will be able to make it to
LPC this year :( But the idea has been floated a few times (I'd
certainly not claim more authorship here than Ed or Alexei), and it is
simple enough to move forward over email (I hope).

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30 20:55         ` Yonghong Song
  2019-08-30 21:10           ` Jakub Kicinski
@ 2019-08-30 21:18           ` Stanislav Fomichev
  2019-09-03 21:01             ` Alexei Starovoitov
  1 sibling, 1 reply; 38+ messages in thread
From: Stanislav Fomichev @ 2019-08-30 21:18 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Jakub Kicinski, Brian Vazquez, Alexei Starovoitov, bpf, netdev,
	Daniel Borkmann, Kernel Team

On 08/30, Yonghong Song wrote:
> 
> 
> On 8/30/19 1:15 PM, Stanislav Fomichev wrote:
> > On 08/29, Jakub Kicinski wrote:
> >> On Thu, 29 Aug 2019 16:13:59 -0700, Brian Vazquez wrote:
> >>>> We need a per-map implementation of the exec side, but roughly maps
> >>>> would do:
> >>>>
> >>>>          LIST_HEAD(deleted);
> >>>>
> >>>>          for entry in map {
> >>>>                  struct map_op_ctx {
> >>>>                          .key    = entry->key,
> >>>>                          .value  = entry->value,
> >>>>                  };
> >>>>
> >>>>                  act = BPF_PROG_RUN(filter, &map_op_ctx);
> >>>>                  if (act & ~ACT_BITS)
> >>>>                          return -EINVAL;
> >>>>
> >>>>                  if (act & DELETE) {
> >>>>                          map_unlink(entry);
> >>>>                          list_add(entry, &deleted);
> >>>>                  }
> >>>>                  if (act & STOP)
> >>>>                          break;
> >>>>          }
> >>>>
> >>>>          synchronize_rcu();
> >>>>
> >>>>          for entry in deleted {
> >>>>                  struct map_op_ctx {
> >>>>                          .key    = entry->key,
> >>>>                          .value  = entry->value,
> >>>>                  };
> >>>>
> >>>>                  BPF_PROG_RUN(dumper, &map_op_ctx);
> >>>>                  map_free(entry);
> >>>>          }
> >>>>   
> >>> Hi Jakub,
> >>>
> >>> how would that approach support percpu maps?
> >>>
> >>> I'm thinking of a scenario where you want to do some calculations on
> >>> percpu maps and you are interested on the info on all the cpus not
> >>> just the one that is running the bpf program. Currently on a pcpu map
> >>> the bpf_map_lookup_elem helper only returns the pointer to the data of
> >>> the executing cpu.
> >>
> >> Right, we need to have the iteration outside of the bpf program itself,
> >> and pass the element in through the context. That way we can feed each
> >> per cpu entry into the program separately.
> > My 2 cents:
> > 
> > I personally like Jakub's/Quentin's proposal more. So if I get to choose
> > between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
> > (pending per-cpu issue which we actually care about).
> > 
> > But if we can have both, I don't have any objections; this patch
> > series looks to me a lot like what Brian did, just extended to more
> > commands. If we are fine with the shortcomings raised about the
> > original series, then let's go with this version. Maybe we can also
> > look into addressing these independently.
> > 
> > But if I pretend that we live in an ideal world, I'd just go with
> > whatever Jakub and Quentin are doing so we don't have to support
> > two APIs that essentially do the same (minus batching update, but
> > it looks like there is no clear use case for that yet; maybe).
> > 
> > I guess you can hold off this series a bit and discuss it at LPC,
> > you have a talk dedicated to that :-) (and afaiu, you are all going)
> 
> Absolutely. We will have a discussion on map batching and I signed
> on with that :-). One of goals for this patch set is for me to explore
> what uapi (attr and bpf subcommands) we should expose to users.
> Hopefully at that time we will get more clarity
> on Jakub's approach and we can discuss how to proceed.
Sounds good! Your series didn't have an RFC tag, so I wasn't
sure whether we've fully committed to that approach or not.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30  7:25   ` Yonghong Song
@ 2019-08-30 21:35     ` Jakub Kicinski
  2019-08-30 22:38       ` Yonghong Song
  0 siblings, 1 reply; 38+ messages in thread
From: Jakub Kicinski @ 2019-08-30 21:35 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Alexei Starovoitov, bpf, netdev, Brian Vazquez, Daniel Borkmann,
	Kernel Team, Quentin Monnet

On Fri, 30 Aug 2019 07:25:54 +0000, Yonghong Song wrote:
> On 8/29/19 11:39 AM, Jakub Kicinski wrote:
> > On Wed, 28 Aug 2019 23:45:02 -0700, Yonghong Song wrote:  
> >> Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
> >> map entries per syscall.
> >>    https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
> >>
> >> During discussion, we found more use cases can be supported in a similar
> >> map operation batching framework. For example, batched map lookup and delete,
> >> which can be really helpful for bcc.
> >>    https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
> >>    https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
> >>      
> >> Also, in bcc, we have API to delete all entries in a map.
> >>    https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
> >>
> >> For map update, batched operations also useful as sometimes applications need
> >> to populate initial maps with more than one entry. For example, the below
> >> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
> >>    https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
> >>
> >> This patch addresses all the above use cases. To make uapi stable, it also
> >> covers other potential use cases. Four bpf syscall subcommands are introduced:
> >>      BPF_MAP_LOOKUP_BATCH
> >>      BPF_MAP_LOOKUP_AND_DELETE_BATCH
> >>      BPF_MAP_UPDATE_BATCH
> >>      BPF_MAP_DELETE_BATCH
> >>
> >> In userspace, application can iterate through the whole map one batch
> >> as a time, e.g., bpf_map_lookup_batch() in the below:
> >>      p_key = NULL;
> >>      p_next_key = &key;
> >>      while (true) {
> >>         err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
> >>                                    &batch_size, elem_flags, flags);
> >>         if (err) ...
> >>         if (p_next_key) break; // done
> >>         if (!p_key) p_key = p_next_key;
> >>      }
> >> Please look at individual patches for details of new syscall subcommands
> >> and examples of user codes.
> >>
> >> The testing is also done in a qemu VM environment:
> >>        measure_lookup: max_entries 1000000, batch 10, time 342ms
> >>        measure_lookup: max_entries 1000000, batch 1000, time 295ms
> >>        measure_lookup: max_entries 1000000, batch 1000000, time 270ms
> >>        measure_lookup: max_entries 1000000, no batching, time 1346ms
> >>        measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
> >>        measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
> >>        measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
> >>        measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
> >>        measure_delete: max_entries 1000000, batch, time 220ms
> >>        measure_delete: max_entries 1000000, not batch, time 1289ms
> >> For a 1M entry hash table, batch size of 10 can reduce cpu time
> >> by 70%. Please see patch "tools/bpf: measure map batching perf"
> >> for details of test codes.  
> > 
> > Hi Yonghong!
> > 
> > great to see this, we have been looking at implementing some way to
> > speed up map walks as well.
> > 
> > The direction we were looking in, after previous discussions [1],
> > however, was to provide a BPF program which can run the logic entirely
> > within the kernel.
> > 
> > We have a rough PoC on the FW side (we can offload the program which
> > walks the map, which is pretty neat), but the kernel verifier side
> > hasn't really progressed. It will soon.
> > 
> > The rough idea is that the user space provides two programs, "filter"
> > and "dumper":
> > 
> > 	bpftool map exec id XYZ filter pinned /some/prog \
> > 				dumper pinned /some/other_prog
> > 
> > Both programs get this context:
> > 
> > struct map_op_ctx {
> > 	u64 key;
> > 	u64 value;
> > }
> > 
> > We need a per-map implementation of the exec side, but roughly maps
> > would do:
> > 
> > 	LIST_HEAD(deleted);
> > 
> > 	for entry in map {
> > 		struct map_op_ctx {
> > 			.key	= entry->key,
> > 			.value	= entry->value,
> > 		};
> > 
> > 		act = BPF_PROG_RUN(filter, &map_op_ctx);
> > 		if (act & ~ACT_BITS)
> > 			return -EINVAL;
> > 
> > 		if (act & DELETE) {
> > 			map_unlink(entry);
> > 			list_add(entry, &deleted);
> > 		}
> > 		if (act & STOP)
> > 			break;
> > 	}
> > 
> > 	synchronize_rcu();
> > 
> > 	for entry in deleted {
> > 		struct map_op_ctx {
> > 			.key	= entry->key,
> > 			.value	= entry->value,
> > 		};
> > 		
> > 		BPF_PROG_RUN(dumper, &map_op_ctx);
> > 		map_free(entry);
> > 	}
> > 
> > The filter program can't perform any map operations other than lookup,
> > otherwise we won't be able to guarantee that we'll walk the entire map
> > (if the filter program deletes some entries in an unfortunate order).
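
(For illustration, a "filter" prog against that interface could be as small as
the sketch below; map_op_ctx, the ACT_* flags, the section name and the value
layout are all assumptions taken from the pseudocode above, nothing that
exists today.)

	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	#define ACT_DELETE	(1U << 0)	/* assumed, mirrors the DELETE bit above */
	#define ACT_STOP	(1U << 1)	/* assumed, mirrors the STOP bit above */

	struct map_op_ctx {
		__u64 key;
		__u64 value;
	};

	struct flow_value {			/* example value layout */
		__u64 last_seen_ns;
		__u64 packets;
	};

	SEC("map_filter")			/* assumed prog type/section */
	int prune_idle_flows(struct map_op_ctx *ctx)
	{
		struct flow_value *val = (struct flow_value *)(long)ctx->value;

		/* ask the kernel to unlink entries idle for more than 30s;
		 * they are then handed to the "dumper" prog after the RCU wait
		 */
		if (bpf_ktime_get_ns() - val->last_seen_ns > 30000000000ULL)
			return ACT_DELETE;
		return 0;
	}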
> 
> Looks like you will provide a new program type and per-map 
> implementation of the above code. My patch set indeed avoided per-map
> implementation for all of lookup/delete/get-next-key...

Indeed, the simple batched ops are undeniably lower LoC.

> > If user space just wants a pure dump it can simply load a program which
> > dumps the entries into a perf ring.  
> 
> A percpu perf ring is not really ideal for user space which simply just
> wants to get some key/value pairs back. Some kind of generic non-per-cpu
> ring buffer might be better for such cases.

I don't think it had to be per-cpu, but I may be blissfully ignorant
about the perf ring details :) bpf_perf_event_output() takes flags,
which are effectively selecting the "output CPU", no?

> > I'm bringing this up because that mechanism should cover what is
> > achieved with this patch set and much more.  
> 
> The only case it did not cover is batched update. But that may not
> be super critical.

Right, my other concern (which admittedly is slightly pedantic) is the
potential loss of modifications. The lookup_and_delete() operation does
not guarantee that the result returned to user space is the "final"
value. We'd need to wait for an RCU grace period between lookup and dump.
The "map is definitely unused now"/RCU guarantee has come up in the
past.

> Your approach gives each element an action choice through another bpf
> program. This is indeed powerful. My use case is simpler than your use case
> below, hence the simpler implementation.

Agreed, the simplicity is tempting. 

> > In particular for networking workloads where old flows have to be
> > pruned from the map periodically it's far more efficient to communicate
> > to user space only the flows which timed out (the delete batching from
> > this set won't help at all).  
> 
> Maybe an LRU map will help in this case? It is designed for such
> use cases.

LRU map would be perfect if it dumped the entry somewhere before it got
reused... Perhaps we need to attach a program to the LRU map that'd get
run when entries get reaped. That'd be cool 🤔 We could trivially reuse
the "dumper" prog type for this.

> > With a 2M entry map and this patch set we still won't be able to prune
> > once a second on one core.
> > 
> > [1]
> > https://lore.kernel.org/netdev/20190813130921.10704-4-quentin.monnet@netronome.com/


* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30 21:10           ` Jakub Kicinski
@ 2019-08-30 22:24             ` Yonghong Song
  0 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-30 22:24 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Stanislav Fomichev, Brian Vazquez, Alexei Starovoitov, bpf,
	netdev, Daniel Borkmann, Kernel Team, Quentin Monnet



On 8/30/19 2:10 PM, Jakub Kicinski wrote:
> On Fri, 30 Aug 2019 20:55:33 +0000, Yonghong Song wrote:
>>> I guess you can hold off this series a bit and discuss it at LPC,
>>> you have a talk dedicated to that :-) (and afaiu, you are all going)
>>
>> Absolutely. We will have a discussion on map batching and I signed
>> on with that :-)
> 
> FWIW unfortunately neither Quentin nor I will be able to make the LPC
> this year :( But the idea had been floated a few times (I'd certainly
> not claim more authorship here than Ed or Alexei), and is simple enough
> to move forward over email (I hope).

Yes, let us continue to discuss over emails. :-)


* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30 21:35     ` Jakub Kicinski
@ 2019-08-30 22:38       ` Yonghong Song
  0 siblings, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-08-30 22:38 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Alexei Starovoitov, bpf, netdev, Brian Vazquez, Daniel Borkmann,
	Kernel Team, Quentin Monnet



On 8/30/19 2:35 PM, Jakub Kicinski wrote:
> On Fri, 30 Aug 2019 07:25:54 +0000, Yonghong Song wrote:
>> On 8/29/19 11:39 AM, Jakub Kicinski wrote:
>>> On Wed, 28 Aug 2019 23:45:02 -0700, Yonghong Song wrote:
>>>> Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
>>>> map entries per syscall.
>>>>     https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
>>>>
>>>> During discussion, we found more use cases can be supported in a similar
>>>> map operation batching framework. For example, batched map lookup and delete,
>>>> which can be really helpful for bcc.
>>>>     https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
>>>>     https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
>>>>       
>>>> Also, in bcc, we have API to delete all entries in a map.
>>>>     https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
>>>>
>>>> For map update, batched operations are also useful, as sometimes applications need
>>>> to populate initial maps with more than one entry. For example, the below
>>>> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
>>>>     https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
>>>>
>>>> This patch set addresses all the above use cases. To make the uapi stable, it also
>>>> covers other potential use cases. Four bpf syscall subcommands are introduced:
>>>>       BPF_MAP_LOOKUP_BATCH
>>>>       BPF_MAP_LOOKUP_AND_DELETE_BATCH
>>>>       BPF_MAP_UPDATE_BATCH
>>>>       BPF_MAP_DELETE_BATCH
>>>>
>>>> In userspace, an application can iterate through the whole map one batch
>>>> at a time, e.g., with bpf_map_lookup_batch() as below:
>>>>       p_key = NULL;
>>>>       p_next_key = &key;
>>>>       while (true) {
>>>>          err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
>>>>                                     &batch_size, elem_flags, flags);
>>>>          if (err) ...
>>>>          if (!p_next_key) break; // done
>>>>          if (!p_key) p_key = p_next_key;
>>>>       }
>>>> Please look at individual patches for details of new syscall subcommands
>>>> and examples of user codes.
>>>>
>>>> The testing is also done in a qemu VM environment:
>>>>         measure_lookup: max_entries 1000000, batch 10, time 342ms
>>>>         measure_lookup: max_entries 1000000, batch 1000, time 295ms
>>>>         measure_lookup: max_entries 1000000, batch 1000000, time 270ms
>>>>         measure_lookup: max_entries 1000000, no batching, time 1346ms
>>>>         measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
>>>>         measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
>>>>         measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
>>>>         measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
>>>>         measure_delete: max_entries 1000000, batch, time 220ms
>>>>         measure_delete: max_entries 1000000, not batch, time 1289ms
>>>> For a 1M entry hash table, batch size of 10 can reduce cpu time
>>>> by 70%. Please see patch "tools/bpf: measure map batching perf"
>>>> for details of test codes.
>>>
>>> Hi Yonghong!
>>>
>>> great to see this, we have been looking at implementing some way to
>>> speed up map walks as well.
>>>
>>> The direction we were looking in, after previous discussions [1],
>>> however, was to provide a BPF program which can run the logic entirely
>>> within the kernel.
>>>
>>> We have a rough PoC on the FW side (we can offload the program which
>>> walks the map, which is pretty neat), but the kernel verifier side
>>> hasn't really progressed. It will soon.
>>>
>>> The rough idea is that the user space provides two programs, "filter"
>>> and "dumper":
>>>
>>> 	bpftool map exec id XYZ filter pinned /some/prog \
>>> 				dumper pinned /some/other_prog
>>>
>>> Both programs get this context:
>>>
>>> struct map_op_ctx {
>>> 	u64 key;
>>> 	u64 value;
>>> }
>>>
>>> We need a per-map implementation of the exec side, but roughly maps
>>> would do:
>>>
>>> 	LIST_HEAD(deleted);
>>>
>>> 	for entry in map {
>>> 		struct map_op_ctx {
>>> 			.key	= entry->key,
>>> 			.value	= entry->value,
>>> 		};
>>>
>>> 		act = BPF_PROG_RUN(filter, &map_op_ctx);
>>> 		if (act & ~ACT_BITS)
>>> 			return -EINVAL;
>>>
>>> 		if (act & DELETE) {
>>> 			map_unlink(entry);
>>> 			list_add(entry, &deleted);
>>> 		}
>>> 		if (act & STOP)
>>> 			break;
>>> 	}
>>>
>>> 	synchronize_rcu();
>>>
>>> 	for entry in deleted {
>>> 		struct map_op_ctx {
>>> 			.key	= entry->key,
>>> 			.value	= entry->value,
>>> 		};
>>> 		
>>> 		BPF_PROG_RUN(dumper, &map_op_ctx);
>>> 		map_free(entry);
>>> 	}
>>>
>>> The filter program can't perform any map operations other than lookup,
>>> otherwise we won't be able to guarantee that we'll walk the entire map
>>> (if the filter program deletes some entries in an unfortunate order).
>>
>> Looks like you will provide a new program type and per-map
>> implementation of the above code. My patch set indeed avoided per-map
>> implementation for all of lookup/delete/get-next-key...
> 
> Indeed, the simple batched ops are undeniably lower LoC.
> 
>>> If user space just wants a pure dump it can simply load a program which
>>> dumps the entries into a perf ring.
>>
>> A percpu perf ring is not really ideal for user space which simply just
>> wants to get some key/value pairs back. Some kind of generic non-per-cpu
>> ring buffer might be better for such cases.
> 
> I don't think it had to be per-cpu, but I may be blissfully ignorant
> about the perf ring details :) bpf_perf_event_output() takes flags,
> which are effectively selecting the "output CPU", no?

Right, it does not need to be per-cpu; one particular cpu
can be selected. Which cpu to bind to, though, will always be
subject to debate (why this cpu and not another one?).
This works, and I am just thinking a ring buffer without
binding to a cpu would be better and less confusing to users.
But that may need yet another ring buffer implementation
in the kernel, and people might not like it.
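
For reference, the cpu is selected via the lower 32 bits of the flags
argument today; a minimal sketch (the event map, sample layout and attach
point are just examples):

	#include <linux/ptrace.h>
	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	struct bpf_map_def SEC("maps") events = {
		.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
		.key_size	= sizeof(int),
		.value_size	= sizeof(int),
		.max_entries	= 64,	/* >= number of possible cpus */
	};

	struct sample {
		__u64 key;
		__u64 value;
	};

	SEC("kprobe/example")			/* attach point is just an example */
	int emit_sample(struct pt_regs *ctx)
	{
		struct sample s = { .key = 1, .value = 2 };

		/* lower 32 bits of flags pick the ring: the current cpu ... */
		bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &s, sizeof(s));
		/* ... or one fixed cpu (index 0 here), so user space only
		 * has to poll a single ring
		 */
		bpf_perf_event_output(ctx, &events, 0, &s, sizeof(s));
		return 0;
	}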


* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-08-30 21:18           ` Stanislav Fomichev
@ 2019-09-03 21:01             ` Alexei Starovoitov
  2019-09-03 22:30               ` Stanislav Fomichev
  0 siblings, 1 reply; 38+ messages in thread
From: Alexei Starovoitov @ 2019-09-03 21:01 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Yonghong Song, Jakub Kicinski, Brian Vazquez, Alexei Starovoitov,
	bpf, netdev, Daniel Borkmann, Kernel Team

On Fri, Aug 30, 2019 at 02:18:09PM -0700, Stanislav Fomichev wrote:
> > > 
> > > I personally like Jakub's/Quentin's proposal more. So if I get to choose
> > > between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
> > > (pending per-cpu issue which we actually care about).
> > > 
> > > But if we can have both, I don't have any objections; this patch

I think we need to have both.
imo Jakub's and Yonghong's approaches are solving slightly different cases.

filter+dump via a program is better suited for LRU map walks where the filter prog
would do some non-trivial logic.
Whereas plain 'delete all' or 'dump all' is much simpler to use without
loading yet another prog just to dump it.
bpf infra today isn't quite ready for these very short-lived auxiliary progs.
At prog load, pages get a read-only mapping, tlbs across cpus are flushed,
kallsyms are populated, FDs are allocated, etc.
Loading the prog is a heavy operation. There was chatter before about having
built-in progs. This filter+dump could benefit from builtin 'allow all'
or 'delete all' progs, but imo that complicates the design and raises even
more questions than it answers. Should these builtin progs show up
in 'bpftool prog show'? When do they load/unload? Do they have the same safety
requirements as normal progs? etc.
imo it's fine to have a little bit of overlap between apis.
So I think we should proceed with both batching apis.

Having said that, I think both are suffering from the important issue pointed
out by Brian: when the kernel deletes an element, the get_next_key iterator
over a hash/lru map will produce duplicates.
The amount of duplicates can be huge. When the batched iterator is slow and
the bpf prog is doing a lot of updates/deletes, there could be 10x worth of
duplicates, since the walk will resume from the beginning.
User space cannot be tasked to deal with that.
I think this issue has to be solved in the kernel first, and it may require
a different batching api.

One idea is to use the bucket spin_lock and batch process the map bucket-at-a-time.
From an api pov the user space will tell the kernel:
- here is a buffer for N elements. start the dump from the beginning.
- the kernel will return <= N elements and an iterator.
- user space will pass this opaque iterator back to get another batch.
For a well behaved hash/lru map there will be zero or one elements per bucket.
When there are 2+ the batching logic can process them together.
If 'lookup' is requested the kernel can check whether user space provided
enough space for these 2 elements. If not, abort the batch earlier.
get_next_key won't be used. Instead some sort of opaque iterator
will be returned to user space, so the next batch lookup can start from it.
This iterator could be the index of the last dumped bucket.
This idea won't work for pathological hash tables though.
A lot of elements in a single bucket may be more than a single batch has room for.
In such a case the iterator will get stuck, since num_of_elements_in_bucket > batch_buf_size.
Maybe a special error code can be used to solve that?
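
Roughly, the user space side of that contract could look like the sketch
below (bpf_map_lookup_batch_v2() and its iterator token are made-up names,
only to illustrate the flow):

	__u64 iter = 0;			/* 0: start from the first bucket */
	__u32 count, i;
	int err;

	do {
		count = N;		/* room for N elements in keys[]/values[] */
		/* hypothetical call: fills <= N elements (whole buckets only),
		 * advances iter to the next bucket to visit, and returns
		 * -ENOENT once the last bucket has been dumped
		 */
		err = bpf_map_lookup_batch_v2(map_fd, &iter, keys, values, &count);
		if (err && err != -ENOENT)
			break;

		for (i = 0; i < count; i++)
			handle_elem(&keys[i], &values[i]);	/* made-up consumer */
	} while (!err);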

I hope we can come up with other ideas to have a stable iterator over hash table.
Let's use email to describe the ideas and upcoming LPC conference to
sort out details and finalize the one to use.



* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-09-03 21:01             ` Alexei Starovoitov
@ 2019-09-03 22:30               ` Stanislav Fomichev
  2019-09-03 23:07                 ` Brian Vazquez
  2019-09-03 23:07                 ` Yonghong Song
  0 siblings, 2 replies; 38+ messages in thread
From: Stanislav Fomichev @ 2019-09-03 22:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Yonghong Song, Jakub Kicinski, Brian Vazquez, Alexei Starovoitov,
	bpf, netdev, Daniel Borkmann, Kernel Team

On 09/03, Alexei Starovoitov wrote:
> On Fri, Aug 30, 2019 at 02:18:09PM -0700, Stanislav Fomichev wrote:
> > > > 
> > > > I personally like Jakub's/Quentin's proposal more. So if I get to choose
> > > > between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
> > > > (pending per-cpu issue which we actually care about).
> > > > 
> > > > But if we can have both, I don't have any objections; this patch
> 
> I think we need to have both.
> imo Jakub's and Yonghong's approaches are solving slightly different cases.
> 
> filter+dump via a program is better suited for LRU map walks where the filter prog
> would do some non-trivial logic.
> Whereas plain 'delete all' or 'dump all' is much simpler to use without
> loading yet another prog just to dump it.
> bpf infra today isn't quite ready for these very short-lived auxiliary progs.
> At prog load, pages get a read-only mapping, tlbs across cpus are flushed,
> kallsyms are populated, FDs are allocated, etc.
> Loading the prog is a heavy operation. There was chatter before about having
> built-in progs. This filter+dump could benefit from builtin 'allow all'
> or 'delete all' progs, but imo that complicates the design and raises even
> more questions than it answers. Should these builtin progs show up
> in 'bpftool prog show'? When do they load/unload? Do they have the same safety
> requirements as normal progs? etc.
> imo it's fine to have a little bit of overlap between apis.
> So I think we should proceed with both batching apis.
We don't need to load filter+dump every time we need a dump, right?
We, internally, want to have this 'batch dump' only for long running daemons
(I think the same applies to bcc), we can load this filter+dump once and
then have a sys_bpf() command to trigger it.

Also, related, if we add this batch dump, it doesn't mean that
everything should switch to it. For example, I feel like we
are perfectly fine if bpftool still uses get_next_key+lookup
since we use it only for debugging.

> Having said that, I think both are suffering from the important issue pointed
> out by Brian: when the kernel deletes an element, the get_next_key iterator
> over a hash/lru map will produce duplicates.
> The amount of duplicates can be huge. When the batched iterator is slow and
> the bpf prog is doing a lot of updates/deletes, there could be 10x worth of
> duplicates, since the walk will resume from the beginning.
> User space cannot be tasked to deal with that.
> I think this issue has to be solved in the kernel first, and it may require
> a different batching api.
> 
> One idea is to use the bucket spin_lock and batch process the map bucket-at-a-time.
> From an api pov the user space will tell the kernel:
> - here is a buffer for N elements. start the dump from the beginning.
> - the kernel will return <= N elements and an iterator.
> - user space will pass this opaque iterator back to get another batch.
> For a well behaved hash/lru map there will be zero or one elements per bucket.
> When there are 2+ the batching logic can process them together.
> If 'lookup' is requested the kernel can check whether user space provided
> enough space for these 2 elements. If not, abort the batch earlier.
> get_next_key won't be used. Instead some sort of opaque iterator
> will be returned to user space, so the next batch lookup can start from it.
> This iterator could be the index of the last dumped bucket.
> This idea won't work for pathological hash tables though.
> A lot of elements in a single bucket may be more than a single batch has room for.
> In such a case the iterator will get stuck, since num_of_elements_in_bucket > batch_buf_size.
> Maybe a special error code can be used to solve that?
This all requires new per-map implementations unfortunately :-(
We were trying to see if we can somehow improve the existing bpf_map_ops
to be more friendly towards batching.

You also bring a valid point with regard to well behaved hash/lru,
we might be optimizing for the wrong case :-)

> I hope we can come up with other ideas to have a stable iterator over hash table.
> Let's use email to describe the ideas and upcoming LPC conference to
> sort out details and finalize the one to use.


* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-09-03 22:30               ` Stanislav Fomichev
@ 2019-09-03 23:07                 ` Brian Vazquez
  2019-09-04  1:35                   ` Alexei Starovoitov
  2019-09-03 23:07                 ` Yonghong Song
  1 sibling, 1 reply; 38+ messages in thread
From: Brian Vazquez @ 2019-09-03 23:07 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Alexei Starovoitov, Yonghong Song, Jakub Kicinski,
	Alexei Starovoitov, bpf, netdev, Daniel Borkmann, Kernel Team

On Tue, Sep 3, 2019 at 3:30 PM Stanislav Fomichev <sdf@fomichev.me> wrote:
>
> On 09/03, Alexei Starovoitov wrote:
> > On Fri, Aug 30, 2019 at 02:18:09PM -0700, Stanislav Fomichev wrote:
> > > > >
> > > > > I personally like Jakub's/Quentin's proposal more. So if I get to choose
> > > > > between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
> > > > > (pending per-cpu issue which we actually care about).
> > > > >
> > > > > But if we can have both, I don't have any objections; this patch
> >
> > I think we need to have both.
> > imo Jakub's and Yonghong's approaches are solving slightly different cases.
> >
> > filter+dump via a program is better suited for LRU map walks where the filter prog
> > would do some non-trivial logic.
> > Whereas plain 'delete all' or 'dump all' is much simpler to use without
> > loading yet another prog just to dump it.
> > bpf infra today isn't quite ready for these very short-lived auxiliary progs.
> > At prog load, pages get a read-only mapping, tlbs across cpus are flushed,
> > kallsyms are populated, FDs are allocated, etc.
> > Loading the prog is a heavy operation. There was chatter before about having
> > built-in progs. This filter+dump could benefit from builtin 'allow all'
> > or 'delete all' progs, but imo that complicates the design and raises even
> > more questions than it answers. Should these builtin progs show up
> > in 'bpftool prog show'? When do they load/unload? Do they have the same safety
> > requirements as normal progs? etc.
> > imo it's fine to have a little bit of overlap between apis.
> > So I think we should proceed with both batching apis.
> We don't need to load filter+dump every time we need a dump, right?
> We, internally, want to have this 'batch dump' only for long running daemons
> (I think the same applies to bcc), we can load this filter+dump once and
> then have a sys_bpf() command to trigger it.
>
> Also, related, if we add this batch dump, it doesn't mean that
> everything should switch to it. For example, I feel like we
> are perfectly fine if bpftool still uses get_next_key+lookup
> since we use it only for debugging.
>
> > Having said that, I think both are suffering from the important issue pointed
> > out by Brian: when the kernel deletes an element, the get_next_key iterator
> > over a hash/lru map will produce duplicates.
> > The amount of duplicates can be huge. When the batched iterator is slow and
> > the bpf prog is doing a lot of updates/deletes, there could be 10x worth of
> > duplicates, since the walk will resume from the beginning.
> > User space cannot be tasked to deal with that.
> > I think this issue has to be solved in the kernel first, and it may require
> > a different batching api.

We could also modify the get_next_key behaviour _only_ when it's called
from a dumping function, in which case we know that we want to move
forward, not backwards (basically, if prev_key is not found, then
retrieve the first key in the next bucket).

That approach might miss entries that are in the same bucket where
prev_key used to be, but missing entries is something that can always
happen (new additions in previous buckets); we cannot control that,
and as has been said before, if users care about consistency they can
use map-in-map.
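
In pseudocode, the change would roughly be the following (the 'from_batch'
flag and the helpers are illustrative, not existing kernel functions; today
the not-found case always restarts from the first bucket):

	elem = lookup_elem(map, prev_key);
	if (!elem) {
		bucket = hash(prev_key) & (n_buckets - 1);
		if (from_batch)
			/* prev_key was deleted under us: keep moving forward
			 * from the next bucket instead of restarting at 0
			 */
			return first_key_from_bucket(map, bucket + 1, next_key);
		return first_key_from_bucket(map, 0, next_key);
	}
	/* normal case: next element in the same bucket, or the next bucket */
	return key_after(map, elem, next_key);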

> > One idea is to use the bucket spin_lock and batch process the map bucket-at-a-time.
> > From an api pov the user space will tell the kernel:
> > - here is a buffer for N elements. start the dump from the beginning.
> > - the kernel will return <= N elements and an iterator.
> > - user space will pass this opaque iterator back to get another batch.
> > For a well behaved hash/lru map there will be zero or one elements per bucket.
> > When there are 2+ the batching logic can process them together.
> > If 'lookup' is requested the kernel can check whether user space provided
> > enough space for these 2 elements. If not, abort the batch earlier.
> > get_next_key won't be used. Instead some sort of opaque iterator
> > will be returned to user space, so the next batch lookup can start from it.
> > This iterator could be the index of the last dumped bucket.
> > This idea won't work for pathological hash tables though.
> > A lot of elements in a single bucket may be more than a single batch has room for.
> > In such a case the iterator will get stuck, since num_of_elements_in_bucket > batch_buf_size.
> > Maybe a special error code can be used to solve that?
> This all requires new per-map implementations unfortunately :-(
> We were trying to see if we can somehow improve the existing bpf_map_ops
> to be more friendly towards batching.

Agreed that one of the motivations for the current batching implementation
was to reuse the existing code as much as we can. Although this might
be a good opportunity to do some per-map implementations, as long as
they behave better than the current ones.

>
> You also bring a valid point with regard to well behaved hash/lru,
> we might be optimizing for the wrong case :-)

For pathological hash tables, wouldn't batched lookup_and_delete
help? In theory you are always requesting the current first_key,
copying it and deleting it. And even if you retrieve the same key
during the batch (because another cpu added it again), it would contain
different info, right?
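
Something like the drain loop below with the existing libbpf wrappers
(flow_key, flow_value and consume() are placeholders):

	#include <bpf/bpf.h>	/* bpf_map_get_next_key() and friends */

	struct flow_key key;
	struct flow_value value;

	/* bpf_map_get_next_key(fd, NULL, ...) returns the current first key;
	 * the loop ends once the map is empty (ENOENT)
	 */
	while (!bpf_map_get_next_key(map_fd, NULL, &key)) {
		if (!bpf_map_lookup_elem(map_fd, &key, &value))
			consume(&key, &value);
		bpf_map_delete_elem(map_fd, &key);
	}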

> > I hope we can come up with other ideas to have a stable iterator over hash table.
> > Let's use email to describe the ideas and upcoming LPC conference to
> > sort out details and finalize the one to use.


* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-09-03 22:30               ` Stanislav Fomichev
  2019-09-03 23:07                 ` Brian Vazquez
@ 2019-09-03 23:07                 ` Yonghong Song
  1 sibling, 0 replies; 38+ messages in thread
From: Yonghong Song @ 2019-09-03 23:07 UTC (permalink / raw)
  To: Stanislav Fomichev, Alexei Starovoitov
  Cc: Jakub Kicinski, Brian Vazquez, Alexei Starovoitov, bpf, netdev,
	Daniel Borkmann, Kernel Team



On 9/3/19 3:30 PM, Stanislav Fomichev wrote:
> On 09/03, Alexei Starovoitov wrote:
>> On Fri, Aug 30, 2019 at 02:18:09PM -0700, Stanislav Fomichev wrote:
>>>>>
>>>>> I personally like Jakub's/Quentin's proposal more. So if I get to choose
>>>>> between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
>>>>> (pending per-cpu issue which we actually care about).
>>>>>
>>>>> But if we can have both, I don't have any objections; this patch
>>
>> I think we need to have both.
>> imo Jakub's and Yonghong's approaches are solving slightly different cases.
>>
>> filter+dump via a program is better suited for LRU map walks where the filter prog
>> would do some non-trivial logic.
>> Whereas plain 'delete all' or 'dump all' is much simpler to use without
>> loading yet another prog just to dump it.
>> bpf infra today isn't quite ready for these very short-lived auxiliary progs.
>> At prog load, pages get a read-only mapping, tlbs across cpus are flushed,
>> kallsyms are populated, FDs are allocated, etc.
>> Loading the prog is a heavy operation. There was chatter before about having
>> built-in progs. This filter+dump could benefit from builtin 'allow all'
>> or 'delete all' progs, but imo that complicates the design and raises even
>> more questions than it answers. Should these builtin progs show up
>> in 'bpftool prog show'? When do they load/unload? Do they have the same safety
>> requirements as normal progs? etc.
>> imo it's fine to have a little bit of overlap between apis.
>> So I think we should proceed with both batching apis.
> We don't need to load filter+dump every time we need a dump, right?
> We, internally, want to have this 'batch dump' only for long running daemons
> (I think the same applies to bcc), we can load this filter+dump once and
> then have a sys_bpf() command to trigger it.
> 
> Also, related, if we add this batch dump, it doesn't mean that
> everything should switch to it. For example, I feel like we
> are perfectly fine if bpftool still uses get_next_key+lookup
> since we use it only for debugging.
> 
>> Having said that, I think both are suffering from the important issue pointed
>> out by Brian: when the kernel deletes an element, the get_next_key iterator
>> over a hash/lru map will produce duplicates.
>> The amount of duplicates can be huge. When the batched iterator is slow and
>> the bpf prog is doing a lot of updates/deletes, there could be 10x worth of
>> duplicates, since the walk will resume from the beginning.
>> User space cannot be tasked to deal with that.
>> I think this issue has to be solved in the kernel first, and it may require
>> a different batching api.
>>
>> One idea is to use the bucket spin_lock and batch process the map bucket-at-a-time.
>> From an api pov the user space will tell the kernel:
>> - here is a buffer for N elements. start the dump from the beginning.
>> - the kernel will return <= N elements and an iterator.
>> - user space will pass this opaque iterator back to get another batch.
>> For a well behaved hash/lru map there will be zero or one elements per bucket.
>> When there are 2+ the batching logic can process them together.
>> If 'lookup' is requested the kernel can check whether user space provided
>> enough space for these 2 elements. If not, abort the batch earlier.
>> get_next_key won't be used. Instead some sort of opaque iterator
>> will be returned to user space, so the next batch lookup can start from it.
>> This iterator could be the index of the last dumped bucket.
>> This idea won't work for pathological hash tables though.
>> A lot of elements in a single bucket may be more than a single batch has room for.
>> In such a case the iterator will get stuck, since num_of_elements_in_bucket > batch_buf_size.
>> Maybe a special error code can be used to solve that?
> This all requires new per-map implementations unfortunately :-(
> We were trying to see if we can somehow improve the existing bpf_map_ops
> to be more friendly towards batching.

Below is a link to folly's concurrent hashmap implementation:
https://github.com/facebook/folly/blob/master/folly/concurrency/ConcurrentHashMap.h
It uses segment-level batching, and each segment can contain multiple
buckets. It supports concurrent iteration vs. deletion.
I have yet to fully understand its implementation, but my guess is
that old elements are somehow kept around long enough until all the
queries, reads, etc. are done, and then garbage collection kicks in
and removes those old elements.

Kernel bucket-level locking should be okay. Indeed, per-map
implementations or tweaking the existing per-map interfaces will be necessary.

> 
> You also bring a valid point with regard to well behaved hash/lru,
> we might be optimizing for the wrong case :-)
> 
>> I hope we can come up with other ideas to have a stable iterator over hash table.
>> Let's use email to describe the ideas and upcoming LPC conference to
>> sort out details and finalize the one to use.


* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
  2019-09-03 23:07                 ` Brian Vazquez
@ 2019-09-04  1:35                   ` Alexei Starovoitov
  0 siblings, 0 replies; 38+ messages in thread
From: Alexei Starovoitov @ 2019-09-04  1:35 UTC (permalink / raw)
  To: Brian Vazquez
  Cc: Stanislav Fomichev, Yonghong Song, Jakub Kicinski,
	Alexei Starovoitov, bpf, netdev, Daniel Borkmann, Kernel Team

On Tue, Sep 03, 2019 at 04:07:17PM -0700, Brian Vazquez wrote:
> 
> We could also modify the get_next_key behaviour _only_ when it's called
> from a dumping function, in which case we know that we want to move
> forward, not backwards (basically, if prev_key is not found, then
> retrieve the first key in the next bucket).
> 
> That approach might miss entries that are in the same bucket where
> prev_key used to be, but missing entries is something that can always
> happen (new additions in previous buckets); we cannot control that,
> and as has been said before, if users care about consistency they can
> use map-in-map.

For the dump-all case such missed elements might be ok-ish,
but for delete-all it's probably not.
Imagine a bpf prog is doing deletes and a bcc script is doing delete-all.
With the 'go to the next bucket' logic there will be a case where
both the kernel and the user side are doing deletes, but some elements are still left.
True that map-in-map is the answer, but if we're doing a new get_next op
we probably should do it with fewer corner cases.

> > This all requires new per-map implementations unfortunately :-(
> > We were trying to see if we can somehow improve the existing bpf_map_ops
> > to be more friendly towards batching.
> 
> Agreed that one of the motivations for the current batching implementation
> was to reuse the existing code as much as we can. Although this might
> be a good opportunity to do some per-map implementations, as long as
> they behave better than the current ones.

I don't think non-reuse of get_next is a big deal.
Adding another get_next_v2 to all maps is a low cost compared
with the advantages of having a stable iteration api.

A stable walk over a hash map is imo a more important feature of the api than
the perf improvement brought by batching.



end of thread

Thread overview: 38+ messages
2019-08-29  6:45 [PATCH bpf-next 00/13] bpf: adding map batch processing support Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bp_map_copy_value helper functions Yonghong Song
2019-08-29 22:04   ` Song Liu
2019-08-30  6:40     ` Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 02/13] bpf: refactor map_update_elem() Yonghong Song
2019-08-29 23:37   ` Song Liu
2019-08-29  6:45 ` [PATCH bpf-next 03/13] bpf: refactor map_delete_elem() Yonghong Song
2019-08-29 23:39   ` Song Liu
2019-08-29  6:45 ` [PATCH bpf-next 04/13] bpf: refactor map_get_next_key() Yonghong Song
2019-08-29 23:39   ` Song Liu
2019-08-29  6:45 ` [PATCH bpf-next 05/13] bpf: adding map batch processing support Yonghong Song
2019-08-29 23:01   ` Brian Vazquez
2019-08-30  6:39     ` Yonghong Song
2019-08-30  6:58       ` Alexei Starovoitov
2019-08-29  6:45 ` [PATCH bpf-next 06/13] tools/bpf: sync uapi header bpf.h Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 07/13] tools/bpf: implement libbpf API functions for map batch operations Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 08/13] tools/bpf: add test for bpf_map_update_batch() Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 09/13] tools/bpf: add test for bpf_map_lookup_batch() Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 10/13] tools/bpf: add test for bpf_map_lookup_and_delete_batch() Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 11/13] tools/bpf: add test for bpf_map_delete_batch() Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 12/13] tools/bpf: add a multithreaded test for map batch operations Yonghong Song
2019-08-29  6:45 ` [PATCH bpf-next 13/13] tools/bpf: measure map batching perf Yonghong Song
2019-08-29 18:39 ` [PATCH bpf-next 00/13] bpf: adding map batch processing support Jakub Kicinski
2019-08-29 23:13   ` Brian Vazquez
2019-08-30  0:15     ` Jakub Kicinski
2019-08-30 20:15       ` Stanislav Fomichev
2019-08-30 20:55         ` Yonghong Song
2019-08-30 21:10           ` Jakub Kicinski
2019-08-30 22:24             ` Yonghong Song
2019-08-30 21:18           ` Stanislav Fomichev
2019-09-03 21:01             ` Alexei Starovoitov
2019-09-03 22:30               ` Stanislav Fomichev
2019-09-03 23:07                 ` Brian Vazquez
2019-09-04  1:35                   ` Alexei Starovoitov
2019-09-03 23:07                 ` Yonghong Song
2019-08-30  7:25   ` Yonghong Song
2019-08-30 21:35     ` Jakub Kicinski
2019-08-30 22:38       ` Yonghong Song
