All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
@ 2016-01-07 22:35 Martin KaFai Lau
  2016-01-07 22:35 ` [PATCH net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic Martin KaFai Lau
                   ` (4 more replies)
  0 siblings, 5 replies; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-07 22:35 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: FB Kernel Team, Alexei Starovoitov

This patchset adds BPF_MAP_TYPE_PERCPU_HASH map type which allows
percpu value.

BPF + kprobe is very useful in statistics collection.  In particular,
bpf is strong in doing aggregation within the kernel instead of
outputting a lot of raw samples to the userspace.

In some cases, bumping a counter/value of a particular key will have
noticeable impact.  For example, doing statistics collection
on received packets and aggregating them by network
prefix (like /64 in IPv6).  Having a percpu value can help.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic
  2016-01-07 22:35 [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
@ 2016-01-07 22:35 ` Martin KaFai Lau
  2016-01-07 22:35 ` [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-07 22:35 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: FB Kernel Team, Alexei Starovoitov, Ming Lei

It is a prep work for htab_percpu_map.

A similar fio test against /dev/nullb0 is done
while bcc/tools/biolatency is running.  It is to
make sure this change does not have regression
after the commit 688ecfe60220: ("bpf: hash: use per-bucket spinlock")
by "Ming Lei <tom.leiming@gmail.com>".

With biolatency running, the iops before and after this patch
is similar.  However, I cannot get as high iops as Ming did when
biolatency is not running.

With biolatency running, I get ~280k iops before
and after this patch.  Hence, no regression noticed.

Without biolatency running, I get ~330k iops.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Ming Lei <tom.leiming@gmail.com>
---
 kernel/bpf/hashtab.c | 138 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 99 insertions(+), 39 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c5b30fd..d55df8c 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -19,24 +19,47 @@ struct bucket {
 	raw_spinlock_t lock;
 };
 
+struct bpf_htab;
+typedef void (*flush_elems_fn)(struct bpf_htab *htab);
+
 struct bpf_htab {
 	struct bpf_map map;
 	struct bucket *buckets;
+	flush_elems_fn flush;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
+	u32 elem_key_offset; /* offset to the key */
 };
 
-/* each htab element is struct htab_elem + key + value */
-struct htab_elem {
+struct htab_elem_common {
 	struct hlist_node hash_node;
 	struct rcu_head rcu;
 	u32 hash;
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	struct htab_elem_common common;
 	char key[0] __aligned(8);
 };
 
-/* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static void *htab_elem_common_get_key(const struct bpf_htab *htab,
+				      struct htab_elem_common *l)
+{
+	return (void *)l + htab->elem_key_offset;
+}
+
+static struct htab_elem *htab_elem(struct htab_elem_common *l)
+{
+	return (struct htab_elem *)l;
+}
+
+static struct bpf_map *__htab_map_alloc(union bpf_attr *attr,
+					u32 elem_size,
+					u32 elem_value_size,
+					u32 elem_key_offset,
+					flush_elems_fn flush)
 {
 	struct bpf_htab *htab;
 	int err, i;
@@ -77,9 +100,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8) +
-			  htab->map.value_size;
+	htab->elem_size = elem_size;
+	htab->elem_key_offset = elem_key_offset;
+	htab->flush = flush;
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
@@ -87,13 +110,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		goto free_htab;
 
 	if ((u64) htab->n_buckets * sizeof(struct bucket) +
-	    (u64) htab->elem_size * htab->map.max_entries >=
+	    (u64) elem_value_size * htab->map.max_entries >=
 	    U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
 	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
-				   htab->elem_size * htab->map.max_entries,
+				   elem_value_size * htab->map.max_entries,
 				   PAGE_SIZE) >> PAGE_SHIFT;
 
 	err = -ENOMEM;
@@ -120,6 +143,19 @@ free_htab:
 	return ERR_PTR(err);
 }
 
+static void htab_map_flush(struct bpf_htab *htab);
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	u32 elem_size = sizeof(struct htab_elem) + round_up(attr->key_size, 8) +
+		attr->value_size;
+
+	return __htab_map_alloc(attr, elem_size, elem_size,
+				offsetof(struct htab_elem, key),
+				htab_map_flush);
+}
+
 static inline u32 htab_map_hash(const void *key, u32 key_len)
 {
 	return jhash(key, key_len, 0);
@@ -135,39 +171,48 @@ static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
 	return &__select_bucket(htab, hash)->head;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
-					 void *key, u32 key_size)
+static struct htab_elem_common *lookup_elem_raw(struct bpf_htab *htab,
+						struct hlist_head *head,
+						u32 hash, void *key)
 {
-	struct htab_elem *l;
+	struct htab_elem_common *l;
 
 	hlist_for_each_entry_rcu(l, head, hash_node)
-		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+		if (l->hash == hash &&
+		    !memcmp(htab_elem_common_get_key(htab, l),
+			    key, htab->map.key_size))
 			return l;
 
 	return NULL;
 }
 
 /* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static struct htab_elem_common *__htab_map_lookup_elem(struct bpf_htab *htab,
+						       void *key)
 {
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
-	struct htab_elem *l;
 	u32 hash, key_size;
 
 	/* Must be called with rcu_read_lock. */
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	key_size = map->key_size;
+	key_size = htab->map.key_size;
 
 	hash = htab_map_hash(key, key_size);
 
 	head = select_bucket(htab, hash);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	return lookup_elem_raw(htab, head, hash, key);
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
 
+	l = __htab_map_lookup_elem(htab, key);
 	if (l)
-		return l->key + round_up(map->key_size, 8);
+		return htab_elem(l)->key + round_up(map->key_size, 8);
 
 	return NULL;
 }
@@ -177,7 +222,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
-	struct htab_elem *l, *next_l;
+	struct htab_elem_common *l, *next_l;
 	u32 hash, key_size;
 	int i;
 
@@ -190,7 +235,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	head = select_bucket(htab, hash);
 
 	/* lookup the key */
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_elem_raw(htab, head, hash, key);
 
 	if (!l) {
 		i = 0;
@@ -199,11 +244,12 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 	/* key was found, get next key in the same bucket */
 	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
-				  struct htab_elem, hash_node);
+				  struct htab_elem_common, hash_node);
 
 	if (next_l) {
 		/* if next elem in this hash list is non-zero, just return it */
-		memcpy(next_key, next_l->key, key_size);
+		memcpy(next_key, htab_elem_common_get_key(htab, next_l),
+		       key_size);
 		return 0;
 	}
 
@@ -218,10 +264,11 @@ find_first_elem:
 
 		/* pick first element in the bucket */
 		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
-					  struct htab_elem, hash_node);
+					  struct htab_elem_common, hash_node);
 		if (next_l) {
 			/* if it's not empty, just return it */
-			memcpy(next_key, next_l->key, key_size);
+			memcpy(next_key, htab_elem_common_get_key(htab, next_l),
+			       key_size);
 			return 0;
 		}
 	}
@@ -230,6 +277,21 @@ find_first_elem:
 	return -ENOENT;
 }
 
+static struct htab_elem_common *htab_elem_common_alloc(struct bpf_htab *htab,
+							void *key)
+{
+	struct htab_elem_common *l;
+
+	l = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!l)
+		return NULL;
+
+	memcpy(htab_elem_common_get_key(htab, l), key, htab->map.key_size);
+	l->hash = htab_map_hash(key, htab->map.key_size);
+
+	return l;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 				u64 map_flags)
@@ -249,23 +311,21 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* allocate new element outside of lock */
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	l_new = (struct htab_elem *)htab_elem_common_alloc(htab, key);
 	if (!l_new)
 		return -ENOMEM;
 
 	key_size = map->key_size;
-
-	memcpy(l_new->key, key, key_size);
 	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
 
-	l_new->hash = htab_map_hash(l_new->key, key_size);
-	b = __select_bucket(htab, l_new->hash);
+	b = __select_bucket(htab, l_new->common.hash);
 	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+	l_old = (struct htab_elem *)lookup_elem_raw(htab, head,
+						    l_new->common.hash, key);
 
 	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
 		/* if elem with this 'key' doesn't exist and we've reached
@@ -290,10 +350,10 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that concurrent
 	 * search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_add_head_rcu(&l_new->common.hash_node, head);
 	if (l_old) {
-		hlist_del_rcu(&l_old->hash_node);
-		kfree_rcu(l_old, rcu);
+		hlist_del_rcu(&l_old->common.hash_node);
+		kfree_rcu(&l_old->common, rcu);
 	} else {
 		atomic_inc(&htab->count);
 	}
@@ -310,9 +370,9 @@ err:
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
 	struct hlist_head *head;
 	struct bucket *b;
-	struct htab_elem *l;
 	unsigned long flags;
 	u32 hash, key_size;
 	int ret = -ENOENT;
@@ -327,7 +387,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_elem_raw(htab, head, hash, key);
 
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
@@ -340,14 +400,14 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
-static void delete_all_elements(struct bpf_htab *htab)
+static void htab_map_flush(struct bpf_htab *htab)
 {
 	int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		struct hlist_head *head = select_bucket(htab, i);
 		struct hlist_node *n;
-		struct htab_elem *l;
+		struct htab_elem_common *l;
 
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
@@ -372,7 +432,7 @@ static void htab_map_free(struct bpf_map *map)
 	/* some of kfree_rcu() callbacks for elements of this map may not have
 	 * executed. It's ok. Proceed to free residual elements and map itself
 	 */
-	delete_all_elements(htab);
+	htab->flush(htab);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
-- 
2.5.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-07 22:35 [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
  2016-01-07 22:35 ` [PATCH net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic Martin KaFai Lau
@ 2016-01-07 22:35 ` Martin KaFai Lau
  2016-01-09 10:06   ` Ming Lei
  2016-01-09 10:33   ` Ming Lei
  2016-01-07 22:35 ` [PATCH net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key Martin KaFai Lau
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-07 22:35 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: FB Kernel Team, Alexei Starovoitov

This patch adds BPFMAP_TYPE_PERCPU_HASH map type and its
htab_map_ops implementation.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/hashtab.c     | 201 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8bed7f1..e4f8060 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	BPF_MAP_TYPE_PERCPU_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d55df8c..63f2945 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -278,7 +278,7 @@ find_first_elem:
 }
 
 static struct htab_elem_common *htab_elem_common_alloc(struct bpf_htab *htab,
-							void *key)
+						       void *key)
 {
 	struct htab_elem_common *l;
 
@@ -451,9 +451,208 @@ static struct bpf_map_type_list htab_type __read_mostly = {
 	.type = BPF_MAP_TYPE_HASH,
 };
 
+/* each htab_percpu_elem is struct htab_percpu_elem + key  */
+struct htab_percpu_elem {
+	struct htab_elem_common common;
+	void * __percpu value;
+	char key[0] __aligned(8);
+};
+
+static struct htab_percpu_elem *htab_percpu_elem(struct htab_elem_common *l)
+{
+	return (struct htab_percpu_elem *)l;
+}
+
+static void htab_percpu_elem_free(struct htab_percpu_elem *l)
+{
+	free_percpu(l->value);
+	kfree(l);
+}
+
+static void htab_percpu_elem_rcu_free(struct rcu_head *head)
+{
+	struct htab_elem_common *l = container_of(head,
+						  struct htab_elem_common,
+						  rcu);
+
+	htab_percpu_elem_free(htab_percpu_elem(l));
+}
+
+static void htab_percpu_map_flush(struct bpf_htab *htab)
+{
+	int i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem_common *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_del_rcu(&l->hash_node);
+			atomic_dec(&htab->count);
+			htab_percpu_elem_free(htab_percpu_elem(l));
+		}
+	}
+}
+
+/* Called from syscall */
+static struct bpf_map *htab_percpu_map_alloc(union bpf_attr *attr)
+{
+	u32 elem_size = sizeof(struct htab_percpu_elem) +
+		round_up(attr->key_size, 8);
+	u32 elem_value_size = elem_size +
+		num_possible_cpus() * attr->value_size;
+
+	return __htab_map_alloc(attr, elem_size, elem_value_size,
+				offsetof(struct htab_percpu_elem, key),
+				htab_percpu_map_flush);
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_percpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
+	struct hlist_head *head;
+	unsigned long flags;
+	u32 hash, key_size;
+	struct bucket *b;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l = lookup_elem_raw(htab, head, hash, key);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		atomic_dec(&htab->count);
+		call_rcu(&l->rcu, htab_percpu_elem_rcu_free);
+		ret = 0;
+	}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	return ret;
+}
+
+/* Called from syscall or eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
+
+	l = __htab_map_lookup_elem(htab, key);
+	if (l) {
+		void *value = per_cpu_ptr(htab_percpu_elem(l)->value,
+					  smp_processor_id());
+		return value;
+	}
+
+	return NULL;
+
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_percpu_elem *l_new, *l_old;
+	struct hlist_head *head;
+	struct bucket *b;
+	unsigned long flags;
+	int ret;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* allocate new element outside of lock */
+	l_new = htab_percpu_elem(htab_elem_common_alloc(htab, key));
+	if (!l_new)
+		return -ENOMEM;
+
+	l_new->value = __alloc_percpu_gfp(htab->map.value_size, 8,
+					  GFP_ATOMIC | __GFP_NOWARN);
+	if (!l_new->value) {
+		htab_percpu_elem_free(l_new);
+		return -ENOMEM;
+	}
+
+	memcpy(raw_cpu_ptr(l_new->value), value, map->value_size);
+
+	b = __select_bucket(htab, l_new->common.hash);
+	head = &b->head;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l_old = htab_percpu_elem(lookup_elem_raw(htab, head, l_new->common.hash,
+						 key));
+
+	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		ret = -E2BIG;
+		goto err;
+	}
+
+	if (l_old && map_flags == BPF_NOEXIST) {
+		/* elem already exists */
+		ret = -EEXIST;
+		goto err;
+	}
+
+	if (!l_old && map_flags == BPF_EXIST) {
+		/* elem doesn't exist, cannot update it */
+		ret = -ENOENT;
+		goto err;
+	}
+
+	if (l_old) {
+		memcpy(this_cpu_ptr(l_old->value), value, map->value_size);
+	} else {
+		hlist_add_head_rcu(&l_new->common.hash_node, head);
+		atomic_inc(&htab->count);
+	}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+
+	return 0;
+err:
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_percpu_elem_free(l_new);
+	return ret;
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+	.map_alloc = htab_percpu_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_update_elem = htab_percpu_map_update_elem,
+	.map_delete_elem = htab_percpu_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+	.ops = &htab_percpu_ops,
+	.type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
 static int __init register_htab_map(void)
 {
 	bpf_register_map_type(&htab_type);
+	bpf_register_map_type(&htab_percpu_type);
 	return 0;
 }
 late_initcall(register_htab_map);
-- 
2.5.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-07 22:35 [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
  2016-01-07 22:35 ` [PATCH net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic Martin KaFai Lau
  2016-01-07 22:35 ` [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
@ 2016-01-07 22:35 ` Martin KaFai Lau
  2016-01-07 22:35 ` [PATCH net-next 4/4] bpf: bpf_htab: Test for BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
  2016-01-08  6:55 ` [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Ming Lei
  4 siblings, 0 replies; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-07 22:35 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: FB Kernel Team, Alexei Starovoitov

Add map_lookup_percpu_elem() syscall to lookup value
of a particular CPU.

The user is expected to loop through all CPU values by:
for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {
	if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu))
		total_value += value;
}

* If the cpu is offline, errno == ENXIO
* If the key is deleted before the iteration is done,
  errno == ENOENT.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e4f8060..96ce561 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,7 @@ enum bpf_cmd {
 	BPF_PROG_LOAD,
 	BPF_OBJ_PIN,
 	BPF_OBJ_GET,
+	BPF_MAP_LOOKUP_PERCPU_ELEM,
 };
 
 enum bpf_map_type {
@@ -115,6 +116,7 @@ union bpf_attr {
 			__aligned_u64 next_key;
 		};
 		__u64		flags;
+		__u32		cpu;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6373970..ba1172b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -289,6 +289,96 @@ err_put:
 	return err;
 }
 
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu
+
+struct percpu_map_value_info {
+	struct bpf_map *map;
+	void *key;
+	void *value;
+	bool found;
+};
+
+static void percpu_map_lookup_value(void *i)
+{
+	struct percpu_map_value_info *info = (struct percpu_map_value_info *)i;
+	struct bpf_map *map = info->map;
+	void *ptr;
+
+	rcu_read_lock();
+	ptr = map->ops->map_lookup_elem(map, info->key);
+	if (ptr) {
+		memcpy(info->value, ptr, map->value_size);
+		info->found = true;
+	} else {
+		info->found = false;
+	}
+	rcu_read_unlock();
+}
+
+static int map_lookup_percpu_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_ptr(attr->key);
+	void __user *uvalue = u64_to_ptr(attr->value);
+	u32 __user ucpu = attr->cpu;
+	int ufd = attr->map_fd;
+	struct percpu_map_value_info value_info;
+	struct bpf_map *map;
+	void *key, *value;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
+	    ucpu >= num_possible_cpus())
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	value_info.map = map;
+	value_info.key = key;
+	value_info.value = value;
+
+	err = smp_call_function_single(ucpu, percpu_map_lookup_value,
+				       &value_info, 1);
+	if (err)
+		goto free_value;
+
+	err = -ENOENT;
+	if (!value_info.found)
+		goto free_value;
+
+	err = -EFAULT;
+	if (copy_to_user(uvalue, value, map->value_size) != 0)
+		goto free_value;
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
@@ -792,6 +882,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
+	case BPF_MAP_LOOKUP_PERCPU_ELEM:
+		err = map_lookup_percpu_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
2.5.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH net-next 4/4] bpf: bpf_htab: Test for BPF_MAP_TYPE_PERCPU_HASH
  2016-01-07 22:35 [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
                   ` (2 preceding siblings ...)
  2016-01-07 22:35 ` [PATCH net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key Martin KaFai Lau
@ 2016-01-07 22:35 ` Martin KaFai Lau
  2016-01-08  6:55 ` [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Ming Lei
  4 siblings, 0 replies; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-07 22:35 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: FB Kernel Team, Alexei Starovoitov

A sanity test for BPF_MAP_TYPE_PERCPU_HASH.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 samples/bpf/libbpf.c    |  13 ++++++
 samples/bpf/libbpf.h    |   1 +
 samples/bpf/test_maps.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 126 insertions(+)

diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 65a8d48..a9c5e61 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -54,6 +54,19 @@ int bpf_lookup_elem(int fd, void *key, void *value)
 	return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 }
 
+int bpf_lookup_percpu_elem(int fd, void *key, void *value, unsigned int cpu)
+{
+	union bpf_attr attr = {
+		.map_fd = fd,
+		.key = ptr_to_u64(key),
+		.value = ptr_to_u64(value),
+		.cpu = cpu,
+	};
+
+	return syscall(__NR_bpf, BPF_MAP_LOOKUP_PERCPU_ELEM, &attr,
+		       sizeof(attr));
+}
+
 int bpf_delete_elem(int fd, void *key)
 {
 	union bpf_attr attr = {
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 014aacf..7759517 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -8,6 +8,7 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 		   int max_entries);
 int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
 int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_lookup_percpu_elem(int fd, void *key, void *value, unsigned int cpu);
 int bpf_delete_elem(int fd, void *key);
 int bpf_get_next_key(int fd, void *key, void *next_key);
 
diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
index 6299ee9..d0638aa 100644
--- a/samples/bpf/test_maps.c
+++ b/samples/bpf/test_maps.c
@@ -89,6 +89,117 @@ static void test_hashmap_sanity(int i, void *data)
 	close(map_fd);
 }
 
+/* sanity tests for percpu map API */
+static void test_percpu_hashmap_sanity(int i, void *data)
+{
+	long long key, next_key, value;
+	int expected_key_mask = 0;
+	unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	int map_fd;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key),
+				sizeof(value), 2);
+	if (map_fd < 0) {
+		printf("failed to create hashmap '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	key = 1;
+	value = 1234;
+	/* insert key=1 element */
+	assert(!(expected_key_mask & key));
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+	expected_key_mask |= key;
+
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       /* key=1 already exists */
+	       errno == EEXIST);
+
+	/* -1 is an invalid flag */
+	assert(bpf_update_elem(map_fd, &key, &value, -1) == -1
+	       && errno == EINVAL);
+
+	/* check that key=1 can be found. value could be 0 if the lookup
+	 * was run from a different cpu.
+	 */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 &&
+	       (value == 0 || value == 1234));
+
+	key = 2;
+	/* check that key=2 is not found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	/* insert key=2 element */
+	assert(!(expected_key_mask & key));
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+	expected_key_mask |= key;
+
+	/* key=1 and key=2 were inserted, check that key=0 cannot be inserted
+	 * due to max_entries limit
+	 */
+	key = 0;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1
+	       && errno == E2BIG);
+
+	/* check that key = 0 doesn't exist */
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	/* iterate over two elements */
+	while (!bpf_get_next_key(map_fd, &key, &next_key)) {
+		unsigned int cpu;
+		int found;
+
+		assert((expected_key_mask & next_key) == next_key);
+		expected_key_mask &= ~next_key;
+
+		found = 0;
+		for (cpu = 0; cpu < nr_cpus; cpu++) {
+			if (!bpf_lookup_percpu_elem(map_fd, &next_key,
+						    &value, cpu)) {
+				if (value == 1234) {
+					assert(!found);
+					found = 1;
+				} else {
+					assert(value == 0);
+				}
+			} else {
+				assert(errno == ENXIO);
+			}
+		}
+		assert(found);
+
+		key = next_key;
+	}
+	assert(errno == ENOENT);
+
+	/* Look up the value with an invalid CPU */
+	key = 1;
+	assert(bpf_lookup_percpu_elem(map_fd, &key, &value, nr_cpus) == -1 && errno == EINVAL);
+
+	/* Update with BPF_EXIST */
+	key = 1;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == 0);
+
+	/* delete both elements */
+	key = 1;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	key = 2;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	key = 0;
+	/* check that map is empty */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == -1 &&
+	       errno == ENOENT);
+	close(map_fd);
+}
+
 static void test_arraymap_sanity(int i, void *data)
 {
 	int key, next_key, map_fd;
@@ -282,6 +393,7 @@ static void test_map_parallel(void)
 int main(void)
 {
 	test_hashmap_sanity(0, NULL);
+	test_percpu_hashmap_sanity(0, NULL);
 	test_arraymap_sanity(0, NULL);
 	test_map_large();
 	test_map_parallel();
-- 
2.5.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-07 22:35 [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
                   ` (3 preceding siblings ...)
  2016-01-07 22:35 ` [PATCH net-next 4/4] bpf: bpf_htab: Test for BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
@ 2016-01-08  6:55 ` Ming Lei
  2016-01-09  0:44   ` Martin KaFai Lau
  4 siblings, 1 reply; 17+ messages in thread
From: Ming Lei @ 2016-01-08  6:55 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

Hi Martin,

On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> This patchset adds BPF_MAP_TYPE_PERCPU_HASH map type which allows
> percpu value.

I am also thinking about using percpu variable to ebpf map, but IMO it
should be better for ARRAY map instead of HASH map, then we can
avoid the atomic op in eBPF program, see example of tracex3, sockex1
and sockex3 in sample/bpf/ of kernel tree.  Also looks the ARRAY map
usage in bcc is wrong, strictly speaking.

For HASH map, it is easy to make cpu id as part of key, then the map
can be thought as percpu too, and atomic op isn't needed in eBPF program.

> BPF + kprobe is very useful in statistics collection.  In particular,
> bpf is strong in doing aggregation within the kernel instead of
> outputting a lot of raw samples to the userspace.

Exactly, :-)

>
> In some cases, bumping a counter/value of a particular key will have
> noticeable impact.  For example, doing statistics collection
> on received packets and aggregating them by network
> prefix (like /64 in IPv6).  Having a percpu value can help.

Given it is always related with performance, could you provide some data
about the improvement? Also you can compare this patchset with the
approach of providing cpu id as hash key.


-- 
Ming Lei

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-08  6:55 ` [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Ming Lei
@ 2016-01-09  0:44   ` Martin KaFai Lau
  2016-01-09  9:39     ` Ming Lei
  0 siblings, 1 reply; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-09  0:44 UTC (permalink / raw)
  To: Ming Lei
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

On Fri, Jan 08, 2016 at 02:55:32PM +0800, Ming Lei wrote:
> On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> > This patchset adds BPF_MAP_TYPE_PERCPU_HASH map type which allows
> > percpu value.
>
> I am also thinking about using percpu variable to ebpf map, but IMO it
> should be better for ARRAY map instead of HASH map, then we can
> avoid the atomic op in eBPF program, see example of tracex3, sockex1
> and sockex3 in sample/bpf/ of kernel tree.  Also looks the ARRAY map
> usage in bcc is wrong, strictly speaking.
array and hash are two different use cases. May be we should have percpu
value for array map too.

>
> For HASH map, it is easy to make cpu id as part of key, then the map
> can be thought as percpu too, and atomic op isn't needed in eBPF program.
Putting the cpu id as part of the key was indeed the first hack I did
to get a sense of potential benefit.

However, by extending the real-key with cpu-id, it is not intuitive to
use and it is prone to error.  For example, how to delete a real-key for
all cpus?  Iterating a particular real-key for all cpu is also tricky.  What
does it mean if a real-key exists for cpu#0 but not cpu#1? The real-key
got deleted from all cpu while iterating? or something else?  I believe
there are ways to get around but it is better to provide a clean
implementation instead.

> Given it is always related with performance, could you provide some data
> about the improvement? Also you can compare this patchset with the
> approach of providing cpu id as hash key.
In my test (bpf+kprobe at tcp_rcv_established()), both this patchset and
extend(real_key, cpu_id) approach save ~3% CPU while receiving ~4Mpps
in a 40cores machine.  The bpf is mostly bumping some counters for
each received packet.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-09  0:44   ` Martin KaFai Lau
@ 2016-01-09  9:39     ` Ming Lei
  2016-01-10  2:30       ` Martin KaFai Lau
  0 siblings, 1 reply; 17+ messages in thread
From: Ming Lei @ 2016-01-09  9:39 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

Hi Martin,

On Sat, Jan 9, 2016 at 8:44 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> On Fri, Jan 08, 2016 at 02:55:32PM +0800, Ming Lei wrote:
>> On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
>> > This patchset adds BPF_MAP_TYPE_PERCPU_HASH map type which allows
>> > percpu value.
>>
>> I am also thinking about using percpu variable to ebpf map, but IMO it
>> should be better for ARRAY map instead of HASH map, then we can
>> avoid the atomic op in eBPF program, see example of tracex3, sockex1
>> and sockex3 in sample/bpf/ of kernel tree.  Also looks the ARRAY map
>> usage in bcc is wrong, strictly speaking.
> array and hash are two different use cases. May be we should have percpu
> value for array map too.

Atomic operations are definitely expensive, which should have been avoided
in eBPF prog by percpu way. I think ARRAY map does need percpu way more, :-)

But for hash map, I am still not 100% sure if percpu is needed:

- key plus cpu_id is ok
- key often can be chosed so that the mapped value is accessed without
race(such as, req is used as key in biolatency, ...)
- percpu hash can cause huge memory consumption compared with the way
of key plus cpu_id without obvious performance advantage

>
>>
>> For HASH map, it is easy to make cpu id as part of key, then the map
>> can be thought as percpu too, and atomic op isn't needed in eBPF program.
> Putting the cpu id as part of the key was indeed the first hack I did
> to get a sense of potential benefit.
>
> However, by extending the real-key with cpu-id, it is not intuitive to
> use and it is prone to error.  For example, how to delete a real-key for
> all cpus?  Iterating a particular real-key for all cpu is also tricky.  What
> does it mean if a real-key exists for cpu#0 but not cpu#1? The real-key

lookup returns NULL if the key(real key plus cpu id) doesn't exist.

> got deleted from all cpu while iterating? or something else?  I believe
> there are ways to get around but it is better to provide a clean
> implementation instead.

It might be true, but I hope there is one real example, in which we can
see the obvious disadvantage of real key plus cpu_id. In the other way,
we still can improve current hash map to make this case easier to use.

Also in reality, it is often true that the mapped value from chosed key
can't be accessed concurrently from other CPUs, so percpu isn't needed.

>
>> Given it is always related with performance, could you provide some data
>> about the improvement? Also you can compare this patchset with the
>> approach of providing cpu id as hash key.
> In my test (bpf+kprobe at tcp_rcv_established()), both this patchset and
> extend(real_key, cpu_id) approach save ~3% CPU while receiving ~4Mpps
> in a 40cores machine.  The bpf is mostly bumping some counters for
> each received packet.

That sounds not a good result, 40X memory consumption only save %3 CPU,
and the approach(real_key, cpu_id) still can save %3 without extra memory
consumption(or much less than percpu MAP)


-- 
Ming Lei

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-07 22:35 ` [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
@ 2016-01-09 10:06   ` Ming Lei
  2016-01-12  3:11     ` Martin KaFai Lau
  2016-01-09 10:33   ` Ming Lei
  1 sibling, 1 reply; 17+ messages in thread
From: Ming Lei @ 2016-01-09 10:06 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> This patch adds BPFMAP_TYPE_PERCPU_HASH map type and its
> htab_map_ops implementation.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/uapi/linux/bpf.h |   1 +
>  kernel/bpf/hashtab.c     | 201 ++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 201 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 8bed7f1..e4f8060 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -81,6 +81,7 @@ enum bpf_map_type {
>         BPF_MAP_TYPE_ARRAY,
>         BPF_MAP_TYPE_PROG_ARRAY,
>         BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> +       BPF_MAP_TYPE_PERCPU_HASH,
>  };
>
>  enum bpf_prog_type {
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index d55df8c..63f2945 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -278,7 +278,7 @@ find_first_elem:
>  }
>
>  static struct htab_elem_common *htab_elem_common_alloc(struct bpf_htab *htab,
> -                                                       void *key)
> +                                                      void *key)

better to not introduce the above change.

>  {
>         struct htab_elem_common *l;
>
> @@ -451,9 +451,208 @@ static struct bpf_map_type_list htab_type __read_mostly = {
>         .type = BPF_MAP_TYPE_HASH,
>  };
>
> +/* each htab_percpu_elem is struct htab_percpu_elem + key  */
> +struct htab_percpu_elem {
> +       struct htab_elem_common common;
> +       void * __percpu value;
> +       char key[0] __aligned(8);
> +};
> +
> +static struct htab_percpu_elem *htab_percpu_elem(struct htab_elem_common *l)
> +{
> +       return (struct htab_percpu_elem *)l;
> +}
> +
> +static void htab_percpu_elem_free(struct htab_percpu_elem *l)
> +{
> +       free_percpu(l->value);
> +       kfree(l);
> +}
> +
> +static void htab_percpu_elem_rcu_free(struct rcu_head *head)
> +{
> +       struct htab_elem_common *l = container_of(head,
> +                                                 struct htab_elem_common,
> +                                                 rcu);
> +
> +       htab_percpu_elem_free(htab_percpu_elem(l));
> +}
> +
> +static void htab_percpu_map_flush(struct bpf_htab *htab)
> +{
> +       int i;
> +
> +       for (i = 0; i < htab->n_buckets; i++) {
> +               struct hlist_head *head = select_bucket(htab, i);
> +               struct hlist_node *n;
> +               struct htab_elem_common *l;
> +
> +               hlist_for_each_entry_safe(l, n, head, hash_node) {
> +                       hlist_del_rcu(&l->hash_node);
> +                       atomic_dec(&htab->count);
> +                       htab_percpu_elem_free(htab_percpu_elem(l));
> +               }
> +       }
> +}

The above helper should have been saved by introduce percpu_map
flag in bpf_htab.

> +
> +/* Called from syscall */
> +static struct bpf_map *htab_percpu_map_alloc(union bpf_attr *attr)
> +{
> +       u32 elem_size = sizeof(struct htab_percpu_elem) +
> +               round_up(attr->key_size, 8);
> +       u32 elem_value_size = elem_size +
> +               num_possible_cpus() * attr->value_size;
> +
> +       return __htab_map_alloc(attr, elem_size, elem_value_size,
> +                               offsetof(struct htab_percpu_elem, key),
> +                               htab_percpu_map_flush);
> +}
> +
> +/* Called from syscall or from eBPF program */
> +static int htab_percpu_map_delete_elem(struct bpf_map *map, void *key)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       struct htab_elem_common *l;
> +       struct hlist_head *head;
> +       unsigned long flags;
> +       u32 hash, key_size;
> +       struct bucket *b;
> +       int ret = -ENOENT;
> +
> +       WARN_ON_ONCE(!rcu_read_lock_held());
> +
> +       key_size = map->key_size;
> +
> +       hash = htab_map_hash(key, key_size);
> +       b = __select_bucket(htab, hash);
> +       head = &b->head;
> +
> +       raw_spin_lock_irqsave(&b->lock, flags);
> +
> +       l = lookup_elem_raw(htab, head, hash, key);
> +
> +       if (l) {
> +               hlist_del_rcu(&l->hash_node);
> +               atomic_dec(&htab->count);
> +               call_rcu(&l->rcu, htab_percpu_elem_rcu_free);
> +               ret = 0;
> +       }
> +
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       return ret;
> +}
> +
> +/* Called from syscall or eBPF program */
> +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       struct htab_elem_common *l;
> +
> +       l = __htab_map_lookup_elem(htab, key);
> +       if (l) {
> +               void *value = per_cpu_ptr(htab_percpu_elem(l)->value,
> +                                         smp_processor_id());
> +               return value;
> +       }
> +
> +       return NULL;
> +
> +}
> +
> +/* Called from syscall or from eBPF program */
> +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
> +                                      void *value, u64 map_flags)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       struct htab_percpu_elem *l_new, *l_old;
> +       struct hlist_head *head;
> +       struct bucket *b;
> +       unsigned long flags;
> +       int ret;
> +
> +       if (map_flags > BPF_EXIST)
> +               /* unknown flags */
> +               return -EINVAL;
> +
> +       WARN_ON_ONCE(!rcu_read_lock_held());
> +
> +       /* allocate new element outside of lock */
> +       l_new = htab_percpu_elem(htab_elem_common_alloc(htab, key));
> +       if (!l_new)
> +               return -ENOMEM;
> +
> +       l_new->value = __alloc_percpu_gfp(htab->map.value_size, 8,
> +                                         GFP_ATOMIC | __GFP_NOWARN);

Looks not good to introduce another memory allocation in eBPF prog,
it is a bit heavy to run, and it is in my TODO list to remove allocation of
htab_elem in eBPF prog.

> +       if (!l_new->value) {
> +               htab_percpu_elem_free(l_new);
> +               return -ENOMEM;
> +       }
> +
> +       memcpy(raw_cpu_ptr(l_new->value), value, map->value_size);
> +
> +       b = __select_bucket(htab, l_new->common.hash);
> +       head = &b->head;
> +
> +       /* bpf_map_update_elem() can be called in_irq() */
> +       raw_spin_lock_irqsave(&b->lock, flags);
> +
> +       l_old = htab_percpu_elem(lookup_elem_raw(htab, head, l_new->common.hash,
> +                                                key));
> +
> +       if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
> +               /* if elem with this 'key' doesn't exist and we've reached
> +                * max_entries limit, fail insertion of new elem
> +                */
> +               ret = -E2BIG;
> +               goto err;
> +       }
> +
> +       if (l_old && map_flags == BPF_NOEXIST) {
> +               /* elem already exists */
> +               ret = -EEXIST;
> +               goto err;
> +       }
> +
> +       if (!l_old && map_flags == BPF_EXIST) {
> +               /* elem doesn't exist, cannot update it */
> +               ret = -ENOENT;
> +               goto err;
> +       }
> +
> +       if (l_old) {
> +               memcpy(this_cpu_ptr(l_old->value), value, map->value_size);
> +       } else {
> +               hlist_add_head_rcu(&l_new->common.hash_node, head);
> +               atomic_inc(&htab->count);
> +       }
> +
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +
> +       return 0;
> +err:
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       htab_percpu_elem_free(l_new);
> +       return ret;
> +}

It isn't good to introduce so much code duplication.

> +
> +static const struct bpf_map_ops htab_percpu_ops = {
> +       .map_alloc = htab_percpu_map_alloc,
> +       .map_free = htab_map_free,
> +       .map_get_next_key = htab_map_get_next_key,
> +       .map_lookup_elem = htab_percpu_map_lookup_elem,
> +       .map_update_elem = htab_percpu_map_update_elem,
> +       .map_delete_elem = htab_percpu_map_delete_elem,
> +};
> +
> +static struct bpf_map_type_list htab_percpu_type __read_mostly = {
> +       .ops = &htab_percpu_ops,
> +       .type = BPF_MAP_TYPE_PERCPU_HASH,
> +};
> +
>  static int __init register_htab_map(void)
>  {
>         bpf_register_map_type(&htab_type);
> +       bpf_register_map_type(&htab_percpu_type);
>         return 0;
>  }
>  late_initcall(register_htab_map);
> --
> 2.5.1
>



-- 
Ming Lei

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-07 22:35 ` [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
  2016-01-09 10:06   ` Ming Lei
@ 2016-01-09 10:33   ` Ming Lei
  1 sibling, 0 replies; 17+ messages in thread
From: Ming Lei @ 2016-01-09 10:33 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> This patch adds BPFMAP_TYPE_PERCPU_HASH map type and its
> htab_map_ops implementation.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/uapi/linux/bpf.h |   1 +
>  kernel/bpf/hashtab.c     | 201 ++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 201 insertions(+), 1 deletion(-)
...
> +
> +/* Called from syscall or eBPF program */
> +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       struct htab_elem_common *l;
> +
> +       l = __htab_map_lookup_elem(htab, key);
> +       if (l) {
> +               void *value = per_cpu_ptr(htab_percpu_elem(l)->value,
> +                                         smp_processor_id());

Then it isn't possible to retrieve the value of one key from all CPUs in eBPF
prog, and that is definitely not flexible because update/delete element can
be run from eBPF prog.

And that is not a thing if using real_key and cpu_id, isn't it?

Thanks,

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-09  9:39     ` Ming Lei
@ 2016-01-10  2:30       ` Martin KaFai Lau
  2016-01-11  2:20         ` Ming Lei
  0 siblings, 1 reply; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-10  2:30 UTC (permalink / raw)
  To: Ming Lei
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

Hi Ming,

On Sat, Jan 09, 2016 at 05:39:16PM +0800, Ming Lei wrote:
> On Sat, Jan 9, 2016 at 8:44 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> > On Fri, Jan 08, 2016 at 02:55:32PM +0800, Ming Lei wrote:
> >> On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> >> > This patchset adds BPF_MAP_TYPE_PERCPU_HASH map type which allows
> >> > percpu value.
> >>
> >> I am also thinking about using percpu variable to ebpf map, but IMO it
> >> should be better for ARRAY map instead of HASH map, then we can
> >> avoid the atomic op in eBPF program, see example of tracex3, sockex1
> >> and sockex3 in sample/bpf/ of kernel tree.  Also looks the ARRAY map
> >> usage in bcc is wrong, strictly speaking.
> > array and hash are two different use cases. May be we should have percpu
> > value for array map too.
>
> Atomic operations are definitely expensive, which should have been avoided
> in eBPF prog by percpu way. I think ARRAY map does need percpu way more, :-)
>
> But for hash map, I am still not 100% sure if percpu is needed:
>
> - key plus cpu_id is ok
> - key often can be chosed so that the mapped value is accessed without
> race(such as, req is used as key in biolatency, ...)
> - percpu hash can cause huge memory consumption compared with the way
> of key plus cpu_id without obvious performance advantage
If the bpf prog uses the percpu map type, the expectation
is the bpf is tapping some kprobes that most cpus (if not all) will access
the same key.  I would argue that having a bigger key (due to cpu_id),
and 40 more keys (and linking to 40 buckets) is less efficient.

If the expectation is to have a small numbers of cpu accessing a key (like
biolatency), the bpf prog can stay with the regular bpf map.  The new
percpu map has almost the same interface (except the syscall added in
Patch 3).  Switching between the two map types is easy.

> >> For HASH map, it is easy to make cpu id as part of key, then the map
> >> can be thought as percpu too, and atomic op isn't needed in eBPF program.
> > Putting the cpu id as part of the key was indeed the first hack I did
> > to get a sense of potential benefit.
> >
> > However, by extending the real-key with cpu-id, it is not intuitive to
> > use and it is prone to error.  For example, how to delete a real-key for
> > all cpus?  Iterating a particular real-key for all cpu is also tricky.  What
> > does it mean if a real-key exists for cpu#0 but not cpu#1? The real-key
>
> lookup returns NULL if the key(real key plus cpu id) doesn't exist.
>
> > got deleted from all cpu while iterating? or something else?  I believe
> > there are ways to get around but it is better to provide a clean
> > implementation instead.
>
> It might be true, but I hope there is one real example, in which we can
> see the obvious disadvantage of real key plus cpu_id. In the other way,
> we still can improve current hash map to make this case easier to use.
By extend(real_key, cpu_id), the uniqueness property of a real_key (and
it should be as a key for a hashmap) is getting blurry and it is not
ideal.

The typical use case is for doing a lot of aggregation (which bpf+kprobe
is strong at) on a real_key which will exist for a reasonable long period
of time.  For example, I would like to do some packet statistics on each
IPv6 /64 prefix.  The userspace will iterate the /64 prefix (real_key)
and sum up all values (of each cpu) before reporting for a particular
prefix.

When the userspace prog can find extend(real_key, cpu0) but not
extend(real_key, cpu1).  What does it mean?
It means the real_key has been deleted from all cpu and probably
not making sense to continue reporting?
or cpu1 has not inserted the real_key yet and I should
continue with cpu2?

How to delete a real_key from all cpu? A loop to iterate all
extend(real_key, cpu_id)?

> That sounds not a good result, 40X memory consumption only save %3 CPU,
> and the approach(real_key, cpu_id) still can save %3 without extra memory
> consumption(or much less than percpu MAP)
Another way to look at it, the bpf prog costs 6% CPU and saving 3%
with percpu implemenation.

Thanks,
-- Martin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-10  2:30       ` Martin KaFai Lau
@ 2016-01-11  2:20         ` Ming Lei
  2016-01-11 22:35           ` Martin KaFai Lau
  0 siblings, 1 reply; 17+ messages in thread
From: Ming Lei @ 2016-01-11  2:20 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

Hi Martin,

On Sun, Jan 10, 2016 at 10:30 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> Hi Ming,
>
> On Sat, Jan 09, 2016 at 05:39:16PM +0800, Ming Lei wrote:
>> On Sat, Jan 9, 2016 at 8:44 AM, Martin KaFai Lau <kafai@fb.com> wrote:
>> > On Fri, Jan 08, 2016 at 02:55:32PM +0800, Ming Lei wrote:
>> >> On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
>> >> > This patchset adds BPF_MAP_TYPE_PERCPU_HASH map type which allows
>> >> > percpu value.
>> >>
>> >> I am also thinking about using percpu variable to ebpf map, but IMO it
>> >> should be better for ARRAY map instead of HASH map, then we can
>> >> avoid the atomic op in eBPF program, see example of tracex3, sockex1
>> >> and sockex3 in sample/bpf/ of kernel tree.  Also looks the ARRAY map
>> >> usage in bcc is wrong, strictly speaking.
>> > array and hash are two different use cases. May be we should have percpu
>> > value for array map too.
>>
>> Atomic operations are definitely expensive, which should have been avoided
>> in eBPF prog by percpu way. I think ARRAY map does need percpu way more, :-)
>>
>> But for hash map, I am still not 100% sure if percpu is needed:
>>
>> - key plus cpu_id is ok
>> - key often can be chosed so that the mapped value is accessed without
>> race(such as, req is used as key in biolatency, ...)
>> - percpu hash can cause huge memory consumption compared with the way
>> of key plus cpu_id without obvious performance advantage
> If the bpf prog uses the percpu map type, the expectation
> is the bpf is tapping some kprobes that most cpus (if not all) will access
> the same key.  I would argue that having a bigger key (due to cpu_id),

I think you mean the value mapped from the key can be accessed
concurrently from most cpus.

> and 40 more keys (and linking to 40 buckets) is less efficient.

Suppose the key is choosed in this way and may be accessed concurrently
from most cpus, the only extra loading for (real_key, cpu_id) is in memcmp(),
because key size is increased by sizeof(cpu_id), is it really a big deal?

Also in your percpu patch, you add one extra alloc_percpu_gfp(), do you
think that it is cheaper than the extra compasion on cpu_id in memcmp()?

>
> If the expectation is to have a small numbers of cpu accessing a key (like
> biolatency), the bpf prog can stay with the regular bpf map.  The new
> percpu map has almost the same interface (except the syscall added in
> Patch 3).

> Switching between the two map types is easy.

No, it isn't easy with your patchset, since we can't look up
one key in one specific CPU from eBPF prog, not mention you
introduce extra syscall.

>
>> >> For HASH map, it is easy to make cpu id as part of key, then the map
>> >> can be thought as percpu too, and atomic op isn't needed in eBPF program.
>> > Putting the cpu id as part of the key was indeed the first hack I did
>> > to get a sense of potential benefit.
>> >
>> > However, by extending the real-key with cpu-id, it is not intuitive to
>> > use and it is prone to error.  For example, how to delete a real-key for
>> > all cpus?  Iterating a particular real-key for all cpu is also tricky.  What
>> > does it mean if a real-key exists for cpu#0 but not cpu#1? The real-key
>>
>> lookup returns NULL if the key(real key plus cpu id) doesn't exist.
>>
>> > got deleted from all cpu while iterating? or something else?  I believe
>> > there are ways to get around but it is better to provide a clean
>> > implementation instead.
>>
>> It might be true, but I hope there is one real example, in which we can
>> see the obvious disadvantage of real key plus cpu_id. In the other way,
>> we still can improve current hash map to make this case easier to use.
> By extend(real_key, cpu_id), the uniqueness property of a real_key (and
> it should be as a key for a hashmap) is getting blurry and it is not
> ideal.
>
> The typical use case is for doing a lot of aggregation (which bpf+kprobe
> is strong at) on a real_key which will exist for a reasonable long period
> of time.  For example, I would like to do some packet statistics on each
> IPv6 /64 prefix.  The userspace will iterate the /64 prefix (real_key)
> and sum up all values (of each cpu) before reporting for a particular
> prefix.

Is there any script or ebpf sample code for this case? I appreciate much
we can talk with code.

>
> When the userspace prog can find extend(real_key, cpu0) but not
> extend(real_key, cpu1).  What does it mean?
> It means the real_key has been deleted from all cpu and probably
> not making sense to continue reporting?
> or cpu1 has not inserted the real_key yet and I should
> continue with cpu2?

I am wondering why your above concern can't be covered by
the following code?

        key.key = KEY;
        key.cpu_id = smp_processor_id();
        value = bpf_map_lookup_elem(&hash_map, &key);
        if (value) {
                *value->packets = *value->packets + 1;
        } else {
                val.packet = 1;
                bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
        }

>
> How to delete a real_key from all cpu? A loop to iterate all
> extend(real_key, cpu_id)?

Yes.

>
>> That sounds not a good result, 40X memory consumption only save %3 CPU,
>> and the approach(real_key, cpu_id) still can save %3 without extra memory
>> consumption(or much less than percpu MAP)
> Another way to look at it, the bpf prog costs 6% CPU and saving 3%
> with percpu implemenation.

Sorry, I am a bit confused why you say 6% saving, and could explain a bit?
Let's see your previous post:

> In my test (bpf+kprobe at tcp_rcv_established()), both this patchset and
> extend(real_key, cpu_id) approach save ~3% CPU while receiving ~4Mpps
> in a 40cores machine.



-- 
Ming Lei

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-11  2:20         ` Ming Lei
@ 2016-01-11 22:35           ` Martin KaFai Lau
  2016-01-12  5:48             ` Ming Lei
  0 siblings, 1 reply; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-11 22:35 UTC (permalink / raw)
  To: Ming Lei
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

Hi Ming,

On Mon, Jan 11, 2016 at 10:20:55AM +0800, Ming Lei wrote:
> >> - percpu hash can cause huge memory consumption compared with the way
                                  ~~~~~~~~~~~~~~~~~~
> >> of key plus cpu_id without obvious performance advantage
> > If the bpf prog uses the percpu map type, the expectation
> > is the bpf is tapping some kprobes that most cpus (if not all) will access
> > the same key.  I would argue that having a bigger key (due to cpu_id),
>
> I think you mean the value mapped from the key can be accessed
> concurrently from most cpus.
>
> > and 40 more keys (and linking to 40 buckets) is less efficient.
>
> Suppose the key is choosed in this way and may be accessed concurrently
> from most cpus, the only extra loading for (real_key, cpu_id) is in memcmp(),
> because key size is increased by sizeof(cpu_id), is it really a big deal?
>
> Also in your percpu patch, you add one extra alloc_percpu_gfp(), do you
> think that it is cheaper than the extra compasion on cpu_id in memcmp()?
I was referring to your _memory_ consumption comment.  The key is bigger,
more keys and buckets are needed.

and even for CPU concern, Yes also.  If the key stays long enough for
aggregation (and atomic operations), is it better to do one alloc_percpu_gfp()
at the beginning instead of extra memcmp() over and over again during lookup?
Having said that, I don't think either of the alloc or memcmp cost is
significant enough to distract the percpu discussion.

>
> >
> > If the expectation is to have a small numbers of cpu accessing a key (like
> > biolatency), the bpf prog can stay with the regular bpf map.  The new
> > percpu map has almost the same interface (except the syscall added in
> > Patch 3).
>
> > Switching between the two map types is easy.
>
> No, it isn't easy with your patchset, since we can't look up
> one key in one specific CPU from eBPF prog, not mention you
> introduce extra syscall.
Why would a eBPF prog (bpf_kern.o) want to access value of another CPU?
Does it defeat the purpose of using percpu value in the first place?

> >
> >> >> For HASH map, it is easy to make cpu id as part of key, then the map
> >> >> can be thought as percpu too, and atomic op isn't needed in eBPF program.
> >> > Putting the cpu id as part of the key was indeed the first hack I did
> >> > to get a sense of potential benefit.
> >> >
> >> > However, by extending the real-key with cpu-id, it is not intuitive to
> >> > use and it is prone to error.  For example, how to delete a real-key for
> >> > all cpus?  Iterating a particular real-key for all cpu is also tricky.  What
> >> > does it mean if a real-key exists for cpu#0 but not cpu#1? The real-key
> >>
> >> lookup returns NULL if the key(real key plus cpu id) doesn't exist.
> >>
> >> > got deleted from all cpu while iterating? or something else?  I believe
> >> > there are ways to get around but it is better to provide a clean
> >> > implementation instead.
> >>
> >> It might be true, but I hope there is one real example, in which we can
> >> see the obvious disadvantage of real key plus cpu_id. In the other way,
> >> we still can improve current hash map to make this case easier to use.
> > By extend(real_key, cpu_id), the uniqueness property of a real_key (and
> > it should be as a key for a hashmap) is getting blurry and it is not
> > ideal.
> >
> > The typical use case is for doing a lot of aggregation (which bpf+kprobe
> > is strong at) on a real_key which will exist for a reasonable long period
> > of time.  For example, I would like to do some packet statistics on each
> > IPv6 /64 prefix.  The userspace will iterate the /64 prefix (real_key)
                          ^^^^^^^^^^^^^^^^^^^^^^
> > and sum up all values (of each cpu) before reporting for a particular
> > prefix.
>
> Is there any script or ebpf sample code for this case? I appreciate much
> we can talk with code.
Sure, a simplified version is at the end.

> >
> > How to delete a real_key from all cpu? A loop to iterate all
> > extend(real_key, cpu_id)?
>
> Yes.
That is not ideal then.

Let alone the max_entries of the map also needs to be changed
whenever it is running in another machine with different number
of cores.

> >> That sounds not a good result, 40X memory consumption only save %3 CPU,
> >> and the approach(real_key, cpu_id) still can save %3 without extra memory
> >> consumption(or much less than percpu MAP)
> > Another way to look at it, the bpf prog costs 6% CPU and saving 3%
> > with percpu implemenation.
>
> Sorry, I am a bit confused why you say 6% saving, and could explain a bit?
> Let's see your previous post:
I meant:
Without percpu optimization, the bpf_prog costs a total 6% CPU.
With the percpu optimization, it saves 3% CPU.

Thanks,
-- Martin

Here is a much stripped down version of the bpf_kern and bpf_user:

struct tcp_trace_flow6 {
	__be32	dst0;
	__be32	dst1;
};

struct tcp_stats {
	u64	segs_in;
	/* A few more counters... */
};

/* bpf_kern.c */
struct bpf_map_def SEC("maps") dst_rack_map6 = {
	.type = BPF_MAP_TYPE_PERCPU_HASH,
	.key_size = sizeof(struct tcp_trace_flow6),
	.value_size = sizeof(struct tcp_stats),
	.max_entries = 10000,
};

SEC("kprobe/tcp_rcv_established")
int trace_rcv_established(struct pt_regs *ctx)
{
	struct tcp_stats *tpes;
	struct sk_buff *skb;
	struct sock *sk;

	sk = (struct sock *) PT_REGS_PARM1(ctx);
	skb = (struct sk_buff *) PT_REGS_PARM2(ctx);

	/* tcp_stats_get: A simple map lookup based on the IPv6 /64 prefix.
	 * If it does not exist, insert
	 */
	tpes = tcp_stats_get(sk);
	if (!tpes)
		return 0;

	tpes->segs_in++;

	/* A free more counter++ operations here */

	return 0;
}

/* bpf_user.c */
total = 0;
while (bpf_get_next_key(map_fd, &ttf6, &next_ttf6) == 0) {
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (!bpf_lookup_percpu_elem(map_fd,
						&next_ttf6,
						&tpes, cpu))
			total += tpes.segs_in;
		else if (errno != ENXIO)
		     break;
	}
	if (cpu == nr_cpus)
		tcp_stats_log6(&next_ttf6, total);
	ttf6 = next_ttf6;
}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-09 10:06   ` Ming Lei
@ 2016-01-12  3:11     ` Martin KaFai Lau
  2016-01-12  7:44       ` Martin KaFai Lau
  0 siblings, 1 reply; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-12  3:11 UTC (permalink / raw)
  To: Ming Lei
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

On Sat, Jan 09, 2016 at 06:06:15PM +0800, Ming Lei wrote:
> On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> > This patch adds BPFMAP_TYPE_PERCPU_HASH map type and its
> > htab_map_ops implementation.
> >
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >  include/uapi/linux/bpf.h |   1 +
> >  kernel/bpf/hashtab.c     | 201 ++++++++++++++++++++++++++++++++++++++++++++++-
> >  2 files changed, 201 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 8bed7f1..e4f8060 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -81,6 +81,7 @@ enum bpf_map_type {
> >         BPF_MAP_TYPE_ARRAY,
> >         BPF_MAP_TYPE_PROG_ARRAY,
> >         BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> > +       BPF_MAP_TYPE_PERCPU_HASH,
> >  };
> >
> >  enum bpf_prog_type {
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index d55df8c..63f2945 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -278,7 +278,7 @@ find_first_elem:
> >  }
> >
> >  static struct htab_elem_common *htab_elem_common_alloc(struct bpf_htab *htab,
> > -                                                       void *key)
> > +                                                      void *key)
>
> better to not introduce the above change.
What is the concern?

>
> >  {
> >         struct htab_elem_common *l;
> >
> > @@ -451,9 +451,208 @@ static struct bpf_map_type_list htab_type __read_mostly = {
> >         .type = BPF_MAP_TYPE_HASH,
> >  };
> >
> > +/* each htab_percpu_elem is struct htab_percpu_elem + key  */
> > +struct htab_percpu_elem {
> > +       struct htab_elem_common common;
> > +       void * __percpu value;
> > +       char key[0] __aligned(8);
> > +};
> > +
> > +static struct htab_percpu_elem *htab_percpu_elem(struct htab_elem_common *l)
> > +{
> > +       return (struct htab_percpu_elem *)l;
> > +}
> > +
> > +static void htab_percpu_elem_free(struct htab_percpu_elem *l)
> > +{
> > +       free_percpu(l->value);
> > +       kfree(l);
> > +}
> > +
> > +static void htab_percpu_elem_rcu_free(struct rcu_head *head)
> > +{
> > +       struct htab_elem_common *l = container_of(head,
> > +                                                 struct htab_elem_common,
> > +                                                 rcu);
> > +
> > +       htab_percpu_elem_free(htab_percpu_elem(l));
> > +}
> > +
> > +static void htab_percpu_map_flush(struct bpf_htab *htab)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < htab->n_buckets; i++) {
> > +               struct hlist_head *head = select_bucket(htab, i);
> > +               struct hlist_node *n;
> > +               struct htab_elem_common *l;
> > +
> > +               hlist_for_each_entry_safe(l, n, head, hash_node) {
> > +                       hlist_del_rcu(&l->hash_node);
> > +                       atomic_dec(&htab->count);
> > +                       htab_percpu_elem_free(htab_percpu_elem(l));
> > +               }
> > +       }
> > +}
>
> The above helper should have been saved by introduce percpu_map
> flag in bpf_htab.
There is no need to introduce a new flag. Is it the same as checking
htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH?

The current 'struct bpf_map_ops' setup has already made a clean function
dispatch based on different 'enum bpf_map_type'.  I have been refraining
to make another map_type check else where again.

I will make another attempt to further remove duplicate code first and
will post it shortly.

Thanks,
-- Martin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-11 22:35           ` Martin KaFai Lau
@ 2016-01-12  5:48             ` Ming Lei
  2016-01-12  6:00               ` Alexei Starovoitov
  0 siblings, 1 reply; 17+ messages in thread
From: Ming Lei @ 2016-01-12  5:48 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

Hi Martin,

On Tue, Jan 12, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> Hi Ming,
>
> On Mon, Jan 11, 2016 at 10:20:55AM +0800, Ming Lei wrote:
>> >> - percpu hash can cause huge memory consumption compared with the way
>                                   ~~~~~~~~~~~~~~~~~~
>> >> of key plus cpu_id without obvious performance advantage
>> > If the bpf prog uses the percpu map type, the expectation
>> > is the bpf is tapping some kprobes that most cpus (if not all) will access
>> > the same key.  I would argue that having a bigger key (due to cpu_id),
>>
>> I think you mean the value mapped from the key can be accessed
>> concurrently from most cpus.
>>
>> > and 40 more keys (and linking to 40 buckets) is less efficient.
>>
>> Suppose the key is choosed in this way and may be accessed concurrently
>> from most cpus, the only extra loading for (real_key, cpu_id) is in memcmp(),
>> because key size is increased by sizeof(cpu_id), is it really a big deal?
>>
>> Also in your percpu patch, you add one extra alloc_percpu_gfp(), do you
>> think that it is cheaper than the extra compasion on cpu_id in memcmp()?
> I was referring to your _memory_ consumption comment.  The key is bigger,
> more keys and buckets are needed.

The total memory consumption is still much less than memory consumed
by percpu hash since a new element is only added to hash if the key is run
on that CPU. Most of times, for one key it may touch very few CPUs.

For percpu hash, the memory is always allocated to every CPU no matter
if the key is run from the CPU.

>
> and even for CPU concern, Yes also.  If the key stays long enough for
> aggregation (and atomic operations), is it better to do one alloc_percpu_gfp()
> at the beginning instead of extra memcmp() over and over again during lookup?
> Having said that, I don't think either of the alloc or memcmp cost is
> significant enough to distract the percpu discussion.

In my test, removing the current kmalloc() in update element callback can
improve io thoughput by 10% not mention the percpu ida allocation cost, and
looks it isn't cheap. That is why I don't think it is good to
introduce another new
allocation in the eBPF prog path.

You can find my test in the link below:

       https://patchwork.ozlabs.org/patch/556926/

>
>>
>> >
>> > If the expectation is to have a small numbers of cpu accessing a key (like
>> > biolatency), the bpf prog can stay with the regular bpf map.  The new
>> > percpu map has almost the same interface (except the syscall added in
>> > Patch 3).
>>
>> > Switching between the two map types is easy.
>>
>> No, it isn't easy with your patchset, since we can't look up
>> one key in one specific CPU from eBPF prog, not mention you
>> introduce extra syscall.
> Why would a eBPF prog (bpf_kern.o) want to access value of another CPU?
> Does it defeat the purpose of using percpu value in the first place?

HASH map need to delete element in eBPF prog and it is used a bit
common, so it is reasonable to retrieve the value of the element(need
to access value of other CPUs) before deleting it, isn't it?

>
>> >
>> >> >> For HASH map, it is easy to make cpu id as part of key, then the map
>> >> >> can be thought as percpu too, and atomic op isn't needed in eBPF program.
>> >> > Putting the cpu id as part of the key was indeed the first hack I did
>> >> > to get a sense of potential benefit.
>> >> >
>> >> > However, by extending the real-key with cpu-id, it is not intuitive to
>> >> > use and it is prone to error.  For example, how to delete a real-key for
>> >> > all cpus?  Iterating a particular real-key for all cpu is also tricky.  What
>> >> > does it mean if a real-key exists for cpu#0 but not cpu#1? The real-key
>> >>
>> >> lookup returns NULL if the key(real key plus cpu id) doesn't exist.
>> >>
>> >> > got deleted from all cpu while iterating? or something else?  I believe
>> >> > there are ways to get around but it is better to provide a clean
>> >> > implementation instead.
>> >>
>> >> It might be true, but I hope there is one real example, in which we can
>> >> see the obvious disadvantage of real key plus cpu_id. In the other way,
>> >> we still can improve current hash map to make this case easier to use.
>> > By extend(real_key, cpu_id), the uniqueness property of a real_key (and
>> > it should be as a key for a hashmap) is getting blurry and it is not
>> > ideal.
>> >
>> > The typical use case is for doing a lot of aggregation (which bpf+kprobe
>> > is strong at) on a real_key which will exist for a reasonable long period
>> > of time.  For example, I would like to do some packet statistics on each
>> > IPv6 /64 prefix.  The userspace will iterate the /64 prefix (real_key)
>                           ^^^^^^^^^^^^^^^^^^^^^^
>> > and sum up all values (of each cpu) before reporting for a particular
>> > prefix.
>>
>> Is there any script or ebpf sample code for this case? I appreciate much
>> we can talk with code.
> Sure, a simplified version is at the end.
>
>> >
>> > How to delete a real_key from all cpu? A loop to iterate all
>> > extend(real_key, cpu_id)?
>>
>> Yes.
> That is not ideal then.
>
> Let alone the max_entries of the map also needs to be changed

Why is the max_entries related?

> whenever it is running in another machine with different number
> of cores.

The core number can be get easily, is that a problem?

>
>> >> That sounds not a good result, 40X memory consumption only save %3 CPU,
>> >> and the approach(real_key, cpu_id) still can save %3 without extra memory
>> >> consumption(or much less than percpu MAP)
>> > Another way to look at it, the bpf prog costs 6% CPU and saving 3%
>> > with percpu implemenation.
>>
>> Sorry, I am a bit confused why you say 6% saving, and could explain a bit?
>> Let's see your previous post:
> I meant:
> Without percpu optimization, the bpf_prog costs a total 6% CPU.
> With the percpu optimization, it saves 3% CPU.

OK, I see it. 3% can be a bit small and often in fluctuant range. And why not
make CPU at full loading and see the improvement obviously?

Also you mentioned (real_key, cpu_id) can get similar improvement too.

>
> Thanks,
> -- Martin
>
> Here is a much stripped down version of the bpf_kern and bpf_user:
>
> struct tcp_trace_flow6 {
>         __be32  dst0;
>         __be32  dst1;
> };
>
> struct tcp_stats {
>         u64     segs_in;
>         /* A few more counters... */
> };
>
> /* bpf_kern.c */
> struct bpf_map_def SEC("maps") dst_rack_map6 = {
>         .type = BPF_MAP_TYPE_PERCPU_HASH,
>         .key_size = sizeof(struct tcp_trace_flow6),
>         .value_size = sizeof(struct tcp_stats),
>         .max_entries = 10000,
> };
>
> SEC("kprobe/tcp_rcv_established")
> int trace_rcv_established(struct pt_regs *ctx)
> {
>         struct tcp_stats *tpes;
>         struct sk_buff *skb;
>         struct sock *sk;
>
>         sk = (struct sock *) PT_REGS_PARM1(ctx);
>         skb = (struct sk_buff *) PT_REGS_PARM2(ctx);
>
>         /* tcp_stats_get: A simple map lookup based on the IPv6 /64 prefix.
>          * If it does not exist, insert
>          */
>         tpes = tcp_stats_get(sk);

Where is the above function? Is 'sk' the key? That can't use
(real_key, cpu_id) easily?

Looks your usage is simple, without delete/update elem invovled?

>         if (!tpes)
>                 return 0;
>
>         tpes->segs_in++;
>
>         /* A free more counter++ operations here */
>
>         return 0;
> }
>
> /* bpf_user.c */
> total = 0;
> while (bpf_get_next_key(map_fd, &ttf6, &next_ttf6) == 0) {
>         for (cpu = 0; cpu < nr_cpus; cpu++) {
>                 if (!bpf_lookup_percpu_elem(map_fd,
>                                                 &next_ttf6,
>                                                 &tpes, cpu))
>                         total += tpes.segs_in;
>                 else if (errno != ENXIO)
>                      break;
>         }
>         if (cpu == nr_cpus)
>                 tcp_stats_log6(&next_ttf6, total);
>         ttf6 = next_ttf6;
> }



Thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-12  5:48             ` Ming Lei
@ 2016-01-12  6:00               ` Alexei Starovoitov
  0 siblings, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2016-01-12  6:00 UTC (permalink / raw)
  To: Ming Lei
  Cc: Martin KaFai Lau, Network Development, Linux Kernel Mailing List,
	FB Kernel Team

On Tue, Jan 12, 2016 at 01:48:10PM +0800, Ming Lei wrote:
> The total memory consumption is still much less than memory consumed
> by percpu hash since a new element is only added to hash if the key is run
> on that CPU. Most of times, for one key it may touch very few CPUs.
> 
> For percpu hash, the memory is always allocated to every CPU no matter
> if the key is run from the CPU.

In Martin's use case all cpus are servicing network traffic and all of them
are counting packets.

> In my test, removing the current kmalloc() in update element callback can
> improve io thoughput by 10% not mention the percpu ida allocation cost, and
> looks it isn't cheap. That is why I don't think it is good to
> introduce another new
> allocation in the eBPF prog path.

I don't think anyone is arguing that pre-allocation is not needed.
In some cases better performance can be achieved with pre-allocation,
in some other cases regular hash map will be enough,
and in others hash map with per-cpu is needed as well.

> You can find my test in the link below:
> 
>        https://patchwork.ozlabs.org/patch/556926/

yes, for tools/biolatency pre-allocation is a win,
but in many other cases we simply cannot pre-allocate all elements.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-12  3:11     ` Martin KaFai Lau
@ 2016-01-12  7:44       ` Martin KaFai Lau
  0 siblings, 0 replies; 17+ messages in thread
From: Martin KaFai Lau @ 2016-01-12  7:44 UTC (permalink / raw)
  To: Ming Lei
  Cc: Network Development, Linux Kernel Mailing List, FB Kernel Team,
	Alexei Starovoitov

On Mon, Jan 11, 2016 at 07:11:48PM -0800, Martin KaFai Lau wrote:
> On Sat, Jan 09, 2016 at 06:06:15PM +0800, Ming Lei wrote:
> > On Fri, Jan 8, 2016 at 6:35 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> > > This patch adds BPFMAP_TYPE_PERCPU_HASH map type and its
> > > htab_map_ops implementation.
> > >
> > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > ---
> > >  include/uapi/linux/bpf.h |   1 +
> > >  kernel/bpf/hashtab.c     | 201 ++++++++++++++++++++++++++++++++++++++++++++++-
> > >  2 files changed, 201 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 8bed7f1..e4f8060 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -81,6 +81,7 @@ enum bpf_map_type {
> > >         BPF_MAP_TYPE_ARRAY,
> > >         BPF_MAP_TYPE_PROG_ARRAY,
> > >         BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> > > +       BPF_MAP_TYPE_PERCPU_HASH,
> > >  };
> > >
> > >  enum bpf_prog_type {
> > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > index d55df8c..63f2945 100644
> > > --- a/kernel/bpf/hashtab.c
> > > +++ b/kernel/bpf/hashtab.c
> > > @@ -278,7 +278,7 @@ find_first_elem:
> > >  }
> > >
> > >  static struct htab_elem_common *htab_elem_common_alloc(struct bpf_htab *htab,
> > > -                                                       void *key)
> > > +                                                      void *key)
> >
> > better to not introduce the above change.
> What is the concern?
Sorry. I misread your comment. will fix this unnecessary indentation change.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-01-12  7:44 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-07 22:35 [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
2016-01-07 22:35 ` [PATCH net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic Martin KaFai Lau
2016-01-07 22:35 ` [PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
2016-01-09 10:06   ` Ming Lei
2016-01-12  3:11     ` Martin KaFai Lau
2016-01-12  7:44       ` Martin KaFai Lau
2016-01-09 10:33   ` Ming Lei
2016-01-07 22:35 ` [PATCH net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key Martin KaFai Lau
2016-01-07 22:35 ` [PATCH net-next 4/4] bpf: bpf_htab: Test for BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
2016-01-08  6:55 ` [PATCH net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Ming Lei
2016-01-09  0:44   ` Martin KaFai Lau
2016-01-09  9:39     ` Ming Lei
2016-01-10  2:30       ` Martin KaFai Lau
2016-01-11  2:20         ` Ming Lei
2016-01-11 22:35           ` Martin KaFai Lau
2016-01-12  5:48             ` Ming Lei
2016-01-12  6:00               ` Alexei Starovoitov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.