* [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
@ 2016-01-12  8:20 Martin KaFai Lau
  2016-01-12  8:20 ` [PATCH v2 net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic Martin KaFai Lau
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2016-01-12  8:20 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Alexei Starovoitov, Ming Lei, FB Kernel Team

V2:
Patch 1/4:
* Remove flush_elems_fn from 'struct bpf_htab' and
  also remove htab_map_flush() together
* Add htab_elem_alloc()
* Move the l_old check from htab_map_update_elem() to
  the newly added htab_map_check_update()

Patch 2/4:
* Add htab_percpu_elem_alloc()
* Add htab_percpu_map_free()
* Use this_cpu_ptr() instead of per_cpu_ptr in
  htab_percpu_map_lookup_elem()

V1 cover letter:
This patchset adds the BPF_MAP_TYPE_PERCPU_HASH map type, which allows
a percpu value per key.

BPF + kprobe is very useful for statistics collection.  In particular,
bpf is strong at doing aggregation within the kernel instead of
outputting a lot of raw samples to userspace.

In some cases, bumping a counter/value of a particular key has a
noticeable impact.  For example, when collecting statistics on
received packets and aggregating them by network prefix (like /64 in
IPv6), many CPUs may bump the value of the same key.  Having a percpu
value can help.
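
As an illustration only, a minimal sketch of the kind of kprobe
program this enables (samples/bpf style; the map name, attach point
and key derivation are placeholders, not part of this series):

#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") rx_cnt_map = {
	.type = BPF_MAP_TYPE_PERCPU_HASH,
	.key_size = sizeof(__u64),	/* e.g. first 64 bits of an IPv6 address */
	.value_size = sizeof(__u64),
	.max_entries = 1024,
};

SEC("kprobe/placeholder")	/* attach point depends on the tool */
int count_rx(struct pt_regs *ctx)
{
	__u64 prefix = 0;	/* derive the /64 prefix from ctx here */
	__u64 init = 1, *val;

	val = bpf_map_lookup_elem(&rx_cnt_map, &prefix);
	if (val)
		/* the value is private to this cpu, so a plain
		 * (non-atomic) increment does not race with other cpus
		 */
		(*val)++;
	else
		bpf_map_update_elem(&rx_cnt_map, &prefix, &init, BPF_NOEXIST);
	return 0;
}

char _license[] SEC("license") = "GPL";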


* [PATCH v2 net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic
  2016-01-12  8:20 [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
@ 2016-01-12  8:20 ` Martin KaFai Lau
  2016-01-12  8:20 ` [PATCH v2 net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2016-01-12  8:20 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Alexei Starovoitov, Ming Lei, FB Kernel Team

It is prep work for htab_percpu_map.

A fio test against /dev/nullb0, similar to the one used for commit
688ecfe60220 ("bpf: hash: use per-bucket spinlock") by
Ming Lei <tom.leiming@gmail.com>, is done while bcc/tools/biolatency
is running, to make sure this change does not regress that commit.

With biolatency running, the iops before and after this patch are
similar: I get ~280k iops in both cases.  Hence, no regression is
noticed.

Without biolatency running, I get ~330k iops, which is not as high as
the iops Ming reported.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Ming Lei <tom.leiming@gmail.com>
---
 kernel/bpf/hashtab.c | 217 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 153 insertions(+), 64 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c5b30fd..24a6a47 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -19,24 +19,45 @@ struct bucket {
 	raw_spinlock_t lock;
 };
 
+struct htab_elem_common;
+typedef void (*elem_free_fn)(struct htab_elem_common *);
+
 struct bpf_htab {
 	struct bpf_map map;
 	struct bucket *buckets;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
+	u32 elem_key_offset; /* offset to the key */
 };
 
-/* each htab element is struct htab_elem + key + value */
-struct htab_elem {
+struct htab_elem_common {
 	struct hlist_node hash_node;
 	struct rcu_head rcu;
 	u32 hash;
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	struct htab_elem_common common;
 	char key[0] __aligned(8);
 };
 
-/* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static void *htab_elem_common_get_key(const struct bpf_htab *htab,
+				      struct htab_elem_common *l)
+{
+	return (void *)l + htab->elem_key_offset;
+}
+
+static struct htab_elem *htab_elem(struct htab_elem_common *l)
+{
+	return (struct htab_elem *)l;
+}
+
+static struct bpf_map *__htab_map_alloc(union bpf_attr *attr,
+					u32 elem_size,
+					u32 elem_value_size,
+					u32 elem_key_offset)
 {
 	struct bpf_htab *htab;
 	int err, i;
@@ -77,9 +98,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8) +
-			  htab->map.value_size;
+	htab->elem_size = elem_size;
+	htab->elem_key_offset = elem_key_offset;
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
@@ -87,13 +107,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		goto free_htab;
 
 	if ((u64) htab->n_buckets * sizeof(struct bucket) +
-	    (u64) htab->elem_size * htab->map.max_entries >=
+	    (u64) elem_value_size * htab->map.max_entries >=
 	    U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
 	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
-				   htab->elem_size * htab->map.max_entries,
+				   elem_value_size * htab->map.max_entries,
 				   PAGE_SIZE) >> PAGE_SHIFT;
 
 	err = -ENOMEM;
@@ -120,6 +140,16 @@ free_htab:
 	return ERR_PTR(err);
 }
 
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	u32 elem_size = sizeof(struct htab_elem) + round_up(attr->key_size, 8) +
+		attr->value_size;
+
+	return __htab_map_alloc(attr, elem_size, elem_size,
+				offsetof(struct htab_elem, key));
+}
+
 static inline u32 htab_map_hash(const void *key, u32 key_len)
 {
 	return jhash(key, key_len, 0);
@@ -135,39 +165,48 @@ static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
 	return &__select_bucket(htab, hash)->head;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
-					 void *key, u32 key_size)
+static struct htab_elem_common *lookup_elem_raw(struct bpf_htab *htab,
+						struct hlist_head *head,
+						u32 hash, void *key)
 {
-	struct htab_elem *l;
+	struct htab_elem_common *l;
 
 	hlist_for_each_entry_rcu(l, head, hash_node)
-		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+		if (l->hash == hash &&
+		    !memcmp(htab_elem_common_get_key(htab, l),
+			    key, htab->map.key_size))
 			return l;
 
 	return NULL;
 }
 
 /* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static struct htab_elem_common *__htab_map_lookup_elem(struct bpf_htab *htab,
+						       void *key)
 {
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
-	struct htab_elem *l;
 	u32 hash, key_size;
 
 	/* Must be called with rcu_read_lock. */
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	key_size = map->key_size;
+	key_size = htab->map.key_size;
 
 	hash = htab_map_hash(key, key_size);
 
 	head = select_bucket(htab, hash);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	return lookup_elem_raw(htab, head, hash, key);
+}
 
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
+
+	l = __htab_map_lookup_elem(htab, key);
 	if (l)
-		return l->key + round_up(map->key_size, 8);
+		return htab_elem(l)->key + round_up(map->key_size, 8);
 
 	return NULL;
 }
@@ -177,7 +216,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
-	struct htab_elem *l, *next_l;
+	struct htab_elem_common *l, *next_l;
 	u32 hash, key_size;
 	int i;
 
@@ -190,7 +229,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	head = select_bucket(htab, hash);
 
 	/* lookup the key */
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_elem_raw(htab, head, hash, key);
 
 	if (!l) {
 		i = 0;
@@ -199,11 +238,12 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 	/* key was found, get next key in the same bucket */
 	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
-				  struct htab_elem, hash_node);
+				  struct htab_elem_common, hash_node);
 
 	if (next_l) {
 		/* if next elem in this hash list is non-zero, just return it */
-		memcpy(next_key, next_l->key, key_size);
+		memcpy(next_key, htab_elem_common_get_key(htab, next_l),
+		       key_size);
 		return 0;
 	}
 
@@ -218,10 +258,11 @@ find_first_elem:
 
 		/* pick first element in the bucket */
 		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
-					  struct htab_elem, hash_node);
+					  struct htab_elem_common, hash_node);
 		if (next_l) {
 			/* if it's not empty, just return it */
-			memcpy(next_key, next_l->key, key_size);
+			memcpy(next_key, htab_elem_common_get_key(htab, next_l),
+			       key_size);
 			return 0;
 		}
 	}
@@ -230,6 +271,67 @@ find_first_elem:
 	return -ENOENT;
 }
 
+static struct htab_elem_common *htab_elem_common_alloc(
+	const struct bpf_htab *htab, void *key)
+{
+	struct htab_elem_common *l;
+
+	l = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!l)
+		return NULL;
+
+	memcpy(htab_elem_common_get_key(htab, l), key, htab->map.key_size);
+	l->hash = htab_map_hash(key, htab->map.key_size);
+
+	return l;
+}
+
+static struct htab_elem *htab_elem_alloc(const struct bpf_htab *htab,
+					 void *key,
+					 void *value)
+{
+	struct htab_elem *l;
+
+	l = htab_elem(htab_elem_common_alloc(htab, key));
+	if (!l)
+		return NULL;
+
+	memcpy(l->key + round_up(htab->map.key_size, 8), value,
+	       htab->map.value_size);
+
+	return l;
+}
+
+static struct htab_elem_common *htab_map_check_update(
+	struct bpf_htab *htab,
+	struct hlist_head *head,
+	u32 hash,
+	void *key,
+	u64 map_flags)
+{
+	struct htab_elem_common *l_old;
+
+	l_old = lookup_elem_raw(htab, head, hash, key);
+
+	if (!l_old &&
+	    unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) {
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		return ERR_PTR(-E2BIG);
+	}
+
+	if (l_old && map_flags == BPF_NOEXIST)
+		/* elem already exists */
+		return ERR_PTR(-EEXIST);
+
+	if (!l_old && map_flags == BPF_EXIST)
+		/* elem doesn't exist, cannot update it */
+		return ERR_PTR(-ENOENT);
+
+	return l_old;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 				u64 map_flags)
@@ -239,7 +341,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	struct hlist_head *head;
 	struct bucket *b;
 	unsigned long flags;
-	u32 key_size;
 	int ret;
 
 	if (map_flags > BPF_EXIST)
@@ -249,51 +350,32 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* allocate new element outside of lock */
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	l_new = htab_elem_alloc(htab, key, value);
 	if (!l_new)
 		return -ENOMEM;
 
-	key_size = map->key_size;
-
-	memcpy(l_new->key, key, key_size);
-	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
-
-	l_new->hash = htab_map_hash(l_new->key, key_size);
-	b = __select_bucket(htab, l_new->hash);
+	b = __select_bucket(htab, l_new->common.hash);
 	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+	l_old = htab_elem(htab_map_check_update(
+				  htab, head, l_new->common.hash, key,
+				  map_flags));
 
-	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		ret = -E2BIG;
-		goto err;
-	}
-
-	if (l_old && map_flags == BPF_NOEXIST) {
-		/* elem already exists */
-		ret = -EEXIST;
-		goto err;
-	}
-
-	if (!l_old && map_flags == BPF_EXIST) {
-		/* elem doesn't exist, cannot update it */
-		ret = -ENOENT;
+	if (IS_ERR(l_old)) {
+		ret = PTR_ERR(l_old);
 		goto err;
 	}
 
 	/* add new element to the head of the list, so that concurrent
 	 * search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_add_head_rcu(&l_new->common.hash_node, head);
 	if (l_old) {
-		hlist_del_rcu(&l_old->hash_node);
-		kfree_rcu(l_old, rcu);
+		hlist_del_rcu(&l_old->common.hash_node);
+		kfree_rcu(&l_old->common, rcu);
 	} else {
 		atomic_inc(&htab->count);
 	}
@@ -310,9 +392,9 @@ err:
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
 	struct hlist_head *head;
 	struct bucket *b;
-	struct htab_elem *l;
 	unsigned long flags;
 	u32 hash, key_size;
 	int ret = -ENOENT;
@@ -327,7 +409,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_elem_raw(htab, head, hash, key);
 
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
@@ -340,28 +422,28 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
-static void delete_all_elements(struct bpf_htab *htab)
+static void delete_all_elements(struct bpf_htab *htab, elem_free_fn elem_free)
+
 {
 	int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		struct hlist_head *head = select_bucket(htab, i);
 		struct hlist_node *n;
-		struct htab_elem *l;
+		struct htab_elem_common *l;
 
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
 			atomic_dec(&htab->count);
-			kfree(l);
+			elem_free(l);
 		}
 	}
 }
 
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
-static void htab_map_free(struct bpf_map *map)
+static void __htab_map_free(struct bpf_htab *htab,
+			    elem_free_fn elem_free)
 {
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
 	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
 	 * so the programs (can be more than one that used this map) were
 	 * disconnected from events. Wait for outstanding critical sections in
@@ -372,11 +454,18 @@ static void htab_map_free(struct bpf_map *map)
 	/* some of kfree_rcu() callbacks for elements of this map may not have
 	 * executed. It's ok. Proceed to free residual elements and map itself
 	 */
-	delete_all_elements(htab);
+	delete_all_elements(htab, elem_free);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
 
+static void htab_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	__htab_map_free(htab, (elem_free_fn)kfree);
+}
+
 static const struct bpf_map_ops htab_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
-- 
2.5.1


* [PATCH v2 net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-12  8:20 [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
  2016-01-12  8:20 ` [PATCH v2 net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic Martin KaFai Lau
@ 2016-01-12  8:20 ` Martin KaFai Lau
  2016-01-12  8:20 ` [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key Martin KaFai Lau
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2016-01-12  8:20 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Alexei Starovoitov, Ming Lei, FB Kernel Team

This patch adds the BPF_MAP_TYPE_PERCPU_HASH map type and its
bpf_map_ops implementation (htab_percpu_ops).

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/hashtab.c     | 186 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa6f857..43ae40c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	BPF_MAP_TYPE_PERCPU_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 24a6a47..02d7473 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -480,9 +480,195 @@ static struct bpf_map_type_list htab_type __read_mostly = {
 	.type = BPF_MAP_TYPE_HASH,
 };
 
+/* each htab_percpu_elem is struct htab_percpu_elem + key  */
+struct htab_percpu_elem {
+	struct htab_elem_common common;
+	void  __percpu *value;
+	char key[0] __aligned(8);
+};
+
+static struct htab_percpu_elem *htab_percpu_elem(struct htab_elem_common *l)
+{
+	return (struct htab_percpu_elem *)l;
+}
+
+static void htab_percpu_elem_free(struct htab_percpu_elem *l)
+{
+	free_percpu(l->value);
+	kfree(l);
+}
+
+static void htab_percpu_elem_rcu_free(struct rcu_head *head)
+{
+	struct htab_elem_common *l = container_of(head,
+						  struct htab_elem_common,
+						  rcu);
+
+	htab_percpu_elem_free(htab_percpu_elem(l));
+}
+
+static struct htab_percpu_elem *htab_percpu_elem_alloc(struct bpf_htab *htab,
+						       void *key,
+						       void *value)
+{
+	/* allocate new element outside of lock */
+	struct htab_percpu_elem *l;
+
+	l = htab_percpu_elem(htab_elem_common_alloc(htab, key));
+	if (!l)
+		return NULL;
+
+	l->value = __alloc_percpu_gfp(htab->map.value_size, 8,
+				      GFP_ATOMIC | __GFP_NOWARN);
+	if (!l->value) {
+		htab_percpu_elem_free(l);
+		return NULL;
+	}
+
+	memcpy(raw_cpu_ptr(l->value), value, htab->map.value_size);
+
+	return l;
+}
+
+/* Called from syscall */
+static struct bpf_map *htab_percpu_map_alloc(union bpf_attr *attr)
+{
+	u32 elem_size = sizeof(struct htab_percpu_elem) +
+		round_up(attr->key_size, 8);
+	u32 elem_value_size = elem_size +
+		num_possible_cpus() * attr->value_size;
+
+	return __htab_map_alloc(attr, elem_size, elem_value_size,
+				offsetof(struct htab_percpu_elem, key));
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_percpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
+	struct hlist_head *head;
+	unsigned long flags;
+	u32 hash, key_size;
+	struct bucket *b;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l = lookup_elem_raw(htab, head, hash, key);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		atomic_dec(&htab->count);
+		call_rcu(&l->rcu, htab_percpu_elem_rcu_free);
+		ret = 0;
+	}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	return ret;
+}
+
+/* Called from syscall or eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem_common *l;
+
+	l = __htab_map_lookup_elem(htab, key);
+	if (l) {
+		void *value = this_cpu_ptr(htab_percpu_elem(l)->value);
+		return value;
+	}
+
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_percpu_elem *l_new, *l_old;
+	struct hlist_head *head;
+	struct bucket *b;
+	unsigned long flags;
+	int ret;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* allocate new element outside of lock */
+	l_new = htab_percpu_elem_alloc(htab, key, value);
+	if (!l_new)
+		return -ENOMEM;
+
+	b = __select_bucket(htab, l_new->common.hash);
+	head = &b->head;
+
+	/* bpf_percpu_map_update_elem() can be called in_irq() */
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l_old = htab_percpu_elem(htab_map_check_update(
+					 htab, head, l_new->common.hash, key,
+					 map_flags));
+
+	if (IS_ERR(l_old)) {
+		ret = PTR_ERR(l_old);
+		goto err;
+	}
+
+	if (l_old) {
+		memcpy(this_cpu_ptr(l_old->value), value, map->value_size);
+	} else {
+		hlist_add_head_rcu(&l_new->common.hash_node, head);
+		atomic_inc(&htab->count);
+	}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+
+	return 0;
+err:
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_percpu_elem_free(l_new);
+	return ret;
+}
+
+static void htab_percpu_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	__htab_map_free(htab, (elem_free_fn)htab_percpu_elem_free);
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+	.map_alloc = htab_percpu_map_alloc,
+	.map_free = htab_percpu_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_update_elem = htab_percpu_map_update_elem,
+	.map_delete_elem = htab_percpu_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+	.ops = &htab_percpu_ops,
+	.type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
 static int __init register_htab_map(void)
 {
 	bpf_register_map_type(&htab_type);
+	bpf_register_map_type(&htab_percpu_type);
 	return 0;
 }
 late_initcall(register_htab_map);
-- 
2.5.1


* [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-12  8:20 [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
  2016-01-12  8:20 ` [PATCH v2 net-next 1/4] bpf: bpf_htab: Refactor some htab_elem logic Martin KaFai Lau
  2016-01-12  8:20 ` [PATCH v2 net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
@ 2016-01-12  8:20 ` Martin KaFai Lau
  2016-01-13  2:24   ` Eric Dumazet
  2016-01-13  2:42   ` Ming Lei
  2016-01-12  8:20 ` [PATCH v2 net-next 4/4] bpf: bpf_htab: Test for BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
  2016-01-12 21:45 ` [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH David Miller
  4 siblings, 2 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2016-01-12  8:20 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Alexei Starovoitov, Ming Lei, FB Kernel Team

Add the map_lookup_percpu_elem() syscall command to look up the value
of a key on a particular CPU.

The user is expected to loop through all CPU values by:
for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {
	if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu))
		total_value += value;
}

* If the cpu is offline, errno == ENXIO
* If the key is deleted before the iteration is done,
  errno == ENOENT.
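
For example, a minimal userspace sketch of summing one key over all
CPUs with this command (it uses the bpf_lookup_percpu_elem() wrapper
added to samples/bpf in patch 4/4; the 64-bit value type and the
error-handling policy are assumptions of this sketch, not part of the
interface):

#include <errno.h>
#include <unistd.h>
#include "libbpf.h"	/* samples/bpf wrapper providing bpf_lookup_percpu_elem() */

/* Sum the per-cpu values of one key.  Returns 0 on success.
 * Note: this assumes possible cpu ids are 0..nr_cpus-1.
 */
static int sum_percpu_value(int map_fd, void *key, unsigned long long *total)
{
	unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
	unsigned long long value;
	unsigned int cpu;

	*total = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (!bpf_lookup_percpu_elem(map_fd, key, &value, cpu))
			*total += value;
		else if (errno == ENXIO)
			continue;	/* this cpu is offline, skip it */
		else
			return -1;	/* e.g. ENOENT: key deleted mid-iteration */
	}
	return 0;
}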

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ae40c..2c73c09 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,7 @@ enum bpf_cmd {
 	BPF_PROG_LOAD,
 	BPF_OBJ_PIN,
 	BPF_OBJ_GET,
+	BPF_MAP_LOOKUP_PERCPU_ELEM,
 };
 
 enum bpf_map_type {
@@ -115,6 +116,7 @@ union bpf_attr {
 			__aligned_u64 next_key;
 		};
 		__u64		flags;
+		__u32		cpu;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6373970..ba1172b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -289,6 +289,96 @@ err_put:
 	return err;
 }
 
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu
+
+struct percpu_map_value_info {
+	struct bpf_map *map;
+	void *key;
+	void *value;
+	bool found;
+};
+
+static void percpu_map_lookup_value(void *i)
+{
+	struct percpu_map_value_info *info = (struct percpu_map_value_info *)i;
+	struct bpf_map *map = info->map;
+	void *ptr;
+
+	rcu_read_lock();
+	ptr = map->ops->map_lookup_elem(map, info->key);
+	if (ptr) {
+		memcpy(info->value, ptr, map->value_size);
+		info->found = true;
+	} else {
+		info->found = false;
+	}
+	rcu_read_unlock();
+}
+
+static int map_lookup_percpu_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_ptr(attr->key);
+	void __user *uvalue = u64_to_ptr(attr->value);
+	u32 __user ucpu = attr->cpu;
+	int ufd = attr->map_fd;
+	struct percpu_map_value_info value_info;
+	struct bpf_map *map;
+	void *key, *value;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
+	    ucpu >= num_possible_cpus())
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	value_info.map = map;
+	value_info.key = key;
+	value_info.value = value;
+
+	err = smp_call_function_single(ucpu, percpu_map_lookup_value,
+				       &value_info, 1);
+	if (err)
+		goto free_value;
+
+	err = -ENOENT;
+	if (!value_info.found)
+		goto free_value;
+
+	err = -EFAULT;
+	if (copy_to_user(uvalue, value, map->value_size) != 0)
+		goto free_value;
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
@@ -792,6 +882,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
+	case BPF_MAP_LOOKUP_PERCPU_ELEM:
+		err = map_lookup_percpu_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
2.5.1


* [PATCH v2 net-next 4/4] bpf: bpf_htab: Test for BPF_MAP_TYPE_PERCPU_HASH
  2016-01-12  8:20 [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
                   ` (2 preceding siblings ...)
  2016-01-12  8:20 ` [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key Martin KaFai Lau
@ 2016-01-12  8:20 ` Martin KaFai Lau
  2016-01-12 21:45 ` [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH David Miller
  4 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2016-01-12  8:20 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Alexei Starovoitov, Ming Lei, FB Kernel Team

A sanity test for BPF_MAP_TYPE_PERCPU_HASH.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 samples/bpf/libbpf.c    |  13 ++++++
 samples/bpf/libbpf.h    |   1 +
 samples/bpf/test_maps.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+)

diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 65a8d48..a9c5e61 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -54,6 +54,19 @@ int bpf_lookup_elem(int fd, void *key, void *value)
 	return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 }
 
+int bpf_lookup_percpu_elem(int fd, void *key, void *value, unsigned int cpu)
+{
+	union bpf_attr attr = {
+		.map_fd = fd,
+		.key = ptr_to_u64(key),
+		.value = ptr_to_u64(value),
+		.cpu = cpu,
+	};
+
+	return syscall(__NR_bpf, BPF_MAP_LOOKUP_PERCPU_ELEM, &attr,
+		       sizeof(attr));
+}
+
 int bpf_delete_elem(int fd, void *key)
 {
 	union bpf_attr attr = {
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 014aacf..7759517 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -8,6 +8,7 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 		   int max_entries);
 int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
 int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_lookup_percpu_elem(int fd, void *key, void *value, unsigned int cpu);
 int bpf_delete_elem(int fd, void *key);
 int bpf_get_next_key(int fd, void *key, void *next_key);
 
diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
index 6299ee9..7b1dc3e 100644
--- a/samples/bpf/test_maps.c
+++ b/samples/bpf/test_maps.c
@@ -89,6 +89,118 @@ static void test_hashmap_sanity(int i, void *data)
 	close(map_fd);
 }
 
+/* sanity tests for percpu map API */
+static void test_percpu_hashmap_sanity(int i, void *data)
+{
+	long long key, next_key, value;
+	int expected_key_mask = 0;
+	unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	int map_fd;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key),
+				sizeof(value), 2);
+	if (map_fd < 0) {
+		printf("failed to create hashmap '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	key = 1;
+	value = 1234;
+	/* insert key=1 element */
+	assert(!(expected_key_mask & key));
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+	expected_key_mask |= key;
+
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       /* key=1 already exists */
+	       errno == EEXIST);
+
+	/* -1 is an invalid flag */
+	assert(bpf_update_elem(map_fd, &key, &value, -1) == -1 &&
+	       errno == EINVAL);
+
+	/* check that key=1 can be found. value could be 0 if the lookup
+	 * was run from a different cpu.
+	 */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 &&
+	       (value == 0 || value == 1234));
+
+	key = 2;
+	/* check that key=2 is not found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	/* insert key=2 element */
+	assert(!(expected_key_mask & key));
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+	expected_key_mask |= key;
+
+	/* key=1 and key=2 were inserted, check that key=0 cannot be inserted
+	 * due to max_entries limit
+	 */
+	key = 0;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == E2BIG);
+
+	/* check that key = 0 doesn't exist */
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	/* iterate over two elements */
+	while (!bpf_get_next_key(map_fd, &key, &next_key)) {
+		unsigned int cpu;
+		int found;
+
+		assert((expected_key_mask & next_key) == next_key);
+		expected_key_mask &= ~next_key;
+
+		found = 0;
+		for (cpu = 0; cpu < nr_cpus; cpu++) {
+			if (!bpf_lookup_percpu_elem(map_fd, &next_key,
+						    &value, cpu)) {
+				if (value == 1234) {
+					assert(!found);
+					found = 1;
+				} else {
+					assert(value == 0);
+				}
+			} else {
+				assert(errno == ENXIO);
+			}
+		}
+		assert(found);
+
+		key = next_key;
+	}
+	assert(errno == ENOENT);
+
+	/* Look up the value with an invalid CPU */
+	key = 1;
+	assert(bpf_lookup_percpu_elem(map_fd, &key, &value, nr_cpus) == -1 &&
+	       errno == EINVAL);
+
+	/* Update with BPF_EXIST */
+	key = 1;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == 0);
+
+	/* delete both elements */
+	key = 1;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	key = 2;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	key = 0;
+	/* check that map is empty */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == -1 &&
+	       errno == ENOENT);
+	close(map_fd);
+}
+
 static void test_arraymap_sanity(int i, void *data)
 {
 	int key, next_key, map_fd;
@@ -282,6 +394,7 @@ static void test_map_parallel(void)
 int main(void)
 {
 	test_hashmap_sanity(0, NULL);
+	test_percpu_hashmap_sanity(0, NULL);
 	test_arraymap_sanity(0, NULL);
 	test_map_large();
 	test_map_parallel();
-- 
2.5.1


* Re: [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH
  2016-01-12  8:20 [PATCH v2 net-next 0/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
                   ` (3 preceding siblings ...)
  2016-01-12  8:20 ` [PATCH v2 net-next 4/4] bpf: bpf_htab: Test for BPF_MAP_TYPE_PERCPU_HASH Martin KaFai Lau
@ 2016-01-12 21:45 ` David Miller
  4 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2016-01-12 21:45 UTC (permalink / raw)
  To: kafai; +Cc: netdev, linux-kernel, alexei.starovoitov, tom.leiming, kernel-team

From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 12 Jan 2016 00:20:48 -0800

> BPF + kprobe is very useful in statistics collection.  In particular,
> bpf is strong in doing aggregation within the kernel instead of
> outputting a lot of raw samples to the userspace.
> 
> In some cases, bumping a counter/value of a particular key will have
> noticeable impact.  For example, doing statistics collection
> on received packets and aggregating them by network
> prefix (like /64 in IPv6).  Having a percpu value can help.

Please:

	http://marc.info/?l=linux-netdev&m=145248145925834&w=2

Thanks.


* Re: [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-12  8:20 ` [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key Martin KaFai Lau
@ 2016-01-13  2:24   ` Eric Dumazet
  2016-01-13  2:42   ` Ming Lei
  1 sibling, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2016-01-13  2:24 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: netdev, linux-kernel, Alexei Starovoitov, Ming Lei, FB Kernel Team

On Tue, 2016-01-12 at 00:20 -0800, Martin KaFai Lau wrote:
> Add map_lookup_percpu_elem() syscall to lookup value
> of a particular CPU.

...

> +static int map_lookup_percpu_elem(union bpf_attr *attr)
> +{
> +	void __user *ukey = u64_to_ptr(attr->key);
> +	void __user *uvalue = u64_to_ptr(attr->value);
> +	u32 __user ucpu = attr->cpu;
> +	int ufd = attr->map_fd;
> +	struct percpu_map_value_info value_info;
> +	struct bpf_map *map;
> +	void *key, *value;
> +	struct fd f;
> +	int err;
> +
> +	if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
> +	    ucpu >= num_possible_cpus())
> +		return -EINVAL;

Arg, num_possible_cpus() is the wrong call here.

You probably want to use    (ucpu >= nr_cpu_ids)

Hint: you can have hosts with 2 possible CPUs, CPU0 and CPU2.

So the userland loop in your changelog is wrong.
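
In other words, possible cpu ids are not necessarily contiguous, so
the check would have to be something like this (untested sketch):

	if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
	    ucpu >= nr_cpu_ids || !cpu_possible(ucpu))
		return -EINVAL;

and userspace would have to iterate cpu ids up to the last possible
one (e.g. parsed from /sys/devices/system/cpu/possible) rather than
sysconf(_SC_NPROCESSORS_CONF).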


* Re: [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-12  8:20 ` [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key Martin KaFai Lau
  2016-01-13  2:24   ` Eric Dumazet
@ 2016-01-13  2:42   ` Ming Lei
  2016-01-13  5:23     ` Alexei Starovoitov
  1 sibling, 1 reply; 12+ messages in thread
From: Ming Lei @ 2016-01-13  2:42 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Network Development, Linux Kernel Mailing List,
	Alexei Starovoitov, FB Kernel Team

On Tue, Jan 12, 2016 at 4:20 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> Add map_lookup_percpu_elem() syscall to lookup value
> of a particular CPU.
>
> The user is expected to loop through all CPU values by:
> for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {
>         if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu))

Thinking about it further, the above way cannot work well: the
iteration may take a long time to collect the value from each CPU,
and by the end the accumulated value has been stale for a long time.

A syscall is often a preemption point.

>                 total_value += value;
> }
>
> * If the cpu is offline, errno == ENXIO
> * If the key is deleted before the iteration is done,
>   errno == ENOENT.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/uapi/linux/bpf.h |  2 ++
>  kernel/bpf/syscall.c     | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 95 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 43ae40c..2c73c09 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -73,6 +73,7 @@ enum bpf_cmd {
>         BPF_PROG_LOAD,
>         BPF_OBJ_PIN,
>         BPF_OBJ_GET,
> +       BPF_MAP_LOOKUP_PERCPU_ELEM,
>  };
>
>  enum bpf_map_type {
> @@ -115,6 +116,7 @@ union bpf_attr {
>                         __aligned_u64 next_key;
>                 };
>                 __u64           flags;
> +               __u32           cpu;
>         };
>
>         struct { /* anonymous struct used by BPF_PROG_LOAD command */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 6373970..ba1172b 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -289,6 +289,96 @@ err_put:
>         return err;
>  }
>
> +/* last field in 'union bpf_attr' used by this command */
> +#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu
> +
> +struct percpu_map_value_info {
> +       struct bpf_map *map;
> +       void *key;
> +       void *value;
> +       bool found;
> +};
> +
> +static void percpu_map_lookup_value(void *i)
> +{
> +       struct percpu_map_value_info *info = (struct percpu_map_value_info *)i;
> +       struct bpf_map *map = info->map;
> +       void *ptr;
> +
> +       rcu_read_lock();
> +       ptr = map->ops->map_lookup_elem(map, info->key);
> +       if (ptr) {
> +               memcpy(info->value, ptr, map->value_size);
> +               info->found = true;
> +       } else {
> +               info->found = false;
> +       }
> +       rcu_read_unlock();
> +}
> +
> +static int map_lookup_percpu_elem(union bpf_attr *attr)
> +{
> +       void __user *ukey = u64_to_ptr(attr->key);
> +       void __user *uvalue = u64_to_ptr(attr->value);
> +       u32 __user ucpu = attr->cpu;
> +       int ufd = attr->map_fd;
> +       struct percpu_map_value_info value_info;
> +       struct bpf_map *map;
> +       void *key, *value;
> +       struct fd f;
> +       int err;
> +
> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
> +           ucpu >= num_possible_cpus())
> +               return -EINVAL;
> +
> +       f = fdget(ufd);
> +       map = __bpf_map_get(f);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);
> +
> +       err = -ENOMEM;
> +       key = kmalloc(map->key_size, GFP_USER);

The kmalloc may trigger memory reclaim and take dozens of seconds
or more.

> +       if (!key)
> +               goto err_put;
> +
> +       err = -EFAULT;
> +       if (copy_from_user(key, ukey, map->key_size) != 0)
> +               goto free_key;
> +
> +       err = -ENOMEM;
> +       value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);

Same with above.

> +       if (!value)
> +               goto free_key;
> +
> +       value_info.map = map;
> +       value_info.key = key;
> +       value_info.value = value;
> +
> +       err = smp_call_function_single(ucpu, percpu_map_lookup_value,
> +                                      &value_info, 1);

It is slow too.

> +       if (err)
> +               goto free_value;
> +
> +       err = -ENOENT;
> +       if (!value_info.found)
> +               goto free_value;
> +
> +       err = -EFAULT;
> +       if (copy_to_user(uvalue, value, map->value_size) != 0)
> +               goto free_value;

page fault may be triggered.

> +
> +       err = 0;
> +
> +free_value:
> +       kfree(value);
> +free_key:
> +       kfree(key);
> +err_put:
> +       fdput(f);
> +       return err;
> +}
> +
>  #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
>
>  static int map_update_elem(union bpf_attr *attr)
> @@ -792,6 +882,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_OBJ_GET:
>                 err = bpf_obj_get(&attr);
>                 break;
> +       case BPF_MAP_LOOKUP_PERCPU_ELEM:
> +               err = map_lookup_percpu_elem(&attr);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;

So I don't think it is good to retrieve the value from one CPU per
system call and accumulate them finally in userspace.

One approach I thought of is to define a function (or something like it)

 handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)

in the bpf kernel code for collecting the value from each cpu and
accumulating it into 'val_total'; in most situations the function can
be implemented without a loop.  The kernel can call this function
directly, and the total value can be returned to userspace by one
single syscall.
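
For concreteness, such a callback could live in the bpf program
itself, roughly like the following sketch (the prog type / section
name is hypothetical; no such prog type exists today):

/* Hypothetical: the kernel would invoke this once per possible cpu
 * while walking the percpu values of one key, then copy *val_total
 * back to userspace in a single syscall.
 */
SEC("percpu_reduce")
int handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
{
	*(u64 *)val_total += *(u64 *)val_cpu;
	return 0;
}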

Alexei and anyone, could you comment on this draft idea for the
percpu map?

Thanks,
Ming Lei


* Re: [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-13  2:42   ` Ming Lei
@ 2016-01-13  5:23     ` Alexei Starovoitov
  2016-01-13 15:43       ` Ming Lei
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2016-01-13  5:23 UTC (permalink / raw)
  To: Ming Lei
  Cc: Martin KaFai Lau, Network Development, Linux Kernel Mailing List,
	FB Kernel Team

On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
> 
> So I don't think it is good to retrieve value from one CPU via one
> single system call, and accumulate them finally in userspace.
> 
> One approach I thought of is to define the function(or sort of)
> 
>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
> 
> in bpf kernel code for collecting value from each cpu and
> accumulating them into 'val_total', and most of situations, the
> function can be implemented without loop most of situations.
> kernel can call this function directly, and the total value can be
> return to userspace by one single syscall.
> 
> Alexei and anyone, could you comment on this draft idea for
> perpcu map?

I'm not sure how you expect user space to specify such a callback.
The kernel cannot execute user code.
Also, syscall/malloc/etc is noise compared to the ipi, and the ipi
will still be there, so
for (all cpus) { syscall+ipi; } will have the same speed.
I think in this use case the overhead of the ipi is justified,
since user space needs to read accurate numbers, otherwise
the whole per-cpu map is not very useful.  One can just use a
normal hash map and do a normal increment.  All cpus will race
and the counter may contain complete garbage, but in some
cases such rough counters are actually good enough.
Here the per-cpu hash gives fast performance and _accurate_
numbers to userspace.
Having said that, if you see a way to avoid the ipi and still
get correct numbers to user space, it would be great.
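
(The racy variant referred to above is the usual pattern on a plain
BPF_MAP_TYPE_HASH, roughly:

	long *val = bpf_map_lookup_elem(&map, &key);
	if (val)
		(*val)++;	/* non-atomic read-modify-write; cpus race, updates can be lost */

whereas with the per-cpu map the same increment only ever touches this
cpu's private copy.)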


* Re: [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-13  5:23     ` Alexei Starovoitov
@ 2016-01-13 15:43       ` Ming Lei
  2016-01-14  1:24         ` Alexei Starovoitov
  0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2016-01-13 15:43 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Martin KaFai Lau, Network Development, Linux Kernel Mailing List,
	FB Kernel Team

On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
>>
>> So I don't think it is good to retrieve value from one CPU via one
>> single system call, and accumulate them finally in userspace.
>>
>> One approach I thought of is to define the function(or sort of)
>>
>>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
>>
>> in bpf kernel code for collecting value from each cpu and
>> accumulating them into 'val_total', and most of situations, the
>> function can be implemented without loop most of situations.
>> kernel can call this function directly, and the total value can be
>> return to userspace by one single syscall.
>>
>> Alexei and anyone, could you comment on this draft idea for
>> perpcu map?
>
> I'm not sure how you expect user space to specify such callback.
> Kernel cannot execute user code.

I mean the above callback function can be built into the bpf code and
then run from the kernel after loading, like in the packet filter case
with tcpdump; maybe one new prog type is needed.  It is doable in
theory.  I need to investigate a bit to understand how it can be
called from the kernel, and it might be OK to call it via kprobe, but
that is not elegant just for accumulating the value from each CPU.

> Also syscall/malloc/etc is a noise comparing to ipi and it
> will still be there, so
> for(all cpus) { syscall+ipi;} will have the same speed.

In the syscall path there are lots of slow things, so the accumulated
value is often stale, may not represent an accurate number at any
point in time, and can be thought of as invalid.

> I think in this use case the overhead of ipi is justified,
> since user space needs to read accurate numbers otherwise
> the whole per-cpu is not very useful. One can just use
> normal hash map and do normal increment. All cpus will race
> and the counter may contain complete garbage, but in some
> cases such rough counters are actually good enough.
> Here per-cpu hash gives fast performance and _accurate_
> numbers to userspace.

IMO the 'accurate' value on each CPU at different times doesn't
matter; from the user's view, what they care about is the
accurate/final number accumulated from all CPUs at the specified
time.

> Having said that if you see a way to avoid ipi and still
> get correct numbers to user space, it would be great.

As I mentioned in another thread, the current non-percpu map has the
same problem too, between the lookup-element syscall and the element
update in the eBPF prog.

-- 
Ming Lei


* Re: [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-13 15:43       ` Ming Lei
@ 2016-01-14  1:24         ` Alexei Starovoitov
  2016-01-14  3:23           ` Ming Lei
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2016-01-14  1:24 UTC (permalink / raw)
  To: Ming Lei
  Cc: Martin KaFai Lau, Network Development, Linux Kernel Mailing List,
	FB Kernel Team

On Wed, Jan 13, 2016 at 11:43:50PM +0800, Ming Lei wrote:
> On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
> >>
> >> So I don't think it is good to retrieve value from one CPU via one
> >> single system call, and accumulate them finally in userspace.
> >>
> >> One approach I thought of is to define the function(or sort of)
> >>
> >>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
> >>
> >> in bpf kernel code for collecting value from each cpu and
> >> accumulating them into 'val_total', and most of situations, the
> >> function can be implemented without loop most of situations.
> >> kernel can call this function directly, and the total value can be
> >> return to userspace by one single syscall.
> >>
> >> Alexei and anyone, could you comment on this draft idea for
> >> perpcu map?
> >
> > I'm not sure how you expect user space to specify such callback.
> > Kernel cannot execute user code.
> 
> I mean the above callback function can be built into bpf code and then
> run from kernel after loading like in packet filter case by tcpdump, maybe
> one new prog type is needed. It is doable in theroy. I need to investigate
> a bit to understand how it can be called from kernel, and it might be OK
> to call it via kprobe, but not elegent just for accumulating value from each
> CPU.

that would be a total overkill.

> > Also syscall/malloc/etc is a noise comparing to ipi and it
> > will still be there, so
> > for(all cpus) { syscall+ipi;} will have the same speed.
> 
> In the syscall path, lots of slow things, and finally the accumulated
> value is often stale and may not reprensent accurate number at any
> time, and can be thought as invalid.

no. stale != invalid.
Some analytics/monitoring applications are fine with ballpark numbers,
and for them a regular hash map with non-atomic increments is good
enough, but others need accurate numbers, even though they may be
seconds stale.


* Re: [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate percpu value of a key
  2016-01-14  1:24         ` Alexei Starovoitov
@ 2016-01-14  3:23           ` Ming Lei
  0 siblings, 0 replies; 12+ messages in thread
From: Ming Lei @ 2016-01-14  3:23 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Martin KaFai Lau, Network Development, Linux Kernel Mailing List,
	FB Kernel Team

On Thu, Jan 14, 2016 at 9:24 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Wed, Jan 13, 2016 at 11:43:50PM +0800, Ming Lei wrote:
>> On Wed, Jan 13, 2016 at 1:23 PM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> > On Wed, Jan 13, 2016 at 10:42:49AM +0800, Ming Lei wrote:
>> >>
>> >> So I don't think it is good to retrieve value from one CPU via one
>> >> single system call, and accumulate them finally in userspace.
>> >>
>> >> One approach I thought of is to define the function(or sort of)
>> >>
>> >>  handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
>> >>
>> >> in bpf kernel code for collecting value from each cpu and
>> >> accumulating them into 'val_total', and most of situations, the
>> >> function can be implemented without loop most of situations.
>> >> kernel can call this function directly, and the total value can be
>> >> return to userspace by one single syscall.
>> >>
>> >> Alexei and anyone, could you comment on this draft idea for
>> >> perpcu map?
>> >
>> > I'm not sure how you expect user space to specify such callback.
>> > Kernel cannot execute user code.
>>
>> I mean the above callback function can be built into bpf code and then
>> run from kernel after loading like in packet filter case by tcpdump, maybe
>> one new prog type is needed. It is doable in theroy. I need to investigate
>> a bit to understand how it can be called from kernel, and it might be OK
>> to call it via kprobe, but not elegent just for accumulating value from each
>> CPU.
>
> that would be a total overkill.

With the current bpf framework, it isn't difficult to implement such a
prog type for percpu, and it is still reasonable because accumulating
values from each CPU is a policy and should be defined/provided by the
bpf code in the percpu map case.

>
>> > Also syscall/malloc/etc is a noise comparing to ipi and it
>> > will still be there, so
>> > for(all cpus) { syscall+ipi;} will have the same speed.
>>
>> In the syscall path, lots of slow things, and finally the accumulated
>> value is often stale and may not reprensent accurate number at any
>> time, and can be thought as invalid.
>
> no. stale != invalid.
> Some analytics/monitor applications are good with ball park numbers
> and for them regular hash map with non-atomic increment is good enough,
> but others need accurate numbers. Even though they may be seconds stale.

Let me make it clear; the following is one case of collecting the
received packet length on each CPU.

time: ----T0--------T1--------T2--------T3--------T4----
CPU0:       E0(0) E0(1) .............. E0(2M)
CPU1:                 E1(0) E1(1) .... E1(1M)
CPU2:                           E2(0) ... E2(10K)
CPU3:                                     E3(0)...E3(1K)

          R0        R1        R2        R3        A4

1) Suppose the system has 4 cpu cores.

2) There is one single event (suppose it is packets received) we are
interested in; if the event happens on CPU(i), we add the received
packet's length to the value of CPU(i).

3) E0(i) represents the ith event that happened on CPU0 from T0, which
is recorded into the value of CPU(0); E1(i) represents the ith event
on CPU(1) from T1, ....

4) R0 represents reading the value of CPU0 at time T0, and R1
represents reading the value of CPU1 at time T1, .....

5) A4 represents accumulating all percpu values into one total value
at time T4, so value(A4) = value(R0) + value(R1) + value(R2) +
value(R3).

6) If we use a syscall to implement Ri (i = 0...3), the period between
T(i) and T(i+1) can become quite big, for example dozens of seconds,
so the accumulated value in A4 can't represent the actual/correct
value (counter) at any time between T0 and T4; the value is actually
wrong, because all events in the above diagram (E0(0)~E0(2M),
E1(0)~E1(1M), E2(0)~E2(10K), ...) aren't counted at all, and the
missed number can be quite huge.

So does the value obtained by A4 make sense for the user?


-- 
Ming Lei

