* [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
@ 2019-01-31  9:38 Martynas Pumputis
  2019-01-31 18:36 ` Alexei Starovoitov
  2019-01-31 20:04 ` Jakub Kicinski
  0 siblings, 2 replies; 4+ messages in thread
From: Martynas Pumputis @ 2019-01-31  9:38 UTC (permalink / raw)
  To: netdev; +Cc: ys114321, ast, daniel, m, Yonghong Song

Previously, memory allocated for a map was not accounted, and therefore
it could not be taken into consideration by the cgroup memory
controller.

This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
memory accounting for a map. The flag can be set in "map_flags" when
the map is created ("BPF_MAP_CREATE").

When enabled, we account only the amount of memory which is charged
against the "RLIMIT_MEMLOCK" limit.
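
For reference, a minimal userspace sketch (not part of this patch; the
helper name is made up for illustration) of setting the new flag through
the raw bpf(2) syscall. The test program further down uses the
bpf_create_map() wrapper instead, passing the flag as its last argument:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Create a hash map whose allocations are accounted to the
     * caller's memory cgroup via BPF_F_ACCOUNT_MEM.
     */
    static int create_accounted_map(void)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_type    = BPF_MAP_TYPE_HASH;
        attr.key_size    = 8;
        attr.value_size  = 16;
        attr.max_entries = 65536;
        attr.map_flags   = BPF_F_ACCOUNT_MEM;

        return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }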

To validate the change, first we create the memory cgroup-v1 "test-map":

    # mkdir /sys/fs/cgroup/memory/test-map

And then we run the following program against the cgroup:

    $ cat test_map.c
    <..>
    int main() {
        usleep(3 * 1000000);
        assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, 0) > 0);
        usleep(3 * 1000000);
    }
    # cgexec -g memory:test-map ./test_map &
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    397312
    258048

    <after 3 sec the map has been created>

    # bpftool map list
    19: hash  flags 0x0
        key 8B  value 16B  max_entries 65536  memlock 5771264B
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    401408
    262144

As we can see, the memory allocated for the map is not accounted, as
397312B + 5771264B > 401408B.

Next, we enable the accounting and re-run the test:

    $ cat test_map.c
    <..>
    int main() {
        usleep(3 * 1000000);
        assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, BPF_F_ACCOUNT_MEM) > 0);
        usleep(3 * 1000000);
    }
    # cgexec -g memory:test-map ./test_map &
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    450560
    307200

    <after 3 sec the map has been created>

    # bpftool map list
    20: hash  flags 0x80
        key 8B  value 16B  max_entries 65536  memlock 5771264B
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    6221824
    6078464

This time, the memory (including kmem) is accounted, as
450560B + 5771264B <= 6221824B.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Martynas Pumputis <m@lambda.lt>
---
 include/linux/bpf.h            |  5 +++--
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/arraymap.c          | 14 +++++++++-----
 kernel/bpf/bpf_lru_list.c      | 11 +++++++++--
 kernel/bpf/bpf_lru_list.h      |  1 +
 kernel/bpf/cpumap.c            | 12 +++++++++---
 kernel/bpf/devmap.c            | 10 ++++++++--
 kernel/bpf/hashtab.c           | 19 ++++++++++++++-----
 kernel/bpf/lpm_trie.c          | 19 +++++++++++++------
 kernel/bpf/queue_stack_maps.c  |  5 +++--
 kernel/bpf/reuseport_array.c   |  3 ++-
 kernel/bpf/stackmap.c          | 12 ++++++++----
 kernel/bpf/syscall.c           | 12 ++++++++----
 kernel/bpf/xskmap.c            |  9 +++++++--
 net/core/sock_map.c            | 13 +++++++++----
 tools/include/uapi/linux/bpf.h |  4 ++++
 16 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e734f163bd0b..353a3f4304fe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -79,7 +79,8 @@ struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	bool unpriv_array;
-	/* 55 bytes hole */
+	bool account_mem;
+	/* 54 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -506,7 +507,7 @@ void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
-void *bpf_map_area_alloc(size_t size, int numa_node);
+void *bpf_map_area_alloc(size_t size, int numa_node, bool account_mem);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91c43884f295..cff34678caf6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -291,6 +291,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED		(1U << 6)
 
+/* Enable memory accounting for map (only for BPF_MAP_CREATE) */
+#define BPF_F_ACCOUNT_MEM	(1U << 7)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..86417f2e6f1b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -34,14 +34,17 @@ static void bpf_array_free_percpu(struct bpf_array *array)
 	}
 }
 
-static int bpf_array_alloc_percpu(struct bpf_array *array)
+static int bpf_array_alloc_percpu(struct bpf_array *array, bool account_mem)
 {
 	void __percpu *ptr;
+	gfp_t gfp = GFP_USER | __GFP_NOWARN;
 	int i;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	for (i = 0; i < array->map.max_entries; i++) {
-		ptr = __alloc_percpu_gfp(array->elem_size, 8,
-					 GFP_USER | __GFP_NOWARN);
+		ptr = __alloc_percpu_gfp(array->elem_size, 8, gfp);
 		if (!ptr) {
 			bpf_array_free_percpu(array);
 			return -ENOMEM;
@@ -82,6 +85,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	u32 elem_size, index_mask, max_entries;
 	bool unpriv = !capable(CAP_SYS_ADMIN);
 	u64 cost, array_size, mask64;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	struct bpf_array *array;
 
 	elem_size = round_up(attr->value_size, 8);
@@ -129,7 +133,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(ret);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size, numa_node);
+	array = bpf_map_area_alloc(array_size, numa_node, account_mem);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
 	array->index_mask = index_mask;
@@ -140,7 +144,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.pages = cost;
 	array->elem_size = elem_size;
 
-	if (percpu && bpf_array_alloc_percpu(array)) {
+	if (percpu && bpf_array_alloc_percpu(array, account_mem)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..4d58537e0af2 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -7,6 +7,7 @@
 #include <linux/cpumask.h>
 #include <linux/spinlock.h>
 #include <linux/percpu.h>
+#include <linux/gfp.h>
 
 #include "bpf_lru_list.h"
 
@@ -646,12 +647,17 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+		 bool account_mem,
 		 del_from_htab_func del_from_htab, void *del_arg)
 {
+	gfp_t gfp = GFP_KERNEL;
 	int cpu;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	if (percpu) {
-		lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
+		lru->percpu_lru = alloc_percpu_gfp(struct bpf_lru_list, gfp);
 		if (!lru->percpu_lru)
 			return -ENOMEM;
 
@@ -665,7 +671,8 @@ int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
 	} else {
 		struct bpf_common_lru *clru = &lru->common_lru;
 
-		clru->local_list = alloc_percpu(struct bpf_lru_locallist);
+		clru->local_list = alloc_percpu_gfp(struct bpf_lru_locallist,
+						    gfp);
 		if (!clru->local_list)
 			return -ENOMEM;
 
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..89566665592b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -74,6 +74,7 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+		 bool account_mem,
 		 del_from_htab_func del_from_htab, void *delete_arg);
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
 		      u32 elem_size, u32 nr_elems);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..1e84bf78716e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -81,6 +81,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	struct bpf_cpu_map *cmap;
 	int err = -ENOMEM;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp = GFP_KERNEL;
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -117,16 +119,20 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 		goto free_cmap;
 	}
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
 	/* A per cpu bitfield with a bit per possible CPU in map  */
-	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	cmap->flush_needed = __alloc_percpu_gfp(cpu_map_bitmap_size(attr),
+					    __alignof__(unsigned long),
+					    gfp);
 	if (!cmap->flush_needed)
 		goto free_cmap;
 
 	/* Alloc array for possible remote "destination" CPUs */
 	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
 					   sizeof(struct bpf_cpu_map_entry *),
-					   cmap->map.numa_node);
+					   cmap->map.numa_node,
+					   account_mem);
 	if (!cmap->cpu_map)
 		goto free_percpu;
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..acfc1b35aa51 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -90,6 +90,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	struct bpf_dtab *dtab;
 	int err = -EINVAL;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -120,16 +122,20 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 
+	gfp = GFP_KERNEL | __GFP_NOWARN;
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
 						__alignof__(unsigned long),
-						GFP_KERNEL | __GFP_NOWARN);
+						gfp);
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
-					      dtab->map.numa_node);
+					      dtab->map.numa_node,
+					      account_mem);
 	if (!dtab->netdev_map)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..fc2f44451256 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,6 +23,7 @@
 
 #define HTAB_CREATE_FLAG_MASK						\
 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
+	 BPF_F_ACCOUNT_MEM |						\
 	 BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
 
 struct bucket {
@@ -139,27 +140,32 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 	return NULL;
 }
 
-static int prealloc_init(struct bpf_htab *htab)
+static int prealloc_init(struct bpf_htab *htab, bool account_mem)
 {
 	u32 num_entries = htab->map.max_entries;
+	gfp_t gfp = GFP_USER | __GFP_NOWARN;
 	int err = -ENOMEM, i;
 
 	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
 		num_entries += num_possible_cpus();
 
 	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
-					 htab->map.numa_node);
+					 htab->map.numa_node,
+					 account_mem);
 	if (!htab->elems)
 		return -ENOMEM;
 
 	if (!htab_is_percpu(htab))
 		goto skip_percpu_elems;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	for (i = 0; i < num_entries; i++) {
 		u32 size = round_up(htab->map.value_size, 8);
 		void __percpu *pptr;
 
-		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+		pptr = __alloc_percpu_gfp(size, 8, gfp);
 		if (!pptr)
 			goto free_elems;
 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
@@ -173,6 +179,7 @@ static int prealloc_init(struct bpf_htab *htab)
 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
 				   offsetof(struct htab_elem, hash) -
 				   offsetof(struct htab_elem, lru_node),
+				   account_mem,
 				   htab_lru_map_delete_node,
 				   htab);
 	else
@@ -313,6 +320,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	 */
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	struct bpf_htab *htab;
 	int err, i;
 	u64 cost;
@@ -374,7 +382,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	err = -ENOMEM;
 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
 					   sizeof(struct bucket),
-					   htab->map.numa_node);
+					   htab->map.numa_node,
+					   account_mem);
 	if (!htab->buckets)
 		goto free_htab;
 
@@ -389,7 +398,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	}
 
 	if (prealloc) {
-		err = prealloc_init(htab);
+		err = prealloc_init(htab, account_mem);
 		if (err)
 			goto free_buckets;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index abf1002080df..8421fdb816f3 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -277,16 +277,19 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 }
 
 static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
-						 const void *value)
+						 const void *value,
+						 bool account_mem)
 {
 	struct lpm_trie_node *node;
 	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 
 	if (value)
 		size += trie->map.value_size;
 
-	node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
-			    trie->map.numa_node);
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+	node = kmalloc_node(size, gfp, trie->map.numa_node);
 	if (!node)
 		return NULL;
 
@@ -327,7 +330,7 @@ static int trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	new_node = lpm_trie_node_alloc(trie, value);
+	new_node = lpm_trie_node_alloc(trie, value, map->account_mem);
 	if (!new_node) {
 		ret = -ENOMEM;
 		goto out;
@@ -394,7 +397,7 @@ static int trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	im_node = lpm_trie_node_alloc(trie, NULL);
+	im_node = lpm_trie_node_alloc(trie, NULL, map->account_mem);
 	if (!im_node) {
 		ret = -ENOMEM;
 		goto out;
@@ -542,6 +545,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp = GFP_USER | __GFP_NOWARN;
 	u64 cost = sizeof(*trie), cost_per_node;
 	int ret;
 
@@ -558,7 +563,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	    attr->value_size > LPM_VAL_SIZE_MAX)
 		return ERR_PTR(-EINVAL);
 
-	trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+	trie = kzalloc(sizeof(*trie), gfp);
 	if (!trie)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index b384ea9f3254..040ec350af3d 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -11,7 +11,7 @@
 #include "percpu_freelist.h"
 
 #define QUEUE_STACK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ACCOUNT_MEM)
 
 
 struct bpf_queue_stack {
@@ -69,6 +69,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_queue_stack *qs;
 	u64 size, queue_size, cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
 	size = (u64) attr->max_entries + 1;
 	cost = queue_size = sizeof(*qs) + size * attr->value_size;
@@ -81,7 +82,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 	if (ret < 0)
 		return ERR_PTR(ret);
 
-	qs = bpf_map_area_alloc(queue_size, numa_node);
+	qs = bpf_map_area_alloc(queue_size, numa_node, account_mem);
 	if (!qs)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..a9a2709c7507 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -152,6 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
 	u64 cost, array_size;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -170,7 +171,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 		return ERR_PTR(err);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size, numa_node);
+	array = bpf_map_area_alloc(array_size, numa_node, account_mem);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d43b14535827..46d37c7e09a2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -61,13 +61,15 @@ static inline int stack_map_data_size(struct bpf_map *map)
 		sizeof(struct bpf_stack_build_id) : sizeof(u64);
 }
 
-static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap,
+				       bool account_mem)
 {
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
 	int err;
 
 	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
-					 smap->map.numa_node);
+					 smap->map.numa_node,
+					 account_mem);
 	if (!smap->elems)
 		return -ENOMEM;
 
@@ -90,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	u32 value_size = attr->value_size;
 	struct bpf_stack_map *smap;
 	u64 cost, n_buckets;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -119,7 +122,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-E2BIG);
 
-	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
+	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr),
+				  account_mem);
 	if (!smap)
 		return ERR_PTR(-ENOMEM);
 
@@ -141,7 +145,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_smap;
 
-	err = prealloc_elems_and_freelist(smap);
+	err = prealloc_elems_and_freelist(smap, account_mem);
 	if (err)
 		goto put_buffers;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..13f2e1731a47 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -131,25 +131,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 		return map;
 	map->ops = ops;
 	map->map_type = type;
+	map->account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	return map;
 }
 
-void *bpf_map_area_alloc(size_t size, int numa_node)
+void *bpf_map_area_alloc(size_t size, int numa_node, bool account_mem)
 {
 	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
 	 * trigger under memory pressure as we really just want to
 	 * fail instead.
 	 */
-	const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+	gfp_t gfp = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
 	void *area;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		area = kmalloc_node(size, GFP_USER | flags, numa_node);
+		area = kmalloc_node(size, GFP_USER | gfp, numa_node);
 		if (area != NULL)
 			return area;
 	}
 
-	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | gfp,
 					   __builtin_return_address(0));
 }
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..bbc1f142326f 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -20,6 +20,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	int cpu, err = -EINVAL;
 	struct xsk_map *m;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp = GFP_KERNEL;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -49,7 +51,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 
-	m->flush_list = alloc_percpu(struct list_head);
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+	m->flush_list = alloc_percpu_gfp(struct list_head, gfp);
 	if (!m->flush_list)
 		goto free_m;
 
@@ -58,7 +62,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
 					sizeof(struct xdp_sock *),
-					m->map.numa_node);
+					m->map.numa_node,
+					account_mem);
 	if (!m->xsk_map)
 		goto free_percpu;
 	return &m->map;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..eefcfd1294c0 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -18,13 +18,15 @@ struct bpf_stab {
 	raw_spinlock_t lock;
 };
 
-#define SOCK_CREATE_FLAG_MASK				\
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define SOCK_CREATE_FLAG_MASK					\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
+	 BPF_F_ACCOUNT_MEM)
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_stab *stab;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	int err;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -56,7 +58,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	stab->sks = bpf_map_area_alloc(stab->map.max_entries *
 				       sizeof(struct sock *),
-				       stab->map.numa_node);
+				       stab->map.numa_node,
+				       account_mem);
 	if (stab->sks)
 		return &stab->map;
 	err = -ENOMEM;
@@ -788,6 +791,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 	struct bpf_htab *htab;
 	int i, err;
 	u64 cost;
+	bool account = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -823,7 +827,8 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 
 	htab->buckets = bpf_map_area_alloc(htab->buckets_num *
 					   sizeof(struct bpf_htab_bucket),
-					   htab->map.numa_node);
+					   htab->map.numa_node,
+					   account);
 	if (!htab->buckets) {
 		err = -ENOMEM;
 		goto free_htab;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 91c43884f295..a2b6cb200c9b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -279,6 +279,7 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+
 #define BPF_OBJ_NAME_LEN 16U
 
 /* Flags for accessing BPF object */
@@ -291,6 +292,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED		(1U << 6)
 
+/* Enable memory accounting for map (only for BPF_MAP_CREATE) */
+#define BPF_F_ACCOUNT_MEM	(1U << 7)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
-- 
2.20.1



* Re: [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
  2019-01-31  9:38 [PATCH v2 bpf-next] bpf: add optional memory accounting for maps Martynas Pumputis
@ 2019-01-31 18:36 ` Alexei Starovoitov
  2019-01-31 20:04 ` Jakub Kicinski
  1 sibling, 0 replies; 4+ messages in thread
From: Alexei Starovoitov @ 2019-01-31 18:36 UTC (permalink / raw)
  To: Martynas Pumputis; +Cc: netdev, ys114321, ast, daniel, Yonghong Song

On Thu, Jan 31, 2019 at 10:38:01AM +0100, Martynas Pumputis wrote:
> Previously, memory allocated for a map was not accounted, and therefore
> it could not be taken into consideration by the cgroup memory
> controller.
> 
> This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
> memory accounting for a map. The flag can be set in "map_flags" when
> the map is created ("BPF_MAP_CREATE").
> 
> When enabled, we account only the amount of memory which is charged
> against the "RLIMIT_MEMLOCK" limit.
> 
> To validate the change, first we create the memory cgroup-v1 "test-map":
> 
>     # mkdir /sys/fs/cgroup/memory/test-map
> 
> And then we run the following program against the cgroup:
> 
>     $ cat test_map.c
>     <..>
>     int main() {
>         usleep(3 * 1000000);
>         assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, 0) > 0);
>         usleep(3 * 1000000);
>     }
>     # cgexec -g memory:test-map ./test_map &
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     397312
>     258048
> 
>     <after 3 sec the map has been created>
> 
>     # bpftool map list
>     19: hash  flags 0x0
>         key 8B  value 16B  max_entries 65536  memlock 5771264B
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     401408
>     262144
> 
> As we can see, the memory allocated for the map is not accounted, as
> 397312B + 5771264B > 401408B.
> 
> Next, we enable the accounting and re-run the test:
> 
>     $ cat test_map.c
>     <..>
>     int main() {
>         usleep(3 * 1000000);
>         assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, BPF_F_ACCOUNT_MEM) > 0);
>         usleep(3 * 1000000);
>     }
>     # cgexec -g memory:test-map ./test_map &
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     450560
>     307200
> 
>     <after 3 sec the map has been created>
> 
>     # bpftool map list
>     20: hash  flags 0x80
>         key 8B  value 16B  max_entries 65536  memlock 5771264B
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     6221824
>     6078464
> 
> This time, the memory (including kmem) is accounted, as
> 450560B + 5771264B <= 6221824B.
> 
> Acked-by: Yonghong Song <yhs@fb.com>
> Signed-off-by: Martynas Pumputis <m@lambda.lt>

see my reply in the other thread.



* Re: [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
  2019-01-31  9:38 [PATCH v2 bpf-next] bpf: add optional memory accounting for maps Martynas Pumputis
  2019-01-31 18:36 ` Alexei Starovoitov
@ 2019-01-31 20:04 ` Jakub Kicinski
  2019-02-01 13:40   ` Martynas
  1 sibling, 1 reply; 4+ messages in thread
From: Jakub Kicinski @ 2019-01-31 20:04 UTC (permalink / raw)
  To: Martynas Pumputis; +Cc: netdev, ys114321, ast, daniel, Yonghong Song

On Thu, 31 Jan 2019 10:38:01 +0100, Martynas Pumputis wrote:
> Previously, memory allocated for a map was not accounted, and therefore
> it could not be taken into consideration by the cgroup memory
> controller.
> 
> This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
> memory accounting for a map. The flag can be set in "map_flags" when
> the map is created ("BPF_MAP_CREATE").

What should happen for no-prealloc maps?  Would it make some sense to
charge the max map size to the user and not each allocation?  Or
perhaps remember the owner to be able to charge the data path
allocations which don't happen in process context as well?


* Re: [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
  2019-01-31 20:04 ` Jakub Kicinski
@ 2019-02-01 13:40   ` Martynas
  0 siblings, 0 replies; 4+ messages in thread
From: Martynas @ 2019-02-01 13:40 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, ys114321, ast, daniel, Yonghong Song


On Thu, Jan 31, 2019, at 10:04 PM, Jakub Kicinski wrote:
> On Thu, 31 Jan 2019 10:38:01 +0100, Martynas Pumputis wrote:
> > Previously, memory allocated for a map was not accounted, and therefore
> > it could not be taken into consideration by the cgroup memory
> > controller.
> > 
> > This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
> > memory accounting for a map. The flag can be set in "map_flags" when
> > the map is created ("BPF_MAP_CREATE").
> 
> What should happen for no-prealloc maps? 

I wanted to be consistent with "bpf_map_precharge_memlock": since such map elements are not charged against RLIMIT_MEMLOCK, I haven't enabled accounting for them (yet).
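
For context, a simplified sketch (not the verbatim kernel code; the
function name is condensed for illustration) of where such elements come
from: with BPF_F_NO_PREALLOC they are allocated lazily at
bpf_map_update_elem() time, in whatever task context performs the
update, and this patch leaves those allocations without __GFP_ACCOUNT:

    /* Condensed from the BPF_F_NO_PREALLOC update path in
     * kernel/bpf/hashtab.c; names simplified for illustration.
     */
    static struct htab_elem *alloc_elem_sketch(struct bpf_htab *htab)
    {
        /* Runs when an element is updated, possibly from a different
         * task/cgroup than the one that created the map; without
         * __GFP_ACCOUNT it is not charged to any memcg.
         */
        return kmalloc_node(htab->elem_size,
                            GFP_ATOMIC | __GFP_NOWARN,
                            htab->map.numa_node);
    }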

> Would it make some sense to
> charge the max map size to the user and not each allocation? 

Hmm, but that would not reflect the real memory usage, and it could potentially lead to trouble, e.g. a process with tight cgroup memory limits getting OOM'd because we pre-charged for memory it doesn't actually use.

> Or
> perhaps remember the owner to be able to charge the data path
> allocations which don't happen in process context as well?

In the use cases I am aware of, all map updates happen within the same context, i.e. in the same cgroup. So, that cgroup becomes the "owner" of the allocations.

Another issue is when we pin a map (regardless of whether it was prealloc'd or not). In this case, if the process which created the map
gets restarted and continues to use the map, the map's memory is not charged to it. But as long as the process runs in the same cgroup, it should be fine.