* [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
@ 2019-01-31  9:38 Martynas Pumputis
  2019-01-31 18:36 ` Alexei Starovoitov
  2019-01-31 20:04 ` Jakub Kicinski
  0 siblings, 2 replies; 4+ messages in thread
From: Martynas Pumputis @ 2019-01-31  9:38 UTC (permalink / raw)
  To: netdev; +Cc: ys114321, ast, daniel, m, Yonghong Song

Previously, memory allocated for a map was not accounted, and therefore
it could not be taken into consideration by the cgroup memory
controller.

This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
memory accounting for a map. The flag can be set in "map_flags" when
the map is created ("BPF_MAP_CREATE").

When enabled, we account only the amount of memory which is charged
against the "RLIMIT_MEMLOCK" limit.
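
For reference, a minimal userspace sketch (not part of this patch; the
helper name is made up for illustration) of setting the new flag through
the raw bpf(2) syscall. The test program further down uses the
bpf_create_map() wrapper instead, passing the flag as its last argument:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Create a hash map whose allocations are accounted to the
     * caller's memory cgroup via BPF_F_ACCOUNT_MEM.
     */
    static int create_accounted_map(void)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_type    = BPF_MAP_TYPE_HASH;
        attr.key_size    = 8;
        attr.value_size  = 16;
        attr.max_entries = 65536;
        attr.map_flags   = BPF_F_ACCOUNT_MEM;

        return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }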

To validate the change, first we create the memory cgroup-v1 "test-map":

    # mkdir /sys/fs/cgroup/memory/test-map

And then we run the following program against the cgroup:

    $ cat test_map.c
    <..>
    int main() {
        usleep(3 * 1000000);
        assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, 0) > 0);
        usleep(3 * 1000000);
    }
    # cgexec -g memory:test-map ./test_map &
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    397312
    258048

    <after 3 sec the map has been created>

    # bpftool map list
    19: hash  flags 0x0
        key 8B  value 16B  max_entries 65536  memlock 5771264B
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    401408
    262144

As we can see, the memory allocated for the map is not accounted, as
397312B + 5771264B > 401408B.

Next, we enable the accounting and re-run the test:

    $ cat test_map.c
    <..>
    int main() {
        usleep(3 * 1000000);
        assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, BPF_F_ACCOUNT_MEM) > 0);
        usleep(3 * 1000000);
    }
    # cgexec -g memory:test-map ./test_map &
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    450560
    307200

    <after 3 sec the map has been created>

    # bpftool map list
    20: hash  flags 0x80
        key 8B  value 16B  max_entries 65536  memlock 5771264B
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    6221824
    6078464

This time, the memory (including kmem) is accounted, as
450560B + 5771264B <= 6221824B.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Martynas Pumputis <m@lambda.lt>
---
 include/linux/bpf.h            |  5 +++--
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/arraymap.c          | 14 +++++++++-----
 kernel/bpf/bpf_lru_list.c      | 11 +++++++++--
 kernel/bpf/bpf_lru_list.h      |  1 +
 kernel/bpf/cpumap.c            | 12 +++++++++---
 kernel/bpf/devmap.c            | 10 ++++++++--
 kernel/bpf/hashtab.c           | 19 ++++++++++++++-----
 kernel/bpf/lpm_trie.c          | 19 +++++++++++++------
 kernel/bpf/queue_stack_maps.c  |  5 +++--
 kernel/bpf/reuseport_array.c   |  3 ++-
 kernel/bpf/stackmap.c          | 12 ++++++++----
 kernel/bpf/syscall.c           | 12 ++++++++----
 kernel/bpf/xskmap.c            |  9 +++++++--
 net/core/sock_map.c            | 13 +++++++++----
 tools/include/uapi/linux/bpf.h |  4 ++++
 16 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e734f163bd0b..353a3f4304fe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -79,7 +79,8 @@ struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	bool unpriv_array;
-	/* 55 bytes hole */
+	bool account_mem;
+	/* 54 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -506,7 +507,7 @@ void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
-void *bpf_map_area_alloc(size_t size, int numa_node);
+void *bpf_map_area_alloc(size_t size, int numa_node, bool account_mem);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91c43884f295..cff34678caf6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -291,6 +291,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED		(1U << 6)
 
+/* Enable memory accounting for map (only for BPF_MAP_CREATE) */
+#define BPF_F_ACCOUNT_MEM	(1U << 7)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..86417f2e6f1b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -34,14 +34,17 @@ static void bpf_array_free_percpu(struct bpf_array *array)
 	}
 }
 
-static int bpf_array_alloc_percpu(struct bpf_array *array)
+static int bpf_array_alloc_percpu(struct bpf_array *array, bool account_mem)
 {
 	void __percpu *ptr;
+	gfp_t gfp = GFP_USER | __GFP_NOWARN;
 	int i;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	for (i = 0; i < array->map.max_entries; i++) {
-		ptr = __alloc_percpu_gfp(array->elem_size, 8,
-					 GFP_USER | __GFP_NOWARN);
+		ptr = __alloc_percpu_gfp(array->elem_size, 8, gfp);
 		if (!ptr) {
 			bpf_array_free_percpu(array);
 			return -ENOMEM;
@@ -82,6 +85,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	u32 elem_size, index_mask, max_entries;
 	bool unpriv = !capable(CAP_SYS_ADMIN);
 	u64 cost, array_size, mask64;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	struct bpf_array *array;
 
 	elem_size = round_up(attr->value_size, 8);
@@ -129,7 +133,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(ret);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size, numa_node);
+	array = bpf_map_area_alloc(array_size, numa_node, account_mem);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
 	array->index_mask = index_mask;
@@ -140,7 +144,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.pages = cost;
 	array->elem_size = elem_size;
 
-	if (percpu && bpf_array_alloc_percpu(array)) {
+	if (percpu && bpf_array_alloc_percpu(array, account_mem)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..4d58537e0af2 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -7,6 +7,7 @@
 #include <linux/cpumask.h>
 #include <linux/spinlock.h>
 #include <linux/percpu.h>
+#include <linux/gfp.h>
 
 #include "bpf_lru_list.h"
 
@@ -646,12 +647,17 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+		 bool account_mem,
 		 del_from_htab_func del_from_htab, void *del_arg)
 {
+	gfp_t gfp = GFP_KERNEL;
 	int cpu;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	if (percpu) {
-		lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
+		lru->percpu_lru = alloc_percpu_gfp(struct bpf_lru_list, gfp);
 		if (!lru->percpu_lru)
 			return -ENOMEM;
 
@@ -665,7 +671,8 @@ int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
 	} else {
 		struct bpf_common_lru *clru = &lru->common_lru;
 
-		clru->local_list = alloc_percpu(struct bpf_lru_locallist);
+		clru->local_list = alloc_percpu_gfp(struct bpf_lru_locallist,
+						    gfp);
 		if (!clru->local_list)
 			return -ENOMEM;
 
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..89566665592b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -74,6 +74,7 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+		 bool account_mem,
 		 del_from_htab_func del_from_htab, void *delete_arg);
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
 		      u32 elem_size, u32 nr_elems);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..1e84bf78716e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -81,6 +81,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	struct bpf_cpu_map *cmap;
 	int err = -ENOMEM;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp = GFP_KERNEL;
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -117,16 +119,20 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 		goto free_cmap;
 	}
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
 	/* A per cpu bitfield with a bit per possible CPU in map  */
-	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	cmap->flush_needed = __alloc_percpu_gfp(cpu_map_bitmap_size(attr),
+					    __alignof__(unsigned long),
+					    gfp);
 	if (!cmap->flush_needed)
 		goto free_cmap;
 
 	/* Alloc array for possible remote "destination" CPUs */
 	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
 					   sizeof(struct bpf_cpu_map_entry *),
-					   cmap->map.numa_node);
+					   cmap->map.numa_node,
+					   account_mem);
 	if (!cmap->cpu_map)
 		goto free_percpu;
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..acfc1b35aa51 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -90,6 +90,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	struct bpf_dtab *dtab;
 	int err = -EINVAL;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -120,16 +122,20 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 
+	gfp = GFP_KERNEL | __GFP_NOWARN;
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
 						__alignof__(unsigned long),
-						GFP_KERNEL | __GFP_NOWARN);
+						gfp);
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
-					      dtab->map.numa_node);
+					      dtab->map.numa_node,
+					      account_mem);
 	if (!dtab->netdev_map)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..fc2f44451256 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,6 +23,7 @@
 
 #define HTAB_CREATE_FLAG_MASK						\
 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
+	 BPF_F_ACCOUNT_MEM |						\
 	 BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
 
 struct bucket {
@@ -139,27 +140,32 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 	return NULL;
 }
 
-static int prealloc_init(struct bpf_htab *htab)
+static int prealloc_init(struct bpf_htab *htab, bool account_mem)
 {
 	u32 num_entries = htab->map.max_entries;
+	gfp_t gfp = GFP_USER | __GFP_NOWARN;
 	int err = -ENOMEM, i;
 
 	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
 		num_entries += num_possible_cpus();
 
 	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
-					 htab->map.numa_node);
+					 htab->map.numa_node,
+					 account_mem);
 	if (!htab->elems)
 		return -ENOMEM;
 
 	if (!htab_is_percpu(htab))
 		goto skip_percpu_elems;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	for (i = 0; i < num_entries; i++) {
 		u32 size = round_up(htab->map.value_size, 8);
 		void __percpu *pptr;
 
-		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+		pptr = __alloc_percpu_gfp(size, 8, gfp);
 		if (!pptr)
 			goto free_elems;
 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
@@ -173,6 +179,7 @@ static int prealloc_init(struct bpf_htab *htab)
 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
 				   offsetof(struct htab_elem, hash) -
 				   offsetof(struct htab_elem, lru_node),
+				   account_mem,
 				   htab_lru_map_delete_node,
 				   htab);
 	else
@@ -313,6 +320,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	 */
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	struct bpf_htab *htab;
 	int err, i;
 	u64 cost;
@@ -374,7 +382,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	err = -ENOMEM;
 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
 					   sizeof(struct bucket),
-					   htab->map.numa_node);
+					   htab->map.numa_node,
+					   account_mem);
 	if (!htab->buckets)
 		goto free_htab;
 
@@ -389,7 +398,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	}
 
 	if (prealloc) {
-		err = prealloc_init(htab);
+		err = prealloc_init(htab, account_mem);
 		if (err)
 			goto free_buckets;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index abf1002080df..8421fdb816f3 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -277,16 +277,19 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 }
 
 static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
-						 const void *value)
+						 const void *value,
+						 bool account_mem)
 {
 	struct lpm_trie_node *node;
 	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 
 	if (value)
 		size += trie->map.value_size;
 
-	node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
-			    trie->map.numa_node);
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+	node = kmalloc_node(size, gfp, trie->map.numa_node);
 	if (!node)
 		return NULL;
 
@@ -327,7 +330,7 @@ static int trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	new_node = lpm_trie_node_alloc(trie, value);
+	new_node = lpm_trie_node_alloc(trie, value, map->account_mem);
 	if (!new_node) {
 		ret = -ENOMEM;
 		goto out;
@@ -394,7 +397,7 @@ static int trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	im_node = lpm_trie_node_alloc(trie, NULL);
+	im_node = lpm_trie_node_alloc(trie, NULL, map->account_mem);
 	if (!im_node) {
 		ret = -ENOMEM;
 		goto out;
@@ -542,6 +545,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp = GFP_USER | __GFP_NOWARN;
 	u64 cost = sizeof(*trie), cost_per_node;
 	int ret;
 
@@ -558,7 +563,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	    attr->value_size > LPM_VAL_SIZE_MAX)
 		return ERR_PTR(-EINVAL);
 
-	trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+	trie = kzalloc(sizeof(*trie), gfp);
 	if (!trie)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index b384ea9f3254..040ec350af3d 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -11,7 +11,7 @@
 #include "percpu_freelist.h"
 
 #define QUEUE_STACK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ACCOUNT_MEM)
 
 
 struct bpf_queue_stack {
@@ -69,6 +69,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_queue_stack *qs;
 	u64 size, queue_size, cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
 	size = (u64) attr->max_entries + 1;
 	cost = queue_size = sizeof(*qs) + size * attr->value_size;
@@ -81,7 +82,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 	if (ret < 0)
 		return ERR_PTR(ret);
 
-	qs = bpf_map_area_alloc(queue_size, numa_node);
+	qs = bpf_map_area_alloc(queue_size, numa_node, account_mem);
 	if (!qs)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..a9a2709c7507 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -152,6 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
 	u64 cost, array_size;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -170,7 +171,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 		return ERR_PTR(err);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size, numa_node);
+	array = bpf_map_area_alloc(array_size, numa_node, account_mem);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d43b14535827..46d37c7e09a2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -61,13 +61,15 @@ static inline int stack_map_data_size(struct bpf_map *map)
 		sizeof(struct bpf_stack_build_id) : sizeof(u64);
 }
 
-static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap,
+				       bool account_mem)
 {
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
 	int err;
 
 	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
-					 smap->map.numa_node);
+					 smap->map.numa_node,
+					 account_mem);
 	if (!smap->elems)
 		return -ENOMEM;
 
@@ -90,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	u32 value_size = attr->value_size;
 	struct bpf_stack_map *smap;
 	u64 cost, n_buckets;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -119,7 +122,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-E2BIG);
 
-	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
+	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr),
+				  account_mem);
 	if (!smap)
 		return ERR_PTR(-ENOMEM);
 
@@ -141,7 +145,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_smap;
 
-	err = prealloc_elems_and_freelist(smap);
+	err = prealloc_elems_and_freelist(smap, account_mem);
 	if (err)
 		goto put_buffers;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..13f2e1731a47 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -131,25 +131,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 		return map;
 	map->ops = ops;
 	map->map_type = type;
+	map->account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	return map;
 }
 
-void *bpf_map_area_alloc(size_t size, int numa_node)
+void *bpf_map_area_alloc(size_t size, int numa_node, bool account_mem)
 {
 	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
 	 * trigger under memory pressure as we really just want to
 	 * fail instead.
 	 */
-	const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+	gfp_t gfp = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
 	void *area;
 
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		area = kmalloc_node(size, GFP_USER | flags, numa_node);
+		area = kmalloc_node(size, GFP_USER | gfp, numa_node);
 		if (area != NULL)
 			return area;
 	}
 
-	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | gfp,
 					   __builtin_return_address(0));
 }
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..bbc1f142326f 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -20,6 +20,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	int cpu, err = -EINVAL;
 	struct xsk_map *m;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+	gfp_t gfp = GFP_KERNEL;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -49,7 +51,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 
-	m->flush_list = alloc_percpu(struct list_head);
+	if (account_mem)
+		gfp |= __GFP_ACCOUNT;
+	m->flush_list = alloc_percpu_gfp(struct list_head, gfp);
 	if (!m->flush_list)
 		goto free_m;
 
@@ -58,7 +62,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
 					sizeof(struct xdp_sock *),
-					m->map.numa_node);
+					m->map.numa_node,
+					account_mem);
 	if (!m->xsk_map)
 		goto free_percpu;
 	return &m->map;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..eefcfd1294c0 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -18,13 +18,15 @@ struct bpf_stab {
 	raw_spinlock_t lock;
 };
 
-#define SOCK_CREATE_FLAG_MASK				\
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define SOCK_CREATE_FLAG_MASK					\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
+	 BPF_F_ACCOUNT_MEM)
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_stab *stab;
 	u64 cost;
+	bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 	int err;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -56,7 +58,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	stab->sks = bpf_map_area_alloc(stab->map.max_entries *
 				       sizeof(struct sock *),
-				       stab->map.numa_node);
+				       stab->map.numa_node,
+				       account_mem);
 	if (stab->sks)
 		return &stab->map;
 	err = -ENOMEM;
@@ -788,6 +791,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 	struct bpf_htab *htab;
 	int i, err;
 	u64 cost;
+	bool account = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -823,7 +827,8 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 
 	htab->buckets = bpf_map_area_alloc(htab->buckets_num *
 					   sizeof(struct bpf_htab_bucket),
-					   htab->map.numa_node);
+					   htab->map.numa_node,
+					   account);
 	if (!htab->buckets) {
 		err = -ENOMEM;
 		goto free_htab;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 91c43884f295..a2b6cb200c9b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -279,6 +279,7 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+
 #define BPF_OBJ_NAME_LEN 16U
 
 /* Flags for accessing BPF object */
@@ -291,6 +292,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED		(1U << 6)
 
+/* Enable memory accounting for map (only for BPF_MAP_CREATE) */
+#define BPF_F_ACCOUNT_MEM	(1U << 7)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
-- 
2.20.1



* Re: [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
  2019-01-31  9:38 [PATCH v2 bpf-next] bpf: add optional memory accounting for maps Martynas Pumputis
@ 2019-01-31 18:36 ` Alexei Starovoitov
  2019-01-31 20:04 ` Jakub Kicinski
  1 sibling, 0 replies; 4+ messages in thread
From: Alexei Starovoitov @ 2019-01-31 18:36 UTC (permalink / raw)
  To: Martynas Pumputis; +Cc: netdev, ys114321, ast, daniel, Yonghong Song

On Thu, Jan 31, 2019 at 10:38:01AM +0100, Martynas Pumputis wrote:
> Previously, memory allocated for a map was not accounted, and therefore
> it could not be taken into consideration by the cgroup memory
> controller.
> 
> This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
> memory accounting for a map. The flag can be set in "map_flags" when
> the map is created ("BPF_MAP_CREATE").
> 
> When enabled, we account only the amount of memory which is charged
> against the "RLIMIT_MEMLOCK" limit.
> 
> To validate the change, first we create the memory cgroup-v1 "test-map":
> 
>     # mkdir /sys/fs/cgroup/memory/test-map
> 
> And then we run the following program against the cgroup:
> 
>     $ cat test_map.c
>     <..>
>     int main() {
>         usleep(3 * 1000000);
>         assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, 0) > 0);
>         usleep(3 * 1000000);
>     }
>     # cgexec -g memory:test-map ./test_map &
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     397312
>     258048
> 
>     <after 3 sec the map has been created>
> 
>     # bpftool map list
>     19: hash  flags 0x0
>         key 8B  value 16B  max_entries 65536  memlock 5771264B
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     401408
>     262144
> 
> As we can see, the memory allocated for the map is not accounted, as
> 397312B + 5771264B > 401408B.
> 
> Next, we enable the accounting and re-run the test:
> 
>     $ cat test_map.c
>     <..>
>     int main() {
>         usleep(3 * 1000000);
>         assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, BPF_F_ACCOUNT_MEM) > 0);
>         usleep(3 * 1000000);
>     }
>     # cgexec -g memory:test-map ./test_map &
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     450560
>     307200
> 
>     <after 3 sec the map has been created>
> 
>     # bpftool map list
>     20: hash  flags 0x80
>         key 8B  value 16B  max_entries 65536  memlock 5771264B
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     6221824
>     6078464
> 
> This time, the memory (including kmem) is accounted, as
> 450560B + 5771264B <= 6221824B.
> 
> Acked-by: Yonghong Song <yhs@fb.com>
> Signed-off-by: Martynas Pumputis <m@lambda.lt>

see my reply in the other thread.



* Re: [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
  2019-01-31  9:38 [PATCH v2 bpf-next] bpf: add optional memory accounting for maps Martynas Pumputis
  2019-01-31 18:36 ` Alexei Starovoitov
@ 2019-01-31 20:04 ` Jakub Kicinski
  2019-02-01 13:40   ` Martynas
  1 sibling, 1 reply; 4+ messages in thread
From: Jakub Kicinski @ 2019-01-31 20:04 UTC (permalink / raw)
  To: Martynas Pumputis; +Cc: netdev, ys114321, ast, daniel, Yonghong Song

On Thu, 31 Jan 2019 10:38:01 +0100, Martynas Pumputis wrote:
> Previously, memory allocated for a map was not accounted, and therefore
> it could not be taken into consideration by the cgroup memory
> controller.
> 
> This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
> memory accounting for a map. The flag can be set in "map_flags" when
> the map is created ("BPF_MAP_CREATE").

What should happen for no-prealloc maps?  Would it make some sense to
charge the max map size to the user and not each allocation?  Or
perhaps remember the owner to be able to charge the data path
allocations which don't happen in process context as well?


* Re: [PATCH v2 bpf-next] bpf: add optional memory accounting for maps
  2019-01-31 20:04 ` Jakub Kicinski
@ 2019-02-01 13:40   ` Martynas
  0 siblings, 0 replies; 4+ messages in thread
From: Martynas @ 2019-02-01 13:40 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, ys114321, ast, daniel, Yonghong Song


On Thu, Jan 31, 2019, at 10:04 PM, Jakub Kicinski wrote:
> On Thu, 31 Jan 2019 10:38:01 +0100, Martynas Pumputis wrote:
> > Previously, memory allocated for a map was not accounted, and therefore
> > it could not be taken into consideration by the cgroup memory
> > controller.
> > 
> > This patch introduces the "BPF_F_ACCOUNT_MEM" flag, which enables
> > memory accounting for a map. The flag can be set in "map_flags" when
> > the map is created ("BPF_MAP_CREATE").
> 
> What should happen for no-prealloc maps? 

I wanted to be consistent with "bpf_map_precharge_memlock": since such map elements are not charged against RLIMIT_MEMLOCK, I haven't enabled accounting for them (yet).
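
For context, a simplified sketch (not the verbatim kernel code; the
function name is condensed for illustration) of where such elements come
from: with BPF_F_NO_PREALLOC they are allocated lazily at
bpf_map_update_elem() time, in whatever task context performs the
update, and this patch leaves those allocations without __GFP_ACCOUNT:

    /* Condensed from the BPF_F_NO_PREALLOC update path in
     * kernel/bpf/hashtab.c; names simplified for illustration.
     */
    static struct htab_elem *alloc_elem_sketch(struct bpf_htab *htab)
    {
        /* Runs when an element is updated, possibly from a different
         * task/cgroup than the one that created the map; without
         * __GFP_ACCOUNT it is not charged to any memcg.
         */
        return kmalloc_node(htab->elem_size,
                            GFP_ATOMIC | __GFP_NOWARN,
                            htab->map.numa_node);
    }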

> Would it make some sense to
> charge the max map size to the user and not each allocation? 

Hmm, but that would not reflect the real memory usage, and it could potentially lead to trouble, e.g. a process with tight cgroup memory limits getting OOM'd because we pre-charged for memory it doesn't actually use.

> Or
> perhaps remember the owner to be able to charge the data path
> allocations which don't happen in process context as well?

In the use cases I am aware of, all map updates happen within the same context, i.e. in the same cgroup. So, that cgroup becomes the "owner" of the allocations.

Another issue is when we pin a map (regardless of whether it was prealloc'd or not). In this case, if the process which created the map
gets restarted and continues to use the map, the map's memory is not charged to it. But as long as the process runs in the same cgroup, it should be fine.