From: Björn Töpel
Subject: [RFC PATCH v2 07/14] bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP
Date: Tue, 27 Mar 2018 18:59:12 +0200
Message-ID: <20180327165919.17933-8-bjorn.topel@gmail.com>
References: <20180327165919.17933-1-bjorn.topel@gmail.com>
In-Reply-To: <20180327165919.17933-1-bjorn.topel@gmail.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
To: bjorn.topel@gmail.com, magnus.karlsson@intel.com,
 alexander.h.duyck@intel.com, alexander.duyck@gmail.com,
 john.fastabend@gmail.com, ast@fb.com, brouer@redhat.com,
 willemdebruijn.kernel@gmail.com, daniel@iogearbox.net,
 netdev@vger.kernel.org
Cc: Björn Töpel, michael.lundkvist@ericsson.com,
 jesse.brandeburg@intel.com, anjali.singhai@intel.com,
 qi.z.zhang@intel.com, ravineet.singh@ericsson.com

From: Björn Töpel

The xskmap is yet another BPF map, very much inspired by
dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application
adds AF_XDP sockets into the map, and by using the bpf_redirect_map
helper, an XDP program can redirect XDP frames to an AF_XDP socket.

Note that a socket bound to a certain ifindex/queue index will *only*
accept XDP frames from that netdev/queue index. If an XDP program
tries to redirect from a netdev/queue index other than the one the
socket is bound to, the frame will not be received on the socket.

A socket can reside in multiple maps.
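
To illustrate the intended usage (the snippets below are sketches and
not part of this patch: the map and section names follow the
samples/bpf conventions, bpf_helpers.h is assumed, and error handling
is omitted), an XDP program that steers each frame to the socket
serving its receive queue could look like:

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  /* One slot per receive queue; the size is arbitrary. */
  struct bpf_map_def SEC("maps") xsks_map = {
  	.type		= BPF_MAP_TYPE_XSKMAP,
  	.key_size	= sizeof(int),
  	.value_size	= sizeof(int),
  	.max_entries	= 4,
  };

  SEC("xdp")
  int xdp_sock_prog(struct xdp_md *ctx)
  {
  	/* A socket bound to a different netdev/queue index than
  	 * the one the frame arrived on will not receive it.
  	 */
  	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
  }

and on the user-space side the AF_XDP socket file descriptor is
installed with an ordinary map update, e.g. via the tools/lib/bpf
wrapper (map_fd being the xskmap's fd, and the socket already bound
to an ifindex/queue index):

  int sfd = socket(AF_XDP, SOCK_RAW, 0);
  int key = 0;	/* slot, here matching the bound queue index */

  /* ... bind sfd to a netdev/queue index ... */
  bpf_map_update_elem(map_fd, &key, &sfd, 0);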

Signed-off-by: Björn Töpel
---
 include/linux/bpf.h       |  26 +++++
 include/linux/bpf_types.h |   3 +
 include/net/xdp_sock.h    |  34 ++++++
 include/uapi/linux/bpf.h  |   1 +
 kernel/bpf/Makefile       |   3 +
 kernel/bpf/verifier.c     |   8 +-
 kernel/bpf/xskmap.c       | 281 ++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 354 insertions(+), 2 deletions(-)
 create mode 100644 include/net/xdp_sock.h
 create mode 100644 kernel/bpf/xskmap.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 819229c80eca..0fe9d080adc3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -657,6 +657,32 @@ static inline int sock_map_prog(struct bpf_map *map,
 }
 #endif
 
+#if defined(CONFIG_XDP_SOCKETS)
+struct xdp_sock;
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
+int __xsk_map_redirect(struct bpf_map *map, u32 index,
+		       struct xdp_buff *xdp, struct xdp_sock *xs);
+void __xsk_map_flush(struct bpf_map *map);
+#else
+struct xdp_sock;
+static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
+						     u32 key)
+{
+	return NULL;
+}
+
+static inline int __xsk_map_redirect(struct bpf_map *map, u32 index,
+				     struct xdp_buff *xdp,
+				     struct xdp_sock *xs)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void __xsk_map_flush(struct bpf_map *map)
+{
+}
+#endif
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5e2e8a49fb21..b525862c98ab 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,4 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
+#if defined(CONFIG_XDP_SOCKETS)
+BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
+#endif
 #endif
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
new file mode 100644
index 000000000000..80d119a685b2
--- /dev/null
+++ b/include/net/xdp_sock.h
@@ -0,0 +1,34 @@
+/*
+ * AF_XDP internal functions
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XDP_SOCK_H
+#define _LINUX_XDP_SOCK_H
+
+struct xdp_sock;
+struct xdp_buff;
+#ifdef CONFIG_XDP_SOCKETS
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
+void xsk_flush(struct xdp_sock *xs);
+#else
+static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	return -ENOTSUPP;
+}
+
+static inline void xsk_flush(struct xdp_sock *xs)
+{
+}
+#endif /* CONFIG_XDP_SOCKETS */
+
+#endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 18b7c510c511..c8e1d2977712 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..3c59d9bcae14 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,6 +7,9 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9f7c20691c1..46d525539a3b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2059,8 +2059,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
-	/* Restrict bpf side of cpumap, open when use-cases appear */
+	/* Restrict bpf side of cpumap and xskmap, open when use-cases
+	 * appear.
+	 */
 	case BPF_MAP_TYPE_CPUMAP:
+	case BPF_MAP_TYPE_XSKMAP:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -2107,7 +2110,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
-		    map->map_type != BPF_MAP_TYPE_CPUMAP)
+		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
+		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..77553f24daa2
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,281 @@
+/*
+ * XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/xdp_sock.h>
+
+struct xsk_map_entry {
+	struct xdp_sock *xs;
+	struct rcu_head rcu;
+};
+
+struct xsk_map {
+	struct bpf_map map;
+	struct xsk_map_entry **xsk_map;
+	unsigned long __percpu *flush_needed;
+};
+
+static u64 xsk_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+	struct xsk_map *m;
+	int err = -EINVAL;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 ||
+	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	m = kzalloc(sizeof(*m), GFP_USER);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&m->map, attr);
+
+	cost = (u64)m->map.max_entries * sizeof(struct xsk_map_entry *);
+	cost += xsk_map_bitmap_size(attr) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_m;
+
+	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice: returns -EPERM if map size is larger than memlock limit */
+	err = bpf_map_precharge_memlock(m->map.pages);
+	if (err)
+		goto free_m;
+
+	err = -ENOMEM;
+	m->flush_needed = __alloc_percpu(xsk_map_bitmap_size(attr),
+					 __alignof__(unsigned long));
+	if (!m->flush_needed)
+		goto free_m;
+
+	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+					sizeof(struct xsk_map_entry *),
+					m->map.numa_node);
+	if (!m->xsk_map)
+		goto free_percpu;
+	return &m->map;
+
+free_percpu:
+	free_percpu(m->flush_needed);
+free_m:
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	int i, cpu;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this
+	 * map->refcnt == 0, so the programs (there can be more than
+	 * one using this map) have been disconnected from events. Wait
+	 * for outstanding critical sections in these programs to
+	 * complete. The rcu critical section only guarantees no
+	 * further reads against xsk_map. It does __not__ ensure
+	 * pending flush operations (if any) are complete.
+	 */
+
+	synchronize_rcu();
+
+	/* To ensure all pending flush operations have completed, wait
+	 * for the flush bitmap to indicate all flush_needed bits are
+	 * zero on _all_ cpus. Because the above synchronize_rcu()
+	 * ensures the map is disconnected from the program, we can
+	 * assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(m->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, map->max_entries))
+			cond_resched();
+	}
+
+	for (i = 0; i < map->max_entries; i++) {
+		struct xsk_map_entry *entry;
+
+		entry = m->xsk_map[i];
+		if (!entry)
+			continue;
+
+		sock_put((struct sock *)entry->xs);
+		kfree(entry);
+	}
+
+	free_percpu(m->flush_needed);
+	bpf_map_area_free(m->xsk_map);
+	kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= m->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == m->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xsk_map_entry *entry;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	entry = READ_ONCE(m->xsk_map[key]);
+	return entry ? entry->xs : NULL;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, u32 index,
+		       struct xdp_buff *xdp, struct xdp_sock *xs)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	unsigned long *bitmap = this_cpu_ptr(m->flush_needed);
+	int err;
+
+	err = xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	__set_bit(index, bitmap);
+	return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	unsigned long *bitmap = this_cpu_ptr(m->flush_needed);
+	u32 bit;
+
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct xsk_map_entry *entry = READ_ONCE(m->xsk_map[bit]);
+
+		/* This is possible if the entry is removed by user
+		 * space between xdp redirect and flush op.
+		 */
+		if (unlikely(!entry))
+			continue;
+
+		__clear_bit(bit, bitmap);
+		xsk_flush(entry->xs);
+	}
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+static void __xsk_map_entry_free(struct rcu_head *rcu)
+{
+	struct xsk_map_entry *entry;
+
+	entry = container_of(rcu, struct xsk_map_entry, rcu);
+	xsk_flush(entry->xs);
+	sock_put((struct sock *)entry->xs);
+	kfree(entry);
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xsk_map_entry *entry, *old_entry;
+	u32 i = *(u32 *)key, fd = *(u32 *)value;
+	struct socket *sock;
+	int err;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(i >= m->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return err;
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	entry = kmalloc_node(sizeof(*entry), GFP_ATOMIC | __GFP_NOWARN,
+			     map->numa_node);
+	if (!entry) {
+		sockfd_put(sock);
+		return -ENOMEM;
+	}
+
+	sock_hold(sock->sk);
+	entry->xs = (struct xdp_sock *)sock->sk;
+
+	old_entry = xchg(&m->xsk_map[i], entry);
+	if (old_entry)
+		call_rcu(&old_entry->rcu, __xsk_map_entry_free);
+
+	sockfd_put(sock);
+	return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xsk_map_entry *old_entry;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	old_entry = xchg(&m->xsk_map[k], NULL);
+	if (old_entry)
+		call_rcu(&old_entry->rcu, __xsk_map_entry_free);
+
+	return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+	.map_alloc = xsk_map_alloc,
+	.map_free = xsk_map_free,
+	.map_get_next_key = xsk_map_get_next_key,
+	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_update_elem = xsk_map_update_elem,
+	.map_delete_elem = xsk_map_delete_elem,
+};

-- 
2.14.1