bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH bpf-next 0/3] bpf: support cloning sk storage on accept()
@ 2019-08-07 15:47 Stanislav Fomichev
  2019-08-07 15:47 ` [PATCH bpf-next 1/3] " Stanislav Fomichev
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Stanislav Fomichev @ 2019-08-07 15:47 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau

Currently there is no way to propagate sk storage from the listener
socket to a newly accepted one. Consider the following use case:

        fd = socket();
        setsockopt(fd, SOL_IP, IP_TOS,...);
        /* ^^^ setsockopt BPF program triggers here and saves something
         * into sk storage of the listener.
         */
        listen(fd, ...);
        while (client = accept(fd)) {
                /* At this point all association between listener
                 * socket and newly accepted one is gone. New
                 * socket will not have any sk storage attached.
                 */
        }

Let's add new BPF_SK_STORAGE_GET_F_CLONE flag that can be passed to
bpf_sk_storage_get. This new flag indicates that the particular
bpf_sk_storage_elem should be cloned when the socket is cloned.

Cc: Martin KaFai Lau <kafai@fb.com>

Stanislav Fomichev (3):
  bpf: support cloning sk storage on accept()
  bpf: sync bpf.h to tools/
  selftests/bpf: add sockopt clone/inheritance test

 include/net/bpf_sk_storage.h                  |  10 +
 include/uapi/linux/bpf.h                      |   1 +
 net/core/bpf_sk_storage.c                     | 102 ++++++-
 net/core/sock.c                               |   9 +-
 tools/include/uapi/linux/bpf.h                |   1 +
 tools/testing/selftests/bpf/.gitignore        |   1 +
 tools/testing/selftests/bpf/Makefile          |   3 +-
 .../selftests/bpf/progs/sockopt_inherit.c     | 102 +++++++
 .../selftests/bpf/test_sockopt_inherit.c      | 252 ++++++++++++++++++
 9 files changed, 473 insertions(+), 8 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/sockopt_inherit.c
 create mode 100644 tools/testing/selftests/bpf/test_sockopt_inherit.c

-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-07 15:47 [PATCH bpf-next 0/3] bpf: support cloning sk storage on accept() Stanislav Fomichev
@ 2019-08-07 15:47 ` Stanislav Fomichev
  2019-08-07 23:03   ` Yonghong Song
  2019-08-08  6:39   ` Martin Lau
  2019-08-07 15:47 ` [PATCH bpf-next 2/3] bpf: sync bpf.h to tools/ Stanislav Fomichev
  2019-08-07 15:47 ` [PATCH bpf-next 3/3] selftests/bpf: add sockopt clone/inheritance test Stanislav Fomichev
  2 siblings, 2 replies; 14+ messages in thread
From: Stanislav Fomichev @ 2019-08-07 15:47 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau

Add a new helper bpf_sk_storage_clone which optionally clones sk storage
and call it from sk_clone_lock(). Reuse the gap in
bpf_sk_storage_elem to store clone/non-clone flag.

Cc: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/net/bpf_sk_storage.h |  10 ++++
 include/uapi/linux/bpf.h     |   1 +
 net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
 net/core/sock.c              |   9 ++--
 4 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index b9dcb02e756b..8e4f831d2e52 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
 extern const struct bpf_func_proto bpf_sk_storage_get_proto;
 extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
 
+#ifdef CONFIG_BPF_SYSCALL
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
+#else
+static inline int bpf_sk_storage_clone(const struct sock *sk,
+				       struct sock *newsk)
+{
+	return 0;
+}
+#endif
+
 #endif /* _BPF_SK_STORAGE_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4393bd4b2419..00459ca4c8cf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2931,6 +2931,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_sk_storage_get flags */
 #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
+#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 94c7f77ecb6b..b6dea67965bc 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -12,6 +12,9 @@
 
 static atomic_t cache_idx;
 
+#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
+					 BPF_SK_STORAGE_GET_F_CLONE)
+
 struct bucket {
 	struct hlist_head list;
 	raw_spinlock_t lock;
@@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
 	struct hlist_node snode;	/* Linked to bpf_sk_storage */
 	struct bpf_sk_storage __rcu *sk_storage;
 	struct rcu_head rcu;
-	/* 8 bytes hole */
+	u8 clone:1;
+	/* 7 bytes hole */
 	/* The data is stored in aother cacheline to minimize
 	 * the number of cachelines access during a cache hit.
 	 */
@@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
 	return 0;
 }
 
-/* Called by __sk_destruct() */
+/* Called by __sk_destruct() & bpf_sk_storage_clone() */
 void bpf_sk_storage_free(struct sock *sk)
 {
 	struct bpf_sk_storage_elem *selem;
@@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
 	return err;
 }
 
+static struct bpf_sk_storage_elem *
+bpf_sk_storage_clone_elem(struct sock *newsk,
+			  struct bpf_sk_storage_map *smap,
+			  struct bpf_sk_storage_elem *selem)
+{
+	struct bpf_sk_storage_elem *copy_selem;
+
+	copy_selem = selem_alloc(smap, newsk, NULL, true);
+	if (!copy_selem)
+		return ERR_PTR(-ENOMEM);
+
+	if (map_value_has_spin_lock(&smap->map))
+		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
+				      SDATA(selem)->data, true);
+	else
+		copy_map_value(&smap->map, SDATA(copy_selem)->data,
+			       SDATA(selem)->data);
+
+	return copy_selem;
+}
+
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
+{
+	struct bpf_sk_storage *new_sk_storage = NULL;
+	struct bpf_sk_storage *sk_storage;
+	struct bpf_sk_storage_elem *selem;
+	int ret;
+
+	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+
+	rcu_read_lock();
+	sk_storage = rcu_dereference(sk->sk_bpf_storage);
+
+	if (!sk_storage || hlist_empty(&sk_storage->list))
+		goto out;
+
+	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+		struct bpf_sk_storage_map *smap;
+		struct bpf_sk_storage_elem *copy_selem;
+
+		if (!selem->clone)
+			continue;
+
+		smap = rcu_dereference(SDATA(selem)->smap);
+		if (!smap)
+			continue;
+
+		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
+		if (IS_ERR(copy_selem)) {
+			ret = PTR_ERR(copy_selem);
+			goto err;
+		}
+
+		if (!new_sk_storage) {
+			ret = sk_storage_alloc(newsk, smap, copy_selem);
+			if (ret) {
+				kfree(copy_selem);
+				atomic_sub(smap->elem_size,
+					   &newsk->sk_omem_alloc);
+				goto err;
+			}
+
+			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+			continue;
+		}
+
+		raw_spin_lock_bh(&new_sk_storage->lock);
+		selem_link_map(smap, copy_selem);
+		__selem_link_sk(new_sk_storage, copy_selem);
+		raw_spin_unlock_bh(&new_sk_storage->lock);
+	}
+
+out:
+	rcu_read_unlock();
+	return 0;
+
+err:
+	rcu_read_unlock();
+
+	bpf_sk_storage_free(newsk);
+	return ret;
+}
+
 BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 	   void *, value, u64, flags)
 {
 	struct bpf_sk_storage_data *sdata;
 
-	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
+	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
+		return (unsigned long)NULL;
+
+	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
+	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
 		return (unsigned long)NULL;
 
 	sdata = sk_storage_lookup(sk, map, true);
 	if (sdata)
 		return (unsigned long)sdata->data;
 
-	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
+	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
 	    /* Cannot add new elem to a going away sk.
 	     * Otherwise, the new elem may become a leak
 	     * (and also other memory issues during map
@@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 		/* sk must be a fullsock (guaranteed by verifier),
 		 * so sock_gen_put() is unnecessary.
 		 */
+		if (!IS_ERR(sdata))
+			SELEM(sdata)->clone =
+				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
 		sock_put(sk);
 		return IS_ERR(sdata) ?
 			(unsigned long)NULL : (unsigned long)sdata->data;
diff --git a/net/core/sock.c b/net/core/sock.c
index d57b0cc995a0..f5e801a9cea4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 			goto out;
 		}
 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
-#ifdef CONFIG_BPF_SYSCALL
-		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
-#endif
+
+		if (bpf_sk_storage_clone(sk, newsk)) {
+			sk_free_unlock_clone(newsk);
+			newsk = NULL;
+			goto out;
+		}
 
 		newsk->sk_err	   = 0;
 		newsk->sk_err_soft = 0;
-- 
2.22.0.770.g0f2c4a37fd-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH bpf-next 2/3] bpf: sync bpf.h to tools/
  2019-08-07 15:47 [PATCH bpf-next 0/3] bpf: support cloning sk storage on accept() Stanislav Fomichev
  2019-08-07 15:47 ` [PATCH bpf-next 1/3] " Stanislav Fomichev
@ 2019-08-07 15:47 ` Stanislav Fomichev
  2019-08-07 23:04   ` Yonghong Song
  2019-08-07 15:47 ` [PATCH bpf-next 3/3] selftests/bpf: add sockopt clone/inheritance test Stanislav Fomichev
  2 siblings, 1 reply; 14+ messages in thread
From: Stanislav Fomichev @ 2019-08-07 15:47 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau

Sync new sk storage clone flag.

Cc: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4393bd4b2419..00459ca4c8cf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2931,6 +2931,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_sk_storage_get flags */
 #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
+#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
-- 
2.22.0.770.g0f2c4a37fd-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH bpf-next 3/3] selftests/bpf: add sockopt clone/inheritance test
  2019-08-07 15:47 [PATCH bpf-next 0/3] bpf: support cloning sk storage on accept() Stanislav Fomichev
  2019-08-07 15:47 ` [PATCH bpf-next 1/3] " Stanislav Fomichev
  2019-08-07 15:47 ` [PATCH bpf-next 2/3] bpf: sync bpf.h to tools/ Stanislav Fomichev
@ 2019-08-07 15:47 ` Stanislav Fomichev
  2019-08-07 23:14   ` Yonghong Song
  2 siblings, 1 reply; 14+ messages in thread
From: Stanislav Fomichev @ 2019-08-07 15:47 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau

Add a test that calls setsockopt on the listener socket which triggers
BPF program. This BPF program writes to the sk storage and sets
clone flag. Make sure that sk storage is cloned for a newly
accepted connection.

We have two cloned maps in the tests to make sure we hit both cases
in bpf_sk_storage_clone: first element (sk_storage_alloc) and
non-first element(s) (selem_link_map).

Cc: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/testing/selftests/bpf/.gitignore        |   1 +
 tools/testing/selftests/bpf/Makefile          |   3 +-
 .../selftests/bpf/progs/sockopt_inherit.c     | 102 +++++++
 .../selftests/bpf/test_sockopt_inherit.c      | 252 ++++++++++++++++++
 4 files changed, 357 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/sockopt_inherit.c
 create mode 100644 tools/testing/selftests/bpf/test_sockopt_inherit.c

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 90f70d2c7c22..60c9338cd9b4 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -42,4 +42,5 @@ xdping
 test_sockopt
 test_sockopt_sk
 test_sockopt_multi
+test_sockopt_inherit
 test_tcp_rtt
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3bd0f4a0336a..c875763a851a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_cgroup_storage test_select_reuseport test_section_names \
 	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
 	test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
-	test_sockopt_multi test_tcp_rtt
+	test_sockopt_multi test_sockopt_inherit test_tcp_rtt
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -110,6 +110,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
 $(OUTPUT)/test_sockopt: cgroup_helpers.c
 $(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
 $(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
+$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
 $(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
 
 .PHONY: force
diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
new file mode 100644
index 000000000000..357fc9db5874
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+#define SOL_CUSTOM			0xdeadbeef
+#define CUSTOM_INHERIT1			0
+#define CUSTOM_INHERIT2			1
+#define CUSTOM_LISTENER			2
+
+struct sockopt_inherit {
+	__u8 val;
+};
+
+struct bpf_map_def SEC("maps") cloned1_map = {
+	.type = BPF_MAP_TYPE_SK_STORAGE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct sockopt_inherit),
+	.map_flags = BPF_F_NO_PREALLOC,
+};
+BPF_ANNOTATE_KV_PAIR(cloned1_map, int, struct sockopt_inherit);
+
+struct bpf_map_def SEC("maps") cloned2_map = {
+	.type = BPF_MAP_TYPE_SK_STORAGE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct sockopt_inherit),
+	.map_flags = BPF_F_NO_PREALLOC,
+};
+BPF_ANNOTATE_KV_PAIR(cloned2_map, int, struct sockopt_inherit);
+
+struct bpf_map_def SEC("maps") listener_map = {
+	.type = BPF_MAP_TYPE_SK_STORAGE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct sockopt_inherit),
+	.map_flags = BPF_F_NO_PREALLOC,
+};
+BPF_ANNOTATE_KV_PAIR(listener_map, int, struct sockopt_inherit);
+
+static __inline struct sockopt_inherit *get_storage(struct bpf_sockopt *ctx)
+{
+	if (ctx->optname == CUSTOM_INHERIT1)
+		return bpf_sk_storage_get(&cloned1_map, ctx->sk, 0,
+					  BPF_SK_STORAGE_GET_F_CREATE |
+					  BPF_SK_STORAGE_GET_F_CLONE);
+	else if (ctx->optname == CUSTOM_INHERIT2)
+		return bpf_sk_storage_get(&cloned2_map, ctx->sk, 0,
+					  BPF_SK_STORAGE_GET_F_CREATE |
+					  BPF_SK_STORAGE_GET_F_CLONE);
+	else
+		return bpf_sk_storage_get(&listener_map, ctx->sk, 0,
+					  BPF_SK_STORAGE_GET_F_CREATE);
+}
+
+SEC("cgroup/getsockopt")
+int _getsockopt(struct bpf_sockopt *ctx)
+{
+	__u8 *optval_end = ctx->optval_end;
+	struct sockopt_inherit *storage;
+	__u8 *optval = ctx->optval;
+
+	if (ctx->level != SOL_CUSTOM)
+		return 1; /* only interested in SOL_CUSTOM */
+
+	if (optval + 1 > optval_end)
+		return 0; /* EPERM, bounds check */
+
+	storage = get_storage(ctx);
+	if (!storage)
+		return 0; /* EPERM, couldn't get sk storage */
+
+	ctx->retval = 0; /* Reset system call return value to zero */
+
+	optval[0] = storage->val;
+	ctx->optlen = 1;
+
+	return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx)
+{
+	__u8 *optval_end = ctx->optval_end;
+	struct sockopt_inherit *storage;
+	__u8 *optval = ctx->optval;
+
+	if (ctx->level != SOL_CUSTOM)
+		return 1; /* only interested in SOL_CUSTOM */
+
+	if (optval + 1 > optval_end)
+		return 0; /* EPERM, bounds check */
+
+	storage = get_storage(ctx);
+	if (!storage)
+		return 0; /* EPERM, couldn't get sk storage */
+
+	storage->val = optval[0];
+	ctx->optlen = -1;
+
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/test_sockopt_inherit.c b/tools/testing/selftests/bpf/test_sockopt_inherit.c
new file mode 100644
index 000000000000..e47b9c28d743
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockopt_inherit.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pthread.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH				"/sockopt_inherit"
+#define SOL_CUSTOM			0xdeadbeef
+#define CUSTOM_INHERIT1			0
+#define CUSTOM_INHERIT2			1
+#define CUSTOM_LISTENER			2
+
+static int connect_to_server(int server_fd)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int fd;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd < 0) {
+		log_err("Failed to create client socket");
+		return -1;
+	}
+
+	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+		log_err("Failed to get server addr");
+		goto out;
+	}
+
+	if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
+		log_err("Fail to connect to server");
+		goto out;
+	}
+
+	return fd;
+
+out:
+	close(fd);
+	return -1;
+}
+
+static int verify_sockopt(int fd, int optname, const char *msg, char expected)
+{
+	socklen_t optlen = 1;
+	char buf = 0;
+	int err;
+
+	err = getsockopt(fd, SOL_CUSTOM, optname, &buf, &optlen);
+	if (err) {
+		log_err("%s: failed to call getsockopt", msg);
+		return 1;
+	}
+
+	log_err("%s %d: got=0x%x ? expected=0x%x", msg, optname, buf, expected);
+
+	if (buf != expected) {
+		log_err("%s: unexpected getsockopt value %d != %d", msg,
+			buf, expected);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void *server_thread(void *arg)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int fd = *(int *)arg;
+	int client_fd;
+	int err = 0;
+
+	if (listen(fd, 1) < 0)
+		error(1, errno, "Failed to listed on socket");
+
+	err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
+	err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
+	err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
+
+	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
+	if (client_fd < 0)
+		error(1, errno, "Failed to accept client");
+
+	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
+	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
+	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "accept", 0);
+
+	close(client_fd);
+
+	return (void *)(long)err;
+}
+
+static int start_server(void)
+{
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+	};
+	char buf;
+	int err;
+	int fd;
+	int i;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd < 0) {
+		log_err("Failed to create server socket");
+		return -1;
+	}
+
+	for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) {
+		buf = 0x01;
+		err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1);
+		if (err) {
+			log_err("Failed to call setsockopt(%d)", i);
+			close(fd);
+			return -1;
+		}
+	}
+
+	if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		log_err("Failed to bind socket");
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+	enum bpf_attach_type attach_type;
+	enum bpf_prog_type prog_type;
+	struct bpf_program *prog;
+	int err;
+
+	err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
+	if (err) {
+		log_err("Failed to deduct types for %s BPF program", title);
+		return -1;
+	}
+
+	prog = bpf_object__find_program_by_title(obj, title);
+	if (!prog) {
+		log_err("Failed to find %s BPF program", title);
+		return -1;
+	}
+
+	err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
+			      attach_type, 0);
+	if (err) {
+		log_err("Failed to attach %s BPF program", title);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int run_test(int cgroup_fd)
+{
+	struct bpf_prog_load_attr attr = {
+		.file = "./sockopt_inherit.o",
+	};
+	int server_fd = -1, client_fd;
+	struct bpf_object *obj;
+	void *server_err;
+	pthread_t tid;
+	int ignored;
+	int err;
+
+	err = bpf_prog_load_xattr(&attr, &obj, &ignored);
+	if (err) {
+		log_err("Failed to load BPF object");
+		return -1;
+	}
+
+	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
+	if (err)
+		goto close_bpf_object;
+
+	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
+	if (err)
+		goto close_bpf_object;
+
+	server_fd = start_server();
+	if (server_fd < 0) {
+		err = -1;
+		goto close_bpf_object;
+	}
+
+	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+
+	client_fd = connect_to_server(server_fd);
+	if (client_fd < 0) {
+		err = -1;
+		goto close_bpf_object;
+	}
+
+	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0);
+	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0);
+	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0);
+
+	pthread_join(tid, &server_err);
+
+	err += (int)(long)server_err;
+
+	close(client_fd);
+
+close_bpf_object:
+	bpf_object__close(obj);
+	close(server_fd);
+	return err;
+}
+
+int main(int args, char **argv)
+{
+	int cgroup_fd;
+	int err = EXIT_SUCCESS;
+
+	if (setup_cgroup_environment())
+		return err;
+
+	cgroup_fd = create_and_get_cgroup(CG_PATH);
+	if (cgroup_fd < 0)
+		goto cleanup_cgroup_env;
+
+	if (join_cgroup(CG_PATH))
+		goto cleanup_cgroup;
+
+	if (run_test(cgroup_fd))
+		err = EXIT_FAILURE;
+
+	printf("test_sockopt_inherit: %s\n",
+	       err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+
+cleanup_cgroup:
+	close(cgroup_fd);
+cleanup_cgroup_env:
+	cleanup_cgroup_environment();
+	return err;
+}
-- 
2.22.0.770.g0f2c4a37fd-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-07 15:47 ` [PATCH bpf-next 1/3] " Stanislav Fomichev
@ 2019-08-07 23:03   ` Yonghong Song
  2019-08-08  0:05     ` Stanislav Fomichev
  2019-08-08  6:39   ` Martin Lau
  1 sibling, 1 reply; 14+ messages in thread
From: Yonghong Song @ 2019-08-07 23:03 UTC (permalink / raw)
  To: Stanislav Fomichev, netdev, bpf; +Cc: davem, ast, daniel, Martin Lau



On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
> Add new helper bpf_sk_storage_clone which optionally clones sk storage
> and call it from bpf_sk_storage_clone. Reuse the gap in
> bpf_sk_storage_elem to store clone/non-clone flag.
> 
> Cc: Martin KaFai Lau <kafai@fb.com>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>

I tried to see whether I can find any missing race conditions in
the code but I failed. So except a minor comments below,
Acked-by: Yonghong Song <yhs@fb.com>

> ---
>   include/net/bpf_sk_storage.h |  10 ++++
>   include/uapi/linux/bpf.h     |   1 +
>   net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
>   net/core/sock.c              |   9 ++--
>   4 files changed, 115 insertions(+), 7 deletions(-)
> 
> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> index b9dcb02e756b..8e4f831d2e52 100644
> --- a/include/net/bpf_sk_storage.h
> +++ b/include/net/bpf_sk_storage.h
> @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
>   extern const struct bpf_func_proto bpf_sk_storage_get_proto;
>   extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
>   
> +#ifdef CONFIG_BPF_SYSCALL
> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> +#else
> +static inline int bpf_sk_storage_clone(const struct sock *sk,
> +				       struct sock *newsk)
> +{
> +	return 0;
> +}
> +#endif
> +
>   #endif /* _BPF_SK_STORAGE_H */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 4393bd4b2419..00459ca4c8cf 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2931,6 +2931,7 @@ enum bpf_func_id {
>   
>   /* BPF_FUNC_sk_storage_get flags */
>   #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
>   
>   /* Mode for BPF_FUNC_skb_adjust_room helper. */
>   enum bpf_adj_room_mode {
> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> index 94c7f77ecb6b..b6dea67965bc 100644
> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -12,6 +12,9 @@
>   
>   static atomic_t cache_idx;
>   
> +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
> +					 BPF_SK_STORAGE_GET_F_CLONE)
> +
>   struct bucket {
>   	struct hlist_head list;
>   	raw_spinlock_t lock;
> @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
>   	struct hlist_node snode;	/* Linked to bpf_sk_storage */
>   	struct bpf_sk_storage __rcu *sk_storage;
>   	struct rcu_head rcu;
> -	/* 8 bytes hole */
> +	u8 clone:1;
> +	/* 7 bytes hole */
>   	/* The data is stored in aother cacheline to minimize
>   	 * the number of cachelines access during a cache hit.
>   	 */
> @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
>   	return 0;
>   }
>   
> -/* Called by __sk_destruct() */
> +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
>   void bpf_sk_storage_free(struct sock *sk)
>   {
>   	struct bpf_sk_storage_elem *selem;
> @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
>   	return err;
>   }
>   
> +static struct bpf_sk_storage_elem *
> +bpf_sk_storage_clone_elem(struct sock *newsk,
> +			  struct bpf_sk_storage_map *smap,
> +			  struct bpf_sk_storage_elem *selem)
> +{
> +	struct bpf_sk_storage_elem *copy_selem;
> +
> +	copy_selem = selem_alloc(smap, newsk, NULL, true);
> +	if (!copy_selem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	if (map_value_has_spin_lock(&smap->map))
> +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> +				      SDATA(selem)->data, true);
> +	else
> +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
> +			       SDATA(selem)->data);
> +
> +	return copy_selem;
> +}
> +
> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> +{
> +	struct bpf_sk_storage *new_sk_storage = NULL;
> +	struct bpf_sk_storage *sk_storage;
> +	struct bpf_sk_storage_elem *selem;
> +	int ret;
> +
> +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> +
> +	rcu_read_lock();
> +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> +
> +	if (!sk_storage || hlist_empty(&sk_storage->list))
> +		goto out;
> +
> +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> +		struct bpf_sk_storage_map *smap;
> +		struct bpf_sk_storage_elem *copy_selem;
> +
> +		if (!selem->clone)
> +			continue;
> +
> +		smap = rcu_dereference(SDATA(selem)->smap);
> +		if (!smap)
> +			continue;
> +
> +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> +		if (IS_ERR(copy_selem)) {
> +			ret = PTR_ERR(copy_selem);
> +			goto err;
> +		}
> +
> +		if (!new_sk_storage) {
> +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> +			if (ret) {
> +				kfree(copy_selem);
> +				atomic_sub(smap->elem_size,
> +					   &newsk->sk_omem_alloc);
> +				goto err;
> +			}
> +
> +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> +			continue;
> +		}
> +
> +		raw_spin_lock_bh(&new_sk_storage->lock);
> +		selem_link_map(smap, copy_selem);
> +		__selem_link_sk(new_sk_storage, copy_selem);
> +		raw_spin_unlock_bh(&new_sk_storage->lock);

Considering in this particular case, new socket is not visible to 
outside world yet (both kernel and user space), map_delete/map_update
operations are not applicable in this situation, so
the above raw_spin_lock_bh() probably not needed.


> +	}
> +
> +out:
> +	rcu_read_unlock();
> +	return 0;
> +
> +err:
> +	rcu_read_unlock();
> +
> +	bpf_sk_storage_free(newsk);
> +	return ret;
> +}
> +
>   BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>   	   void *, value, u64, flags)
>   {
>   	struct bpf_sk_storage_data *sdata;
>   
> -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> +		return (unsigned long)NULL;
> +
> +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
>   		return (unsigned long)NULL;
>   
>   	sdata = sk_storage_lookup(sk, map, true);
>   	if (sdata)
>   		return (unsigned long)sdata->data;
>   
> -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
>   	    /* Cannot add new elem to a going away sk.
>   	     * Otherwise, the new elem may become a leak
>   	     * (and also other memory issues during map
> @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>   		/* sk must be a fullsock (guaranteed by verifier),
>   		 * so sock_gen_put() is unnecessary.
>   		 */
> +		if (!IS_ERR(sdata))
> +			SELEM(sdata)->clone =
> +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
>   		sock_put(sk);
>   		return IS_ERR(sdata) ?
>   			(unsigned long)NULL : (unsigned long)sdata->data;
> diff --git a/net/core/sock.c b/net/core/sock.c
> index d57b0cc995a0..f5e801a9cea4 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>   			goto out;
>   		}
>   		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> -#ifdef CONFIG_BPF_SYSCALL
> -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> -#endif
> +
> +		if (bpf_sk_storage_clone(sk, newsk)) {
> +			sk_free_unlock_clone(newsk);
> +			newsk = NULL;
> +			goto out;
> +		}
>   
>   		newsk->sk_err	   = 0;
>   		newsk->sk_err_soft = 0;
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 2/3] bpf: sync bpf.h to tools/
  2019-08-07 15:47 ` [PATCH bpf-next 2/3] bpf: sync bpf.h to tools/ Stanislav Fomichev
@ 2019-08-07 23:04   ` Yonghong Song
  0 siblings, 0 replies; 14+ messages in thread
From: Yonghong Song @ 2019-08-07 23:04 UTC (permalink / raw)
  To: Stanislav Fomichev, netdev, bpf; +Cc: davem, ast, daniel, Martin Lau



On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
> Sync new sk storage clone flag.
> 
> Cc: Martin KaFai Lau <kafai@fb.com>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>

Acked-by: Yonghong Song <yhs@fb.com>

> ---
>   tools/include/uapi/linux/bpf.h | 1 +
>   1 file changed, 1 insertion(+)
> 
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 4393bd4b2419..00459ca4c8cf 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -2931,6 +2931,7 @@ enum bpf_func_id {
>   
>   /* BPF_FUNC_sk_storage_get flags */
>   #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
>   
>   /* Mode for BPF_FUNC_skb_adjust_room helper. */
>   enum bpf_adj_room_mode {
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 3/3] selftests/bpf: add sockopt clone/inheritance test
  2019-08-07 15:47 ` [PATCH bpf-next 3/3] selftests/bpf: add sockopt clone/inheritance test Stanislav Fomichev
@ 2019-08-07 23:14   ` Yonghong Song
  2019-08-08  0:08     ` Stanislav Fomichev
  0 siblings, 1 reply; 14+ messages in thread
From: Yonghong Song @ 2019-08-07 23:14 UTC (permalink / raw)
  To: Stanislav Fomichev, netdev, bpf; +Cc: davem, ast, daniel, Martin Lau



On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
> Add a test that calls setsockopt on the listener socket which triggers
> BPF program. This BPF program writes to the sk storage and sets
> clone flag. Make sure that sk storage is cloned for a newly
> accepted connection.
> 
> We have two cloned maps in the tests to make sure we hit both cases
> in bpf_sk_storage_clone: first element (sk_storage_alloc) and
> non-first element(s) (selem_link_map).
> 
> Cc: Martin KaFai Lau <kafai@fb.com>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>   tools/testing/selftests/bpf/.gitignore        |   1 +
>   tools/testing/selftests/bpf/Makefile          |   3 +-
>   .../selftests/bpf/progs/sockopt_inherit.c     | 102 +++++++
>   .../selftests/bpf/test_sockopt_inherit.c      | 252 ++++++++++++++++++
>   4 files changed, 357 insertions(+), 1 deletion(-)
>   create mode 100644 tools/testing/selftests/bpf/progs/sockopt_inherit.c
>   create mode 100644 tools/testing/selftests/bpf/test_sockopt_inherit.c
> 
> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
> index 90f70d2c7c22..60c9338cd9b4 100644
> --- a/tools/testing/selftests/bpf/.gitignore
> +++ b/tools/testing/selftests/bpf/.gitignore
> @@ -42,4 +42,5 @@ xdping
>   test_sockopt
>   test_sockopt_sk
>   test_sockopt_multi
> +test_sockopt_inherit
>   test_tcp_rtt
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 3bd0f4a0336a..c875763a851a 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
>   	test_cgroup_storage test_select_reuseport test_section_names \
>   	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
>   	test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
> -	test_sockopt_multi test_tcp_rtt
> +	test_sockopt_multi test_sockopt_inherit test_tcp_rtt
>   
>   BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
>   TEST_GEN_FILES = $(BPF_OBJ_FILES)
> @@ -110,6 +110,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
>   $(OUTPUT)/test_sockopt: cgroup_helpers.c
>   $(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
>   $(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
> +$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
>   $(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
>   
>   .PHONY: force
> diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
> new file mode 100644
> index 000000000000..357fc9db5874
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
> @@ -0,0 +1,102 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/bpf.h>
> +#include "bpf_helpers.h"
> +
> +char _license[] SEC("license") = "GPL";
> +__u32 _version SEC("version") = 1;
> +
> +#define SOL_CUSTOM			0xdeadbeef
> +#define CUSTOM_INHERIT1			0
> +#define CUSTOM_INHERIT2			1
> +#define CUSTOM_LISTENER			2
> +
> +struct sockopt_inherit {
> +	__u8 val;
> +};
> +
> +struct bpf_map_def SEC("maps") cloned1_map = {
> +	.type = BPF_MAP_TYPE_SK_STORAGE,
> +	.key_size = sizeof(int),
> +	.value_size = sizeof(struct sockopt_inherit),
> +	.map_flags = BPF_F_NO_PREALLOC,
> +};
> +BPF_ANNOTATE_KV_PAIR(cloned1_map, int, struct sockopt_inherit);
> +
> +struct bpf_map_def SEC("maps") cloned2_map = {
> +	.type = BPF_MAP_TYPE_SK_STORAGE,
> +	.key_size = sizeof(int),
> +	.value_size = sizeof(struct sockopt_inherit),
> +	.map_flags = BPF_F_NO_PREALLOC,
> +};
> +BPF_ANNOTATE_KV_PAIR(cloned2_map, int, struct sockopt_inherit);
> +
> +struct bpf_map_def SEC("maps") listener_map = {
> +	.type = BPF_MAP_TYPE_SK_STORAGE,
> +	.key_size = sizeof(int),
> +	.value_size = sizeof(struct sockopt_inherit),
> +	.map_flags = BPF_F_NO_PREALLOC,
> +};
> +BPF_ANNOTATE_KV_PAIR(listener_map, int, struct sockopt_inherit);

You still use the old way for map definitions. Is it possible for you
to use the new map definitions (in section ".maps")?

> +
> +static __inline struct sockopt_inherit *get_storage(struct bpf_sockopt *ctx)
> +{
> +	if (ctx->optname == CUSTOM_INHERIT1)
> +		return bpf_sk_storage_get(&cloned1_map, ctx->sk, 0,
> +					  BPF_SK_STORAGE_GET_F_CREATE |
> +					  BPF_SK_STORAGE_GET_F_CLONE);
> +	else if (ctx->optname == CUSTOM_INHERIT2)
> +		return bpf_sk_storage_get(&cloned2_map, ctx->sk, 0,
> +					  BPF_SK_STORAGE_GET_F_CREATE |
> +					  BPF_SK_STORAGE_GET_F_CLONE);
> +	else
> +		return bpf_sk_storage_get(&listener_map, ctx->sk, 0,
> +					  BPF_SK_STORAGE_GET_F_CREATE);
> +}
> +
[.....]> diff --git a/tools/testing/selftests/bpf/test_sockopt_inherit.c 
b/tools/testing/selftests/bpf/test_sockopt_inherit.c
> new file mode 100644
> index 000000000000..e47b9c28d743
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/test_sockopt_inherit.c
> @@ -0,0 +1,252 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <error.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <netinet/in.h>
> +#include <pthread.h>
> +
> +#include <linux/filter.h>
> +#include <bpf/bpf.h>
> +#include <bpf/libbpf.h>
> +
> +#include "bpf_rlimit.h"
> +#include "bpf_util.h"
> +#include "cgroup_helpers.h"
> +
> +#define CG_PATH				"/sockopt_inherit"
> +#define SOL_CUSTOM			0xdeadbeef
> +#define CUSTOM_INHERIT1			0
> +#define CUSTOM_INHERIT2			1
> +#define CUSTOM_LISTENER			2
> +
> +static int connect_to_server(int server_fd)
> +{
> +	struct sockaddr_storage addr;
> +	socklen_t len = sizeof(addr);
> +	int fd;
> +
> +	fd = socket(AF_INET, SOCK_STREAM, 0);
> +	if (fd < 0) {
> +		log_err("Failed to create client socket");
> +		return -1;
> +	}
> +
> +	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
> +		log_err("Failed to get server addr");
> +		goto out;
> +	}
> +
> +	if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
> +		log_err("Fail to connect to server");
> +		goto out;
> +	}
> +
> +	return fd;
> +
> +out:
> +	close(fd);
> +	return -1;
> +}
> +
> +static int verify_sockopt(int fd, int optname, const char *msg, char expected)
> +{
> +	socklen_t optlen = 1;
> +	char buf = 0;
> +	int err;
> +
> +	err = getsockopt(fd, SOL_CUSTOM, optname, &buf, &optlen);
> +	if (err) {
> +		log_err("%s: failed to call getsockopt", msg);
> +		return 1;
> +	}
> +
> +	log_err("%s %d: got=0x%x ? expected=0x%x", msg, optname, buf, expected);

There may not be an error here.

> +
> +	if (buf != expected) {
> +		log_err("%s: unexpected getsockopt value %d != %d", msg,
> +			buf, expected);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void *server_thread(void *arg)
> +{
> +	struct sockaddr_storage addr;
> +	socklen_t len = sizeof(addr);
> +	int fd = *(int *)arg;
> +	int client_fd;
> +	int err = 0;
> +
> +	if (listen(fd, 1) < 0)
> +		error(1, errno, "Failed to listed on socket");
> +
> +	err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
> +	err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
> +	err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
> +
> +	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
> +	if (client_fd < 0)
> +		error(1, errno, "Failed to accept client");
> +
> +	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
> +	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
> +	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "accept", 0);
> +
> +	close(client_fd);
> +
> +	return (void *)(long)err;
> +}
> +
> +static int start_server(void)
> +{
> +	struct sockaddr_in addr = {
> +		.sin_family = AF_INET,
> +		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
> +	};
> +	char buf;
> +	int err;
> +	int fd;
> +	int i;
> +
> +	fd = socket(AF_INET, SOCK_STREAM, 0);
> +	if (fd < 0) {
> +		log_err("Failed to create server socket");
> +		return -1;
> +	}
> +
> +	for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) {
> +		buf = 0x01;
> +		err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1);
> +		if (err) {
> +			log_err("Failed to call setsockopt(%d)", i);
> +			close(fd);
> +			return -1;
> +		}
> +	}
> +
> +	if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
> +		log_err("Failed to bind socket");
> +		close(fd);
> +		return -1;
> +	}
> +
> +	return fd;
> +}
> +
> +static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
> +{
> +	enum bpf_attach_type attach_type;
> +	enum bpf_prog_type prog_type;
> +	struct bpf_program *prog;
> +	int err;
> +
> +	err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
> +	if (err) {
> +		log_err("Failed to deduct types for %s BPF program", title);
> +		return -1;
> +	}
> +
> +	prog = bpf_object__find_program_by_title(obj, title);
> +	if (!prog) {
> +		log_err("Failed to find %s BPF program", title);
> +		return -1;
> +	}
> +
> +	err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
> +			      attach_type, 0);
> +	if (err) {
> +		log_err("Failed to attach %s BPF program", title);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int run_test(int cgroup_fd)
> +{
> +	struct bpf_prog_load_attr attr = {
> +		.file = "./sockopt_inherit.o",
> +	};
> +	int server_fd = -1, client_fd;
> +	struct bpf_object *obj;
> +	void *server_err;
> +	pthread_t tid;
> +	int ignored;
> +	int err;
> +
> +	err = bpf_prog_load_xattr(&attr, &obj, &ignored);
> +	if (err) {
> +		log_err("Failed to load BPF object");
> +		return -1;
> +	}
> +
> +	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
> +	if (err)
> +		goto close_bpf_object;
> +
> +	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
> +	if (err)
> +		goto close_bpf_object;
> +
> +	server_fd = start_server();
> +	if (server_fd < 0) {
> +		err = -1;
> +		goto close_bpf_object;
> +	}
> +
> +	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
> +
> +	client_fd = connect_to_server(server_fd);
> +	if (client_fd < 0) {
> +		err = -1;
> +		goto close_bpf_object;
> +	}
> +
> +	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0);
> +	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0);
> +	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0);
> +
> +	pthread_join(tid, &server_err);
> +
> +	err += (int)(long)server_err;
> +
> +	close(client_fd);
> +
> +close_bpf_object:
> +	bpf_object__close(obj);
> +	close(server_fd);

server_fd could be -1 here.

> +	return err;
> +}
> +
> +int main(int args, char **argv)
> +{
> +	int cgroup_fd;
> +	int err = EXIT_SUCCESS;
> +
> +	if (setup_cgroup_environment())
> +		return err;
> +
> +	cgroup_fd = create_and_get_cgroup(CG_PATH);
> +	if (cgroup_fd < 0)
> +		goto cleanup_cgroup_env;
> +
> +	if (join_cgroup(CG_PATH))
> +		goto cleanup_cgroup;
> +
> +	if (run_test(cgroup_fd))
> +		err = EXIT_FAILURE;
> +
> +	printf("test_sockopt_inherit: %s\n",
> +	       err == EXIT_SUCCESS ? "PASSED" : "FAILED");
> +
> +cleanup_cgroup:
> +	close(cgroup_fd);
> +cleanup_cgroup_env:
> +	cleanup_cgroup_environment();
> +	return err;
> +}
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-07 23:03   ` Yonghong Song
@ 2019-08-08  0:05     ` Stanislav Fomichev
  2019-08-08  0:18       ` Yonghong Song
  2019-08-08  7:11       ` Martin Lau
  0 siblings, 2 replies; 14+ messages in thread
From: Stanislav Fomichev @ 2019-08-08  0:05 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Stanislav Fomichev, netdev, bpf, davem, ast, daniel, Martin Lau

On 08/07, Yonghong Song wrote:
> 
> 
> On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
> > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > and call it from sk_clone_lock(). Reuse the gap in
> > bpf_sk_storage_elem to store clone/non-clone flag.
> > 
> > Cc: Martin KaFai Lau <kafai@fb.com>
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> 
> I tried to see whether I can find any missing race conditions in
> the code but I failed. So except a minor comments below,
Thanks for a review!

> Acked-by: Yonghong Song <yhs@fb.com>
> 
> > ---
> >   include/net/bpf_sk_storage.h |  10 ++++
> >   include/uapi/linux/bpf.h     |   1 +
> >   net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
> >   net/core/sock.c              |   9 ++--
> >   4 files changed, 115 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> > index b9dcb02e756b..8e4f831d2e52 100644
> > --- a/include/net/bpf_sk_storage.h
> > +++ b/include/net/bpf_sk_storage.h
> > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> >   extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> >   extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> >   
> > +#ifdef CONFIG_BPF_SYSCALL
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> > +#else
> > +static inline int bpf_sk_storage_clone(const struct sock *sk,
> > +				       struct sock *newsk)
> > +{
> > +	return 0;
> > +}
> > +#endif
> > +
> >   #endif /* _BPF_SK_STORAGE_H */
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 4393bd4b2419..00459ca4c8cf 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2931,6 +2931,7 @@ enum bpf_func_id {
> >   
> >   /* BPF_FUNC_sk_storage_get flags */
> >   #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> > +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
> >   
> >   /* Mode for BPF_FUNC_skb_adjust_room helper. */
> >   enum bpf_adj_room_mode {
> > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> > index 94c7f77ecb6b..b6dea67965bc 100644
> > --- a/net/core/bpf_sk_storage.c
> > +++ b/net/core/bpf_sk_storage.c
> > @@ -12,6 +12,9 @@
> >   
> >   static atomic_t cache_idx;
> >   
> > +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
> > +					 BPF_SK_STORAGE_GET_F_CLONE)
> > +
> >   struct bucket {
> >   	struct hlist_head list;
> >   	raw_spinlock_t lock;
> > @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
> >   	struct hlist_node snode;	/* Linked to bpf_sk_storage */
> >   	struct bpf_sk_storage __rcu *sk_storage;
> >   	struct rcu_head rcu;
> > -	/* 8 bytes hole */
> > +	u8 clone:1;
> > +	/* 7 bytes hole */
> >   	/* The data is stored in aother cacheline to minimize
> >   	 * the number of cachelines access during a cache hit.
> >   	 */
> > @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> >   	return 0;
> >   }
> >   
> > -/* Called by __sk_destruct() */
> > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> >   void bpf_sk_storage_free(struct sock *sk)
> >   {
> >   	struct bpf_sk_storage_elem *selem;
> > @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> >   	return err;
> >   }
> >   
> > +static struct bpf_sk_storage_elem *
> > +bpf_sk_storage_clone_elem(struct sock *newsk,
> > +			  struct bpf_sk_storage_map *smap,
> > +			  struct bpf_sk_storage_elem *selem)
> > +{
> > +	struct bpf_sk_storage_elem *copy_selem;
> > +
> > +	copy_selem = selem_alloc(smap, newsk, NULL, true);
> > +	if (!copy_selem)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	if (map_value_has_spin_lock(&smap->map))
> > +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> > +				      SDATA(selem)->data, true);
> > +	else
> > +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
> > +			       SDATA(selem)->data);
> > +
> > +	return copy_selem;
> > +}
> > +
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > +{
> > +	struct bpf_sk_storage *new_sk_storage = NULL;
> > +	struct bpf_sk_storage *sk_storage;
> > +	struct bpf_sk_storage_elem *selem;
> > +	int ret;
> > +
> > +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > +
> > +	rcu_read_lock();
> > +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > +
> > +	if (!sk_storage || hlist_empty(&sk_storage->list))
> > +		goto out;
> > +
> > +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > +		struct bpf_sk_storage_map *smap;
> > +		struct bpf_sk_storage_elem *copy_selem;
> > +
> > +		if (!selem->clone)
> > +			continue;
> > +
> > +		smap = rcu_dereference(SDATA(selem)->smap);
> > +		if (!smap)
> > +			continue;
> > +
> > +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > +		if (IS_ERR(copy_selem)) {
> > +			ret = PTR_ERR(copy_selem);
> > +			goto err;
> > +		}
> > +
> > +		if (!new_sk_storage) {
> > +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> > +			if (ret) {
> > +				kfree(copy_selem);
> > +				atomic_sub(smap->elem_size,
> > +					   &newsk->sk_omem_alloc);
> > +				goto err;
> > +			}
> > +
> > +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > +			continue;
> > +		}
> > +
> > +		raw_spin_lock_bh(&new_sk_storage->lock);
> > +		selem_link_map(smap, copy_selem);
> > +		__selem_link_sk(new_sk_storage, copy_selem);
> > +		raw_spin_unlock_bh(&new_sk_storage->lock);
> 
> Considering in this particular case, new socket is not visible to 
> outside world yet (both kernel and user space), map_delete/map_update
> operations are not applicable in this situation, so
> the above raw_spin_lock_bh() probably not needed.
I agree, it's doing nothing, but __selem_link_sk has the following comment:
/* sk_storage->lock must be held and sk_storage->list cannot be empty */

Just wanted to keep that invariant for this call site as well (in case
we add some lockdep enforcement or something else). WDYT?

> > +	}
> > +
> > +out:
> > +	rcu_read_unlock();
> > +	return 0;
> > +
> > +err:
> > +	rcu_read_unlock();
> > +
> > +	bpf_sk_storage_free(newsk);
> > +	return ret;
> > +}
> > +
> >   BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> >   	   void *, value, u64, flags)
> >   {
> >   	struct bpf_sk_storage_data *sdata;
> >   
> > -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> > +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> > +		return (unsigned long)NULL;
> > +
> > +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> > +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
> >   		return (unsigned long)NULL;
> >   
> >   	sdata = sk_storage_lookup(sk, map, true);
> >   	if (sdata)
> >   		return (unsigned long)sdata->data;
> >   
> > -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> > +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
> >   	    /* Cannot add new elem to a going away sk.
> >   	     * Otherwise, the new elem may become a leak
> >   	     * (and also other memory issues during map
> > @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> >   		/* sk must be a fullsock (guaranteed by verifier),
> >   		 * so sock_gen_put() is unnecessary.
> >   		 */
> > +		if (!IS_ERR(sdata))
> > +			SELEM(sdata)->clone =
> > +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
> >   		sock_put(sk);
> >   		return IS_ERR(sdata) ?
> >   			(unsigned long)NULL : (unsigned long)sdata->data;
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index d57b0cc995a0..f5e801a9cea4 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> >   			goto out;
> >   		}
> >   		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > -#ifdef CONFIG_BPF_SYSCALL
> > -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > -#endif
> > +
> > +		if (bpf_sk_storage_clone(sk, newsk)) {
> > +			sk_free_unlock_clone(newsk);
> > +			newsk = NULL;
> > +			goto out;
> > +		}
> >   
> >   		newsk->sk_err	   = 0;
> >   		newsk->sk_err_soft = 0;
> > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 3/3] selftests/bpf: add sockopt clone/inheritance test
  2019-08-07 23:14   ` Yonghong Song
@ 2019-08-08  0:08     ` Stanislav Fomichev
  0 siblings, 0 replies; 14+ messages in thread
From: Stanislav Fomichev @ 2019-08-08  0:08 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Stanislav Fomichev, netdev, bpf, davem, ast, daniel, Martin Lau

On 08/07, Yonghong Song wrote:
> 
> 
> On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
> > Add a test that calls setsockopt on the listener socket which triggers
> > BPF program. This BPF program writes to the sk storage and sets
> > clone flag. Make sure that sk storage is cloned for a newly
> > accepted connection.
> > 
> > We have two cloned maps in the tests to make sure we hit both cases
> > in bpf_sk_storage_clone: first element (sk_storage_alloc) and
> > non-first element(s) (selem_link_map).
> > 
> > Cc: Martin KaFai Lau <kafai@fb.com>
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >   tools/testing/selftests/bpf/.gitignore        |   1 +
> >   tools/testing/selftests/bpf/Makefile          |   3 +-
> >   .../selftests/bpf/progs/sockopt_inherit.c     | 102 +++++++
> >   .../selftests/bpf/test_sockopt_inherit.c      | 252 ++++++++++++++++++
> >   4 files changed, 357 insertions(+), 1 deletion(-)
> >   create mode 100644 tools/testing/selftests/bpf/progs/sockopt_inherit.c
> >   create mode 100644 tools/testing/selftests/bpf/test_sockopt_inherit.c
> > 
> > diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
> > index 90f70d2c7c22..60c9338cd9b4 100644
> > --- a/tools/testing/selftests/bpf/.gitignore
> > +++ b/tools/testing/selftests/bpf/.gitignore
> > @@ -42,4 +42,5 @@ xdping
> >   test_sockopt
> >   test_sockopt_sk
> >   test_sockopt_multi
> > +test_sockopt_inherit
> >   test_tcp_rtt
> > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> > index 3bd0f4a0336a..c875763a851a 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
> >   	test_cgroup_storage test_select_reuseport test_section_names \
> >   	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
> >   	test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
> > -	test_sockopt_multi test_tcp_rtt
> > +	test_sockopt_multi test_sockopt_inherit test_tcp_rtt
> >   
> >   BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
> >   TEST_GEN_FILES = $(BPF_OBJ_FILES)
> > @@ -110,6 +110,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
> >   $(OUTPUT)/test_sockopt: cgroup_helpers.c
> >   $(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
> >   $(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
> > +$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
> >   $(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
> >   
> >   .PHONY: force
> > diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
> > new file mode 100644
> > index 000000000000..357fc9db5874
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
> > @@ -0,0 +1,102 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <linux/bpf.h>
> > +#include "bpf_helpers.h"
> > +
> > +char _license[] SEC("license") = "GPL";
> > +__u32 _version SEC("version") = 1;
> > +
> > +#define SOL_CUSTOM			0xdeadbeef
> > +#define CUSTOM_INHERIT1			0
> > +#define CUSTOM_INHERIT2			1
> > +#define CUSTOM_LISTENER			2
> > +
> > +struct sockopt_inherit {
> > +	__u8 val;
> > +};
> > +
> > +struct bpf_map_def SEC("maps") cloned1_map = {
> > +	.type = BPF_MAP_TYPE_SK_STORAGE,
> > +	.key_size = sizeof(int),
> > +	.value_size = sizeof(struct sockopt_inherit),
> > +	.map_flags = BPF_F_NO_PREALLOC,
> > +};
> > +BPF_ANNOTATE_KV_PAIR(cloned1_map, int, struct sockopt_inherit);
> > +
> > +struct bpf_map_def SEC("maps") cloned2_map = {
> > +	.type = BPF_MAP_TYPE_SK_STORAGE,
> > +	.key_size = sizeof(int),
> > +	.value_size = sizeof(struct sockopt_inherit),
> > +	.map_flags = BPF_F_NO_PREALLOC,
> > +};
> > +BPF_ANNOTATE_KV_PAIR(cloned2_map, int, struct sockopt_inherit);
> > +
> > +struct bpf_map_def SEC("maps") listener_map = {
> > +	.type = BPF_MAP_TYPE_SK_STORAGE,
> > +	.key_size = sizeof(int),
> > +	.value_size = sizeof(struct sockopt_inherit),
> > +	.map_flags = BPF_F_NO_PREALLOC,
> > +};
> > +BPF_ANNOTATE_KV_PAIR(listener_map, int, struct sockopt_inherit);
> 
> Your still use the old way for map definitions. Is this possible for you
> to use new map definitions (in section ".maps")?
Ah, my bad, I'm not used to the new defs. Will fix!

> > +
> > +static __inline struct sockopt_inherit *get_storage(struct bpf_sockopt *ctx)
> > +{
> > +	if (ctx->optname == CUSTOM_INHERIT1)
> > +		return bpf_sk_storage_get(&cloned1_map, ctx->sk, 0,
> > +					  BPF_SK_STORAGE_GET_F_CREATE |
> > +					  BPF_SK_STORAGE_GET_F_CLONE);
> > +	else if (ctx->optname == CUSTOM_INHERIT2)
> > +		return bpf_sk_storage_get(&cloned2_map, ctx->sk, 0,
> > +					  BPF_SK_STORAGE_GET_F_CREATE |
> > +					  BPF_SK_STORAGE_GET_F_CLONE);
> > +	else
> > +		return bpf_sk_storage_get(&listener_map, ctx->sk, 0,
> > +					  BPF_SK_STORAGE_GET_F_CREATE);
> > +}
> > +
> [.....]> diff --git a/tools/testing/selftests/bpf/test_sockopt_inherit.c 
> b/tools/testing/selftests/bpf/test_sockopt_inherit.c
> > new file mode 100644
> > index 000000000000..e47b9c28d743
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/test_sockopt_inherit.c
> > @@ -0,0 +1,252 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <error.h>
> > +#include <errno.h>
> > +#include <stdio.h>
> > +#include <unistd.h>
> > +#include <sys/types.h>
> > +#include <sys/socket.h>
> > +#include <netinet/in.h>
> > +#include <pthread.h>
> > +
> > +#include <linux/filter.h>
> > +#include <bpf/bpf.h>
> > +#include <bpf/libbpf.h>
> > +
> > +#include "bpf_rlimit.h"
> > +#include "bpf_util.h"
> > +#include "cgroup_helpers.h"
> > +
> > +#define CG_PATH				"/sockopt_inherit"
> > +#define SOL_CUSTOM			0xdeadbeef
> > +#define CUSTOM_INHERIT1			0
> > +#define CUSTOM_INHERIT2			1
> > +#define CUSTOM_LISTENER			2
> > +
> > +static int connect_to_server(int server_fd)
> > +{
> > +	struct sockaddr_storage addr;
> > +	socklen_t len = sizeof(addr);
> > +	int fd;
> > +
> > +	fd = socket(AF_INET, SOCK_STREAM, 0);
> > +	if (fd < 0) {
> > +		log_err("Failed to create client socket");
> > +		return -1;
> > +	}
> > +
> > +	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
> > +		log_err("Failed to get server addr");
> > +		goto out;
> > +	}
> > +
> > +	if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
> > +		log_err("Fail to connect to server");
> > +		goto out;
> > +	}
> > +
> > +	return fd;
> > +
> > +out:
> > +	close(fd);
> > +	return -1;
> > +}
> > +
> > +static int verify_sockopt(int fd, int optname, const char *msg, char expected)
> > +{
> > +	socklen_t optlen = 1;
> > +	char buf = 0;
> > +	int err;
> > +
> > +	err = getsockopt(fd, SOL_CUSTOM, optname, &buf, &optlen);
> > +	if (err) {
> > +		log_err("%s: failed to call getsockopt", msg);
> > +		return 1;
> > +	}
> > +
> > +	log_err("%s %d: got=0x%x ? expected=0x%x", msg, optname, buf, expected);
> 
> There may not be error here.
Good point, will switch to simple printf.

> > +
> > +	if (buf != expected) {
> > +		log_err("%s: unexpected getsockopt value %d != %d", msg,
> > +			buf, expected);
> > +		return 1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static void *server_thread(void *arg)
> > +{
> > +	struct sockaddr_storage addr;
> > +	socklen_t len = sizeof(addr);
> > +	int fd = *(int *)arg;
> > +	int client_fd;
> > +	int err = 0;
> > +
> > +	if (listen(fd, 1) < 0)
> > +		error(1, errno, "Failed to listed on socket");
> > +
> > +	err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
> > +	err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
> > +	err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
> > +
> > +	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
> > +	if (client_fd < 0)
> > +		error(1, errno, "Failed to accept client");
> > +
> > +	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
> > +	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
> > +	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "accept", 0);
> > +
> > +	close(client_fd);
> > +
> > +	return (void *)(long)err;
> > +}
> > +
> > +static int start_server(void)
> > +{
> > +	struct sockaddr_in addr = {
> > +		.sin_family = AF_INET,
> > +		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
> > +	};
> > +	char buf;
> > +	int err;
> > +	int fd;
> > +	int i;
> > +
> > +	fd = socket(AF_INET, SOCK_STREAM, 0);
> > +	if (fd < 0) {
> > +		log_err("Failed to create server socket");
> > +		return -1;
> > +	}
> > +
> > +	for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) {
> > +		buf = 0x01;
> > +		err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1);
> > +		if (err) {
> > +			log_err("Failed to call setsockopt(%d)", i);
> > +			close(fd);
> > +			return -1;
> > +		}
> > +	}
> > +
> > +	if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
> > +		log_err("Failed to bind socket");
> > +		close(fd);
> > +		return -1;
> > +	}
> > +
> > +	return fd;
> > +}
> > +
> > +static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
> > +{
> > +	enum bpf_attach_type attach_type;
> > +	enum bpf_prog_type prog_type;
> > +	struct bpf_program *prog;
> > +	int err;
> > +
> > +	err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
> > +	if (err) {
> > +		log_err("Failed to deduct types for %s BPF program", title);
> > +		return -1;
> > +	}
> > +
> > +	prog = bpf_object__find_program_by_title(obj, title);
> > +	if (!prog) {
> > +		log_err("Failed to find %s BPF program", title);
> > +		return -1;
> > +	}
> > +
> > +	err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
> > +			      attach_type, 0);
> > +	if (err) {
> > +		log_err("Failed to attach %s BPF program", title);
> > +		return -1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int run_test(int cgroup_fd)
> > +{
> > +	struct bpf_prog_load_attr attr = {
> > +		.file = "./sockopt_inherit.o",
> > +	};
> > +	int server_fd = -1, client_fd;
> > +	struct bpf_object *obj;
> > +	void *server_err;
> > +	pthread_t tid;
> > +	int ignored;
> > +	int err;
> > +
> > +	err = bpf_prog_load_xattr(&attr, &obj, &ignored);
> > +	if (err) {
> > +		log_err("Failed to load BPF object");
> > +		return -1;
> > +	}
> > +
> > +	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
> > +	if (err)
> > +		goto close_bpf_object;
> > +
> > +	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
> > +	if (err)
> > +		goto close_bpf_object;
> > +
> > +	server_fd = start_server();
> > +	if (server_fd < 0) {
> > +		err = -1;
> > +		goto close_bpf_object;
> > +	}
> > +
> > +	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
> > +
> > +	client_fd = connect_to_server(server_fd);
> > +	if (client_fd < 0) {
> > +		err = -1;
> > +		goto close_bpf_object;
> > +	}
> > +
> > +	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0);
> > +	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0);
> > +	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0);
> > +
> > +	pthread_join(tid, &server_err);
> > +
> > +	err += (int)(long)server_err;
> > +
> > +	close(client_fd);
> > +
> > +close_bpf_object:
> > +	bpf_object__close(obj);
> > +	close(server_fd);
> 
> server_fd could be -1 here.
I've initialized it to -1 so we can close(-1) here and not close(some
random data). Shouldn't be a problem, right?

The order is backwards though, should be:
close(server_fd);
bpf_object__close(obj);

I can probably add a label for bpf_object__close case to avoid this close(-1).
Will do for a v2.

> > +	return err;
> > +}
> > +
> > +int main(int args, char **argv)
> > +{
> > +	int cgroup_fd;
> > +	int err = EXIT_SUCCESS;
> > +
> > +	if (setup_cgroup_environment())
> > +		return err;
> > +
> > +	cgroup_fd = create_and_get_cgroup(CG_PATH);
> > +	if (cgroup_fd < 0)
> > +		goto cleanup_cgroup_env;
> > +
> > +	if (join_cgroup(CG_PATH))
> > +		goto cleanup_cgroup;
> > +
> > +	if (run_test(cgroup_fd))
> > +		err = EXIT_FAILURE;
> > +
> > +	printf("test_sockopt_inherit: %s\n",
> > +	       err == EXIT_SUCCESS ? "PASSED" : "FAILED");
> > +
> > +cleanup_cgroup:
> > +	close(cgroup_fd);
> > +cleanup_cgroup_env:
> > +	cleanup_cgroup_environment();
> > +	return err;
> > +}
> > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-08  0:05     ` Stanislav Fomichev
@ 2019-08-08  0:18       ` Yonghong Song
  2019-08-08  7:11       ` Martin Lau
  1 sibling, 0 replies; 14+ messages in thread
From: Yonghong Song @ 2019-08-08  0:18 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Stanislav Fomichev, netdev, bpf, davem, ast, daniel, Martin Lau



On 8/7/19 5:05 PM, Stanislav Fomichev wrote:
> On 08/07, Yonghong Song wrote:
>>
>>
>> On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
>>> Add new helper bpf_sk_storage_clone which optionally clones sk storage
>>> and call it from sk_clone_lock(). Reuse the gap in
>>> bpf_sk_storage_elem to store clone/non-clone flag.
>>>
>>> Cc: Martin KaFai Lau <kafai@fb.com>
>>> Signed-off-by: Stanislav Fomichev <sdf@google.com>
>>
>> I tried to see whether I can find any missing race conditions in
>> the code but I failed. So except a minor comments below,
> Thanks for a review!
> 
>> Acked-by: Yonghong Song <yhs@fb.com>
>>
>>> ---
>>>    include/net/bpf_sk_storage.h |  10 ++++
>>>    include/uapi/linux/bpf.h     |   1 +
>>>    net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
>>>    net/core/sock.c              |   9 ++--
>>>    4 files changed, 115 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
>>> index b9dcb02e756b..8e4f831d2e52 100644
>>> --- a/include/net/bpf_sk_storage.h
>>> +++ b/include/net/bpf_sk_storage.h
>>> @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
>>>    extern const struct bpf_func_proto bpf_sk_storage_get_proto;
>>>    extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
>>>    
>>> +#ifdef CONFIG_BPF_SYSCALL
>>> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
>>> +#else
>>> +static inline int bpf_sk_storage_clone(const struct sock *sk,
>>> +				       struct sock *newsk)
>>> +{
>>> +	return 0;
>>> +}
>>> +#endif
>>> +
>>>    #endif /* _BPF_SK_STORAGE_H */
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 4393bd4b2419..00459ca4c8cf 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -2931,6 +2931,7 @@ enum bpf_func_id {
>>>    
>>>    /* BPF_FUNC_sk_storage_get flags */
>>>    #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
>>> +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
>>>    
>>>    /* Mode for BPF_FUNC_skb_adjust_room helper. */
>>>    enum bpf_adj_room_mode {
>>> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
>>> index 94c7f77ecb6b..b6dea67965bc 100644
>>> --- a/net/core/bpf_sk_storage.c
>>> +++ b/net/core/bpf_sk_storage.c
>>> @@ -12,6 +12,9 @@
>>>    
>>>    static atomic_t cache_idx;
>>>    
>>> +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
>>> +					 BPF_SK_STORAGE_GET_F_CLONE)
>>> +
>>>    struct bucket {
>>>    	struct hlist_head list;
>>>    	raw_spinlock_t lock;
>>> @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
>>>    	struct hlist_node snode;	/* Linked to bpf_sk_storage */
>>>    	struct bpf_sk_storage __rcu *sk_storage;
>>>    	struct rcu_head rcu;
>>> -	/* 8 bytes hole */
>>> +	u8 clone:1;
>>> +	/* 7 bytes hole */
>>>    	/* The data is stored in another cacheline to minimize
>>>    	 * the number of cachelines access during a cache hit.
>>>    	 */
>>> @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
>>>    	return 0;
>>>    }
>>>    
>>> -/* Called by __sk_destruct() */
>>> +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
>>>    void bpf_sk_storage_free(struct sock *sk)
>>>    {
>>>    	struct bpf_sk_storage_elem *selem;
>>> @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
>>>    	return err;
>>>    }
>>>    
>>> +static struct bpf_sk_storage_elem *
>>> +bpf_sk_storage_clone_elem(struct sock *newsk,
>>> +			  struct bpf_sk_storage_map *smap,
>>> +			  struct bpf_sk_storage_elem *selem)
>>> +{
>>> +	struct bpf_sk_storage_elem *copy_selem;
>>> +
>>> +	copy_selem = selem_alloc(smap, newsk, NULL, true);
>>> +	if (!copy_selem)
>>> +		return ERR_PTR(-ENOMEM);
>>> +
>>> +	if (map_value_has_spin_lock(&smap->map))
>>> +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
>>> +				      SDATA(selem)->data, true);
>>> +	else
>>> +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
>>> +			       SDATA(selem)->data);
>>> +
>>> +	return copy_selem;
>>> +}
>>> +
>>> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>>> +{
>>> +	struct bpf_sk_storage *new_sk_storage = NULL;
>>> +	struct bpf_sk_storage *sk_storage;
>>> +	struct bpf_sk_storage_elem *selem;
>>> +	int ret;
>>> +
>>> +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
>>> +
>>> +	rcu_read_lock();
>>> +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
>>> +
>>> +	if (!sk_storage || hlist_empty(&sk_storage->list))
>>> +		goto out;
>>> +
>>> +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
>>> +		struct bpf_sk_storage_map *smap;
>>> +		struct bpf_sk_storage_elem *copy_selem;
>>> +
>>> +		if (!selem->clone)
>>> +			continue;
>>> +
>>> +		smap = rcu_dereference(SDATA(selem)->smap);
>>> +		if (!smap)
>>> +			continue;
>>> +
>>> +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
>>> +		if (IS_ERR(copy_selem)) {
>>> +			ret = PTR_ERR(copy_selem);
>>> +			goto err;
>>> +		}
>>> +
>>> +		if (!new_sk_storage) {
>>> +			ret = sk_storage_alloc(newsk, smap, copy_selem);
>>> +			if (ret) {
>>> +				kfree(copy_selem);
>>> +				atomic_sub(smap->elem_size,
>>> +					   &newsk->sk_omem_alloc);
>>> +				goto err;
>>> +			}
>>> +
>>> +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
>>> +			continue;
>>> +		}
>>> +
>>> +		raw_spin_lock_bh(&new_sk_storage->lock);
>>> +		selem_link_map(smap, copy_selem);
>>> +		__selem_link_sk(new_sk_storage, copy_selem);
>>> +		raw_spin_unlock_bh(&new_sk_storage->lock);
>>
>> Considering in this particular case, new socket is not visible to
>> outside world yet (both kernel and user space), map_delete/map_update
>> operations are not applicable in this situation, so
>> the above raw_spin_lock_bh() probably not needed.
> I agree, it's doing nothing, but __selem_link_sk has the following comment:
> /* sk_storage->lock must be held and sk_storage->list cannot be empty */
> 
> Just wanted to keep that invariant for this call site as well (in case
> we add some lockdep enforcement or smth else). WDYT?

Agree. Let us keep the locking to be consistent with other uses in
the same file. This is not the critical path.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-07 15:47 ` [PATCH bpf-next 1/3] " Stanislav Fomichev
  2019-08-07 23:03   ` Yonghong Song
@ 2019-08-08  6:39   ` Martin Lau
  2019-08-08 15:28     ` Stanislav Fomichev
  1 sibling, 1 reply; 14+ messages in thread
From: Martin Lau @ 2019-08-08  6:39 UTC (permalink / raw)
  To: Stanislav Fomichev; +Cc: netdev, bpf, davem, ast, daniel

On Wed, Aug 07, 2019 at 08:47:18AM -0700, Stanislav Fomichev wrote:
> Add new helper bpf_sk_storage_clone which optionally clones sk storage
> and call it from sk_clone_lock(). Reuse the gap in
> bpf_sk_storage_elem to store clone/non-clone flag.
> 
> Cc: Martin KaFai Lau <kafai@fb.com>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  include/net/bpf_sk_storage.h |  10 ++++
>  include/uapi/linux/bpf.h     |   1 +
>  net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
>  net/core/sock.c              |   9 ++--
>  4 files changed, 115 insertions(+), 7 deletions(-)
> 
> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> index b9dcb02e756b..8e4f831d2e52 100644
> --- a/include/net/bpf_sk_storage.h
> +++ b/include/net/bpf_sk_storage.h
> @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
>  extern const struct bpf_func_proto bpf_sk_storage_get_proto;
>  extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
>  
> +#ifdef CONFIG_BPF_SYSCALL
> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> +#else
> +static inline int bpf_sk_storage_clone(const struct sock *sk,
> +				       struct sock *newsk)
> +{
> +	return 0;
> +}
> +#endif
> +
>  #endif /* _BPF_SK_STORAGE_H */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 4393bd4b2419..00459ca4c8cf 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2931,6 +2931,7 @@ enum bpf_func_id {
>  
>  /* BPF_FUNC_sk_storage_get flags */
>  #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
It is only used in bpf_sk_storage_get().
What if the elem is created from bpf_fd_sk_storage_update_elem()
i.e. from the syscall API ?

What may be the use case for a map to have both CLONE and non-CLONE
elements?  If it is not the case, would it be better to add
BPF_F_CLONE to bpf_attr->map_flags?

>  
>  /* Mode for BPF_FUNC_skb_adjust_room helper. */
>  enum bpf_adj_room_mode {
> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> index 94c7f77ecb6b..b6dea67965bc 100644
> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -12,6 +12,9 @@
>  
>  static atomic_t cache_idx;
>  
> +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
> +					 BPF_SK_STORAGE_GET_F_CLONE)
> +
>  struct bucket {
>  	struct hlist_head list;
>  	raw_spinlock_t lock;
> @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
>  	struct hlist_node snode;	/* Linked to bpf_sk_storage */
>  	struct bpf_sk_storage __rcu *sk_storage;
>  	struct rcu_head rcu;
> -	/* 8 bytes hole */
> +	u8 clone:1;
> +	/* 7 bytes hole */
> >  	/* The data is stored in another cacheline to minimize
>  	 * the number of cachelines access during a cache hit.
>  	 */
> @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
>  	return 0;
>  }
>  
> -/* Called by __sk_destruct() */
> +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
>  void bpf_sk_storage_free(struct sock *sk)
>  {
>  	struct bpf_sk_storage_elem *selem;
> @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
>  	return err;
>  }
>  
> +static struct bpf_sk_storage_elem *
> +bpf_sk_storage_clone_elem(struct sock *newsk,
> +			  struct bpf_sk_storage_map *smap,
> +			  struct bpf_sk_storage_elem *selem)
> +{
> +	struct bpf_sk_storage_elem *copy_selem;
> +
> +	copy_selem = selem_alloc(smap, newsk, NULL, true);
> +	if (!copy_selem)
> +		return ERR_PTR(-ENOMEM);
nit.
may be just return NULL as selem_alloc() does.

> +
> +	if (map_value_has_spin_lock(&smap->map))
> +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> +				      SDATA(selem)->data, true);
> +	else
> +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
> +			       SDATA(selem)->data);
> +
> +	return copy_selem;
> +}
> +
> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> +{
> +	struct bpf_sk_storage *new_sk_storage = NULL;
> +	struct bpf_sk_storage *sk_storage;
> +	struct bpf_sk_storage_elem *selem;
> +	int ret;
> +
> +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> +
> +	rcu_read_lock();
> +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> +
> +	if (!sk_storage || hlist_empty(&sk_storage->list))
> +		goto out;
> +
> +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> +		struct bpf_sk_storage_map *smap;
> +		struct bpf_sk_storage_elem *copy_selem;
> +
> +		if (!selem->clone)
> +			continue;
> +
> +		smap = rcu_dereference(SDATA(selem)->smap);
> +		if (!smap)
smap should not be NULL.

> +			continue;
> +
> +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> +		if (IS_ERR(copy_selem)) {
> +			ret = PTR_ERR(copy_selem);
> +			goto err;
> +		}
> +
> +		if (!new_sk_storage) {
> +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> +			if (ret) {
> +				kfree(copy_selem);
> +				atomic_sub(smap->elem_size,
> +					   &newsk->sk_omem_alloc);
> +				goto err;
> +			}
> +
> +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> +			continue;
> +		}
> +
> +		raw_spin_lock_bh(&new_sk_storage->lock);
> +		selem_link_map(smap, copy_selem);
Unlike the existing selem-update use-cases in bpf_sk_storage.c,
the smap->map.refcnt has not been held here.  Reading the smap
is fine.  However, adding a new selem to a deleting smap is an issue.
Hence, I think bpf_map_inc_not_zero() should be done first.

> +		__selem_link_sk(new_sk_storage, copy_selem);
> +		raw_spin_unlock_bh(&new_sk_storage->lock);
> +	}
> +
> +out:
> +	rcu_read_unlock();
> +	return 0;
> +
> +err:
> +	rcu_read_unlock();
> +
> +	bpf_sk_storage_free(newsk);
> +	return ret;
> +}
> +
>  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  	   void *, value, u64, flags)
>  {
>  	struct bpf_sk_storage_data *sdata;
>  
> -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> +		return (unsigned long)NULL;
> +
> +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
>  		return (unsigned long)NULL;
>  
>  	sdata = sk_storage_lookup(sk, map, true);
>  	if (sdata)
>  		return (unsigned long)sdata->data;
>  
> -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
>  	    /* Cannot add new elem to a going away sk.
>  	     * Otherwise, the new elem may become a leak
>  	     * (and also other memory issues during map
> @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  		/* sk must be a fullsock (guaranteed by verifier),
>  		 * so sock_gen_put() is unnecessary.
>  		 */
> +		if (!IS_ERR(sdata))
> +			SELEM(sdata)->clone =
> +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
>  		sock_put(sk);
>  		return IS_ERR(sdata) ?
>  			(unsigned long)NULL : (unsigned long)sdata->data;
> diff --git a/net/core/sock.c b/net/core/sock.c
> index d57b0cc995a0..f5e801a9cea4 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>  			goto out;
>  		}
>  		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> -#ifdef CONFIG_BPF_SYSCALL
> -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> -#endif
> +
> +		if (bpf_sk_storage_clone(sk, newsk)) {
> +			sk_free_unlock_clone(newsk);
> +			newsk = NULL;
> +			goto out;
> +		}
>  
>  		newsk->sk_err	   = 0;
>  		newsk->sk_err_soft = 0;
> -- 
> 2.22.0.770.g0f2c4a37fd-goog
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-08  0:05     ` Stanislav Fomichev
  2019-08-08  0:18       ` Yonghong Song
@ 2019-08-08  7:11       ` Martin Lau
  1 sibling, 0 replies; 14+ messages in thread
From: Martin Lau @ 2019-08-08  7:11 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Yonghong Song, Stanislav Fomichev, netdev, bpf, davem, ast, daniel

On Wed, Aug 07, 2019 at 05:05:33PM -0700, Stanislav Fomichev wrote:
[ ... ]
> > > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > > +{
> > > +	struct bpf_sk_storage *new_sk_storage = NULL;
> > > +	struct bpf_sk_storage *sk_storage;
> > > +	struct bpf_sk_storage_elem *selem;
> > > +	int ret;
> > > +
> > > +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > > +
> > > +	rcu_read_lock();
> > > +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > +
> > > +	if (!sk_storage || hlist_empty(&sk_storage->list))
> > > +		goto out;
> > > +
> > > +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > > +		struct bpf_sk_storage_map *smap;
> > > +		struct bpf_sk_storage_elem *copy_selem;
> > > +
> > > +		if (!selem->clone)
> > > +			continue;
> > > +
> > > +		smap = rcu_dereference(SDATA(selem)->smap);
> > > +		if (!smap)
> > > +			continue;
> > > +
> > > +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > > +		if (IS_ERR(copy_selem)) {
> > > +			ret = PTR_ERR(copy_selem);
> > > +			goto err;
> > > +		}
> > > +
> > > +		if (!new_sk_storage) {
> > > +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> > > +			if (ret) {
> > > +				kfree(copy_selem);
> > > +				atomic_sub(smap->elem_size,
> > > +					   &newsk->sk_omem_alloc);
> > > +				goto err;
> > > +			}
> > > +
> > > +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > > +			continue;
> > > +		}
> > > +
> > > +		raw_spin_lock_bh(&new_sk_storage->lock);
> > > +		selem_link_map(smap, copy_selem);
> > > +		__selem_link_sk(new_sk_storage, copy_selem);
> > > +		raw_spin_unlock_bh(&new_sk_storage->lock);
> > 
> > Considering in this particular case, new socket is not visible to 
> > outside world yet (both kernel and user space), map_delete/map_update
> > operations are not applicable in this situation, so
> > the above raw_spin_lock_bh() probably not needed.
> I agree, it's doing nothing, but __selem_link_sk has the following comment:
> /* sk_storage->lock must be held and sk_storage->list cannot be empty */
I believe I may have forgotten to remove this comment after reusing it
in sk_storage_alloc() which also has a similar no-lock-needed situation
like here.

I would also prefer not to acquire the new_sk_storage->lock here to avoid
confusion when investigating racing bugs in the future.

> 
> Just wanted to keep that invariant for this call site as well (in case
> we add some lockdep enforcement or smth else). WDYT?
> 
> > > +	}
> > > +
> > > +out:
> > > +	rcu_read_unlock();
> > > +	return 0;
> > > +
> > > +err:
> > > +	rcu_read_unlock();
> > > +
> > > +	bpf_sk_storage_free(newsk);
> > > +	return ret;
> > > +}
> > > +
> > >   BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > >   	   void *, value, u64, flags)
> > >   {
> > >   	struct bpf_sk_storage_data *sdata;
> > >   
> > > -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> > > +		return (unsigned long)NULL;
> > > +
> > > +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> > > +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
> > >   		return (unsigned long)NULL;
> > >   
> > >   	sdata = sk_storage_lookup(sk, map, true);
> > >   	if (sdata)
> > >   		return (unsigned long)sdata->data;
> > >   
> > > -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> > > +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
> > >   	    /* Cannot add new elem to a going away sk.
> > >   	     * Otherwise, the new elem may become a leak
> > >   	     * (and also other memory issues during map
> > > @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > >   		/* sk must be a fullsock (guaranteed by verifier),
> > >   		 * so sock_gen_put() is unnecessary.
> > >   		 */
> > > +		if (!IS_ERR(sdata))
> > > +			SELEM(sdata)->clone =
> > > +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
> > >   		sock_put(sk);
> > >   		return IS_ERR(sdata) ?
> > >   			(unsigned long)NULL : (unsigned long)sdata->data;
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index d57b0cc995a0..f5e801a9cea4 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> > >   			goto out;
> > >   		}
> > >   		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > > -#ifdef CONFIG_BPF_SYSCALL
> > > -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > > -#endif
> > > +
> > > +		if (bpf_sk_storage_clone(sk, newsk)) {
> > > +			sk_free_unlock_clone(newsk);
> > > +			newsk = NULL;
> > > +			goto out;
> > > +		}
> > >   
> > >   		newsk->sk_err	   = 0;
> > >   		newsk->sk_err_soft = 0;
> > > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-08  6:39   ` Martin Lau
@ 2019-08-08 15:28     ` Stanislav Fomichev
  2019-08-08 18:27       ` Martin Lau
  0 siblings, 1 reply; 14+ messages in thread
From: Stanislav Fomichev @ 2019-08-08 15:28 UTC (permalink / raw)
  To: Martin Lau; +Cc: Stanislav Fomichev, netdev, bpf, davem, ast, daniel

On 08/08, Martin Lau wrote:
> On Wed, Aug 07, 2019 at 08:47:18AM -0700, Stanislav Fomichev wrote:
> > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > and call it from sk_clone_lock(). Reuse the gap in
> > bpf_sk_storage_elem to store clone/non-clone flag.
> > 
> > Cc: Martin KaFai Lau <kafai@fb.com>
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  include/net/bpf_sk_storage.h |  10 ++++
> >  include/uapi/linux/bpf.h     |   1 +
> >  net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
> >  net/core/sock.c              |   9 ++--
> >  4 files changed, 115 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> > index b9dcb02e756b..8e4f831d2e52 100644
> > --- a/include/net/bpf_sk_storage.h
> > +++ b/include/net/bpf_sk_storage.h
> > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> >  extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> >  extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> >  
> > +#ifdef CONFIG_BPF_SYSCALL
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> > +#else
> > +static inline int bpf_sk_storage_clone(const struct sock *sk,
> > +				       struct sock *newsk)
> > +{
> > +	return 0;
> > +}
> > +#endif
> > +
> >  #endif /* _BPF_SK_STORAGE_H */
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 4393bd4b2419..00459ca4c8cf 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2931,6 +2931,7 @@ enum bpf_func_id {
> >  
> >  /* BPF_FUNC_sk_storage_get flags */
> >  #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> > +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
> It is only used in bpf_sk_storage_get().
> What if the elem is created from bpf_fd_sk_storage_update_elem()
> i.e. from the syscall API ?
> 
> What may be the use case for a map to have both CLONE and non-CLONE
> elements?  If it is not the case, would it be better to add
> BPF_F_CLONE to bpf_attr->map_flags?
I didn't think about putting it on the map itself since the API
is on a per-element, but it does make sense. I can't come up
with a use-case for a per-element selective clone/non-clone.
Thanks, will move to the map itself.

> >  
> >  /* Mode for BPF_FUNC_skb_adjust_room helper. */
> >  enum bpf_adj_room_mode {
> > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> > index 94c7f77ecb6b..b6dea67965bc 100644
> > --- a/net/core/bpf_sk_storage.c
> > +++ b/net/core/bpf_sk_storage.c
> > @@ -12,6 +12,9 @@
> >  
> >  static atomic_t cache_idx;
> >  
> > +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
> > +					 BPF_SK_STORAGE_GET_F_CLONE)
> > +
> >  struct bucket {
> >  	struct hlist_head list;
> >  	raw_spinlock_t lock;
> > @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
> >  	struct hlist_node snode;	/* Linked to bpf_sk_storage */
> >  	struct bpf_sk_storage __rcu *sk_storage;
> >  	struct rcu_head rcu;
> > -	/* 8 bytes hole */
> > +	u8 clone:1;
> > +	/* 7 bytes hole */
> > >  	/* The data is stored in another cacheline to minimize
> >  	 * the number of cachelines access during a cache hit.
> >  	 */
> > @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> >  	return 0;
> >  }
> >  
> > -/* Called by __sk_destruct() */
> > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> >  void bpf_sk_storage_free(struct sock *sk)
> >  {
> >  	struct bpf_sk_storage_elem *selem;
> > @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> >  	return err;
> >  }
> >  
> > +static struct bpf_sk_storage_elem *
> > +bpf_sk_storage_clone_elem(struct sock *newsk,
> > +			  struct bpf_sk_storage_map *smap,
> > +			  struct bpf_sk_storage_elem *selem)
> > +{
> > +	struct bpf_sk_storage_elem *copy_selem;
> > +
> > +	copy_selem = selem_alloc(smap, newsk, NULL, true);
> > +	if (!copy_selem)
> > +		return ERR_PTR(-ENOMEM);
> nit.
> may be just return NULL as selem_alloc() does.
Sounds good.

> > +
> > +	if (map_value_has_spin_lock(&smap->map))
> > +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> > +				      SDATA(selem)->data, true);
> > +	else
> > +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
> > +			       SDATA(selem)->data);
> > +
> > +	return copy_selem;
> > +}
> > +
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > +{
> > +	struct bpf_sk_storage *new_sk_storage = NULL;
> > +	struct bpf_sk_storage *sk_storage;
> > +	struct bpf_sk_storage_elem *selem;
> > +	int ret;
> > +
> > +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > +
> > +	rcu_read_lock();
> > +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > +
> > +	if (!sk_storage || hlist_empty(&sk_storage->list))
> > +		goto out;
> > +
> > +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > +		struct bpf_sk_storage_map *smap;
> > +		struct bpf_sk_storage_elem *copy_selem;
> > +
> > +		if (!selem->clone)
> > +			continue;
> > +
> > +		smap = rcu_dereference(SDATA(selem)->smap);
> > +		if (!smap)
> smap should not be NULL.
I see; you never set it back to NULL and we are guaranteed that the
map is still around due to rcu. Removed.

> > +			continue;
> > +
> > +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > +		if (IS_ERR(copy_selem)) {
> > +			ret = PTR_ERR(copy_selem);
> > +			goto err;
> > +		}
> > +
> > +		if (!new_sk_storage) {
> > +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> > +			if (ret) {
> > +				kfree(copy_selem);
> > +				atomic_sub(smap->elem_size,
> > +					   &newsk->sk_omem_alloc);
> > +				goto err;
> > +			}
> > +
> > +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > +			continue;
> > +		}
> > +
> > +		raw_spin_lock_bh(&new_sk_storage->lock);
> > +		selem_link_map(smap, copy_selem);
> Unlike the existing selem-update use-cases in bpf_sk_storage.c,
> the smap->map.refcnt has not been held here.  Reading the smap
> is fine.  However, adding a new selem to a deleting smap is an issue.
> Hence, I think bpf_map_inc_not_zero() should be done first.
In this case, I should probably do it after smap = rcu_deref()?

> > +		__selem_link_sk(new_sk_storage, copy_selem);
> > +		raw_spin_unlock_bh(&new_sk_storage->lock);
> > +	}
> > +
> > +out:
> > +	rcu_read_unlock();
> > +	return 0;
> > +
> > +err:
> > +	rcu_read_unlock();
> > +
> > +	bpf_sk_storage_free(newsk);
> > +	return ret;
> > +}
> > +
> >  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> >  	   void *, value, u64, flags)
> >  {
> >  	struct bpf_sk_storage_data *sdata;
> >  
> > -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> > +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> > +		return (unsigned long)NULL;
> > +
> > +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> > +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
> >  		return (unsigned long)NULL;
> >  
> >  	sdata = sk_storage_lookup(sk, map, true);
> >  	if (sdata)
> >  		return (unsigned long)sdata->data;
> >  
> > -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> > +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
> >  	    /* Cannot add new elem to a going away sk.
> >  	     * Otherwise, the new elem may become a leak
> >  	     * (and also other memory issues during map
> > @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> >  		/* sk must be a fullsock (guaranteed by verifier),
> >  		 * so sock_gen_put() is unnecessary.
> >  		 */
> > +		if (!IS_ERR(sdata))
> > +			SELEM(sdata)->clone =
> > +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
> >  		sock_put(sk);
> >  		return IS_ERR(sdata) ?
> >  			(unsigned long)NULL : (unsigned long)sdata->data;
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index d57b0cc995a0..f5e801a9cea4 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> >  			goto out;
> >  		}
> >  		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > -#ifdef CONFIG_BPF_SYSCALL
> > -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > -#endif
> > +
> > +		if (bpf_sk_storage_clone(sk, newsk)) {
> > +			sk_free_unlock_clone(newsk);
> > +			newsk = NULL;
> > +			goto out;
> > +		}
> >  
> >  		newsk->sk_err	   = 0;
> >  		newsk->sk_err_soft = 0;
> > -- 
> > 2.22.0.770.g0f2c4a37fd-goog
> > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
  2019-08-08 15:28     ` Stanislav Fomichev
@ 2019-08-08 18:27       ` Martin Lau
  0 siblings, 0 replies; 14+ messages in thread
From: Martin Lau @ 2019-08-08 18:27 UTC (permalink / raw)
  To: Stanislav Fomichev; +Cc: Stanislav Fomichev, netdev, bpf, davem, ast, daniel

On Thu, Aug 08, 2019 at 08:28:30AM -0700, Stanislav Fomichev wrote:
> On 08/08, Martin Lau wrote:
> > On Wed, Aug 07, 2019 at 08:47:18AM -0700, Stanislav Fomichev wrote:
> > > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > > and call it from sk_clone_lock. Reuse the gap in
> > > bpf_sk_storage_elem to store clone/non-clone flag.
> > > 
> > > Cc: Martin KaFai Lau <kafai@fb.com>
> > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > > ---
> > >  include/net/bpf_sk_storage.h |  10 ++++
> > >  include/uapi/linux/bpf.h     |   1 +
> > >  net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
> > >  net/core/sock.c              |   9 ++--
> > >  4 files changed, 115 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> > > index b9dcb02e756b..8e4f831d2e52 100644
> > > --- a/include/net/bpf_sk_storage.h
> > > +++ b/include/net/bpf_sk_storage.h
> > > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> > >  extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> > >  extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> > >  
> > > +#ifdef CONFIG_BPF_SYSCALL
> > > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> > > +#else
> > > +static inline int bpf_sk_storage_clone(const struct sock *sk,
> > > +				       struct sock *newsk)
> > > +{
> > > +	return 0;
> > > +}
> > > +#endif
> > > +
> > >  #endif /* _BPF_SK_STORAGE_H */
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 4393bd4b2419..00459ca4c8cf 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -2931,6 +2931,7 @@ enum bpf_func_id {
> > >  
> > >  /* BPF_FUNC_sk_storage_get flags */
> > >  #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> > > +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
> > It is only used in bpf_sk_storage_get().
> > What if the elem is created from bpf_fd_sk_storage_update_elem()
> > i.e. from the syscall API ?
> > 
> > What may be the use case for a map to have both CLONE and non-CLONE
> > elements?  If it is not the case, would it be better to add
> > BPF_F_CLONE to bpf_attr->map_flags?
> I didn't think about putting it on the map itself since the API
> is on a per-element, but it does make sense. I can't come up
> with a use-case for a per-element selective clone/non-clone.
> Thanks, will move to the map itself.
> 
> > >  
> > >  /* Mode for BPF_FUNC_skb_adjust_room helper. */
> > >  enum bpf_adj_room_mode {
> > > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> > > index 94c7f77ecb6b..b6dea67965bc 100644
> > > --- a/net/core/bpf_sk_storage.c
> > > +++ b/net/core/bpf_sk_storage.c
> > > @@ -12,6 +12,9 @@
> > >  
> > >  static atomic_t cache_idx;
> > >  
> > > +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
> > > +					 BPF_SK_STORAGE_GET_F_CLONE)
> > > +
> > >  struct bucket {
> > >  	struct hlist_head list;
> > >  	raw_spinlock_t lock;
> > > @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
> > >  	struct hlist_node snode;	/* Linked to bpf_sk_storage */
> > >  	struct bpf_sk_storage __rcu *sk_storage;
> > >  	struct rcu_head rcu;
> > > -	/* 8 bytes hole */
> > > +	u8 clone:1;
> > > +	/* 7 bytes hole */
> > >  	/* The data is stored in aother cacheline to minimize
> > >  	 * the number of cachelines access during a cache hit.
> > >  	 */
> > > @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> > >  	return 0;
> > >  }
> > >  
> > > -/* Called by __sk_destruct() */
> > > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> > >  void bpf_sk_storage_free(struct sock *sk)
> > >  {
> > >  	struct bpf_sk_storage_elem *selem;
> > > @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> > >  	return err;
> > >  }
> > >  
> > > +static struct bpf_sk_storage_elem *
> > > +bpf_sk_storage_clone_elem(struct sock *newsk,
> > > +			  struct bpf_sk_storage_map *smap,
> > > +			  struct bpf_sk_storage_elem *selem)
> > > +{
> > > +	struct bpf_sk_storage_elem *copy_selem;
> > > +
> > > +	copy_selem = selem_alloc(smap, newsk, NULL, true);
> > > +	if (!copy_selem)
> > > +		return ERR_PTR(-ENOMEM);
> > nit.
> > may be just return NULL as selem_alloc() does.
> Sounds good.
> 
> > > +
> > > +	if (map_value_has_spin_lock(&smap->map))
> > > +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> > > +				      SDATA(selem)->data, true);
> > > +	else
> > > +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
> > > +			       SDATA(selem)->data);
> > > +
> > > +	return copy_selem;
> > > +}
> > > +
> > > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > > +{
> > > +	struct bpf_sk_storage *new_sk_storage = NULL;
> > > +	struct bpf_sk_storage *sk_storage;
> > > +	struct bpf_sk_storage_elem *selem;
> > > +	int ret;
> > > +
> > > +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > > +
> > > +	rcu_read_lock();
> > > +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > +
> > > +	if (!sk_storage || hlist_empty(&sk_storage->list))
> > > +		goto out;
> > > +
> > > +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > > +		struct bpf_sk_storage_map *smap;
> > > +		struct bpf_sk_storage_elem *copy_selem;
> > > +
> > > +		if (!selem->clone)
> > > +			continue;
> > > +
> > > +		smap = rcu_dereference(SDATA(selem)->smap);
> > > +		if (!smap)
> > smap should not be NULL.
> I see; you never set it back to NULL and we are guaranteed that the
> map is still around due to rcu. Removed.
> 
> > > +			continue;
> > > +
> > > +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > > +		if (IS_ERR(copy_selem)) {
> > > +			ret = PTR_ERR(copy_selem);
> > > +			goto err;
> > > +		}
> > > +
> > > +		if (!new_sk_storage) {
> > > +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> > > +			if (ret) {
> > > +				kfree(copy_selem);
> > > +				atomic_sub(smap->elem_size,
> > > +					   &newsk->sk_omem_alloc);
> > > +				goto err;
> > > +			}
> > > +
> > > +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > > +			continue;
> > > +		}
> > > +
> > > +		raw_spin_lock_bh(&new_sk_storage->lock);
> > > +		selem_link_map(smap, copy_selem);
> > Unlike the existing selem-update use-cases in bpf_sk_storage.c,
> > the smap->map.refcnt has not been held here.  Reading the smap
> > is fine.  However, adding a new selem to a deleting smap is an issue.
> > Hence, I think bpf_map_inc_not_zero() should be done first.
> In this case, I should probably do it after smap = rcu_deref()?
Right.

and bpf_map_put should be called when done.  Because of bpf_map_put,
it may be a good idea to add a comment to the first synchronize_rcu()
in bpf_sk_storage_map_free() since this new bpf_sk_storage_clone()
also depends on it now,
which makes it different from other bpf maps.

> 
> > > +		__selem_link_sk(new_sk_storage, copy_selem);
> > > +		raw_spin_unlock_bh(&new_sk_storage->lock);
> > > +	}
> > > +
> > > +out:
> > > +	rcu_read_unlock();
> > > +	return 0;
> > > +
> > > +err:
> > > +	rcu_read_unlock();
> > > +
> > > +	bpf_sk_storage_free(newsk);
> > > +	return ret;
> > > +}
> > > +
> > >  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > >  	   void *, value, u64, flags)
> > >  {
> > >  	struct bpf_sk_storage_data *sdata;
> > >  
> > > -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> > > +		return (unsigned long)NULL;
> > > +
> > > +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> > > +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
> > >  		return (unsigned long)NULL;
> > >  
> > >  	sdata = sk_storage_lookup(sk, map, true);
> > >  	if (sdata)
> > >  		return (unsigned long)sdata->data;
> > >  
> > > -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> > > +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
> > >  	    /* Cannot add new elem to a going away sk.
> > >  	     * Otherwise, the new elem may become a leak
> > >  	     * (and also other memory issues during map
> > > @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > >  		/* sk must be a fullsock (guaranteed by verifier),
> > >  		 * so sock_gen_put() is unnecessary.
> > >  		 */
> > > +		if (!IS_ERR(sdata))
> > > +			SELEM(sdata)->clone =
> > > +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
> > >  		sock_put(sk);
> > >  		return IS_ERR(sdata) ?
> > >  			(unsigned long)NULL : (unsigned long)sdata->data;
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index d57b0cc995a0..f5e801a9cea4 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> > >  			goto out;
> > >  		}
> > >  		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > > -#ifdef CONFIG_BPF_SYSCALL
> > > -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > > -#endif
> > > +
> > > +		if (bpf_sk_storage_clone(sk, newsk)) {
> > > +			sk_free_unlock_clone(newsk);
> > > +			newsk = NULL;
> > > +			goto out;
> > > +		}
> > >  
> > >  		newsk->sk_err	   = 0;
> > >  		newsk->sk_err_soft = 0;
> > > -- 
> > > 2.22.0.770.g0f2c4a37fd-goog
> > > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2019-08-08 18:28 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-07 15:47 [PATCH bpf-next 0/3] bpf: support cloning sk storage on accept() Stanislav Fomichev
2019-08-07 15:47 ` [PATCH bpf-next 1/3] " Stanislav Fomichev
2019-08-07 23:03   ` Yonghong Song
2019-08-08  0:05     ` Stanislav Fomichev
2019-08-08  0:18       ` Yonghong Song
2019-08-08  7:11       ` Martin Lau
2019-08-08  6:39   ` Martin Lau
2019-08-08 15:28     ` Stanislav Fomichev
2019-08-08 18:27       ` Martin Lau
2019-08-07 15:47 ` [PATCH bpf-next 2/3] bpf: sync bpf.h to tools/ Stanislav Fomichev
2019-08-07 23:04   ` Yonghong Song
2019-08-07 15:47 ` [PATCH bpf-next 3/3] selftests/bpf: add sockopt clone/inheritance test Stanislav Fomichev
2019-08-07 23:14   ` Yonghong Song
2019-08-08  0:08     ` Stanislav Fomichev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).