* [Patch bpf-next v6 1/8] bpf: clean up sockmap related Kconfigs
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-22 8:51 ` Jakub Sitnicki
2021-02-20 5:29 ` [Patch bpf-next v6 2/8] skmsg: get rid of struct sk_psock_parser Cong Wang
` (7 subsequent siblings)
8 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Daniel Borkmann, Jakub Sitnicki, Lorenz Bauer, John Fastabend
From: Cong Wang <cong.wang@bytedance.com>
As suggested by John, clean up sockmap related Kconfigs:
Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream
parser, to reflect its name.
Make the rest of the sockmap code simply depend on CONFIG_BPF_SYSCALL
and CONFIG_INET; the latter is still needed at this point because
of the TCP/UDP proto update. Leave CONFIG_NET_SOCK_MSG untouched,
as it is used by non-sockmap cases.
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
include/linux/bpf.h | 26 ++++---
include/linux/bpf_types.h | 6 +-
include/linux/skmsg.h | 22 ++++++
include/net/tcp.h | 16 +++--
include/net/udp.h | 4 +-
init/Kconfig | 1 +
net/Kconfig | 6 +-
net/core/Makefile | 6 +-
net/core/skmsg.c | 139 ++++++++++++++++++++------------------
net/core/sock_map.c | 2 +
net/ipv4/Makefile | 2 +-
net/ipv4/tcp_bpf.c | 4 +-
12 files changed, 131 insertions(+), 103 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cccaef1088ea..813f30ef44ff 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1768,7 +1768,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
}
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
-#if defined(CONFIG_BPF_STREAM_PARSER)
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
struct bpf_prog *old, u32 which);
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
@@ -1776,7 +1776,18 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
void sock_map_unhash(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
+
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+ void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags);
#else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
static inline int sock_map_prog_update(struct bpf_map *map,
struct bpf_prog *prog,
struct bpf_prog *old, u32 which)
@@ -1801,20 +1812,7 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void
{
return -EOPNOTSUPP;
}
-#endif /* CONFIG_BPF_STREAM_PARSER */
-#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
-void bpf_sk_reuseport_detach(struct sock *sk);
-int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
- void *value);
-int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags);
-#else
-static inline void bpf_sk_reuseport_detach(struct sock *sk)
-{
-}
-
-#ifdef CONFIG_BPF_SYSCALL
static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
void *key, void *value)
{
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 99f7fd657d87..38fd98901ba9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -103,10 +103,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
-#if defined(CONFIG_BPF_STREAM_PARSER)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
-#endif
#ifdef CONFIG_BPF_LSM
BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
@@ -116,6 +112,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
#ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
#endif
#endif
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 8edbbf5f2f93..041faef00937 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -71,7 +71,9 @@ struct sk_psock_link {
};
struct sk_psock_parser {
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
struct strparser strp;
+#endif
bool enabled;
void (*saved_data_ready)(struct sock *sk);
};
@@ -305,9 +307,29 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
struct sk_psock *sk_psock_init(struct sock *sk, int node);
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_done_strp(struct sk_psock *psock);
+#else
+static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+}
+
+static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+}
+
+static inline void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock);
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 963cd86d12dd..c00e125dcfb9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2222,25 +2222,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p,
__MODULE_INFO(alias, alias_userspace, name); \
__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+#ifdef CONFIG_NET_SOCK_MSG
struct sk_msg;
struct sk_psock;
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
-#else
-static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
-{
-}
-#endif /* CONFIG_BPF_STREAM_PARSER */
+#endif /* CONFIG_BPF_SYSCALL */
-#ifdef CONFIG_NET_SOCK_MSG
int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
int flags);
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
struct msghdr *msg, int len, int flags);
#endif /* CONFIG_NET_SOCK_MSG */
+#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
+static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
+{
+}
+#endif
+
#ifdef CONFIG_CGROUP_BPF
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
struct sk_buff *skb,
diff --git a/include/net/udp.h b/include/net/udp.h
index a132a02b2f2c..60fa0bd4d27d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -515,9 +515,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
return segs;
}
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_NET_SOCK_MSG
struct sk_psock;
struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
-#endif /* BPF_STREAM_PARSER */
+#endif /* CONFIG_NET_SOCK_MSG */
#endif /* _UDP_H */
diff --git a/init/Kconfig b/init/Kconfig
index 29ad68325028..5a65bfb48f60 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1702,6 +1702,7 @@ config BPF_SYSCALL
select BPF
select IRQ_WORK
select TASKS_TRACE_RCU
+ select NET_SOCK_MSG if INET
default n
help
Enable the bpf() system call that allows to manipulate eBPF
diff --git a/net/Kconfig b/net/Kconfig
index 8cea808ad9e8..0ead7ec0d2bd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -317,13 +317,9 @@ config BPF_STREAM_PARSER
select STREAM_PARSER
select NET_SOCK_MSG
help
- Enabling this allows a stream parser to be used with
+ Enabling this allows a TCP stream parser to be used with
BPF_MAP_TYPE_SOCKMAP.
- BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
- It can be used to enforce socket policy, implement socket redirects,
- etc.
-
config NET_FLOW_LIMIT
bool
depends on RPS
diff --git a/net/core/Makefile b/net/core/Makefile
index 3e2c378e5f31..0c2233c826fd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
-obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
+endif
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 1261512d6807..6cb5ff6f8f9c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -651,9 +651,7 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
/* No sk_callback_lock since already detached. */
- /* Parser has been stopped */
- if (psock->progs.skb_parser)
- strp_done(&psock->parser.strp);
+ sk_psock_done_strp(psock);
cancel_work_sync(&psock->work);
@@ -750,14 +748,6 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
return bpf_prog_run_pin_on_cpu(prog, skb);
}
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
- struct sk_psock_parser *parser;
-
- parser = container_of(strp, struct sk_psock_parser, strp);
- return container_of(parser, struct sk_psock, parser);
-}
-
static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
@@ -866,6 +856,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
}
}
+static void sk_psock_write_space(struct sock *sk)
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk) = NULL;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ }
+ rcu_read_unlock();
+ if (write_space)
+ write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock;
@@ -897,6 +905,14 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
return err;
}
+static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+{
+ struct sk_psock_parser *parser;
+
+ parser = container_of(strp, struct sk_psock_parser, strp);
+ return container_of(parser, struct sk_psock, parser);
+}
+
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock = sk_psock_from_strp(strp);
@@ -933,6 +949,52 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_unlock();
}
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = sk_psock_strp_read,
+ .read_sock_done = sk_psock_strp_read_done,
+ .parse_msg = sk_psock_strp_parse,
+ };
+
+ psock->parser.enabled = false;
+ return strp_init(&psock->parser.strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (parser->enabled)
+ return;
+
+ parser->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_strp_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+ parser->enabled = true;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (!parser->enabled)
+ return;
+
+ sk->sk_data_ready = parser->saved_data_ready;
+ parser->saved_data_ready = NULL;
+ strp_stop(&parser->strp);
+ parser->enabled = false;
+}
+
+void sk_psock_done_strp(struct sk_psock *psock)
+{
+ /* Parser has been stopped */
+ if (psock->progs.skb_parser)
+ strp_done(&psock->parser.strp);
+}
+#endif
+
static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t orig_len)
{
@@ -984,35 +1046,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
}
-static void sk_psock_write_space(struct sock *sk)
-{
- struct sk_psock *psock;
- void (*write_space)(struct sock *sk) = NULL;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock)) {
- if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
- }
- rcu_read_unlock();
- if (write_space)
- write_space(sk);
-}
-
-int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
-{
- static const struct strp_callbacks cb = {
- .rcv_msg = sk_psock_strp_read,
- .read_sock_done = sk_psock_strp_read_done,
- .parse_msg = sk_psock_strp_parse,
- };
-
- psock->parser.enabled = false;
- return strp_init(&psock->parser.strp, sk, &cb);
-}
-
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
@@ -1026,32 +1059,6 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
parser->enabled = true;
}
-void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
- return;
-
- parser->saved_data_ready = sk->sk_data_ready;
- sk->sk_data_ready = sk_psock_strp_data_ready;
- sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
-}
-
-void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
- return;
-
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- strp_stop(&parser->strp);
- parser->enabled = false;
-}
-
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index d758fb83c884..ee3334dd3a38 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1461,9 +1461,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
case BPF_SK_MSG_VERDICT:
pprog = &progs->msg_parser;
break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
pprog = &progs->skb_parser;
break;
+#endif
case BPF_SK_SKB_STREAM_VERDICT:
pprog = &progs->skb_verdict;
break;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 5b77a46885b9..bbdd9c44f14e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o
+obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index bc7d2a586e18..17c322b875fd 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
}
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
static bool tcp_bpf_stream_read(const struct sock *sk)
{
struct sk_psock *psock;
@@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE])
newsk->sk_prot = sk->sk_prot_creator;
}
-#endif /* CONFIG_BPF_STREAM_PARSER */
+#endif /* CONFIG_BPF_SYSCALL */
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 1/8] bpf: clean up sockmap related Kconfigs
2021-02-20 5:29 ` [Patch bpf-next v6 1/8] bpf: clean up sockmap related Kconfigs Cong Wang
@ 2021-02-22 8:51 ` Jakub Sitnicki
2021-02-22 23:23 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-22 8:51 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, bpf, duanxiongchun, wangdongdong.6, jiang.wang,
Cong Wang, Daniel Borkmann, Lorenz Bauer, John Fastabend
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> As suggested by John, clean up sockmap related Kconfigs:
>
> Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream
> parser, to reflect its name.
>
> Make the rest sockmap code simply depend on CONFIG_BPF_SYSCALL
> and CONFIG_INET, the latter is still needed at this point because
> of TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched,
> as it is used by non-sockmap cases.
>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
Couple comments:
1. sk_psock_done_strp() could be static to skmsg.c, as mentioned
earlier.
2. udp_bpf.c is built when CONFIG_BPF_SYSCALL is enabled, while its API
declarations in udp.h are guarded on CONFIG_NET_SOCK_MSG.
This works because BPF_SYSCALL now selects NET_SOCK_MSG if INET, and
INET has to be enabled when using udp, but seems confusing to me.
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 1/8] bpf: clean up sockmap related Kconfigs
2021-02-22 8:51 ` Jakub Sitnicki
@ 2021-02-22 23:23 ` Cong Wang
0 siblings, 0 replies; 22+ messages in thread
From: Cong Wang @ 2021-02-22 23:23 UTC (permalink / raw)
To: Jakub Sitnicki
Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
Dongdong Wang, Jiang Wang, Cong Wang, Daniel Borkmann,
Lorenz Bauer, John Fastabend
On Mon, Feb 22, 2021 at 12:52 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > As suggested by John, clean up sockmap related Kconfigs:
> >
> > Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream
> > parser, to reflect its name.
> >
> > Make the rest sockmap code simply depend on CONFIG_BPF_SYSCALL
> > and CONFIG_INET, the latter is still needed at this point because
> > of TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched,
> > as it is used by non-sockmap cases.
> >
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
> > Acked-by: John Fastabend <john.fastabend@gmail.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
>
> Couple comments:
>
> 1. sk_psock_done_strp() could be static to skmsg.c, as mentioned
> earlier.
Oops, I thought you meant to move it to sock_map.c...
>
> 2. udp_bpf.c is built when CONFIG_BPF_SYSCALL is enabled, while its API
> declarations in udp.h are guarded on CONFIG_NET_SOCK_MSG.
>
> This works because BPF_SYSCALL now selects NET_SOCK_MSG if INET, and
> INET has to be enabled when using udp, but seems confusing to me.
>
Sure.
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [Patch bpf-next v6 2/8] skmsg: get rid of struct sk_psock_parser
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
2021-02-20 5:29 ` [Patch bpf-next v6 1/8] bpf: clean up sockmap related Kconfigs Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-20 5:29 ` [Patch bpf-next v6 3/8] bpf: compute data_end dynamically with JIT code Cong Wang
` (6 subsequent siblings)
8 siblings, 0 replies; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Daniel Borkmann, Lorenz Bauer, John Fastabend, Jakub Sitnicki
From: Cong Wang <cong.wang@bytedance.com>
struct sk_psock_parser is embedded in sk_psock, but it is
unnecessary, as skb verdict also uses ->saved_data_ready.
We can simply fold these fields into sk_psock and get rid
of ->enabled.
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
include/linux/skmsg.h | 19 ++++++----------
net/core/skmsg.c | 53 +++++++++++++------------------------------
net/core/sock_map.c | 8 +++----
3 files changed, 27 insertions(+), 53 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 041faef00937..e3bb712af257 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -70,14 +70,6 @@ struct sk_psock_link {
void *link_raw;
};
-struct sk_psock_parser {
-#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
- struct strparser strp;
-#endif
- bool enabled;
- void (*saved_data_ready)(struct sock *sk);
-};
-
struct sk_psock_work_state {
struct sk_buff *skb;
u32 len;
@@ -92,7 +84,9 @@ struct sk_psock {
u32 eval;
struct sk_msg *cork;
struct sk_psock_progs progs;
- struct sk_psock_parser parser;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ struct strparser strp;
+#endif
struct sk_buff_head ingress_skb;
struct list_head ingress_msg;
unsigned long state;
@@ -102,6 +96,7 @@ struct sk_psock {
void (*saved_unhash)(struct sock *sk);
void (*saved_close)(struct sock *sk, long timeout);
void (*saved_write_space)(struct sock *sk);
+ void (*saved_data_ready)(struct sock *sk);
struct proto *sk_proto;
struct sk_psock_work_state work_state;
struct work_struct work;
@@ -422,8 +417,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
{
- if (psock->parser.enabled)
- psock->parser.saved_data_ready(sk);
+ if (psock->saved_data_ready)
+ psock->saved_data_ready(sk);
else
sk->sk_data_ready(sk);
}
@@ -462,6 +457,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
{
if (!psock)
return false;
- return psock->parser.enabled;
+ return !!psock->saved_data_ready;
}
#endif /* _LINUX_SKMSG_H */
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 6cb5ff6f8f9c..7f400d044cda 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -905,17 +905,9 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
return err;
}
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
- struct sk_psock_parser *parser;
-
- parser = container_of(strp, struct sk_psock_parser, strp);
- return container_of(parser, struct sk_psock, parser);
-}
-
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
- struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
struct bpf_prog *prog;
int ret = skb->len;
@@ -939,10 +931,10 @@ static void sk_psock_strp_data_ready(struct sock *sk)
psock = sk_psock(sk);
if (likely(psock)) {
if (tls_sw_has_ctx_rx(sk)) {
- psock->parser.saved_data_ready(sk);
+ psock->saved_data_ready(sk);
} else {
write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->parser.strp);
+ strp_data_ready(&psock->strp);
write_unlock_bh(&sk->sk_callback_lock);
}
}
@@ -957,41 +949,34 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
.parse_msg = sk_psock_strp_parse,
};
- psock->parser.enabled = false;
- return strp_init(&psock->parser.strp, sk, &cb);
+ return strp_init(&psock->strp, sk, &cb);
}
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
+ if (psock->saved_data_ready)
return;
- parser->saved_data_ready = sk->sk_data_ready;
+ psock->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_strp_data_ready;
sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
}
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
+ if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- strp_stop(&parser->strp);
- parser->enabled = false;
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
+ strp_stop(&psock->strp);
}
void sk_psock_done_strp(struct sk_psock *psock)
{
/* Parser has been stopped */
if (psock->progs.skb_parser)
- strp_done(&psock->parser.strp);
+ strp_done(&psock->strp);
}
#endif
@@ -1048,25 +1033,19 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
+ if (psock->saved_data_ready)
return;
- parser->saved_data_ready = sk->sk_data_ready;
+ psock->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_verdict_data_ready;
sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
}
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
+ if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- parser->enabled = false;
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index ee3334dd3a38..1a28a5c2c61e 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk,
struct bpf_map *map = link->map;
struct bpf_stab *stab = container_of(map, struct bpf_stab,
map);
- if (psock->parser.enabled && stab->progs.skb_parser)
+ if (psock->saved_data_ready && stab->progs.skb_parser)
strp_stop = true;
- if (psock->parser.enabled && stab->progs.skb_verdict)
+ if (psock->saved_data_ready && stab->progs.skb_verdict)
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
@@ -283,14 +283,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
- if (skb_parser && skb_verdict && !psock->parser.enabled) {
+ if (skb_parser && skb_verdict && !psock->saved_data_ready) {
ret = sk_psock_init_strp(sk, psock);
if (ret)
goto out_unlock_drop;
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
psock_set_prog(&psock->progs.skb_parser, skb_parser);
sk_psock_start_strp(sk, psock);
- } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
+ } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) {
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
sk_psock_start_verdict(sk,psock);
}
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [Patch bpf-next v6 3/8] bpf: compute data_end dynamically with JIT code
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
2021-02-20 5:29 ` [Patch bpf-next v6 1/8] bpf: clean up sockmap related Kconfigs Cong Wang
2021-02-20 5:29 ` [Patch bpf-next v6 2/8] skmsg: get rid of struct sk_psock_parser Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-20 5:29 ` [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb Cong Wang
` (5 subsequent siblings)
8 siblings, 0 replies; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Jakub Sitnicki, Daniel Borkmann, Lorenz Bauer, John Fastabend
From: Cong Wang <cong.wang@bytedance.com>
Currently, we compute ->data_end with a compile-time constant
offset of skb. But as Jakub pointed out, we can actually compute
it in eBPF JIT code at run-time, so that we can completely get
rid of ->data_end. This is similar to skb_shinfo(skb) computation
in bpf_convert_shinfo_access().
Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
include/net/tcp.h | 6 ------
net/core/filter.c | 48 +++++++++++++++++++++++++++--------------------
net/core/skmsg.c | 1 -
3 files changed, 28 insertions(+), 27 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c00e125dcfb9..947ef5da6867 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -886,18 +886,12 @@ struct tcp_skb_cb {
struct {
__u32 flags;
struct sock *sk_redir;
- void *data_end;
} bpf;
};
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
-static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
-{
- TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
-}
-
static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
diff --git a/net/core/filter.c b/net/core/filter.c
index adfdad234674..13bcf248ee7b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = {
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err = __bpf_try_make_writable(skb, write_len);
-
- bpf_compute_data_end_sk_skb(skb);
- return err;
+ return __bpf_try_make_writable(skb, write_len);
}
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -3577,7 +3574,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
return -ENOMEM;
__skb_pull(skb, len_diff_abs);
}
- bpf_compute_data_end_sk_skb(skb);
if (tls_sw_has_ctx_rx(skb->sk)) {
struct strp_msg *rxm = strp_msg(skb);
@@ -3742,10 +3738,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
u64, flags)
{
- int ret = __bpf_skb_change_tail(skb, new_len, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_tail(skb, new_len, flags);
}
static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@ -3808,10 +3801,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
u64, flags)
{
- int ret = __bpf_skb_change_head(skb, head_room, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_head(skb, head_room, flags);
}
static const struct bpf_func_proto sk_skb_change_head_proto = {
@@ -9655,22 +9645,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+/* data_end = skb->data + skb_headlen() */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb->data */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ /* AX = skb->len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, len));
+ /* si->dst_reg = skb->data + skb->len */
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+ /* AX = skb->data_len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, data_len));
+ /* si->dst_reg = skb->data + skb->len - skb->data_len */
+ *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
+
+ return insn;
+}
+
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
- int off;
switch (si->off) {
case offsetof(struct __sk_buff, data_end):
- off = si->off;
- off -= offsetof(struct __sk_buff, data_end);
- off += offsetof(struct sk_buff, cb);
- off += offsetof(struct tcp_skb_cb, bpf.data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
- si->src_reg, off);
+ insn = bpf_convert_data_end_access(si, insn);
break;
default:
return bpf_convert_ctx_access(type, si, insn_buf, prog,
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 7f400d044cda..2d8bbb3fd87c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -744,7 +744,6 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
struct sk_buff *skb)
{
- bpf_compute_data_end_sk_skb(skb);
return bpf_prog_run_pin_on_cpu(prog, skb);
}
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
` (2 preceding siblings ...)
2021-02-20 5:29 ` [Patch bpf-next v6 3/8] bpf: compute data_end dynamically with JIT code Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-22 12:20 ` Jakub Sitnicki
2021-02-20 5:29 ` [Patch bpf-next v6 5/8] sock_map: rename skb_parser and skb_verdict Cong Wang
` (4 subsequent siblings)
8 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Daniel Borkmann, Jakub Sitnicki, Lorenz Bauer, John Fastabend
From: Cong Wang <cong.wang@bytedance.com>
Currently TCP_SKB_CB() is hard-coded in skmsg code, so it certainly
does not work for any non-TCP protocol. We could move the redirect
state to an skb extension, but that introduces a memory allocation on
the fast path.
Fortunately, we only need a word-sized field to store all the
information, because the flags actually contain only 1 bit, which can
simply be packed into the lowest bit of the "pointer", which is stored
as an unsigned long.
Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is
no longer needed after ->sk_data_ready() so we can just drop it.
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
include/linux/skbuff.h | 3 +++
include/linux/skmsg.h | 35 +++++++++++++++++++++++++++++++++++
include/net/tcp.h | 19 -------------------
net/core/skmsg.c | 32 ++++++++++++++++++++------------
net/core/sock_map.c | 8 ++------
5 files changed, 60 insertions(+), 37 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6d0a33d1c0db..bd84f799c952 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -755,6 +755,9 @@ struct sk_buff {
void (*destructor)(struct sk_buff *skb);
};
struct list_head tcp_tsorted_anchor;
+#ifdef CONFIG_NET_SOCK_MSG
+ unsigned long _sk_redir;
+#endif
};
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e3bb712af257..fc234d507fd7 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
return false;
return !!psock->saved_data_ready;
}
+
+#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+static inline bool skb_bpf_ingress(const struct sk_buff *skb)
+{
+ unsigned long sk_redir = skb->_sk_redir;
+
+ return sk_redir & BPF_F_INGRESS;
+}
+
+static inline void skb_bpf_set_ingress(struct sk_buff *skb)
+{
+ skb->_sk_redir |= BPF_F_INGRESS;
+}
+
+static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir,
+ bool ingress)
+{
+ skb->_sk_redir = (unsigned long)sk_redir;
+ if (ingress)
+ skb->_sk_redir |= BPF_F_INGRESS;
+}
+
+static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb)
+{
+ unsigned long sk_redir = skb->_sk_redir;
+
+ sk_redir &= ~0x1UL;
+ return (struct sock *)sk_redir;
+}
+
+static inline void skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+ skb->_sk_redir = 0;
+}
+#endif /* CONFIG_NET_SOCK_MSG */
#endif /* _LINUX_SKMSG_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 947ef5da6867..075de26f449d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -883,30 +883,11 @@ struct tcp_skb_cb {
struct inet6_skb_parm h6;
#endif
} header; /* For incoming skbs */
- struct {
- __u32 flags;
- struct sock *sk_redir;
- } bpf;
};
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
-static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
-{
- return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
-}
-
-static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
-{
- return TCP_SKB_CB(skb)->bpf.sk_redir;
-}
-
-static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
-{
- TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
-}
-
extern const struct inet_connection_sock_af_ops ipv4_specific;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2d8bbb3fd87c..05b5af09ff42 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
+ skb_bpf_redirect_clear(skb);
+
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
@@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work)
len = skb->len;
off = 0;
start:
- ingress = tcp_skb_bpf_ingress(skb);
+ ingress = skb_bpf_ingress(skb);
do {
ret = -EIO;
if (likely(psock->sk->sk_socket))
@@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
static void sk_psock_zap_ingress(struct sk_psock *psock)
{
- __skb_queue_purge(&psock->ingress_skb);
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
+ skb_bpf_redirect_clear(skb);
+ kfree_skb(skb);
+ }
__sk_psock_purge_ingress_msg(psock);
}
@@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
struct sk_psock *psock_other;
struct sock *sk_other;
- sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ sk_other = skb_bpf_redirect_fetch(skb);
/* This error is a buggy BPF program, it returned a redirect
* return code, but then didn't set a redirect interface.
*/
@@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
* TLS context.
*/
skb->sk = psock->sk;
- tcp_skb_bpf_redirect_clear(skb);
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
@@ -816,7 +824,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
- struct tcp_skb_cb *tcp;
struct sock *sk_other;
int err = -EIO;
@@ -828,8 +835,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
goto out_free;
}
- tcp = TCP_SKB_CB(skb);
- tcp->bpf.flags |= BPF_F_INGRESS;
+ skb_bpf_set_ingress(skb);
/* If the queue is empty then we can submit directly
* into the msg queue. If its not empty we have to
@@ -890,9 +896,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
- tcp_skb_bpf_redirect_clear(skb);
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
out:
@@ -1005,9 +1012,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
- tcp_skb_bpf_redirect_clear(skb);
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
out:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1a28a5c2c61e..dbfcd7006338 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = sk;
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
@@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = {
BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
struct bpf_map *, map, void *, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = sk;
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb
2021-02-20 5:29 ` [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb Cong Wang
@ 2021-02-22 12:20 ` Jakub Sitnicki
2021-02-22 19:27 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-22 12:20 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, bpf, duanxiongchun, wangdongdong.6, jiang.wang,
Cong Wang, Daniel Borkmann, Lorenz Bauer, John Fastabend
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly
> does not work for any other non-TCP protocols. We can move them to
> skb ext, but it introduces a memory allocation on fast path.
>
> Fortunately, we only need to a word-size to store all the information,
> because the flags actually only contains 1 bit so can be just packed
> into the lowest bit of the "pointer", which is stored as unsigned
> long.
>
> Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is
> no longer needed after ->sk_data_ready() so we can just drop it.
>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
LGTM. I have some questions (below) that would help me confirm if I
understand the changes, and what could be improved, if anything.
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
> include/linux/skbuff.h | 3 +++
> include/linux/skmsg.h | 35 +++++++++++++++++++++++++++++++++++
> include/net/tcp.h | 19 -------------------
> net/core/skmsg.c | 32 ++++++++++++++++++++------------
> net/core/sock_map.c | 8 ++------
> 5 files changed, 60 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 6d0a33d1c0db..bd84f799c952 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -755,6 +755,9 @@ struct sk_buff {
> void (*destructor)(struct sk_buff *skb);
> };
> struct list_head tcp_tsorted_anchor;
> +#ifdef CONFIG_NET_SOCK_MSG
> + unsigned long _sk_redir;
> +#endif
> };
>
> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index e3bb712af257..fc234d507fd7 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
> return false;
> return !!psock->saved_data_ready;
> }
> +
> +#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
> +static inline bool skb_bpf_ingress(const struct sk_buff *skb)
> +{
> + unsigned long sk_redir = skb->_sk_redir;
> +
> + return sk_redir & BPF_F_INGRESS;
> +}
> +
> +static inline void skb_bpf_set_ingress(struct sk_buff *skb)
> +{
> + skb->_sk_redir |= BPF_F_INGRESS;
> +}
> +
> +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir,
> + bool ingress)
> +{
> + skb->_sk_redir = (unsigned long)sk_redir;
> + if (ingress)
> + skb->_sk_redir |= BPF_F_INGRESS;
> +}
> +
> +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb)
> +{
> + unsigned long sk_redir = skb->_sk_redir;
> +
> + sk_redir &= ~0x1UL;
We're using the enum when setting the bit flag, but a hardcoded constant
when masking it. ~BPF_F_INGRESS would be more consistent here.
> + return (struct sock *)sk_redir;
> +}
> +
> +static inline void skb_bpf_redirect_clear(struct sk_buff *skb)
> +{
> + skb->_sk_redir = 0;
> +}
> +#endif /* CONFIG_NET_SOCK_MSG */
> #endif /* _LINUX_SKMSG_H */
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 947ef5da6867..075de26f449d 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -883,30 +883,11 @@ struct tcp_skb_cb {
> struct inet6_skb_parm h6;
> #endif
> } header; /* For incoming skbs */
> - struct {
> - __u32 flags;
> - struct sock *sk_redir;
> - } bpf;
> };
> };
>
> #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
>
> -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
> -{
> - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
> -}
> -
> -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
> -{
> - return TCP_SKB_CB(skb)->bpf.sk_redir;
> -}
> -
> -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
> -{
> - TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
> -}
> -
> extern const struct inet_connection_sock_af_ops ipv4_specific;
>
> #if IS_ENABLED(CONFIG_IPV6)
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 2d8bbb3fd87c..05b5af09ff42 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
> static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
> u32 off, u32 len, bool ingress)
> {
> + skb_bpf_redirect_clear(skb);
This is called to avoid leaking state in skb->_skb_refdst. Correct?
I'm wondering why we're doing it every time sk_psock_handle_skb() gets
invoked from the do/while loop in sk_psock_backlog(), instead of doing
it once after reading ingress flag with skb_bpf_ingress()?
> +
> if (!ingress) {
> if (!sock_writeable(psock->sk))
> return -EAGAIN;
> @@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work)
> len = skb->len;
> off = 0;
> start:
> - ingress = tcp_skb_bpf_ingress(skb);
> + ingress = skb_bpf_ingress(skb);
> do {
> ret = -EIO;
> if (likely(psock->sk->sk_socket))
> @@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
>
> static void sk_psock_zap_ingress(struct sk_psock *psock)
> {
> - __skb_queue_purge(&psock->ingress_skb);
> + struct sk_buff *skb;
> +
> + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
> + skb_bpf_redirect_clear(skb);
I believe we clone the skb before enqueuing it on psock->ingress_skb.
The clone happens either in sk_psock_verdict_recv() or in __strp_recv().
There are no other users holding a ref, so clearing the redirect seems
unneeded. Unless I'm missing something?
> + kfree_skb(skb);
> + }
> __sk_psock_purge_ingress_msg(psock);
> }
>
> @@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
> struct sk_psock *psock_other;
> struct sock *sk_other;
>
> - sk_other = tcp_skb_bpf_redirect_fetch(skb);
> + sk_other = skb_bpf_redirect_fetch(skb);
> /* This error is a buggy BPF program, it returned a redirect
> * return code, but then didn't set a redirect interface.
> */
> @@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
> * TLS context.
> */
> skb->sk = psock->sk;
> - tcp_skb_bpf_redirect_clear(skb);
> + skb_dst_drop(skb);
> + skb_bpf_redirect_clear(skb);
After skb_dst_drop(), skb->_skb_refdst is clear. So it seems the
redirect_clear() is not needed. But I'm guessing it is being invoked
to communicate the intention?
> ret = sk_psock_bpf_run(psock, prog, skb);
> - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
> + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
> skb->sk = NULL;
> }
> sk_psock_tls_verdict_apply(skb, psock->sk, ret);
> @@ -816,7 +824,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
> static void sk_psock_verdict_apply(struct sk_psock *psock,
> struct sk_buff *skb, int verdict)
> {
> - struct tcp_skb_cb *tcp;
> struct sock *sk_other;
> int err = -EIO;
>
> @@ -828,8 +835,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
> goto out_free;
> }
>
> - tcp = TCP_SKB_CB(skb);
> - tcp->bpf.flags |= BPF_F_INGRESS;
> + skb_bpf_set_ingress(skb);
>
> /* If the queue is empty then we can submit directly
> * into the msg queue. If its not empty we have to
> @@ -890,9 +896,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
> skb_set_owner_r(skb, sk);
> prog = READ_ONCE(psock->progs.skb_verdict);
> if (likely(prog)) {
> - tcp_skb_bpf_redirect_clear(skb);
> + skb_dst_drop(skb);
> + skb_bpf_redirect_clear(skb);
> ret = sk_psock_bpf_run(psock, prog, skb);
> - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
> + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
> }
> sk_psock_verdict_apply(psock, skb, ret);
> out:
> @@ -1005,9 +1012,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
> skb_set_owner_r(skb, sk);
> prog = READ_ONCE(psock->progs.skb_verdict);
> if (likely(prog)) {
> - tcp_skb_bpf_redirect_clear(skb);
> + skb_dst_drop(skb);
> + skb_bpf_redirect_clear(skb);
> ret = sk_psock_bpf_run(psock, prog, skb);
> - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
> + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
> }
> sk_psock_verdict_apply(psock, skb, ret);
> out:
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 1a28a5c2c61e..dbfcd7006338 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
> BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
> struct bpf_map *, map, u32, key, u64, flags)
> {
> - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
> struct sock *sk;
>
> if (unlikely(flags & ~(BPF_F_INGRESS)))
> @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
> if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
> return SK_DROP;
>
> - tcb->bpf.flags = flags;
> - tcb->bpf.sk_redir = sk;
> + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
> return SK_PASS;
> }
>
> @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = {
> BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
> struct bpf_map *, map, void *, key, u64, flags)
> {
> - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
> struct sock *sk;
>
> if (unlikely(flags & ~(BPF_F_INGRESS)))
> @@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
> if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
> return SK_DROP;
>
> - tcb->bpf.flags = flags;
> - tcb->bpf.sk_redir = sk;
> + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
> return SK_PASS;
> }
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb
2021-02-22 12:20 ` Jakub Sitnicki
@ 2021-02-22 19:27 ` Cong Wang
2021-02-23 17:52 ` Jakub Sitnicki
0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-22 19:27 UTC (permalink / raw)
To: Jakub Sitnicki
Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
Dongdong Wang, Jiang Wang, Cong Wang, Daniel Borkmann,
Lorenz Bauer, John Fastabend
On Mon, Feb 22, 2021 at 4:20 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly
> > does not work for any other non-TCP protocols. We can move them to
> > skb ext, but it introduces a memory allocation on fast path.
> >
> > Fortunately, we only need to a word-size to store all the information,
> > because the flags actually only contains 1 bit so can be just packed
> > into the lowest bit of the "pointer", which is stored as unsigned
> > long.
> >
> > Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is
> > no longer needed after ->sk_data_ready() so we can just drop it.
> >
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > Acked-by: John Fastabend <john.fastabend@gmail.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
>
> LGTM. I have some questions (below) that would help me confirm if I
> understand the changes, and what could be improved, if anything.
>
> Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
>
> > include/linux/skbuff.h | 3 +++
> > include/linux/skmsg.h | 35 +++++++++++++++++++++++++++++++++++
> > include/net/tcp.h | 19 -------------------
> > net/core/skmsg.c | 32 ++++++++++++++++++++------------
> > net/core/sock_map.c | 8 ++------
> > 5 files changed, 60 insertions(+), 37 deletions(-)
> >
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 6d0a33d1c0db..bd84f799c952 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -755,6 +755,9 @@ struct sk_buff {
> > void (*destructor)(struct sk_buff *skb);
> > };
> > struct list_head tcp_tsorted_anchor;
> > +#ifdef CONFIG_NET_SOCK_MSG
> > + unsigned long _sk_redir;
> > +#endif
> > };
> >
> > #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > index e3bb712af257..fc234d507fd7 100644
> > --- a/include/linux/skmsg.h
> > +++ b/include/linux/skmsg.h
> > @@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
> > return false;
> > return !!psock->saved_data_ready;
> > }
> > +
> > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
> > +static inline bool skb_bpf_ingress(const struct sk_buff *skb)
> > +{
> > + unsigned long sk_redir = skb->_sk_redir;
> > +
> > + return sk_redir & BPF_F_INGRESS;
> > +}
> > +
> > +static inline void skb_bpf_set_ingress(struct sk_buff *skb)
> > +{
> > + skb->_sk_redir |= BPF_F_INGRESS;
> > +}
> > +
> > +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir,
> > + bool ingress)
> > +{
> > + skb->_sk_redir = (unsigned long)sk_redir;
> > + if (ingress)
> > + skb->_sk_redir |= BPF_F_INGRESS;
> > +}
> > +
> > +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb)
> > +{
> > + unsigned long sk_redir = skb->_sk_redir;
> > +
> > + sk_redir &= ~0x1UL;
>
> We're using the enum when setting the bit flag, but a hardcoded constant
> when masking it. ~BPF_F_INGRESS would be more consistent here.
Well, here we need a mask, not a bit, but we don't have a mask yet,
hence I just use hard-coded 0x1. Does #define BPF_F_MASK 0x1UL
look any better?
>
> > + return (struct sock *)sk_redir;
> > +}
> > +
> > +static inline void skb_bpf_redirect_clear(struct sk_buff *skb)
> > +{
> > + skb->_sk_redir = 0;
> > +}
> > +#endif /* CONFIG_NET_SOCK_MSG */
> > #endif /* _LINUX_SKMSG_H */
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 947ef5da6867..075de26f449d 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -883,30 +883,11 @@ struct tcp_skb_cb {
> > struct inet6_skb_parm h6;
> > #endif
> > } header; /* For incoming skbs */
> > - struct {
> > - __u32 flags;
> > - struct sock *sk_redir;
> > - } bpf;
> > };
> > };
> >
> > #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
> >
> > -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
> > -{
> > - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
> > -}
> > -
> > -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
> > -{
> > - return TCP_SKB_CB(skb)->bpf.sk_redir;
> > -}
> > -
> > -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
> > -{
> > - TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
> > -}
> > -
> > extern const struct inet_connection_sock_af_ops ipv4_specific;
> >
> > #if IS_ENABLED(CONFIG_IPV6)
> > diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> > index 2d8bbb3fd87c..05b5af09ff42 100644
> > --- a/net/core/skmsg.c
> > +++ b/net/core/skmsg.c
> > @@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
> > static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
> > u32 off, u32 len, bool ingress)
> > {
> > + skb_bpf_redirect_clear(skb);
>
> This is called to avoid leaking state in skb->_skb_refdst. Correct?
This is to teach kfree_skb() not to consider it as a valid _skb_refdst.
>
> I'm wondering why we're doing it every time sk_psock_handle_skb() gets
> invoked from the do/while loop in sk_psock_backlog(), instead of doing
> it once after reading ingress flag with skb_bpf_ingress()?
It should also work, I don't see much difference here, as we almost
always process a full skb, that is, ret == skb->len.
>
> > +
> > if (!ingress) {
> > if (!sock_writeable(psock->sk))
> > return -EAGAIN;
> > @@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work)
> > len = skb->len;
> > off = 0;
> > start:
> > - ingress = tcp_skb_bpf_ingress(skb);
> > + ingress = skb_bpf_ingress(skb);
> > do {
> > ret = -EIO;
> > if (likely(psock->sk->sk_socket))
> > @@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
> >
> > static void sk_psock_zap_ingress(struct sk_psock *psock)
> > {
> > - __skb_queue_purge(&psock->ingress_skb);
> > + struct sk_buff *skb;
> > +
> > + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
> > + skb_bpf_redirect_clear(skb);
>
> I believe we clone the skb before enqueuing it psock->ingress_skb.
> Clone happens either in sk_psock_verdict_recv() or in __strp_recv().
> There are not other users holding a ref, so clearing the redirect seems
> unneeded. Unless I'm missing something?
Yes, skb dst is also cloned:
980 static void __copy_skb_header(struct sk_buff *new, const struct
sk_buff *old)
981 {
982 new->tstamp = old->tstamp;
983 /* We do not copy old->sk */
984 new->dev = old->dev;
985 memcpy(new->cb, old->cb, sizeof(old->cb));
986 skb_dst_copy(new, old);
Also, without this, dst_release() would complain again. I was not smart
enough to add it in the beginning; dst_release() taught me this lesson. ;)
>
> > + kfree_skb(skb);
> > + }
> > __sk_psock_purge_ingress_msg(psock);
> > }
> >
> > @@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
> > struct sk_psock *psock_other;
> > struct sock *sk_other;
> >
> > - sk_other = tcp_skb_bpf_redirect_fetch(skb);
> > + sk_other = skb_bpf_redirect_fetch(skb);
> > /* This error is a buggy BPF program, it returned a redirect
> > * return code, but then didn't set a redirect interface.
> > */
> > @@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
> > * TLS context.
> > */
> > skb->sk = psock->sk;
> > - tcp_skb_bpf_redirect_clear(skb);
> > + skb_dst_drop(skb);
> > + skb_bpf_redirect_clear(skb);
>
> After skb_dst_drop(), skb->_skb_refdst is clear. So it seems the
> redirect_clear() is not needed. But I'm guessing it is being invoked
> to communicate the intention?
Technically true, but I prefer to call them explicitly rather than rely on
the fact that skb->_skb_refdst shares the same storage with skb->_sk_redir,
which would also require some comments to explain.
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb
2021-02-22 19:27 ` Cong Wang
@ 2021-02-23 17:52 ` Jakub Sitnicki
2021-02-23 18:04 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-23 17:52 UTC (permalink / raw)
To: Cong Wang
Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
Dongdong Wang, Jiang Wang, Cong Wang, Daniel Borkmann,
Lorenz Bauer, John Fastabend
On Mon, Feb 22, 2021 at 08:27 PM CET, Cong Wang wrote:
> On Mon, Feb 22, 2021 at 4:20 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>>
>> On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
>> > From: Cong Wang <cong.wang@bytedance.com>
>> >
>> > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly
>> > does not work for any other non-TCP protocols. We can move them to
>> > skb ext, but it introduces a memory allocation on fast path.
>> >
>> > Fortunately, we only need to a word-size to store all the information,
>> > because the flags actually only contains 1 bit so can be just packed
>> > into the lowest bit of the "pointer", which is stored as unsigned
>> > long.
>> >
>> > Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is
>> > no longer needed after ->sk_data_ready() so we can just drop it.
>> >
>> > Cc: Daniel Borkmann <daniel@iogearbox.net>
>> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
>> > Cc: Lorenz Bauer <lmb@cloudflare.com>
>> > Acked-by: John Fastabend <john.fastabend@gmail.com>
>> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
>> > ---
>>
>> LGTM. I have some questions (below) that would help me confirm if I
>> understand the changes, and what could be improved, if anything.
>>
>> Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
>>
>> > include/linux/skbuff.h | 3 +++
>> > include/linux/skmsg.h | 35 +++++++++++++++++++++++++++++++++++
>> > include/net/tcp.h | 19 -------------------
>> > net/core/skmsg.c | 32 ++++++++++++++++++++------------
>> > net/core/sock_map.c | 8 ++------
>> > 5 files changed, 60 insertions(+), 37 deletions(-)
>> >
>> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>> > index 6d0a33d1c0db..bd84f799c952 100644
>> > --- a/include/linux/skbuff.h
>> > +++ b/include/linux/skbuff.h
>> > @@ -755,6 +755,9 @@ struct sk_buff {
>> > void (*destructor)(struct sk_buff *skb);
>> > };
>> > struct list_head tcp_tsorted_anchor;
>> > +#ifdef CONFIG_NET_SOCK_MSG
>> > + unsigned long _sk_redir;
>> > +#endif
>> > };
>> >
>> > #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
>> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
>> > index e3bb712af257..fc234d507fd7 100644
>> > --- a/include/linux/skmsg.h
>> > +++ b/include/linux/skmsg.h
>> > @@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
>> > return false;
>> > return !!psock->saved_data_ready;
>> > }
>> > +
>> > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
>> > +static inline bool skb_bpf_ingress(const struct sk_buff *skb)
>> > +{
>> > + unsigned long sk_redir = skb->_sk_redir;
>> > +
>> > + return sk_redir & BPF_F_INGRESS;
>> > +}
>> > +
>> > +static inline void skb_bpf_set_ingress(struct sk_buff *skb)
>> > +{
>> > + skb->_sk_redir |= BPF_F_INGRESS;
>> > +}
>> > +
>> > +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir,
>> > + bool ingress)
>> > +{
>> > + skb->_sk_redir = (unsigned long)sk_redir;
>> > + if (ingress)
>> > + skb->_sk_redir |= BPF_F_INGRESS;
>> > +}
>> > +
>> > +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb)
>> > +{
>> > + unsigned long sk_redir = skb->_sk_redir;
>> > +
>> > + sk_redir &= ~0x1UL;
>>
>> We're using the enum when setting the bit flag, but a hardcoded constant
>> when masking it. ~BPF_F_INGRESS would be more consistent here.
>
> Well, here we need a mask, not a bit, but we don't have a mask yet,
> hence I just use hard-coded 0x1. Does #define BPF_F_MASK 0x1UL
> look any better?
Based on what I've seen around, mask for sanitizing tagged pointers is
usually derived from the flag(s). For instance:
#define SKB_DST_NOREF 1UL
#define SKB_DST_PTRMASK ~(SKB_DST_NOREF)
#define SK_USER_DATA_NOCOPY 1UL
#define SK_USER_DATA_BPF 2UL /* Managed by BPF */
#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
Using ~(BPF_F_INGRESS) expression would be like substituting mask
definition.
[..]
>> > diff --git a/include/net/tcp.h b/include/net/tcp.h
>> > index 947ef5da6867..075de26f449d 100644
>> > --- a/include/net/tcp.h
>> > +++ b/include/net/tcp.h
>> > @@ -883,30 +883,11 @@ struct tcp_skb_cb {
>> > struct inet6_skb_parm h6;
>> > #endif
>> > } header; /* For incoming skbs */
>> > - struct {
>> > - __u32 flags;
>> > - struct sock *sk_redir;
>> > - } bpf;
>> > };
>> > };
>> >
>> > #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
>> >
>> > -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
>> > -{
>> > - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
>> > -}
>> > -
>> > -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
>> > -{
>> > - return TCP_SKB_CB(skb)->bpf.sk_redir;
>> > -}
>> > -
>> > -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
>> > -{
>> > - TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
>> > -}
>> > -
>> > extern const struct inet_connection_sock_af_ops ipv4_specific;
>> >
>> > #if IS_ENABLED(CONFIG_IPV6)
>> > diff --git a/net/core/skmsg.c b/net/core/skmsg.c
>> > index 2d8bbb3fd87c..05b5af09ff42 100644
>> > --- a/net/core/skmsg.c
>> > +++ b/net/core/skmsg.c
>> > @@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
>> > static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
>> > u32 off, u32 len, bool ingress)
>> > {
>> > + skb_bpf_redirect_clear(skb);
>>
>> This is called to avoid leaking state in skb->_skb_refdst. Correct?
>
> This is to teach kfree_skb() not to consider it as a valid _skb_refdst.
OK
>
>>
>> I'm wondering why we're doing it every time sk_psock_handle_skb() gets
>> invoked from the do/while loop in sk_psock_backlog(), instead of doing
>> it once after reading ingress flag with skb_bpf_ingress()?
>
> It should also work, I don't see much difference here, as we almost
> always process a full skb, that is, ret == skb->len.
OK
>
>
>>
>> > +
>> > if (!ingress) {
>> > if (!sock_writeable(psock->sk))
>> > return -EAGAIN;
>> > @@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work)
>> > len = skb->len;
>> > off = 0;
>> > start:
>> > - ingress = tcp_skb_bpf_ingress(skb);
>> > + ingress = skb_bpf_ingress(skb);
>> > do {
>> > ret = -EIO;
>> > if (likely(psock->sk->sk_socket))
>> > @@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
>> >
>> > static void sk_psock_zap_ingress(struct sk_psock *psock)
>> > {
>> > - __skb_queue_purge(&psock->ingress_skb);
>> > + struct sk_buff *skb;
>> > +
>> > + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
>> > + skb_bpf_redirect_clear(skb);
>>
>> I believe we clone the skb before enqueuing it on psock->ingress_skb.
>> The clone happens either in sk_psock_verdict_recv() or in __strp_recv().
>> There are no other users holding a ref, so clearing the redirect seems
>> unneeded. Unless I'm missing something?
>
> Yes, skb dst is also cloned:
>
> 980 static void __copy_skb_header(struct sk_buff *new, const struct
> sk_buff *old)
> 981 {
> 982 new->tstamp = old->tstamp;
> 983 /* We do not copy old->sk */
> 984 new->dev = old->dev;
> 985 memcpy(new->cb, old->cb, sizeof(old->cb));
> 986 skb_dst_copy(new, old);
>
> Also, without this, dst_release() would complain again. I was not smart
> enough to add it in the beginning, dst_release() taught me this lesson. ;)
OK, I think I follow you now.
Alternatively we could clear _skb_refdst after the clone, but before
enqueuing the skb in ingress_skb, and only when we're redirecting.
I believe that would be in sk_psock_skb_redirect, right before skb_queue_tail.
>
>>
>> > + kfree_skb(skb);
>> > + }
>> > __sk_psock_purge_ingress_msg(psock);
>> > }
>> >
>> > @@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
>> > struct sk_psock *psock_other;
>> > struct sock *sk_other;
>> >
>> > - sk_other = tcp_skb_bpf_redirect_fetch(skb);
>> > + sk_other = skb_bpf_redirect_fetch(skb);
>> > /* This error is a buggy BPF program, it returned a redirect
>> > * return code, but then didn't set a redirect interface.
>> > */
>> > @@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
>> > * TLS context.
>> > */
>> > skb->sk = psock->sk;
>> > - tcp_skb_bpf_redirect_clear(skb);
>> > + skb_dst_drop(skb);
>> > + skb_bpf_redirect_clear(skb);
>>
>> After skb_dst_drop(), skb->_skb_refdst is clear. So it seems the
>> redirect_clear() is not needed. But I'm guessing it is being invoked
>> to communicate the intention?
>
> Technically true, but I prefer to call them explicitly, not to rely on the
> fact skb->_skb_refdst shares the same storage with skb->_sk_redir,
> which would also require some comments to explain.
>
OK
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb
2021-02-23 17:52 ` Jakub Sitnicki
@ 2021-02-23 18:04 ` Cong Wang
2021-02-23 18:36 ` Jakub Sitnicki
0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-23 18:04 UTC (permalink / raw)
To: Jakub Sitnicki
Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
Dongdong Wang, Jiang Wang, Cong Wang, Daniel Borkmann,
Lorenz Bauer, John Fastabend
On Tue, Feb 23, 2021 at 9:53 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
> Based on what I've seen around, mask for sanitizing tagged pointers is
> usually derived from the flag(s). For instance:
>
> #define SKB_DST_NOREF 1UL
> #define SKB_DST_PTRMASK ~(SKB_DST_NOREF)
>
> #define SK_USER_DATA_NOCOPY 1UL
> #define SK_USER_DATA_BPF 2UL /* Managed by BPF */
> #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
>
> Using ~(BPF_F_INGRESS) expression would be like substituting mask
> definition.
Yes, that is why I said we need a mask.
>
> Alternatively we could clear _skb_refdst after the clone, but before
> enqueuing the skb in ingress_skb, and only when we're redirecting.
>
> I believe that would be in sk_psock_skb_redirect, right before skb_queue_tail.
Hmm? We definitely cannot clear skb->_sk_redir there, as it is used after
being enqueued in ingress_skb, that is, in sk_psock_backlog().
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb
2021-02-23 18:04 ` Cong Wang
@ 2021-02-23 18:36 ` Jakub Sitnicki
0 siblings, 0 replies; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-23 18:36 UTC (permalink / raw)
To: Cong Wang
Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
Dongdong Wang, Jiang Wang, Cong Wang, Daniel Borkmann,
Lorenz Bauer, John Fastabend
On Tue, Feb 23, 2021 at 07:04 PM CET, Cong Wang wrote:
> On Tue, Feb 23, 2021 at 9:53 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>> Based on what I've seen around, mask for sanitizing tagged pointers is
>> usually derived from the flag(s). For instance:
>>
>> #define SKB_DST_NOREF 1UL
>> #define SKB_DST_PTRMASK ~(SKB_DST_NOREF)
>>
>> #define SK_USER_DATA_NOCOPY 1UL
>> #define SK_USER_DATA_BPF 2UL /* Managed by BPF */
>> #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
>>
>> Using ~(BPF_F_INGRESS) expression would be like substituting mask
>> definition.
>
> Yes, that is why I said we need a mask.
OK
>
>>
>> Alternatively we could clear _skb_refdst after the clone, but before
>> enqueuing the skb in ingress_skb, and only when we're redirecting.
>>
>> I believe that would be in sk_psock_skb_redirect, right before skb_queue_tail.
>
> Hmm? We definitely cannot clear skb->_sk_redir there, as it is used after
> being enqueued in ingress_skb, that is, in sk_psock_backlog().
You're right. I focused on the sk pointer and forgot it also carries the
ingress flag.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [Patch bpf-next v6 5/8] sock_map: rename skb_parser and skb_verdict
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
` (3 preceding siblings ...)
2021-02-20 5:29 ` [Patch bpf-next v6 4/8] skmsg: move sk_redir from TCP_SKB_CB to skb Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-22 12:28 ` Jakub Sitnicki
2021-02-20 5:29 ` [Patch bpf-next v6 6/8] sock_map: make sock_map_prog_update() static Cong Wang
` (3 subsequent siblings)
8 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Daniel Borkmann, Jakub Sitnicki, Lorenz Bauer, John Fastabend
From: Cong Wang <cong.wang@bytedance.com>
These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER
and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact
they are only used for TCP. And save the name 'skb_verdict' for
general use later.
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
include/linux/skmsg.h | 8 +--
net/core/skmsg.c | 14 ++---
net/core/sock_map.c | 60 +++++++++----------
.../selftests/bpf/prog_tests/sockmap_listen.c | 8 +--
.../selftests/bpf/progs/test_sockmap_listen.c | 4 +-
5 files changed, 47 insertions(+), 47 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index fc234d507fd7..ab3f3f2c426f 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -56,8 +56,8 @@ struct sk_msg {
struct sk_psock_progs {
struct bpf_prog *msg_parser;
- struct bpf_prog *skb_parser;
- struct bpf_prog *skb_verdict;
+ struct bpf_prog *stream_parser;
+ struct bpf_prog *stream_verdict;
};
enum sk_psock_state_bits {
@@ -447,8 +447,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog,
static inline void psock_progs_drop(struct sk_psock_progs *progs)
{
psock_set_prog(&progs->msg_parser, NULL);
- psock_set_prog(&progs->skb_parser, NULL);
- psock_set_prog(&progs->skb_verdict, NULL);
+ psock_set_prog(&progs->stream_parser, NULL);
+ psock_set_prog(&progs->stream_verdict, NULL);
}
int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 05b5af09ff42..dbb176427c14 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -690,9 +690,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
- if (psock->progs.skb_parser)
+ if (psock->progs.stream_parser)
sk_psock_stop_strp(sk, psock);
- else if (psock->progs.skb_verdict)
+ else if (psock->progs.stream_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
@@ -802,7 +802,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
int ret = __SK_PASS;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
/* We skip full set_owner_r here because if we do a SK_PASS
* or SK_DROP we can skip skb memory accounting and use the
@@ -894,7 +894,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
goto out;
}
skb_set_owner_r(skb, sk);
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);
@@ -918,7 +918,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
int ret = skb->len;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_parser);
+ prog = READ_ONCE(psock->progs.stream_parser);
if (likely(prog)) {
skb->sk = psock->sk;
ret = sk_psock_bpf_run(psock, prog, skb);
@@ -981,7 +981,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
void sk_psock_done_strp(struct sk_psock *psock)
{
/* Parser has been stopped */
- if (psock->progs.skb_parser)
+ if (psock->progs.stream_parser)
strp_done(&psock->strp);
}
#endif
@@ -1010,7 +1010,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
goto out;
}
skb_set_owner_r(skb, sk);
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index dbfcd7006338..69785070f02d 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk,
struct bpf_map *map = link->map;
struct bpf_stab *stab = container_of(map, struct bpf_stab,
map);
- if (psock->saved_data_ready && stab->progs.skb_parser)
+ if (psock->saved_data_ready && stab->progs.stream_parser)
strp_stop = true;
- if (psock->saved_data_ready && stab->progs.skb_verdict)
+ if (psock->saved_data_ready && stab->progs.stream_verdict)
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
@@ -224,23 +224,23 @@ static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
struct sock *sk)
{
- struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+ struct bpf_prog *msg_parser, *stream_parser, *stream_verdict;
struct sk_psock *psock;
int ret;
- skb_verdict = READ_ONCE(progs->skb_verdict);
- if (skb_verdict) {
- skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
- if (IS_ERR(skb_verdict))
- return PTR_ERR(skb_verdict);
+ stream_verdict = READ_ONCE(progs->stream_verdict);
+ if (stream_verdict) {
+ stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+ if (IS_ERR(stream_verdict))
+ return PTR_ERR(stream_verdict);
}
- skb_parser = READ_ONCE(progs->skb_parser);
- if (skb_parser) {
- skb_parser = bpf_prog_inc_not_zero(skb_parser);
- if (IS_ERR(skb_parser)) {
- ret = PTR_ERR(skb_parser);
- goto out_put_skb_verdict;
+ stream_parser = READ_ONCE(progs->stream_parser);
+ if (stream_parser) {
+ stream_parser = bpf_prog_inc_not_zero(stream_parser);
+ if (IS_ERR(stream_parser)) {
+ ret = PTR_ERR(stream_parser);
+ goto out_put_stream_verdict;
}
}
@@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
msg_parser = bpf_prog_inc_not_zero(msg_parser);
if (IS_ERR(msg_parser)) {
ret = PTR_ERR(msg_parser);
- goto out_put_skb_parser;
+ goto out_put_stream_parser;
}
}
@@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
- (skb_parser && READ_ONCE(psock->progs.skb_parser)) ||
- (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
+ (stream_parser && READ_ONCE(psock->progs.stream_parser)) ||
+ (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
@@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
- if (skb_parser && skb_verdict && !psock->saved_data_ready) {
+ if (stream_parser && stream_verdict && !psock->saved_data_ready) {
ret = sk_psock_init_strp(sk, psock);
if (ret)
goto out_unlock_drop;
- psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
- psock_set_prog(&psock->progs.skb_parser, skb_parser);
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+ psock_set_prog(&psock->progs.stream_parser, stream_parser);
sk_psock_start_strp(sk, psock);
- } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) {
- psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
sk_psock_start_verdict(sk,psock);
}
write_unlock_bh(&sk->sk_callback_lock);
@@ -303,12 +303,12 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
out_progs:
if (msg_parser)
bpf_prog_put(msg_parser);
-out_put_skb_parser:
- if (skb_parser)
- bpf_prog_put(skb_parser);
-out_put_skb_verdict:
- if (skb_verdict)
- bpf_prog_put(skb_verdict);
+out_put_stream_parser:
+ if (stream_parser)
+ bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+ if (stream_verdict)
+ bpf_prog_put(stream_verdict);
return ret;
}
@@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
break;
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- pprog = &progs->skb_parser;
+ pprog = &progs->stream_parser;
break;
#endif
case BPF_SK_SKB_STREAM_VERDICT:
- pprog = &progs->skb_verdict;
+ pprog = &progs->stream_verdict;
break;
default:
return -EOPNOTSUPP;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index d7d65a700799..c26e6bf05e49 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel,
struct bpf_map *inner_map, int family,
int sotype)
{
- int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
- int parser = bpf_program__fd(skel->progs.prog_skb_parser);
+ int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+ int parser = bpf_program__fd(skel->progs.prog_stream_parser);
int verdict_map = bpf_map__fd(skel->maps.verdict_map);
int sock_map = bpf_map__fd(inner_map);
int err;
@@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel,
struct bpf_map *inner_map, int family,
int sotype)
{
- int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
- int parser = bpf_program__fd(skel->progs.prog_skb_parser);
+ int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+ int parser = bpf_program__fd(skel->progs.prog_stream_parser);
int verdict_map = bpf_map__fd(skel->maps.verdict_map);
int sock_map = bpf_map__fd(inner_map);
int err;
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
index a3a366c57ce1..fa221141e9c1 100644
--- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
@@ -31,13 +31,13 @@ struct {
static volatile bool test_sockmap; /* toggled by user-space */
SEC("sk_skb/stream_parser")
-int prog_skb_parser(struct __sk_buff *skb)
+int prog_stream_parser(struct __sk_buff *skb)
{
return skb->len;
}
SEC("sk_skb/stream_verdict")
-int prog_skb_verdict(struct __sk_buff *skb)
+int prog_stream_verdict(struct __sk_buff *skb)
{
unsigned int *count;
__u32 zero = 0;
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 5/8] sock_map: rename skb_parser and skb_verdict
2021-02-20 5:29 ` [Patch bpf-next v6 5/8] sock_map: rename skb_parser and skb_verdict Cong Wang
@ 2021-02-22 12:28 ` Jakub Sitnicki
2021-02-22 19:32 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-22 12:28 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, bpf, duanxiongchun, wangdongdong.6, jiang.wang,
Cong Wang, Daniel Borkmann, Lorenz Bauer, John Fastabend
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER
> and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact
> they are only used for TCP. And save the name 'skb_verdict' for
> general use later.
>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
skb_parser also appears in:
tools/testing/selftests/bpf/test_sockmap.c:int txmsg_omit_skb_parser;
tools/testing/selftests/bpf/test_sockmap.c: {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1},
tools/testing/selftests/bpf/test_sockmap.c: txmsg_omit_skb_parser = 0;
tools/testing/selftests/bpf/test_sockmap.c: if (!txmsg_omit_skb_parser) {
tools/testing/selftests/bpf/test_sockmap.c: if (!txmsg_omit_skb_parser) {
tools/testing/selftests/bpf/test_sockmap.c: /* Tests that omit skb_parser */
tools/testing/selftests/bpf/test_sockmap.c: txmsg_omit_skb_parser = 1;
tools/testing/selftests/bpf/test_sockmap.c: txmsg_omit_skb_parser = 0;
But I understand that changing the option name could break scripts or CI
setups. And even if that's not the case, it can be cleaned up later.
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 5/8] sock_map: rename skb_parser and skb_verdict
2021-02-22 12:28 ` Jakub Sitnicki
@ 2021-02-22 19:32 ` Cong Wang
0 siblings, 0 replies; 22+ messages in thread
From: Cong Wang @ 2021-02-22 19:32 UTC (permalink / raw)
To: Jakub Sitnicki
Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
Dongdong Wang, Jiang Wang, Cong Wang, Daniel Borkmann,
Lorenz Bauer, John Fastabend
On Mon, Feb 22, 2021 at 4:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> skb_parser also appears in:
>
> tools/testing/selftests/bpf/test_sockmap.c:int txmsg_omit_skb_parser;
> tools/testing/selftests/bpf/test_sockmap.c: {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1},
> tools/testing/selftests/bpf/test_sockmap.c: txmsg_omit_skb_parser = 0;
> tools/testing/selftests/bpf/test_sockmap.c: if (!txmsg_omit_skb_parser) {
> tools/testing/selftests/bpf/test_sockmap.c: if (!txmsg_omit_skb_parser) {
> tools/testing/selftests/bpf/test_sockmap.c: /* Tests that omit skb_parser */
> tools/testing/selftests/bpf/test_sockmap.c: txmsg_omit_skb_parser = 1;
> tools/testing/selftests/bpf/test_sockmap.c: txmsg_omit_skb_parser = 0;
These are harmless, because they are internal variables of a self test.
So, I prefer to just leave them as they are.
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [Patch bpf-next v6 6/8] sock_map: make sock_map_prog_update() static
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
` (4 preceding siblings ...)
2021-02-20 5:29 ` [Patch bpf-next v6 5/8] sock_map: rename skb_parser and skb_verdict Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-22 12:29 ` Jakub Sitnicki
2021-02-20 5:29 ` [Patch bpf-next v6 7/8] skmsg: make __sk_psock_purge_ingress_msg() static Cong Wang
` (2 subsequent siblings)
8 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Jakub Sitnicki, Daniel Borkmann, Lorenz Bauer, John Fastabend
From: Cong Wang <cong.wang@bytedance.com>
It is only used within sock_map.c so can become static.
Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
include/linux/bpf.h | 9 ---------
net/core/sock_map.c | 7 +++++--
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 813f30ef44ff..521b75a81aa6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1769,8 +1769,6 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which);
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
@@ -1788,13 +1786,6 @@ static inline void bpf_sk_reuseport_detach(struct sock *sk)
}
#ifdef CONFIG_BPF_SYSCALL
-static inline int sock_map_prog_update(struct bpf_map *map,
- struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
-{
- return -EOPNOTSUPP;
-}
-
static inline int sock_map_get_from_fd(const union bpf_attr *attr,
struct bpf_prog *prog)
{
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 69785070f02d..dd53a7771d7e 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,6 +24,9 @@ struct bpf_stab {
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which);
+
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
@@ -1444,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
struct bpf_prog **pprog;
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 6/8] sock_map: make sock_map_prog_update() static
2021-02-20 5:29 ` [Patch bpf-next v6 6/8] sock_map: make sock_map_prog_update() static Cong Wang
@ 2021-02-22 12:29 ` Jakub Sitnicki
0 siblings, 0 replies; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-22 12:29 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, bpf, duanxiongchun, wangdongdong.6, jiang.wang,
Cong Wang, Daniel Borkmann, Lorenz Bauer, John Fastabend
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> It is only used within sock_map.c so can become static.
>
> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
Thanks for the cleanup.
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [Patch bpf-next v6 7/8] skmsg: make __sk_psock_purge_ingress_msg() static
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
` (5 preceding siblings ...)
2021-02-20 5:29 ` [Patch bpf-next v6 6/8] sock_map: make sock_map_prog_update() static Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-22 12:30 ` Jakub Sitnicki
2021-02-20 5:29 ` [Patch bpf-next v6 8/8] skmsg: get rid of sk_psock_bpf_run() Cong Wang
2021-02-22 12:32 ` [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Jakub Sitnicki
8 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Jakub Sitnicki, Daniel Borkmann, Lorenz Bauer, John Fastabend
From: Cong Wang <cong.wang@bytedance.com>
It is only used within skmsg.c so can become static.
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
include/linux/skmsg.h | 2 --
net/core/skmsg.c | 2 +-
2 files changed, 1 insertion(+), 3 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index ab3f3f2c426f..9f838bdf2db3 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -344,8 +344,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link)
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
-
static inline void sk_psock_cork_free(struct sk_psock *psock)
{
if (psock->cork) {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index dbb176427c14..286a95304e03 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -620,7 +620,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
return link;
}
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
{
struct sk_msg *msg, *tmp;
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 7/8] skmsg: make __sk_psock_purge_ingress_msg() static
2021-02-20 5:29 ` [Patch bpf-next v6 7/8] skmsg: make __sk_psock_purge_ingress_msg() static Cong Wang
@ 2021-02-22 12:30 ` Jakub Sitnicki
0 siblings, 0 replies; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-22 12:30 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, bpf, duanxiongchun, wangdongdong.6, jiang.wang,
Cong Wang, Daniel Borkmann, Lorenz Bauer, John Fastabend
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> It is only used within skmsg.c so can become static.
>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [Patch bpf-next v6 8/8] skmsg: get rid of sk_psock_bpf_run()
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
` (6 preceding siblings ...)
2021-02-20 5:29 ` [Patch bpf-next v6 7/8] skmsg: make __sk_psock_purge_ingress_msg() static Cong Wang
@ 2021-02-20 5:29 ` Cong Wang
2021-02-22 12:31 ` Jakub Sitnicki
2021-02-22 12:32 ` [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Jakub Sitnicki
8 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2021-02-20 5:29 UTC (permalink / raw)
To: netdev
Cc: bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang,
Jakub Sitnicki, Daniel Borkmann, Lorenz Bauer, John Fastabend
From: Cong Wang <cong.wang@bytedance.com>
It is now nearly identical to bpf_prog_run_pin_on_cpu() and
it has an unused parameter 'psock', so we can just get rid
of it and call bpf_prog_run_pin_on_cpu() directly.
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
net/core/skmsg.c | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 286a95304e03..b240be71f21f 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -748,12 +748,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
-static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
- struct sk_buff *skb)
-{
- return bpf_prog_run_pin_on_cpu(prog, skb);
-}
-
static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
@@ -811,7 +805,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
skb->sk = psock->sk;
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
@@ -898,7 +892,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
if (likely(prog)) {
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
@@ -921,7 +915,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
prog = READ_ONCE(psock->progs.stream_parser);
if (likely(prog)) {
skb->sk = psock->sk;
- ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
skb->sk = NULL;
}
rcu_read_unlock();
@@ -1014,7 +1008,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
if (likely(prog)) {
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
--
2.25.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 8/8] skmsg: get rid of sk_psock_bpf_run()
2021-02-20 5:29 ` [Patch bpf-next v6 8/8] skmsg: get rid of sk_psock_bpf_run() Cong Wang
@ 2021-02-22 12:31 ` Jakub Sitnicki
0 siblings, 0 replies; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-22 12:31 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, bpf, duanxiongchun, wangdongdong.6, jiang.wang,
Cong Wang, Daniel Borkmann, Lorenz Bauer, John Fastabend
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> It is now nearly identical to bpf_prog_run_pin_on_cpu() and
> it has an unused parameter 'psock', so we can just get rid
> of it and call bpf_prog_run_pin_on_cpu() directly.
>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT
2021-02-20 5:29 [Patch bpf-next v6 0/8] sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT Cong Wang
` (7 preceding siblings ...)
2021-02-20 5:29 ` [Patch bpf-next v6 8/8] skmsg: get rid of sk_psock_bpf_run() Cong Wang
@ 2021-02-22 12:32 ` Jakub Sitnicki
8 siblings, 0 replies; 22+ messages in thread
From: Jakub Sitnicki @ 2021-02-22 12:32 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, bpf, duanxiongchun, wangdongdong.6, jiang.wang, Cong Wang
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
>
> This patchset is the first series of patches separated out from
> the original large patchset, to make reviews easier. This patchset
> does not add any new feature or change any functionality but merely
> cleans up the existing sockmap and skmsg code and refactors it, to
> prepare for the follow-up patches. This passed all BPF selftests.
>
> To see the big picture, the original whole patchset is available
> on github: https://github.com/congwang/linux/tree/sockmap
>
> and this patchset is also available on github:
> https://github.com/congwang/linux/tree/sockmap1
>
> ---
Thanks for the effort. It definitely looks like an improvement to me.
-Jakub
^ permalink raw reply [flat|nested] 22+ messages in thread