netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Andrii Nakryiko <andrii.nakryiko@gmail.com>
To: Martin KaFai Lau <kafai@fb.com>
Cc: bpf <bpf@vger.kernel.org>, Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	David Miller <davem@davemloft.net>,
	Kernel Team <kernel-team@fb.com>,
	Networking <netdev@vger.kernel.org>
Subject: Re: [PATCH bpf-next v2 11/11] bpf: Add bpf_dctcp example
Date: Mon, 23 Dec 2019 15:26:50 -0800	[thread overview]
Message-ID: <CAEf4BzZX_TNUXJktJUtqmxgMefDzie=Ta18TbBqBhG0-GSLQMg@mail.gmail.com> (raw)
In-Reply-To: <20191221062620.1184118-1-kafai@fb.com>

On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> This patch adds a bpf_dctcp example.  It currently does not do
> no-ECN fallback but the same could be done through the cgrp2-bpf.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
>  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
>  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
>  3 files changed, 656 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
>
> diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> new file mode 100644
> index 000000000000..7ba8c1b4157a
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> @@ -0,0 +1,228 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __BPF_TCP_HELPERS_H
> +#define __BPF_TCP_HELPERS_H
> +
> +#include <stdbool.h>
> +#include <linux/types.h>
> +#include <bpf_helpers.h>
> +#include <bpf_core_read.h>
> +#include "bpf_trace_helpers.h"
> +
> +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)

Should we try to put those BPF programs into some section that would
indicate they are used with struct_ops? libbpf doesn't use or enforce
that (even though it could, to derive and enforce that they are
STRUCT_OPS programs). So something like
SEC("struct_ops/<ideally-operation-name-here>"). I think having this
convention is very useful for consistency, and for doing a quick ELF
dump to see what is where. WDYT?

> +
> +struct sock_common {
> +       unsigned char   skc_state;
> +} __attribute__((preserve_access_index));
> +
> +struct sock {
> +       struct sock_common      __sk_common;
> +} __attribute__((preserve_access_index));
> +
> +struct inet_sock {
> +       struct sock             sk;
> +} __attribute__((preserve_access_index));
> +
> +struct inet_connection_sock {
> +       struct inet_sock          icsk_inet;
> +       __u8                      icsk_ca_state:6,
> +                                 icsk_ca_setsockopt:1,
> +                                 icsk_ca_dst_locked:1;
> +       struct {
> +               __u8              pending;
> +       } icsk_ack;
> +       __u64                     icsk_ca_priv[104 / sizeof(__u64)];
> +} __attribute__((preserve_access_index));
> +
> +struct tcp_sock {
> +       struct inet_connection_sock     inet_conn;
> +
> +       __u32   rcv_nxt;
> +       __u32   snd_nxt;
> +       __u32   snd_una;
> +       __u8    ecn_flags;
> +       __u32   delivered;
> +       __u32   delivered_ce;
> +       __u32   snd_cwnd;
> +       __u32   snd_cwnd_cnt;
> +       __u32   snd_cwnd_clamp;
> +       __u32   snd_ssthresh;
> +       __u8    syn_data:1,     /* SYN includes data */
> +               syn_fastopen:1, /* SYN includes Fast Open option */
> +               syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
> +               syn_fastopen_ch:1, /* Active TFO re-enabling probe */
> +               syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
> +               save_syn:1,     /* Save headers of SYN packet */
> +               is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
> +               syn_smc:1;      /* SYN includes SMC */
> +       __u32   max_packets_out;
> +       __u32   lsndtime;
> +       __u32   prior_cwnd;
> +} __attribute__((preserve_access_index));
> +
> +static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
> +{
> +       return (struct inet_connection_sock *)sk;
> +}
> +
> +static __always_inline void *inet_csk_ca(const struct sock *sk)
> +{
> +       return (void *)inet_csk(sk)->icsk_ca_priv;
> +}
> +
> +static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
> +{
> +       return (struct tcp_sock *)sk;
> +}
> +
> +static __always_inline bool before(__u32 seq1, __u32 seq2)
> +{
> +       return (__s32)(seq1-seq2) < 0;
> +}
> +#define after(seq2, seq1)      before(seq1, seq2)
> +
> +#define        TCP_ECN_OK              1
> +#define        TCP_ECN_QUEUE_CWR       2
> +#define        TCP_ECN_DEMAND_CWR      4
> +#define        TCP_ECN_SEEN            8
> +
> +enum inet_csk_ack_state_t {
> +       ICSK_ACK_SCHED  = 1,
> +       ICSK_ACK_TIMER  = 2,
> +       ICSK_ACK_PUSHED = 4,
> +       ICSK_ACK_PUSHED2 = 8,
> +       ICSK_ACK_NOW = 16       /* Send the next ACK immediately (once) */
> +};
> +
> +enum tcp_ca_event {
> +       CA_EVENT_TX_START = 0,
> +       CA_EVENT_CWND_RESTART = 1,
> +       CA_EVENT_COMPLETE_CWR = 2,
> +       CA_EVENT_LOSS = 3,
> +       CA_EVENT_ECN_NO_CE = 4,
> +       CA_EVENT_ECN_IS_CE = 5,
> +};
> +
> +enum tcp_ca_state {
> +       TCP_CA_Open = 0,
> +       TCP_CA_Disorder = 1,
> +       TCP_CA_CWR = 2,
> +       TCP_CA_Recovery = 3,
> +       TCP_CA_Loss = 4
> +};
> +
> +struct ack_sample {
> +       __u32 pkts_acked;
> +       __s32 rtt_us;
> +       __u32 in_flight;
> +} __attribute__((preserve_access_index));
> +
> +struct rate_sample {
> +       __u64  prior_mstamp; /* starting timestamp for interval */
> +       __u32  prior_delivered; /* tp->delivered at "prior_mstamp" */
> +       __s32  delivered;               /* number of packets delivered over interval */
> +       long interval_us;       /* time for tp->delivered to incr "delivered" */
> +       __u32 snd_interval_us;  /* snd interval for delivered packets */
> +       __u32 rcv_interval_us;  /* rcv interval for delivered packets */
> +       long rtt_us;            /* RTT of last (S)ACKed packet (or -1) */
> +       int  losses;            /* number of packets marked lost upon ACK */
> +       __u32  acked_sacked;    /* number of packets newly (S)ACKed upon ACK */
> +       __u32  prior_in_flight; /* in flight before this ACK */
> +       bool is_app_limited;    /* is sample from packet with bubble in pipe? */
> +       bool is_retrans;        /* is sample from retransmission? */
> +       bool is_ack_delayed;    /* is this (likely) a delayed ACK? */
> +} __attribute__((preserve_access_index));
> +
> +#define TCP_CA_NAME_MAX                16
> +#define TCP_CONG_NEEDS_ECN     0x2
> +
> +struct tcp_congestion_ops {
> +       __u32 flags;
> +
> +       /* initialize private data (optional) */
> +       void (*init)(struct sock *sk);
> +       /* cleanup private data  (optional) */
> +       void (*release)(struct sock *sk);
> +
> +       /* return slow start threshold (required) */
> +       __u32 (*ssthresh)(struct sock *sk);
> +       /* do new cwnd calculation (required) */
> +       void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
> +       /* call before changing ca_state (optional) */
> +       void (*set_state)(struct sock *sk, __u8 new_state);
> +       /* call when cwnd event occurs (optional) */
> +       void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
> +       /* call when ack arrives (optional) */
> +       void (*in_ack_event)(struct sock *sk, __u32 flags);
> +       /* new value of cwnd after loss (required) */
> +       __u32  (*undo_cwnd)(struct sock *sk);
> +       /* hook for packet ack accounting (optional) */
> +       void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
> +       /* override sysctl_tcp_min_tso_segs */
> +       __u32 (*min_tso_segs)(struct sock *sk);
> +       /* returns the multiplier used in tcp_sndbuf_expand (optional) */
> +       __u32 (*sndbuf_expand)(struct sock *sk);
> +       /* call when packets are delivered to update cwnd and pacing rate,
> +        * after all the ca_state processing. (optional)
> +        */
> +       void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
> +
> +       char            name[TCP_CA_NAME_MAX];
> +};

Can all of these types come from vmlinux.h instead of being duplicated here?

> +
> +#define min(a, b) ((a) < (b) ? (a) : (b))
> +#define max(a, b) ((a) > (b) ? (a) : (b))
> +#define min_not_zero(x, y) ({                  \
> +       typeof(x) __x = (x);                    \
> +       typeof(y) __y = (y);                    \
> +       __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
> +

[...]

> +static struct bpf_object *load(const char *filename, const char *map_name,
> +                              struct bpf_link **link)
> +{
> +       struct bpf_object *obj;
> +       struct bpf_map *map;
> +       struct bpf_link *l;
> +       int err;
> +
> +       obj = bpf_object__open(filename);
> +       if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
> +                 PTR_ERR(obj)))
> +               return obj;
> +
> +       err = bpf_object__load(obj);
> +       if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
> +               bpf_object__close(obj);
> +               return ERR_PTR(err);
> +       }
> +
> +       map = bpf_object__find_map_by_name(obj, map_name);
> +       if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
> +                   map_name)) {
> +               bpf_object__close(obj);
> +               return ERR_PTR(-ENOENT);
> +       }
> +

Use a skeleton instead?

> +       l = bpf_map__attach_struct_ops(map);
> +       if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
> +                 PTR_ERR(l))) {
> +               bpf_object__close(obj);
> +               return (void *)l;
> +       }
> +
> +       *link = l;
> +
> +       return obj;
> +}
> +
> +static void test_dctcp(void)
> +{
> +       struct bpf_object *obj;
> +       /* compiler warning... */
> +       struct bpf_link *link = NULL;
> +
> +       obj = load("bpf_dctcp.o", "dctcp", &link);
> +       if (IS_ERR(obj))
> +               return;
> +
> +       do_test("bpf_dctcp");
> +
> +       bpf_link__destroy(link);
> +       bpf_object__close(obj);
> +}
> +
> +void test_bpf_tcp_ca(void)
> +{
> +       if (test__start_subtest("dctcp"))
> +               test_dctcp();
> +}

[...]

  reply	other threads:[~2019-12-23 23:27 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-12-21  6:25 [PATCH bpf-next v2 00/11] Introduce BPF STRUCT_OPS Martin KaFai Lau
2019-12-21  6:25 ` [PATCH bpf-next v2 01/11] bpf: Save PTR_TO_BTF_ID register state when spilling to stack Martin KaFai Lau
2019-12-21  6:25 ` [PATCH bpf-next v2 02/11] bpf: Avoid storing modifier to info->btf_id Martin KaFai Lau
2019-12-21  6:26 ` [PATCH bpf-next v2 03/11] bpf: Add enum support to btf_ctx_access() Martin KaFai Lau
2019-12-21  6:26 ` [PATCH bpf-next v2 04/11] bpf: Support bitfield read access in btf_struct_access Martin KaFai Lau
2019-12-23  7:49   ` Yonghong Song
2019-12-23 20:05   ` Andrii Nakryiko
2019-12-23 21:21     ` Yonghong Song
2019-12-21  6:26 ` [PATCH bpf-next v2 05/11] bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS Martin KaFai Lau
2019-12-23 19:33   ` Yonghong Song
2019-12-23 20:29   ` Andrii Nakryiko
2019-12-23 22:29     ` Martin Lau
2019-12-23 22:55       ` Andrii Nakryiko
2019-12-24 11:46   ` kbuild test robot
2019-12-21  6:26 ` [PATCH bpf-next v2 06/11] bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS Martin KaFai Lau
2019-12-23 19:57   ` Yonghong Song
2019-12-23 21:44     ` Andrii Nakryiko
2019-12-23 22:15       ` Martin Lau
2019-12-27  6:16     ` Martin Lau
2019-12-23 23:05   ` Andrii Nakryiko
2019-12-28  1:47     ` Martin Lau
2019-12-28  2:24       ` Andrii Nakryiko
2019-12-28  5:16         ` Martin Lau
2019-12-24 12:28   ` kbuild test robot
2019-12-21  6:26 ` [PATCH bpf-next v2 07/11] bpf: tcp: Support tcp_congestion_ops in bpf Martin KaFai Lau
2019-12-23 20:18   ` Yonghong Song
2019-12-23 23:20   ` Andrii Nakryiko
2019-12-24  7:16   ` kbuild test robot
2019-12-24 13:06   ` kbuild test robot
2019-12-21  6:26 ` [PATCH bpf-next v2 08/11] bpf: Add BPF_FUNC_tcp_send_ack helper Martin KaFai Lau
2019-12-21  6:26 ` [PATCH bpf-next v2 09/11] bpf: Synch uapi bpf.h to tools/ Martin KaFai Lau
2019-12-21  6:26 ` [PATCH bpf-next v2 10/11] bpf: libbpf: Add STRUCT_OPS support Martin KaFai Lau
2019-12-23 19:54   ` Andrii Nakryiko
2019-12-26 22:47     ` Martin Lau
2019-12-21  6:26 ` [PATCH bpf-next v2 11/11] bpf: Add bpf_dctcp example Martin KaFai Lau
2019-12-23 23:26   ` Andrii Nakryiko [this message]
2019-12-24  1:31     ` Martin Lau
2019-12-24  7:01       ` Andrii Nakryiko
2019-12-24  7:32         ` Martin Lau
2019-12-24 16:50         ` Martin Lau
2019-12-26 19:02           ` Andrii Nakryiko
2019-12-26 20:25             ` Martin Lau
2019-12-26 20:48               ` Andrii Nakryiko
2019-12-26 22:20                 ` Martin Lau
2019-12-26 22:25                   ` Andrii Nakryiko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAEf4BzZX_TNUXJktJUtqmxgMefDzie=Ta18TbBqBhG0-GSLQMg@mail.gmail.com' \
    --to=andrii.nakryiko@gmail.com \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=kafai@fb.com \
    --cc=kernel-team@fb.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).