netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Martin KaFai Lau <martin.lau@linux.dev>
To: Amery Hung <ameryhung@gmail.com>
Cc: bpf@vger.kernel.org, yangpeihao@sjtu.edu.cn, toke@redhat.com,
	jhs@mojatatu.com, jiri@resnulli.us, sdf@google.com,
	xiyou.wangcong@gmail.com, yepeilin.cs@gmail.com,
	netdev@vger.kernel.org
Subject: Re: [RFC PATCH v7 1/8] net_sched: Introduce eBPF based Qdisc
Date: Tue, 23 Jan 2024 15:51:26 -0800	[thread overview]
Message-ID: <0484f7f7-715f-4084-b42d-6d43ebb5180f@linux.dev> (raw)
In-Reply-To: <232881645a5c4c05a35df4ff1f08a19ef9a02662.1705432850.git.amery.hung@bytedance.com>

On 1/17/24 1:56 PM, Amery Hung wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 0bb92414c036..df280bbb7c0d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -997,6 +997,7 @@ enum bpf_prog_type {
>   	BPF_PROG_TYPE_SK_LOOKUP,
>   	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
>   	BPF_PROG_TYPE_NETFILTER,
> +	BPF_PROG_TYPE_QDISC,
>   };
>   
>   enum bpf_attach_type {
> @@ -1056,6 +1057,8 @@ enum bpf_attach_type {
>   	BPF_CGROUP_UNIX_GETSOCKNAME,
>   	BPF_NETKIT_PRIMARY,
>   	BPF_NETKIT_PEER,
> +	BPF_QDISC_ENQUEUE,
> +	BPF_QDISC_DEQUEUE,
>   	__MAX_BPF_ATTACH_TYPE
>   };
>   
> @@ -7357,4 +7360,22 @@ struct bpf_iter_num {
>   	__u64 __opaque[1];
>   } __attribute__((aligned(8)));
>   
> +struct bpf_qdisc_ctx {
> +	__bpf_md_ptr(struct sk_buff *, skb);
> +	__u32 classid;
> +	__u64 expire;
> +	__u64 delta_ns;
> +};
> +
> +enum {
> +	SCH_BPF_QUEUED,
> +	SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
> +	SCH_BPF_DROP,
> +	SCH_BPF_CN,
> +	SCH_BPF_THROTTLE,
> +	SCH_BPF_PASS,
> +	SCH_BPF_BYPASS,
> +	SCH_BPF_STOLEN,
> +};
> +
>   #endif /* _UAPI__LINUX_BPF_H__ */

[ ... ]

> +static bool tc_qdisc_is_valid_access(int off, int size,
> +				     enum bpf_access_type type,
> +				     const struct bpf_prog *prog,
> +				     struct bpf_insn_access_aux *info)
> +{
> +	struct btf *btf;
> +
> +	if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct bpf_qdisc_ctx, skb):
> +		if (type == BPF_WRITE ||
> +		    size != sizeof_field(struct bpf_qdisc_ctx, skb))
> +			return false;
> +
> +		if (prog->expected_attach_type != BPF_QDISC_ENQUEUE)
> +			return false;
> +
> +		btf = bpf_get_btf_vmlinux();
> +		if (IS_ERR_OR_NULL(btf))
> +			return false;
> +
> +		info->btf = btf;
> +		info->btf_id = tc_qdisc_ctx_access_btf_ids[0];
> +		info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
> +		return true;
> +	case bpf_ctx_range(struct bpf_qdisc_ctx, classid):
> +		return size == sizeof_field(struct bpf_qdisc_ctx, classid);
> +	case bpf_ctx_range(struct bpf_qdisc_ctx, expire):
> +		return size == sizeof_field(struct bpf_qdisc_ctx, expire);
> +	case bpf_ctx_range(struct bpf_qdisc_ctx, delta_ns):
> +		return size == sizeof_field(struct bpf_qdisc_ctx, delta_ns);
> +	default:
> +		return false;
> +	}
> +
> +	return false;
> +}
> +

[ ... ]

> +static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> +			   struct sk_buff **to_free)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	unsigned int len = qdisc_pkt_len(skb);
> +	struct bpf_qdisc_ctx ctx = {};
> +	int res = NET_XMIT_SUCCESS;
> +	struct sch_bpf_class *cl;
> +	struct bpf_prog *enqueue;
> +
> +	enqueue = rcu_dereference(q->enqueue_prog.prog);
> +	if (!enqueue)
> +		return NET_XMIT_DROP;
> +
> +	ctx.skb = skb;
> +	ctx.classid = sch->handle;
> +	res = bpf_prog_run(enqueue, &ctx);
> +	switch (res) {
> +	case SCH_BPF_THROTTLE:
> +		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> +		qdisc_qstats_overlimit(sch);
> +		fallthrough;
> +	case SCH_BPF_QUEUED:
> +		qdisc_qstats_backlog_inc(sch, skb);
> +		return NET_XMIT_SUCCESS;
> +	case SCH_BPF_BYPASS:
> +		qdisc_qstats_drop(sch);
> +		__qdisc_drop(skb, to_free);
> +		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> +	case SCH_BPF_STOLEN:
> +		__qdisc_drop(skb, to_free);
> +		return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> +	case SCH_BPF_CN:
> +		return NET_XMIT_CN;
> +	case SCH_BPF_PASS:
> +		break;
> +	default:
> +		return qdisc_drop(skb, sch, to_free);
> +	}
> +
> +	cl = sch_bpf_find(sch, ctx.classid);
> +	if (!cl || !cl->qdisc)
> +		return qdisc_drop(skb, sch, to_free);
> +
> +	res = qdisc_enqueue(skb, cl->qdisc, to_free);
> +	if (res != NET_XMIT_SUCCESS) {
> +		if (net_xmit_drop_count(res)) {
> +			qdisc_qstats_drop(sch);
> +			cl->drops++;
> +		}
> +		return res;
> +	}
> +
> +	sch->qstats.backlog += len;
> +	sch->q.qlen++;
> +	return res;
> +}
> +
> +DEFINE_PER_CPU(struct sk_buff*, bpf_skb_dequeue);
> +
> +static struct sk_buff *sch_bpf_dequeue(struct Qdisc *sch)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	struct bpf_qdisc_ctx ctx = {};
> +	struct sk_buff *skb = NULL;
> +	struct bpf_prog *dequeue;
> +	struct sch_bpf_class *cl;
> +	int res;
> +
> +	dequeue = rcu_dereference(q->dequeue_prog.prog);
> +	if (!dequeue)
> +		return NULL;
> +
> +	__this_cpu_write(bpf_skb_dequeue, NULL);
> +	ctx.classid = sch->handle;
> +	res = bpf_prog_run(dequeue, &ctx);
> +	switch (res) {
> +	case SCH_BPF_DEQUEUED:
> +		skb = __this_cpu_read(bpf_skb_dequeue);
> +		qdisc_bstats_update(sch, skb);
> +		qdisc_qstats_backlog_dec(sch, skb);
> +		break;
> +	case SCH_BPF_THROTTLE:
> +		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> +		qdisc_qstats_overlimit(sch);
> +		cl = sch_bpf_find(sch, ctx.classid);
> +		if (cl)
> +			cl->overlimits++;
> +		return NULL;
> +	case SCH_BPF_PASS:
> +		cl = sch_bpf_find(sch, ctx.classid);
> +		if (!cl || !cl->qdisc)
> +			return NULL;
> +		skb = qdisc_dequeue_peeked(cl->qdisc);
> +		if (skb) {
> +			bstats_update(&cl->bstats, skb);
> +			qdisc_bstats_update(sch, skb);
> +			qdisc_qstats_backlog_dec(sch, skb);
> +			sch->q.qlen--;
> +		}
> +		break;
> +	}
> +
> +	return skb;
> +}

[ ... ]

> +static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
> +			struct netlink_ext_ack *extack)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	int err;
> +
> +	qdisc_watchdog_init(&q->watchdog, sch);
> +	if (opt) {
> +		err = sch_bpf_change(sch, opt, extack);
> +		if (err)
> +			return err;
> +	}
> +
> +	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
> +	if (err)
> +		return err;
> +
> +	return qdisc_class_hash_init(&q->clhash);
> +}
> +
> +static void sch_bpf_reset(struct Qdisc *sch)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	struct sch_bpf_class *cl;
> +	unsigned int i;
> +
> +	for (i = 0; i < q->clhash.hashsize; i++) {
> +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
> +			if (cl->qdisc)
> +				qdisc_reset(cl->qdisc);
> +		}
> +	}
> +
> +	qdisc_watchdog_cancel(&q->watchdog);
> +}
> +

[ ... ]

> +static const struct Qdisc_class_ops sch_bpf_class_ops = {
> +	.graft		=	sch_bpf_graft,
> +	.leaf		=	sch_bpf_leaf,
> +	.find		=	sch_bpf_search,
> +	.change		=	sch_bpf_change_class,
> +	.delete		=	sch_bpf_delete,
> +	.tcf_block	=	sch_bpf_tcf_block,
> +	.bind_tcf	=	sch_bpf_bind,
> +	.unbind_tcf	=	sch_bpf_unbind,
> +	.dump		=	sch_bpf_dump_class,
> +	.dump_stats	=	sch_bpf_dump_class_stats,
> +	.walk		=	sch_bpf_walk,
> +};
> +
> +static struct Qdisc_ops sch_bpf_qdisc_ops __read_mostly = {
> +	.cl_ops		=	&sch_bpf_class_ops,
> +	.id		=	"bpf",
> +	.priv_size	=	sizeof(struct bpf_sched_data),
> +	.enqueue	=	sch_bpf_enqueue,
> +	.dequeue	=	sch_bpf_dequeue,

I looked at the high level of the patchset. The major ops that it wants to be 
programmable in bpf is the ".enqueue" and ".dequeue" (+ ".init" and ".reset" in 
patch 4 and patch 5).

This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for 
".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the 
uapi. It is no long an acceptable way to add new bpf extension.

Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented 
in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops 
can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.

If other ops (like ".dump", ".dump_stats"...) do not have good use case to be 
programmable in bpf, it can stay with the kernel implementation for now and only 
allows the userspace to load the a bpf Qdisc_ops with .equeue/dequeue/init/reset 
implemented.

You mentioned in the cover letter that:
"Current struct_ops attachment model does not seem to support replacing only 
functions of a specific instance of a module, but I might be wrong."

I assumed you meant allow bpf to replace only "some" ops of the Qdisc_ops? Yes, 
it can be done through the struct_ops's ".init_member". Take a look at 
bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for 
".dump" (for example) when loading the bpf Qdisc_ops.

> +	.peek		=	qdisc_peek_dequeued,
> +	.init		=	sch_bpf_init,
> +	.reset		=	sch_bpf_reset, > +	.destroy	=	sch_bpf_destroy,
> +	.change		=	sch_bpf_change,
> +	.dump		=	sch_bpf_dump,
> +	.dump_stats	=	sch_bpf_dump_stats,
> +	.owner		=	THIS_MODULE,
> +};
> +
> +static int __init sch_bpf_mod_init(void)
> +{
> +	return register_qdisc(&sch_bpf_qdisc_ops);
> +}
> +
> +static void __exit sch_bpf_mod_exit(void)
> +{
> +	unregister_qdisc(&sch_bpf_qdisc_ops);
> +}


  reply	other threads:[~2024-01-23 23:51 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-17 21:56 [RFC PATCH v7 0/8] net_sched: Introduce eBPF based Qdisc Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 1/8] " Amery Hung
2024-01-23 23:51   ` Martin KaFai Lau [this message]
2024-01-24  5:22     ` Amery Hung
2024-01-26  2:22       ` Martin KaFai Lau
2024-01-27  1:17         ` Amery Hung
2024-01-30  6:39           ` Martin KaFai Lau
2024-01-30 17:49             ` Kui-Feng Lee
2024-01-31  1:01               ` Martin KaFai Lau
2024-01-31 16:49                 ` Kui-Feng Lee
2024-01-31 16:59                   ` Amery Hung
2024-01-31 16:23             ` Amery Hung
2024-02-02  1:47               ` Martin KaFai Lau
2024-02-09 20:14                 ` Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 2/8] net_sched: Add kfuncs for working with skb Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 3/8] net_sched: Introduce kfunc bpf_skb_tc_classify() Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 4/8] net_sched: Add reset program Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 5/8] net_sched: Add init program Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 6/8] tools/libbpf: Add support for BPF_PROG_TYPE_QDISC Amery Hung
2024-01-23  0:17   ` Andrii Nakryiko
2024-01-23 19:40     ` Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 7/8] samples/bpf: Add an example of bpf fq qdisc Amery Hung
2024-01-24 10:29   ` Daniel Borkmann
2024-01-26 19:49     ` Amery Hung
2024-01-17 21:56 ` [RFC PATCH v7 8/8] samples/bpf: Add an example of bpf netem qdisc Amery Hung
2024-01-23 21:13 ` [RFC PATCH v7 0/8] net_sched: Introduce eBPF based Qdisc Stanislav Fomichev
2024-01-24 10:10   ` Daniel Borkmann
2024-01-24 12:09   ` Jamal Hadi Salim
2024-01-24 13:07     ` Daniel Borkmann
2024-01-24 14:11       ` Jamal Hadi Salim
2024-01-24 15:26         ` Daniel Borkmann
2024-01-24 21:26           ` Amery Hung
2024-01-25 11:57             ` Daniel Borkmann

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0484f7f7-715f-4084-b42d-6d43ebb5180f@linux.dev \
    --to=martin.lau@linux.dev \
    --cc=ameryhung@gmail.com \
    --cc=bpf@vger.kernel.org \
    --cc=jhs@mojatatu.com \
    --cc=jiri@resnulli.us \
    --cc=netdev@vger.kernel.org \
    --cc=sdf@google.com \
    --cc=toke@redhat.com \
    --cc=xiyou.wangcong@gmail.com \
    --cc=yangpeihao@sjtu.edu.cn \
    --cc=yepeilin.cs@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).