bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
       [not found] <20230919145951.352548-1-victor@mojatatu.com>
@ 2023-09-19 22:15 ` Daniel Borkmann
  2023-09-19 23:20   ` Jamal Hadi Salim
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-09-19 22:15 UTC (permalink / raw)
  To: Victor Nogueira, jhs, xiyou.wangcong, jiri, davem, edumazet,
	kuba, pabeni, paulb
  Cc: netdev, kernel, martin.lau, bpf

[ +Martin, bpf ]

On 9/19/23 4:59 PM, Victor Nogueira wrote:
> Currently there is no way to distinguish between an error and a
> classification verdict. This patch adds the verdict field as a part of
> struct tcf_result. That way, tcf_classify can return a proper
> error number when it fails, and we keep the classification result
> information encapsulated in struct tcf_result.
> 
> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
> With that we can distinguish between a drop from a processing error versus
> a drop from classification.
> 
> Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> ---
>   include/net/dropreason-core.h |  6 +++++
>   include/net/sch_generic.h     |  7 ++++++
>   net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
>   net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
>   net/sched/sch_cake.c          | 32 +++++++++++++-------------
>   net/sched/sch_drr.c           | 33 +++++++++++++--------------
>   net/sched/sch_ets.c           |  6 +++--
>   net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
>   net/sched/sch_fq_pie.c        | 28 +++++++++++------------
>   net/sched/sch_hfsc.c          |  6 +++--
>   net/sched/sch_htb.c           |  6 +++--
>   net/sched/sch_multiq.c        |  6 +++--
>   net/sched/sch_prio.c          |  7 ++++--
>   net/sched/sch_qfq.c           | 34 +++++++++++++---------------
>   net/sched/sch_sfb.c           | 29 ++++++++++++------------
>   net/sched/sch_sfq.c           | 28 +++++++++++------------
>   16 files changed, 195 insertions(+), 142 deletions(-)
> 
> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> index a587e83fc169..b1c069c8e7f2 100644
> --- a/include/net/dropreason-core.h
> +++ b/include/net/dropreason-core.h
> @@ -80,6 +80,8 @@
>   	FN(IPV6_NDISC_BAD_OPTIONS)	\
>   	FN(IPV6_NDISC_NS_OTHERHOST)	\
>   	FN(QUEUE_PURGE)			\
> +	FN(TC_EGRESS_ERROR)		\
> +	FN(TC_INGRESS_ERROR)		\
>   	FNe(MAX)
>   
>   /**
> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>   	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>   	/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>   	SKB_DROP_REASON_QUEUE_PURGE,
> +	/** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> +	SKB_DROP_REASON_TC_EGRESS_ERROR,
> +	/** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> +	SKB_DROP_REASON_TC_INGRESS_ERROR,
>   	/**
>   	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>   	 * shouldn't be used as a real 'reason' - only for tracing code gen
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index f232512505f8..9a3f71d2545e 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -326,6 +326,7 @@ struct Qdisc_ops {
>   
>   
>   struct tcf_result {
> +	u32 verdict;
>   	union {
>   		struct {
>   			unsigned long	class;
> @@ -336,6 +337,12 @@ struct tcf_result {
>   	};
>   };
>   
> +static inline void tcf_result_set_verdict(struct tcf_result *res,
> +					  const u32 verdict)
> +{
> +	res->verdict = verdict;
> +}
> +
>   struct tcf_chain;
>   
>   struct tcf_proto_ops {
> diff --git a/net/core/dev.c b/net/core/dev.c
> index ccff2b6ef958..1450f4741d9b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
>   #endif /* CONFIG_NET_EGRESS */
>   
>   #ifdef CONFIG_NET_XGRESS
> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
> +		  struct tcf_result *res)
>   {
> -	int ret = TC_ACT_UNSPEC;
> +	int ret = 0;
>   #ifdef CONFIG_NET_CLS_ACT
>   	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
> -	struct tcf_result res;
>   
> -	if (!miniq)
> +	if (!miniq) {
> +		tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>   		return ret;
> +	}
>   
>   	tc_skb_cb(skb)->mru = 0;
>   	tc_skb_cb(skb)->post_ct = false;
>   
>   	mini_qdisc_bstats_cpu_update(miniq, skb);
> -	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> +	ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
> +	if (ret < 0) {
> +		mini_qdisc_qstats_cpu_drop(miniq);
> +		return ret;
> +	}
>   	/* Only tcf related quirks below. */
> -	switch (ret) {
> +	switch (res->verdict) {
>   	case TC_ACT_SHOT:
>   		mini_qdisc_qstats_cpu_drop(miniq);
>   		break;
>   	case TC_ACT_OK:
>   	case TC_ACT_RECLASSIFY:
> -		skb->tc_index = TC_H_MIN(res.classid);
> +		skb->tc_index = TC_H_MIN(res->classid);
>   		break;
>   	}
> +#else
> +	tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>   #endif /* CONFIG_NET_CLS_ACT */
>   	return ret;
>   }
> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>   		   struct net_device *orig_dev, bool *another)
>   {
>   	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
> +	struct tcf_result res = {0};
>   	int sch_ret;
>   
>   	if (!entry)
> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>   		if (sch_ret != TC_ACT_UNSPEC)
>   			goto ingress_verdict;
>   	}
> -	sch_ret = tc_run(tcx_entry(entry), skb);
> +	sch_ret = tc_run(tcx_entry(entry), skb, &res);
> +	if (sch_ret < 0) {
> +		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> +		*ret = NET_RX_DROP;
> +		return NULL;
> +	}
>   ingress_verdict:
> -	switch (sch_ret) {
> +	switch (res.verdict) {

This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
in such case.

>   	case TC_ACT_REDIRECT:
>   		/* skb_mac_header check was done by BPF, so we can safely
>   		 * push the L2 header back before redirecting to another
> @@ -4032,6 +4046,7 @@ static __always_inline struct sk_buff *
>   sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>   {
>   	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
> +	struct tcf_result res = {0};
>   	int sch_ret;
>   
>   	if (!entry)
> @@ -4045,9 +4060,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>   		if (sch_ret != TC_ACT_UNSPEC)
>   			goto egress_verdict;
>   	}
> -	sch_ret = tc_run(tcx_entry(entry), skb);
> +	sch_ret = tc_run(tcx_entry(entry), skb, &res);
> +	if (sch_ret < 0) {
> +		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS_ERROR);
> +		*ret = NET_XMIT_DROP;
> +		return NULL;
> +	}
>   egress_verdict:
> -	switch (sch_ret) {
> +	switch (res.verdict) {
>   	case TC_ACT_REDIRECT:
>   		/* No need to push/pop skb's mac_header here on egress! */
>   		skb_do_redirect(skb);

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-09-19 22:15 ` [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code Daniel Borkmann
@ 2023-09-19 23:20   ` Jamal Hadi Salim
  2023-09-22  8:12     ` Daniel Borkmann
  0 siblings, 1 reply; 19+ messages in thread
From: Jamal Hadi Salim @ 2023-09-19 23:20 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> [ +Martin, bpf ]
>
> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> > Currently there is no way to distinguish between an error and a
> > classification verdict. This patch adds the verdict field as a part of
> > struct tcf_result. That way, tcf_classify can return a proper
> > error number when it fails, and we keep the classification result
> > information encapsulated in struct tcf_result.
> >
> > Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
> > SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
> > With that we can distinguish between a drop from a processing error versus
> > a drop from classification.
> >
> > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > ---
> >   include/net/dropreason-core.h |  6 +++++
> >   include/net/sch_generic.h     |  7 ++++++
> >   net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
> >   net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
> >   net/sched/sch_cake.c          | 32 +++++++++++++-------------
> >   net/sched/sch_drr.c           | 33 +++++++++++++--------------
> >   net/sched/sch_ets.c           |  6 +++--
> >   net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
> >   net/sched/sch_fq_pie.c        | 28 +++++++++++------------
> >   net/sched/sch_hfsc.c          |  6 +++--
> >   net/sched/sch_htb.c           |  6 +++--
> >   net/sched/sch_multiq.c        |  6 +++--
> >   net/sched/sch_prio.c          |  7 ++++--
> >   net/sched/sch_qfq.c           | 34 +++++++++++++---------------
> >   net/sched/sch_sfb.c           | 29 ++++++++++++------------
> >   net/sched/sch_sfq.c           | 28 +++++++++++------------
> >   16 files changed, 195 insertions(+), 142 deletions(-)
> >
> > diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> > index a587e83fc169..b1c069c8e7f2 100644
> > --- a/include/net/dropreason-core.h
> > +++ b/include/net/dropreason-core.h
> > @@ -80,6 +80,8 @@
> >       FN(IPV6_NDISC_BAD_OPTIONS)      \
> >       FN(IPV6_NDISC_NS_OTHERHOST)     \
> >       FN(QUEUE_PURGE)                 \
> > +     FN(TC_EGRESS_ERROR)             \
> > +     FN(TC_INGRESS_ERROR)            \
> >       FNe(MAX)
> >
> >   /**
> > @@ -345,6 +347,10 @@ enum skb_drop_reason {
> >       SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
> >       /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
> >       SKB_DROP_REASON_QUEUE_PURGE,
> > +     /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> > +     SKB_DROP_REASON_TC_EGRESS_ERROR,
> > +     /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> > +     SKB_DROP_REASON_TC_INGRESS_ERROR,
> >       /**
> >        * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
> >        * shouldn't be used as a real 'reason' - only for tracing code gen
> > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > index f232512505f8..9a3f71d2545e 100644
> > --- a/include/net/sch_generic.h
> > +++ b/include/net/sch_generic.h
> > @@ -326,6 +326,7 @@ struct Qdisc_ops {
> >
> >
> >   struct tcf_result {
> > +     u32 verdict;
> >       union {
> >               struct {
> >                       unsigned long   class;
> > @@ -336,6 +337,12 @@ struct tcf_result {
> >       };
> >   };
> >
> > +static inline void tcf_result_set_verdict(struct tcf_result *res,
> > +                                       const u32 verdict)
> > +{
> > +     res->verdict = verdict;
> > +}
> > +
> >   struct tcf_chain;
> >
> >   struct tcf_proto_ops {
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index ccff2b6ef958..1450f4741d9b 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
> >   #endif /* CONFIG_NET_EGRESS */
> >
> >   #ifdef CONFIG_NET_XGRESS
> > -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> > +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
> > +               struct tcf_result *res)
> >   {
> > -     int ret = TC_ACT_UNSPEC;
> > +     int ret = 0;
> >   #ifdef CONFIG_NET_CLS_ACT
> >       struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
> > -     struct tcf_result res;
> >
> > -     if (!miniq)
> > +     if (!miniq) {
> > +             tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >               return ret;
> > +     }
> >
> >       tc_skb_cb(skb)->mru = 0;
> >       tc_skb_cb(skb)->post_ct = false;
> >
> >       mini_qdisc_bstats_cpu_update(miniq, skb);
> > -     ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> > +     ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
> > +     if (ret < 0) {
> > +             mini_qdisc_qstats_cpu_drop(miniq);
> > +             return ret;
> > +     }
> >       /* Only tcf related quirks below. */
> > -     switch (ret) {
> > +     switch (res->verdict) {
> >       case TC_ACT_SHOT:
> >               mini_qdisc_qstats_cpu_drop(miniq);
> >               break;
> >       case TC_ACT_OK:
> >       case TC_ACT_RECLASSIFY:
> > -             skb->tc_index = TC_H_MIN(res.classid);
> > +             skb->tc_index = TC_H_MIN(res->classid);
> >               break;
> >       }
> > +#else
> > +     tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >   #endif /* CONFIG_NET_CLS_ACT */
> >       return ret;
> >   }
> > @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >                  struct net_device *orig_dev, bool *another)
> >   {
> >       struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
> > +     struct tcf_result res = {0};
> >       int sch_ret;
> >
> >       if (!entry)
> > @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >               if (sch_ret != TC_ACT_UNSPEC)
> >                       goto ingress_verdict;
> >       }
> > -     sch_ret = tc_run(tcx_entry(entry), skb);
> > +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
> > +     if (sch_ret < 0) {
> > +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> > +             *ret = NET_RX_DROP;
> > +             return NULL;
> > +     }
> >   ingress_verdict:
> > -     switch (sch_ret) {
> > +     switch (res.verdict) {
>
> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
> in such case.
>

I think it is valuable to have a good reason code like
SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs
verdicts in the case of tc_run() variant.
For tcx_run(), does this look ok (for consistency)?:

if (static_branch_unlikely(&tcx_needed_key)) {
                sch_ret = tcx_run(entry, skb, true);
                if (sch_ret != TC_ACT_UNSPEC) {
                        res.verdict = sch_ret;
                        goto ingress_verdict;
                }
}

cheers,
jamal

> >       case TC_ACT_REDIRECT:
> >               /* skb_mac_header check was done by BPF, so we can safely
> >                * push the L2 header back before redirecting to another
> > @@ -4032,6 +4046,7 @@ static __always_inline struct sk_buff *
> >   sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
> >   {
> >       struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
> > +     struct tcf_result res = {0};
> >       int sch_ret;
> >
> >       if (!entry)
> > @@ -4045,9 +4060,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
> >               if (sch_ret != TC_ACT_UNSPEC)
> >                       goto egress_verdict;
> >       }
> > -     sch_ret = tc_run(tcx_entry(entry), skb);
> > +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
> > +     if (sch_ret < 0) {
> > +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS_ERROR);
> > +             *ret = NET_XMIT_DROP;
> > +             return NULL;
> > +     }
> >   egress_verdict:
> > -     switch (sch_ret) {
> > +     switch (res.verdict) {
> >       case TC_ACT_REDIRECT:
> >               /* No need to push/pop skb's mac_header here on egress! */
> >               skb_do_redirect(skb);

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-09-19 23:20   ` Jamal Hadi Salim
@ 2023-09-22  8:12     ` Daniel Borkmann
  2023-09-25 23:01       ` Jamal Hadi Salim
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-09-22  8:12 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> [ +Martin, bpf ]
>>
>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>>> Currently there is no way to distinguish between an error and a
>>> classification verdict. This patch adds the verdict field as a part of
>>> struct tcf_result. That way, tcf_classify can return a proper
>>> error number when it fails, and we keep the classification result
>>> information encapsulated in struct tcf_result.
>>>
>>> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
>>> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
>>> With that we can distinguish between a drop from a processing error versus
>>> a drop from classification.
>>>
>>> Signed-off-by: Victor Nogueira <victor@mojatatu.com>
>>> ---
>>>    include/net/dropreason-core.h |  6 +++++
>>>    include/net/sch_generic.h     |  7 ++++++
>>>    net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
>>>    net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
>>>    net/sched/sch_cake.c          | 32 +++++++++++++-------------
>>>    net/sched/sch_drr.c           | 33 +++++++++++++--------------
>>>    net/sched/sch_ets.c           |  6 +++--
>>>    net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
>>>    net/sched/sch_fq_pie.c        | 28 +++++++++++------------
>>>    net/sched/sch_hfsc.c          |  6 +++--
>>>    net/sched/sch_htb.c           |  6 +++--
>>>    net/sched/sch_multiq.c        |  6 +++--
>>>    net/sched/sch_prio.c          |  7 ++++--
>>>    net/sched/sch_qfq.c           | 34 +++++++++++++---------------
>>>    net/sched/sch_sfb.c           | 29 ++++++++++++------------
>>>    net/sched/sch_sfq.c           | 28 +++++++++++------------
>>>    16 files changed, 195 insertions(+), 142 deletions(-)
>>>
>>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
>>> index a587e83fc169..b1c069c8e7f2 100644
>>> --- a/include/net/dropreason-core.h
>>> +++ b/include/net/dropreason-core.h
>>> @@ -80,6 +80,8 @@
>>>        FN(IPV6_NDISC_BAD_OPTIONS)      \
>>>        FN(IPV6_NDISC_NS_OTHERHOST)     \
>>>        FN(QUEUE_PURGE)                 \
>>> +     FN(TC_EGRESS_ERROR)             \
>>> +     FN(TC_INGRESS_ERROR)            \
>>>        FNe(MAX)
>>>
>>>    /**
>>> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>>>        SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>>>        /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>>>        SKB_DROP_REASON_QUEUE_PURGE,
>>> +     /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
>>> +     SKB_DROP_REASON_TC_EGRESS_ERROR,
>>> +     /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
>>> +     SKB_DROP_REASON_TC_INGRESS_ERROR,
>>>        /**
>>>         * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>>>         * shouldn't be used as a real 'reason' - only for tracing code gen
>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>>> index f232512505f8..9a3f71d2545e 100644
>>> --- a/include/net/sch_generic.h
>>> +++ b/include/net/sch_generic.h
>>> @@ -326,6 +326,7 @@ struct Qdisc_ops {
>>>
>>>
>>>    struct tcf_result {
>>> +     u32 verdict;
>>>        union {
>>>                struct {
>>>                        unsigned long   class;
>>> @@ -336,6 +337,12 @@ struct tcf_result {
>>>        };
>>>    };
>>>
>>> +static inline void tcf_result_set_verdict(struct tcf_result *res,
>>> +                                       const u32 verdict)
>>> +{
>>> +     res->verdict = verdict;
>>> +}
>>> +
>>>    struct tcf_chain;
>>>
>>>    struct tcf_proto_ops {
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index ccff2b6ef958..1450f4741d9b 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
>>>    #endif /* CONFIG_NET_EGRESS */
>>>
>>>    #ifdef CONFIG_NET_XGRESS
>>> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
>>> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
>>> +               struct tcf_result *res)
>>>    {
>>> -     int ret = TC_ACT_UNSPEC;
>>> +     int ret = 0;
>>>    #ifdef CONFIG_NET_CLS_ACT
>>>        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
>>> -     struct tcf_result res;
>>>
>>> -     if (!miniq)
>>> +     if (!miniq) {
>>> +             tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>>>                return ret;
>>> +     }
>>>
>>>        tc_skb_cb(skb)->mru = 0;
>>>        tc_skb_cb(skb)->post_ct = false;
>>>
>>>        mini_qdisc_bstats_cpu_update(miniq, skb);
>>> -     ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
>>> +     ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
>>> +     if (ret < 0) {
>>> +             mini_qdisc_qstats_cpu_drop(miniq);
>>> +             return ret;
>>> +     }
>>>        /* Only tcf related quirks below. */
>>> -     switch (ret) {
>>> +     switch (res->verdict) {
>>>        case TC_ACT_SHOT:
>>>                mini_qdisc_qstats_cpu_drop(miniq);
>>>                break;
>>>        case TC_ACT_OK:
>>>        case TC_ACT_RECLASSIFY:
>>> -             skb->tc_index = TC_H_MIN(res.classid);
>>> +             skb->tc_index = TC_H_MIN(res->classid);
>>>                break;
>>>        }
>>> +#else
>>> +     tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>>>    #endif /* CONFIG_NET_CLS_ACT */
>>>        return ret;
>>>    }
>>> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>>                   struct net_device *orig_dev, bool *another)
>>>    {
>>>        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
>>> +     struct tcf_result res = {0};
>>>        int sch_ret;
>>>
>>>        if (!entry)
>>> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>>                if (sch_ret != TC_ACT_UNSPEC)
>>>                        goto ingress_verdict;
>>>        }
>>> -     sch_ret = tc_run(tcx_entry(entry), skb);
>>> +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
>>> +     if (sch_ret < 0) {
>>> +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
>>> +             *ret = NET_RX_DROP;
>>> +             return NULL;
>>> +     }
>>>    ingress_verdict:
>>> -     switch (sch_ret) {
>>> +     switch (res.verdict) {
>>
>> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
>> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
>> in such case.
> 
> I think it is valuable to have a good reason code like
> SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs
> verdicts in the case of tc_run() variant.
> For tcx_run(), does this look ok (for consistency)?:
> 
> if (static_branch_unlikely(&tcx_needed_key)) {
>                  sch_ret = tcx_run(entry, skb, true);
>                  if (sch_ret != TC_ACT_UNSPEC) {
>                          res.verdict = sch_ret;
>                          goto ingress_verdict;
>                  }
> }

In the above case we don't have 'internal' errors which you want to trace, so I would
also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
every packet.

I was more thinking like something below could be a better choice. I presume your main
goal is to trace where these errors originated in the first place, so it might even be
useful to capture the actual return code as well.

Then you can use perf script, bpf and whatnot to gather further insights into what
happened while being less invasive and avoiding the need to extend struct tcf_result.

This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
that in fast path all errors are < TC_ACT_UNSPEC anyway.

diff --git a/net/core/dev.c b/net/core/dev.c
index 85df22f05c38..4089d195144d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)

  	mini_qdisc_bstats_cpu_update(miniq, skb);
  	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+	if (unlikely(ret < TC_ACT_UNSPEC)) {
+		trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
+		ret = TC_ACT_SHOT;
+	}
  	/* Only tcf related quirks below. */
  	switch (ret) {
  	case TC_ACT_SHOT:

Best,
Daniel

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-09-22  8:12     ` Daniel Borkmann
@ 2023-09-25 23:01       ` Jamal Hadi Salim
  2023-09-29 15:48         ` Daniel Borkmann
  0 siblings, 1 reply; 19+ messages in thread
From: Jamal Hadi Salim @ 2023-09-25 23:01 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> > On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>
> >> [ +Martin, bpf ]
> >>
> >> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> >>> Currently there is no way to distinguish between an error and a
> >>> classification verdict. This patch adds the verdict field as a part of
> >>> struct tcf_result. That way, tcf_classify can return a proper
> >>> error number when it fails, and we keep the classification result
> >>> information encapsulated in struct tcf_result.
> >>>
> >>> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
> >>> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
> >>> With that we can distinguish between a drop from a processing error versus
> >>> a drop from classification.
> >>>
> >>> Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> >>> ---
> >>>    include/net/dropreason-core.h |  6 +++++
> >>>    include/net/sch_generic.h     |  7 ++++++
> >>>    net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
> >>>    net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
> >>>    net/sched/sch_cake.c          | 32 +++++++++++++-------------
> >>>    net/sched/sch_drr.c           | 33 +++++++++++++--------------
> >>>    net/sched/sch_ets.c           |  6 +++--
> >>>    net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
> >>>    net/sched/sch_fq_pie.c        | 28 +++++++++++------------
> >>>    net/sched/sch_hfsc.c          |  6 +++--
> >>>    net/sched/sch_htb.c           |  6 +++--
> >>>    net/sched/sch_multiq.c        |  6 +++--
> >>>    net/sched/sch_prio.c          |  7 ++++--
> >>>    net/sched/sch_qfq.c           | 34 +++++++++++++---------------
> >>>    net/sched/sch_sfb.c           | 29 ++++++++++++------------
> >>>    net/sched/sch_sfq.c           | 28 +++++++++++------------
> >>>    16 files changed, 195 insertions(+), 142 deletions(-)
> >>>
> >>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> >>> index a587e83fc169..b1c069c8e7f2 100644
> >>> --- a/include/net/dropreason-core.h
> >>> +++ b/include/net/dropreason-core.h
> >>> @@ -80,6 +80,8 @@
> >>>        FN(IPV6_NDISC_BAD_OPTIONS)      \
> >>>        FN(IPV6_NDISC_NS_OTHERHOST)     \
> >>>        FN(QUEUE_PURGE)                 \
> >>> +     FN(TC_EGRESS_ERROR)             \
> >>> +     FN(TC_INGRESS_ERROR)            \
> >>>        FNe(MAX)
> >>>
> >>>    /**
> >>> @@ -345,6 +347,10 @@ enum skb_drop_reason {
> >>>        SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
> >>>        /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
> >>>        SKB_DROP_REASON_QUEUE_PURGE,
> >>> +     /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> >>> +     SKB_DROP_REASON_TC_EGRESS_ERROR,
> >>> +     /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> >>> +     SKB_DROP_REASON_TC_INGRESS_ERROR,
> >>>        /**
> >>>         * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
> >>>         * shouldn't be used as a real 'reason' - only for tracing code gen
> >>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> >>> index f232512505f8..9a3f71d2545e 100644
> >>> --- a/include/net/sch_generic.h
> >>> +++ b/include/net/sch_generic.h
> >>> @@ -326,6 +326,7 @@ struct Qdisc_ops {
> >>>
> >>>
> >>>    struct tcf_result {
> >>> +     u32 verdict;
> >>>        union {
> >>>                struct {
> >>>                        unsigned long   class;
> >>> @@ -336,6 +337,12 @@ struct tcf_result {
> >>>        };
> >>>    };
> >>>
> >>> +static inline void tcf_result_set_verdict(struct tcf_result *res,
> >>> +                                       const u32 verdict)
> >>> +{
> >>> +     res->verdict = verdict;
> >>> +}
> >>> +
> >>>    struct tcf_chain;
> >>>
> >>>    struct tcf_proto_ops {
> >>> diff --git a/net/core/dev.c b/net/core/dev.c
> >>> index ccff2b6ef958..1450f4741d9b 100644
> >>> --- a/net/core/dev.c
> >>> +++ b/net/core/dev.c
> >>> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
> >>>    #endif /* CONFIG_NET_EGRESS */
> >>>
> >>>    #ifdef CONFIG_NET_XGRESS
> >>> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> >>> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
> >>> +               struct tcf_result *res)
> >>>    {
> >>> -     int ret = TC_ACT_UNSPEC;
> >>> +     int ret = 0;
> >>>    #ifdef CONFIG_NET_CLS_ACT
> >>>        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
> >>> -     struct tcf_result res;
> >>>
> >>> -     if (!miniq)
> >>> +     if (!miniq) {
> >>> +             tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >>>                return ret;
> >>> +     }
> >>>
> >>>        tc_skb_cb(skb)->mru = 0;
> >>>        tc_skb_cb(skb)->post_ct = false;
> >>>
> >>>        mini_qdisc_bstats_cpu_update(miniq, skb);
> >>> -     ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> >>> +     ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
> >>> +     if (ret < 0) {
> >>> +             mini_qdisc_qstats_cpu_drop(miniq);
> >>> +             return ret;
> >>> +     }
> >>>        /* Only tcf related quirks below. */
> >>> -     switch (ret) {
> >>> +     switch (res->verdict) {
> >>>        case TC_ACT_SHOT:
> >>>                mini_qdisc_qstats_cpu_drop(miniq);
> >>>                break;
> >>>        case TC_ACT_OK:
> >>>        case TC_ACT_RECLASSIFY:
> >>> -             skb->tc_index = TC_H_MIN(res.classid);
> >>> +             skb->tc_index = TC_H_MIN(res->classid);
> >>>                break;
> >>>        }
> >>> +#else
> >>> +     tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >>>    #endif /* CONFIG_NET_CLS_ACT */
> >>>        return ret;
> >>>    }
> >>> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >>>                   struct net_device *orig_dev, bool *another)
> >>>    {
> >>>        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
> >>> +     struct tcf_result res = {0};
> >>>        int sch_ret;
> >>>
> >>>        if (!entry)
> >>> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >>>                if (sch_ret != TC_ACT_UNSPEC)
> >>>                        goto ingress_verdict;
> >>>        }
> >>> -     sch_ret = tc_run(tcx_entry(entry), skb);
> >>> +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
> >>> +     if (sch_ret < 0) {
> >>> +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> >>> +             *ret = NET_RX_DROP;
> >>> +             return NULL;
> >>> +     }
> >>>    ingress_verdict:
> >>> -     switch (sch_ret) {
> >>> +     switch (res.verdict) {
> >>
> >> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
> >> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
> >> in such case.
> >
> > I think it is valuable to have a good reason code like
> > SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs
> > verdicts in the case of tc_run() variant.
> > For tcx_run(), does this look ok (for consistency)?:
> >
> > if (static_branch_unlikely(&tcx_needed_key)) {
> >                  sch_ret = tcx_run(entry, skb, true);
> >                  if (sch_ret != TC_ACT_UNSPEC) {
> >                          res.verdict = sch_ret;
> >                          goto ingress_verdict;
> >                  }
> > }
>
> In the above case we don't have 'internal' errors which you want to trace, so I would
> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> every packet.

We can move the zeroing inside tc_run() but we declare it in the same
spot as we do right now. You will still need to set res.verdict as
above.
Would that work for you?

> I was more thinking like something below could be a better choice. I presume your main
> goal is to trace where these errors originated in the first place, so it might even be
> useful to capture the actual return code as well.

The main motivation is a few syzkaller bugs which resulted in not
disambiguating between errors being returned and sometimes
TC_ACT_SHOT.

> Then you can use perf script, bpf and whatnot to gather further insights into what
> happened while being less invasive and avoiding the need to extend struct tcf_result.
>

We could use trace instead - the reason we have the skb reason is
being used in the other spots (does this trace require ebpf to be
usable?).

> This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
> that in fast path all errors are < TC_ACT_UNSPEC anyway.
>

I am not sure i followed. 0 means success, result codes are returned in res now.

cheers,
jamal

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 85df22f05c38..4089d195144d 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
>
>         mini_qdisc_bstats_cpu_update(miniq, skb);
>         ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> +       if (unlikely(ret < TC_ACT_UNSPEC)) {
> +               trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
> +               ret = TC_ACT_SHOT;
> +       }
>         /* Only tcf related quirks below. */
>         switch (ret) {
>         case TC_ACT_SHOT:
>
> Best,
> Daniel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-09-25 23:01       ` Jamal Hadi Salim
@ 2023-09-29 15:48         ` Daniel Borkmann
  2023-10-02 19:54           ` Jamal Hadi Salim
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-09-29 15:48 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
[...]
>>
>> In the above case we don't have 'internal' errors which you want to trace, so I would
>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
>> every packet.
> 
> We can move the zeroing inside tc_run() but we declare it in the same
> spot as we do right now. You will still need to set res.verdict as
> above.
> Would that work for you?

What I'm not following is that with the below you can avoid the unnecessary
fast path cost (which is only for corner case which is almost never hit) and
get even better visibility. Are you saying it doesn't work?

>> I was more thinking like something below could be a better choice. I presume your main
>> goal is to trace where these errors originated in the first place, so it might even be
>> useful to capture the actual return code as well.
> 
> The main motivation is a few syzkaller bugs which resulted in not
> disambiguating between errors being returned and sometimes
> TC_ACT_SHOT.
> 
>> Then you can use perf script, bpf and whatnot to gather further insights into what
>> happened while being less invasive and avoiding the need to extend struct tcf_result.
> 
> We could use trace instead - the reason we have the skb reason is
> being used in the other spots (does this trace require ebpf to be
> usable?).

No you can just use regular perf by attaching to the tracepoint, no need for using
bpf at all here.

>> This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
>> that in fast path all errors are < TC_ACT_UNSPEC anyway.
> 
> I am not sure i followed. 0 means success, result codes are returned in res now.

What I was saying is that you don't need the struct change from the patch, but only
the changes where you rework TC_ACT_SHOT into one of the -E<errors>, and then with
the below you can pass this through an exception tracepoint.

>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 85df22f05c38..4089d195144d 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
>>
>>          mini_qdisc_bstats_cpu_update(miniq, skb);
>>          ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
>> +       if (unlikely(ret < TC_ACT_UNSPEC)) {
>> +               trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
>> +               ret = TC_ACT_SHOT;
>> +       }
>>          /* Only tcf related quirks below. */
>>          switch (ret) {
>>          case TC_ACT_SHOT:

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-09-29 15:48         ` Daniel Borkmann
@ 2023-10-02 19:54           ` Jamal Hadi Salim
  2023-10-03  9:00             ` Daniel Borkmann
  0 siblings, 1 reply; 19+ messages in thread
From: Jamal Hadi Salim @ 2023-10-02 19:54 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

Hi Daniel,
On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> > On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> >>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> [...]
> >>
> >> In the above case we don't have 'internal' errors which you want to trace, so I would
> >> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> >> every packet.
> >
> > We can move the zeroing inside tc_run() but we declare it in the same
> > spot as we do right now. You will still need to set res.verdict as
> > above.
> > Would that work for you?
>
> What I'm not following is that with the below you can avoid the unnecessary
> fast path cost (which is only for corner case which is almost never hit) and
> get even better visibility. Are you saying it doesn't work?

I am probably missing something:
-1/UNSPEC is a legit errno. And the main motivation here for this
patch is to disambiguate if it was -EPERM vs UNSPEC
Maybe that is what you are calling a "corner case"?

There are two options in my mind right now (since you are guaranteed
in tcx_run you will never return anything below UNSPEC):
1) we just have the switch statement invocation inside an inline
function and you can pass it sch_ret (for tcx case) and we'll pass it
res.verdit for tc_run() case.
2) is something is we leave tcx_run alone and we have something along
the lines of:

--------------
diff --git a/net/core/dev.c b/net/core/dev.c
index 1450f4741d9b..93613bce647c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
 {
        struct bpf_mprog_entry *entry =
rcu_dereference_bh(skb->dev->tcx_ingress);
-       struct tcf_result res = {0};
+       struct tcf_result res;
        int sch_ret;

        if (!entry)
@@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
packet_type **pt_prev, int *ret,
                if (sch_ret != TC_ACT_UNSPEC)
                        goto ingress_verdict;
        }
+
+       res.verdict = 0;
        sch_ret = tc_run(tcx_entry(entry), skb, &res);
        if (sch_ret < 0) {
                kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
                *ret = NET_RX_DROP;
                return NULL;
        }
+       sch_ret = res.verdict;
 ingress_verdict:
-       switch (res.verdict) {
+       switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* skb_mac_header check was done by BPF, so we can
safely
                 * push the L2 header back before redirecting to another
-----------

on the drop reason - our thinking is to support drop_watch alongside
tracepoint given kfree_skb_reason exists already; if i am not mistaken
what you suggested would require us to create a new tracepoint?

cheers,
jamal

> >> I was more thinking like something below could be a better choice. I presume your main
> >> goal is to trace where these errors originated in the first place, so it might even be
> >> useful to capture the actual return code as well.
> >
> > The main motivation is a few syzkaller bugs which resulted in not
> > disambiguating between errors being returned and sometimes
> > TC_ACT_SHOT.
> >
> >> Then you can use perf script, bpf and whatnot to gather further insights into what
> >> happened while being less invasive and avoiding the need to extend struct tcf_result.
> >
> > We could use trace instead - the reason we have the skb reason is
> > being used in the other spots (does this trace require ebpf to be
> > usable?).
>
> No you can just use regular perf by attaching to the tracepoint, no need for using
> bpf at all here.
>
> >> This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
> >> that in fast path all errors are < TC_ACT_UNSPEC anyway.
> >
> > I am not sure i followed. 0 means success, result codes are returned in res now.
>
> What I was saying is that you don't need the struct change from the patch, but only
> the changes where you rework TC_ACT_SHOT into one of the -E<errors>, and then with
> the below you can pass this through an exception tracepoint.
>
> >> diff --git a/net/core/dev.c b/net/core/dev.c
> >> index 85df22f05c38..4089d195144d 100644
> >> --- a/net/core/dev.c
> >> +++ b/net/core/dev.c
> >> @@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> >>
> >>          mini_qdisc_bstats_cpu_update(miniq, skb);
> >>          ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> >> +       if (unlikely(ret < TC_ACT_UNSPEC)) {
> >> +               trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
> >> +               ret = TC_ACT_SHOT;
> >> +       }
> >>          /* Only tcf related quirks below. */
> >>          switch (ret) {
> >>          case TC_ACT_SHOT:
>
> Thanks,
> Daniel

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-02 19:54           ` Jamal Hadi Salim
@ 2023-10-03  9:00             ` Daniel Borkmann
  2023-10-03 12:46               ` Jamal Hadi Salim
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-10-03  9:00 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
> On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
>>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
>>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>> [...]
>>>>
>>>> In the above case we don't have 'internal' errors which you want to trace, so I would
>>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
>>>> every packet.
>>>
>>> We can move the zeroing inside tc_run() but we declare it in the same
>>> spot as we do right now. You will still need to set res.verdict as
>>> above.
>>> Would that work for you?
>>
>> What I'm not following is that with the below you can avoid the unnecessary
>> fast path cost (which is only for corner case which is almost never hit) and
>> get even better visibility. Are you saying it doesn't work?
> 
> I am probably missing something:
> -1/UNSPEC is a legit errno. And the main motivation here for this
> patch is to disambiguate if it was -EPERM vs UNSPEC
> Maybe that is what you are calling a "corner case"?

Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
be audited for the code in the tree and therefore avoided so that you never run into
this problem.

> There are two options in my mind right now (since you are guaranteed
> in tcx_run you will never return anything below UNSPEC):
> 1) we just have the switch statement invocation inside an inline
> function and you can pass it sch_ret (for tcx case) and we'll pass it
> res.verdit for tc_run() case.
> 2) is something is we leave tcx_run alone and we have something along
> the lines of:
> 
> --------------
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1450f4741d9b..93613bce647c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
> packet_type **pt_prev, int *ret,
>                     struct net_device *orig_dev, bool *another)
>   {
>          struct bpf_mprog_entry *entry =
> rcu_dereference_bh(skb->dev->tcx_ingress);
> -       struct tcf_result res = {0};
> +       struct tcf_result res;
>          int sch_ret;
> 
>          if (!entry)
> @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
> packet_type **pt_prev, int *ret,
>                  if (sch_ret != TC_ACT_UNSPEC)
>                          goto ingress_verdict;
>          }
> +
> +       res.verdict = 0;
>          sch_ret = tc_run(tcx_entry(entry), skb, &res);
>          if (sch_ret < 0) {
>                  kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
>                  *ret = NET_RX_DROP;
>                  return NULL;
>          }
> +       sch_ret = res.verdict;
>   ingress_verdict:
> -       switch (res.verdict) {
> +       switch (sch_ret) {
>          case TC_ACT_REDIRECT:
>                  /* skb_mac_header check was done by BPF, so we can
> safely
>                   * push the L2 header back before redirecting to another
> -----------
> 
> on the drop reason - our thinking is to support drop_watch alongside
> tracepoint given kfree_skb_reason exists already; if i am not mistaken
> what you suggested would require us to create a new tracepoint?

So if the only thing you really care about is the different drop reason for
kfree_skb_reason, then I still don't follow why you need to drag this into
struct tcf_result. This can be done in a much simpler and more efficient way
like the following:

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..b1c069c8e7f2 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -80,6 +80,8 @@
  	FN(IPV6_NDISC_BAD_OPTIONS)	\
  	FN(IPV6_NDISC_NS_OTHERHOST)	\
  	FN(QUEUE_PURGE)			\
+	FN(TC_EGRESS_ERROR)		\
+	FN(TC_INGRESS_ERROR)		\
  	FNe(MAX)

  /**
@@ -345,6 +347,10 @@ enum skb_drop_reason {
  	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
  	/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
  	SKB_DROP_REASON_QUEUE_PURGE,
+	/** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
+	SKB_DROP_REASON_TC_EGRESS_ERROR,
+	/** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
+	SKB_DROP_REASON_TC_INGRESS_ERROR,
  	/**
  	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
  	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f308e8268651..cd2444dd3745 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -10,6 +10,7 @@

  /* TC action not accessible from user space */
  #define TC_ACT_CONSUMED		(TC_ACT_VALUE_MAX + 1)
+#define TC_ACT_ABORT		(TC_ACT_VALUE_MAX + 2)

  /* Basic packet classifier frontend definitions. */

diff --git a/net/core/dev.c b/net/core/dev.c
index 85df22f05c38..3abb4d71c170 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		*ret = NET_RX_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+	case TC_ACT_ABORT:
+		kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
+				 SKB_DROP_REASON_TC_INGRESS :
+				 SKB_DROP_REASON_TC_INGRESS_ERROR);
  		*ret = NET_RX_DROP;
  		return NULL;
  	/* used by tc_run */
@@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  		*ret = NET_XMIT_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+	case TC_ACT_ABORT:
+		kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
+				 SKB_DROP_REASON_TC_EGRESS :
+				 SKB_DROP_REASON_TC_EGRESS_ERROR);
  		*ret = NET_XMIT_DROP;
  		return NULL;
  	/* used by tc_run */

Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
and you'll get the same result to make it observable for dropwatch.

Thanks,
Daniel

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-03  9:00             ` Daniel Borkmann
@ 2023-10-03 12:46               ` Jamal Hadi Salim
  2023-10-03 13:49                 ` Daniel Borkmann
  0 siblings, 1 reply; 19+ messages in thread
From: Jamal Hadi Salim @ 2023-10-03 12:46 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On Tue, Oct 3, 2023 at 5:00 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
> > On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> >>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> >>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> >> [...]
> >>>>
> >>>> In the above case we don't have 'internal' errors which you want to trace, so I would
> >>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> >>>> every packet.
> >>>
> >>> We can move the zeroing inside tc_run() but we declare it in the same
> >>> spot as we do right now. You will still need to set res.verdict as
> >>> above.
> >>> Would that work for you?
> >>
> >> What I'm not following is that with the below you can avoid the unnecessary
> >> fast path cost (which is only for corner case which is almost never hit) and
> >> get even better visibility. Are you saying it doesn't work?
> >
> > I am probably missing something:
> > -1/UNSPEC is a legit errno. And the main motivation here for this
> > patch is to disambiguate if it was -EPERM vs UNSPEC
> > Maybe that is what you are calling a "corner case"?
>
> Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
> be audited for the code in the tree and therefore avoided so that you never run into
> this problem.

I am sorry but i am not in favor of this approach.
You are suggesting audits are the way to go forward when in fact lack
of said audits is what got us in this trouble with syzkaller to begin
with. We cant rely on tribal knowledge to be able to spot these
discrepancies. The elder of the tribe may move to a different mountain
at some point and TheLinuxWay(tm) is cutnpaste, so i dont see this as
long term good for maintainance. We have a clear distinction between
an error vs verdict - lets use that.
We really dont want to make this a special case just for eBPF and how
to make it a happy world for eBPF at the cost of everyone else. I made
a suggestion of leaving tcx alone, you can do your own thing there;
but for tc_run my view is we should keep it generic.

cheers,
jamal

> > There are two options in my mind right now (since you are guaranteed
> > in tcx_run you will never return anything below UNSPEC):
> > 1) we just have the switch statement invocation inside an inline
> > function and you can pass it sch_ret (for tcx case) and we'll pass it
> > res.verdit for tc_run() case.
> > 2) is something is we leave tcx_run alone and we have something along
> > the lines of:
> >
> > --------------
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 1450f4741d9b..93613bce647c 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
> > packet_type **pt_prev, int *ret,
> >                     struct net_device *orig_dev, bool *another)
> >   {
> >          struct bpf_mprog_entry *entry =
> > rcu_dereference_bh(skb->dev->tcx_ingress);
> > -       struct tcf_result res = {0};
> > +       struct tcf_result res;
> >          int sch_ret;
> >
> >          if (!entry)
> > @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
> > packet_type **pt_prev, int *ret,
> >                  if (sch_ret != TC_ACT_UNSPEC)
> >                          goto ingress_verdict;
> >          }
> > +
> > +       res.verdict = 0;
> >          sch_ret = tc_run(tcx_entry(entry), skb, &res);
> >          if (sch_ret < 0) {
> >                  kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> >                  *ret = NET_RX_DROP;
> >                  return NULL;
> >          }
> > +       sch_ret = res.verdict;
> >   ingress_verdict:
> > -       switch (res.verdict) {
> > +       switch (sch_ret) {
> >          case TC_ACT_REDIRECT:
> >                  /* skb_mac_header check was done by BPF, so we can
> > safely
> >                   * push the L2 header back before redirecting to another
> > -----------
> >
> > on the drop reason - our thinking is to support drop_watch alongside
> > tracepoint given kfree_skb_reason exists already; if i am not mistaken
> > what you suggested would require us to create a new tracepoint?
>
> So if the only thing you really care about is the different drop reason for
> kfree_skb_reason, then I still don't follow why you need to drag this into
> struct tcf_result. This can be done in a much simpler and more efficient way
> like the following:
>
> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> index a587e83fc169..b1c069c8e7f2 100644
> --- a/include/net/dropreason-core.h
> +++ b/include/net/dropreason-core.h
> @@ -80,6 +80,8 @@
>         FN(IPV6_NDISC_BAD_OPTIONS)      \
>         FN(IPV6_NDISC_NS_OTHERHOST)     \
>         FN(QUEUE_PURGE)                 \
> +       FN(TC_EGRESS_ERROR)             \
> +       FN(TC_INGRESS_ERROR)            \
>         FNe(MAX)
>
>   /**
> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>         SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>         /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>         SKB_DROP_REASON_QUEUE_PURGE,
> +       /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> +       SKB_DROP_REASON_TC_EGRESS_ERROR,
> +       /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> +       SKB_DROP_REASON_TC_INGRESS_ERROR,
>         /**
>          * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>          * shouldn't be used as a real 'reason' - only for tracing code gen
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index f308e8268651..cd2444dd3745 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -10,6 +10,7 @@
>
>   /* TC action not accessible from user space */
>   #define TC_ACT_CONSUMED               (TC_ACT_VALUE_MAX + 1)
> +#define TC_ACT_ABORT           (TC_ACT_VALUE_MAX + 2)
>
>   /* Basic packet classifier frontend definitions. */
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 85df22f05c38..3abb4d71c170 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>                 *ret = NET_RX_SUCCESS;
>                 return NULL;
>         case TC_ACT_SHOT:
> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
> +       case TC_ACT_ABORT:
> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> +                                SKB_DROP_REASON_TC_INGRESS :
> +                                SKB_DROP_REASON_TC_INGRESS_ERROR);
>                 *ret = NET_RX_DROP;
>                 return NULL;
>         /* used by tc_run */
> @@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>                 *ret = NET_XMIT_SUCCESS;
>                 return NULL;
>         case TC_ACT_SHOT:
> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
> +       case TC_ACT_ABORT:
> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> +                                SKB_DROP_REASON_TC_EGRESS :
> +                                SKB_DROP_REASON_TC_EGRESS_ERROR);
>                 *ret = NET_XMIT_DROP;
>                 return NULL;
>         /* used by tc_run */
>
> Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
> and you'll get the same result to make it observable for dropwatch.
>
> Thanks,
> Daniel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-03 12:46               ` Jamal Hadi Salim
@ 2023-10-03 13:49                 ` Daniel Borkmann
  2023-10-03 21:36                   ` Jamal Hadi Salim
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-10-03 13:49 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On 10/3/23 2:46 PM, Jamal Hadi Salim wrote:
> On Tue, Oct 3, 2023 at 5:00 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
>>> On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
>>>>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
>>>>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>>>> [...]
>>>>>>
>>>>>> In the above case we don't have 'internal' errors which you want to trace, so I would
>>>>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
>>>>>> every packet.
>>>>>
>>>>> We can move the zeroing inside tc_run() but we declare it in the same
>>>>> spot as we do right now. You will still need to set res.verdict as
>>>>> above.
>>>>> Would that work for you?
>>>>
>>>> What I'm not following is that with the below you can avoid the unnecessary
>>>> fast path cost (which is only for corner case which is almost never hit) and
>>>> get even better visibility. Are you saying it doesn't work?
>>>
>>> I am probably missing something:
>>> -1/UNSPEC is a legit errno. And the main motivation here for this
>>> patch is to disambiguate if it was -EPERM vs UNSPEC
>>> Maybe that is what you are calling a "corner case"?
>>
>> Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
>> be audited for the code in the tree and therefore avoided so that you never run into
>> this problem.
> 
> I am sorry but i am not in favor of this approach.
> You are suggesting audits are the way to go forward when in fact lack
> of said audits is what got us in this trouble with syzkaller to begin
> with. We cant rely on tribal knowledge to be able to spot these
> discrepancies. The elder of the tribe may move to a different mountain
> at some point and TheLinuxWay(tm) is cutnpaste, so i dont see this as
> long term good for maintainance. We have a clear distinction between
> an error vs verdict - lets use that.
> We really dont want to make this a special case just for eBPF and how
> to make it a happy world for eBPF at the cost of everyone else. I made
> a suggestion of leaving tcx alone, you can do your own thing there;
> but for tc_run my view is we should keep it generic.

Jamal, before you come to early conclusions, it would be great if you also
read until the end of the email, because what I suggested below *is* generic
and with less churn throughout the code base.

>>> There are two options in my mind right now (since you are guaranteed
>>> in tcx_run you will never return anything below UNSPEC):
>>> 1) we just have the switch statement invocation inside an inline
>>> function and you can pass it sch_ret (for tcx case) and we'll pass it
>>> res.verdit for tc_run() case.
>>> 2) is something is we leave tcx_run alone and we have something along
>>> the lines of:
>>>
>>> --------------
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 1450f4741d9b..93613bce647c 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
>>> packet_type **pt_prev, int *ret,
>>>                      struct net_device *orig_dev, bool *another)
>>>    {
>>>           struct bpf_mprog_entry *entry =
>>> rcu_dereference_bh(skb->dev->tcx_ingress);
>>> -       struct tcf_result res = {0};
>>> +       struct tcf_result res;
>>>           int sch_ret;
>>>
>>>           if (!entry)
>>> @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
>>> packet_type **pt_prev, int *ret,
>>>                   if (sch_ret != TC_ACT_UNSPEC)
>>>                           goto ingress_verdict;
>>>           }
>>> +
>>> +       res.verdict = 0;
>>>           sch_ret = tc_run(tcx_entry(entry), skb, &res);
>>>           if (sch_ret < 0) {
>>>                   kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
>>>                   *ret = NET_RX_DROP;
>>>                   return NULL;
>>>           }
>>> +       sch_ret = res.verdict;
>>>    ingress_verdict:
>>> -       switch (res.verdict) {
>>> +       switch (sch_ret) {
>>>           case TC_ACT_REDIRECT:
>>>                   /* skb_mac_header check was done by BPF, so we can
>>> safely
>>>                    * push the L2 header back before redirecting to another
>>> -----------
>>>
>>> on the drop reason - our thinking is to support drop_watch alongside
>>> tracepoint given kfree_skb_reason exists already; if i am not mistaken
>>> what you suggested would require us to create a new tracepoint?
>>
>> So if the only thing you really care about is the different drop reason for
>> kfree_skb_reason, then I still don't follow why you need to drag this into
>> struct tcf_result. This can be done in a much simpler and more efficient way
>> like the following:
>>
>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
>> index a587e83fc169..b1c069c8e7f2 100644
>> --- a/include/net/dropreason-core.h
>> +++ b/include/net/dropreason-core.h
>> @@ -80,6 +80,8 @@
>>          FN(IPV6_NDISC_BAD_OPTIONS)      \
>>          FN(IPV6_NDISC_NS_OTHERHOST)     \
>>          FN(QUEUE_PURGE)                 \
>> +       FN(TC_EGRESS_ERROR)             \
>> +       FN(TC_INGRESS_ERROR)            \
>>          FNe(MAX)
>>
>>    /**
>> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>>          SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>>          /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>>          SKB_DROP_REASON_QUEUE_PURGE,
>> +       /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
>> +       SKB_DROP_REASON_TC_EGRESS_ERROR,
>> +       /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
>> +       SKB_DROP_REASON_TC_INGRESS_ERROR,
>>          /**
>>           * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>>           * shouldn't be used as a real 'reason' - only for tracing code gen
>> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
>> index f308e8268651..cd2444dd3745 100644
>> --- a/include/net/pkt_cls.h
>> +++ b/include/net/pkt_cls.h
>> @@ -10,6 +10,7 @@
>>
>>    /* TC action not accessible from user space */
>>    #define TC_ACT_CONSUMED               (TC_ACT_VALUE_MAX + 1)
>> +#define TC_ACT_ABORT           (TC_ACT_VALUE_MAX + 2)
>>
>>    /* Basic packet classifier frontend definitions. */
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 85df22f05c38..3abb4d71c170 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>                  *ret = NET_RX_SUCCESS;
>>                  return NULL;
>>          case TC_ACT_SHOT:
>> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
>> +       case TC_ACT_ABORT:
>> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
>> +                                SKB_DROP_REASON_TC_INGRESS :
>> +                                SKB_DROP_REASON_TC_INGRESS_ERROR);
>>                  *ret = NET_RX_DROP;
>>                  return NULL;
>>          /* used by tc_run */
>> @@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>>                  *ret = NET_XMIT_SUCCESS;
>>                  return NULL;
>>          case TC_ACT_SHOT:
>> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
>> +       case TC_ACT_ABORT:
>> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
>> +                                SKB_DROP_REASON_TC_EGRESS :
>> +                                SKB_DROP_REASON_TC_EGRESS_ERROR);
>>                  *ret = NET_XMIT_DROP;
>>                  return NULL;
>>          /* used by tc_run */
>>
>> Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
>> and you'll get the same result to make it observable for dropwatch.
>>
>> Thanks,
>> Daniel


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-03 13:49                 ` Daniel Borkmann
@ 2023-10-03 21:36                   ` Jamal Hadi Salim
  2023-10-06 11:18                     ` Daniel Borkmann
  0 siblings, 1 reply; 19+ messages in thread
From: Jamal Hadi Salim @ 2023-10-03 21:36 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

On Tue, Oct 3, 2023 at 9:49 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 10/3/23 2:46 PM, Jamal Hadi Salim wrote:
> > On Tue, Oct 3, 2023 at 5:00 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
> >>> On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> >>>>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> >>>>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> >>>> [...]
> >>>>>>
> >>>>>> In the above case we don't have 'internal' errors which you want to trace, so I would
> >>>>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> >>>>>> every packet.
> >>>>>
> >>>>> We can move the zeroing inside tc_run() but we declare it in the same
> >>>>> spot as we do right now. You will still need to set res.verdict as
> >>>>> above.
> >>>>> Would that work for you?
> >>>>
> >>>> What I'm not following is that with the below you can avoid the unnecessary
> >>>> fast path cost (which is only for corner case which is almost never hit) and
> >>>> get even better visibility. Are you saying it doesn't work?
> >>>
> >>> I am probably missing something:
> >>> -1/UNSPEC is a legit errno. And the main motivation here for this
> >>> patch is to disambiguate if it was -EPERM vs UNSPEC
> >>> Maybe that is what you are calling a "corner case"?
> >>
> >> Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
> >> be audited for the code in the tree and therefore avoided so that you never run into
> >> this problem.
> >
> > I am sorry but i am not in favor of this approach.
> > You are suggesting audits are the way to go forward when in fact lack
> > of said audits is what got us in this trouble with syzkaller to begin
> > with. We cant rely on tribal knowledge to be able to spot these
> > discrepancies. The elder of the tribe may move to a different mountain
> > at some point and TheLinuxWay(tm) is cutnpaste, so i dont see this as
> > long term good for maintainance. We have a clear distinction between
> > an error vs verdict - lets use that.
> > We really dont want to make this a special case just for eBPF and how
> > to make it a happy world for eBPF at the cost of everyone else. I made
> > a suggestion of leaving tcx alone, you can do your own thing there;
> > but for tc_run my view is we should keep it generic.
>
> Jamal, before you come to early conclusions, it would be great if you also
> read until the end of the email, because what I suggested below *is* generic
> and with less churn throughout the code base.
>

I did look, Daniel. You are lumping all the error codes into one -
which doesnt change my view on disambiguation. If i was to debug
closely and run kprobe now i am seeing only one error code
TC_ACT_ABORT instead of -EINVAL vs -ENOMEM, etc. Easier for me to find
the source manually (and possibly even better with Andrii's tool i saw
once if it would work in the datapath - iirc, i think it prints return
codes on the code paths).

cheers,
jamal

> >>> There are two options in my mind right now (since you are guaranteed
> >>> in tcx_run you will never return anything below UNSPEC):
> >>> 1) we just have the switch statement invocation inside an inline
> >>> function and you can pass it sch_ret (for tcx case) and we'll pass it
> >>> res.verdit for tc_run() case.
> >>> 2) is something is we leave tcx_run alone and we have something along
> >>> the lines of:
> >>>
> >>> --------------
> >>> diff --git a/net/core/dev.c b/net/core/dev.c
> >>> index 1450f4741d9b..93613bce647c 100644
> >>> --- a/net/core/dev.c
> >>> +++ b/net/core/dev.c
> >>> @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
> >>> packet_type **pt_prev, int *ret,
> >>>                      struct net_device *orig_dev, bool *another)
> >>>    {
> >>>           struct bpf_mprog_entry *entry =
> >>> rcu_dereference_bh(skb->dev->tcx_ingress);
> >>> -       struct tcf_result res = {0};
> >>> +       struct tcf_result res;
> >>>           int sch_ret;
> >>>
> >>>           if (!entry)
> >>> @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
> >>> packet_type **pt_prev, int *ret,
> >>>                   if (sch_ret != TC_ACT_UNSPEC)
> >>>                           goto ingress_verdict;
> >>>           }
> >>> +
> >>> +       res.verdict = 0;
> >>>           sch_ret = tc_run(tcx_entry(entry), skb, &res);
> >>>           if (sch_ret < 0) {
> >>>                   kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> >>>                   *ret = NET_RX_DROP;
> >>>                   return NULL;
> >>>           }
> >>> +       sch_ret = res.verdict;
> >>>    ingress_verdict:
> >>> -       switch (res.verdict) {
> >>> +       switch (sch_ret) {
> >>>           case TC_ACT_REDIRECT:
> >>>                   /* skb_mac_header check was done by BPF, so we can
> >>> safely
> >>>                    * push the L2 header back before redirecting to another
> >>> -----------
> >>>
> >>> on the drop reason - our thinking is to support drop_watch alongside
> >>> tracepoint given kfree_skb_reason exists already; if i am not mistaken
> >>> what you suggested would require us to create a new tracepoint?
> >>
> >> So if the only thing you really care about is the different drop reason for
> >> kfree_skb_reason, then I still don't follow why you need to drag this into
> >> struct tcf_result. This can be done in a much simpler and more efficient way
> >> like the following:
> >>
> >> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> >> index a587e83fc169..b1c069c8e7f2 100644
> >> --- a/include/net/dropreason-core.h
> >> +++ b/include/net/dropreason-core.h
> >> @@ -80,6 +80,8 @@
> >>          FN(IPV6_NDISC_BAD_OPTIONS)      \
> >>          FN(IPV6_NDISC_NS_OTHERHOST)     \
> >>          FN(QUEUE_PURGE)                 \
> >> +       FN(TC_EGRESS_ERROR)             \
> >> +       FN(TC_INGRESS_ERROR)            \
> >>          FNe(MAX)
> >>
> >>    /**
> >> @@ -345,6 +347,10 @@ enum skb_drop_reason {
> >>          SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
> >>          /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
> >>          SKB_DROP_REASON_QUEUE_PURGE,
> >> +       /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> >> +       SKB_DROP_REASON_TC_EGRESS_ERROR,
> >> +       /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> >> +       SKB_DROP_REASON_TC_INGRESS_ERROR,
> >>          /**
> >>           * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
> >>           * shouldn't be used as a real 'reason' - only for tracing code gen
> >> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> >> index f308e8268651..cd2444dd3745 100644
> >> --- a/include/net/pkt_cls.h
> >> +++ b/include/net/pkt_cls.h
> >> @@ -10,6 +10,7 @@
> >>
> >>    /* TC action not accessible from user space */
> >>    #define TC_ACT_CONSUMED               (TC_ACT_VALUE_MAX + 1)
> >> +#define TC_ACT_ABORT           (TC_ACT_VALUE_MAX + 2)
> >>
> >>    /* Basic packet classifier frontend definitions. */
> >>
> >> diff --git a/net/core/dev.c b/net/core/dev.c
> >> index 85df22f05c38..3abb4d71c170 100644
> >> --- a/net/core/dev.c
> >> +++ b/net/core/dev.c
> >> @@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >>                  *ret = NET_RX_SUCCESS;
> >>                  return NULL;
> >>          case TC_ACT_SHOT:
> >> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
> >> +       case TC_ACT_ABORT:
> >> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> >> +                                SKB_DROP_REASON_TC_INGRESS :
> >> +                                SKB_DROP_REASON_TC_INGRESS_ERROR);
> >>                  *ret = NET_RX_DROP;
> >>                  return NULL;
> >>          /* used by tc_run */
> >> @@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
> >>                  *ret = NET_XMIT_SUCCESS;
> >>                  return NULL;
> >>          case TC_ACT_SHOT:
> >> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
> >> +       case TC_ACT_ABORT:
> >> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> >> +                                SKB_DROP_REASON_TC_EGRESS :
> >> +                                SKB_DROP_REASON_TC_EGRESS_ERROR);
> >>                  *ret = NET_XMIT_DROP;
> >>                  return NULL;
> >>          /* used by tc_run */
> >>
> >> Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
> >> and you'll get the same result to make it observable for dropwatch.
> >>
> >> Thanks,
> >> Daniel
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-03 21:36                   ` Jamal Hadi Salim
@ 2023-10-06 11:18                     ` Daniel Borkmann
  2023-10-06 13:32                       ` Jakub Kicinski
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-10-06 11:18 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, kuba,
	pabeni, paulb, netdev, kernel, martin.lau, bpf

Hi Jamal,

On 10/3/23 11:36 PM, Jamal Hadi Salim wrote:
[...]
> I did look, Daniel. You are lumping all the error codes into one -
> which doesnt change my view on disambiguation. If i was to debug
> closely and run kprobe now i am seeing only one error code
> TC_ACT_ABORT instead of -EINVAL vs -ENOMEM, etc. Easier for me to find
> the source manually (and possibly even better with Andrii's tool i saw
> once if it would work in the datapath - iirc, i think it prints return
> codes on the code paths).

That should be possible with some work this way, agree. I've been toying a bit
more on this issue, and actually there is an even better way which would cleanly
solve all use cases and we likely would utilize it for bpf as well in future.
I wasn't aware of it before, but the drop reason actually has per-subsystem infra
already which so far only mac80211 and ovs makes use of.

I wrote up below patch as a starting point to get the base infra up and with
TC_DROP_MAX_RECLASSIFY as the initial example on how to utilize it. Then you can
simply just use regular tooling and get more detailed kfree_skb_reason() codes,
which would also remove the need for kprobes/kretprobes to gather the error.

Let me know if this looks like a good path forward, then I'll cook a proper one
and you or Victor can extend it further with more drop reasons. The nice thing is
also that this can be extended successively with more reasons whenever needed.

Best & thanks,
Daniel

 From d62b4a52f9c725d4a63d5c76a576d4e3bbbea4ef Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 6 Oct 2023 08:42:19 +0000
Subject: [PATCH net-next] net, tc: Add extensible drop reason subsystem codes

[ commit msg tbd ]

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
  include/net/dropreason-core.h |  4 ++--
  include/net/dropreason.h      |  6 ++++++
  include/net/sch_drop.h        | 35 +++++++++++++++++++++++++++++++++++
  include/net/sch_generic.h     | 11 +++++++++--
  net/core/dev.c                | 15 ++++++++++-----
  net/sched/cls_api.c           |  1 +
  6 files changed, 63 insertions(+), 9 deletions(-)
  create mode 100644 include/net/sch_drop.h

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..670eac9923aa 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -235,7 +235,7 @@ enum skb_drop_reason {
  	SKB_DROP_REASON_NEIGH_QUEUEFULL,
  	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
  	SKB_DROP_REASON_NEIGH_DEAD,
-	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
+	/** @SKB_DROP_REASON_TC_EGRESS: Unused, see TC_DROP_EGRESS instead */
  	SKB_DROP_REASON_TC_EGRESS,
  	/**
  	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
@@ -250,7 +250,7 @@ enum skb_drop_reason {
  	SKB_DROP_REASON_CPU_BACKLOG,
  	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
  	SKB_DROP_REASON_XDP,
-	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
+	/** @SKB_DROP_REASON_TC_INGRESS: Unused, see TC_DROP_INGRESS instead */
  	SKB_DROP_REASON_TC_INGRESS,
  	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
  	SKB_DROP_REASON_UNHANDLED_PROTO,
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index 56cb7be92244..434ed2124836 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -29,6 +29,12 @@ enum skb_drop_reason_subsys {
  	 */
  	SKB_DROP_REASON_SUBSYS_OPENVSWITCH,

+	/**
+	 * @SKB_DROP_REASON_SUBSYS_TC: traffic control (tc) drop reasons,
+	 * see include/net/sch_drop.h
+	 */
+	SKB_DROP_REASON_SUBSYS_TC,
+
  	/** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */
  	SKB_DROP_REASON_SUBSYS_NUM
  };
diff --git a/include/net/sch_drop.h b/include/net/sch_drop.h
new file mode 100644
index 000000000000..c2471a62c10b
--- /dev/null
+++ b/include/net/sch_drop.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Traffic control drop reason list. */
+
+#ifndef NET_SCH_DROP_H
+#define NET_SCH_DROP_H
+
+#include <linux/skbuff.h>
+#include <net/dropreason.h>
+
+/**
+ * @TC_DROP_INGRESS: dropped in tc ingress hook (generic, catch-all code)
+ * @TC_DROP_EGRESS: dropped in tc egress hook (generic, catch-all code)
+ * @TC_DROP_MAX_RECLASSIFY: dropped due to hitting maximum reclassify limit
+ */
+#define TC_DROP_REASONS(R)			\
+	R(TC_DROP_INGRESS)			\
+	R(TC_DROP_EGRESS)			\
+	R(TC_DROP_MAX_RECLASSIFY)		\
+	/* deliberate comment for trailing \ */
+
+enum tc_drop_reason {
+	__TC_DROP_REASON = SKB_DROP_REASON_SUBSYS_TC <<
+		SKB_DROP_REASON_SUBSYS_SHIFT,
+#define ENUM(x) x,
+	TC_DROP_REASONS(ENUM)
+#undef ENUM
+	TC_DROP_MAX,
+};
+
+static inline void
+tc_kfree_skb_reason(struct sk_buff *skb, enum tc_drop_reason reason)
+{
+	kfree_skb_reason(skb, (u32)reason);
+}
+#endif /* NET_SCH_DROP_H */
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c7318c73cfd6..e50a281ff1af 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -16,9 +16,11 @@
  #include <linux/rwsem.h>
  #include <linux/atomic.h>
  #include <linux/hashtable.h>
+
  #include <net/gen_stats.h>
  #include <net/rtnetlink.h>
  #include <net/flow_offload.h>
+#include <net/sch_drop.h>

  struct Qdisc_ops;
  struct qdisc_walker;
@@ -324,15 +326,14 @@ struct Qdisc_ops {
  	struct module		*owner;
  };

-
  struct tcf_result {
+	enum tc_drop_reason		drop_reason;
  	union {
  		struct {
  			unsigned long	class;
  			u32		classid;
  		};
  		const struct tcf_proto *goto_tp;
-
  	};
  };

@@ -667,6 +668,12 @@ static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
  	return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL;
  }

+static inline void tc_set_drop_reason(struct tcf_result *res,
+				      enum tc_drop_reason reason)
+{
+	res->drop_reason = reason;
+}
+
  int qdisc_class_hash_init(struct Qdisc_class_hash *);
  void qdisc_class_hash_insert(struct Qdisc_class_hash *,
  			     struct Qdisc_class_common *);
diff --git a/net/core/dev.c b/net/core/dev.c
index 606a366cc209..93cebe374082 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3910,7 +3910,8 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
  #endif /* CONFIG_NET_EGRESS */

  #ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+		  enum tc_drop_reason *drop_reason)
  {
  	int ret = TC_ACT_UNSPEC;
  #ifdef CONFIG_NET_CLS_ACT
@@ -3922,12 +3923,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)

  	tc_skb_cb(skb)->mru = 0;
  	tc_skb_cb(skb)->post_ct = false;
+	res.drop_reason = *drop_reason;

  	mini_qdisc_bstats_cpu_update(miniq, skb);
  	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
  	/* Only tcf related quirks below. */
  	switch (ret) {
  	case TC_ACT_SHOT:
+		*drop_reason = res.drop_reason;
  		mini_qdisc_qstats_cpu_drop(miniq);
  		break;
  	case TC_ACT_OK:
@@ -3977,6 +3980,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		   struct net_device *orig_dev, bool *another)
  {
  	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+	enum tc_drop_reason drop_reason = TC_DROP_INGRESS;
  	int sch_ret;

  	if (!entry)
@@ -3994,7 +3998,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		if (sch_ret != TC_ACT_UNSPEC)
  			goto ingress_verdict;
  	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
  ingress_verdict:
  	switch (sch_ret) {
  	case TC_ACT_REDIRECT:
@@ -4011,7 +4015,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		*ret = NET_RX_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+		tc_kfree_skb_reason(skb, drop_reason);
  		*ret = NET_RX_DROP;
  		return NULL;
  	/* used by tc_run */
@@ -4032,6 +4036,7 @@ static __always_inline struct sk_buff *
  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  {
  	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+	enum tc_drop_reason drop_reason = TC_DROP_EGRESS;
  	int sch_ret;

  	if (!entry)
@@ -4045,7 +4050,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  		if (sch_ret != TC_ACT_UNSPEC)
  			goto egress_verdict;
  	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
  egress_verdict:
  	switch (sch_ret) {
  	case TC_ACT_REDIRECT:
@@ -4054,7 +4059,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  		*ret = NET_XMIT_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+		tc_kfree_skb_reason(skb, drop_reason);
  		*ret = NET_XMIT_DROP;
  		return NULL;
  	/* used by tc_run */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a193cc7b3241..5d56ddb1462f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1723,6 +1723,7 @@ static inline int __tcf_classify(struct sk_buff *skb,
  				       tp->chain->block->index,
  				       tp->prio & 0xffff,
  				       ntohs(tp->protocol));
+		tc_set_drop_reason(res, TC_DROP_MAX_RECLASSIFY);
  		return TC_ACT_SHOT;
  	}

-- 
2.34.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 11:18                     ` Daniel Borkmann
@ 2023-10-06 13:32                       ` Jakub Kicinski
  2023-10-06 13:49                         ` Daniel Borkmann
  0 siblings, 1 reply; 19+ messages in thread
From: Jakub Kicinski @ 2023-10-06 13:32 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Jamal Hadi Salim, Victor Nogueira, xiyou.wangcong, jiri, davem,
	edumazet, pabeni, paulb, netdev, kernel, martin.lau, bpf

On Fri, 6 Oct 2023 13:18:40 +0200 Daniel Borkmann wrote:
> That should be possible with some work this way, agree. I've been toying a bit
> more on this issue, and actually there is an even better way which would cleanly
> solve all use cases and we likely would utilize it for bpf as well in future.
> I wasn't aware of it before, but the drop reason actually has per-subsystem infra
> already which so far only mac80211 and ovs makes use of.

FWIW I'm not sure if leaning into the subsys specific error codes for
something as core as TC is a good direction. I'd think that what
matters to the user is was it an intentional policy drop or some form
of an error / exception. More detailed info can come from stats.

Maybe I'm overly conservative because I don't care about debugging
mac80211 or ovs but do very much care about TC :) And I think Alastair 
(bpftrace) is working on auto-prettifying enums when bpftrace outputs
maps. So we can do something like:

$ bpftrace -e 'tracepoint:skb:kfree_skb { @[args->reason] = count(); }'
Attaching 1 probe...
^C

@[SKB_DROP_REASON_TC_INGRESS]: 2
@[SKB_CONSUMED]: 34

  ^^^^^^^^^^^^ names!!

Auto-magically.

Which will no longer work with the "pack multiple values into 
the reason" scheme of subsys-specific values :(

What I'm saying is that there is a trade-off here between providing 
as much info as possible vs basic user getting intelligible data..

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 13:32                       ` Jakub Kicinski
@ 2023-10-06 13:49                         ` Daniel Borkmann
  2023-10-06 14:12                           ` Jakub Kicinski
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-10-06 13:49 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Jamal Hadi Salim, Victor Nogueira, xiyou.wangcong, jiri, davem,
	edumazet, pabeni, paulb, netdev, kernel, martin.lau, bpf

On 10/6/23 3:32 PM, Jakub Kicinski wrote:
> On Fri, 6 Oct 2023 13:18:40 +0200 Daniel Borkmann wrote:
>> That should be possible with some work this way, agree. I've been toying a bit
>> more on this issue, and actually there is an even better way which would cleanly
>> solve all use cases and we likely would utilize it for bpf as well in future.
>> I wasn't aware of it before, but the drop reason actually has per-subsystem infra
>> already which so far only mac80211 and ovs makes use of.
> 
> FWIW I'm not sure if leaning into the subsys specific error codes for
> something as core as TC is a good direction. I'd think that what
> matters to the user is was it an intentional policy drop or some form
> of an error / exception. More detailed info can come from stats.
> 
> Maybe I'm overly conservative because I don't care about debugging
> mac80211 or ovs but do very much care about TC :) And I think Alastair
> (bpftrace) is working on auto-prettifying enums when bpftrace outputs
> maps. So we can do something like:
> 
> $ bpftrace -e 'tracepoint:skb:kfree_skb { @[args->reason] = count(); }'
> Attaching 1 probe...
> ^C
> 
> @[SKB_DROP_REASON_TC_INGRESS]: 2
> @[SKB_CONSUMED]: 34
> 
>    ^^^^^^^^^^^^ names!!
> 
> Auto-magically.

Very cool!

> Which will no longer work with the "pack multiple values into
> the reason" scheme of subsys-specific values :(

Too bad, do you happen to know why it won't work? Given they went into the
length of extending this for subsystems, they presumably would also like to
benefit from above. :/

> What I'm saying is that there is a trade-off here between providing
> as much info as possible vs basic user getting intelligible data..

Makes sense. I think we can drop that aspect for the subsys specific error
codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
be fine if you think it's better. The rest of the patch shown should still
apply the same way. I can tweak it to use the core section for codes, and
then it can be successively extended if that looks good to you - unless you
are saying from above, that just one error code is better and then going via
detailed stats for specific errors is preferred.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 13:49                         ` Daniel Borkmann
@ 2023-10-06 14:12                           ` Jakub Kicinski
  2023-10-06 15:25                             ` Jamal Hadi Salim
  2023-10-06 17:59                             ` Daniel Xu
  0 siblings, 2 replies; 19+ messages in thread
From: Jakub Kicinski @ 2023-10-06 14:12 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Jamal Hadi Salim, Victor Nogueira, xiyou.wangcong, jiri, davem,
	edumazet, pabeni, paulb, netdev, kernel, martin.lau, bpf

On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> > Which will no longer work with the "pack multiple values into
> > the reason" scheme of subsys-specific values :(  
> 
> Too bad, do you happen to know why it won't work? 

I'm just guessing but the reason is enum skb_drop_reason
and the values of subsystem specific reasons won't be part
of that enum.

> Given they went into the
> length of extending this for subsystems, they presumably would also like to
> benefit from above. :/
> 
> > What I'm saying is that there is a trade-off here between providing
> > as much info as possible vs basic user getting intelligible data..  
> 
> Makes sense. I think we can drop that aspect for the subsys specific error
> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
> be fine if you think it's better. The rest of the patch shown should still
> apply the same way. I can tweak it to use the core section for codes, and
> then it can be successively extended if that looks good to you - unless you
> are saying from above, that just one error code is better and then going via
> detailed stats for specific errors is preferred.

No, no, multiple reasons are perfectly fine. The non-technical
advantage of mac80211 error codes being separate is that there
are no git conflicts when we add new ones. TC codes can just 
be added to the main enum like TCP 🤷️

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 14:12                           ` Jakub Kicinski
@ 2023-10-06 15:25                             ` Jamal Hadi Salim
  2023-10-06 15:45                               ` Daniel Borkmann
  2023-10-06 17:59                             ` Daniel Xu
  1 sibling, 1 reply; 19+ messages in thread
From: Jamal Hadi Salim @ 2023-10-06 15:25 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Daniel Borkmann, Victor Nogueira, xiyou.wangcong, jiri, davem,
	edumazet, pabeni, paulb, netdev, kernel, martin.lau, bpf

On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> > > Which will no longer work with the "pack multiple values into
> > > the reason" scheme of subsys-specific values :(
> >
> > Too bad, do you happen to know why it won't work?
>
> I'm just guessing but the reason is enum skb_drop_reason
> and the values of subsystem specific reasons won't be part
> of that enum.

IIUC, this would gives us the readability and never require any
changes to bpftrace, whereas the major:minor encoding would require
further logic in bpftrace.

> > Given they went into the
> > length of extending this for subsystems, they presumably would also like to
> > benefit from above. :/
> >
> > > What I'm saying is that there is a trade-off here between providing
> > > as much info as possible vs basic user getting intelligible data..
> >
> > Makes sense. I think we can drop that aspect for the subsys specific error
> > codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
> > be fine if you think it's better. The rest of the patch shown should still
> > apply the same way. I can tweak it to use the core section for codes, and
> > then it can be successively extended if that looks good to you - unless you
> > are saying from above, that just one error code is better and then going via
> > detailed stats for specific errors is preferred.
>
> No, no, multiple reasons are perfectly fine. The non-technical
> advantage of mac80211 error codes being separate is that there
> are no git conflicts when we add new ones. TC codes can just
> be added to the main enum like TCP 🤷️

We still need to differentiate policy vs error - I suppose we could go
with Daniel's idea of introducing TC_ACT_ABORT/ERROR and ensure all
the callees set the drop_reason.

cheers,
jamal

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 15:25                             ` Jamal Hadi Salim
@ 2023-10-06 15:45                               ` Daniel Borkmann
  2023-10-06 19:39                                 ` Jamal Hadi Salim
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Borkmann @ 2023-10-06 15:45 UTC (permalink / raw)
  To: Jamal Hadi Salim, Jakub Kicinski
  Cc: Victor Nogueira, xiyou.wangcong, jiri, davem, edumazet, pabeni,
	paulb, netdev, kernel, martin.lau, bpf

[-- Attachment #1: Type: text/plain, Size: 2446 bytes --]

On 10/6/23 5:25 PM, Jamal Hadi Salim wrote:
> On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
>> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
>>>> Which will no longer work with the "pack multiple values into
>>>> the reason" scheme of subsys-specific values :(
>>>
>>> Too bad, do you happen to know why it won't work?
>>
>> I'm just guessing but the reason is enum skb_drop_reason
>> and the values of subsystem specific reasons won't be part
>> of that enum.
> 
> IIUC, this would gives us the readability and never require any
> changes to bpftrace, whereas the major:minor encoding would require
> further logic in bpftrace.

Makes sense, agree.

>>> Given they went into the
>>> length of extending this for subsystems, they presumably would also like to
>>> benefit from above. :/
>>>
>>>> What I'm saying is that there is a trade-off here between providing
>>>> as much info as possible vs basic user getting intelligible data..
>>>
>>> Makes sense. I think we can drop that aspect for the subsys specific error
>>> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
>>> be fine if you think it's better. The rest of the patch shown should still
>>> apply the same way. I can tweak it to use the core section for codes, and
>>> then it can be successively extended if that looks good to you - unless you
>>> are saying from above, that just one error code is better and then going via
>>> detailed stats for specific errors is preferred.
>>
>> No, no, multiple reasons are perfectly fine. The non-technical
>> advantage of mac80211 error codes being separate is that there
>> are no git conflicts when we add new ones. TC codes can just
>> be added to the main enum like TCP 🤷️
> 
> We still need to differentiate policy vs error - I suppose we could go
> with Daniel's idea of introducing TC_ACT_ABORT/ERROR and ensure all
> the callees set the drop_reason.

I've simplified the set (attached). The disambiguation could eventually be on
SKB_DROP_REASON_TC_{INGRESS,EGRESS} == intentional drop vs SKB_DROP_REASON_TC_ERROR_*
which indicates an internal error code once these are covered on all locations.
There could probably also be just a SKB_DROP_REASON_TC_ERROR which could act as
a catch-all for the time being to initially mark all error locations with something
generic. I think this should be flexible where you wouldn't need extra TC_ACT_ABORT.

Thanks,
Daniel

[-- Attachment #2: 0001-net-tc-Make-tc-related-drop-reason-more-flexible.patch --]
[-- Type: text/x-patch, Size: 6570 bytes --]

From d72cffb57e09dd657007eb108392274dde8793ef Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 6 Oct 2023 08:42:19 +0000
Subject: [PATCH net-next 1/2] net, tc: Make tc-related drop reason more flexible

Currently, the kfree_skb_reason() in sch_handle_{ingress,egress}() can only
express a basic SKB_DROP_REASON_TC_INGRESS or SKB_DROP_REASON_TC_EGRESS reason.

Victor kicked-off an initial proposal to make this more flexible by disambiguating
verdict from return code by moving the verdict into struct tcf_result and
letting tcf_classify() return a negative error. If hit, then two new drop
reasons were added in the proposal, that is SKB_DROP_REASON_TC_INGRESS_ERROR
as well as SKB_DROP_REASON_TC_EGRESS_ERROR. Further analysis of the actual
error codes would have required to attach to tcf_classify via kprobe/kretprobe
to more deeply debug skb and the returned error.

In order to make the kfree_skb_reason() in sch_handle_{ingress,egress}() more
extensible, it can be addressed in a more straight forward way, that is: Instead
of placing the verdict into struct tcf_result, we can just put the drop reason
in there, which does not require changes throughout various classful schedulers
given the existing verdict logic can stay as is. Then, SKB_DROP_REASON_TC_ERROR_*
can be added to the enum skb_drop_reason to disambiguate between an error or an
intentional drop. New drop reason error codes can be added successively to the
tc code base. For internal error locations which have not yet been annotated with
a SKB_DROP_REASON_TC_ERROR_*, the fallback is SKB_DROP_REASON_TC_INGRESS and
SKB_DROP_REASON_TC_EGRESS, respectively.

While drop reasons have infrastructure for subsystem specific error codes which
are currently used by mac80211 and ovs, Jakub mentioned that it is preferred for
tc to use the enum skb_drop_reason core codes given i) it better belongs there,
and ii) the tooling support is better, too:

  And I think Alastair (bpftrace) is working on auto-prettifying enums when
  bpftrace outputs maps. So we can do something like:

  $ bpftrace -e 'tracepoint:skb:kfree_skb { @[args->reason] = count(); }'
  Attaching 1 probe...
  ^C

  @[SKB_DROP_REASON_TC_INGRESS]: 2
  @[SKB_CONSUMED]: 34

  ^^^^^^^^^^^^ names!!

  Auto-magically.

Add a small helper tc_set_drop_reason() which can be used to set the drop reason
into tcf_result.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Victor Nogueira <victor@mojatatu.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/netdev/20231006063233.74345d36@kernel.org
---
 include/net/sch_generic.h |  9 +++++++--
 net/core/dev.c            | 15 ++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c7318c73cfd6..90774cb2ac03 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -324,7 +324,6 @@ struct Qdisc_ops {
 	struct module		*owner;
 };
 
-
 struct tcf_result {
 	union {
 		struct {
@@ -332,8 +331,8 @@ struct tcf_result {
 			u32		classid;
 		};
 		const struct tcf_proto *goto_tp;
-
 	};
+	enum skb_drop_reason		drop_reason;
 };
 
 struct tcf_chain;
@@ -667,6 +666,12 @@ static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
 	return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL;
 }
 
+static inline void tc_set_drop_reason(struct tcf_result *res,
+				      enum skb_drop_reason reason)
+{
+	res->drop_reason = reason;
+}
+
 int qdisc_class_hash_init(struct Qdisc_class_hash *);
 void qdisc_class_hash_insert(struct Qdisc_class_hash *,
 			     struct Qdisc_class_common *);
diff --git a/net/core/dev.c b/net/core/dev.c
index 606a366cc209..664426285fa3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3910,7 +3910,8 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 #endif /* CONFIG_NET_EGRESS */
 
 #ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+		  enum skb_drop_reason *drop_reason)
 {
 	int ret = TC_ACT_UNSPEC;
 #ifdef CONFIG_NET_CLS_ACT
@@ -3922,12 +3923,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
 
 	tc_skb_cb(skb)->mru = 0;
 	tc_skb_cb(skb)->post_ct = false;
+	res.drop_reason = *drop_reason;
 
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
 	/* Only tcf related quirks below. */
 	switch (ret) {
 	case TC_ACT_SHOT:
+		*drop_reason = res.drop_reason;
 		mini_qdisc_qstats_cpu_drop(miniq);
 		break;
 	case TC_ACT_OK:
@@ -3977,6 +3980,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		   struct net_device *orig_dev, bool *another)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
 	int sch_ret;
 
 	if (!entry)
@@ -3994,7 +3998,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		if (sch_ret != TC_ACT_UNSPEC)
 			goto ingress_verdict;
 	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 ingress_verdict:
 	switch (sch_ret) {
 	case TC_ACT_REDIRECT:
@@ -4011,7 +4015,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		*ret = NET_RX_SUCCESS;
 		return NULL;
 	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_RX_DROP;
 		return NULL;
 	/* used by tc_run */
@@ -4032,6 +4036,7 @@ static __always_inline struct sk_buff *
 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
 	int sch_ret;
 
 	if (!entry)
@@ -4045,7 +4050,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		if (sch_ret != TC_ACT_UNSPEC)
 			goto egress_verdict;
 	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 egress_verdict:
 	switch (sch_ret) {
 	case TC_ACT_REDIRECT:
@@ -4054,7 +4059,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		*ret = NET_XMIT_SUCCESS;
 		return NULL;
 	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_XMIT_DROP;
 		return NULL;
 	/* used by tc_run */
-- 
2.34.1


[-- Attachment #3: 0002-net-tc-Add-tc_set_drop_reason-for-reclassify-limit.patch --]
[-- Type: text/x-patch, Size: 1907 bytes --]

From b2c2a9eeb22ab48d4d31b61d2b4980faafb46c83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 6 Oct 2023 14:41:21 +0000
Subject: [PATCH net-next 2/2] net, tc: Add tc_set_drop_reason for reclassify limit

Add an initial user for the newly added tc_set_drop_reason() helper to
set the drop reason to SKB_DROP_REASON_TC_ERROR_MAX_LOOP when the maximum
reclassification limit is hit.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Victor Nogueira <victor@mojatatu.com>
Cc: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h | 3 +++
 net/sched/cls_api.c           | 1 +
 2 files changed, 4 insertions(+)

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..a8503a72fddf 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -80,6 +80,7 @@
 	FN(IPV6_NDISC_BAD_OPTIONS)	\
 	FN(IPV6_NDISC_NS_OTHERHOST)	\
 	FN(QUEUE_PURGE)			\
+	FN(TC_ERROR_MAX_LOOP)		\
 	FNe(MAX)
 
 /**
@@ -345,6 +346,8 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
 	/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
 	SKB_DROP_REASON_QUEUE_PURGE,
+	/** @SKB_DROP_REASON_TC_ERROR_MAX_LOOP: dropped due to hitting maximum reclassify limit. */
+	SKB_DROP_REASON_TC_ERROR_MAX_LOOP,
 	/**
 	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
 	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a193cc7b3241..ed740f070dc4 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1723,6 +1723,7 @@ static inline int __tcf_classify(struct sk_buff *skb,
 				       tp->chain->block->index,
 				       tp->prio & 0xffff,
 				       ntohs(tp->protocol));
+		tc_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR_MAX_LOOP);
 		return TC_ACT_SHOT;
 	}
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 14:12                           ` Jakub Kicinski
  2023-10-06 15:25                             ` Jamal Hadi Salim
@ 2023-10-06 17:59                             ` Daniel Xu
  1 sibling, 0 replies; 19+ messages in thread
From: Daniel Xu @ 2023-10-06 17:59 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Daniel Borkmann, Jamal Hadi Salim, Victor Nogueira,
	xiyou.wangcong, jiri, davem, edumazet, pabeni, paulb, netdev,
	kernel, martin.lau, bpf

On Fri, Oct 06, 2023 at 07:12:15AM -0700, Jakub Kicinski wrote:
> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> > > Which will no longer work with the "pack multiple values into
> > > the reason" scheme of subsys-specific values :(  
> > 
> > Too bad, do you happen to know why it won't work? 
> 
> I'm just guessing but the reason is enum skb_drop_reason
> and the values of subsystem specific reasons won't be part
> of that enum.

Yeah, looks like the subsystem reasons are different enums right?
There's probably a way to still support it in bpftrace but it might take
some minor changes and/or ugly conditionals in scripts.

But I also wonder: why are the subsystem error codes not included into
`enum skb_drop_reason`? It looks like the enum space is partitioned
already. And the modules have already registered themselves into core
kernel (in `enum skb_drop_reason_subsys`). So even if modules are
compiled out there are still hints of it laying around vmlinux.

Thanks,
Daniel

[..]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 15:45                               ` Daniel Borkmann
@ 2023-10-06 19:39                                 ` Jamal Hadi Salim
  2023-10-06 19:59                                   ` Daniel Borkmann
  0 siblings, 1 reply; 19+ messages in thread
From: Jamal Hadi Salim @ 2023-10-06 19:39 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Jakub Kicinski, Victor Nogueira, xiyou.wangcong, jiri, davem,
	edumazet, pabeni, paulb, netdev, kernel, martin.lau, bpf

On Fri, Oct 6, 2023 at 11:45 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 10/6/23 5:25 PM, Jamal Hadi Salim wrote:
> > On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> >>>> Which will no longer work with the "pack multiple values into
> >>>> the reason" scheme of subsys-specific values :(
> >>>
> >>> Too bad, do you happen to know why it won't work?
> >>
> >> I'm just guessing but the reason is enum skb_drop_reason
> >> and the values of subsystem specific reasons won't be part
> >> of that enum.
> >
> > IIUC, this would gives us the readability and never require any
> > changes to bpftrace, whereas the major:minor encoding would require
> > further logic in bpftrace.
>
> Makes sense, agree.
>
> >>> Given they went into the
> >>> length of extending this for subsystems, they presumably would also like to
> >>> benefit from above. :/
> >>>
> >>>> What I'm saying is that there is a trade-off here between providing
> >>>> as much info as possible vs basic user getting intelligible data..
> >>>
> >>> Makes sense. I think we can drop that aspect for the subsys specific error
> >>> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
> >>> be fine if you think it's better. The rest of the patch shown should still
> >>> apply the same way. I can tweak it to use the core section for codes, and
> >>> then it can be successively extended if that looks good to you - unless you
> >>> are saying from above, that just one error code is better and then going via
> >>> detailed stats for specific errors is preferred.
> >>
> >> No, no, multiple reasons are perfectly fine. The non-technical
> >> advantage of mac80211 error codes being separate is that there
> >> are no git conflicts when we add new ones. TC codes can just
> >> be added to the main enum like TCP 🤷️
> >
> > We still need to differentiate policy vs error - I suppose we could go
> > with Daniel's idea of introducing TC_ACT_ABORT/ERROR and ensure all
> > the callees set the drop_reason.
>
> I've simplified the set (attached). The disambiguation could eventually be on
> SKB_DROP_REASON_TC_{INGRESS,EGRESS} == intentional drop vs SKB_DROP_REASON_TC_ERROR_*
> which indicates an internal error code once these are covered on all locations.
> There could probably also be just a SKB_DROP_REASON_TC_ERROR which could act as
> a catch-all for the time being to initially mark all error locations with something
> generic. I think this should be flexible where you wouldn't need extra TC_ACT_ABORT.

I think this should work - either Victor or myself will work on a followup.

cheers,
jamal

> Thanks,
> Daniel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code
  2023-10-06 19:39                                 ` Jamal Hadi Salim
@ 2023-10-06 19:59                                   ` Daniel Borkmann
  0 siblings, 0 replies; 19+ messages in thread
From: Daniel Borkmann @ 2023-10-06 19:59 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Jakub Kicinski, Victor Nogueira, xiyou.wangcong, jiri, davem,
	edumazet, pabeni, paulb, netdev, kernel, martin.lau, bpf

On 10/6/23 9:39 PM, Jamal Hadi Salim wrote:
> On Fri, Oct 6, 2023 at 11:45 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 10/6/23 5:25 PM, Jamal Hadi Salim wrote:
>>> On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
>>>> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
>>>>>> Which will no longer work with the "pack multiple values into
>>>>>> the reason" scheme of subsys-specific values :(
>>>>>
>>>>> Too bad, do you happen to know why it won't work?
>>>>
>>>> I'm just guessing but the reason is enum skb_drop_reason
>>>> and the values of subsystem specific reasons won't be part
>>>> of that enum.
>>>
>>> IIUC, this would gives us the readability and never require any
>>> changes to bpftrace, whereas the major:minor encoding would require
>>> further logic in bpftrace.
>>
>> Makes sense, agree.
>>
>>>>> Given they went into the
>>>>> length of extending this for subsystems, they presumably would also like to
>>>>> benefit from above. :/
>>>>>
>>>>>> What I'm saying is that there is a trade-off here between providing
>>>>>> as much info as possible vs basic user getting intelligible data..
>>>>>
>>>>> Makes sense. I think we can drop that aspect for the subsys specific error
>>>>> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
>>>>> be fine if you think it's better. The rest of the patch shown should still
>>>>> apply the same way. I can tweak it to use the core section for codes, and
>>>>> then it can be successively extended if that looks good to you - unless you
>>>>> are saying from above, that just one error code is better and then going via
>>>>> detailed stats for specific errors is preferred.
>>>>
>>>> No, no, multiple reasons are perfectly fine. The non-technical
>>>> advantage of mac80211 error codes being separate is that there
>>>> are no git conflicts when we add new ones. TC codes can just
>>>> be added to the main enum like TCP 🤷️
>>>
>>> We still need to differentiate policy vs error - I suppose we could go
>>> with Daniel's idea of introducing TC_ACT_ABORT/ERROR and ensure all
>>> the callees set the drop_reason.
>>
>> I've simplified the set (attached). The disambiguation could eventually be on
>> SKB_DROP_REASON_TC_{INGRESS,EGRESS} == intentional drop vs SKB_DROP_REASON_TC_ERROR_*
>> which indicates an internal error code once these are covered on all locations.
>> There could probably also be just a SKB_DROP_REASON_TC_ERROR which could act as
>> a catch-all for the time being to initially mark all error locations with something
>> generic. I think this should be flexible where you wouldn't need extra TC_ACT_ABORT.
> 
> I think this should work - either Victor or myself will work on a followup.

Sounds great, thanks!

Cheers,
Daniel

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2023-10-06 19:59 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20230919145951.352548-1-victor@mojatatu.com>
2023-09-19 22:15 ` [PATCH net-next 1/1] net/sched: Disambiguate verdict from return code Daniel Borkmann
2023-09-19 23:20   ` Jamal Hadi Salim
2023-09-22  8:12     ` Daniel Borkmann
2023-09-25 23:01       ` Jamal Hadi Salim
2023-09-29 15:48         ` Daniel Borkmann
2023-10-02 19:54           ` Jamal Hadi Salim
2023-10-03  9:00             ` Daniel Borkmann
2023-10-03 12:46               ` Jamal Hadi Salim
2023-10-03 13:49                 ` Daniel Borkmann
2023-10-03 21:36                   ` Jamal Hadi Salim
2023-10-06 11:18                     ` Daniel Borkmann
2023-10-06 13:32                       ` Jakub Kicinski
2023-10-06 13:49                         ` Daniel Borkmann
2023-10-06 14:12                           ` Jakub Kicinski
2023-10-06 15:25                             ` Jamal Hadi Salim
2023-10-06 15:45                               ` Daniel Borkmann
2023-10-06 19:39                                 ` Jamal Hadi Salim
2023-10-06 19:59                                   ` Daniel Borkmann
2023-10-06 17:59                             ` Daniel Xu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).