Re: [PATCH RFC bpf-next 1/2] bpf_redirect_neigh: Support supplying the nexthop as a helper parameter

From: Daniel Borkmann <daniel@iogearbox.net>
To: "Toke Høiland-Jørgensen" <toke@redhat.com>
Cc: David Ahern <dsahern@kernel.org>,
	netdev@vger.kernel.org, bpf@vger.kernel.org
Subject: Re: [PATCH RFC bpf-next 1/2] bpf_redirect_neigh: Support supplying the nexthop as a helper parameter
Date: Mon, 19 Oct 2020 17:01:12 +0200	[thread overview]
Message-ID: <013e2c8b-13b5-661c-89c5-508b91cd3f4c@iogearbox.net> (raw)
In-Reply-To: <160277680864.157904.8719768977907736015.stgit@toke.dk>

On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote:
[...]
> +struct bpf_redir_neigh {
> +	/* network family for lookup (AF_INET, AF_INET6)
> +	 */
> +	__u8	nh_family;
> +	/* network address of nexthop; skips fib lookup to find gateway */
> +	union {
> +		__be32		ipv4_nh;
> +		__u32		ipv6_nh[4];  /* in6_addr; network order */
> +	};
> +};
> +
>   enum bpf_task_fd_type {
>   	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
>   	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c5e2a1c5fd8d..d073031a3a61 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
>   }
>   
>   #if IS_ENABLED(CONFIG_IPV6)
> -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
> +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
> +			    struct net_device *dev, const struct in6_addr *nexthop)
>   {
> -	struct dst_entry *dst = skb_dst(skb);
> -	struct net_device *dev = dst->dev;
>   	u32 hh_len = LL_RESERVED_SPACE(dev);
> -	const struct in6_addr *nexthop;
> +	struct dst_entry *dst = NULL;
>   	struct neighbour *neigh;
>   
>   	if (dev_xmit_recursion()) {
> @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
>   	}
>   
>   	rcu_read_lock_bh();
> -	nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
> -			      &ipv6_hdr(skb)->daddr);
> +	if (!nexthop) {
> +		dst = skb_dst(skb);
> +		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
> +				      &ipv6_hdr(skb)->daddr);
> +	}
>   	neigh = ip_neigh_gw6(dev, nexthop);
>   	if (likely(!IS_ERR(neigh))) {
>   		int ret;
> @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
>   		return ret;
>   	}
>   	rcu_read_unlock_bh();
> -	IP6_INC_STATS(dev_net(dst->dev),
> -		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
> +	if (dst)
> +		IP6_INC_STATS(dev_net(dst->dev),
> +			      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
>   out_drop:
>   	kfree_skb(skb);
>   	return -ENETDOWN;
>   }
>   
> -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
> +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
> +				   struct bpf_nh_params *nh)
>   {
>   	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> +	struct in6_addr *nexthop = NULL;
>   	struct net *net = dev_net(dev);
>   	int err, ret = NET_XMIT_DROP;
> -	struct dst_entry *dst;
> -	struct flowi6 fl6 = {
> -		.flowi6_flags	= FLOWI_FLAG_ANYSRC,
> -		.flowi6_mark	= skb->mark,
> -		.flowlabel	= ip6_flowinfo(ip6h),
> -		.flowi6_oif	= dev->ifindex,
> -		.flowi6_proto	= ip6h->nexthdr,
> -		.daddr		= ip6h->daddr,
> -		.saddr		= ip6h->saddr,
> -	};
>   
> -	dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
> -	if (IS_ERR(dst))
> -		goto out_drop;
> +	if (!nh->nh_family) {
> +		struct dst_entry *dst;
> +		struct flowi6 fl6 = {
> +			.flowi6_flags = FLOWI_FLAG_ANYSRC,
> +			.flowi6_mark = skb->mark,
> +			.flowlabel = ip6_flowinfo(ip6h),
> +			.flowi6_oif = dev->ifindex,
> +			.flowi6_proto = ip6h->nexthdr,
> +			.daddr = ip6h->daddr,
> +			.saddr = ip6h->saddr,

nit: Would be good for readability to keep the previous whitespace alignment intact.

> +		};
> +
> +		dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
> +		if (IS_ERR(dst))
> +			goto out_drop;
>   
> -	skb_dst_set(skb, dst);
> +		skb_dst_set(skb, dst);
> +	} else if (nh->nh_family == AF_INET6) {
> +		nexthop = &nh->ipv6_nh;
> +	} else {
> +		goto out_drop;
> +	}
>   
> -	err = bpf_out_neigh_v6(net, skb);
> +	err = bpf_out_neigh_v6(net, skb, dev, nexthop);

I'd probably model the bpf_out_neigh_v{4,6}() as close as possible similar to each other in terms
of args we pass etc. In the v6 case you pass the nexthop in6_addr directly whereas v4 passes
bpf_nh_params, I'd probably also stick to the latter for v6 to keep it symmetric.

>   	if (unlikely(net_xmit_eval(err)))
>   		dev->stats.tx_errors++;
>   	else
> @@ -2260,11 +2272,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
>   #endif /* CONFIG_IPV6 */
>   
>   #if IS_ENABLED(CONFIG_INET)
> -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
> +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
> +			    struct net_device *dev, struct bpf_nh_params *nh)
>   {
> -	struct dst_entry *dst = skb_dst(skb);
> -	struct rtable *rt = container_of(dst, struct rtable, dst);
> -	struct net_device *dev = dst->dev;
>   	u32 hh_len = LL_RESERVED_SPACE(dev);
>   	struct neighbour *neigh;
>   	bool is_v6gw = false;
> @@ -2292,7 +2302,20 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
>   	}
>   
>   	rcu_read_lock_bh();
> -	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
> +	if (!nh) {
> +		struct dst_entry *dst = skb_dst(skb);
> +		struct rtable *rt = container_of(dst, struct rtable, dst);
> +
> +		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
> +	} else if (nh->nh_family == AF_INET6) {
> +		neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
> +		is_v6gw = true;
> +	} else if (nh->nh_family == AF_INET) {
> +		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
> +	} else {
> +		goto out_drop;
> +	}
> +
>   	if (likely(!IS_ERR(neigh))) {
>   		int ret;
>