* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
       [not found] ` <20200710224924.4087399-3-andriin@fb.com>
@ 2020-07-13 14:19   ` David Ahern
  2020-07-13 22:33     ` Andrii Nakryiko
  2020-07-14 13:57   ` Toke Høiland-Jørgensen
  1 sibling, 1 reply; 15+ messages in thread
From: David Ahern @ 2020-07-13 14:19 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, netdev, ast, daniel
  Cc: andrii.nakryiko, kernel-team, Jakub Kicinski, Andrey Ignatov,
	Takshak Chahande, Toke Høiland-Jørgensen

On 7/10/20 4:49 PM, Andrii Nakryiko wrote:
> Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
> BPF_LINK_CREATE command.
> 
> bpf_xdp_link is mutually exclusive with direct BPF program attachment,
> previous BPF program should be detached prior to attempting to create a new
> bpf_xdp_link attachment (for a given XDP mode). Once link is attached, it
> can't be replaced by other BPF program attachment or link attachment. It will
> be detached only when the last BPF link FD is closed.
> 
> bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to
> how other BPF links behave (cgroup, flow_dissector). At that point bpf_link
> will become defunct, but won't be destroyed until last FD is closed.
> 
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> ---
>  include/linux/netdevice.h |   6 +
>  include/uapi/linux/bpf.h  |   7 +-
>  kernel/bpf/syscall.c      |   5 +
>  net/core/dev.c            | 385 ++++++++++++++++++++++++++++----------

That's a big diff for 1 patch. A fair bit of it is refactoring / code
movement that can be done in a separate refactoring patch, making it
clearer what changes you need for the bpf_link piece.


>  4 files changed, 301 insertions(+), 102 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index d5630e535836..93bcd81d645d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -886,6 +886,7 @@ struct bpf_prog_offload_ops;
>  struct netlink_ext_ack;
>  struct xdp_umem;
>  struct xdp_dev_bulk_queue;
> +struct bpf_xdp_link;
>  
>  enum bpf_xdp_mode {
>  	XDP_MODE_SKB = 0,
> @@ -896,6 +897,7 @@ enum bpf_xdp_mode {
>  
>  struct bpf_xdp_entity {
>  	struct bpf_prog *prog;
> +	struct bpf_xdp_link *link;
>  };
>  
>  struct netdev_bpf {
> @@ -3824,6 +3826,10 @@ typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
>  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>  		      int fd, int expected_fd, u32 flags);
>  u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
> +
> +struct bpf_xdp_link;

already stated above.

> +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
> +
>  int xdp_umem_query(struct net_device *dev, u16 queue_id);
>  
>  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 548a749aebb3..41eba148217b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -227,6 +227,7 @@ enum bpf_attach_type {
>  	BPF_CGROUP_INET6_GETSOCKNAME,
>  	BPF_XDP_DEVMAP,
>  	BPF_CGROUP_INET_SOCK_RELEASE,
> +	BPF_XDP,

This really does not add value for the uapi. The link_type uniquely
identifies the type and the expected program type.

>  	__MAX_BPF_ATTACH_TYPE
>  };
>  
> @@ -239,6 +240,7 @@ enum bpf_link_type {
>  	BPF_LINK_TYPE_CGROUP = 3,
>  	BPF_LINK_TYPE_ITER = 4,
>  	BPF_LINK_TYPE_NETNS = 5,
> +	BPF_LINK_TYPE_XDP = 6,
>  
>  	MAX_BPF_LINK_TYPE,
>  };
> @@ -604,7 +606,10 @@ union bpf_attr {
>  
>  	struct { /* struct used by BPF_LINK_CREATE command */
>  		__u32		prog_fd;	/* eBPF program to attach */
> -		__u32		target_fd;	/* object to attach to */
> +		union {
> +			__u32		target_fd;	/* object to attach to */
> +			__u32		target_ifindex; /* target ifindex */
> +		};
>  		__u32		attach_type;	/* attach type */
>  		__u32		flags;		/* extra flags */
>  	} link_create;
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 156f51ffada2..eb4ed4b29418 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2817,6 +2817,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
>  		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
>  	case BPF_TRACE_ITER:
>  		return BPF_PROG_TYPE_TRACING;
> +	case BPF_XDP:
> +		return BPF_PROG_TYPE_XDP;
>  	default:
>  		return BPF_PROG_TYPE_UNSPEC;
>  	}
> @@ -3890,6 +3892,9 @@ static int link_create(union bpf_attr *attr)
>  	case BPF_PROG_TYPE_FLOW_DISSECTOR:
>  		ret = netns_bpf_link_create(attr, prog);
>  		break;
> +	case BPF_PROG_TYPE_XDP:
> +		ret = bpf_xdp_link_attach(attr, prog);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  	}
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d3b82b664e2d..84f755a1ec36 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -8713,8 +8713,47 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
>  }
>  EXPORT_SYMBOL(dev_change_proto_down_generic);
>  
> -static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode)
> +struct bpf_xdp_link {
> +	struct bpf_link link;
> +	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
> +	int flags;
> +};
> +
> +static enum bpf_xdp_mode dev_xdp_mode(u32 flags)
> +{
> +	if (flags & XDP_FLAGS_HW_MODE)
> +		return XDP_MODE_HW;
> +	if (flags & XDP_FLAGS_DRV_MODE)
> +		return XDP_MODE_DRV;
> +	return XDP_MODE_SKB;
> +}
> +
> +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
>  {
> +	switch (mode) {
> +	case XDP_MODE_SKB:
> +		return generic_xdp_install;
> +	case XDP_MODE_DRV:
> +	case XDP_MODE_HW:
> +		return dev->netdev_ops->ndo_bpf;
> +	default:
> +		return NULL;
> +	};
> +}
> +
> +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
> +					 enum bpf_xdp_mode mode)
> +{
> +	return dev->xdp_state[mode].link;
> +}
> +
> +static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
> +				     enum bpf_xdp_mode mode)
> +{
> +	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
> +
> +	if (link)
> +		return link->link.prog;
>  	return dev->xdp_state[mode].prog;
>  }
>  
> @@ -8725,9 +8764,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
>  	return prog ? prog->aux->id : 0;
>  }
>  
> +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
> +			     struct bpf_xdp_link *link)
> +{
> +	dev->xdp_state[mode].link = link;
> +	dev->xdp_state[mode].prog = NULL;
> +}
> +
>  static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
>  			     struct bpf_prog *prog)
>  {
> +	dev->xdp_state[mode].link = NULL;
>  	dev->xdp_state[mode].prog = prog;
>  }
>  
> @@ -8744,6 +8791,14 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
>  	xdp.flags = flags;
>  	xdp.prog = prog;
>  
> +	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
> +	 * "moved" into driver), so they don't increment it on their own, but
> +	 * they do decrement refcnt when program is detached or replaced.
> +	 * Given net_device also owns link/prog, we need to bump refcnt here
> +	 * to prevent drivers from underflowing it.
> +	 */
> +	if (prog)
> +		bpf_prog_inc(prog);

Why is this refcnt bump not needed today but is needed for your change?

>  	err = bpf_op(dev, &xdp);
>  	if (err)
>  		return err;

and the error path is not decrementing it.

> @@ -8756,39 +8811,221 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
>  
>  static void dev_xdp_uninstall(struct net_device *dev)
>  {
> -	bpf_op_t ndo_bpf;
> +	struct bpf_xdp_link *link;
> +	struct bpf_prog *prog;
> +	enum bpf_xdp_mode mode;
> +	bpf_op_t bpf_op;
>  
> -	/* Remove generic XDP */
> -	WARN_ON(dev_xdp_install(dev, XDP_MODE_SKB, generic_xdp_install,
> -				NULL, 0, NULL));
> -	dev_xdp_set_prog(dev, XDP_MODE_SKB, NULL);
> +	ASSERT_RTNL();
>  
> -	/* Remove from the driver */
> -	ndo_bpf = dev->netdev_ops->ndo_bpf;
> -	if (!ndo_bpf)
> -		return;
> +	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
> +		prog = dev_xdp_prog(dev, mode);
> +		if (!prog)
> +			continue;
> +
> +		bpf_op = dev_xdp_bpf_op(dev, mode);
> +		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
> +
> +		/* auto-detach link from net device */
> +		link = dev_xdp_link(dev, mode);
> +		if (link)
> +			link->dev = NULL;
> +		else
> +			bpf_prog_put(prog);
> +
> +		dev_xdp_set_link(dev, mode, NULL);
> +	}
> +}
> +
> +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
> +			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
> +			  struct bpf_prog *old_prog, u32 flags)
> +{
> +	struct bpf_prog *cur_prog;
> +	enum bpf_xdp_mode mode;
> +	bpf_op_t bpf_op;
> +	int err;
> +
> +	ASSERT_RTNL();
>  
> -	if (dev_xdp_prog_id(dev, XDP_MODE_DRV)) {
> -		WARN_ON(dev_xdp_install(dev, XDP_MODE_DRV, ndo_bpf,
> -					NULL, 0, NULL));
> -		dev_xdp_set_prog(dev, XDP_MODE_DRV, NULL);
> +	/* link supports only XDP mode flags */
> +	if (link && (flags & ~XDP_FLAGS_MODES))
> +		return -EINVAL;

every one of the -errno returns needs an extack message explaining why


* Re: [PATCH bpf-next 4/7] bpf: implement BPF XDP link-specific introspection APIs
       [not found] ` <20200710224924.4087399-5-andriin@fb.com>
@ 2020-07-13 14:32   ` David Ahern
  2020-07-13 22:41     ` Andrii Nakryiko
  0 siblings, 1 reply; 15+ messages in thread
From: David Ahern @ 2020-07-13 14:32 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, netdev, ast, daniel
  Cc: andrii.nakryiko, kernel-team, Jakub Kicinski, Andrey Ignatov,
	Takshak Chahande, Toke Høiland-Jørgensen

On 7/10/20 4:49 PM, Andrii Nakryiko wrote:
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 025687120442..a9c634be8dd7 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -8973,6 +8973,35 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link)
>  	kfree(xdp_link);
>  }
>  
> +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
> +				     struct seq_file *seq)
> +{
> +	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
> +	u32 ifindex = 0;
> +
> +	rtnl_lock();
> +	if (xdp_link->dev)
> +		ifindex = xdp_link->dev->ifindex;
> +	rtnl_unlock();

In patch 2 you set dev but don't hold a refcnt on it, which is why you
need the locking here. How do you know that the dev pointer is even
valid here?

If xdp_link is going to have a dev reference, you need to take the
refcnt and handle NETDEV notifications to clean up the bpf_link when
the device goes away.


* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-13 14:19   ` [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API David Ahern
@ 2020-07-13 22:33     ` Andrii Nakryiko
  0 siblings, 0 replies; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-13 22:33 UTC (permalink / raw)
  To: David Ahern
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, Jakub Kicinski, Andrey Ignatov,
	Takshak Chahande, Toke Høiland-Jørgensen

On Mon, Jul 13, 2020 at 7:19 AM David Ahern <dsahern@gmail.com> wrote:
>
> On 7/10/20 4:49 PM, Andrii Nakryiko wrote:
> > Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
> > BPF_LINK_CREATE command.
> >
> > bpf_xdp_link is mutually exclusive with direct BPF program attachment,
> > previous BPF program should be detached prior to attempting to create a new
> > bpf_xdp_link attachment (for a given XDP mode). Once link is attached, it
> > can't be replaced by other BPF program attachment or link attachment. It will
> > be detached only when the last BPF link FD is closed.
> >
> > bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to
> > how other BPF links behave (cgroup, flow_dissector). At that point bpf_link
> > will become defunct, but won't be destroyed until last FD is closed.
> >
> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> > ---
> >  include/linux/netdevice.h |   6 +
> >  include/uapi/linux/bpf.h  |   7 +-
> >  kernel/bpf/syscall.c      |   5 +
> >  net/core/dev.c            | 385 ++++++++++++++++++++++++++++----------
>
> That's a big diff for 1 patch. A fair bit of it is refactoring / code
> movement that can be done in a separate refactoring patch, making it
> clearer what changes you need for the bpf_link piece.

Ok, I'll do another refactoring patch for prog attach logic only.

>
>
> >  4 files changed, 301 insertions(+), 102 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index d5630e535836..93bcd81d645d 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -886,6 +886,7 @@ struct bpf_prog_offload_ops;
> >  struct netlink_ext_ack;
> >  struct xdp_umem;
> >  struct xdp_dev_bulk_queue;
> > +struct bpf_xdp_link;
> >
> >  enum bpf_xdp_mode {
> >       XDP_MODE_SKB = 0,
> > @@ -896,6 +897,7 @@ enum bpf_xdp_mode {
> >
> >  struct bpf_xdp_entity {
> >       struct bpf_prog *prog;
> > +     struct bpf_xdp_link *link;
> >  };
> >
> >  struct netdev_bpf {
> > @@ -3824,6 +3826,10 @@ typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
> >  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
> >                     int fd, int expected_fd, u32 flags);
> >  u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
> > +
> > +struct bpf_xdp_link;
>
> already stated above.

oh, right, will drop

>
> > +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
> > +
> >  int xdp_umem_query(struct net_device *dev, u16 queue_id);
> >
> >  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 548a749aebb3..41eba148217b 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -227,6 +227,7 @@ enum bpf_attach_type {
> >       BPF_CGROUP_INET6_GETSOCKNAME,
> >       BPF_XDP_DEVMAP,
> >       BPF_CGROUP_INET_SOCK_RELEASE,
> > +     BPF_XDP,
>
> This really does not add value for the uapi. The link_type uniquely
> identifies the type and the expected program type.

Yes, but that's how PROG_ATTACH/LINK_CREATE is set up. We had a
similar discussion for SK_LOOKUP recently. The only downside right now
is increasing struct cgroup_bpf size, but I think we have a plan for
mitigating it, similarly to how the netns bpf_link does it.
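
For context, attaching through the new command would look roughly like
this from userspace (a sketch based on the uapi changes in this patch;
error handling omitted):

  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>
  #include <linux/if_link.h>

  static int xdp_link_create(int prog_fd, int ifindex)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.link_create.prog_fd = prog_fd;          /* BPF_PROG_TYPE_XDP prog */
          attr.link_create.target_ifindex = ifindex;   /* new union member */
          attr.link_create.attach_type = BPF_XDP;      /* new attach type */
          attr.link_create.flags = XDP_FLAGS_DRV_MODE; /* only XDP mode flags */

          /* returns a link FD; closing the last link FD detaches the prog */
          return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
  }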

>
> >       __MAX_BPF_ATTACH_TYPE
> >  };
> >
> > @@ -239,6 +240,7 @@ enum bpf_link_type {
> >       BPF_LINK_TYPE_CGROUP = 3,
> >       BPF_LINK_TYPE_ITER = 4,
> >       BPF_LINK_TYPE_NETNS = 5,
> > +     BPF_LINK_TYPE_XDP = 6,
> >
> >       MAX_BPF_LINK_TYPE,
> >  };

[...]

> > +     /* Drivers assume refcnt is already incremented (i.e, prog pointer is
> > +      * "moved" into driver), so they don't increment it on their own, but
> > +      * they do decrement refcnt when program is detached or replaced.
> > +      * Given net_device also owns link/prog, we need to bump refcnt here
> > +      * to prevent drivers from underflowing it.
> > +      */
> > +     if (prog)
> > +             bpf_prog_inc(prog);
>
> Why is this refcnt bump not needed today but is needed for your change?

Previously, the driver/generic_xdp_install "owned" this program and
assumed an already incremented refcnt, but dropped that count when the
current program was replaced with a new one (or NULL). Now, net_device
*also* owns the prog (in xdp_state[mode].prog), in addition to whatever
book-keeping the driver does internally. So there needs to be an extra
refcnt.

But you are right that this should have been part of refactoring in
patch #1, I'll move it there.

>
> >       err = bpf_op(dev, &xdp);
> >       if (err)
> >               return err;
>
> and the error path is not decrementing it.

yep, great catch, thanks!
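
Something like this, presumably (a sketch of the fix, modulo moving
the refcnt bump into the refactoring patch):

  if (prog)
          bpf_prog_inc(prog);
  err = bpf_op(dev, &xdp);
  if (err) {
          if (prog)
                  bpf_prog_put(prog); /* undo the bump on failure */
          return err;
  }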

>
> > @@ -8756,39 +8811,221 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
> >

[...]

> >
> > -     if (dev_xdp_prog_id(dev, XDP_MODE_DRV)) {
> > -             WARN_ON(dev_xdp_install(dev, XDP_MODE_DRV, ndo_bpf,
> > -                                     NULL, 0, NULL));
> > -             dev_xdp_set_prog(dev, XDP_MODE_DRV, NULL);
> > +     /* link supports only XDP mode flags */
> > +     if (link && (flags & ~XDP_FLAGS_MODES))
> > +             return -EINVAL;
>
> every one of the -errno returns needs an extack message explaining why

sure
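
E.g. for the check quoted above, something along these lines (a
sketch; the exact message wording is my guess):

  /* link supports only XDP mode flags */
  if (link && (flags & ~XDP_FLAGS_MODES)) {
          NL_SET_ERR_MSG(extack, "Invalid XDP mode flags for BPF link attachment");
          return -EINVAL;
  }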


* Re: [PATCH bpf-next 4/7] bpf: implement BPF XDP link-specific introspection APIs
  2020-07-13 14:32   ` [PATCH bpf-next 4/7] bpf: implement BPF XDP link-specific introspection APIs David Ahern
@ 2020-07-13 22:41     ` Andrii Nakryiko
  0 siblings, 0 replies; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-13 22:41 UTC (permalink / raw)
  To: David Ahern
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, Jakub Kicinski, Andrey Ignatov,
	Takshak Chahande, Toke Høiland-Jørgensen

On Mon, Jul 13, 2020 at 7:32 AM David Ahern <dsahern@gmail.com> wrote:
>
> On 7/10/20 4:49 PM, Andrii Nakryiko wrote:
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 025687120442..a9c634be8dd7 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -8973,6 +8973,35 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link)
> >       kfree(xdp_link);
> >  }
> >
> > +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
> > +                                  struct seq_file *seq)
> > +{
> > +     struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
> > +     u32 ifindex = 0;
> > +
> > +     rtnl_lock();
> > +     if (xdp_link->dev)
> > +             ifindex = xdp_link->dev->ifindex;
> > +     rtnl_unlock();
>
> In patch 2 you set dev but don't hold a refcnt on it, which is why you
> need the locking here. How do you know that the dev pointer is even
> valid here?
>
> If xdp_link is going to have a dev reference, you need to take the
> refcnt and handle NETDEV notifications to clean up the bpf_link when
> the device goes away.

Here I'm following the approach taken for cgroup and netns, where we
don't want to hold the cgroup (or the netns, for bpf_netns_link) with
an extra refcnt. The dev is guaranteed to be valid because
dev_xdp_uninstall() will be called (under rtnl_lock) before net_device
is removed/destroyed, and dev_xdp_uninstall() is the only one that can
set xdp_link->dev to NULL. So if we take rtnl_lock() and see a
non-NULL dev here, then at worst some other thread is waiting on the
rtnl lock in dev_xdp_uninstall(), and until this thread releases that
lock it's ok to query dev.

Even if we took an extra refcnt, any code (fill_info, show_fdinfo,
update, etc.) that does something with xdp_link->dev would still have
to take the lock anyway, because dev_xdp_uninstall() sets
xdp_link->dev to NULL.
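
So every accessor of xdp_link->dev ends up with the same shape as
show_fdinfo above. E.g., fill_link_info would presumably look like
this (a sketch; assumes an info->xdp.ifindex field added for this
link type):

  static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
                                         struct bpf_link_info *info)
  {
          struct bpf_xdp_link *xdp_link =
                  container_of(link, struct bpf_xdp_link, link);
          u32 ifindex = 0;

          rtnl_lock();
          /* non-NULL dev means dev_xdp_uninstall() hasn't run yet, so
           * the net_device can't have been freed while we hold rtnl
           */
          if (xdp_link->dev)
                  ifindex = xdp_link->dev->ifindex;
          rtnl_unlock();

          info->xdp.ifindex = ifindex;
          return 0;
  }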


* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
       [not found] ` <20200710224924.4087399-3-andriin@fb.com>
  2020-07-13 14:19   ` [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API David Ahern
@ 2020-07-14 13:57   ` Toke Høiland-Jørgensen
  2020-07-14 18:59     ` Andrii Nakryiko
  1 sibling, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2020-07-14 13:57 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, netdev, ast, daniel
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko, David Ahern,
	Jakub Kicinski, Andrey Ignatov, Takshak Chahande

Andrii Nakryiko <andriin@fb.com> writes:

> Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
> BPF_LINK_CREATE command.

I'm still not convinced this is a good idea. As far as I can tell, at
this point adding this gets you three things:

1. The ability to 'lock' an attachment in place.

2. Automatic detach on fd close

3. API unification with other uses of BPF_LINK_CREATE.


Of those, 1. is certainly useful, but can be trivially achieved with the
existing netlink API (add a flag on attach that prevents removal unless
the original prog_fd is supplied as EXPECTED_FD).

2. is IMO the wrong model for XDP, as I believe I argued the last time
we discussed this :)
In particular, in a situation with multiple XDP programs attached
through a dispatcher, the 'owner' application of each program don't
'own' the interface attachment anyway, so if using bpf_link for that it
would have to be pinned somewhere anyway. So the 'automatic detach'
feature is only useful in the "xdpd" deployment scenario, whereas in the
common usage model of command-line attachment ('ip link set xdp...') it
is something that needs to be worked around.

3. would be kinda nice, I guess, if we were designing the API from
scratch. But we already have an existing API, so IMO the cost of
duplication outweighs any benefits of API unification.

So why is XDP worth it? I assume you weigh this differently, but please
explain how. Ideally, this should have been in the commit message
already...

> bpf_xdp_link is mutually exclusive with direct BPF program attachment,
> previous BPF program should be detached prior to attempting to create a new
> bpf_xdp_link attachment (for a given XDP mode). Once link is attached, it
> can't be replaced by other BPF program attachment or link attachment. It will
> be detached only when the last BPF link FD is closed.

I was under the impression that forcible detachment of bpf_links was
already possible, but looking at the code now it doesn't appear to be?
Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
and force-remove it? I certainly think this should be added before we
expand bpf_link usage any more...
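
I.e., something like this (a sketch using the existing link ID/FD
libbpf helpers; the missing piece is a command to force-detach once
you actually hold the FD):

  /* walk all bpf_link IDs and grab FDs for inspection;
   * both helpers require CAP_SYS_ADMIN
   */
  __u32 id = 0;

  while (!bpf_link_get_next_id(id, &id)) {
          int fd = bpf_link_get_fd_by_id(id);

          if (fd < 0)
                  continue;
          /* inspect via bpf_obj_get_info_by_fd(), find the offending
           * link... but then there's no way to force-remove it
           */
          close(fd);
  }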

-Toke



* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-14 13:57   ` Toke Høiland-Jørgensen
@ 2020-07-14 18:59     ` Andrii Nakryiko
  2020-07-14 20:12       ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-14 18:59 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

On Tue, Jul 14, 2020 at 6:57 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andriin@fb.com> writes:
>
> > Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
> > BPF_LINK_CREATE command.
>
> I'm still not convinced this is a good idea. As far as I can tell, at
> this point adding this gets you three things:
>
> 1. The ability to 'lock' an attachment in place.
>
> 2. Automatic detach on fd close
>
> 3. API unification with other uses of BPF_LINK_CREATE.
>
>
> Of those, 1. is certainly useful, but can be trivially achieved with the
> existing netlink API (add a flag on attach that prevents removal unless
> the original prog_fd is supplied as EXPECTED_FD).

Given it's trivial to discover the attached prog FD on a given
ifindex, it doesn't add much peace of mind to the application that
installs the bpf_link. Any other XDP-enabled program (even some
trivial test program) can unknowingly break other applications by
deciding to "auto-cleanup" its previous instance on restart ("what's
my previous prog FD? let's replace it with my up-to-date program FD!
What do you mean it wasn't my prog FD before?"). We went over this
discussion many times already: relying on the correct behavior of
*other* applications, which you don't necessarily control, does not
work well in real production use cases.

>
> 2. is IMO the wrong model for XDP, as I believe I argued the last time
> we discussed this :)
> In particular, in a situation with multiple XDP programs attached
> through a dispatcher, the 'owner' application of each program doesn't
> 'own' the interface attachment anyway, so if using bpf_link for that it
> would have to be pinned somewhere anyway. So the 'automatic detach'
> feature is only useful in the "xdpd" deployment scenario, whereas in the
> common usage model of command-line attachment ('ip link set xdp...') it
> is something that needs to be worked around.

Right, nothing changed since we last discussed. There are cases where
one or another approach is more convenient. Having bpf_link for XDP
finally gives an option to have an auto-detaching (on last FD close)
approach, but you still insist there shouldn't be such an option. Why?

>
> 3. would be kinda nice, I guess, if we were designing the API from
> scratch. But we already have an existing API, so IMO the cost of
> duplication outweighs any benefits of API unification.

Not unification of BPF_LINK_CREATE, but unification of bpf_link
infrastructure in general, with its introspection and discoverability
APIs. bpftool can show which programs are attached where and it can
show PIDs of processes that own the BPF link. With CAP_BPF you also
now have more options to control who can mess with your bpf_link.
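
Any link FD (including one obtained by ID) can be queried generically,
e.g. (a sketch):

  struct bpf_link_info info;
  __u32 len = sizeof(info);

  memset(&info, 0, sizeof(info));
  if (!bpf_obj_get_info_by_fd(link_fd, &info, &len))
          /* info.type would be BPF_LINK_TYPE_XDP here */
          printf("link %u: prog %u\n", info.id, info.prog_id);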

>
> So why is XDP worth it? I assume you weigh this differently, but please
> explain how. Ideally, this should have been in the commit message
> already...

It's the 6th BPF link class we are implementing, so I didn't think I
needed to go over all the same general points again. I can point to
the patches originally adding BPF link for justification, I suppose.

>
> > bpf_xdp_link is mutually exclusive with direct BPF program attachment,
> > previous BPF program should be detached prior to attempting to create a new
> > bpf_xdp_link attachment (for a given XDP mode). Once link is attached, it
> > can't be replaced by other BPF program attachment or link attachment. It will
> > be detached only when the last BPF link FD is closed.
>
> I was under the impression that forcible detachment of bpf_links was
> already possible, but looking at the code now it doesn't appear to be?
> Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
> sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
> and force-remove it? I certainly think this should be added before we
> expand bpf_link usage any more...

I still maintain that killing processes that installed the bpf_link is
the better approach. Instead of letting the process believe and act as
if it has an active XDP program, while it doesn't, it's better to
altogether kill/restart the process.

>
> -Toke
>


* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-14 18:59     ` Andrii Nakryiko
@ 2020-07-14 20:12       ` Toke Høiland-Jørgensen
  2020-07-14 20:37         ` Andrii Nakryiko
  0 siblings, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2020-07-14 20:12 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Tue, Jul 14, 2020 at 6:57 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Andrii Nakryiko <andriin@fb.com> writes:
>>
>> > Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
>> > BPF_LINK_CREATE command.
>>
>> I'm still not convinced this is a good idea. As far as I can tell, at
>> this point adding this gets you three things:
>>
>> 1. The ability to 'lock' an attachment in place.
>>
>> 2. Automatic detach on fd close
>>
>> 3. API unification with other uses of BPF_LINK_CREATE.
>>
>>
>> Of those, 1. is certainly useful, but can be trivially achieved with the
>> existing netlink API (add a flag on attach that prevents removal unless
>> the original prog_fd is supplied as EXPECTED_FD).
>
> Given it's trivial to discover the attached prog FD on a given
> ifindex, it doesn't add much peace of mind to the application that
> installs the bpf_link. Any other XDP-enabled program (even some
> trivial test program) can unknowingly break other applications by
> deciding to "auto-cleanup" its previous instance on restart ("what's
> my previous prog FD? let's replace it with my up-to-date program FD!
> What do you mean it wasn't my prog FD before?"). We went over this
> discussion many times already: relying on the correct behavior of
> *other* applications, which you don't necessarily control, does not
> work well in real production use cases.

It's trivial to discover the attached *ID*. But the id-to-fd transition
requires CAP_SYS_ADMIN, which presumably you're not granting these
not-necessarily-well-behaved programs. Because if you are, what's
stopping them from just killing the owner of the bpf_link to clear it
("oh, must be a previous instance of myself that's still running, let's
clear that up")? Or what else am I missing here?

>> 2. is IMO the wrong model for XDP, as I believe I argued the last time
>> we discussed this :)
>> In particular, in a situation with multiple XDP programs attached
>> through a dispatcher, the 'owner' application of each program doesn't
>> 'own' the interface attachment anyway, so if using bpf_link for that it
>> would have to be pinned somewhere anyway. So the 'automatic detach'
>> feature is only useful in the "xdpd" deployment scenario, whereas in the
>> common usage model of command-line attachment ('ip link set xdp...') it
>> is something that needs to be worked around.
>
> Right, nothing changed since we last discussed. There are cases where
> one or another approach is more convenient. Having bpf_link for XDP
> finally gives an option to have an auto-detaching (on last FD close)
> approach, but you still insist there shouldn't be such an option. Why?

Because the last time we discussed this, it was in the context of me
trying to extend the existing API and being told "no, don't do that, use
bpf_link instead". So I'm objecting to bpf_link being a *replacement*
for the existing API; if that's not what you're intending, and we can
agree to keep both around and actively supported (including things like
adding that flag to the netlink API I talked about above), then that's a
totally different matter :)

>> 3. would be kinda nice, I guess, if we were designing the API from
>> scratch. But we already have an existing API, so IMO the cost of
>> duplication outweighs any benefits of API unification.
>
> Not unification of BPF_LINK_CREATE, but unification of bpf_link
> infrastructure in general, with its introspection and discoverability
> APIs. bpftool can show which programs are attached where and it can
> show PIDs of processes that own the BPF link.

Right, sure, I was using BPF_LINK_CREATE as a shorthand for bpf_link in
general.

> With CAP_BPF you also now have more options to control who can
> mess with your bpf_link.

What are those, exactly?

[...]

>> I was under the impression that forcible detachment of bpf_links was
>> already possible, but looking at the code now it doesn't appear to be?
>> Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
>> sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
>> and force-remove it? I certainly think this should be added before we
>> expand bpf_link usage any more...
>
> I still maintain that killing processes that installed the bpf_link is
> the better approach. Instead of letting the process believe and act as
> if it has an active XDP program, while it doesn't, it's better to
> altogether kill/restart the process.

Killing the process seems like a very blunt tool, though. Say it's a
daemon that attaches XDP programs to all available interfaces, but you
want to bring down an interface for some one-off maintenance task, but
the daemon authors neglected to provide an interface to tell the daemon
to detach from specific interfaces. If your only option is to kill the
daemon, you can't bring down that interface without disrupting whatever
that daemon is doing with XDP on all the other interfaces.

-Toke



* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-14 20:12       ` Toke Høiland-Jørgensen
@ 2020-07-14 20:37         ` Andrii Nakryiko
  2020-07-14 21:41           ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-14 20:37 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

On Tue, Jul 14, 2020 at 1:13 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Tue, Jul 14, 2020 at 6:57 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> Andrii Nakryiko <andriin@fb.com> writes:
> >>
> >> > Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
> >> > BPF_LINK_CREATE command.
> >>
> >> I'm still not convinced this is a good idea. As far as I can tell, at
> >> this point adding this gets you three things:
> >>
> >> 1. The ability to 'lock' an attachment in place.
> >>
> >> 2. Automatic detach on fd close
> >>
> >> 3. API unification with other uses of BPF_LINK_CREATE.
> >>
> >>
> >> Of those, 1. is certainly useful, but can be trivially achieved with the
> >> existing netlink API (add a flag on attach that prevents removal unless
> >> the original prog_fd is supplied as EXPECTED_FD).
> >
> > Given it's trivial to discover the attached prog FD on a given
> > ifindex, it doesn't add much peace of mind to the application that
> > installs the bpf_link. Any other XDP-enabled program (even some
> > trivial test program) can unknowingly break other applications by
> > deciding to "auto-cleanup" its previous instance on restart ("what's
> > my previous prog FD? let's replace it with my up-to-date program FD!
> > What do you mean it wasn't my prog FD before?"). We went over this
> > discussion many times already: relying on the correct behavior of
> > *other* applications, which you don't necessarily control, does not
> > work well in real production use cases.
>
> It's trivial to discover the attached *ID*. But the id-to-fd transition
> requires CAP_SYS_ADMIN, which presumably you're not granting these
> not-necessarily-well-behaved programs. Because if you are, what's
> stopping them from just killing the owner of the bpf_link to clear it
> ("oh, must be a previous instance of myself that's still running, let's
> clear that up")? Or what else am I missing here?

Well, I actually assumed CAP_SYS_ADMIN, given CAP_BPF is very-very
fresh. Without it yes, you can't go ID->FD.

But with CAP_SYS_ADMIN, you can't accidentally: 1) discover link ID,
2) link ID-to-FD 3) query link to discover prog ID 4) prog ID-to-FD 5)
replace with EXPECTED_FD, because that's not expected flow with link.
With link you just have to assume that there is nothing attached to
ifindex, otherwise it's up to admin to "recover".

While with prog FD-based permanent attachment, you assume that you
have to 1) discover prog ID 2) prog ID-to-FD 3) replace with your new
prog FD, setting EXPECTED_FD, because you have to assume that if you
crashed before, your old prog FD is still attached and you have to
detach/replace it on restart. With such assumption, distinguishing
"your old BPF prog" vs "someone else's active BPF prog" isn't simple.
And you might not even think about the latter case.

There is no 100%-fool-proof case, but there are very different flows
and assumptions, which I, hopefully, outlined above.

>
> >> 2. is IMO the wrong model for XDP, as I believe I argued the last time
> >> we discussed this :)
> >> In particular, in a situation with multiple XDP programs attached
> >> through a dispatcher, the 'owner' application of each program doesn't
> >> 'own' the interface attachment anyway, so if using bpf_link for that it
> >> would have to be pinned somewhere anyway. So the 'automatic detach'
> >> feature is only useful in the "xdpd" deployment scenario, whereas in the
> >> common usage model of command-line attachment ('ip link set xdp...') it
> >> is something that needs to be worked around.
> >
> > Right, nothing changed since we last discussed. There are cases where
> > one or another approach is more convenient. Having bpf_link for XDP
> > finally gives an option to have an auto-detaching (on last FD close)
> > approach, but you still insist there shouldn't be such an option. Why?
>
> Because the last time we discussed this, it was in the context of me
> trying to extend the existing API and being told "no, don't do that, use
> bpf_link instead". So I'm objecting to bpf_link being a *replacement*
> for the exiting API; if that's not what you're intending, and we can
> agree to keep both around and actively supported (including things like
> adding that flag to the netlink API I talked about above), then that's a
> totally different matter :)

Yes, we didn't want to extend what we still perceive as an unsafe,
error-prone API, given we had a better approach in mind. Thus the
opposition. But you've ultimately got EXPECTED_FD, so hopefully it
works well for your use cases.

There is no removal of APIs from the kernel. Prog FD attachment for
XDP is here to stay forever, did anyone ever indicate otherwise?
bpf_link is an alternative, just like bpf_link for cgroup is an
alternative to persistent BPF prog FD-based attachments, which we
can't remove, even if we want to.

>
> >> 3. would be kinda nice, I guess, if we were designing the API from
> >> scratch. But we already have an existing API, so IMO the cost of
> >> duplication outweighs any benefits of API unification.
> >
> > Not unification of BPF_LINK_CREATE, but unification of bpf_link
> > infrastructure in general, with its introspection and discoverability
> > APIs. bpftool can show which programs are attached where and it can
> > show PIDs of processes that own the BPF link.
>
> Right, sure, I was using BPF_LINK_CREATE as a shorthand for bpf_link in
> general.
>
> > With CAP_BPF you also now have more options to control who can
> > mess with your bpf_link.
>
> What are those, exactly?

I meant the flexibility of ID->FD conversion restrictions with
CAP_BPF. You get all of BPF with CAP_BPF, but no ID->FD conversion to
mess with a bpf_link (e.g., to update the underlying program).

>
> [...]
>
> >> I was under the impression that forcible detachment of bpf_links was
> >> already possible, but looking at the code now it doesn't appear to be?
> >> Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
> >> sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
> >> and force-remove it? I certainly think this should be added before we
> >> expand bpf_link usage any more...
> >
> > I still maintain that killing processes that installed the bpf_link is
> > the better approach. Instead of letting the process believe and act as
> > if it has an active XDP program, while it doesn't, it's better to
> > altogether kill/restart the process.
>
> Killing the process seems like a very blunt tool, though. Say it's a
> daemon that attaches XDP programs to all available interfaces, but you
> want to bring down an interface for some one-off maintenance task, but
> the daemon authors neglected to provide an interface to tell the daemon
> to detach from specific interfaces. If your only option is to kill the
> daemon, you can't bring down that interface without disrupting whatever
> that daemon is doing with XDP on all the other interfaces.
>

I'd rather avoid addressing made up hypothetical cases, really. Get a
better and more flexible daemon? Make it pin links, so you can delete
them, if necessary? Force-detaching is surely a way to address an
issue like this, but not necessarily the best or required one.

Killing a process is a blunt tool, of course, but one can argue that a
dead process is better than a misbehaving process. We can keep coming
up with ever more elaborate hypothetical examples, but I don't see
much point. This force-detach functionality isn't hard to add, but so
far we had no real reason to do that. Once we do have such use cases,
we can add it, if we agree it's still a good idea.

> -Toke
>


* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-14 20:37         ` Andrii Nakryiko
@ 2020-07-14 21:41           ` Toke Høiland-Jørgensen
  2020-07-14 22:26             ` Andrii Nakryiko
  0 siblings, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2020-07-14 21:41 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Tue, Jul 14, 2020 at 1:13 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>
>> > On Tue, Jul 14, 2020 at 6:57 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> >>
>> >> Andrii Nakryiko <andriin@fb.com> writes:
>> >>
>> >> > Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
>> >> > BPF_LINK_CREATE command.
>> >>
>> >> I'm still not convinced this is a good idea. As far as I can tell, at
>> >> this point adding this gets you three things:
>> >>
>> >> 1. The ability to 'lock' an attachment in place.
>> >>
>> >> 2. Automatic detach on fd close
>> >>
>> >> 3. API unification with other uses of BPF_LINK_CREATE.
>> >>
>> >>
>> >> Of those, 1. is certainly useful, but can be trivially achieved with the
>> >> existing netlink API (add a flag on attach that prevents removal unless
>> >> the original prog_fd is supplied as EXPECTED_FD).
>> >
>> > Given it's trivial to discover the attached prog FD on a given
>> > ifindex, it doesn't add much peace of mind to the application that
>> > installs the bpf_link. Any other XDP-enabled program (even some
>> > trivial test program) can unknowingly break other applications by
>> > deciding to "auto-cleanup" its previous instance on restart ("what's
>> > my previous prog FD? let's replace it with my up-to-date program FD!
>> > What do you mean it wasn't my prog FD before?"). We went over this
>> > discussion many times already: relying on the correct behavior of
>> > *other* applications, which you don't necessarily control, does not
>> > work well in real production use cases.
>>
>> It's trivial to discover the attached *ID*. But the id-to-fd transition
>> requires CAP_SYS_ADMIN, which presumably you're not granting these
>> not-necessarily-well-behaved programs. Because if you are, what's
>> stopping them from just killing the owner of the bpf_link to clear it
>> ("oh, must be a previous instance of myself that's still running, let's
>> clear that up")? Or what else am I missing here?
>
> Well, I actually assumed CAP_SYS_ADMIN, given CAP_BPF is very-very
> fresh. Without it yes, you can't go ID->FD.
>
> But with CAP_SYS_ADMIN, you can't accidentally: 1) discover link ID,
> 2) link ID-to-FD 3) query link to discover prog ID 4) prog ID-to-FD 5)
> replace with EXPECTED_FD, because that's not expected flow with link.
> With link you just have to assume that there is nothing attached to
> ifindex, otherwise it's up to admin to "recover".
>
> While with prog FD-based permanent attachment, you assume that you
> have to 1) discover prog ID 2) prog ID-to-FD 3) replace with your new
> prog FD, setting EXPECTED_FD, because you have to assume that if you
> crashed before, your old prog FD is still attached and you have to
> detach/replace it on restart. With such assumption, distinguishing
> "your old BPF prog" vs "someone else's active BPF prog" isn't simple.
> And you might not even think about the latter case.
>
> There is no 100%-fool-proof case, but there are very different flows
> and assumptions, which I, hopefully, outlined above.

Yup, that was helpful, thanks! I think our difference of opinion on this
stems from the same place as our disagreement about point 2.: You are
assuming that an application that uses XDP sticks around and holds on to
its bpf_link, while I'm assuming it doesn't (and so has to rely on
pinning for any persistence). In the latter case you don't really gain
much from the bpf_link auto-cleanup, and whether it's a prog fd or a
link fd you go find in your bpffs doesn't make that much difference...

>> >> 2. is IMO the wrong model for XDP, as I believe I argued the last time
>> >> we discussed this :)
>> >> In particular, in a situation with multiple XDP programs attached
>> >> through a dispatcher, the 'owner' application of each program doesn't
>> >> 'own' the interface attachment anyway, so if using bpf_link for that it
>> >> would have to be pinned somewhere anyway. So the 'automatic detach'
>> >> feature is only useful in the "xdpd" deployment scenario, whereas in the
>> >> common usage model of command-line attachment ('ip link set xdp...') it
>> >> is something that needs to be worked around.
>> >
>> > Right, nothing changed since we last discussed. There are cases where
>> > one or another approach is more convenient. Having bpf_link for XDP
>> > finally gives an option to have an auto-detaching (on last FD close)
>> > approach, but you still insist there shouldn't be such an option. Why?
>>
>> Because the last time we discussed this, it was in the context of me
>> trying to extend the existing API and being told "no, don't do that, use
>> bpf_link instead". So I'm objecting to bpf_link being a *replacement*
>> for the existing API; if that's not what you're intending, and we can
>> agree to keep both around and actively supported (including things like
>> adding that flag to the netlink API I talked about above), then that's a
>> totally different matter :)
>
> > Yes, we didn't want to extend what we still perceive as an unsafe,
> > error-prone API, given we had a better approach in mind. Thus the
> opposition. But you've ultimately got EXPECTED_FD, so hopefully it
> works well for your use cases.
>
> There is no removal of APIs from the kernel. Prog FD attachment for
> XDP is here to stay forever, did anyone ever indicate otherwise?
> bpf_link is an alternative, just like bpf_link for cgroup is an
> alternative to persistent BPF prog FD-based attachments, which we
> can't remove, even if we want to.
>
>> >> 3. would be kinda nice, I guess, if we were designing the API from
>> >> scratch. But we already have an existing API, so IMO the cost of
>> >> duplication outweighs any benefits of API unification.
>> >
>> > Not unification of BPF_LINK_CREATE, but unification of bpf_link
>> > infrastructure in general, with its introspection and discoverability
>> > APIs. bpftool can show which programs are attached where and it can
>> > show PIDs of processes that own the BPF link.
>>
>> Right, sure, I was using BPF_LINK_CREATE as a shorthand for bpf_link in
>> general.
>>
>> > With CAP_BPF you also now have more options to control who can
>> > mess with your bpf_link.
>>
>> What are those, exactly?
>
> I meant the flexibility of ID->FD conversion restrictions with
> CAP_BPF. You get all of BPF with CAP_BPF, but no ID->FD conversion to
> mess with a bpf_link (e.g., to update the underlying program).

Right, sure. I just don't consider that specific to bpf_link; that goes
for any type of fd...

>> [...]
>>
>> >> I was under the impression that forcible detachment of bpf_links was
>> >> already possible, but looking at the code now it doesn't appear to be?
>> >> Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
>> >> sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
>> >> and force-remove it? I certainly think this should be added before we
>> >> expand bpf_link usage any more...
>> >
>> > I still maintain that killing processes that installed the bpf_link is
>> > the better approach. Instead of letting the process believe and act as
>> > if it has an active XDP program, while it doesn't, it's better to
>> > altogether kill/restart the process.
>>
>> Killing the process seems like a very blunt tool, though. Say it's a
>> daemon that attaches XDP programs to all available interfaces, but you
>> want to bring down an interface for some one-off maintenance task, but
>> the daemon authors neglected to provide an interface to tell the daemon
>> to detach from specific interfaces. If your only option is to kill the
>> daemon, you can't bring down that interface without disrupting whatever
>> that daemon is doing with XDP on all the other interfaces.
>>
>
> I'd rather avoid addressing made up hypothetical cases, really. Get a
> better and more flexible daemon?

I know you guys don't consider an issue to be real until it has already
crashed something in production. But some of us don't have the luxury of
your fast issue-discovered-to-fix-shipped turnaround times; so instead
we have to go with the old-fashioned way of thinking about how things
can go wrong ahead of time, and making sure we're prepared to handle
issues if (or, all too often, when) they occur. And it's frankly
tiresome to have all such efforts be dismissed as "made up
hypotheticals". Please consider that the whole world does not operate
like your org, and that there may be legitimate reasons to do things
differently.

That being said...

> This force-detach functionality isn't hard to add, but so far we had
> no real reason to do that. Once we do have such use cases, we can add
> it, if we agree it's still a good idea.

...this is fair enough, and I guess I should just put this on my list of
things to add. I was just somehow under the impression that this had
already been added.

-Toke



* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-14 21:41           ` Toke Høiland-Jørgensen
@ 2020-07-14 22:26             ` Andrii Nakryiko
  2020-07-15 15:48               ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-14 22:26 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

On Tue, Jul 14, 2020 at 2:41 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Tue, Jul 14, 2020 at 1:13 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
> >>
> >> > On Tue, Jul 14, 2020 at 6:57 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >> >>
> >> >> Andrii Nakryiko <andriin@fb.com> writes:
> >> >>
> >> >> > Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
> >> >> > BPF_LINK_CREATE command.
> >> >>
> >> >> I'm still not convinced this is a good idea. As far as I can tell, at
> >> >> this point adding this gets you three things:
> >> >>
> >> >> 1. The ability to 'lock' an attachment in place.
> >> >>
> >> >> 2. Automatic detach on fd close
> >> >>
> >> >> 3. API unification with other uses of BPF_LINK_CREATE.
> >> >>
> >> >>
> >> >> Of those, 1. is certainly useful, but can be trivially achieved with the
> >> >> existing netlink API (add a flag on attach that prevents removal unless
> >> >> the original prog_fd is supplied as EXPECTED_FD).
> >> >
> >> > Given it's trivial to discover the attached prog FD on a given
> >> > ifindex, it doesn't add much peace of mind to the application that
> >> > installs the bpf_link. Any other XDP-enabled program (even some
> >> > trivial test program) can unknowingly break other applications by
> >> > deciding to "auto-cleanup" its previous instance on restart ("what's
> >> > my previous prog FD? let's replace it with my up-to-date program FD!
> >> > What do you mean it wasn't my prog FD before?"). We went over this
> >> > discussion many times already: relying on the correct behavior of
> >> > *other* applications, which you don't necessarily control, does not
> >> > work well in real production use cases.
> >>
> >> It's trivial to discover the attached *ID*. But the id-to-fd transition
> >> requires CAP_SYS_ADMIN, which presumably you're not granting these
> >> not-necessarily-well-behaved programs. Because if you are, what's
> >> stopping them from just killing the owner of the bpf_link to clear it
> >> ("oh, must be a previous instance of myself that's still running, let's
> >> clear that up")? Or what else am I missing here?
> >
> > Well, I actually assumed CAP_SYS_ADMIN, given CAP_BPF is very-very
> > fresh. Without it yes, you can't go ID->FD.
> >
> > But with CAP_SYS_ADMIN, you can't accidentally: 1) discover link ID,
> > 2) link ID-to-FD 3) query link to discover prog ID 4) prog ID-to-FD 5)
> > replace with EXPECTED_FD, because that's not expected flow with link.
> > With link you just have to assume that there is nothing attached to
> > ifindex, otherwise it's up to admin to "recover".
> >
> > While with prog FD-based permanent attachment, you assume that you
> > have to 1) discover prog ID 2) prog ID-to-FD 3) replace with your new
> > prog FD, setting EXPECTED_FD, because you have to assume that if you
> > crashed before, your old prog FD is still attached and you have to
> > detach/replace it on restart. With such assumption, distinguishing
> > "your old BPF prog" vs "someone else's active BPF prog" isn't simple.
> > And you might not even think about the latter case.
> >
> > There is no 100%-fool-proof case, but there are very different flows
> > and assumptions, which I, hopefully, outlined above.
>
> Yup, that was helpful, thanks! I think our difference of opinion on this
> stems from the same place as our disagreement about point 2.: You are
> assuming that an application that uses XDP sticks around and holds on to
> its bpf_link, while I'm assuming it doesn't (and so has to rely on
> pinning for any persistence). In the latter case you don't really gain
> much from the bpf_link auto-cleanup, and whether it's a prog fd or a
> link fd you go find in your bpffs doesn't make that much difference...

Right. But if I had to pick just one implementation (prog fd-based vs
bpf_link), I'd stick with bpf_link because it is flexible enough to
"emulate" prog fd attachment (through BPF FS), but the same isn't true
about prog fd attachment emulating bpf_link. That's it. I really don't
enjoy harping on that point, but it feels like it gets silently
dismissed all the time based on one particular arrangement for one
particular existing XDP flow.
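
Concretely (a sketch; bpf_link_create() is the existing libbpf
wrapper, and the pin path is illustrative):

  /* attach, then pin the link so it outlives the process, which is
   * exactly what prog-fd-style "permanent" attachment gives you
   */
  int link_fd = bpf_link_create(prog_fd, ifindex, BPF_XDP, NULL);

  if (link_fd >= 0 && !bpf_obj_pin(link_fd, "/sys/fs/bpf/xdp-eth0"))
          close(link_fd); /* link stays attached through the pin */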

>
> >> >> 2. is IMO the wrong model for XDP, as I believe I argued the last time
> >> >> we discussed this :)
> >> >> In particular, in a situation with multiple XDP programs attached
> >> >> through a dispatcher, the 'owner' application of each program doesn't
> >> >> 'own' the interface attachment anyway, so if using bpf_link for that it
> >> >> would have to be pinned somewhere anyway. So the 'automatic detach'
> >> >> feature is only useful in the "xdpd" deployment scenario, whereas in the
> >> >> common usage model of command-line attachment ('ip link set xdp...') it
> >> >> is something that needs to be worked around.
> >> >
> >> > Right, nothing changed since we last discussed. There are cases where
> >> > one or another approach is more convenient. Having bpf_link for XDP
> >> > finally gives an option to have an auto-detaching (on last FD close)
> >> > approach, but you still insist there shouldn't be such an option. Why?
> >>
> >> Because the last time we discussed this, it was in the context of me
> >> trying to extend the existing API and being told "no, don't do that, use
> >> bpf_link instead". So I'm objecting to bpf_link being a *replacement*
> >> for the existing API; if that's not what you're intending, and we can
> >> agree to keep both around and actively supported (including things like
> >> adding that flag to the netlink API I talked about above), then that's a
> >> totally different matter :)
> >
> >> > Yes, we didn't want to extend what we still perceive as an unsafe,
> >> > error-prone API, given we had a better approach in mind. Thus the
> > opposition. But you've ultimately got EXPECTED_FD, so hopefully it
> > works well for your use cases.
> >
> > There is no removal of APIs from the kernel. Prog FD attachment for
> > XDP is here to stay forever, did anyone ever indicate otherwise?
> > bpf_link is an alternative, just like bpf_link for cgroup is an
> > alternative to persistent BPF prog FD-based attachments, which we
> > can't remove, even if we want to.
> >
> >> >> 3. would be kinda nice, I guess, if we were designing the API from
> >> >> scratch. But we already have an existing API, so IMO the cost of
> >> >> duplication outweighs any benefits of API unification.
> >> >
> >> > Not unification of BPF_LINK_CREATE, but unification of bpf_link
> >> > infrastructure in general, with its introspection and discoverability
> >> > APIs. bpftool can show which programs are attached where and it can
> >> > show PIDs of processes that own the BPF link.
> >>
> >> Right, sure, I was using BPF_LINK_CREATE as a shorthand for bpf_link in
> >> general.
> >>
> >> > With CAP_BPF you now also have more options for controlling who can
> >> > mess with your bpf_link.
> >>
> >> What are those, exactly?
> >
> > I meant the flexibility to restrict ID->FD conversion with CAP_BPF. You
> > get all of BPF with CAP_BPF, but no ID->FD conversion to mess with a
> > bpf_link (e.g., to update its underlying program).
>
> Right, sure. I just don't consider that specific to bpf_link; that goes
> for any type of fd...

Fair enough. Unified introspection/discoverability are specific to
bpf_link, I'll stick to just that then.


>
> >> [...]
> >>
> >> >> I was under the impression that forcible detachment of bpf_links was
> >> >> already possible, but looking at the code now it doesn't appear to be?
> >> >> Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
> >> >> sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
> >> >> and force-remove it? I certainly think this should be added before we
> >> >> expand bpf_link usage any more...
> >> >
> >> > I still maintain that killing processes that installed the bpf_link is
> >> > the better approach. Instead of letting the process believe and act as
> >> > if it has an active XDP program, while it doesn't, it's better to
> >> > altogether kill/restart the process.
> >>
> >> Killing the process seems like a very blunt tool, though. Say it's a
> >> daemon that attaches XDP programs to all available interfaces, but you
> >> want to bring down an interface for some one-off maintenance task, but
> >> the daemon authors neglected to provide an interface to tell the daemon
> >> to detach from specific interfaces. If your only option is to kill the
> >> daemon, you can't bring down that interface without disrupting whatever
> >> that daemon is doing with XDP on all the other interfaces.
> >>
> >
> > I'd rather avoid addressing made-up hypothetical cases, really. Get
> > a better and more flexible daemon?
>
> I know you guys don't consider an issue to be real until it has already
> crashed something in production. But some of us don't have the luxury of
> your fast issue-discovered-to-fix-shipped turnaround times; so instead
> we have to go with the old-fashioned way of thinking about how things
> can go wrong ahead of time, and making sure we're prepared to handle
> issues if (or, all too often, when) they occur. And it's frankly
> tiresome to have all such efforts be dismissed as "made up
> hypotheticals". Please consider that the whole world does not operate
> like your org, and that there may be legitimate reasons to do things
> differently.
>

Having something break production is very hard evidence of a
problem and makes it easier to understand a **real** issue
and argue why something has to be solved or prevented. But it's not a
necessary condition, of course. It's just that made-up hypotheticals
aren't necessarily good ways to motivate a feature, because all too
often it turns out to be just that, a hypothetical issue, while the real
issue is different enough to warrant a different and better solution.
By being conservative with adding features, we are trying to not make
unnecessary design and API mistakes, because in the kernel they are
here to stay forever.

In your example, I'd argue that the design of that daemon is bad if it
doesn't allow you to control what's attached where, and how to detach
it without breaking completely independent network interfaces. That
should be the problem that has to be solved first, IMO. And the fact
that in some scenarios it might be **convenient** to force-detach a
bpf_link is in no way a good justification (in my book) to add that
feature, especially if there are other (and arguably safer) ways to
achieve the same **troubleshooting** effect.
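
For example (one possible such way, my illustration only, nothing above
prescribes it): with the link FD in hand, a troubleshooter could swap in
a no-op XDP_PASS program via LINK_UPDATE instead of force-detaching:

#include <bpf/bpf.h>

/* Neutralize an XDP link without detaching it: update the link to a
 * trivial program that just returns XDP_PASS. pass_prog_fd is assumed
 * to be such a program, already loaded; link_fd could come from
 * BPF_LINK_GET_FD_BY_ID. */
static int neutralize_xdp_link(int link_fd, int pass_prog_fd)
{
        /* NULL opts: unconditional update, no expected-prog check */
        return bpf_link_update(link_fd, pass_prog_fd, NULL);
}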

> That being said...
>
> > This force-detach functionality isn't hard to add, but so far we've had
> > no real reason to do that. Once we do have such use cases, we can add
> > it, if we agree it's still a good idea.
>
> ...this is fair enough, and I guess I should just put this on my list of
> things to add. I was just somehow under the impression that this had
> already been added.
>

So, overall, do I understand correctly that you are in principle not
opposed to adding the BPF XDP link, or do you still have some concerns
that were not addressed?

> -Toke
>


* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-14 22:26             ` Andrii Nakryiko
@ 2020-07-15 15:48               ` Toke Høiland-Jørgensen
  2020-07-15 20:54                 ` Andrii Nakryiko
  0 siblings, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2020-07-15 15:48 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

>> Yup, that was helpful, thanks! I think our difference of opinion on this
>> stems from the same place as our disagreement about point 2.: You are
>> assuming that an application that uses XDP sticks around and holds on to
>> its bpf_link, while I'm assuming it doesn't (and so has to rely on
>> pinning for any persistence). In the latter case you don't really gain
>> much from the bpf_link auto-cleanup, and whether it's a prog fd or a
>> link fd you go find in your bpffs doesn't make that much difference...
>
> Right. But if I had to pick just one implementation (prog fd-based vs
> bpf_link), I'd stick with bpf_link because it is flexible enough to
> "emulate" prog fd attachment (through BPF FS), but the same isn't true
> about prog fd attachment emulating bpf_link. That's it. I really don't
> enjoy harping on that point, but it feels like it keeps getting silently
> dismissed based on one particular arrangement for one particular existing
> XDP flow.

It can; kinda. But you introduce a dependency on bpffs that wasn't there
before, and you end up with resources that are kept around in the kernel
if the interface disappears (because they are still pinned). So I
consider it a poor emulation.

>> >> >> I was under the impression that forcible detachment of bpf_links was
>> >> >> already possible, but looking at the code now it doesn't appear to be?
>> >> >> Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
>> >> >> sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
>> >> >> and force-remove it? I certainly think this should be added before we
>> >> >> expand bpf_link usage any more...
>> >> >
>> >> > I still maintain that killing processes that installed the bpf_link is
>> >> > the better approach. Instead of letting the process believe and act as
>> >> > if it has an active XDP program, while it doesn't, it's better to
>> >> > altogether kill/restart the process.
>> >>
>> >> Killing the process seems like a very blunt tool, though. Say it's a
>> >> daemon that attaches XDP programs to all available interfaces, but you
>> >> want to bring down an interface for some one-off maintenance task, but
>> >> the daemon authors neglected to provide an interface to tell the daemon
>> >> to detach from specific interfaces. If your only option is to kill the
>> >> daemon, you can't bring down that interface without disrupting whatever
>> >> that daemon is doing with XDP on all the other interfaces.
>> >>
>> >
>> > I'd rather avoid addressing made-up hypothetical cases, really. Get
>> > a better and more flexible daemon?
>>
>> I know you guys don't consider an issue to be real until it has already
>> crashed something in production. But some of us don't have the luxury of
>> your fast issue-discovered-to-fix-shipped turnaround times; so instead
>> we have to go with the old-fashioned way of thinking about how things
>> can go wrong ahead of time, and making sure we're prepared to handle
>> issues if (or, all too often, when) they occur. And it's frankly
>> tiresome to have all such efforts be dismissed as "made up
>> hypotheticals". Please consider that the whole world does not operate
>> like your org, and that there may be legitimate reasons to do things
>> differently.
>>
>
> Having something break production is very hard evidence of a
> problem and makes it easier to understand a **real** issue
> and argue why something has to be solved or prevented. But it's not a
> necessary condition, of course. It's just that made-up hypotheticals
> aren't necessarily good ways to motivate a feature, because all too
> often it turns out to be just that, a hypothetical issue, while the real
> issue is different enough to warrant a different and better solution.
> By being conservative with adding features, we are trying to not make
> unnecessary design and API mistakes, because in the kernel they are
> here to stay forever.

I do appreciate the need to be conservative with the interfaces we add,
and I am aware of the maintenance burden. And it's not like I want
contingencies for any hypothetical I can think of put into the kernel
ahead of time (talk about a never-ending process :)). What I'm asking
for is just that something be argued on its merits, instead of
*automatically* being dismissed as "hypothetical". I.e., the difference
between "that can be handled by..." or "that is not likely to occur
because...", as opposed to "that has not happened yet, so come back when
it does".

> In your example, I'd argue that the design of that daemon is bad if it
> doesn't allow you to control what's attached where, and how to detach
> it without breaking completely independent network interfaces. That
> should be the problem that has to be solved first, IMO. And the fact
> that in some scenarios it might be **convenient** to force-detach a
> bpf_link is in no way a good justification (in my book) to add that
> feature, especially if there are other (and arguably safer) ways to
> achieve the same **troubleshooting** effect.

See, this is actually what I was asking for - considering a question on
its merits; so thank you! And yeah, it would be a poorly designed
daemon, but I'm not quite convinced that such daemons won't show up and
be put into production by, say, someone running an enterprise operating
system :)

Anyway, let's leave that particular issue aside for now, and I'll circle
back to adding the force-remove if needed once I've thought this over a
bit more.

>> That being said...
>>
>> > This force-detach functionality isn't hard to add, but so far we've had
>> > no real reason to do that. Once we do have such use cases, we can add
>> > it, if we agree it's still a good idea.
>>
>> ...this is fair enough, and I guess I should just put this on my list of
>> things to add. I was just somehow under the impression that this had
>> already been added.
>>
>
> So, overall, do I understand correctly that you are in principle not
> opposed to adding the BPF XDP link, or do you still have some concerns
> that were not addressed?

Broadly speaking, yeah, I am OK with adding this as a second interface. I
am still concerned about the "locking in place" issue we discussed
above, but that can be addressed later. I am also a little concerned
about adding a second interface for configuring an interface that has to
take the RTNL lock etc; but those are mostly details, I suppose.

Unfortunately I won't have time to review the latest version of your patch
set to look at those details before I go on vacation (sorry about that :/).
I guess I'll just have to leave that to you guys to take care of...

-Toke



* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-15 15:48               ` Toke Høiland-Jørgensen
@ 2020-07-15 20:54                 ` Andrii Nakryiko
  2020-07-16 10:52                   ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-15 20:54 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

On Wed, Jul 15, 2020 at 8:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> >> Yup, that was helpful, thanks! I think our difference of opinion on this
> >> stems from the same place as our disagreement about point 2.: You are
> >> assuming that an application that uses XDP sticks around and holds on to
> >> its bpf_link, while I'm assuming it doesn't (and so has to rely on
> >> pinning for any persistence). In the latter case you don't really gain
> >> much from the bpf_link auto-cleanup, and whether it's a prog fd or a
> >> link fd you go find in your bpffs doesn't make that much difference...
> >
> > Right. But if I had to pick just one implementation (prog fd-based vs
> > bpf_link), I'd stick with bpf_link because it is flexible enough to
> > "emulate" prog fd attachment (through BPF FS), but the same isn't true
> > about prog fd attachment emulating bpf_link. That's it. I really don't
> > enjoy harping on that point, but it feels like it keeps getting silently
> > dismissed based on one particular arrangement for one particular existing
> > XDP flow.
>
> It can; kinda. But you introduce a dependency on bpffs that wasn't there
> before, and you end up with resources that are kept around in the kernel
> if the interface disappears (because they are still pinned). So I
> consider it a poor emulation.

Yes, it's not exactly 100% the same semantics.
It is possible, with slight additions to the API, to support essentially
the same semantics you want with prog attachment. E.g., we can
either have a flag at LINK_CREATE time, or a separate command (e.g.,
LINK_PIN or something), that would mark the bpf_link as "sticky" and bump
its refcnt. What happens then is that even if the last FD is closed,
the refcnt is still 1, and then there are two ways to detach
that link:

1) the interface/cgroup/whatever is destroyed and the bpf_link is
auto-detached. At that point the auto-detach handler will see that it's a
"sticky" bpf_link, will decrement the refcnt and subsequently free the
bpf_link kernel object (unless some application still has an FD open, of
course).

2) a new LINK_DESTROY BPF command will be introduced, which will only
work with "sticky" bpf_links. It will decrement the refcnt and do the same
stuff as the auto-detach handler does today (so bpf_link->dev = NULL
for the XDP link).

I don't mind this, as long as this is not the default semantics and
requires conscious opt-in from whoever creates the link.
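
To sketch what such an opt-in could look like at the syscall level (the
flag name and value below are made up purely for illustration, nothing
like it exists in the UAPI):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Hypothetical flag -- does not exist in the UAPI. */
#define BPF_F_STICKY_LINK (1U << 0)

static int create_sticky_xdp_link(int prog_fd, int ifindex)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.link_create.prog_fd = prog_fd;
        attr.link_create.target_ifindex = ifindex;
        attr.link_create.attach_type = BPF_XDP;
        /* opt in: refcnt stays at 1 after the last FD is closed, so
         * only LINK_DESTROY or device teardown detaches the link */
        attr.link_create.flags = BPF_F_STICKY_LINK;
        return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}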

>
> >> >> >> I was under the impression that forcible detachment of bpf_links was
> >> >> >> already possible, but looking at the code now it doesn't appear to be?
> >> >> >> Wasn't that the whole point of BPF_LINK_GET_FD_BY_ID? I.e., that a
> >> >> >> sysadmin with CAP_SYS_ADMIN privs could grab the offending bpf_link FD
> >> >> >> and force-remove it? I certainly think this should be added before we
> >> >> >> expand bpf_link usage any more...
> >> >> >
> >> >> > I still maintain that killing processes that installed the bpf_link is
> >> >> > the better approach. Instead of letting the process believe and act as
> >> >> > if it has an active XDP program, while it doesn't, it's better to
> >> >> > altogether kill/restart the process.
> >> >>
> >> >> Killing the process seems like a very blunt tool, though. Say it's a
> >> >> daemon that attaches XDP programs to all available interfaces, but you
> >> >> want to bring down an interface for some one-off maintenance task, but
> >> >> the daemon authors neglected to provide an interface to tell the daemon
> >> >> to detach from specific interfaces. If your only option is to kill the
> >> >> daemon, you can't bring down that interface without disrupting whatever
> >> >> that daemon is doing with XDP on all the other interfaces.
> >> >>
> >> >
> >> > I'd rather avoid addressing made-up hypothetical cases, really. Get
> >> > a better and more flexible daemon?
> >>
> >> I know you guys don't consider an issue to be real until it has already
> >> crashed something in production. But some of us don't have the luxury of
> >> your fast issue-discovered-to-fix-shipped turnaround times; so instead
> >> we have to go with the old-fashioned way of thinking about how things
> >> can go wrong ahead of time, and making sure we're prepared to handle
> >> issues if (or, all too often, when) they occur. And it's frankly
> >> tiresome to have all such efforts be dismissed as "made up
> >> hypotheticals". Please consider that the whole world does not operate
> >> like your org, and that there may be legitimate reasons to do things
> >> differently.
> >>
> >
> > Having something break production is very hard evidence of a
> > problem and makes it easier to understand a **real** issue
> > and argue why something has to be solved or prevented. But it's not a
> > necessary condition, of course. It's just that made-up hypotheticals
> > aren't necessarily good ways to motivate a feature, because all too
> > often it turns out to be just that, a hypothetical issue, while the real
> > issue is different enough to warrant a different and better solution.
> > By being conservative with adding features, we are trying to not make
> > unnecessary design and API mistakes, because in the kernel they are
> > here to stay forever.
>
> I do appreciate the need to be conservative with the interfaces we add,
> and I am aware of the maintenance burden. And it's not like I want
> contingencies for any hypothetical I can think of put into the kernel
> ahead of time (talk about a never-ending process :)). What I'm asking
> for is just that something be argued on its merits, instead of
> *automatically* being dismissed as "hypothetical". I.e., the difference
> between "that can be handled by..." or "that is not likely to occur
> because...", as opposed to "that has not happened yet, so come back when
> it does".
>
> > In your example, I'd argue that the design of that daemon is bad if it
> > doesn't allow you to control what's attached where, and how to detach
> > it without breaking completely independent network interfaces. That
> > should be the problem that has to be solved first, IMO. And the fact
> > that in some scenarios it might be **convenient** to force-detach a
> > bpf_link is in no way a good justification (in my book) to add that
> > feature, especially if there are other (and arguably safer) ways to
> > achieve the same **troubleshooting** effect.
>
> See, this is actually what I was asking for - considering a question on
> its merits; so thank you!

Honestly, I was hoping this was more or less obvious, which is why I
didn't go into a lengthy explanation originally. There is only so much
time in a day, unfortunately. So the "hypothetical" example was more of an
"it doesn't happen today, but when it happens, it should be solved
differently", in my opinion, of course. But anyway, as you say, we
can leave this aside, no need to waste more time on this.

> And yeah, it would be a poorly designed
> daemon, but I'm not quite convinced that such daemons won't show up and
> be put into production by, say, someone running an enterprise operating
> system :)
>
> Anyway, let's leave that particular issue aside for now, and I'll circle
> back to adding the force-remove if needed once I've thought this over a
> bit more.
>
> >> That being said...
> >>
> >> > This force-detach functionality isn't hard to add, but so far we've had
> >> > no real reason to do that. Once we do have such use cases, we can add
> >> > it, if we agree it's still a good idea.
> >>
> >> ...this is fair enough, and I guess I should just put this on my list of
> >> things to add. I was just somehow under the impression that this had
> >> already been added.
> >>
> >
> > So, overall, do I understand correctly that you are in principle not
> > opposed to adding the BPF XDP link, or do you still have some concerns
> > that were not addressed?
>
> Broadly speaking, yeah, I am OK with adding this as a second interface. I
> am still concerned about the "locking in place" issue we discussed
> above, but that can be addressed later. I am also a little concerned

Right, see above for the "extensions" I outlined to make those bpf_links
have auto-cleanup semantics, similar to prog attachment today.

> about adding a second interface for configuring an interface that has to
> take the RTNL lock etc; but those are mostly details, I suppose.
>
> Unfortunately I won't have time to review the latest version of your patch
> set to look at those details before I go on vacation (sorry about that :/).
> I guess I'll just have to leave that to you guys to take care of...
>

Sure, thanks, enjoy your vacation! I'll post v3 then with build fixes
I have so far.

> -Toke
>


* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-15 20:54                 ` Andrii Nakryiko
@ 2020-07-16 10:52                   ` Toke Høiland-Jørgensen
  2020-07-22  6:45                     ` Andrii Nakryiko
  0 siblings, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2020-07-16 10:52 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Wed, Jul 15, 2020 at 8:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>
>> >> Yup, that was helpful, thanks! I think our difference of opinion on this
>> >> stems from the same place as our disagreement about point 2.: You are
>> >> assuming that an application that uses XDP sticks around and holds on to
>> >> its bpf_link, while I'm assuming it doesn't (and so has to rely on
>> >> pinning for any persistence). In the latter case you don't really gain
>> >> much from the bpf_link auto-cleanup, and whether it's a prog fd or a
>> >> link fd you go find in your bpffs doesn't make that much difference...
>> >
>> > Right. But if I had to pick just one implementation (prog fd-based vs
>> > bpf_link), I'd stick with bpf_link because it is flexible enough to
>> > "emulate" prog fd attachment (through BPF FS), but the same isn't true
>> > about prog fd attachment emulating bpf_link. That's it. I really don't
>> > enjoy harping on that point, but it feels like it keeps getting silently
>> > dismissed based on one particular arrangement for one particular existing
>> > XDP flow.
>>
>> It can; kinda. But you introduce a dependency on bpffs that wasn't there
>> before, and you end up with resources that are kept around in the kernel
>> if the interface disappears (because they are still pinned). So I
>> consider it a poor emulation.
>
> Yes, it's not exactly 100% the same semantics.
> It is possible, with slight additions to the API, to support essentially
> the same semantics you want with prog attachment. E.g., we can
> either have a flag at LINK_CREATE time, or a separate command (e.g.,
> LINK_PIN or something), that would mark the bpf_link as "sticky" and bump
> its refcnt. What happens then is that even if the last FD is closed,
> the refcnt is still 1, and then there are two ways to detach
> that link:
>
> 1) the interface/cgroup/whatever is destroyed and the bpf_link is
> auto-detached. At that point the auto-detach handler will see that it's a
> "sticky" bpf_link, will decrement the refcnt and subsequently free the
> bpf_link kernel object (unless some application still has an FD open, of
> course).
>
> 2) a new LINK_DESTROY BPF command will be introduced, which will only
> work with "sticky" bpf_links. It will decrement the refcnt and do the same
> stuff as the auto-detach handler does today (so bpf_link->dev = NULL
> for the XDP link).
>
> I don't mind this, as long as this is not the default semantics and
> requires conscious opt-in from whoever creates the link.

Now *this* I would like to see! I have the issue with component progs of
the multiprog dispatcher being pinned and making everything stick
around.

[...]

> Sure, thanks, enjoy your vacation! I'll post v3 then with build fixes
> I have so far.

Thanks! :)

-Toke



* Re: [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API
  2020-07-16 10:52                   ` Toke Høiland-Jørgensen
@ 2020-07-22  6:45                     ` Andrii Nakryiko
  0 siblings, 0 replies; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-22  6:45 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, David Ahern, Jakub Kicinski,
	Andrey Ignatov, Takshak Chahande

On Thu, Jul 16, 2020 at 3:52 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Wed, Jul 15, 2020 at 8:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
> >>
> >> >> Yup, that was helpful, thanks! I think our difference of opinion on this
> >> >> stems from the same place as our disagreement about point 2.: You are
> >> >> assuming that an application that uses XDP sticks around and holds on to
> >> >> its bpf_link, while I'm assuming it doesn't (and so has to rely on
> >> >> pinning for any persistence). In the latter case you don't really gain
> >> >> much from the bpf_link auto-cleanup, and whether it's a prog fd or a
> >> >> link fd you go find in your bpffs doesn't make that much difference...
> >> >
> >> > Right. But if I had to pick just one implementation (prog fd-based vs
> >> > bpf_link), I'd stick with bpf_link because it is flexible enough to
> >> > "emulate" prog fd attachment (through BPF FS), but the same isn't true
> >> > about prog fd attachment emulating bpf_link. That's it. I really don't
> >> > enjoy harping on that point, but it feels like it keeps getting silently
> >> > dismissed based on one particular arrangement for one particular existing
> >> > XDP flow.
> >>
> >> It can; kinda. But you introduce a dependency on bpffs that wasn't there
> >> before, and you end up with resources that are kept around in the kernel
> >> if the interface disappears (because they are still pinned). So I
> >> consider it a poor emulation.
> >
> > Yes, it's not exactly 100% the same semantics.
> > It is possible, with slight additions to the API, to support essentially
> > the same semantics you want with prog attachment. E.g., we can
> > either have a flag at LINK_CREATE time, or a separate command (e.g.,
> > LINK_PIN or something), that would mark the bpf_link as "sticky" and bump
> > its refcnt. What happens then is that even if the last FD is closed,
> > the refcnt is still 1, and then there are two ways to detach
> > that link:
> >
> > 1) the interface/cgroup/whatever is destroyed and the bpf_link is
> > auto-detached. At that point the auto-detach handler will see that it's a
> > "sticky" bpf_link, will decrement the refcnt and subsequently free the
> > bpf_link kernel object (unless some application still has an FD open, of
> > course).
> >
> > 2) a new LINK_DESTROY BPF command will be introduced, which will only
> > work with "sticky" bpf_links. It will decrement the refcnt and do the same
> > stuff as the auto-detach handler does today (so bpf_link->dev = NULL
> > for the XDP link).

So it seems there is interest in having this LINK_DESTROY command
irrespective of sticky/non-sticky BPF links. I'm going to follow up
with a patch set adding this as a LINK_DETACH command (I think that's a
more fitting name), generically, for the links it makes most sense for.
I.e., cgroup, netns and (now) xdp links are natural candidates, as
they already have to do this as part of auto-detach. I'll take a look
at tracing/bpf_iter links as well and see how hard it is to support
something like that. Of course, a `bpftool link detach` command needs to
be added, etc.
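
For illustration, the user-space side of that could be as small as the
following sketch (it assumes the final attr layout mirrors the other
link commands, which may not hold):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Sketch of the proposed LINK_DETACH; the command and the attr field
 * are assumed here and may end up named differently. */
static int link_detach(int link_fd)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        /* link_fd typically obtained via BPF_LINK_GET_FD_BY_ID */
        attr.link_detach.link_fd = link_fd;
        return syscall(__NR_bpf, BPF_LINK_DETACH, &attr, sizeof(attr));
}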

Sticky links are a bit more controversial and less clear from an API
standpoint, so we can consider them separately from LINK_DETACH.

> >
> > I don't mind this, as long as this is not the default semantics and
> > requires conscious opt-in from whoever creates the link.
>
> Now *this* I would like to see! I have the issue with component progs of
> the multiprog dispatcher being pinned and making everything stick
> around.
>
> [...]
>
> > Sure, thanks, enjoy your vacation! I'll post v3 then with build fixes
> > I have so far.
>
> Thanks! :)
>
> -Toke
>


* [PATCH bpf-next 4/7] bpf: implement BPF XDP link-specific introspection APIs
  2020-07-13  5:12 [PATCH bpf-next 0/7] BPF XDP link Andrii Nakryiko
@ 2020-07-13  5:12 ` Andrii Nakryiko
  0 siblings, 0 replies; 15+ messages in thread
From: Andrii Nakryiko @ 2020-07-13  5:12 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko, Andrey Ignatov,
	Takshak Chahande

Implement XDP link-specific show_fdinfo and fill_link_info callbacks to
emit the ifindex.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 include/uapi/linux/bpf.h |  3 +++
 net/core/dev.c           | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 41eba148217b..533eb4fe4e03 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3985,6 +3985,9 @@ struct bpf_link_info {
 			__u32 netns_ino;
 			__u32 attach_type;
 		} netns;
+		struct {
+			__u32 ifindex;
+		} xdp;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 025687120442..a9c634be8dd7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8973,6 +8973,35 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link)
 	kfree(xdp_link);
 }
 
+static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
+				     struct seq_file *seq)
+{
+	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+	u32 ifindex = 0;
+
+	rtnl_lock();
+	if (xdp_link->dev)
+		ifindex = xdp_link->dev->ifindex;
+	rtnl_unlock();
+
+	seq_printf(seq, "ifindex:\t%u\n", ifindex);
+}
+
+static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
+				       struct bpf_link_info *info)
+{
+	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+	u32 ifindex = 0;
+
+	rtnl_lock();
+	if (xdp_link->dev)
+		ifindex = xdp_link->dev->ifindex;
+	rtnl_unlock();
+
+	info->xdp.ifindex = ifindex;
+	return 0;
+}
+
 static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 			       struct bpf_prog *old_prog)
 {
@@ -9018,6 +9047,8 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 static const struct bpf_link_ops bpf_xdp_link_lops = {
 	.release = bpf_xdp_link_release,
 	.dealloc = bpf_xdp_link_dealloc,
+	.show_fdinfo = bpf_xdp_link_show_fdinfo,
+	.fill_link_info = bpf_xdp_link_fill_link_info,
 	.update_prog = bpf_xdp_link_update,
 };
 
-- 
2.24.1
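
For illustration (not part of the patch), a user-space consumer could
read the new field back via libbpf roughly like so, assuming the
BPF_LINK_TYPE_XDP value added earlier in the series (error handling
omitted):

#include <stdio.h>
#include <string.h>
#include <bpf/bpf.h>

/* Query bpf_link_info for an XDP link FD and print the ifindex filled
 * in by bpf_xdp_link_fill_link_info() above. */
static void print_xdp_link_ifindex(int link_fd)
{
        struct bpf_link_info info;
        __u32 len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (bpf_obj_get_info_by_fd(link_fd, &info, &len))
                return;
        if (info.type == BPF_LINK_TYPE_XDP)
                printf("ifindex: %u\n", info.xdp.ifindex);
}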



end of thread

Thread overview: 15+ messages
     [not found] <20200710224924.4087399-1-andriin@fb.com>
     [not found] ` <20200710224924.4087399-3-andriin@fb.com>
2020-07-13 14:19   ` [PATCH bpf-next 2/7] bpf, xdp: add bpf_link-based XDP attachment API David Ahern
2020-07-13 22:33     ` Andrii Nakryiko
2020-07-14 13:57   ` Toke Høiland-Jørgensen
2020-07-14 18:59     ` Andrii Nakryiko
2020-07-14 20:12       ` Toke Høiland-Jørgensen
2020-07-14 20:37         ` Andrii Nakryiko
2020-07-14 21:41           ` Toke Høiland-Jørgensen
2020-07-14 22:26             ` Andrii Nakryiko
2020-07-15 15:48               ` Toke Høiland-Jørgensen
2020-07-15 20:54                 ` Andrii Nakryiko
2020-07-16 10:52                   ` Toke Høiland-Jørgensen
2020-07-22  6:45                     ` Andrii Nakryiko
     [not found] ` <20200710224924.4087399-5-andriin@fb.com>
2020-07-13 14:32   ` [PATCH bpf-next 4/7] bpf: implement BPF XDP link-specific introspection APIs David Ahern
2020-07-13 22:41     ` Andrii Nakryiko
2020-07-13  5:12 [PATCH bpf-next 0/7] BPF XDP link Andrii Nakryiko
2020-07-13  5:12 ` [PATCH bpf-next 4/7] bpf: implement BPF XDP link-specific introspection APIs Andrii Nakryiko
