bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH bpf-next] bpf: add bpf_link_new_file that doesn't install FD
@ 2020-03-09 23:10 Andrii Nakryiko
  2020-03-10 21:45 ` John Fastabend
  2020-03-11 13:22 ` Daniel Borkmann
  0 siblings, 2 replies; 4+ messages in thread
From: Andrii Nakryiko @ 2020-03-09 23:10 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel; +Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko

Add bpf_link_new_file() API for cases when we need to ensure anon_inode is
successfully created before we proceed with expensive BPF program attachment
procedure, which will require equally (if not more so) expensive and
potentially failing compensation detachment procedure just because anon_inode
creation failed. This API allows to simplify code by ensuring first that
anon_inode is created and after BPF program is attached proceed with
fd_install() that can't fail.

After anon_inode file is created, link can't be just kfree()'d anymore,
because its destruction will be performed by deferred file_operations->release
call. For this, bpf_link API required specifying two separate operations:
release() and dealloc(), former performing detachment only, while the latter
frees memory used by bpf_link itself. dealloc() needs to be specified, because
struct bpf_link is frequently embedded into link type-specific container
struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to
properly free the memory. In case when anon_inode file was successfully
created, but subsequent BPF attachment failed, bpf_link needs to be marked as
"defunct", so that file's release() callback will perform only memory
deallocation, but no detachment.

Convert raw tracepoint and tracing attachment to new API and eliminate
detachment from error handling path.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 include/linux/bpf.h  |   3 ++
 kernel/bpf/syscall.c | 122 +++++++++++++++++++++++++++++++------------
 2 files changed, 91 insertions(+), 34 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 94a329b9da81..4fd91b7c95ea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1070,13 +1070,16 @@ struct bpf_link;
 
 struct bpf_link_ops {
 	void (*release)(struct bpf_link *link);
+	void (*dealloc)(struct bpf_link *link);
 };
 
 void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
 		   struct bpf_prog *prog);
+void bpf_link_defunct(struct bpf_link *link);
 void bpf_link_inc(struct bpf_link *link);
 void bpf_link_put(struct bpf_link *link);
 int bpf_link_new_fd(struct bpf_link *link);
+struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7ce0815793dd..b2f73ecacced 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2188,6 +2188,11 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
 	link->prog = prog;
 }
 
+void bpf_link_defunct(struct bpf_link *link)
+{
+	link->prog = NULL;
+}
+
 void bpf_link_inc(struct bpf_link *link)
 {
 	atomic64_inc(&link->refcnt);
@@ -2196,14 +2201,13 @@ void bpf_link_inc(struct bpf_link *link)
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
-	struct bpf_prog *prog;
-
-	/* remember prog locally, because release below will free link memory */
-	prog = link->prog;
-	/* extra clean up and kfree of container link struct */
-	link->ops->release(link);
-	/* no more accesing of link members after this point */
-	bpf_prog_put(prog);
+	if (link->prog) {
+		/* detach BPF program, clean up used resources */
+		link->ops->release(link);
+		bpf_prog_put(link->prog);
+	}
+	/* free bpf_link and its containing memory */
+	link->ops->dealloc(link);
 }
 
 static void bpf_link_put_deferred(struct work_struct *work)
@@ -2281,6 +2285,33 @@ int bpf_link_new_fd(struct bpf_link *link)
 	return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
 }
 
+/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
+ * instead of immediately installing fd in fdtable, just reserve it and
+ * return. Caller then need to either install it with fd_install(fd, file) or
+ * release with put_unused_fd(fd).
+ * This is useful for cases when bpf_link attachment/detachment are
+ * complicated and expensive operations and should be delayed until all the fd
+ * reservation and anon_inode creation succeeds.
+ */
+struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+{
+	struct file *file;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return file;
+	}
+
+	*reserved_fd = fd;
+	return file;
+}
+
 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 {
 	struct fd f = fdget(ufd);
@@ -2305,21 +2336,27 @@ struct bpf_tracing_link {
 };
 
 static void bpf_tracing_link_release(struct bpf_link *link)
+{
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
+}
+
+static void bpf_tracing_link_dealloc(struct bpf_link *link)
 {
 	struct bpf_tracing_link *tr_link =
 		container_of(link, struct bpf_tracing_link, link);
 
-	WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
 	kfree(tr_link);
 }
 
 static const struct bpf_link_ops bpf_tracing_link_lops = {
 	.release = bpf_tracing_link_release,
+	.dealloc = bpf_tracing_link_dealloc,
 };
 
 static int bpf_tracing_prog_attach(struct bpf_prog *prog)
 {
 	struct bpf_tracing_link *link;
+	struct file *link_file;
 	int link_fd, err;
 
 	if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
@@ -2337,20 +2374,24 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
 	}
 	bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
 
-	err = bpf_trampoline_link_prog(prog);
-	if (err)
-		goto out_free_link;
+	link_file = bpf_link_new_file(&link->link, &link_fd);
+	if (IS_ERR(link_file)) {
+		kfree(link);
+		err = PTR_ERR(link_file);
+		goto out_put_prog;
+	}
 
-	link_fd = bpf_link_new_fd(&link->link);
-	if (link_fd < 0) {
-		WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
-		err = link_fd;
-		goto out_free_link;
+	err = bpf_trampoline_link_prog(prog);
+	if (err) {
+		bpf_link_defunct(&link->link);
+		fput(link_file);
+		put_unused_fd(link_fd);
+		goto out_put_prog;
 	}
+
+	fd_install(link_fd, link_file);
 	return link_fd;
 
-out_free_link:
-	kfree(link);
 out_put_prog:
 	bpf_prog_put(prog);
 	return err;
@@ -2368,19 +2409,28 @@ static void bpf_raw_tp_link_release(struct bpf_link *link)
 
 	bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
 	bpf_put_raw_tracepoint(raw_tp->btp);
+}
+
+static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_raw_tp_link *raw_tp =
+		container_of(link, struct bpf_raw_tp_link, link);
+
 	kfree(raw_tp);
 }
 
 static const struct bpf_link_ops bpf_raw_tp_lops = {
 	.release = bpf_raw_tp_link_release,
+	.dealloc = bpf_raw_tp_link_dealloc,
 };
 
 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
 
 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 {
-	struct bpf_raw_tp_link *raw_tp;
+	struct bpf_raw_tp_link *link;
 	struct bpf_raw_event_map *btp;
+	struct file *link_file;
 	struct bpf_prog *prog;
 	const char *tp_name;
 	char buf[128];
@@ -2431,28 +2481,32 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 		goto out_put_prog;
 	}
 
-	raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
-	if (!raw_tp) {
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
 		err = -ENOMEM;
 		goto out_put_btp;
 	}
-	bpf_link_init(&raw_tp->link, &bpf_raw_tp_lops, prog);
-	raw_tp->btp = btp;
+	bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+	link->btp = btp;
 
-	err = bpf_probe_register(raw_tp->btp, prog);
-	if (err)
-		goto out_free_tp;
+	link_file = bpf_link_new_file(&link->link, &link_fd);
+	if (IS_ERR(link_file)) {
+		kfree(link);
+		err = PTR_ERR(link_file);
+		goto out_put_btp;
+	}
 
-	link_fd = bpf_link_new_fd(&raw_tp->link);
-	if (link_fd < 0) {
-		bpf_probe_unregister(raw_tp->btp, prog);
-		err = link_fd;
-		goto out_free_tp;
+	err = bpf_probe_register(link->btp, prog);
+	if (err) {
+		bpf_link_defunct(&link->link);
+		fput(link_file);
+		put_unused_fd(link_fd);
+		goto out_put_btp;
 	}
+
+	fd_install(link_fd, link_file);
 	return link_fd;
 
-out_free_tp:
-	kfree(raw_tp);
 out_put_btp:
 	bpf_put_raw_tracepoint(btp);
 out_put_prog:
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [PATCH bpf-next] bpf: add bpf_link_new_file that doesn't install FD
  2020-03-09 23:10 [PATCH bpf-next] bpf: add bpf_link_new_file that doesn't install FD Andrii Nakryiko
@ 2020-03-10 21:45 ` John Fastabend
  2020-03-11 13:22 ` Daniel Borkmann
  1 sibling, 0 replies; 4+ messages in thread
From: John Fastabend @ 2020-03-10 21:45 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, netdev, ast, daniel
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko

Andrii Nakryiko wrote:
> Add bpf_link_new_file() API for cases when we need to ensure anon_inode is
> successfully created before we proceed with expensive BPF program attachment
> procedure, which will require equally (if not more so) expensive and
> potentially failing compensation detachment procedure just because anon_inode
> creation failed. This API allows to simplify code by ensuring first that
> anon_inode is created and after BPF program is attached proceed with
> fd_install() that can't fail.
> 
> After anon_inode file is created, link can't be just kfree()'d anymore,
> because its destruction will be performed by deferred file_operations->release
> call. For this, bpf_link API required specifying two separate operations:
> release() and dealloc(), former performing detachment only, while the latter
> frees memory used by bpf_link itself. dealloc() needs to be specified, because
> struct bpf_link is frequently embedded into link type-specific container
> struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to
> properly free the memory. In case when anon_inode file was successfully
> created, but subsequent BPF attachment failed, bpf_link needs to be marked as
> "defunct", so that file's release() callback will perform only memory
> deallocation, but no detachment.
> 
> Convert raw tracepoint and tracing attachment to new API and eliminate
> detachment from error handling path.
> 
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> ---

LGTM

Acked-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH bpf-next] bpf: add bpf_link_new_file that doesn't install FD
  2020-03-09 23:10 [PATCH bpf-next] bpf: add bpf_link_new_file that doesn't install FD Andrii Nakryiko
  2020-03-10 21:45 ` John Fastabend
@ 2020-03-11 13:22 ` Daniel Borkmann
  2020-03-11 15:21   ` Andrii Nakryiko
  1 sibling, 1 reply; 4+ messages in thread
From: Daniel Borkmann @ 2020-03-11 13:22 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, netdev, ast; +Cc: andrii.nakryiko, kernel-team

On 3/10/20 12:10 AM, Andrii Nakryiko wrote:
> Add bpf_link_new_file() API for cases when we need to ensure anon_inode is
> successfully created before we proceed with expensive BPF program attachment
> procedure, which will require equally (if not more so) expensive and
> potentially failing compensation detachment procedure just because anon_inode
> creation failed. This API allows to simplify code by ensuring first that
> anon_inode is created and after BPF program is attached proceed with
> fd_install() that can't fail.
> 
> After anon_inode file is created, link can't be just kfree()'d anymore,
> because its destruction will be performed by deferred file_operations->release
> call. For this, bpf_link API required specifying two separate operations:
> release() and dealloc(), former performing detachment only, while the latter
> frees memory used by bpf_link itself. dealloc() needs to be specified, because
> struct bpf_link is frequently embedded into link type-specific container
> struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to
> properly free the memory. In case when anon_inode file was successfully
> created, but subsequent BPF attachment failed, bpf_link needs to be marked as
> "defunct", so that file's release() callback will perform only memory
> deallocation, but no detachment.
> 
> Convert raw tracepoint and tracing attachment to new API and eliminate
> detachment from error handling path.
> 
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>

Applied, but ...

[...]
> @@ -2337,20 +2374,24 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
>   	}
>   	bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
>   
> -	err = bpf_trampoline_link_prog(prog);
> -	if (err)
> -		goto out_free_link;
> +	link_file = bpf_link_new_file(&link->link, &link_fd);
> +	if (IS_ERR(link_file)) {
> +		kfree(link);
> +		err = PTR_ERR(link_file);
> +		goto out_put_prog;
> +	}
>   
> -	link_fd = bpf_link_new_fd(&link->link);
> -	if (link_fd < 0) {
> -		WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> -		err = link_fd;
> -		goto out_free_link;
> +	err = bpf_trampoline_link_prog(prog);
> +	if (err) {
> +		bpf_link_defunct(&link->link);
> +		fput(link_file);
> +		put_unused_fd(link_fd);

Given the tear-down in error case requires 3 manual steps here, I think this begs
for a small helper.

> +		goto out_put_prog;
>   	}
> +
> +	fd_install(link_fd, link_file);
>   	return link_fd;
>   
[...]
> @@ -2431,28 +2481,32 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
>   		goto out_put_prog;
>   	}
>   
[...]
>   
> -	link_fd = bpf_link_new_fd(&raw_tp->link);
> -	if (link_fd < 0) {
> -		bpf_probe_unregister(raw_tp->btp, prog);
> -		err = link_fd;
> -		goto out_free_tp;
> +	err = bpf_probe_register(link->btp, prog);
> +	if (err) {
> +		bpf_link_defunct(&link->link);
> +		fput(link_file);
> +		put_unused_fd(link_fd);

Especially since you need it in multiple places; please follow-up.

> +		goto out_put_btp;
>   	}
> +
> +	fd_install(link_fd, link_file);
>   	return link_fd;
>   
> -out_free_tp:
> -	kfree(raw_tp);
>   out_put_btp:
>   	bpf_put_raw_tracepoint(btp);
>   out_put_prog:
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH bpf-next] bpf: add bpf_link_new_file that doesn't install FD
  2020-03-11 13:22 ` Daniel Borkmann
@ 2020-03-11 15:21   ` Andrii Nakryiko
  0 siblings, 0 replies; 4+ messages in thread
From: Andrii Nakryiko @ 2020-03-11 15:21 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov, Kernel Team

On Wed, Mar 11, 2020 at 6:22 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 3/10/20 12:10 AM, Andrii Nakryiko wrote:
> > Add bpf_link_new_file() API for cases when we need to ensure anon_inode is
> > successfully created before we proceed with expensive BPF program attachment
> > procedure, which will require equally (if not more so) expensive and
> > potentially failing compensation detachment procedure just because anon_inode
> > creation failed. This API allows to simplify code by ensuring first that
> > anon_inode is created and after BPF program is attached proceed with
> > fd_install() that can't fail.
> >
> > After anon_inode file is created, link can't be just kfree()'d anymore,
> > because its destruction will be performed by deferred file_operations->release
> > call. For this, bpf_link API required specifying two separate operations:
> > release() and dealloc(), former performing detachment only, while the latter
> > frees memory used by bpf_link itself. dealloc() needs to be specified, because
> > struct bpf_link is frequently embedded into link type-specific container
> > struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to
> > properly free the memory. In case when anon_inode file was successfully
> > created, but subsequent BPF attachment failed, bpf_link needs to be marked as
> > "defunct", so that file's release() callback will perform only memory
> > deallocation, but no detachment.
> >
> > Convert raw tracepoint and tracing attachment to new API and eliminate
> > detachment from error handling path.
> >
> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
>
> Applied, but ...
>
> [...]
> > @@ -2337,20 +2374,24 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
> >       }
> >       bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
> >
> > -     err = bpf_trampoline_link_prog(prog);
> > -     if (err)
> > -             goto out_free_link;
> > +     link_file = bpf_link_new_file(&link->link, &link_fd);
> > +     if (IS_ERR(link_file)) {
> > +             kfree(link);
> > +             err = PTR_ERR(link_file);
> > +             goto out_put_prog;
> > +     }
> >
> > -     link_fd = bpf_link_new_fd(&link->link);
> > -     if (link_fd < 0) {
> > -             WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> > -             err = link_fd;
> > -             goto out_free_link;
> > +     err = bpf_trampoline_link_prog(prog);
> > +     if (err) {
> > +             bpf_link_defunct(&link->link);
> > +             fput(link_file);
> > +             put_unused_fd(link_fd);
>
> Given the tear-down in error case requires 3 manual steps here, I think this begs
> for a small helper.

Sounds good, will follow up. Thanks for applying!

>
> > +             goto out_put_prog;
> >       }
> > +
> > +     fd_install(link_fd, link_file);
> >       return link_fd;
> >
> [...]
> > @@ -2431,28 +2481,32 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
> >               goto out_put_prog;
> >       }
> >
> [...]
> >
> > -     link_fd = bpf_link_new_fd(&raw_tp->link);
> > -     if (link_fd < 0) {
> > -             bpf_probe_unregister(raw_tp->btp, prog);
> > -             err = link_fd;
> > -             goto out_free_tp;
> > +     err = bpf_probe_register(link->btp, prog);
> > +     if (err) {
> > +             bpf_link_defunct(&link->link);
> > +             fput(link_file);
> > +             put_unused_fd(link_fd);
>
> Especially since you need it in multiple places; please follow-up.
>
> > +             goto out_put_btp;
> >       }
> > +
> > +     fd_install(link_fd, link_file);
> >       return link_fd;
> >
> > -out_free_tp:
> > -     kfree(raw_tp);
> >   out_put_btp:
> >       bpf_put_raw_tracepoint(btp);
> >   out_put_prog:
> >
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-03-11 15:22 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-09 23:10 [PATCH bpf-next] bpf: add bpf_link_new_file that doesn't install FD Andrii Nakryiko
2020-03-10 21:45 ` John Fastabend
2020-03-11 13:22 ` Daniel Borkmann
2020-03-11 15:21   ` Andrii Nakryiko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).