From: Andrii Nakryiko <andriin@fb.com>
To: <bpf@vger.kernel.org>, <netdev@vger.kernel.org>, <ast@fb.com>,
<daniel@iogearbox.net>
Cc: <andrii.nakryiko@gmail.com>, <kernel-team@fb.com>,
Andrii Nakryiko <andriin@fb.com>
Subject: [RFC PATCH bpf-next 3/8] bpf: allocate ID for bpf_link
Date: Fri, 3 Apr 2020 17:09:42 -0700 [thread overview]
Message-ID: <20200404000948.3980903-4-andriin@fb.com> (raw)
In-Reply-To: <20200404000948.3980903-1-andriin@fb.com>
Generate ID for each bpf_link using IDR, similarly to bpf_map and bpf_prog.
bpf_link creation, initialization, attachment, and exposing to user-space
through FD and ID is a complicated multi-step process, abstract it away
through bpf_link_primer and bpf_link_prime(), bpf_link_settle(), and
bpf_link_cleanup() internal API. They guarantee that until bpf_link is
properly attached, user-space won't be able to access partially-initialized
bpf_link either from FD or ID. All this allows to simplify bpf_link attachment
and error handling code.
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
include/linux/bpf.h | 17 +++--
include/uapi/linux/bpf.h | 1 +
kernel/bpf/cgroup.c | 14 ++--
kernel/bpf/syscall.c | 140 +++++++++++++++++++++++++++------------
4 files changed, 116 insertions(+), 56 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3474f8e34a63..67ce74890911 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1085,11 +1085,19 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
struct bpf_link {
atomic64_t refcnt;
+ u32 id;
const struct bpf_link_ops *ops;
struct bpf_prog *prog;
struct work_struct work;
};
+struct bpf_link_primer {
+ struct bpf_link *link;
+ struct file *file;
+ int fd;
+ u32 id;
+};
+
struct bpf_link_ops {
void (*release)(struct bpf_link *link);
void (*dealloc)(struct bpf_link *link);
@@ -1097,10 +1105,11 @@ struct bpf_link_ops {
struct bpf_prog *old_prog);
};
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
- struct bpf_prog *prog);
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
- int link_fd);
+void bpf_link_init(struct bpf_link *link,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog);
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
+int bpf_link_settle(struct bpf_link_primer *primer);
+void bpf_link_cleanup(struct bpf_link_primer *primer);
void bpf_link_inc(struct bpf_link *link);
void bpf_link_put(struct bpf_link *link);
int bpf_link_new_fd(struct bpf_link *link, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e29a671d67e..eccfd1dea951 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -523,6 +523,7 @@ union bpf_attr {
__u32 prog_id;
__u32 map_id;
__u32 btf_id;
+ __u32 link_id;
};
__u32 next_id;
__u32 open_flags;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 54eacc44d1e4..ae84c5c90631 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -841,10 +841,10 @@ const struct bpf_link_ops bpf_cgroup_link_lops = {
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_cgroup_link *link;
- struct file *link_file;
struct cgroup *cgrp;
- int err, link_fd;
+ int err;
if (attr->link_create.flags)
return -EINVAL;
@@ -862,22 +862,20 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_cgroup;
}
err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
BPF_F_ALLOW_MULTI);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_cgroup:
cgroup_put(cgrp);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 47f323901ed9..8b3a7d5814ae 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -2184,25 +2186,39 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
- struct bpf_prog *prog)
+void bpf_link_init(struct bpf_link *link,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
{
atomic64_set(&link->refcnt, 1);
+ link->id = 0;
link->ops = ops;
link->prog = prog;
}
+static void bpf_link_free_id(struct bpf_link *link)
+{
+ unsigned long flags;
+
+ if (!link->id)
+ return;
+
+ spin_lock_irqsave(&link_idr_lock, flags);
+ idr_remove(&link_idr, link->id);
+ link->id = 0;
+ spin_unlock_irqrestore(&link_idr_lock, flags);
+}
+
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
* anon_inode's release() call. This helper manages marking bpf_link as
* defunct, releases anon_inode file and puts reserved FD.
*/
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
- int link_fd)
+void bpf_link_cleanup(struct bpf_link_primer *primer)
{
- link->prog = NULL;
- fput(link_file);
- put_unused_fd(link_fd);
+ primer->link->prog = NULL;
+ bpf_link_free_id(primer->link);
+ fput(primer->file);
+ put_unused_fd(primer->fd);
}
void bpf_link_inc(struct bpf_link *link)
@@ -2213,6 +2229,7 @@ void bpf_link_inc(struct bpf_link *link)
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ bpf_link_free_id(link);
if (link->prog) {
/* detach BPF program, clean up used resources */
link->ops->release(link);
@@ -2278,9 +2295,11 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
+ "link_id:\t%u\n"
"prog_tag:\t%s\n"
"prog_id:\t%u\n",
link_type,
+ link->id,
prog_tag,
prog->aux->id);
}
@@ -2295,38 +2314,76 @@ const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
-int bpf_link_new_fd(struct bpf_link *link, int flags)
+static int bpf_link_alloc_id(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link,
- flags | O_CLOEXEC);
-}
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&link_idr_lock);
+ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+ spin_unlock_bh(&link_idr_lock);
+ idr_preload_end();
-/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
- * instead of immediately installing fd in fdtable, just reserve it and
- * return. Caller then need to either install it with fd_install(fd, file) or
- * release with put_unused_fd(fd).
- * This is useful for cases when bpf_link attachment/detachment are
- * complicated and expensive operations and should be delayed until all the fd
- * reservation and anon_inode creation succeeds.
+ return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with bpf_cleanup() call. All the
+ * transient state is passed around in struct bpf_link_primer.
+ * This is preferred way to create and initialize bpf_link, especially when
+ * there are complicated and expensive operations inbetween creating bpf_link
+ * itself and attaching it to BPF hook. By using bpf_link_prime() and
+ * bpf_link_settle() kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) roll back operations in a rare case
+ * that file, FD, or ID can't be allocated.
*/
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
struct file *file;
- int fd;
+ int fd, id;
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
- return ERR_PTR(fd);
+ return fd;
file = anon_inode_getfile("bpf_link", &bpf_link_fops, link,
O_RDWR | O_CLOEXEC);
if (IS_ERR(file)) {
put_unused_fd(fd);
- return file;
+ return PTR_ERR(file);
+ }
+
+ id = bpf_link_alloc_id(link);
+ if (id < 0) {
+ put_unused_fd(fd);
+ fput(file);
+ return id;
}
- *reserved_fd = fd;
- return file;
+ primer->link = link;
+ primer->file = file;
+ primer->fd = fd;
+ primer->id = id;
+ return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+ /* make bpf_link fetchable by ID */
+ WRITE_ONCE(primer->link->id, primer->id);
+ /* make bpf_link fetchable by FD */
+ fd_install(primer->fd, primer->file);
+ /* pass through installed FD */
+ return primer->fd;
+}
+
+int bpf_link_new_fd(struct bpf_link *link, int flags)
+{
+ return anon_inode_getfd("bpf-link", &bpf_link_fops, link,
+ flags | O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd, fmode_t *link_mode)
@@ -2374,9 +2431,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
static int bpf_tracing_prog_attach(struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_tracing_link *link;
- struct file *link_file;
- int link_fd, err;
+ int err;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
@@ -2411,22 +2468,19 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
}
bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_prog;
}
err = bpf_trampoline_link_prog(prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_prog;
}
- fd_install(link_fd, link_file);
- return link_fd;
-
+ return bpf_link_settle(&link_primer);
out_put_prog:
bpf_prog_put(prog);
return err;
@@ -2454,7 +2508,7 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
kfree(raw_tp);
}
-static const struct bpf_link_ops bpf_raw_tp_lops = {
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
};
@@ -2463,13 +2517,13 @@ static const struct bpf_link_ops bpf_raw_tp_lops = {
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
+ struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct file *link_file;
struct bpf_prog *prog;
const char *tp_name;
char buf[128];
- int link_fd, err;
+ int err;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
return -EINVAL;
@@ -2522,24 +2576,22 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+ bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog);
link->btp = btp;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_btp;
}
err = bpf_probe_register(link->btp, prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_btp;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_btp:
bpf_put_raw_tracepoint(btp);
@@ -3471,7 +3523,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
- if (link->ops == &bpf_raw_tp_lops) {
+ if (link->ops == &bpf_raw_tp_link_lops) {
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
struct bpf_raw_event_map *btp = raw_tp->btp;
--
2.24.1
next prev parent reply other threads:[~2020-04-04 0:10 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-04-04 0:09 [RFC PATCH bpf-next 0/8] bpf_link observability APIs Andrii Nakryiko
2020-04-04 0:09 ` [RFC PATCH bpf-next 1/8] bpf: refactor bpf_link update handling Andrii Nakryiko
2020-04-04 0:09 ` [RFC PATCH bpf-next 2/8] bpf: allow bpf_link pinning as read-only and enforce LINK_UPDATE Andrii Nakryiko
2020-04-04 0:09 ` Andrii Nakryiko [this message]
2020-04-04 0:09 ` [RFC PATCH bpf-next 4/8] bpf: support GET_FD_BY_ID and GET_NEXT_ID for bpf_link Andrii Nakryiko
2020-04-06 11:34 ` Toke Høiland-Jørgensen
2020-04-06 19:06 ` Andrii Nakryiko
2020-04-08 15:14 ` Toke Høiland-Jørgensen
2020-04-08 20:23 ` Andrii Nakryiko
2020-04-08 21:21 ` Toke Høiland-Jørgensen
2020-04-09 18:49 ` Andrii Nakryiko
2020-04-14 10:32 ` Toke Høiland-Jørgensen
2020-04-14 18:47 ` Andrii Nakryiko
2020-04-15 9:26 ` Toke Høiland-Jørgensen
2020-04-04 0:09 ` [RFC PATCH bpf-next 5/8] bpf: add support for BPF_OBJ_GET_INFO_BY_FD " Andrii Nakryiko
2020-04-06 11:34 ` Toke Høiland-Jørgensen
2020-04-06 18:58 ` Andrii Nakryiko
2020-04-04 0:09 ` [RFC PATCH bpf-next 6/8] libbpf: add low-level APIs for new bpf_link commands Andrii Nakryiko
2020-04-04 0:09 ` [RFC PATCH bpf-next 7/8] bpftool: expose attach_type-to-string array to non-cgroup code Andrii Nakryiko
2020-04-04 0:09 ` [RFC PATCH bpf-next 8/8] bpftool: add bpf_link show and pin support Andrii Nakryiko
2020-04-08 23:44 ` David Ahern
2020-04-09 18:50 ` Andrii Nakryiko
2020-04-05 16:26 ` [RFC PATCH bpf-next 0/8] bpf_link observability APIs David Ahern
2020-04-05 18:31 ` Andrii Nakryiko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200404000948.3980903-4-andriin@fb.com \
--to=andriin@fb.com \
--cc=andrii.nakryiko@gmail.com \
--cc=ast@fb.com \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=kernel-team@fb.com \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).