[RFC PATCH bpf-next 1/4] bpf: Introduce bpf iterator for file-system inode

From: Hou Tao <houtao@huaweicloud.com>
To: bpf@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org,
	Alexei Starovoitov <alexei.starovoitov@gmail.com>,
	Yonghong Song <yhs@fb.com>,
	Andrii Nakryiko <andrii.nakryiko@gmail.com>,
	Viacheslav Dubeyko <slava@dubeyko.com>,
	Amir Goldstein <amir73il@gmail.com>,
	houtao1@huawei.com
Subject: [RFC PATCH bpf-next 1/4] bpf: Introduce bpf iterator for file-system inode
Date: Sun,  7 May 2023 12:01:04 +0800	[thread overview]
Message-ID: <20230507040107.3755166-2-houtao@huaweicloud.com> (raw)
In-Reply-To: <20230507040107.3755166-1-houtao@huaweicloud.com>

From: Hou Tao <houtao1@huawei.com>

The usual way to get information about a fs inode is statx(), but the
information it returns is limited, and some internal information (e.g.,
the dirty pages of an inode) cannot be obtained through existing
syscalls at all.

So introduce a bpf iterator for fs inodes to solve the problem. By passing
an fd of the specific inode and a bpf program to the bpf file-system
inode iterator, a bpf iterator fd is created, and reading the iterator
fd outputs the content customized by the provided bpf program. For now
only the bpf iterator for a specific inode is supported; support for
all inodes in a file-system could be added later if needed.

Without any inode-related bpf helper, only the content of the inode itself
and of the typed pointers in the inode (e.g., i_sb) can be printed by a
bpf program, as shown below:

  (struct inode){
   .i_mode = (umode_t)33188,
   .i_opflags = (short unsigned int)13,
   .i_flags = (unsigned int)4096,
   .i_op = (struct inode_operations *)0x000000004dd45285,
   .i_sb = (struct super_block *)0x0000000006c11996,
   .i_mapping = (struct address_space *)0x00000000333cf64b,
   .i_ino = (long unsigned int)30982996,
   (union){
    .i_nlink = ()1,
    .__i_nlink = (unsigned int)1,
   },
   .i_size = (loff_t)4095,
   ......
  (struct super_block){
   .s_list = (struct list_head){
    .next = (struct list_head *)0x000000008af29511,
    .prev = (struct list_head *)0x000000003d8c9095,
   },
   .s_dev = (dev_t)265289730,
   .s_blocksize_bits = (unsigned char)12,
   .s_blocksize = (long unsigned int)4096,
   .s_maxbytes = (loff_t)9223372036854775807,
   ......

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf.h            |   2 +
 include/linux/btf_ids.h        |   5 +-
 include/uapi/linux/bpf.h       |   8 ++
 kernel/bpf/Makefile            |   1 +
 kernel/bpf/fs_iter.c           | 174 +++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |   8 ++
 6 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/fs_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 456f33b9d205..3b2324269647 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2120,6 +2120,8 @@ struct bpf_iter_aux_info {
 		enum bpf_iter_task_type	type;
 		u32 pid;
 	} task;
+	/* for fs iter */
+	void *fs;
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 00950cc03bff..9e036d1360e7 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -255,7 +255,10 @@ extern u32 btf_sock_ids[];
 #define BTF_TRACING_TYPE_xxx	\
 	BTF_TRACING_TYPE(BTF_TRACING_TYPE_TASK, task_struct)	\
 	BTF_TRACING_TYPE(BTF_TRACING_TYPE_FILE, file)		\
-	BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct)
+	BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct)	\
+	BTF_TRACING_TYPE(BTF_TRACING_TYPE_INODE, inode)		\
+	BTF_TRACING_TYPE(BTF_TRACING_TYPE_DENTRY, dentry)
+
 
 enum {
 #define BTF_TRACING_TYPE(name, type) name,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1bb11a6ee667..099048ba3edc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,10 @@ enum bpf_cgroup_iter_order {
 	BPF_CGROUP_ITER_ANCESTORS_UP,		/* walk ancestors upward. */
 };
 
+enum bpf_fs_iter_type {
+	BPF_FS_ITER_INODE = 0,	/* a specific inode */
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
@@ -116,6 +120,10 @@ union bpf_iter_link_info {
 		__u32	pid;
 		__u32	pid_fd;
 	} task;
+	struct {
+		enum bpf_fs_iter_type type;
+		__u32 fd;
+	} fs;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1d3892168d32..e945d6e23eed 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += fs_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/fs_iter.c b/kernel/bpf/fs_iter.c
new file mode 100644
index 000000000000..cd7f10ea00ab
--- /dev/null
+++ b/kernel/bpf/fs_iter.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd
+ */
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/seq_file.h>
+
+DEFINE_BPF_ITER_FUNC(fs_inode, struct bpf_iter_meta *meta, struct inode *inode, struct dentry *dentry);
+
+struct bpf_iter__fs_inode {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct inode *, inode);	/* NULL in the stop invocation */
+	__bpf_md_ptr(struct dentry *, dentry);	/* d_find_alias(inode), NULL if inode is NULL */
+};
+
+struct bpf_fs_iter_aux_info {
+	atomic_t count;			/* 1 at attach, +1 per initialized seq private */
+	enum bpf_fs_iter_type type;	/* copied from linfo->fs.type at attach */
+	struct file *filp;		/* reference taken by fget() at attach */
+};
+
+struct bpf_iter_seq_fs_info {
+	struct bpf_fs_iter_aux_info *fs;	/* counted reference, put in fini_seq_private */
+};
+
+static inline void bpf_fs_iter_get(struct bpf_fs_iter_aux_info *fs)
+{
+	atomic_inc(&fs->count);	/* balanced by bpf_fs_iter_put() */
+}
+
+static void bpf_fs_iter_put(struct bpf_fs_iter_aux_info *fs)
+{
+	if (!atomic_dec_and_test(&fs->count))
+		return;	/* count has not reached zero yet */
+
+	fput(fs->filp);	/* count hit zero: drop the file reference taken at attach */
+	kfree(fs);
+}
+
+static int bpf_iter_attach_fs(struct bpf_prog *prog, union bpf_iter_link_info *linfo,
+			      struct bpf_iter_aux_info *aux)
+{
+	struct bpf_fs_iter_aux_info *fs;
+	struct file *filp;
+
+	if (linfo->fs.type > BPF_FS_ITER_INODE)
+		return -EINVAL;
+	/* Reference is dropped in bpf_fs_iter_put(). TODO: The file-system is pinned */
+	filp = fget(linfo->fs.fd);
+	if (!filp)
+		return -EBADF;	/* bad fd, kept distinct from the bad-type -EINVAL above */
+
+	fs = kmalloc(sizeof(*fs), GFP_KERNEL);
+	if (!fs) {
+		fput(filp);
+		return -ENOMEM;
+	}
+
+	atomic_set(&fs->count, 1);	/* this reference is put by bpf_iter_detach_fs() */
+	fs->type = linfo->fs.type;
+	fs->filp = filp;
+	aux->fs = fs;
+
+	return 0;
+}
+
+static void bpf_iter_detach_fs(struct bpf_iter_aux_info *aux)
+{
+	bpf_fs_iter_put(aux->fs);	/* drops the reference set up in bpf_iter_attach_fs() */
+}
+
+static int bpf_iter_init_seq_fs_priv(void *priv, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_iter_seq_fs_info *info = priv;
+	struct bpf_fs_iter_aux_info *fs = aux->fs;	/* set by bpf_iter_attach_fs() */
+
+	/* link fd is still alive, so it is OK to inc ref-count directly */
+	bpf_fs_iter_get(fs);
+	info->fs = fs;	/* released again in bpf_iter_fini_seq_fs_priv() */
+
+	return 0;
+}
+
+static void bpf_iter_fini_seq_fs_priv(void *priv)
+{
+	struct bpf_iter_seq_fs_info *info = priv;
+
+	bpf_fs_iter_put(info->fs);	/* pairs with the get in bpf_iter_init_seq_fs_priv() */
+}
+
+static void *fs_iter_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct bpf_iter_seq_fs_info *info = m->private;
+
+	if (*pos == 0)
+		++*pos;
+	/* Single element: once ->next() has moved pos past 1, report EOF. */
+	return *pos > 1 ? NULL : file_inode(info->fs->filp);
+}
+
+static int __fs_iter_seq_show(struct seq_file *m, void *v, bool stop)
+{
+	struct bpf_iter__fs_inode ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int err;
+
+	meta.seq = m;
+	prog = bpf_iter_get_info(&meta, stop);
+	if (!prog)
+		return 0;	/* no program to run: treated as success */
+
+	ctx.meta = &meta;
+	ctx.inode = v;	/* NULL when called from fs_iter_seq_stop() */
+	ctx.dentry = v ? d_find_alias(v) : NULL;	/* released by dput() below */
+	err = bpf_iter_run_prog(prog, &ctx);
+	dput(ctx.dentry);
+	return err;
+}
+
+static void fs_iter_seq_stop(struct seq_file *m, void *v)
+{
+	if (!v)	/* v == NULL: run the program with a NULL inode and stop == true */
+		__fs_iter_seq_show(m, NULL, true);
+}
+
+static void *fs_iter_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return NULL;	/* no element follows the one returned by ->start() */
+}
+
+static int fs_iter_seq_show(struct seq_file *m, void *v)
+{
+	return __fs_iter_seq_show(m, v, false);	/* regular (non-stop) invocation */
+}
+
+static const struct seq_operations fs_iter_seq_ops = {
+	.start = fs_iter_seq_start,
+	.stop = fs_iter_seq_stop,	/* also issues the stop == true program run */
+	.next = fs_iter_seq_next,
+	.show = fs_iter_seq_show,
+};
+
+static const struct bpf_iter_seq_info fs_iter_seq_info = {
+	.seq_ops = &fs_iter_seq_ops,
+	.init_seq_private = bpf_iter_init_seq_fs_priv,	/* takes a bpf_fs_iter_aux_info ref */
+	.fini_seq_private = bpf_iter_fini_seq_fs_priv,	/* drops that ref */
+	.seq_priv_size = sizeof(struct bpf_iter_seq_fs_info),
+};
+
+static struct bpf_iter_reg fs_inode_reg_info = {
+	.target = "fs_inode",
+	.attach_target = bpf_iter_attach_fs,
+	.detach_target = bpf_iter_detach_fs,
+	.ctx_arg_info_size = 2,	/* the two entries below; btf_id is set in fs_iter_init() */
+	.ctx_arg_info = {
+		{ offsetof(struct bpf_iter__fs_inode, inode), PTR_TO_BTF_ID_OR_NULL },
+		{ offsetof(struct bpf_iter__fs_inode, dentry), PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info = &fs_iter_seq_info,
+};
+
+static int __init fs_iter_init(void)
+{
+	fs_inode_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_INODE];
+	fs_inode_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_DENTRY];
+	return bpf_iter_reg_target(&fs_inode_reg_info);	/* registers the "fs_inode" target */
+}
+late_initcall(fs_iter_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1bb11a6ee667..099048ba3edc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -95,6 +95,10 @@ enum bpf_cgroup_iter_order {
 	BPF_CGROUP_ITER_ANCESTORS_UP,		/* walk ancestors upward. */
 };
 
+enum bpf_fs_iter_type {
+	BPF_FS_ITER_INODE = 0,	/* a specific inode */
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
@@ -116,6 +120,10 @@ union bpf_iter_link_info {
 		__u32	pid;
 		__u32	pid_fd;
 	} task;
+	struct {
+		enum bpf_fs_iter_type type;
+		__u32 fd;
+	} fs;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
-- 
2.29.2