BPF Archive on lore.kernel.org
 help / color / Atom feed
* [RFC bpf-next 0/1] bpf: Add page cache iterator
@ 2021-04-07 21:46 Daniel Xu
  2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Daniel Xu @ 2021-04-07 21:46 UTC (permalink / raw)
  To: bpf, linux-fsdevel, linux-mm
  Cc: Daniel Xu, linux-kernel, kernel-team, jolsa, hannes, yhs

There currently does not exist a way to answer the question: "What is in
the page cache?". There are various heuristics and counters but nothing
that can tell you anything like:

  * 3M from /home/dxu/foo.txt
  * 5K from ...
  * etc.

The answer to the question is particularly useful in the stacked
container world. Stacked containers implies multiple containers are run
on the same physical host. Memory is a precious resource on some (if not
most) of these systems. On these systems, it's useful to know how much
duplicated data is in the page cache. Once you know the answer, you can
do something about it. One possible technique would be to bind mount common
items from the root host into each container.

NOTES: 

  * This patch compiles and (maybe) works -- totally not fully tested
    or in a final state

  * I'm sending this early RFC to get comments on the general approach.
    I chatted w/ Johannes a little bit and it seems like the best way to
    do this is through superblock -> inode -> address_space iteration
    rather than going from numa node -> LRU iteration

  * I'll most likely add a page_hash() helper (or something) that hashes
    a page so that userspace can more easily tell which pages are
    duplicate

Daniel Xu (1):
  bpf: Introduce iter_pagecache

 kernel/bpf/Makefile         |   2 +-
 kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
 2 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/pagecache_iter.c

-- 
2.26.3


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-07 21:46 [RFC bpf-next 0/1] bpf: Add page cache iterator Daniel Xu
@ 2021-04-07 21:46 ` Daniel Xu
  2021-04-08  6:14   ` Matthew Wilcox
                     ` (3 more replies)
  2021-04-08  7:51 ` [RFC bpf-next 0/1] bpf: Add page cache iterator Christian Brauner
                   ` (2 subsequent siblings)
  3 siblings, 4 replies; 16+ messages in thread
From: Daniel Xu @ 2021-04-07 21:46 UTC (permalink / raw)
  To: bpf, linux-fsdevel, linux-mm
  Cc: Daniel Xu, linux-kernel, kernel-team, jolsa, hannes, yhs

This commit introduces the bpf page cache iterator. This iterator allows
users to run a bpf prog against each page in the "page cache".
Internally, the "page cache" is extremely tied to VFS superblock + inode
combo. Because of this, iter_pagecache will only examine pages in the
caller's mount namespace.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 kernel/bpf/Makefile         |   2 +-
 kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
 2 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/pagecache_iter.c

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7f33098ca63f..3deb6a8d3f75 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o pagecache_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/pagecache_iter.c b/kernel/bpf/pagecache_iter.c
new file mode 100644
index 000000000000..8442ab0d4221
--- /dev/null
+++ b/kernel/bpf/pagecache_iter.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/init.h>
+#include <linux/mm_types.h>
+#include <linux/mnt_namespace.h>
+#include <linux/nsproxy.h>
+#include <linux/pagemap.h>
+#include <linux/radix-tree.h>
+#include <linux/seq_file.h>
+#include "../../fs/mount.h"
+
+struct bpf_iter_seq_pagecache_info {
+	struct mnt_namespace *ns;
+	struct radix_tree_root superblocks;
+	struct super_block *cur_sb;
+	struct inode *cur_inode;
+	unsigned long cur_page_idx;
+};
+
+static struct super_block *goto_next_sb(struct bpf_iter_seq_pagecache_info *info)
+{
+	struct super_block *sb = NULL;
+	struct radix_tree_iter iter;
+	void **slot;
+
+	radix_tree_for_each_slot(slot, &info->superblocks, &iter,
+				 ((unsigned long)info->cur_sb + 1)) {
+		sb = (struct super_block *)iter.index;
+		break;
+	}
+
+	info->cur_sb = sb;
+	info->cur_inode = NULL;
+	info->cur_page_idx = 0;
+	return sb;
+}
+
+static bool inode_unusual(struct inode *inode) {
+	return ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		(inode->i_mapping->nrpages == 0));
+}
+
+static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info)
+{
+	struct inode *prev_inode = info->cur_inode;
+	struct inode *inode;
+
+retry:
+	BUG_ON(!info->cur_sb);
+	spin_lock(&info->cur_sb->s_inode_list_lock);
+
+	if (!info->cur_inode) {
+		list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) {
+			spin_lock(&inode->i_lock);
+			if (inode_unusual(inode)) {
+				spin_unlock(&inode->i_lock);
+				continue;
+			}
+			__iget(inode);
+			spin_unlock(&inode->i_lock);
+			info->cur_inode = inode;
+			break;
+		}
+	} else {
+		inode = info->cur_inode;
+		info->cur_inode = NULL;
+		list_for_each_entry_continue(inode, &info->cur_sb->s_inodes,
+					     i_sb_list) {
+			spin_lock(&inode->i_lock);
+			if (inode_unusual(inode)) {
+				spin_unlock(&inode->i_lock);
+				continue;
+			}
+			__iget(inode);
+			spin_unlock(&inode->i_lock);
+			info->cur_inode = inode;
+			break;
+		}
+	}
+
+	/* Seen all inodes in this superblock */
+	if (!info->cur_inode) {
+		spin_unlock(&info->cur_sb->s_inode_list_lock);
+		if (!goto_next_sb(info)) {
+			inode = NULL;
+			goto out;
+		}
+
+		goto retry;
+	}
+
+	spin_unlock(&info->cur_sb->s_inode_list_lock);
+	info->cur_page_idx = 0;
+out:
+	iput(prev_inode);
+	return info->cur_inode;
+}
+
+static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
+{
+	struct page *page, *ret = NULL;
+	unsigned long idx;
+
+	rcu_read_lock();
+retry:
+	BUG_ON(!info->cur_inode);
+	ret = NULL;
+	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
+			  info->cur_page_idx) {
+		if (!page_cache_get_speculative(page))
+			continue;
+
+		ret = page;
+		info->cur_page_idx = idx + 1;
+		break;
+	}
+
+	if (!ret) {
+		/* Seen all inodes and superblocks */
+		if (!goto_next_inode(info))
+			goto out;
+
+		goto retry;
+	}
+
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static void *pagecache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_pagecache_info *info = seq->private;
+	struct page *page;
+
+	if (!info->cur_sb && !goto_next_sb(info))
+		return NULL;
+	if (!info->cur_inode && !goto_next_inode(info))
+		return NULL;
+
+	page = goto_next_page(info);
+	if (!page)
+		return NULL;
+
+	if (*pos == 0)
+		++*pos;
+
+	return page;
+
+}
+
+static void *pagecache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_pagecache_info *info = seq->private;
+	struct page *page;
+
+	++*pos;
+	put_page((struct page *)v);
+	page = goto_next_page(info);
+	if (!page)
+		return NULL;
+
+	return page;
+}
+
+struct bpf_iter__pagecache {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct page *, page);
+};
+
+DEFINE_BPF_ITER_FUNC(pagecache, struct bpf_iter_meta *meta, struct page *page)
+
+static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
+				bool in_stop)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__pagecache ctx;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	meta.seq = seq;
+	ctx.meta = &meta;
+	ctx.page = page;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int pagecache_seq_show(struct seq_file *seq, void *v)
+{
+	return __pagecache_seq_show(seq, v, false);
+}
+
+static void pagecache_seq_stop(struct seq_file *seq, void *v)
+{
+	(void)__pagecache_seq_show(seq, v, true);
+	if (v)
+		put_page((struct page *)v);
+}
+
+static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_iter_seq_pagecache_info *info = priv_data;
+	struct radix_tree_iter iter;
+	struct super_block *sb;
+	struct mount *mnt;
+	void **slot;
+	int err;
+
+	info->ns = current->nsproxy->mnt_ns;
+	get_mnt_ns(info->ns);
+	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
+
+	spin_lock(&info->ns->ns_lock);
+	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
+		sb = mnt->mnt.mnt_sb;
+
+		/* The same mount may be mounted in multiple places */
+		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
+			continue;
+
+		err = radix_tree_insert(&info->superblocks,
+				        (unsigned long)sb, (void *)1);
+		if (err)
+			goto out;
+	}
+
+	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
+		sb = (struct super_block *)iter.index;
+		atomic_inc(&sb->s_active);
+	}
+
+	err = 0;
+out:
+	spin_unlock(&info->ns->ns_lock);
+	return err;
+}
+
+static void fini_seq_pagecache(void *priv_data)
+{
+	struct bpf_iter_seq_pagecache_info *info = priv_data;
+	struct radix_tree_iter iter;
+	struct super_block *sb;
+	void **slot;
+
+	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
+		sb = (struct super_block *)iter.index;
+		atomic_dec(&sb->s_active);
+		radix_tree_delete(&info->superblocks, iter.index);
+	}
+
+	put_mnt_ns(info->ns);
+}
+
+static const struct seq_operations pagecache_seq_ops = {
+	.start	= pagecache_seq_start,
+	.next	= pagecache_seq_next,
+	.stop	= pagecache_seq_stop,
+	.show	= pagecache_seq_show,
+};
+
+static const struct bpf_iter_seq_info pagecache_seq_info = {
+	.seq_ops		= &pagecache_seq_ops,
+	.init_seq_private	= init_seq_pagecache,
+	.fini_seq_private	= fini_seq_pagecache,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_pagecache_info),
+};
+
+static struct bpf_iter_reg pagecache_reg_info = {
+	.target			= "pagecache",
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__pagecache, page),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info		= &pagecache_seq_info,
+};
+
+BTF_ID_LIST(btf_page_id)
+BTF_ID(struct, page)
+
+static int __init bpf_pagecache_iter_init(void)
+{
+	pagecache_reg_info.ctx_arg_info[0].btf_id = *btf_page_id;
+	return bpf_iter_reg_target(&pagecache_reg_info);
+}
+
+late_initcall(bpf_pagecache_iter_init);
-- 
2.26.3


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
@ 2021-04-08  6:14   ` Matthew Wilcox
  2021-04-08 19:48     ` Daniel Xu
  2021-04-08  8:19   ` Christian Brauner
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2021-04-08  6:14 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> +struct bpf_iter_seq_pagecache_info {
> +	struct mnt_namespace *ns;
> +	struct radix_tree_root superblocks;

Why are you adding a new radix tree?  Use an XArray instead.

> +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct page *page, *ret = NULL;
> +	unsigned long idx;
> +
> +	rcu_read_lock();
> +retry:
> +	BUG_ON(!info->cur_inode);
> +	ret = NULL;
> +	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
> +			  info->cur_page_idx) {
> +		if (!page_cache_get_speculative(page))
> +			continue;

Why do you feel the need to poke around in i_pages directly?  Is there
something wrong with find_get_entries()?

> +static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
> +				bool in_stop)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__pagecache ctx;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (!prog)
> +		return 0;
> +
> +	meta.seq = seq;
> +	ctx.meta = &meta;
> +	ctx.page = page;
> +	return bpf_iter_run_prog(prog, &ctx);

I'm not really keen on the idea of random BPF programs being able to poke
at pages in the page cache like this.  From your initial description,
it sounded like all you needed was a list of which pages are present.

> +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> +
> +	spin_lock(&info->ns->ns_lock);
> +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> +		sb = mnt->mnt.mnt_sb;
> +
> +		/* The same mount may be mounted in multiple places */
> +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> +			continue;
> +
> +		err = radix_tree_insert(&info->superblocks,
> +				        (unsigned long)sb, (void *)1);
> +		if (err)
> +			goto out;
> +	}
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> +		sb = (struct super_block *)iter.index;
> +		atomic_inc(&sb->s_active);
> +	}

Uh.  What on earth made you think this was a good way to use the radix
tree?  And, no, the XArray doesn't change that.

If you don't understand why this is so bad, call xa_dump() on it after
constructing it.  I'll wait.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 0/1] bpf: Add page cache iterator
  2021-04-07 21:46 [RFC bpf-next 0/1] bpf: Add page cache iterator Daniel Xu
  2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
@ 2021-04-08  7:51 ` Christian Brauner
  2021-04-08 16:08   ` Daniel Xu
  2021-04-08 21:33 ` Shakeel Butt
  2021-04-08 23:13 ` Darrick J. Wong
  3 siblings, 1 reply; 16+ messages in thread
From: Christian Brauner @ 2021-04-08  7:51 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Wed, Apr 07, 2021 at 02:46:10PM -0700, Daniel Xu wrote:
> There currently does not exist a way to answer the question: "What is in
> the page cache?". There are various heuristics and counters but nothing
> that can tell you anything like:
> 
>   * 3M from /home/dxu/foo.txt
>   * 5K from ...
>   * etc.
> 
> The answer to the question is particularly useful in the stacked
> container world. Stacked containers implies multiple containers are run
> on the same physical host. Memory is a precious resource on some (if not

Just to clarify: what are "stacked containers"? Do you mean nested
containers, i.e. containers running within containers?

Christian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
  2021-04-08  6:14   ` Matthew Wilcox
@ 2021-04-08  8:19   ` Christian Brauner
  2021-04-08 20:44     ` Daniel Xu
  2021-04-08 16:45   ` Al Viro
  2021-04-08 22:11   ` Dave Chinner
  3 siblings, 1 reply; 16+ messages in thread
From: Christian Brauner @ 2021-04-08  8:19 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs, Al Viro

On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> This commit introduces the bpf page cache iterator. This iterator allows
> users to run a bpf prog against each page in the "page cache".
> Internally, the "page cache" is extremely tied to VFS superblock + inode
> combo. Because of this, iter_pagecache will only examine pages in the
> caller's mount namespace.
> 
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> ---
>  kernel/bpf/Makefile         |   2 +-
>  kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/pagecache_iter.c
> 
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 7f33098ca63f..3deb6a8d3f75 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
>  endif
>  CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
>  
> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o pagecache_iter.o map_iter.o task_iter.o prog_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
> diff --git a/kernel/bpf/pagecache_iter.c b/kernel/bpf/pagecache_iter.c
> new file mode 100644
> index 000000000000..8442ab0d4221
> --- /dev/null
> +++ b/kernel/bpf/pagecache_iter.c
> @@ -0,0 +1,293 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2021 Facebook */
> +
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/init.h>
> +#include <linux/mm_types.h>
> +#include <linux/mnt_namespace.h>
> +#include <linux/nsproxy.h>
> +#include <linux/pagemap.h>
> +#include <linux/radix-tree.h>
> +#include <linux/seq_file.h>
> +#include "../../fs/mount.h"

This is a private header on purpose. Outside of fs/ poking around in
struct mount or struct mount_namespace should not be done.

> +
> +struct bpf_iter_seq_pagecache_info {
> +	struct mnt_namespace *ns;
> +	struct radix_tree_root superblocks;
> +	struct super_block *cur_sb;
> +	struct inode *cur_inode;
> +	unsigned long cur_page_idx;
> +};
> +
> +static struct super_block *goto_next_sb(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct super_block *sb = NULL;
> +	struct radix_tree_iter iter;
> +	void **slot;
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter,
> +				 ((unsigned long)info->cur_sb + 1)) {
> +		sb = (struct super_block *)iter.index;
> +		break;
> +	}
> +
> +	info->cur_sb = sb;
> +	info->cur_inode = NULL;
> +	info->cur_page_idx = 0;
> +	return sb;
> +}
> +
> +static bool inode_unusual(struct inode *inode) {
> +	return ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> +		(inode->i_mapping->nrpages == 0));
> +}
> +
> +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct inode *prev_inode = info->cur_inode;
> +	struct inode *inode;
> +
> +retry:
> +	BUG_ON(!info->cur_sb);
> +	spin_lock(&info->cur_sb->s_inode_list_lock);
> +
> +	if (!info->cur_inode) {
> +		list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) {
> +			spin_lock(&inode->i_lock);
> +			if (inode_unusual(inode)) {
> +				spin_unlock(&inode->i_lock);
> +				continue;
> +			}
> +			__iget(inode);
> +			spin_unlock(&inode->i_lock);
> +			info->cur_inode = inode;
> +			break;
> +		}
> +	} else {
> +		inode = info->cur_inode;
> +		info->cur_inode = NULL;
> +		list_for_each_entry_continue(inode, &info->cur_sb->s_inodes,
> +					     i_sb_list) {
> +			spin_lock(&inode->i_lock);
> +			if (inode_unusual(inode)) {
> +				spin_unlock(&inode->i_lock);
> +				continue;
> +			}
> +			__iget(inode);
> +			spin_unlock(&inode->i_lock);
> +			info->cur_inode = inode;
> +			break;
> +		}
> +	}
> +
> +	/* Seen all inodes in this superblock */
> +	if (!info->cur_inode) {
> +		spin_unlock(&info->cur_sb->s_inode_list_lock);
> +		if (!goto_next_sb(info)) {
> +			inode = NULL;
> +			goto out;
> +		}
> +
> +		goto retry;
> +	}
> +
> +	spin_unlock(&info->cur_sb->s_inode_list_lock);
> +	info->cur_page_idx = 0;
> +out:
> +	iput(prev_inode);
> +	return info->cur_inode;
> +}
> +
> +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct page *page, *ret = NULL;
> +	unsigned long idx;
> +
> +	rcu_read_lock();
> +retry:
> +	BUG_ON(!info->cur_inode);
> +	ret = NULL;
> +	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
> +			  info->cur_page_idx) {
> +		if (!page_cache_get_speculative(page))
> +			continue;
> +
> +		ret = page;
> +		info->cur_page_idx = idx + 1;
> +		break;
> +	}
> +
> +	if (!ret) {
> +		/* Seen all inodes and superblocks */
> +		if (!goto_next_inode(info))
> +			goto out;
> +
> +		goto retry;
> +	}
> +
> +out:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +static void *pagecache_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = seq->private;
> +	struct page *page;
> +
> +	if (!info->cur_sb && !goto_next_sb(info))
> +		return NULL;
> +	if (!info->cur_inode && !goto_next_inode(info))
> +		return NULL;
> +
> +	page = goto_next_page(info);
> +	if (!page)
> +		return NULL;
> +
> +	if (*pos == 0)
> +		++*pos;
> +
> +	return page;
> +
> +}
> +
> +static void *pagecache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = seq->private;
> +	struct page *page;
> +
> +	++*pos;
> +	put_page((struct page *)v);
> +	page = goto_next_page(info);
> +	if (!page)
> +		return NULL;
> +
> +	return page;
> +}
> +
> +struct bpf_iter__pagecache {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct page *, page);
> +};
> +
> +DEFINE_BPF_ITER_FUNC(pagecache, struct bpf_iter_meta *meta, struct page *page)
> +
> +static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
> +				bool in_stop)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__pagecache ctx;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (!prog)
> +		return 0;
> +
> +	meta.seq = seq;
> +	ctx.meta = &meta;
> +	ctx.page = page;
> +	return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int pagecache_seq_show(struct seq_file *seq, void *v)
> +{
> +	return __pagecache_seq_show(seq, v, false);
> +}
> +
> +static void pagecache_seq_stop(struct seq_file *seq, void *v)
> +{
> +	(void)__pagecache_seq_show(seq, v, true);
> +	if (v)
> +		put_page((struct page *)v);
> +}
> +
> +static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> +	struct radix_tree_iter iter;
> +	struct super_block *sb;
> +	struct mount *mnt;
> +	void **slot;
> +	int err;
> +
> +	info->ns = current->nsproxy->mnt_ns;
> +	get_mnt_ns(info->ns);
> +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> +
> +	spin_lock(&info->ns->ns_lock);
> +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {

Not just are there helpers for taking ns_lock
static inline void lock_ns_list(struct mnt_namespace *ns)
static inline void unlock_ns_list(struct mnt_namespace *ns)
they are private to fs/namespace.c because it's the only place that
should ever walk this list.

This seems buggy: why is it ok here to only take ns_lock and not also
namespace_sem like mnt_already_visible() and __is_local_mountpoint() or
the relevant proc iterators? I might be missing something.

> +		sb = mnt->mnt.mnt_sb;
> +
> +		/* The same mount may be mounted in multiple places */
> +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> +			continue;
> +
> +		err = radix_tree_insert(&info->superblocks,
> +				        (unsigned long)sb, (void *)1);
> +		if (err)
> +			goto out;
> +	}
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> +		sb = (struct super_block *)iter.index;
> +		atomic_inc(&sb->s_active);

It also isn't nice that you mess with sb->s_active directly.

Imho, this is poking around in a lot of fs/ specific stuff that other
parts of the kernel should not care about or have access to.

Christian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 0/1] bpf: Add page cache iterator
  2021-04-08  7:51 ` [RFC bpf-next 0/1] bpf: Add page cache iterator Christian Brauner
@ 2021-04-08 16:08   ` Daniel Xu
  0 siblings, 0 replies; 16+ messages in thread
From: Daniel Xu @ 2021-04-08 16:08 UTC (permalink / raw)
  To: Christian Brauner
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

Hi Christian, thanks for taking a look.

On Thu, Apr 08, 2021 at 09:51:17AM +0200, Christian Brauner wrote:
> On Wed, Apr 07, 2021 at 02:46:10PM -0700, Daniel Xu wrote:
> > There currently does not exist a way to answer the question: "What is in
> > the page cache?". There are various heuristics and counters but nothing
> > that can tell you anything like:
> > 
> >   * 3M from /home/dxu/foo.txt
> >   * 5K from ...
> >   * etc.
> > 
> > The answer to the question is particularly useful in the stacked
> > container world. Stacked containers implies multiple containers are run
> > on the same physical host. Memory is a precious resource on some (if not
> 
> Just to clarify: what are "stacked containers"? Do you mean nested
> containers, i.e. containers running within containers?

I mean multiple containers running side by side on the same host.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
  2021-04-08  6:14   ` Matthew Wilcox
  2021-04-08  8:19   ` Christian Brauner
@ 2021-04-08 16:45   ` Al Viro
  2021-04-08 20:49     ` Daniel Xu
  2021-04-08 22:11   ` Dave Chinner
  3 siblings, 1 reply; 16+ messages in thread
From: Al Viro @ 2021-04-08 16:45 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:

> +static void fini_seq_pagecache(void *priv_data)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> +	struct radix_tree_iter iter;
> +	struct super_block *sb;
> +	void **slot;
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> +		sb = (struct super_block *)iter.index;
> +		atomic_dec(&sb->s_active);
> +		radix_tree_delete(&info->superblocks, iter.index);
> +	}

... and if in the meanwhile all other contributors to ->s_active have
gone away, that will result in...?

IOW, NAK.  The objects you are playing with have non-trivial lifecycle
and poking into the guts of data structures without bothering to
understand it is not a good idea.

Rule of the thumb: if your code ends up using fields that are otherwise
handled by a small part of codebase, the odds are that you need to be
bloody careful.  In particular, ->ns_lock has 3 users - all in
fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
in filesystems and safety of those is not trivial.

Any time you see that kind of pattern, you are risking to reprise
a scene from The Modern Times - the one with Charlie taking a trip
through the guts of machinery.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-08  6:14   ` Matthew Wilcox
@ 2021-04-08 19:48     ` Daniel Xu
  2021-04-08 21:29       ` Matthew Wilcox
  0 siblings, 1 reply; 16+ messages in thread
From: Daniel Xu @ 2021-04-08 19:48 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Thu, Apr 08, 2021 at 07:14:01AM +0100, Matthew Wilcox wrote:
> On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> > +struct bpf_iter_seq_pagecache_info {
> > +	struct mnt_namespace *ns;
> > +	struct radix_tree_root superblocks;
> 
> Why are you adding a new radix tree?  Use an XArray instead.

Ah right, sorry. Will do.

> > +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
> > +{
> > +	struct page *page, *ret = NULL;
> > +	unsigned long idx;
> > +
> > +	rcu_read_lock();
> > +retry:
> > +	BUG_ON(!info->cur_inode);
> > +	ret = NULL;
> > +	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
> > +			  info->cur_page_idx) {
> > +		if (!page_cache_get_speculative(page))
> > +			continue;
> 
> Why do you feel the need to poke around in i_pages directly?  Is there
> something wrong with find_get_entries()?

No reason other than I didn't know about the latter. Thanks for the
hint. find_get_entries() seems to return a pagevec of entries which
would complicate the iteration (a 4th layer of things to iterate over).

But I did find find_get_pages_range() which I think can be used to find
1 page at a time. I'll look into it further.

> > +static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
> > +				bool in_stop)
> > +{
> > +	struct bpf_iter_meta meta;
> > +	struct bpf_iter__pagecache ctx;
> > +	struct bpf_prog *prog;
> > +
> > +	meta.seq = seq;
> > +	prog = bpf_iter_get_info(&meta, in_stop);
> > +	if (!prog)
> > +		return 0;
> > +
> > +	meta.seq = seq;
> > +	ctx.meta = &meta;
> > +	ctx.page = page;
> > +	return bpf_iter_run_prog(prog, &ctx);
> 
> I'm not really keen on the idea of random BPF programs being able to poke
> at pages in the page cache like this.  From your initial description,
> it sounded like all you needed was a list of which pages are present.

Could you elaborate on what "list of which pages are present" implies?
The overall goal with this patch is to detect duplicate content in the
page cache. So anything that helps achieve that goal I would (in theory)
be OK with.

My understanding is the user would need to hash the contents
of each page in the page cache. And BPF provides the flexibility such
that this work could be reused for currently unanticipated use cases.

Furthermore, bpf programs could already look at all the pages in the
page cache by hooking into tracepoint:filemap:mm_filemap_add_to_page_cache,
albeit at a much slower rate. I figure the downside of adding this
page cache iterator is we're explicitly condoning the behavior.

> > +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> > +
> > +	spin_lock(&info->ns->ns_lock);
> > +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> > +		sb = mnt->mnt.mnt_sb;
> > +
> > +		/* The same mount may be mounted in multiple places */
> > +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> > +			continue;
> > +
> > +		err = radix_tree_insert(&info->superblocks,
> > +				        (unsigned long)sb, (void *)1);
> > +		if (err)
> > +			goto out;
> > +	}
> > +
> > +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> > +		sb = (struct super_block *)iter.index;
> > +		atomic_inc(&sb->s_active);
> > +	}
> 
> Uh.  What on earth made you think this was a good way to use the radix
> tree?  And, no, the XArray doesn't change that.

The idea behind the radix tree was to deduplicate the mounts by
superblock. Because a single filesystem may be mounted in different
locations. I didn't find a set data structure I could reuse so I
figured radix tree / xarray would work too.

Happy to take any better ideas too.

> If you don't understand why this is so bad, call xa_dump() on it after
> constructing it.  I'll wait.

I did a dump and got the following results: http://ix.io/2VpY .

I received a hint that you may be referring to how the xarray/radix
tree would be as large as the largest pointer. To my uneducated eye it
doesn't look like that's the case in this dump. Could you please
clarify?

<...>

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-08  8:19   ` Christian Brauner
@ 2021-04-08 20:44     ` Daniel Xu
  0 siblings, 0 replies; 16+ messages in thread
From: Daniel Xu @ 2021-04-08 20:44 UTC (permalink / raw)
  To: Christian Brauner
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs, Al Viro

On Thu, Apr 08, 2021 at 10:19:35AM +0200, Christian Brauner wrote:
> On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> > This commit introduces the bpf page cache iterator. This iterator allows
> > users to run a bpf prog against each page in the "page cache".
> > Internally, the "page cache" is extremely tied to VFS superblock + inode
> > combo. Because of this, iter_pagecache will only examine pages in the
> > caller's mount namespace.
> > 
> > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > ---
> >  kernel/bpf/Makefile         |   2 +-
> >  kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
> >  2 files changed, 294 insertions(+), 1 deletion(-)
> >  create mode 100644 kernel/bpf/pagecache_iter.c

<...>

> > 
> > +static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux)
> > +{
> > +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> > +	struct radix_tree_iter iter;
> > +	struct super_block *sb;
> > +	struct mount *mnt;
> > +	void **slot;
> > +	int err;
> > +
> > +	info->ns = current->nsproxy->mnt_ns;
> > +	get_mnt_ns(info->ns);
> > +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> > +
> > +	spin_lock(&info->ns->ns_lock);
> > +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> 
> Not just are there helpers for taking ns_lock
> static inline void lock_ns_list(struct mnt_namespace *ns)
> static inline void unlock_ns_list(struct mnt_namespace *ns)
> they are private to fs/namespace.c because it's the only place that
> should ever walk this list.

Thanks for the hints. Would it be acceptable to add some helpers to
fs/namespace.c to allow walking the list?

IIUC the only way to find a list of mounts is by looking at the mount
namespace. And walking each mount and looking at each `struct
super_block`'s inode's `struct address_space` seemed like the cleanest
way to walk the page cache.

> This seems buggy: why is it ok here to only take ns_lock and not also
> namespace_sem like mnt_already_visible() and __is_local_mountpoint()
> or the relevant proc iterators? I might be missing something.

Thanks for the hints. I'll take a closer look at the locking. Most
probably I didn't get it right.

I should have also mentioned in the cover letter that I'm fairly sure I
messed up the locking somewhere.

> 
> > +		sb = mnt->mnt.mnt_sb;
> > +
> > +		/* The same mount may be mounted in multiple places */
> > +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> > +			continue;
> > +
> > +		err = radix_tree_insert(&info->superblocks,
> > +				        (unsigned long)sb, (void *)1);
> > +		if (err)
> > +			goto out;
> > +	}
> > +
> > +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> > +		sb = (struct super_block *)iter.index;
> > +		atomic_inc(&sb->s_active);
> 
> It also isn't nice that you mess with sb->s_active directly.
> 
> Imho, this is poking around in a lot of fs/ specific stuff that other
> parts of the kernel should not care about or have access to.

Re above: do you think it'd be appropriate to add more helpers to fs/ ?

<...>

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-08 16:45   ` Al Viro
@ 2021-04-08 20:49     ` Daniel Xu
  2021-04-08 21:04       ` Al Viro
  0 siblings, 1 reply; 16+ messages in thread
From: Daniel Xu @ 2021-04-08 20:49 UTC (permalink / raw)
  To: Al Viro
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Thu, Apr 08, 2021 at 04:45:37PM +0000, Al Viro wrote:
> On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> 
> > +static void fini_seq_pagecache(void *priv_data)
> > +{
> > +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> > +	struct radix_tree_iter iter;
> > +	struct super_block *sb;
> > +	void **slot;
> > +
> > +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> > +		sb = (struct super_block *)iter.index;
> > +		atomic_dec(&sb->s_active);
> > +		radix_tree_delete(&info->superblocks, iter.index);
> > +	}
> 
> ... and if in the meanwhile all other contributors to ->s_active have
> gone away, that will result in...?

Ah right, sorry. Nobody will clean up the super_block.

> IOW, NAK.  The objects you are playing with have non-trivial lifecycle
> and poking into the guts of data structures without bothering to
> understand it is not a good idea.
> 
> Rule of the thumb: if your code ends up using fields that are otherwise
> handled by a small part of codebase, the odds are that you need to be
> bloody careful.  In particular, ->ns_lock has 3 users - all in
> fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
> fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
> in filesystems and safety of those is not trivial.
> 
> Any time you see that kind of pattern, you are risking to reprise
> a scene from The Modern Times - the one with Charlie taking a trip
> through the guts of machinery.

I'll take a closer look at the lifetime semantics.

Hopefully the overall goal of the patch is ok. Happy to iterate on the
implementation details until it's correct.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-08 20:49     ` Daniel Xu
@ 2021-04-08 21:04       ` Al Viro
  0 siblings, 0 replies; 16+ messages in thread
From: Al Viro @ 2021-04-08 21:04 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Thu, Apr 08, 2021 at 01:49:35PM -0700, Daniel Xu wrote:

> Ah right, sorry. Nobody will clean up the super_block.
> 
> > IOW, NAK.  The objects you are playing with have non-trivial lifecycle
> > and poking into the guts of data structures without bothering to
> > understand it is not a good idea.
> > 
> > Rule of the thumb: if your code ends up using fields that are otherwise
> > handled by a small part of codebase, the odds are that you need to be
> > bloody careful.  In particular, ->ns_lock has 3 users - all in
> > fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
> > fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
> > in filesystems and safety of those is not trivial.
> > 
> > Any time you see that kind of pattern, you are risking to reprise
> > a scene from The Modern Times - the one with Charlie taking a trip
> > through the guts of machinery.
> 
> I'll take a closer look at the lifetime semantics.
> 
> Hopefully the overall goal of the patch is ok. Happy to iterate on the
> implementation details until it's correct.

That depends.  Note that bumping ->s_active means that umount of that
sucker will *NOT* shut it down - that would happen only on the thread
doing the final deactivation.  What's more, having e.g. a USB stick
mounted, doing umount(1), having it complete successfully, pulling the
damn thing out and getting writes lost would make for a nasty surprise
for users.

With your approach it seems to be inevitable.  Holding namespace_sem
through the entire thing would prevent that, but's it's a non-starter
for other reasons (starting with "it's a system-wide lock, so that'd
be highly antisocial").  Are there any limits on what could be done
to the pages, anyway?  Because if it's "anything user wanted to do",
it's *really* not feasible.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-08 19:48     ` Daniel Xu
@ 2021-04-08 21:29       ` Matthew Wilcox
  0 siblings, 0 replies; 16+ messages in thread
From: Matthew Wilcox @ 2021-04-08 21:29 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Thu, Apr 08, 2021 at 12:48:49PM -0700, Daniel Xu wrote:
> No reason other than I didn't know about the latter. Thanks for the
> hint. find_get_entries() seems to return a pagevec of entries which
> would complicate the iteration (a 4th layer of things to iterate over).
> 
> But I did find find_get_pages_range() which I think can be used to find
> 1 page at a time. I'll look into it further.

Please don't, that's going to be a pagevec too.

> > I'm not really keen on the idea of random BPF programs being able to poke
> > at pages in the page cache like this.  From your initial description,
> > it sounded like all you needed was a list of which pages are present.
> 
> Could you elaborate on what "list of which pages are present" implies?
> The overall goal with this patch is to detect duplicate content in the
> page cache. So anything that helps achieve that goal I would (in theory)
> be OK with.
> 
> My understanding is the user would need to hash the contents
> of each page in the page cache. And BPF provides the flexibility such
> that this work could be reused for currently unanticipated use cases.

But if you need the contents, then you'll need to kmap() the pages.
I don't see people being keen on exposing kmap() to bpf either.  I think
you're much better off providing an interface that returns a hash of
each page to the BPF program.

> Furthermore, bpf programs could already look at all the pages in the
> page cache by hooking into tracepoint:filemap:mm_filemap_add_to_page_cache,
> albeit at a much slower rate. I figure the downside of adding this
> page cache iterator is we're explicitly condoning the behavior.

That should never have been exposed.  It's only supposed to be for error
injection.  If people have started actually using it for something,
then it's time we delete that tracepoint.

> The idea behind the radix tree was to deduplicate the mounts by
> superblock. Because a single filesystem may be mounted in different
> locations. I didn't find a set data structure I could reuse so I
> figured radix tree / xarray would work too.
> 
> Happy to take any better ideas too.
> 
> > If you don't understand why this is so bad, call xa_dump() on it after
> > constructing it.  I'll wait.
> 
> I did a dump and got the following results: http://ix.io/2VpY .
> 
> I received a hint that you may be referring to how the xarray/radix
> tree would be as large as the largest pointer. To my uneducated eye it
> doesn't look like that's the case in this dump. Could you please
> clarify?

We get seven nodes per 4kB page.

$ grep -c 'value 0' 2VpY 
15
$ grep -c node 2VpY 
43

so we use 6+1/7 pages in order to store 15 values.  That's 387 cache
lines, for the amount of data that could fit in two.

Liam and I are working on a data structure that would support doing
something along these lines in an efficient manner, but it's not
ready yet.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 0/1] bpf: Add page cache iterator
  2021-04-07 21:46 [RFC bpf-next 0/1] bpf: Add page cache iterator Daniel Xu
  2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
  2021-04-08  7:51 ` [RFC bpf-next 0/1] bpf: Add page cache iterator Christian Brauner
@ 2021-04-08 21:33 ` Shakeel Butt
  2021-04-08 23:13 ` Darrick J. Wong
  3 siblings, 0 replies; 16+ messages in thread
From: Shakeel Butt @ 2021-04-08 21:33 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, Linux MM, LKML, Kernel Team, jolsa,
	Johannes Weiner, Yonghong Song

On Wed, Apr 7, 2021 at 2:47 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> There currently does not exist a way to answer the question: "What is in
> the page cache?". There are various heuristics and counters but nothing
> that can tell you anything like:
>
>   * 3M from /home/dxu/foo.txt
>   * 5K from ...
>   * etc.
>

So, you need the list of inodes for a filesystem that are in memory.
With that list, you can open, map and do mincore on them?

I don't know of a way to get that inode list.

> The answer to the question is particularly useful in the stacked
> container world. Stacked containers implies multiple containers are run
> on the same physical host. Memory is precious resource on some (if not
> most) of these systems. On these systems, it's useful to know how much
> duplicated data is in the page cache. Once you know the answer, you can
> do something about it. One possible technique would be bind mount common
> items from the root host into each container.
>

Oh you want to share page cache between different containers/jobs?
That would complicate charging.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache
  2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
                     ` (2 preceding siblings ...)
  2021-04-08 16:45   ` Al Viro
@ 2021-04-08 22:11   ` Dave Chinner
  3 siblings, 0 replies; 16+ messages in thread
From: Dave Chinner @ 2021-04-08 22:11 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> This commit introduces the bpf page cache iterator. This iterator allows
> users to run a bpf prog against each page in the "page cache".
> Internally, the "page cache" is extremely tied to VFS superblock + inode
> combo. Because of this, iter_pagecache will only examine pages in the
> caller's mount namespace.

No, it does not just examine pages with in the callers mount
namespace, because ....

> +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct inode *prev_inode = info->cur_inode;
> +	struct inode *inode;
> +
> +retry:
> +	BUG_ON(!info->cur_sb);
> +	spin_lock(&info->cur_sb->s_inode_list_lock);
> +
> +	if (!info->cur_inode) {
> +		list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) {

... this is an "all inodes on the superblock" walk.  This will also
iterate inodes in other mount namespaces that point to the same
superblock.

IOWs, if you have different parts of the same filesystem mounted
into hundreds of container mount namespaces, this script will not
just iterate the local mount name space, it will iterate every inode
in every mount namespace.

And, of course, if the same files are mounted into multiple
containers (think read-only bind mounts using idmapping) then you
have zero indication of which container is actually using them, just
that there are hundreds of paths to the same inode. And every
container will appear to be using exactly the same amount of page cache.

IOWs, the stats this generates provide no insight into page cache
usage across mount namespaces in many situations, and it leaks
information about page cache usage across mount namespace
boundaries.

And that's before I say "iterating all inodes in a superblock is
bad" because it causes lock contention and interrupts normal usage.
We avoid s_inodes lists walks as much as we possibly can, and the
last thing we want is for userspace to be able to trivially
instigate long running walks of the s_inodes list. Remember, we can
have hundreds of millions of inodes on this list....

> +			spin_lock(&inode->i_lock);
> +			if (inode_unusual(inode)) {
> +				spin_unlock(&inode->i_lock);
> +				continue;
> +			}
> +			__iget(inode);
> +			spin_unlock(&inode->i_lock);

This can spin long enough to trigger livelock warnings. Even if it's
not held that long, it can cause unexpected long tail latencies in
memory reclaim and inode instantiation. Every s_inodes list walk has
cond_resched() built into it now....

> +	info->ns = current->nsproxy->mnt_ns;
> +	get_mnt_ns(info->ns);
> +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> +
> +	spin_lock(&info->ns->ns_lock);
> +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> +		sb = mnt->mnt.mnt_sb;
> +
> +		/* The same mount may be mounted in multiple places */
> +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> +			continue;
> +
> +		err = radix_tree_insert(&info->superblocks,
> +				        (unsigned long)sb, (void *)1);

And just because nobody has pointed it out yet: radix_tree_insert()
will do GFP_KERNEL memory allocations inside the spinlock being held
here.

----

You said that you didn't take the "walk the LRUs" approach because
walking superblocks "seemed simpler". It's not. Page cache residency
and accounting is managed by memcgs, not by mount namespaces.

That is, containers usually have a memcg associated with them to control
memory usage of the container. The page cache used by a container is
accounted directly to the memcg, and memory reclaim can find all the
file-backed page cache pages associated with a memcg very quickly
(via mem_cgroup_lruvec()).  This will find pages associated directly
with the memcg, so it gives you a fairly accurate picture of the
page cache usage within the container.

This has none of the issues that arise from "sb != mnt_ns" that
walking superblocks and inode lists have, and it doesn't require you
to play games with mounts, superblocks and inode references....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 0/1] bpf: Add page cache iterator
  2021-04-07 21:46 [RFC bpf-next 0/1] bpf: Add page cache iterator Daniel Xu
                   ` (2 preceding siblings ...)
  2021-04-08 21:33 ` Shakeel Butt
@ 2021-04-08 23:13 ` Darrick J. Wong
  2021-04-09  0:24   ` Daniel Xu
  3 siblings, 1 reply; 16+ messages in thread
From: Darrick J. Wong @ 2021-04-08 23:13 UTC (permalink / raw)
  To: Daniel Xu
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Wed, Apr 07, 2021 at 02:46:10PM -0700, Daniel Xu wrote:
> There currently does not exist a way to answer the question: "What is in
> the page cache?". There are various heuristics and counters but nothing
> that can tell you anything like:
> 
>   * 3M from /home/dxu/foo.txt
>   * 5K from ...

5K?  That's an extraordinary Weird Machine(tm).

>   * etc.
> 
> The answer to the question is particularly useful in the stacked
> container world. Stacked containers implies multiple containers are run
> on the same physical host. Memory is precious resource on some (if not
> most) of these systems. On these systems, it's useful to know how much
> duplicated data is in the page cache. Once you know the answer, you can
> do something about it. One possible technique would be bind mount common
> items from the root host into each container.

Um, are you describing a system that uses BPF to deduplicating the page
cache by using bind mounts?  Can the containers scribble on these files
and thereby mess up the other containers?  What happens if the container
wants to update itself and clobbers the root host's copy instead?  How
do you deal with a software update process failing because the root host
fights back against the container trying to change its files?

Also, I thought we weren't supposed to share resources across security
boundaries anymore?

--D

> 
> NOTES: 
> 
>   * This patch compiles and (maybe) works -- totally not fully tested
>     or in a final state
> 
>   * I'm sending this early RFC to get comments on the general approach.
>     I chatted w/ Johannes a little bit and it seems like the best way to
>     do this is through superblock -> inode -> address_space iteration
>     rather than going from numa node -> LRU iteration
> 
>   * I'll most likely add a page_hash() helper (or something) that hashes
>     a page so that userspace can more easily tell which pages are
>     duplicate
> 
> Daniel Xu (1):
>   bpf: Introduce iter_pagecache
> 
>  kernel/bpf/Makefile         |   2 +-
>  kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/pagecache_iter.c
> 
> -- 
> 2.26.3
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC bpf-next 0/1] bpf: Add page cache iterator
  2021-04-08 23:13 ` Darrick J. Wong
@ 2021-04-09  0:24   ` Daniel Xu
  0 siblings, 0 replies; 16+ messages in thread
From: Daniel Xu @ 2021-04-09  0:24 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: bpf, linux-fsdevel, linux-mm, linux-kernel, kernel-team, jolsa,
	hannes, yhs

On Thu, Apr 08, 2021 at 04:13:32PM -0700, Darrick J. Wong wrote:
> On Wed, Apr 07, 2021 at 02:46:10PM -0700, Daniel Xu wrote:
> > There currently does not exist a way to answer the question: "What is in
> > the page cache?". There are various heuristics and counters but nothing
> > that can tell you anything like:
> > 
> >   * 3M from /home/dxu/foo.txt
> >   * 5K from ...
> 
> 5K?  That's an extraordinary Weird Machine(tm).

Just typing random numbers :)

> >   * etc.
> > 
> > The answer to the question is particularly useful in the stacked
> > container world. Stacked containers implies multiple containers are run
> > on the same physical host. Memory is precious resource on some (if not
> > most) of these systems. On these systems, it's useful to know how much
> > duplicated data is in the page cache. Once you know the answer, you can
> > do something about it. One possible technique would be bind mount common
> > items from the root host into each container.
> 
> Um, are you describing a system that uses BPF to deduplicating the page
> cache by using bind mounts?  Can the containers scribble on these files
> and thereby mess up the other containers?  What happens if the container
> wants to update itself and clobbers the root host's copy instead?  How
> do you deal with a software update process failing because the root host
> fights back against the container trying to change its files?

No, the BPF progs are not intended to modify the pages. This is just for
read only observability.

> Also, I thought we weren't supposed to share resources across security
> boundaries anymore?

I can't speak to this, but bpf progs can pretty much be attached to
anywhere so this iterator doesn't expose anything new.

<...>

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, back to index

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-07 21:46 [RFC bpf-next 0/1] bpf: Add page cache iterator Daniel Xu
2021-04-07 21:46 ` [RFC bpf-next 1/1] bpf: Introduce iter_pagecache Daniel Xu
2021-04-08  6:14   ` Matthew Wilcox
2021-04-08 19:48     ` Daniel Xu
2021-04-08 21:29       ` Matthew Wilcox
2021-04-08  8:19   ` Christian Brauner
2021-04-08 20:44     ` Daniel Xu
2021-04-08 16:45   ` Al Viro
2021-04-08 20:49     ` Daniel Xu
2021-04-08 21:04       ` Al Viro
2021-04-08 22:11   ` Dave Chinner
2021-04-08  7:51 ` [RFC bpf-next 0/1] bpf: Add page cache iterator Christian Brauner
2021-04-08 16:08   ` Daniel Xu
2021-04-08 21:33 ` Shakeel Butt
2021-04-08 23:13 ` Darrick J. Wong
2021-04-09  0:24   ` Daniel Xu

BPF Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/bpf/0 bpf/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 bpf bpf/ https://lore.kernel.org/bpf \
		bpf@vger.kernel.org
	public-inbox-index bpf

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.bpf


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git