All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 1/4] mm/shmem: support deterministic charging of tmpfs
       [not found] <20211111234203.1824138-1-almasrymina@google.com>
  2021-11-11 23:42   ` Mina Almasry
@ 2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  3 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

Add memcg= option to shmem mount.

Users can specify this option at mount time and all data page charges
will be charged to the memcg supplied. Processes are only allowed to
direct tmpfs charges to a cgroup that they themselves can enter and
allocate memory in.

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---

Changes in v3:
- Fixed build failures/warnings Reported-by: kernel test robot <lkp@intel.com>

Changes in v2:
- Fixed Roman's email.
- Added a new wrapper around charge_memcg() instead of __mem_cgroup_charge()
- Merged the permission check into this patch as Roman suggested.
- Instead of checking for a s_memcg_to_charge off the superblock in the
filemap code, I set_active_memcg() before calling into the fs generic
code as Dave suggests.
- I have kept the s_memcg_to_charge in the superblock to keep the
struct address_space pointer small and preserve the remount use case.

---
 fs/super.c                 |   7 ++
 include/linux/fs.h         |   5 ++
 include/linux/memcontrol.h |  58 +++++++++++++++++
 mm/memcontrol.c            | 130 +++++++++++++++++++++++++++++++++++++
 mm/shmem.c                 |  73 ++++++++++++++++++++-
 5 files changed, 271 insertions(+), 2 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 3bfc0f8fbd5bc..5484b08ba0025 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/memcontrol.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
@@ -180,6 +181,9 @@ static void destroy_unused_super(struct super_block *s)
 	up_write(&s->s_umount);
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
+#if CONFIG_MEMCG
+	mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
+#endif
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
@@ -292,6 +296,9 @@ static void __put_super(struct super_block *s)
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
+#if CONFIG_MEMCG
+		mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
+#endif
 		security_sb_free(s);
 		fscrypt_sb_free(s);
 		put_user_ns(s->s_user_ns);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3afca821df32e..59407b3e7aee3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1567,6 +1567,11 @@ struct super_block {
 	struct workqueue_struct *s_dio_done_wq;
 	struct hlist_head s_pins;

+#ifdef CONFIG_MEMCG
+	/* memcg to charge for pages allocated to this filesystem */
+	struct mem_cgroup *s_memcg_to_charge;
+#endif
+
 	/*
 	 * Owning user namespace and default context in which to
 	 * interpret filesystem uids, gids, quotas, device nodes,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c5c403f4be6b..8583d37c05d9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,6 +27,7 @@ struct obj_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct super_block;

 /* Cgroup-specific page state, on top of universal node page state */
 enum memcg_stat_item {
@@ -713,6 +714,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }

+int mem_cgroup_charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+			    gfp_t gfp);
+
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -923,6 +927,24 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return !!(memcg->css.flags & CSS_ONLINE);
 }

+struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping);
+
+static inline void mem_cgroup_put_memcg(struct mem_cgroup *memcg)
+{
+	if (memcg)
+		css_put(&memcg->css);
+}
+
+void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+				  struct mem_cgroup *memcg);
+struct mem_cgroup *mem_cgroup_get_from_path(const char *path);
+/**
+ * User is responsible for providing a buffer @buf of length @len and freeing
+ * it.
+ */
+int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len);
+
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		int zid, int nr_pages);

@@ -1223,6 +1245,42 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }

+static inline int mem_cgroup_charge_memcg(struct folio *folio,
+					  struct mem_cgroup *memcg,
+					  gfp_t gfp_mask)
+{
+	return 0;
+}
+
+static inline struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping)
+{
+	return NULL;
+}
+
+static inline void mem_cgroup_put_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+						struct mem_cgroup *memcg)
+{
+}
+
+static inline struct mem_cgroup *mem_cgroup_get_from_path(const char *path)
+{
+	return NULL;
+}
+
+static inline int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf,
+					      size_t len)
+{
+	if (len < 1)
+		return -EINVAL;
+	buf[0] = '\0';
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 781605e920153..b3d8f52a63d17 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
 #include <linux/tracehook.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
+#include <linux/string.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -2580,6 +2581,126 @@ void mem_cgroup_handle_over_high(void)
 	css_put(&memcg->css);
 }

+/*
+ * Non error return value must eventually be released with css_put().
+ */
+struct mem_cgroup *mem_cgroup_get_from_path(const char *path)
+{
+	static const char procs_filename[] = "/cgroup.procs";
+	struct file *file, *procs;
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	char *procs_path =
+		kmalloc(strlen(path) + sizeof(procs_filename), GFP_KERNEL);
+
+	if (procs_path == NULL)
+		return ERR_PTR(-ENOMEM);
+	strcpy(procs_path, path);
+	strcat(procs_path, procs_filename);
+
+	procs = filp_open(procs_path, O_WRONLY, 0);
+	kfree(procs_path);
+
+	/*
+	 * Restrict the capability for tasks to mount with memcg charging to the
+	 * cgroup they could not join. For example, disallow:
+	 *
+	 * mount -t tmpfs -o memcg=root-cgroup nodev <MOUNT_DIR>
+	 *
+	 * if it is a non-root task.
+	 */
+	if (IS_ERR(procs))
+		return (struct mem_cgroup *)procs;
+	fput(procs);
+
+	file = filp_open(path, O_DIRECTORY | O_RDONLY, 0);
+	if (IS_ERR(file))
+		return (struct mem_cgroup *)file;
+
+	css = css_tryget_online_from_dir(file->f_path.dentry,
+					 &memory_cgrp_subsys);
+	if (IS_ERR(css))
+		memcg = (struct mem_cgroup *)css;
+	else
+		memcg = container_of(css, struct mem_cgroup, css);
+
+	fput(file);
+	return memcg;
+}
+
+/*
+ * Get the name of the optional charge target memcg associated with @sb.  This
+ * is the cgroup name, not the cgroup path.
+ */
+int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len)
+{
+	struct mem_cgroup *memcg;
+	int ret = 0;
+
+	buf[0] = '\0';
+
+	rcu_read_lock();
+	memcg = rcu_dereference(sb->s_memcg_to_charge);
+	if (memcg && !css_tryget_online(&memcg->css))
+		memcg = NULL;
+	rcu_read_unlock();
+
+	if (!memcg)
+		return 0;
+
+	ret = cgroup_path(memcg->css.cgroup, buf + len / 2, len / 2);
+	if (ret >= len / 2)
+		strcpy(buf, "?");
+	else {
+		char *p = mangle_path(buf, buf + len / 2, " \t\n\\");
+
+		if (p)
+			*p = '\0';
+		else
+			strcpy(buf, "?");
+	}
+
+	css_put(&memcg->css);
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Set or clear (if @memcg is NULL) charge association from file system to
+ * memcg.  If @memcg != NULL, then a css reference must be held by the caller to
+ * ensure that the cgroup is not deleted during this operation.
+ */
+void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+				  struct mem_cgroup *memcg)
+{
+	if (memcg)
+		css_get(&memcg->css);
+	memcg = xchg(target, memcg);
+	if (memcg)
+		css_put(&memcg->css);
+}
+
+/*
+ * Returns the memcg to charge for inode pages.  If non-NULL is returned, caller
+ * must drop reference with css_put().  NULL indicates that the inode does not
+ * have a memcg to charge, so the default process based policy should be used.
+ */
+struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping)
+{
+	struct mem_cgroup *memcg;
+
+	if (!mapping)
+		return NULL;
+
+	rcu_read_lock();
+	memcg = rcu_dereference(mapping->host->i_sb->s_memcg_to_charge);
+	if (memcg && !css_tryget_online(&memcg->css))
+		memcg = NULL;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			unsigned int nr_pages)
 {
@@ -6678,6 +6799,15 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 	return ret;
 }

+int mem_cgroup_charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+			    gfp_t gfp)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	return charge_memcg(folio, memcg, gfp);
+}
+
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
diff --git a/mm/shmem.c b/mm/shmem.c
index 23c91a8beb781..8b623c49ee50d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -115,10 +115,14 @@ struct shmem_options {
 	bool full_inums;
 	int huge;
 	int seen;
+#if CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+#endif
 #define SHMEM_SEEN_BLOCKS 1
 #define SHMEM_SEEN_INODES 2
 #define SHMEM_SEEN_HUGE 4
 #define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_MEMCG 16
 };

 #ifdef CONFIG_TMPFS
@@ -697,6 +701,7 @@ static int shmem_add_to_page_cache(struct page *page,
 	unsigned long i = 0;
 	unsigned long nr = compound_nr(page);
 	int error;
+	struct mem_cgroup *remote_memcg;

 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -709,7 +714,14 @@ static int shmem_add_to_page_cache(struct page *page,
 	page->index = index;

 	if (!PageSwapCache(page)) {
-		error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
+		remote_memcg = mem_cgroup_mapping_get_charge_target(mapping);
+		if (remote_memcg) {
+			error = mem_cgroup_charge_memcg(page_folio(page),
+							remote_memcg, gfp);
+			mem_cgroup_put_memcg(remote_memcg);
+		} else
+			error = mem_cgroup_charge(page_folio(page), charge_mm,
+						  gfp);
 		if (error) {
 			if (PageTransHuge(page)) {
 				count_vm_event(THP_FILE_FALLBACK);
@@ -1822,6 +1834,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	int error;
 	int once = 0;
 	int alloced = 0;
+	struct mem_cgroup *remote_memcg, *old_memcg;

 	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
 		return -EFBIG;
@@ -1834,8 +1847,21 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	sbinfo = SHMEM_SB(inode->i_sb);
 	charge_mm = vma ? vma->vm_mm : NULL;

+	/*
+	 * If we're doing a remote charge here, set the active_memcg as the
+	 * remote memcg, so that eventually if pagecache_get_page() calls into
+	 * filemap_add_folio(), we charge the correct memcg.
+	 */
+	remote_memcg = mem_cgroup_mapping_get_charge_target(mapping);
+	if (remote_memcg)
+		old_memcg = set_active_memcg(remote_memcg);
+
 	page = pagecache_get_page(mapping, index,
 					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
+	if (remote_memcg) {
+		set_active_memcg(old_memcg);
+		mem_cgroup_put_memcg(remote_memcg);
+	}

 	if (page && vma && userfaultfd_minor(vma)) {
 		if (!xa_is_value(page)) {
@@ -3342,6 +3368,7 @@ static const struct export_operations shmem_export_ops = {
 enum shmem_param {
 	Opt_gid,
 	Opt_huge,
+	Opt_memcg,
 	Opt_mode,
 	Opt_mpol,
 	Opt_nr_blocks,
@@ -3363,6 +3390,7 @@ static const struct constant_table shmem_param_enums_huge[] = {
 const struct fs_parameter_spec shmem_fs_parameters[] = {
 	fsparam_u32   ("gid",		Opt_gid),
 	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
+	fsparam_string("memcg",		Opt_memcg),
 	fsparam_u32oct("mode",		Opt_mode),
 	fsparam_string("mpol",		Opt_mpol),
 	fsparam_string("nr_blocks",	Opt_nr_blocks),
@@ -3379,6 +3407,9 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 	struct shmem_options *ctx = fc->fs_private;
 	struct fs_parse_result result;
 	unsigned long long size;
+#if CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+#endif
 	char *rest;
 	int opt;

@@ -3412,6 +3443,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 			goto bad_value;
 		ctx->seen |= SHMEM_SEEN_INODES;
 		break;
+#if CONFIG_MEMCG
+	case Opt_memcg:
+		if (ctx->memcg)
+			css_put(&ctx->memcg->css);
+		memcg = mem_cgroup_get_from_path(param->string);
+		if (IS_ERR(memcg))
+			goto bad_value;
+		ctx->memcg = memcg;
+		ctx->seen |= SHMEM_SEEN_MEMCG;
+		break;
+#endif
 	case Opt_mode:
 		ctx->mode = result.uint_32 & 07777;
 		break;
@@ -3573,6 +3615,14 @@ static int shmem_reconfigure(struct fs_context *fc)
 	}
 	raw_spin_unlock(&sbinfo->stat_lock);
 	mpol_put(mpol);
+#if CONFIG_MEMCG
+	if (ctx->seen & SHMEM_SEEN_MEMCG && ctx->memcg) {
+		mem_cgroup_set_charge_target(&fc->root->d_sb->s_memcg_to_charge,
+					     ctx->memcg);
+		css_put(&ctx->memcg->css);
+		ctx->memcg = NULL;
+	}
+#endif
 	return 0;
 out:
 	raw_spin_unlock(&sbinfo->stat_lock);
@@ -3582,6 +3632,11 @@ static int shmem_reconfigure(struct fs_context *fc)
 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+	int err;
+	char *buf = __getname();
+
+	if (!buf)
+		return -ENOMEM;

 	if (sbinfo->max_blocks != shmem_default_max_blocks())
 		seq_printf(seq, ",size=%luk",
@@ -3625,7 +3680,13 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
 #endif
 	shmem_show_mpol(seq, sbinfo->mpol);
-	return 0;
+	/* Memory cgroup binding: memcg=cgroup_name */
+	err = mem_cgroup_get_name_from_sb(root->d_sb, buf, PATH_MAX);
+	if (!err && buf[0] != '\0')
+		seq_printf(seq, ",memcg=%s", buf);
+
+	__putname(buf);
+	return err;
 }

 #endif /* CONFIG_TMPFS */
@@ -3710,6 +3771,14 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_flags |= SB_POSIXACL;
 #endif
 	uuid_gen(&sb->s_uuid);
+#if CONFIG_MEMCG
+	if (ctx->memcg) {
+		mem_cgroup_set_charge_target(&sb->s_memcg_to_charge,
+					     ctx->memcg);
+		css_put(&ctx->memcg->css);
+		ctx->memcg = NULL;
+	}
+#endif

 	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
 	if (!inode)
--
2.34.0.rc1.387.gb447b232ab-goog

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 1/4] mm/shmem: support deterministic charging of tmpfs
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

Add memcg= option to shmem mount.

Users can specify this option at mount time and all data page charges
will be charged to the memcg supplied. Processes are only allowed to
direct tmpfs charges to a cgroup that they themselves can enter and
allocate memory in.

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---

Changes in v3:
- Fixed build failures/warnings Reported-by: kernel test robot <lkp@intel.com>

Changes in v2:
- Fixed Roman's email.
- Added a new wrapper around charge_memcg() instead of __mem_cgroup_charge()
- Merged the permission check into this patch as Roman suggested.
- Instead of checking for a s_memcg_to_charge off the superblock in the
filemap code, I set_active_memcg() before calling into the fs generic
code as Dave suggests.
- I have kept the s_memcg_to_charge in the superblock to keep the
struct address_space pointer small and preserve the remount use case.

---
 fs/super.c                 |   7 ++
 include/linux/fs.h         |   5 ++
 include/linux/memcontrol.h |  58 +++++++++++++++++
 mm/memcontrol.c            | 130 +++++++++++++++++++++++++++++++++++++
 mm/shmem.c                 |  73 ++++++++++++++++++++-
 5 files changed, 271 insertions(+), 2 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 3bfc0f8fbd5bc..5484b08ba0025 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/memcontrol.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
@@ -180,6 +181,9 @@ static void destroy_unused_super(struct super_block *s)
 	up_write(&s->s_umount);
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
+#if CONFIG_MEMCG
+	mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
+#endif
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
@@ -292,6 +296,9 @@ static void __put_super(struct super_block *s)
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
+#if CONFIG_MEMCG
+		mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
+#endif
 		security_sb_free(s);
 		fscrypt_sb_free(s);
 		put_user_ns(s->s_user_ns);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3afca821df32e..59407b3e7aee3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1567,6 +1567,11 @@ struct super_block {
 	struct workqueue_struct *s_dio_done_wq;
 	struct hlist_head s_pins;

+#ifdef CONFIG_MEMCG
+	/* memcg to charge for pages allocated to this filesystem */
+	struct mem_cgroup *s_memcg_to_charge;
+#endif
+
 	/*
 	 * Owning user namespace and default context in which to
 	 * interpret filesystem uids, gids, quotas, device nodes,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c5c403f4be6b..8583d37c05d9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,6 +27,7 @@ struct obj_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct super_block;

 /* Cgroup-specific page state, on top of universal node page state */
 enum memcg_stat_item {
@@ -713,6 +714,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }

+int mem_cgroup_charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+			    gfp_t gfp);
+
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -923,6 +927,24 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return !!(memcg->css.flags & CSS_ONLINE);
 }

+struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping);
+
+static inline void mem_cgroup_put_memcg(struct mem_cgroup *memcg)
+{
+	if (memcg)
+		css_put(&memcg->css);
+}
+
+void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+				  struct mem_cgroup *memcg);
+struct mem_cgroup *mem_cgroup_get_from_path(const char *path);
+/**
+ * User is responsible for providing a buffer @buf of length @len and freeing
+ * it.
+ */
+int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len);
+
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		int zid, int nr_pages);

@@ -1223,6 +1245,42 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }

+static inline int mem_cgroup_charge_memcg(struct folio *folio,
+					  struct mem_cgroup *memcg,
+					  gfp_t gfp_mask)
+{
+	return 0;
+}
+
+static inline struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping)
+{
+	return NULL;
+}
+
+static inline void mem_cgroup_put_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+						struct mem_cgroup *memcg)
+{
+}
+
+static inline struct mem_cgroup *mem_cgroup_get_from_path(const char *path)
+{
+	return NULL;
+}
+
+static inline int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf,
+					      size_t len)
+{
+	if (len < 1)
+		return -EINVAL;
+	buf[0] = '\0';
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 781605e920153..b3d8f52a63d17 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
 #include <linux/tracehook.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
+#include <linux/string.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -2580,6 +2581,126 @@ void mem_cgroup_handle_over_high(void)
 	css_put(&memcg->css);
 }

+/*
+ * Non error return value must eventually be released with css_put().
+ */
+struct mem_cgroup *mem_cgroup_get_from_path(const char *path)
+{
+	static const char procs_filename[] = "/cgroup.procs";
+	struct file *file, *procs;
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	char *procs_path =
+		kmalloc(strlen(path) + sizeof(procs_filename), GFP_KERNEL);
+
+	if (procs_path == NULL)
+		return ERR_PTR(-ENOMEM);
+	strcpy(procs_path, path);
+	strcat(procs_path, procs_filename);
+
+	procs = filp_open(procs_path, O_WRONLY, 0);
+	kfree(procs_path);
+
+	/*
+	 * Restrict the capability for tasks to mount with memcg charging to the
+	 * cgroup they could not join. For example, disallow:
+	 *
+	 * mount -t tmpfs -o memcg=root-cgroup nodev <MOUNT_DIR>
+	 *
+	 * if it is a non-root task.
+	 */
+	if (IS_ERR(procs))
+		return (struct mem_cgroup *)procs;
+	fput(procs);
+
+	file = filp_open(path, O_DIRECTORY | O_RDONLY, 0);
+	if (IS_ERR(file))
+		return (struct mem_cgroup *)file;
+
+	css = css_tryget_online_from_dir(file->f_path.dentry,
+					 &memory_cgrp_subsys);
+	if (IS_ERR(css))
+		memcg = (struct mem_cgroup *)css;
+	else
+		memcg = container_of(css, struct mem_cgroup, css);
+
+	fput(file);
+	return memcg;
+}
+
+/*
+ * Get the name of the optional charge target memcg associated with @sb.  This
+ * is the cgroup name, not the cgroup path.
+ */
+int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len)
+{
+	struct mem_cgroup *memcg;
+	int ret = 0;
+
+	buf[0] = '\0';
+
+	rcu_read_lock();
+	memcg = rcu_dereference(sb->s_memcg_to_charge);
+	if (memcg && !css_tryget_online(&memcg->css))
+		memcg = NULL;
+	rcu_read_unlock();
+
+	if (!memcg)
+		return 0;
+
+	ret = cgroup_path(memcg->css.cgroup, buf + len / 2, len / 2);
+	if (ret >= len / 2)
+		strcpy(buf, "?");
+	else {
+		char *p = mangle_path(buf, buf + len / 2, " \t\n\\");
+
+		if (p)
+			*p = '\0';
+		else
+			strcpy(buf, "?");
+	}
+
+	css_put(&memcg->css);
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Set or clear (if @memcg is NULL) charge association from file system to
+ * memcg.  If @memcg != NULL, then a css reference must be held by the caller to
+ * ensure that the cgroup is not deleted during this operation.
+ */
+void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+				  struct mem_cgroup *memcg)
+{
+	if (memcg)
+		css_get(&memcg->css);
+	memcg = xchg(target, memcg);
+	if (memcg)
+		css_put(&memcg->css);
+}
+
+/*
+ * Returns the memcg to charge for inode pages.  If non-NULL is returned, caller
+ * must drop reference with css_put().  NULL indicates that the inode does not
+ * have a memcg to charge, so the default process based policy should be used.
+ */
+struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping)
+{
+	struct mem_cgroup *memcg;
+
+	if (!mapping)
+		return NULL;
+
+	rcu_read_lock();
+	memcg = rcu_dereference(mapping->host->i_sb->s_memcg_to_charge);
+	if (memcg && !css_tryget_online(&memcg->css))
+		memcg = NULL;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			unsigned int nr_pages)
 {
@@ -6678,6 +6799,15 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 	return ret;
 }

+int mem_cgroup_charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+			    gfp_t gfp)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	return charge_memcg(folio, memcg, gfp);
+}
+
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
diff --git a/mm/shmem.c b/mm/shmem.c
index 23c91a8beb781..8b623c49ee50d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -115,10 +115,14 @@ struct shmem_options {
 	bool full_inums;
 	int huge;
 	int seen;
+#if CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+#endif
 #define SHMEM_SEEN_BLOCKS 1
 #define SHMEM_SEEN_INODES 2
 #define SHMEM_SEEN_HUGE 4
 #define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_MEMCG 16
 };

 #ifdef CONFIG_TMPFS
@@ -697,6 +701,7 @@ static int shmem_add_to_page_cache(struct page *page,
 	unsigned long i = 0;
 	unsigned long nr = compound_nr(page);
 	int error;
+	struct mem_cgroup *remote_memcg;

 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -709,7 +714,14 @@ static int shmem_add_to_page_cache(struct page *page,
 	page->index = index;

 	if (!PageSwapCache(page)) {
-		error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
+		remote_memcg = mem_cgroup_mapping_get_charge_target(mapping);
+		if (remote_memcg) {
+			error = mem_cgroup_charge_memcg(page_folio(page),
+							remote_memcg, gfp);
+			mem_cgroup_put_memcg(remote_memcg);
+		} else
+			error = mem_cgroup_charge(page_folio(page), charge_mm,
+						  gfp);
 		if (error) {
 			if (PageTransHuge(page)) {
 				count_vm_event(THP_FILE_FALLBACK);
@@ -1822,6 +1834,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	int error;
 	int once = 0;
 	int alloced = 0;
+	struct mem_cgroup *remote_memcg, *old_memcg;

 	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
 		return -EFBIG;
@@ -1834,8 +1847,21 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	sbinfo = SHMEM_SB(inode->i_sb);
 	charge_mm = vma ? vma->vm_mm : NULL;

+	/*
+	 * If we're doing a remote charge here, set the active_memcg as the
+	 * remote memcg, so that eventually if pagecache_get_page() calls into
+	 * filemap_add_folio(), we charge the correct memcg.
+	 */
+	remote_memcg = mem_cgroup_mapping_get_charge_target(mapping);
+	if (remote_memcg)
+		old_memcg = set_active_memcg(remote_memcg);
+
 	page = pagecache_get_page(mapping, index,
 					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
+	if (remote_memcg) {
+		set_active_memcg(old_memcg);
+		mem_cgroup_put_memcg(remote_memcg);
+	}

 	if (page && vma && userfaultfd_minor(vma)) {
 		if (!xa_is_value(page)) {
@@ -3342,6 +3368,7 @@ static const struct export_operations shmem_export_ops = {
 enum shmem_param {
 	Opt_gid,
 	Opt_huge,
+	Opt_memcg,
 	Opt_mode,
 	Opt_mpol,
 	Opt_nr_blocks,
@@ -3363,6 +3390,7 @@ static const struct constant_table shmem_param_enums_huge[] = {
 const struct fs_parameter_spec shmem_fs_parameters[] = {
 	fsparam_u32   ("gid",		Opt_gid),
 	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
+	fsparam_string("memcg",		Opt_memcg),
 	fsparam_u32oct("mode",		Opt_mode),
 	fsparam_string("mpol",		Opt_mpol),
 	fsparam_string("nr_blocks",	Opt_nr_blocks),
@@ -3379,6 +3407,9 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 	struct shmem_options *ctx = fc->fs_private;
 	struct fs_parse_result result;
 	unsigned long long size;
+#if CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+#endif
 	char *rest;
 	int opt;

@@ -3412,6 +3443,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 			goto bad_value;
 		ctx->seen |= SHMEM_SEEN_INODES;
 		break;
+#if CONFIG_MEMCG
+	case Opt_memcg:
+		if (ctx->memcg)
+			css_put(&ctx->memcg->css);
+		memcg = mem_cgroup_get_from_path(param->string);
+		if (IS_ERR(memcg))
+			goto bad_value;
+		ctx->memcg = memcg;
+		ctx->seen |= SHMEM_SEEN_MEMCG;
+		break;
+#endif
 	case Opt_mode:
 		ctx->mode = result.uint_32 & 07777;
 		break;
@@ -3573,6 +3615,14 @@ static int shmem_reconfigure(struct fs_context *fc)
 	}
 	raw_spin_unlock(&sbinfo->stat_lock);
 	mpol_put(mpol);
+#if CONFIG_MEMCG
+	if (ctx->seen & SHMEM_SEEN_MEMCG && ctx->memcg) {
+		mem_cgroup_set_charge_target(&fc->root->d_sb->s_memcg_to_charge,
+					     ctx->memcg);
+		css_put(&ctx->memcg->css);
+		ctx->memcg = NULL;
+	}
+#endif
 	return 0;
 out:
 	raw_spin_unlock(&sbinfo->stat_lock);
@@ -3582,6 +3632,11 @@ static int shmem_reconfigure(struct fs_context *fc)
 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+	int err;
+	char *buf = __getname();
+
+	if (!buf)
+		return -ENOMEM;

 	if (sbinfo->max_blocks != shmem_default_max_blocks())
 		seq_printf(seq, ",size=%luk",
@@ -3625,7 +3680,13 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
 #endif
 	shmem_show_mpol(seq, sbinfo->mpol);
-	return 0;
+	/* Memory cgroup binding: memcg=cgroup_name */
+	err = mem_cgroup_get_name_from_sb(root->d_sb, buf, PATH_MAX);
+	if (!err && buf[0] != '\0')
+		seq_printf(seq, ",memcg=%s", buf);
+
+	__putname(buf);
+	return err;
 }

 #endif /* CONFIG_TMPFS */
@@ -3710,6 +3771,14 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_flags |= SB_POSIXACL;
 #endif
 	uuid_gen(&sb->s_uuid);
+#if CONFIG_MEMCG
+	if (ctx->memcg) {
+		mem_cgroup_set_charge_target(&sb->s_memcg_to_charge,
+					     ctx->memcg);
+		css_put(&ctx->memcg->css);
+		ctx->memcg = NULL;
+	}
+#endif

 	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
 	if (!inode)
--
2.34.0.rc1.387.gb447b232ab-goog


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 1/4] mm/shmem: support deterministic charging of tmpfs
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song,
	riel-ebMLmSuQjDVBDgjK7y7TUQ, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

Add memcg= option to shmem mount.

Users can specify this option at mount time and all data page charges
will be charged to the memcg supplied. Processes are only allowed to
direct tmpfs charges to a cgroup that they themselves can enter and
allocate memory in.

Signed-off-by: Mina Almasry <almasrymina-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Cc: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
Cc: Theodore Ts'o <tytso-3s7WtUTddSA@public.gmane.org>
Cc: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Shakeel Butt <shakeelb-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
CC: Roman Gushchin <guro-b10kYP2dOMg@public.gmane.org>
Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Vladimir Davydov <vdavydov.dev-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Cc: Muchun Song <songmuchun-EC8Uxl6Npydl57MIdRCFDg@public.gmane.org>
Cc: riel-ebMLmSuQjDVBDgjK7y7TUQ@public.gmane.org
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

---

Changes in v3:
- Fixed build failures/warnings Reported-by: kernel test robot <lkp-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Changes in v2:
- Fixed Roman's email.
- Added a new wrapper around charge_memcg() instead of __mem_cgroup_charge()
- Merged the permission check into this patch as Roman suggested.
- Instead of checking for a s_memcg_to_charge off the superblock in the
filemap code, I set_active_memcg() before calling into the fs generic
code as Dave suggests.
- I have kept the s_memcg_to_charge in the superblock to keep the
struct address_space pointer small and preserve the remount use case.

---
 fs/super.c                 |   7 ++
 include/linux/fs.h         |   5 ++
 include/linux/memcontrol.h |  58 +++++++++++++++++
 mm/memcontrol.c            | 130 +++++++++++++++++++++++++++++++++++++
 mm/shmem.c                 |  73 ++++++++++++++++++++-
 5 files changed, 271 insertions(+), 2 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 3bfc0f8fbd5bc..5484b08ba0025 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/memcontrol.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
@@ -180,6 +181,9 @@ static void destroy_unused_super(struct super_block *s)
 	up_write(&s->s_umount);
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
+#ifdef CONFIG_MEMCG
+	mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
+#endif
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
@@ -292,6 +296,9 @@ static void __put_super(struct super_block *s)
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
+#ifdef CONFIG_MEMCG
+		mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
+#endif
 		security_sb_free(s);
 		fscrypt_sb_free(s);
 		put_user_ns(s->s_user_ns);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3afca821df32e..59407b3e7aee3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1567,6 +1567,11 @@ struct super_block {
 	struct workqueue_struct *s_dio_done_wq;
 	struct hlist_head s_pins;

+#ifdef CONFIG_MEMCG
+	/* memcg to charge for pages allocated to this filesystem */
+	struct mem_cgroup *s_memcg_to_charge;
+#endif
+
 	/*
 	 * Owning user namespace and default context in which to
 	 * interpret filesystem uids, gids, quotas, device nodes,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c5c403f4be6b..8583d37c05d9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,6 +27,7 @@ struct obj_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct super_block;

 /* Cgroup-specific page state, on top of universal node page state */
 enum memcg_stat_item {
@@ -713,6 +714,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }

+int mem_cgroup_charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+			    gfp_t gfp);
+
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -923,6 +927,24 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return !!(memcg->css.flags & CSS_ONLINE);
 }

+struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping);
+
+static inline void mem_cgroup_put_memcg(struct mem_cgroup *memcg)
+{
+	if (memcg)
+		css_put(&memcg->css);
+}
+
+void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+				  struct mem_cgroup *memcg);
+struct mem_cgroup *mem_cgroup_get_from_path(const char *path);
+/**
+ * User is responsible for providing a buffer @buf of length @len and freeing
+ * it.
+ */
+int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len);
+
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		int zid, int nr_pages);

@@ -1223,6 +1245,42 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }

+static inline int mem_cgroup_charge_memcg(struct folio *folio,
+					  struct mem_cgroup *memcg,
+					  gfp_t gfp_mask)
+{
+	return 0;
+}
+
+static inline struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping)
+{
+	return NULL;
+}
+
+static inline void mem_cgroup_put_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+						struct mem_cgroup *memcg)
+{
+}
+
+static inline struct mem_cgroup *mem_cgroup_get_from_path(const char *path)
+{
+	return NULL;
+}
+
+static inline int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf,
+					      size_t len)
+{
+	if (len < 1)
+		return -EINVAL;
+	buf[0] = '\0';
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 781605e920153..b3d8f52a63d17 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
 #include <linux/tracehook.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
+#include <linux/string.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -2580,6 +2581,126 @@ void mem_cgroup_handle_over_high(void)
 	css_put(&memcg->css);
 }

+/*
+ * Non error return value must eventually be released with css_put().
+ */
+struct mem_cgroup *mem_cgroup_get_from_path(const char *path)
+{
+	static const char procs_filename[] = "/cgroup.procs";
+	struct file *file, *procs;
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	char *procs_path =
+		kmalloc(strlen(path) + sizeof(procs_filename), GFP_KERNEL);
+
+	if (procs_path == NULL)
+		return ERR_PTR(-ENOMEM);
+	strcpy(procs_path, path);
+	strcat(procs_path, procs_filename);
+
+	procs = filp_open(procs_path, O_WRONLY, 0);
+	kfree(procs_path);
+
+	/*
+	 * Restrict the capability for tasks to mount with memcg charging to the
+	 * cgroup they could not join. For example, disallow:
+	 *
+	 * mount -t tmpfs -o memcg=root-cgroup nodev <MOUNT_DIR>
+	 *
+	 * if it is a non-root task.
+	 */
+	if (IS_ERR(procs))
+		return (struct mem_cgroup *)procs;
+	fput(procs);
+
+	file = filp_open(path, O_DIRECTORY | O_RDONLY, 0);
+	if (IS_ERR(file))
+		return (struct mem_cgroup *)file;
+
+	css = css_tryget_online_from_dir(file->f_path.dentry,
+					 &memory_cgrp_subsys);
+	if (IS_ERR(css))
+		memcg = (struct mem_cgroup *)css;
+	else
+		memcg = container_of(css, struct mem_cgroup, css);
+
+	fput(file);
+	return memcg;
+}
+
+/*
+ * Get the display path of the optional charge target memcg associated with
+ * @sb, as produced by cgroup_path() and mangled for mount-option output.
+ */
+int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len)
+{
+	struct mem_cgroup *memcg;
+	int ret = 0;
+
+	buf[0] = '\0';
+
+	rcu_read_lock();
+	memcg = rcu_dereference(sb->s_memcg_to_charge);
+	if (memcg && !css_tryget_online(&memcg->css))
+		memcg = NULL;
+	rcu_read_unlock();
+
+	if (!memcg)
+		return 0;
+
+	ret = cgroup_path(memcg->css.cgroup, buf + len / 2, len / 2);
+	if (ret >= len / 2)
+		strcpy(buf, "?");
+	else {
+		char *p = mangle_path(buf, buf + len / 2, " \t\n\\");
+
+		if (p)
+			*p = '\0';
+		else
+			strcpy(buf, "?");
+	}
+
+	css_put(&memcg->css);
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Set or clear (if @memcg is NULL) charge association from file system to
+ * memcg.  If @memcg != NULL, then a css reference must be held by the caller to
+ * ensure that the cgroup is not deleted during this operation.
+ */
+void mem_cgroup_set_charge_target(struct mem_cgroup **target,
+				  struct mem_cgroup *memcg)
+{
+	if (memcg)
+		css_get(&memcg->css);
+	memcg = xchg(target, memcg);
+	if (memcg)
+		css_put(&memcg->css);
+}
+
+/*
+ * Returns the memcg to charge for inode pages.  If non-NULL is returned, caller
+ * must drop reference with css_put().  NULL indicates that the inode does not
+ * have a memcg to charge, so the default process based policy should be used.
+ */
+struct mem_cgroup *
+mem_cgroup_mapping_get_charge_target(struct address_space *mapping)
+{
+	struct mem_cgroup *memcg;
+
+	if (!mapping)
+		return NULL;
+
+	rcu_read_lock();
+	memcg = rcu_dereference(mapping->host->i_sb->s_memcg_to_charge);
+	if (memcg && !css_tryget_online(&memcg->css))
+		memcg = NULL;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			unsigned int nr_pages)
 {
@@ -6678,6 +6799,15 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 	return ret;
 }

+int mem_cgroup_charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+			    gfp_t gfp)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	return charge_memcg(folio, memcg, gfp);
+}
+
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
diff --git a/mm/shmem.c b/mm/shmem.c
index 23c91a8beb781..8b623c49ee50d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -115,10 +115,14 @@ struct shmem_options {
 	bool full_inums;
 	int huge;
 	int seen;
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+#endif
 #define SHMEM_SEEN_BLOCKS 1
 #define SHMEM_SEEN_INODES 2
 #define SHMEM_SEEN_HUGE 4
 #define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_MEMCG 16
 };

 #ifdef CONFIG_TMPFS
@@ -697,6 +701,7 @@ static int shmem_add_to_page_cache(struct page *page,
 	unsigned long i = 0;
 	unsigned long nr = compound_nr(page);
 	int error;
+	struct mem_cgroup *remote_memcg;

 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -709,7 +714,14 @@ static int shmem_add_to_page_cache(struct page *page,
 	page->index = index;

 	if (!PageSwapCache(page)) {
-		error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
+		remote_memcg = mem_cgroup_mapping_get_charge_target(mapping);
+		if (remote_memcg) {
+			error = mem_cgroup_charge_memcg(page_folio(page),
+							remote_memcg, gfp);
+			mem_cgroup_put_memcg(remote_memcg);
+		} else
+			error = mem_cgroup_charge(page_folio(page), charge_mm,
+						  gfp);
 		if (error) {
 			if (PageTransHuge(page)) {
 				count_vm_event(THP_FILE_FALLBACK);
@@ -1822,6 +1834,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	int error;
 	int once = 0;
 	int alloced = 0;
+	struct mem_cgroup *remote_memcg, *old_memcg;

 	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
 		return -EFBIG;
@@ -1834,8 +1847,21 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	sbinfo = SHMEM_SB(inode->i_sb);
 	charge_mm = vma ? vma->vm_mm : NULL;

+	/*
+	 * If we're doing a remote charge here, set the active_memcg as the
+	 * remote memcg, so that eventually if pagecache_get_page() calls into
+	 * filemap_add_folio(), we charge the correct memcg.
+	 */
+	remote_memcg = mem_cgroup_mapping_get_charge_target(mapping);
+	if (remote_memcg)
+		old_memcg = set_active_memcg(remote_memcg);
+
 	page = pagecache_get_page(mapping, index,
 					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
+	if (remote_memcg) {
+		set_active_memcg(old_memcg);
+		mem_cgroup_put_memcg(remote_memcg);
+	}

 	if (page && vma && userfaultfd_minor(vma)) {
 		if (!xa_is_value(page)) {
@@ -3342,6 +3368,7 @@ static const struct export_operations shmem_export_ops = {
 enum shmem_param {
 	Opt_gid,
 	Opt_huge,
+	Opt_memcg,
 	Opt_mode,
 	Opt_mpol,
 	Opt_nr_blocks,
@@ -3363,6 +3390,7 @@ static const struct constant_table shmem_param_enums_huge[] = {
 const struct fs_parameter_spec shmem_fs_parameters[] = {
 	fsparam_u32   ("gid",		Opt_gid),
 	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
+	fsparam_string("memcg",		Opt_memcg),
 	fsparam_u32oct("mode",		Opt_mode),
 	fsparam_string("mpol",		Opt_mpol),
 	fsparam_string("nr_blocks",	Opt_nr_blocks),
@@ -3379,6 +3407,9 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 	struct shmem_options *ctx = fc->fs_private;
 	struct fs_parse_result result;
 	unsigned long long size;
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+#endif
 	char *rest;
 	int opt;

@@ -3412,6 +3443,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 			goto bad_value;
 		ctx->seen |= SHMEM_SEEN_INODES;
 		break;
+#ifdef CONFIG_MEMCG
+	case Opt_memcg:
+		if (ctx->memcg)
+			mem_cgroup_set_charge_target(&ctx->memcg, NULL);
+		memcg = mem_cgroup_get_from_path(param->string);
+		if (IS_ERR(memcg))
+			goto bad_value;
+		ctx->memcg = memcg;
+		ctx->seen |= SHMEM_SEEN_MEMCG;
+		break;
+#endif
 	case Opt_mode:
 		ctx->mode = result.uint_32 & 07777;
 		break;
@@ -3573,6 +3615,14 @@ static int shmem_reconfigure(struct fs_context *fc)
 	}
 	raw_spin_unlock(&sbinfo->stat_lock);
 	mpol_put(mpol);
+#ifdef CONFIG_MEMCG
+	if (ctx->seen & SHMEM_SEEN_MEMCG && ctx->memcg) {
+		mem_cgroup_set_charge_target(&fc->root->d_sb->s_memcg_to_charge,
+					     ctx->memcg);
+		css_put(&ctx->memcg->css);
+		ctx->memcg = NULL;
+	}
+#endif
 	return 0;
 out:
 	raw_spin_unlock(&sbinfo->stat_lock);
@@ -3582,6 +3632,11 @@ static int shmem_reconfigure(struct fs_context *fc)
 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+	int err;
+	char *buf = __getname();
+
+	if (!buf)
+		return -ENOMEM;

 	if (sbinfo->max_blocks != shmem_default_max_blocks())
 		seq_printf(seq, ",size=%luk",
@@ -3625,7 +3680,13 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
 #endif
 	shmem_show_mpol(seq, sbinfo->mpol);
-	return 0;
+	/* Memory cgroup binding: memcg=cgroup_name */
+	err = mem_cgroup_get_name_from_sb(root->d_sb, buf, PATH_MAX);
+	if (!err && buf[0] != '\0')
+		seq_printf(seq, ",memcg=%s", buf);
+
+	__putname(buf);
+	return err;
 }

 #endif /* CONFIG_TMPFS */
@@ -3710,6 +3771,14 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_flags |= SB_POSIXACL;
 #endif
 	uuid_gen(&sb->s_uuid);
+#ifdef CONFIG_MEMCG
+	if (ctx->memcg) {
+		mem_cgroup_set_charge_target(&sb->s_memcg_to_charge,
+					     ctx->memcg);
+		css_put(&ctx->memcg->css);
+		ctx->memcg = NULL;
+	}
+#endif

 	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
 	if (!inode)

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 2/4] mm/oom: handle remote ooms
       [not found] <20211111234203.1824138-1-almasrymina@google.com>
  2021-11-11 23:42   ` Mina Almasry
@ 2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  3 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

On remote ooms (OOMs due to remote charging), the oom-killer will attempt
to find a task to kill in the memcg under oom, if the oom-killer
is unable to find one, the oom-killer should simply return ENOMEM to the
allocating process.

If we're in pagefault path and we're unable to return ENOMEM to the
allocating process, we instead kill the allocating process.

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
CC: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---

Changes in v3:
- Fixed build failures/warnings Reported-by: kernel test robot <lkp@intel.com>

Changes in v2:
- Moved the remote oom handling as Roman requested.
- Used mem_cgroup_from_task(current) instead of grabbing the memcg from
current->mm

---
 include/linux/memcontrol.h | 16 ++++++++++++++++
 mm/memcontrol.c            | 29 +++++++++++++++++++++++++++++
 mm/oom_kill.c              | 22 ++++++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8583d37c05d9b..b7a045ace7b2c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -944,6 +944,7 @@ struct mem_cgroup *mem_cgroup_get_from_path(const char *path);
  * it.
  */
 int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len);
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom);

 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		int zid, int nr_pages);
@@ -981,6 +982,11 @@ static inline void mem_cgroup_exit_user_fault(void)
 	current->in_user_fault = 0;
 }

+static inline bool is_in_user_fault(void)
+{
+	return current->in_user_fault;
+}
+
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return p->memcg_in_oom;
@@ -1281,6 +1287,11 @@ static inline int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf,
 	return 0;
 }

+static inline bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	return false;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
@@ -1472,6 +1483,11 @@ static inline void mem_cgroup_exit_user_fault(void)
 {
 }

+static inline bool is_in_user_fault(void)
+{
+	return false;
+}
+
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3d8f52a63d17..8019c396bfdd9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2664,6 +2664,35 @@ int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len)
 	return ret < 0 ? ret : 0;
 }

+/*
+ * Returns true if current's memcg is neither a descendant of, nor equal to,
+ * memcg_under_oom. False otherwise. This is used by the oom-killer to detect
+ * ooms due to remote charging.
+ */
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	struct mem_cgroup *current_memcg;
+	bool is_remote_oom;
+
+	if (!memcg_under_oom)
+		return false;
+
+	rcu_read_lock();
+	current_memcg = mem_cgroup_from_task(current);
+	if (current_memcg && !css_tryget_online(&current_memcg->css))
+		current_memcg = NULL;
+	rcu_read_unlock();
+
+	if (!current_memcg)
+		return false;
+
+	is_remote_oom =
+		!mem_cgroup_is_descendant(current_memcg, memcg_under_oom);
+	css_put(&current_memcg->css);
+
+	return is_remote_oom;
+}
+
 /*
  * Set or clear (if @memcg is NULL) charge association from file system to
  * memcg.  If @memcg != NULL, then a css reference must be held by the caller to
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0a7e16b16b8c3..499924efab370 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1108,6 +1108,28 @@ bool out_of_memory(struct oom_control *oc)
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
+		if (is_remote_oom(oc->memcg)) {
+			/*
+			 * For remote ooms in userfaults, we have no choice but
+			 * to kill the allocating process.
+			 */
+			if (is_in_user_fault() &&
+			    !oom_unkillable_task(current)) {
+				get_task_struct(current);
+				oc->chosen = current;
+				oom_kill_process(
+					oc,
+					"Out of memory (Killing remote allocating task)");
+				return true;
+			}
+
+			/*
+			 * For remote ooms in non-userfaults, simply return
+			 * ENOMEM to the caller.
+			 */
+			return false;
+		}
+
 		dump_header(oc, NULL);
 		pr_warn("Out of memory and no killable processes...\n");
 		/*
--
2.34.0.rc1.387.gb447b232ab-goog

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

On remote ooms (OOMs due to remote charging), the oom-killer will attempt
to find a task to kill in the memcg under oom, if the oom-killer
is unable to find one, the oom-killer should simply return ENOMEM to the
allocating process.

If we're in pagefault path and we're unable to return ENOMEM to the
allocating process, we instead kill the allocating process.

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
CC: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---

Changes in v3:
- Fixed build failures/warnings Reported-by: kernel test robot <lkp@intel.com>

Changes in v2:
- Moved the remote oom handling as Roman requested.
- Used mem_cgroup_from_task(current) instead of grabbing the memcg from
current->mm

---
 include/linux/memcontrol.h | 16 ++++++++++++++++
 mm/memcontrol.c            | 29 +++++++++++++++++++++++++++++
 mm/oom_kill.c              | 22 ++++++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8583d37c05d9b..b7a045ace7b2c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -944,6 +944,7 @@ struct mem_cgroup *mem_cgroup_get_from_path(const char *path);
  * it.
  */
 int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len);
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom);

 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		int zid, int nr_pages);
@@ -981,6 +982,11 @@ static inline void mem_cgroup_exit_user_fault(void)
 	current->in_user_fault = 0;
 }

+static inline bool is_in_user_fault(void)
+{
+	return current->in_user_fault;
+}
+
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return p->memcg_in_oom;
@@ -1281,6 +1287,11 @@ static inline int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf,
 	return 0;
 }

+static inline bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	return false;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
@@ -1472,6 +1483,11 @@ static inline void mem_cgroup_exit_user_fault(void)
 {
 }

+static inline bool is_in_user_fault(void)
+{
+	return false;
+}
+
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3d8f52a63d17..8019c396bfdd9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2664,6 +2664,35 @@ int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len)
 	return ret < 0 ? ret : 0;
 }

+/*
+ * Returns true if current's memcg is neither a descendant of, nor equal to,
+ * memcg_under_oom. False otherwise. This is used by the oom-killer to detect
+ * ooms due to remote charging.
+ */
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	struct mem_cgroup *current_memcg;
+	bool is_remote_oom;
+
+	if (!memcg_under_oom)
+		return false;
+
+	rcu_read_lock();
+	current_memcg = mem_cgroup_from_task(current);
+	if (current_memcg && !css_tryget_online(&current_memcg->css))
+		current_memcg = NULL;
+	rcu_read_unlock();
+
+	if (!current_memcg)
+		return false;
+
+	is_remote_oom =
+		!mem_cgroup_is_descendant(current_memcg, memcg_under_oom);
+	css_put(&current_memcg->css);
+
+	return is_remote_oom;
+}
+
 /*
  * Set or clear (if @memcg is NULL) charge association from file system to
  * memcg.  If @memcg != NULL, then a css reference must be held by the caller to
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0a7e16b16b8c3..499924efab370 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1108,6 +1108,28 @@ bool out_of_memory(struct oom_control *oc)
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
+		if (is_remote_oom(oc->memcg)) {
+			/*
+			 * For remote ooms in userfaults, we have no choice but
+			 * to kill the allocating process.
+			 */
+			if (is_in_user_fault() &&
+			    !oom_unkillable_task(current)) {
+				get_task_struct(current);
+				oc->chosen = current;
+				oom_kill_process(
+					oc,
+					"Out of memory (Killing remote allocating task)");
+				return true;
+			}
+
+			/*
+			 * For remote ooms in non-userfaults, simply return
+			 * ENOMEM to the caller.
+			 */
+			return false;
+		}
+
 		dump_header(oc, NULL);
 		pr_warn("Out of memory and no killable processes...\n");
 		/*
--
2.34.0.rc1.387.gb447b232ab-goog


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

On remote ooms (OOMs due to remote charging), the oom-killer will attempt
to find a task to kill in the memcg under oom, if the oom-killer
is unable to find one, the oom-killer should simply return ENOMEM to the
allocating process.

If we're in pagefault path and we're unable to return ENOMEM to the
allocating process, we instead kill the allocating process.

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
CC: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---

Changes in v3:
- Fixed build failures/warnings Reported-by: kernel test robot <lkp@intel.com>

Changes in v2:
- Moved the remote oom handling as Roman requested.
- Used mem_cgroup_from_task(current) instead of grabbing the memcg from
current->mm

---
 include/linux/memcontrol.h | 16 ++++++++++++++++
 mm/memcontrol.c            | 29 +++++++++++++++++++++++++++++
 mm/oom_kill.c              | 22 ++++++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8583d37c05d9b..b7a045ace7b2c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -944,6 +944,7 @@ struct mem_cgroup *mem_cgroup_get_from_path(const char *path);
  * it.
  */
 int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len);
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom);

 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		int zid, int nr_pages);
@@ -981,6 +982,11 @@ static inline void mem_cgroup_exit_user_fault(void)
 	current->in_user_fault = 0;
 }

+static inline bool is_in_user_fault(void)
+{
+	return current->in_user_fault;
+}
+
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return p->memcg_in_oom;
@@ -1281,6 +1287,11 @@ static inline int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf,
 	return 0;
 }

+static inline bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	return false;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
@@ -1472,6 +1483,11 @@ static inline void mem_cgroup_exit_user_fault(void)
 {
 }

+static inline bool is_in_user_fault(void)
+{
+	return false;
+}
+
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3d8f52a63d17..8019c396bfdd9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2664,6 +2664,35 @@ int mem_cgroup_get_name_from_sb(struct super_block *sb, char *buf, size_t len)
 	return ret < 0 ? ret : 0;
 }

+/*
+ * Returns true if current's memcg is neither a descendant of, nor equal to,
+ * memcg_under_oom. False otherwise. This is used by the oom-killer to detect
+ * ooms due to remote charging.
+ */
+bool is_remote_oom(struct mem_cgroup *memcg_under_oom)
+{
+	struct mem_cgroup *current_memcg;
+	bool is_remote_oom;
+
+	if (!memcg_under_oom)
+		return false;
+
+	rcu_read_lock();
+	current_memcg = mem_cgroup_from_task(current);
+	if (current_memcg && !css_tryget_online(&current_memcg->css))
+		current_memcg = NULL;
+	rcu_read_unlock();
+
+	if (!current_memcg)
+		return false;
+
+	is_remote_oom =
+		!mem_cgroup_is_descendant(current_memcg, memcg_under_oom);
+	css_put(&current_memcg->css);
+
+	return is_remote_oom;
+}
+
 /*
  * Set or clear (if @memcg is NULL) charge association from file system to
  * memcg.  If @memcg != NULL, then a css reference must be held by the caller to
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0a7e16b16b8c3..499924efab370 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1108,6 +1108,28 @@ bool out_of_memory(struct oom_control *oc)
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
+		if (is_remote_oom(oc->memcg)) {
+			/*
+			 * For remote ooms in userfaults, we have no choice but
+			 * to kill the allocating process.
+			 */
+			if (is_in_user_fault() &&
+			    !oom_unkillable_task(current)) {
+				get_task_struct(current);
+				oc->chosen = current;
+				oom_kill_process(
+					oc,
+					"Out of memory (Killing remote allocating task)");
+				return true;
+			}
+
+			/*
+			 * For remote ooms in non-userfaults, simply return
+			 * ENOMEM to the caller.
+			 */
+			return false;
+		}
+
 		dump_header(oc, NULL);
 		pr_warn("Out of memory and no killable processes...\n");
 		/*
--
2.34.0.rc1.387.gb447b232ab-goog

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 3/4] mm, shmem: add tmpfs memcg= option documentation
       [not found] <20211111234203.1824138-1-almasrymina@google.com>
  2021-11-11 23:42   ` Mina Almasry
@ 2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  3 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---
 Documentation/filesystems/tmpfs.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 0408c245785e3..1ab04e8fa9222 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -137,6 +137,23 @@ mount options.  It can be added later, when the tmpfs is already mounted
 on MountPoint, by 'mount -o remount,mpol=Policy:NodeList MountPoint'.


+If CONFIG_MEMCG is enabled, tmpfs has a mount option to specify the memory
+cgroup to be charged for page allocations.
+
+memcg=/sys/fs/cgroup/unified/test/: data page allocations are charged to
+cgroup /sys/fs/cgroup/unified/test/.
+
+When charging memory to the remote memcg (memcg specified with memcg=) and
+hitting the limit, the oom-killer will be invoked and will attempt to kill
+a process in the remote memcg. If no such processes are found, the remote
+charging process gets an ENOMEM. If the remote charging process is in the
+pagefault path, it gets killed.
+
+Only processes that have access to /sys/fs/cgroup/unified/test/cgroup.procs can
+mount a tmpfs with memcg=/sys/fs/cgroup/unified/test. Thus, a process is able
+to charge memory to a cgroup only if it itself is able to enter that cgroup.
+
+
 To specify the initial root directory you can use the following mount
 options:

--
2.34.0.rc1.387.gb447b232ab-goog

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 3/4] mm, shmem: add tmpfs memcg= option documentation
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---
 Documentation/filesystems/tmpfs.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 0408c245785e3..1ab04e8fa9222 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -137,6 +137,23 @@ mount options.  It can be added later, when the tmpfs is already mounted
 on MountPoint, by 'mount -o remount,mpol=Policy:NodeList MountPoint'.


+If CONFIG_MEMCG is enabled, tmpfs has a mount option to specify the memory
+cgroup to be charged for page allocations.
+
+memcg=/sys/fs/cgroup/unified/test/: data page allocations are charged to
+cgroup /sys/fs/cgroup/unified/test/.
+
+When charging memory to the remote memcg (memcg specified with memcg=) and
+hitting the limit, the oom-killer will be invoked and will attempt to kill
+a process in the remote memcg. If no such processes are found, the remote
+charging process gets an ENOMEM. If the remote charging process is in the
+pagefault path, it gets killed.
+
+Only processes that have access to /sys/fs/cgroup/unified/test/cgroup.procs can
+mount a tmpfs with memcg=/sys/fs/cgroup/unified/test. Thus, a process is able
+to charge memory to a cgroup only if it itself is able to enter that cgroup.
+
+
 To specify the initial root directory you can use the following mount
 options:

--
2.34.0.rc1.387.gb447b232ab-goog


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 3/4] mm, shmem: add tmpfs memcg= option documentation
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---
 Documentation/filesystems/tmpfs.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 0408c245785e3..1ab04e8fa9222 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -137,6 +137,23 @@ mount options.  It can be added later, when the tmpfs is already mounted
 on MountPoint, by 'mount -o remount,mpol=Policy:NodeList MountPoint'.


+If CONFIG_MEMCG is enabled, tmpfs has a mount option to specify the memory
+cgroup to be charged for page allocations.
+
+memcg=/sys/fs/cgroup/unified/test/: data page allocations are charged to
+cgroup /sys/fs/cgroup/unified/test/.
+
+When charging memory to the remote memcg (memcg specified with memcg=) and
+hitting the limit, the oom-killer will be invoked and will attempt to kill
+a process in the remote memcg. If no such processes are found, the remote
+charging process gets an ENOMEM. If the remote charging process is in the
+pagefault path, it gets killed.
+
+Only processes that have access to /sys/fs/cgroup/unified/test/cgroup.procs can
+mount a tmpfs with memcg=/sys/fs/cgroup/unified/test. Thus, a process is able
+to charge memory to a cgroup only if it itself is able to enter that cgroup.
+
+
 To specify the initial root directory you can use the following mount
 options:

--
2.34.0.rc1.387.gb447b232ab-goog

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 4/4] mm, shmem, selftests: add tmpfs memcg= mount option tests
       [not found] <20211111234203.1824138-1-almasrymina@google.com>
  2021-11-11 23:42   ` Mina Almasry
@ 2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  2021-11-11 23:42   ` Mina Almasry
  3 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---
 tools/testing/selftests/vm/.gitignore     |   1 +
 tools/testing/selftests/vm/mmap_write.c   | 103 ++++++++++++++++++++++
 tools/testing/selftests/vm/tmpfs-memcg.sh |  91 +++++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 tools/testing/selftests/vm/mmap_write.c
 create mode 100755 tools/testing/selftests/vm/tmpfs-memcg.sh

diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 2e7e86e852828..cb229974c5f15 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -19,6 +19,7 @@ madv_populate
 userfaultfd
 mlock-intersect-test
 mlock-random-test
+mmap_write
 virtual_address_range
 gup_test
 va_128TBswitch
diff --git a/tools/testing/selftests/vm/mmap_write.c b/tools/testing/selftests/vm/mmap_write.c
new file mode 100644
index 0000000000000..88a8468f2128c
--- /dev/null
+++ b/tools/testing/selftests/vm/mmap_write.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program faults memory in tmpfs
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Global definitions. */
+
+/* Global variables. */
+static const char *self;
+static char *shmaddr;
+static int shmid;
+
+/*
+ * Show usage and exit.
+ */
+static void exit_usage(void)
+{
+	printf("Usage: %s -p <path to tmpfs file> -s <size to map>\n", self);
+	exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv)
+{
+	int fd = 0;
+	int key = 0;
+	int *ptr = NULL;
+	int c = 0;
+	int size = 0;
+	char path[256] = "";
+	int want_sleep = 0, private = 0;
+	int populate = 0;
+	int write = 0;
+	int reserve = 1;
+
+	/* Parse command-line arguments. */
+	setvbuf(stdout, NULL, _IONBF, 0);
+	self = argv[0];
+
+	while ((c = getopt(argc, argv, ":s:p:")) != -1) {
+		switch (c) {
+		case 's':
+			size = atoi(optarg);
+			break;
+		case 'p':
+			strncpy(path, optarg, sizeof(path));
+			break;
+		default:
+			errno = EINVAL;
+			perror("Invalid arg");
+			exit_usage();
+		}
+	}
+
+	printf("%s\n", path);
+	if (strncmp(path, "", sizeof(path)) != 0) {
+		printf("Writing to this path: %s\n", path);
+	} else {
+		errno = EINVAL;
+		perror("path not found");
+		exit_usage();
+	}
+
+	if (size != 0) {
+		printf("Writing this size: %d\n", size);
+	} else {
+		errno = EINVAL;
+		perror("size not found");
+		exit_usage();
+	}
+
+	fd = open(path, O_CREAT | O_RDWR, 0777);
+	if (fd == -1)
+		err(1, "Failed to open file.");
+
+	if (ftruncate(fd, size))
+		err(1, "failed to ftruncate %s", path);
+
+	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ptr == MAP_FAILED) {
+		close(fd);
+		err(1, "Error mapping the file");
+	}
+
+	printf("Writing to memory.\n");
+	memset(ptr, 1, size);
+	printf("Done writing to memory.\n");
+	close(fd);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/tmpfs-memcg.sh b/tools/testing/selftests/vm/tmpfs-memcg.sh
new file mode 100755
index 0000000000000..30da2fad06357
--- /dev/null
+++ b/tools/testing/selftests/vm/tmpfs-memcg.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+CGROUP_PATH=/dev/cgroup/memory/tmpfs-memcg-test
+
+function cleanup() {
+  rm -rf /mnt/tmpfs/*
+  umount /mnt/tmpfs
+  rm -rf /mnt/tmpfs
+
+  rmdir $CGROUP_PATH
+
+  echo CLEANUP DONE
+}
+
+function setup() {
+  mkdir -p $CGROUP_PATH
+  echo $((10 * 1024 * 1024)) > $CGROUP_PATH/memory.limit_in_bytes
+  echo 0 > $CGROUP_PATH/cpuset.cpus
+  echo 0 > $CGROUP_PATH/cpuset.mems
+
+  mkdir -p /mnt/tmpfs
+
+  echo SETUP DONE
+}
+
+function expect_equal() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$actual" != "$expected" ]]; then
+    echo "expected ($expected) != actual ($actual): $3" >&2
+    cleanup
+    exit 1
+  fi
+}
+
+function expect_ge() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$actual" -lt "$expected" ]]; then
+    echo "actual ($actual) < expected ($expected): $3" >&2
+    cleanup
+    exit 1
+  fi
+}
+
+cleanup
+setup
+
+mount -t tmpfs -o memcg=$CGROUP_PATH tmpfs /mnt/tmpfs
+
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_equal 0 "$TARGET_MEMCG_USAGE" "Before echo, memcg usage should be 0"
+
+# Echo to allocate a page in the tmpfs
+echo
+echo
+echo hello > /mnt/tmpfs/test
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_ge 4096 "$TARGET_MEMCG_USAGE" "After echo, memcg usage should be greater than 4096"
+echo "Echo test succeeded"
+
+echo
+echo
+tools/testing/selftests/vm/mmap_write -p /mnt/tmpfs/test -s $((1 * 1024 * 1024))
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_ge $((1 * 1024 * 1024)) "$TARGET_MEMCG_USAGE" "After write, memcg usage should be greater than 1MB"
+echo "Write succeeded"
+
+# OOM the remote container on pagefault.
+echo
+echo
+echo "OOMing the remote container using pagefault."
+echo "This will take a long time because the kernel goes through reclaim retries,"
+echo "but should eventually be OOM-killed by 'Out of memory (Killing remote allocating task)'"
+tools/testing/selftests/vm/mmap_write -p /mnt/tmpfs/test -s $((11 * 1024 * 1024))
+
+# OOM the remote container on non pagefault.
+echo
+echo
+echo "OOMing the remote container using cat (non-pagefault)"
+echo "This will take a long time because the kernel goes through reclaim retries,"
+echo "but eventually the cat command should receive an ENOMEM"
+cat /dev/random > /mnt/tmpfs/random
+
+cleanup
+echo SUCCESS
--
2.34.0.rc1.387.gb447b232ab-goog

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 4/4] mm, shmem, selftests: add tmpfs memcg= mount option tests
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song, riel,
	linux-mm, linux-fsdevel, cgroups

Signed-off-by: Mina Almasry <almasrymina@google.com>

Cc: Michal Hocko <mhocko@suse.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Greg Thelen <gthelen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: riel@surriel.com
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: cgroups@vger.kernel.org

---
 tools/testing/selftests/vm/.gitignore     |   1 +
 tools/testing/selftests/vm/mmap_write.c   | 103 ++++++++++++++++++++++
 tools/testing/selftests/vm/tmpfs-memcg.sh |  91 +++++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 tools/testing/selftests/vm/mmap_write.c
 create mode 100755 tools/testing/selftests/vm/tmpfs-memcg.sh

diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 2e7e86e852828..cb229974c5f15 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -19,6 +19,7 @@ madv_populate
 userfaultfd
 mlock-intersect-test
 mlock-random-test
+mmap_write
 virtual_address_range
 gup_test
 va_128TBswitch
diff --git a/tools/testing/selftests/vm/mmap_write.c b/tools/testing/selftests/vm/mmap_write.c
new file mode 100644
index 0000000000000..88a8468f2128c
--- /dev/null
+++ b/tools/testing/selftests/vm/mmap_write.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program faults memory in tmpfs
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Global definitions. */
+
+/* Global variables. */
+static const char *self;
+static char *shmaddr;
+static int shmid;
+
+/*
+ * Show usage and exit.
+ */
+static void exit_usage(void)
+{
+	printf("Usage: %s -p <path to tmpfs file> -s <size to map>\n", self);
+	exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv)
+{
+	int fd = 0;
+	int key = 0;
+	int *ptr = NULL;
+	int c = 0;
+	int size = 0;
+	char path[256] = "";
+	int want_sleep = 0, private = 0;
+	int populate = 0;
+	int write = 0;
+	int reserve = 1;
+
+	/* Parse command-line arguments. */
+	setvbuf(stdout, NULL, _IONBF, 0);
+	self = argv[0];
+
+	while ((c = getopt(argc, argv, ":s:p:")) != -1) {
+		switch (c) {
+		case 's':
+			size = atoi(optarg);
+			break;
+		case 'p':
+			strncpy(path, optarg, sizeof(path));
+			break;
+		default:
+			errno = EINVAL;
+			perror("Invalid arg");
+			exit_usage();
+		}
+	}
+
+	printf("%s\n", path);
+	if (strncmp(path, "", sizeof(path)) != 0) {
+		printf("Writing to this path: %s\n", path);
+	} else {
+		errno = EINVAL;
+		perror("path not found");
+		exit_usage();
+	}
+
+	if (size != 0) {
+		printf("Writing this size: %d\n", size);
+	} else {
+		errno = EINVAL;
+		perror("size not found");
+		exit_usage();
+	}
+
+	fd = open(path, O_CREAT | O_RDWR, 0777);
+	if (fd == -1)
+		err(1, "Failed to open file.");
+
+	if (ftruncate(fd, size))
+		err(1, "failed to ftruncate %s", path);
+
+	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ptr == MAP_FAILED) {
+		close(fd);
+		err(1, "Error mapping the file");
+	}
+
+	printf("Writing to memory.\n");
+	memset(ptr, 1, size);
+	printf("Done writing to memory.\n");
+	close(fd);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/tmpfs-memcg.sh b/tools/testing/selftests/vm/tmpfs-memcg.sh
new file mode 100755
index 0000000000000..30da2fad06357
--- /dev/null
+++ b/tools/testing/selftests/vm/tmpfs-memcg.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+CGROUP_PATH=/dev/cgroup/memory/tmpfs-memcg-test
+
+function cleanup() {
+  rm -rf /mnt/tmpfs/*
+  umount /mnt/tmpfs
+  rm -rf /mnt/tmpfs
+
+  rmdir $CGROUP_PATH
+
+  echo CLEANUP DONE
+}
+
+function setup() {
+  mkdir -p $CGROUP_PATH
+  echo $((10 * 1024 * 1024)) > $CGROUP_PATH/memory.limit_in_bytes
+  echo 0 > $CGROUP_PATH/cpuset.cpus
+  echo 0 > $CGROUP_PATH/cpuset.mems
+
+  mkdir -p /mnt/tmpfs
+
+  echo SETUP DONE
+}
+
+function expect_equal() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$actual" != "$expected" ]]; then
+    echo "expected ($expected) != actual ($actual): $3" >&2
+    cleanup
+    exit 1
+  fi
+}
+
+function expect_ge() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$actual" -lt "$expected" ]]; then
+    echo "actual ($actual) < expected ($expected): $3" >&2
+    cleanup
+    exit 1
+  fi
+}
+
+cleanup
+setup
+
+mount -t tmpfs -o memcg=$CGROUP_PATH tmpfs /mnt/tmpfs
+
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_equal 0 "$TARGET_MEMCG_USAGE" "Before echo, memcg usage should be 0"
+
+# Echo to allocate a page in the tmpfs
+echo
+echo
+echo hello > /mnt/tmpfs/test
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_ge 4096 "$TARGET_MEMCG_USAGE" "After echo, memcg usage should be greater than 4096"
+echo "Echo test succeeded"
+
+echo
+echo
+tools/testing/selftests/vm/mmap_write -p /mnt/tmpfs/test -s $((1 * 1024 * 1024))
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_ge $((1 * 1024 * 1024)) "$TARGET_MEMCG_USAGE" "After write, memcg usage should be greater than 1MB"
+echo "Write succeeded"
+
+# OOM the remote container on pagefault.
+echo
+echo
+echo "OOMing the remote container using pagefault."
+echo "This will take a long time because the kernel goes through reclaim retries,"
+echo "but should eventually be OOM-killed by 'Out of memory (Killing remote allocating task)'"
+tools/testing/selftests/vm/mmap_write -p /mnt/tmpfs/test -s $((11 * 1024 * 1024))
+
+# OOM the remote container on non pagefault.
+echo
+echo
+echo "OOMing the remote container using cat (non-pagefault)"
+echo "This will take a long time because the kernel goes through reclaim retries,"
+echo "but should eventually the cat command should receive an ENOMEM"
+cat /dev/random > /mnt/tmpfs/random
+
+cleanup
+echo SUCCESS
--
2.34.0.rc1.387.gb447b232ab-goog


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v3 4/4] mm, shmem, selftests: add tmpfs memcg= mount option tests
@ 2021-11-11 23:42   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-11 23:42 UTC (permalink / raw)
  Cc: Mina Almasry, Michal Hocko, Theodore Ts'o, Greg Thelen,
	Shakeel Butt, Andrew Morton, Hugh Dickins, Roman Gushchin,
	Johannes Weiner, Tejun Heo, Vladimir Davydov, Muchun Song,
	riel-ebMLmSuQjDVBDgjK7y7TUQ, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

Signed-off-by: Mina Almasry <almasrymina-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Cc: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
Cc: Theodore Ts'o <tytso-3s7WtUTddSA@public.gmane.org>
Cc: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Shakeel Butt <shakeelb-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Roman Gushchin <guro-b10kYP2dOMg@public.gmane.org>
Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Vladimir Davydov <vdavydov.dev-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Cc: Muchun Song <songmuchun-EC8Uxl6Npydl57MIdRCFDg@public.gmane.org>
Cc: riel-ebMLmSuQjDVBDgjK7y7TUQ@public.gmane.org
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

---
 tools/testing/selftests/vm/.gitignore     |   1 +
 tools/testing/selftests/vm/mmap_write.c   | 103 ++++++++++++++++++++++
 tools/testing/selftests/vm/tmpfs-memcg.sh |  91 +++++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 tools/testing/selftests/vm/mmap_write.c
 create mode 100755 tools/testing/selftests/vm/tmpfs-memcg.sh

diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 2e7e86e852828..cb229974c5f15 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -19,6 +19,7 @@ madv_populate
 userfaultfd
 mlock-intersect-test
 mlock-random-test
+mmap_write
 virtual_address_range
 gup_test
 va_128TBswitch
diff --git a/tools/testing/selftests/vm/mmap_write.c b/tools/testing/selftests/vm/mmap_write.c
new file mode 100644
index 0000000000000..88a8468f2128c
--- /dev/null
+++ b/tools/testing/selftests/vm/mmap_write.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program faults memory in tmpfs
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Global definitions. */
+
+/* Global variables. */
+static const char *self;
+static char *shmaddr;
+static int shmid;
+
+/*
+ * Show usage and exit.
+ */
+static void exit_usage(void)
+{
+	printf("Usage: %s -p <path to tmpfs file> -s <size to map>\n", self);
+	exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv)
+{
+	int fd = 0;
+	int key = 0;
+	int *ptr = NULL;
+	int c = 0;
+	int size = 0;
+	char path[256] = "";
+	int want_sleep = 0, private = 0;
+	int populate = 0;
+	int write = 0;
+	int reserve = 1;
+
+	/* Parse command-line arguments. */
+	setvbuf(stdout, NULL, _IONBF, 0);
+	self = argv[0];
+
+	while ((c = getopt(argc, argv, ":s:p:")) != -1) {
+		switch (c) {
+		case 's':
+			size = atoi(optarg);
+			break;
+		case 'p':
+			strncpy(path, optarg, sizeof(path));
+			break;
+		default:
+			errno = EINVAL;
+			perror("Invalid arg");
+			exit_usage();
+		}
+	}
+
+	printf("%s\n", path);
+	if (strncmp(path, "", sizeof(path)) != 0) {
+		printf("Writing to this path: %s\n", path);
+	} else {
+		errno = EINVAL;
+		perror("path not found");
+		exit_usage();
+	}
+
+	if (size != 0) {
+		printf("Writing this size: %d\n", size);
+	} else {
+		errno = EINVAL;
+		perror("size not found");
+		exit_usage();
+	}
+
+	fd = open(path, O_CREAT | O_RDWR, 0777);
+	if (fd == -1)
+		err(1, "Failed to open file.");
+
+	if (ftruncate(fd, size))
+		err(1, "failed to ftruncate %s", path);
+
+	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ptr == MAP_FAILED) {
+		close(fd);
+		err(1, "Error mapping the file");
+	}
+
+	printf("Writing to memory.\n");
+	memset(ptr, 1, size);
+	printf("Done writing to memory.\n");
+	close(fd);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/tmpfs-memcg.sh b/tools/testing/selftests/vm/tmpfs-memcg.sh
new file mode 100755
index 0000000000000..30da2fad06357
--- /dev/null
+++ b/tools/testing/selftests/vm/tmpfs-memcg.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+CGROUP_PATH=/dev/cgroup/memory/tmpfs-memcg-test
+
+function cleanup() {
+  rm -rf /mnt/tmpfs/*
+  umount /mnt/tmpfs
+  rm -rf /mnt/tmpfs
+
+  rmdir $CGROUP_PATH
+
+  echo CLEANUP DONE
+}
+
+function setup() {
+  mkdir -p $CGROUP_PATH
+  echo $((10 * 1024 * 1024)) > $CGROUP_PATH/memory.limit_in_bytes
+  echo 0 > $CGROUP_PATH/cpuset.cpus
+  echo 0 > $CGROUP_PATH/cpuset.mems
+
+  mkdir -p /mnt/tmpfs
+
+  echo SETUP DONE
+}
+
+function expect_equal() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$actual" != "$expected" ]]; then
+    echo "expected ($expected) != actual ($actual): $3" >&2
+    cleanup
+    exit 1
+  fi
+}
+
+function expect_ge() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$actual" -lt "$expected" ]]; then
+    echo "actual ($actual) < expected ($expected): $3" >&2
+    cleanup
+    exit 1
+  fi
+}
+
+cleanup
+setup
+
+mount -t tmpfs -o memcg=$CGROUP_PATH tmpfs /mnt/tmpfs
+
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_equal 0 "$TARGET_MEMCG_USAGE" "Before echo, memcg usage should be 0"
+
+# Echo to allocate a page in the tmpfs
+echo
+echo
+echo hello > /mnt/tmpfs/test
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_ge 4096 "$TARGET_MEMCG_USAGE" "After echo, memcg usage should be greater than 4096"
+echo "Echo test succeeded"
+
+echo
+echo
+tools/testing/selftests/vm/mmap_write -p /mnt/tmpfs/test -s $((1 * 1024 * 1024))
+TARGET_MEMCG_USAGE=$(cat $CGROUP_PATH/memory.usage_in_bytes)
+expect_ge $((1 * 1024 * 1024)) "$TARGET_MEMCG_USAGE" "After write, memcg usage should be greater than 1MB"
+echo "Write succeeded"
+
+# OOM the remote container on pagefault.
+echo
+echo
+echo "OOMing the remote container using pagefault."
+echo "This will take a long time because the kernel goes through reclaim retries,"
+echo "but should eventually be OOM-killed by 'Out of memory (Killing remote allocating task)'"
+tools/testing/selftests/vm/mmap_write -p /mnt/tmpfs/test -s $((11 * 1024 * 1024))
+
+# OOM the remote container on non pagefault.
+echo
+echo
+echo "OOMing the remote container using cat (non-pagefault)"
+echo "This will take a long time because the kernel goes through reclaim retries,"
+echo "but should eventually the cat command should receive an ENOMEM"
+cat /dev/random > /mnt/tmpfs/random
+
+cleanup
+echo SUCCESS

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 1/4] mm/shmem: support deterministic charging of tmpfs
  2021-11-11 23:42   ` Mina Almasry
@ 2021-11-12  2:41     ` kernel test robot
  -1 siblings, 0 replies; 42+ messages in thread
From: kernel test robot @ 2021-11-12  2:41 UTC (permalink / raw)
  To: Mina Almasry
  Cc: kbuild-all, Mina Almasry, Michal Hocko, Theodore Ts'o,
	Greg Thelen, Shakeel Butt, Andrew Morton,
	Linux Memory Management List, Hugh Dickins, Roman Gushchin,
	Johannes Weiner

[-- Attachment #1: Type: text/plain, Size: 2161 bytes --]

Hi Mina,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on hnaz-mm/master]

url:    https://github.com/0day-ci/linux/commits/Mina-Almasry/mm-shmem-support-deterministic-charging-of-tmpfs/20211112-084506
base:   https://github.com/hnaz/linux-mm master
config: um-i386_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/51d8c281f9e96cc0269902e4597a972c7e2510f0
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Mina-Almasry/mm-shmem-support-deterministic-charging-of-tmpfs/20211112-084506
        git checkout 51d8c281f9e96cc0269902e4597a972c7e2510f0
        # save the attached .config to linux build tree
        make W=1 ARCH=um SUBARCH=i386

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   fs/super.c: In function 'destroy_unused_super':
>> fs/super.c:184:5: warning: "CONFIG_MEMCG" is not defined, evaluates to 0 [-Wundef]
     184 | #if CONFIG_MEMCG
         |     ^~~~~~~~~~~~
   fs/super.c: In function '__put_super':
   fs/super.c:299:5: warning: "CONFIG_MEMCG" is not defined, evaluates to 0 [-Wundef]
     299 | #if CONFIG_MEMCG
         |     ^~~~~~~~~~~~


vim +/CONFIG_MEMCG +184 fs/super.c

   175	
   176	/* Free a superblock that has never been seen by anyone */
   177	static void destroy_unused_super(struct super_block *s)
   178	{
   179		if (!s)
   180			return;
   181		up_write(&s->s_umount);
   182		list_lru_destroy(&s->s_dentry_lru);
   183		list_lru_destroy(&s->s_inode_lru);
 > 184	#if CONFIG_MEMCG
   185		mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
   186	#endif
   187		security_sb_free(s);
   188		put_user_ns(s->s_user_ns);
   189		kfree(s->s_subtype);
   190		free_prealloced_shrinker(&s->s_shrink);
   191		/* no delays needed */
   192		destroy_super_work(&s->destroy_work);
   193	}
   194	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 9939 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 1/4] mm/shmem: support deterministic charging of tmpfs
@ 2021-11-12  2:41     ` kernel test robot
  0 siblings, 0 replies; 42+ messages in thread
From: kernel test robot @ 2021-11-12  2:41 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2222 bytes --]

Hi Mina,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on hnaz-mm/master]

url:    https://github.com/0day-ci/linux/commits/Mina-Almasry/mm-shmem-support-deterministic-charging-of-tmpfs/20211112-084506
base:   https://github.com/hnaz/linux-mm master
config: um-i386_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/51d8c281f9e96cc0269902e4597a972c7e2510f0
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Mina-Almasry/mm-shmem-support-deterministic-charging-of-tmpfs/20211112-084506
        git checkout 51d8c281f9e96cc0269902e4597a972c7e2510f0
        # save the attached .config to linux build tree
        make W=1 ARCH=um SUBARCH=i386

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   fs/super.c: In function 'destroy_unused_super':
>> fs/super.c:184:5: warning: "CONFIG_MEMCG" is not defined, evaluates to 0 [-Wundef]
     184 | #if CONFIG_MEMCG
         |     ^~~~~~~~~~~~
   fs/super.c: In function '__put_super':
   fs/super.c:299:5: warning: "CONFIG_MEMCG" is not defined, evaluates to 0 [-Wundef]
     299 | #if CONFIG_MEMCG
         |     ^~~~~~~~~~~~


vim +/CONFIG_MEMCG +184 fs/super.c

   175	
   176	/* Free a superblock that has never been seen by anyone */
   177	static void destroy_unused_super(struct super_block *s)
   178	{
   179		if (!s)
   180			return;
   181		up_write(&s->s_umount);
   182		list_lru_destroy(&s->s_dentry_lru);
   183		list_lru_destroy(&s->s_inode_lru);
 > 184	#if CONFIG_MEMCG
   185		mem_cgroup_set_charge_target(&s->s_memcg_to_charge, NULL);
   186	#endif
   187		security_sb_free(s);
   188		put_user_ns(s->s_user_ns);
   189		kfree(s->s_subtype);
   190		free_prealloced_shrinker(&s->s_shrink);
   191		/* no delays needed */
   192		destroy_super_work(&s->destroy_work);
   193	}
   194	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 9939 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-12  7:51     ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-12  7:51 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> to find a task to kill in the memcg under oom, if the oom-killer
> is unable to find one, the oom-killer should simply return ENOMEM to the
> allocating process.

This really begs for some justification.

> If we're in pagefault path and we're unable to return ENOMEM to the
> allocating process, we instead kill the allocating process.

Why do you handle those differently?

> Signed-off-by: Mina Almasry <almasrymina@google.com>
> 
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Theodore Ts'o <tytso@mit.edu>
> Cc: Greg Thelen <gthelen@google.com>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Hugh Dickins <hughd@google.com>
> CC: Roman Gushchin <guro@fb.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> Cc: Muchun Song <songmuchun@bytedance.com>
> Cc: riel@surriel.com
> Cc: linux-mm@kvack.org
> Cc: linux-fsdevel@vger.kernel.org
> Cc: cgroups@vger.kernel.org
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-12  7:51     ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-12  7:51 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> to find a task to kill in the memcg under oom, if the oom-killer
> is unable to find one, the oom-killer should simply return ENOMEM to the
> allocating process.

This really begs for some justification.

> If we're in pagefault path and we're unable to return ENOMEM to the
> allocating process, we instead kill the allocating process.

Why do you handle those differently?

> Signed-off-by: Mina Almasry <almasrymina-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> 
> Cc: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
> Cc: Theodore Ts'o <tytso-3s7WtUTddSA@public.gmane.org>
> Cc: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> Cc: Shakeel Butt <shakeelb-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> CC: Roman Gushchin <guro-b10kYP2dOMg@public.gmane.org>
> Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> Cc: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Cc: Vladimir Davydov <vdavydov.dev-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> Cc: Muchun Song <songmuchun-EC8Uxl6Npydl57MIdRCFDg@public.gmane.org>
> Cc: riel-ebMLmSuQjDVBDgjK7y7TUQ@public.gmane.org
> Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
> Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-12  8:12       ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-12  8:12 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > to find a task to kill in the memcg under oom, if the oom-killer
> > is unable to find one, the oom-killer should simply return ENOMEM to the
> > allocating process.
>
> This really begs for some justification.
>

I'm thinking (and I can add to the commit message in v4) that we have
2 reasonable options when the oom-killer gets invoked and finds
nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
thinking returning ENOMEM allows the application to gracefully handle
the failure to remote charge and continue operation.

For example, in the network service use case that I mentioned in the
RFC proposal, it's beneficial for the network service to get an ENOMEM
and continue to service network requests for other clients running on
the machine, rather than get oom-killed when hitting the remote memcg
limit. But, this is not a hard requirement, the network service could
fork a process that does the remote charging to guard against the
remote charge bringing down the entire process.

> > If we're in pagefault path and we're unable to return ENOMEM to the
> > allocating process, we instead kill the allocating process.
>
> Why do you handle those differently?
>

I'm thinking (possibly incorrectly) it's beneficial to return ENOMEM
to the allocating task rather than killing it. I would love to return
ENOMEM in both these cases, but I can't return ENOMEM in the fault
path. The behavior I see is that the oom-killer gets invoked over and
over again looking to find something to kill and continually failing
to find something to kill and the pagefault never gets handled.

I could, however, kill the allocating task whether it's in the
pagefault path or not; it's not a hard requirement that I return
ENOMEM. If this is what you'd like to see in v4, please let me know,
but I do see some value in allowing some callers to gracefully handle
the ENOMEM.

> > Signed-off-by: Mina Almasry <almasrymina@google.com>
> >
> > Cc: Michal Hocko <mhocko@suse.com>
> > Cc: Theodore Ts'o <tytso@mit.edu>
> > Cc: Greg Thelen <gthelen@google.com>
> > Cc: Shakeel Butt <shakeelb@google.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Hugh Dickins <hughd@google.com>
> > CC: Roman Gushchin <guro@fb.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> > Cc: Muchun Song <songmuchun@bytedance.com>
> > Cc: riel@surriel.com
> > Cc: linux-mm@kvack.org
> > Cc: linux-fsdevel@vger.kernel.org
> > Cc: cgroups@vger.kernel.org
> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-12  8:12       ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-12  8:12 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
> On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > to find a task to kill in the memcg under oom, if the oom-killer
> > is unable to find one, the oom-killer should simply return ENOMEM to the
> > allocating process.
>
> This really begs for some justification.
>

I'm thinking (and I can add to the commit message in v4) that we have
2 reasonable options when the oom-killer gets invoked and finds
nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
thinking returning ENOMEM allows the application to gracefully handle
the failure to remote charge and continue operation.

For example, in the network service use case that I mentioned in the
RFC proposal, it's beneficial for the network service to get an ENOMEM
and continue to service network requests for other clients running on
the machine, rather than get oom-killed when hitting the remote memcg
limit. But, this is not a hard requirement, the network service could
fork a process that does the remote charging to guard against the
remote charge bringing down the entire process.

> > If we're in pagefault path and we're unable to return ENOMEM to the
> > allocating process, we instead kill the allocating process.
>
> Why do you handle those differently?
>

I'm thinking (possibly incorrectly) it's beneficial to return ENOMEM
to the allocating task rather than killing it. I would love to return
ENOMEM in both these cases, but I can't return ENOMEM in the fault
path. The behavior I see is that the oom-killer gets invoked over and
over again looking to find something to kill and continually failing
to find something to kill and the pagefault never gets handled.

I could, however, kill the allocating task whether it's in the
pagefault path or not; it's not a hard requirement that I return
ENOMEM. If this is what you'd like to see in v4, please let me know,
but I do see some value in allowing some callers to gracefully handle
the ENOMEM.

> > Signed-off-by: Mina Almasry <almasrymina-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> >
> > Cc: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
> > Cc: Theodore Ts'o <tytso-3s7WtUTddSA@public.gmane.org>
> > Cc: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > Cc: Shakeel Butt <shakeelb-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> > Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > CC: Roman Gushchin <guro-b10kYP2dOMg@public.gmane.org>
> > Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> > Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > Cc: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> > Cc: Vladimir Davydov <vdavydov.dev-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Cc: Muchun Song <songmuchun-EC8Uxl6Npydl57MIdRCFDg@public.gmane.org>
> > Cc: riel-ebMLmSuQjDVBDgjK7y7TUQ@public.gmane.org
> > Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
> > Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
  2021-11-12  8:12       ` Mina Almasry
  (?)
@ 2021-11-12  8:36       ` Michal Hocko
  2021-11-12 17:59           ` Mina Almasry
  -1 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2021-11-12  8:36 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > to find a task to kill in the memcg under oom, if the oom-killer
> > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > allocating process.
> >
> > This really begs for some justification.
> >
> 
> I'm thinking (and I can add to the commit message in v4) that we have
> 2 reasonable options when the oom-killer gets invoked and finds
> nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> thinking returning ENOMEM allows the application to gracefully handle
> the failure to remote charge and continue operation.
> 
> For example, in the network service use case that I mentioned in the
> RFC proposal, it's beneficial for the network service to get an ENOMEM
> and continue to service network requests for other clients running on
> the machine, rather than get oom-killed when hitting the remote memcg
> limit. But, this is not a hard requirement, the network service could
> fork a process that does the remote charging to guard against the
> remote charge bringing down the entire process.

This all belongs to the changelog so that we can discuss all potential
implication and do not rely on any implicit assumptions. E.g. why does
it even make sense to kill a task in the origin cgroup?

> > > If we're in pagefault path and we're unable to return ENOMEM to the
> > > allocating process, we instead kill the allocating process.
> >
> > Why do you handle those differently?
> >
> 
> I'm thinking (possibly incorrectly) it's beneficial to return ENOMEM
> to the allocating task rather than killing it. I would love to return
> ENOMEM in both these cases, but I can't return ENOMEM in the fault
> path. The behavior I see is that the oom-killer gets invoked over and
> over again looking to find something to kill and continually failing
> to find something to kill and the pagefault never gets handled.

Just one remark. Until just very recently VM_FAULT_OOM (a result of
ENOMEM) would trigger the global OOM killer. This has changed by
60e2793d440a ("mm, oom: do not trigger out_of_memory from the #PF").
But you are right that you might just end up looping in the page fault
for ever. Is that bad though? The situation is fundamentally
unresolvable at this stage. On the other hand the task is still
killable so the userspace can decide to terminate and break out of the
loop.

What is the best approach I am not quite sure. As I've said earlier this
is very likely going to open a can of worms and so it should be
evaluated very carefully. For that, please make sure to describe your
thinking in detail.
 
> I could, however, kill the allocating task whether it's in the
> pagefault path or not; it's not a hard requirement that I return
> ENOMEM. If this is what you'd like to see in v4, please let me know,
> but I do see some value in allowing some callers to gracefully handle
> the ENOMEM.
> 
> > > Signed-off-by: Mina Almasry <almasrymina@google.com>
> > >
> > > Cc: Michal Hocko <mhocko@suse.com>
> > > Cc: Theodore Ts'o <tytso@mit.edu>
> > > Cc: Greg Thelen <gthelen@google.com>
> > > Cc: Shakeel Butt <shakeelb@google.com>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: Hugh Dickins <hughd@google.com>
> > > CC: Roman Gushchin <guro@fb.com>
> > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > Cc: Hugh Dickins <hughd@google.com>
> > > Cc: Tejun Heo <tj@kernel.org>
> > > Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> > > Cc: Muchun Song <songmuchun@bytedance.com>
> > > Cc: riel@surriel.com
> > > Cc: linux-mm@kvack.org
> > > Cc: linux-fsdevel@vger.kernel.org
> > > Cc: cgroups@vger.kernel.org
> > --
> > Michal Hocko
> > SUSE Labs

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-12 17:59           ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-12 17:59 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > allocating process.
> > >
> > > This really begs for some justification.
> > >
> >
> > I'm thinking (and I can add to the commit message in v4) that we have
> > 2 reasonable options when the oom-killer gets invoked and finds
> > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > thinking returning ENOMEM allows the application to gracefully handle
> > the failure to remote charge and continue operation.
> >
> > For example, in the network service use case that I mentioned in the
> > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > and continue to service network requests for other clients running on
> > the machine, rather than get oom-killed when hitting the remote memcg
> > limit. But, this is not a hard requirement, the network service could
> > fork a process that does the remote charging to guard against the
> > remote charge bringing down the entire process.
>
> This all belongs to the changelog so that we can discuss all potential
> implication and do not rely on any implicit assumptions.

Understood. Maybe I'll wait to collect more feedback and upload v4
with a thorough explanation of the thought process.

> E.g. why does
> it even make sense to kill a task in the origin cgroup?
>

The behavior I saw returning ENOMEM for this edge case was that the
code was forever looping the pagefault, and I was (seemingly
incorrectly) under the impression that a suggestion to forever loop
the pagefault would be completely fundamentally unacceptable.

> > > > If we're in pagefault path and we're unable to return ENOMEM to the
> > > > allocating process, we instead kill the allocating process.
> > >
> > > Why do you handle those differently?
> > >
> >
> > I'm thinking (possibly incorrectly) it's beneficial to return ENOMEM
> > to the allocating task rather than killing it. I would love to return
> > ENOMEM in both these cases, but I can't return ENOMEM in the fault
> > path. The behavior I see is that the oom-killer gets invoked over and
> > over again looking to find something to kill and continually failing
> > to find something to kill and the pagefault never gets handled.
>
> Just one remark. Until just very recently VM_FAULT_OOM (a result of
> ENOMEM) would trigger the global OOM killer. This has changed by
> 60e2793d440a ("mm, oom: do not trigger out_of_memory from the #PF").
> But you are right that you might just end up looping in the page fault
> for ever. Is that bad though? The situation is fundamentaly
> unresolveable at this stage. On the other hand the task is still
> killable so the userspace can decide to terminate and break out of the
> loop.
>

I think what you're saying here makes a lot of sense and I think is a
workable approach, and maybe is slightly preferable to killing the
task IMO (and both are workable IMO). The pagefault can loop until
memory becomes available in the remote memcg, and the userspace can
decide to always terminate the process if desired or maybe handle the
issue more gracefully by freeing memory in the remote memcg somehow;
i.e. maybe we don't need the kernel to be heavy handed here and kill
the remote allocating task immediately.

> What is the best approach I am not quite sure. As I've said earlier this
> is very likely going to open a can of worms and so it should be
> evaluated very carefuly. For that, please make sure to describe your
> thinking in details.
>

OK, thanks for reviewing and the next iteration should include a
thorough explanation of my thinking.

> > I could, however, kill the allocating task whether it's in the
> > pagefault path or not; it's not a hard requirement that I return
> > ENOMEM. If this is what you'd like to see in v4, please let me know,
> > but I do see some value in allowing some callers to gracefully handle
> > the ENOMEM.
> >
> > > > Signed-off-by: Mina Almasry <almasrymina@google.com>
> > > >
> > > > Cc: Michal Hocko <mhocko@suse.com>
> > > > Cc: Theodore Ts'o <tytso@mit.edu>
> > > > Cc: Greg Thelen <gthelen@google.com>
> > > > Cc: Shakeel Butt <shakeelb@google.com>
> > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > Cc: Hugh Dickins <hughd@google.com>
> > > > CC: Roman Gushchin <guro@fb.com>
> > > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > > Cc: Hugh Dickins <hughd@google.com>
> > > > Cc: Tejun Heo <tj@kernel.org>
> > > > Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> > > > Cc: Muchun Song <songmuchun@bytedance.com>
> > > > Cc: riel@surriel.com
> > > > Cc: linux-mm@kvack.org
> > > > Cc: linux-fsdevel@vger.kernel.org
> > > > Cc: cgroups@vger.kernel.org
> > > --
> > > Michal Hocko
> > > SUSE Labs
>
> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-12 17:59           ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-12 17:59 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
> On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > >
> > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > allocating process.
> > >
> > > This really begs for some justification.
> > >
> >
> > I'm thinking (and I can add to the commit message in v4) that we have
> > 2 reasonable options when the oom-killer gets invoked and finds
> > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > thinking returning ENOMEM allows the application to gracefully handle
> > the failure to remote charge and continue operation.
> >
> > For example, in the network service use case that I mentioned in the
> > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > and continue to service network requests for other clients running on
> > the machine, rather than get oom-killed when hitting the remote memcg
> > limit. But, this is not a hard requirement, the network service could
> > fork a process that does the remote charging to guard against the
> > remote charge bringing down the entire process.
>
> This all belongs to the changelog so that we can discuss all potential
> implication and do not rely on any implicit assumptions.

Understood. Maybe I'll wait to collect more feedback and upload v4
with a thorough explanation of the thought process.

> E.g. why does
> it even make sense to kill a task in the origin cgroup?
>

The behavior I saw returning ENOMEM for this edge case was that the
code was forever looping the pagefault, and I was (seemingly
incorrectly) under the impression that a suggestion to forever loop
the pagefault would be completely fundamentally unacceptable.

> > > > If we're in pagefault path and we're unable to return ENOMEM to the
> > > > allocating process, we instead kill the allocating process.
> > >
> > > Why do you handle those differently?
> > >
> >
> > I'm thinking (possibly incorrectly) it's beneficial to return ENOMEM
> > to the allocating task rather than killing it. I would love to return
> > ENOMEM in both these cases, but I can't return ENOMEM in the fault
> > path. The behavior I see is that the oom-killer gets invoked over and
> > over again looking to find something to kill and continually failing
> > to find something to kill and the pagefault never gets handled.
>
> Just one remark. Until just very recently VM_FAULT_OOM (a result of
> ENOMEM) would trigger the global OOM killer. This has changed by
> 60e2793d440a ("mm, oom: do not trigger out_of_memory from the #PF").
> But you are right that you might just end up looping in the page fault
> for ever. Is that bad though? The situation is fundamentaly
> unresolveable at this stage. On the other hand the task is still
> killable so the userspace can decide to terminate and break out of the
> loop.
>

I think what you're saying here makes a lot of sense and I think is a
workable approach, and maybe is slightly preferable to killing the
task IMO (and both are workable IMO). The pagefault can loop until
memory becomes available in the remote memcg, and the userspace can
decide to always terminate the process if desired or maybe handle the
issue more gracefully by freeing memory in the remote memcg somehow;
i.e. maybe we don't need the kernel to be heavy handed here and kill
the remote allocating task immediately.

> What is the best approach I am not quite sure. As I've said earlier this
> is very likely going to open a can of worms and so it should be
> evaluated very carefuly. For that, please make sure to describe your
> thinking in details.
>

OK, thanks for reviewing and the next iteration should include a
thorough explanation of my thinking.

> > I could, however, kill the allocating task whether it's in the
> > pagefault path or not; it's not a hard requirement that I return
> > ENOMEM. If this is what you'd like to see in v4, please let me know,
> > but I do see some value in allowing some callers to gracefully handle
> > the ENOMEM.
> >
> > > > Signed-off-by: Mina Almasry <almasrymina-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > > >
> > > > Cc: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
> > > > Cc: Theodore Ts'o <tytso-3s7WtUTddSA@public.gmane.org>
> > > > Cc: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > > > Cc: Shakeel Butt <shakeelb-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > > > Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> > > > Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > > > CC: Roman Gushchin <guro-b10kYP2dOMg@public.gmane.org>
> > > > Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> > > > Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > > > Cc: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> > > > Cc: Vladimir Davydov <vdavydov.dev-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > > > Cc: Muchun Song <songmuchun-EC8Uxl6Npydl57MIdRCFDg@public.gmane.org>
> > > > Cc: riel-ebMLmSuQjDVBDgjK7y7TUQ@public.gmane.org
> > > > Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
> > > > Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > > > Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > > --
> > > Michal Hocko
> > > SUSE Labs
>
> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
  2021-11-12 17:59           ` Mina Almasry
  (?)
@ 2021-11-15 10:58           ` Michal Hocko
  2021-11-15 17:32               ` Shakeel Butt
  2021-11-16  0:58               ` Mina Almasry
  -1 siblings, 2 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-15 10:58 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> > > >
> > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > allocating process.
> > > >
> > > > This really begs for some justification.
> > > >
> > >
> > > I'm thinking (and I can add to the commit message in v4) that we have
> > > 2 reasonable options when the oom-killer gets invoked and finds
> > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > thinking returning ENOMEM allows the application to gracefully handle
> > > the failure to remote charge and continue operation.
> > >
> > > For example, in the network service use case that I mentioned in the
> > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > and continue to service network requests for other clients running on
> > > the machine, rather than get oom-killed when hitting the remote memcg
> > > limit. But, this is not a hard requirement, the network service could
> > > fork a process that does the remote charging to guard against the
> > > remote charge bringing down the entire process.
> >
> > This all belongs to the changelog so that we can discuss all potential
> > implication and do not rely on any implicit assumptions.
> 
> Understood. Maybe I'll wait to collect more feedback and upload v4
> with a thorough explanation of the thought process.
> 
> > E.g. why does
> > it even make sense to kill a task in the origin cgroup?
> >
> 
> The behavior I saw returning ENOMEM for this edge case was that the
> code was forever looping the pagefault, and I was (seemingly
> incorrectly) under the impression that a suggestion to forever loop
> the pagefault would be completely fundamentally unacceptable.

Well, I have to say I am not entirely sure what is the best way to
handle this situation. Another option would be to treat this similar to
an ENOSPC situation. This would result in SIGBUS IIRC.

The main problem with OOM killer is that it will not resolve the
underlying problem in most situations. Shmem files would likely stay
laying around and their charge along with them. Killing the allocating
task has problems on its own because this could be just a DoS vector by
other unrelated tasks sharing the shmem mount point without a graceful
fallback. Retrying the page fault is hard to detect. SIGBUS might be
something that helps with the latter. The question is how to communicate
this requirement down to the memcg code to know that the memory reclaim
should happen (Should it? How hard we should try?) but do not invoke the
oom killer. The more I think about this the nastier this is.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-15 17:32               ` Shakeel Butt
  0 siblings, 0 replies; 42+ messages in thread
From: Shakeel Butt @ 2021-11-15 17:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Mina Almasry, Theodore Ts'o, Greg Thelen, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko@suse.com> wrote:
>
[...]
> >
> > The behavior I saw returning ENOMEM for this edge case was that the
> > code was forever looping the pagefault, and I was (seemingly
> > incorrectly) under the impression that a suggestion to forever loop
> > the pagefault would be completely fundamentally unacceptable.
>
> Well, I have to say I am not entirely sure what is the best way to
> handle this situation. Another option would be to treat this similarly to
> the ENOSPC situation. This would result in SIGBUS IIRC.
>
> The main problem with the OOM killer is that it will not resolve the
> underlying problem in most situations. Shmem files would likely stay
> lying around and their charge along with them.

This and similar topics were discussed during LSFMM 2019
(https://lwn.net/Articles/787626/).

> Killing the allocating
> task has problems on its own because this could be just a DoS vector by
> other unrelated tasks sharing the shmem mount point without a graceful
> fallback. Retrying the page fault is hard to detect. SIGBUS might be
> something that helps with the latter. The question is how to communicate
> this requirement down to the memcg code to know that the memory reclaim
> should happen (Should it? How hard should we try?) but do not invoke the
> oom killer. The more I think about this, the nastier this is.
> --

IMHO we should punt the resolution to the userspace and keep the
kernel simple. This is an opt-in feature and the user is expected to
know and handle exceptional scenarios. The kernel just needs to tell
the userspace that this exceptional situation is happening somehow.

How about for remote ooms irrespective of page fault path or not, keep
the allocator looping but keep incrementing a new memcg event
MEMCG_OOM_NO_VICTIM? The userspace will get to know the situation
either through inotify or polling and can handle the situation by
either increasing the limit or by releasing the memory of the
monitored memcg.

thanks,
Shakeel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-15 17:32               ` Shakeel Butt
  0 siblings, 0 replies; 42+ messages in thread
From: Shakeel Butt @ 2021-11-15 17:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Mina Almasry, Theodore Ts'o, Greg Thelen, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
[...]
> >
> > The behavior I saw returning ENOMEM for this edge case was that the
> > code was forever looping the pagefault, and I was (seemingly
> > incorrectly) under the impression that a suggestion to forever loop
> > the pagefault would be completely fundamentally unacceptable.
>
> Well, I have to say I am not entirely sure what is the best way to
> handle this situation. Another option would be to treat this similarly to
> the ENOSPC situation. This would result in SIGBUS IIRC.
>
> The main problem with the OOM killer is that it will not resolve the
> underlying problem in most situations. Shmem files would likely stay
> lying around and their charge along with them.

This and similar topics were discussed during LSFMM 2019
(https://lwn.net/Articles/787626/).

> Killing the allocating
> task has problems on its own because this could be just a DoS vector by
> other unrelated tasks sharing the shmem mount point without a graceful
> fallback. Retrying the page fault is hard to detect. SIGBUS might be
> something that helps with the latter. The question is how to communicate
> this requirement down to the memcg code to know that the memory reclaim
> should happen (Should it? How hard should we try?) but do not invoke the
> oom killer. The more I think about this, the nastier this is.
> --

IMHO we should punt the resolution to the userspace and keep the
kernel simple. This is an opt-in feature and the user is expected to
know and handle exceptional scenarios. The kernel just needs to tell
the userspace that this exceptional situation is happening somehow.

How about for remote ooms irrespective of page fault path or not, keep
the allocator looping but keep incrementing a new memcg event
MEMCG_OOM_NO_VICTIM? The userspace will get to know the situation
either through inotify or polling and can handle the situation by
either increasing the limit or by releasing the memory of the
monitored memcg.

thanks,
Shakeel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16  0:58               ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-16  0:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> > > > >
> > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > allocating process.
> > > > >
> > > > > This really begs for some justification.
> > > > >
> > > >
> > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > the failure to remote charge and continue operation.
> > > >
> > > > For example, in the network service use case that I mentioned in the
> > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > and continue to service network requests for other clients running on
> > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > limit. But, this is not a hard requirement, the network service could
> > > > fork a process that does the remote charging to guard against the
> > > > remote charge bringing down the entire process.
> > >
> > > This all belongs to the changelog so that we can discuss all potential
> > > implication and do not rely on any implicit assumptions.
> >
> > Understood. Maybe I'll wait to collect more feedback and upload v4
> > with a thorough explanation of the thought process.
> >
> > > E.g. why does
> > > it even make sense to kill a task in the origin cgroup?
> > >
> >
> > The behavior I saw returning ENOMEM for this edge case was that the
> > code was forever looping the pagefault, and I was (seemingly
> > incorrectly) under the impression that a suggestion to forever loop
> > the pagefault would be completely fundamentally unacceptable.
>
> Well, I have to say I am not entirely sure what is the best way to
> handle this situation. Another option would be to treat this similarly to
> the ENOSPC situation. This would result in SIGBUS IIRC.
>
> The main problem with the OOM killer is that it will not resolve the
> underlying problem in most situations. Shmem files would likely stay
> lying around and their charge along with them. Killing the allocating
> task has problems on its own because this could be just a DoS vector by
> other unrelated tasks sharing the shmem mount point without a graceful
> fallback. Retrying the page fault is hard to detect. SIGBUS might be
> something that helps with the latter. The question is how to communicate
> this requirement down to the memcg code to know that the memory reclaim
> should happen (Should it? How hard should we try?) but do not invoke the
> oom killer. The more I think about this, the nastier this is.

So actually I thought the ENOSPC suggestion was interesting so I took
the liberty to prototype it. The changes required:

1. In out_of_memory() we return false if !oc->chosen &&
is_remote_oom(). This gets bubbled up to try_charge_memcg() as
mem_cgroup_oom() returning OOM_FAILED.
2. In try_charge_memcg(), if we get an OOM_FAILED we again check
is_remote_oom(), if it is a remote oom, return ENOSPC.
3. The calling code would return ENOSPC to the user in the no-fault
path, and SIGBUS the user in the fault path with no changes.

To be honest I think this is very workable, as is Shakeel's suggestion
of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
document the behavior and if the userspace doesn't want to get killed
they can catch the sigbus and handle it gracefully. If not, the
userspace just gets killed if we hit this edge case.

I may be missing something but AFAICT we don't have to "communicate
this requirement down to the memcg code" with this implementation. The
memcg code is aware the memcg is oom and will do the reclaim or
whatever before invoking the oom-killer. It's only when the oom-killer
can't find something to kill that we return ENOSPC or SIGBUS.

As always thank you very much for reviewing and providing feedback.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16  0:58               ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-16  0:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
> On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > >
> > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > > >
> > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > allocating process.
> > > > >
> > > > > This really begs for some justification.
> > > > >
> > > >
> > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > the failure to remote charge and continue operation.
> > > >
> > > > For example, in the network service use case that I mentioned in the
> > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > and continue to service network requests for other clients running on
> > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > limit. But, this is not a hard requirement, the network service could
> > > > fork a process that does the remote charging to guard against the
> > > > remote charge bringing down the entire process.
> > >
> > > This all belongs to the changelog so that we can discuss all potential
> > > implication and do not rely on any implicit assumptions.
> >
> > Understood. Maybe I'll wait to collect more feedback and upload v4
> > with a thorough explanation of the thought process.
> >
> > > E.g. why does
> > > it even make sense to kill a task in the origin cgroup?
> > >
> >
> > The behavior I saw returning ENOMEM for this edge case was that the
> > code was forever looping the pagefault, and I was (seemingly
> > incorrectly) under the impression that a suggestion to forever loop
> > the pagefault would be completely fundamentally unacceptable.
>
> Well, I have to say I am not entirely sure what is the best way to
> handle this situation. Another option would be to treat this similarly to
> the ENOSPC situation. This would result in SIGBUS IIRC.
>
> The main problem with the OOM killer is that it will not resolve the
> underlying problem in most situations. Shmem files would likely stay
> lying around and their charge along with them. Killing the allocating
> task has problems on its own because this could be just a DoS vector by
> other unrelated tasks sharing the shmem mount point without a graceful
> fallback. Retrying the page fault is hard to detect. SIGBUS might be
> something that helps with the latter. The question is how to communicate
> this requirement down to the memcg code to know that the memory reclaim
> should happen (Should it? How hard should we try?) but do not invoke the
> oom killer. The more I think about this, the nastier this is.

So actually I thought the ENOSPC suggestion was interesting so I took
the liberty to prototype it. The changes required:

1. In out_of_memory() we return false if !oc->chosen &&
is_remote_oom(). This gets bubbled up to try_charge_memcg() as
mem_cgroup_oom() returning OOM_FAILED.
2. In try_charge_memcg(), if we get an OOM_FAILED we again check
is_remote_oom(), if it is a remote oom, return ENOSPC.
3. The calling code would return ENOSPC to the user in the no-fault
path, and SIGBUS the user in the fault path with no changes.

To be honest I think this is very workable, as is Shakeel's suggestion
of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
document the behavior and if the userspace doesn't want to get killed
they can catch the sigbus and handle it gracefully. If not, the
userspace just gets killed if we hit this edge case.

I may be missing something but AFAICT we don't have to "communicate
this requirement down to the memcg code" with this implementation. The
memcg code is aware the memcg is oom and will do the reclaim or
whatever before invoking the oom-killer. It's only when the oom-killer
can't find something to kill that we return ENOSPC or SIGBUS.

As always thank you very much for reviewing and providing feedback.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16  9:28                 ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-16  9:28 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
> > > >
> > > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> > > > > >
> > > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > > allocating process.
> > > > > >
> > > > > > This really begs for some justification.
> > > > > >
> > > > >
> > > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > > the failure to remote charge and continue operation.
> > > > >
> > > > > For example, in the network service use case that I mentioned in the
> > > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > > and continue to service network requests for other clients running on
> > > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > > limit. But, this is not a hard requirement, the network service could
> > > > > fork a process that does the remote charging to guard against the
> > > > > remote charge bringing down the entire process.
> > > >
> > > > This all belongs to the changelog so that we can discuss all potential
> > > > implication and do not rely on any implicit assumptions.
> > >
> > > Understood. Maybe I'll wait to collect more feedback and upload v4
> > > with a thorough explanation of the thought process.
> > >
> > > > E.g. why does
> > > > it even make sense to kill a task in the origin cgroup?
> > > >
> > >
> > > The behavior I saw returning ENOMEM for this edge case was that the
> > > code was forever looping the pagefault, and I was (seemingly
> > > incorrectly) under the impression that a suggestion to forever loop
> > > the pagefault would be completely fundamentally unacceptable.
> >
> > Well, I have to say I am not entirely sure what is the best way to
> > handle this situation. Another option would be to treat this similarly to
> > the ENOSPC situation. This would result in SIGBUS IIRC.
> >
> > The main problem with the OOM killer is that it will not resolve the
> > underlying problem in most situations. Shmem files would likely stay
> > lying around and their charge along with them. Killing the allocating
> > task has problems on its own because this could be just a DoS vector by
> > other unrelated tasks sharing the shmem mount point without a graceful
> > fallback. Retrying the page fault is hard to detect. SIGBUS might be
> > something that helps with the latter. The question is how to communicate
> > this requirement down to the memcg code to know that the memory reclaim
> > should happen (Should it? How hard should we try?) but do not invoke the
> > oom killer. The more I think about this, the nastier this is.
> 
> So actually I thought the ENOSPC suggestion was interesting so I took
> the liberty to prototype it. The changes required:
> 
> 1. In out_of_memory() we return false if !oc->chosen &&
> is_remote_oom(). This gets bubbled up to try_charge_memcg() as
> mem_cgroup_oom() returning OOM_FAILED.
> 2. In try_charge_memcg(), if we get an OOM_FAILED we again check
> is_remote_oom(), if it is a remote oom, return ENOSPC.
> 3. The calling code would return ENOSPC to the user in the no-fault
> path, and SIGBUS the user in the fault path with no changes.

I think this should be implemented at the caller side rather than
somehow hacked into the memcg core. It is up to the caller to know what
to do. The caller can use gfp flags to control the reclaim behavior.

> To be honest I think this is very workable, as is Shakeel's suggestion
> of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> document the behavior and if the userspace doesn't want to get killed
> they can catch the sigbus and handle it gracefully. If not, the
> userspace just gets killed if we hit this edge case.

I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
hackish to me. I will get back to Shakeel's email as time permits. The
primary problem I have with this, though, is that the kernel oom killer
cannot really do anything sensible if the limit is reached and there
is nothing reclaimable left in this case. The tmpfs backed memory will
simply stay around and there are no means to recover without userspace
intervention.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16  9:28                 ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-16  9:28 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> >
> > On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > >
> > > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > > > >
> > > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > > allocating process.
> > > > > >
> > > > > > This really begs for some justification.
> > > > > >
> > > > >
> > > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > > the failure to remote charge and continue operation.
> > > > >
> > > > > For example, in the network service use case that I mentioned in the
> > > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > > and continue to service network requests for other clients running on
> > > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > > limit. But, this is not a hard requirement, the network service could
> > > > > fork a process that does the remote charging to guard against the
> > > > > remote charge bringing down the entire process.
> > > >
> > > > This all belongs to the changelog so that we can discuss all potential
> > > > implication and do not rely on any implicit assumptions.
> > >
> > > Understood. Maybe I'll wait to collect more feedback and upload v4
> > > with a thorough explanation of the thought process.
> > >
> > > > E.g. why does
> > > > it even make sense to kill a task in the origin cgroup?
> > > >
> > >
> > > The behavior I saw returning ENOMEM for this edge case was that the
> > > code was forever looping the pagefault, and I was (seemingly
> > > incorrectly) under the impression that a suggestion to forever loop
> > > the pagefault would be completely fundamentally unacceptable.
> >
> > Well, I have to say I am not entirely sure what is the best way to
> > handle this situation. Another option would be to treat this similarly to
> > the ENOSPC situation. This would result in SIGBUS IIRC.
> >
> > The main problem with the OOM killer is that it will not resolve the
> > underlying problem in most situations. Shmem files would likely stay
> > lying around and their charge along with them. Killing the allocating
> > task has problems on its own because this could be just a DoS vector by
> > other unrelated tasks sharing the shmem mount point without a graceful
> > fallback. Retrying the page fault is hard to detect. SIGBUS might be
> > something that helps with the latter. The question is how to communicate
> > this requirement down to the memcg code to know that the memory reclaim
> > should happen (Should it? How hard should we try?) but do not invoke the
> > oom killer. The more I think about this, the nastier this is.
> 
> So actually I thought the ENOSPC suggestion was interesting so I took
> the liberty to prototype it. The changes required:
> 
> 1. In out_of_memory() we return false if !oc->chosen &&
> is_remote_oom(). This gets bubbled up to try_charge_memcg() as
> mem_cgroup_oom() returning OOM_FAILED.
> 2. In try_charge_memcg(), if we get an OOM_FAILED we again check
> is_remote_oom(), if it is a remote oom, return ENOSPC.
> 3. The calling code would return ENOSPC to the user in the no-fault
> path, and SIGBUS the user in the fault path with no changes.

I think this should be implemented at the caller side rather than
somehow hacked into the memcg core. It is up to the caller to know what
to do. The caller can use gfp flags to control the reclaim behavior.

> To be honest I think this is very workable, as is Shakeel's suggestion
> of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> document the behavior and if the userspace doesn't want to get killed
> they can catch the sigbus and handle it gracefully. If not, the
> userspace just gets killed if we hit this edge case.

I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
hackish to me. I will get back to Shakeel's email as time permits. The
primary problem I have with this, though, is that the kernel oom killer
cannot really do anything sensible if the limit is reached and there
is nothing reclaimable left in this case. The tmpfs backed memory will
simply stay around and there are no means to recover without userspace
intervention.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16  9:39                   ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-16  9:39 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Tue 16-11-21 10:28:25, Michal Hocko wrote:
> On Mon 15-11-21 16:58:19, Mina Almasry wrote:
[...]
> > To be honest I think this is very workable, as is Shakeel's suggestion
> > of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> > document the behavior and if the userspace doesn't want to get killed
> > they can catch the sigbus and handle it gracefully. If not, the
> > userspace just gets killed if we hit this edge case.
> 
> I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
> hackish to me. I will get back to Shakeel's email as time permits. The
> primary problem I have with this, though, is that the kernel oom killer
> cannot really do anything sensible if the limit is reached and there
> is nothing reclaimable left in this case. The tmpfs backed memory will
> simply stay around and there are no means to recover without userspace
> intervention.

And just a small clarification. Tmpfs is fundamentally problematic from
the OOM handling POV. The nuance here is that the OOM happens in a
different memcg and thus a different resource domain. If you kill a task
in the target memcg then you effectively DoS that workload. If you kill
the allocating task then it is DoSed by anybody allowed to write to that
shmem. All that without a graceful fallback.

I still have a very hard time seeing how that can work reasonably except
for a very special case with a lot of other measures to ensure the
target memcg never hits the hard limit so the OOM simply is not a
problem.

Memory controller has always been used to enforce and balance memory
usage among resource domains and this goes against that principle.
I would be really curious what Johannes thinks about this.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16  9:39                   ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-16  9:39 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Tue 16-11-21 10:28:25, Michal Hocko wrote:
> On Mon 15-11-21 16:58:19, Mina Almasry wrote:
[...]
> > To be honest I think this is very workable, as is Shakeel's suggestion
> > of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> > document the behavior and if the userspace doesn't want to get killed
> > they can catch the sigbus and handle it gracefully. If not, the
> > userspace just gets killed if we hit this edge case.
> 
> I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
> hackish to me. I will get back to Shakeel's email as time permits. The
> primary problem I have with this, though, is that the kernel oom killer
> cannot really do anything sensible if the limit is reached and there
> is nothing reclaimable left in this case. The tmpfs backed memory will
> simply stay around and there are no means to recover without userspace
> intervention.

And just a small clarification. Tmpfs is fundamentally problematic from
the OOM handling POV. The nuance here is that the OOM happens in a
different memcg and thus a different resource domain. If you kill a task
in the target memcg then you effectively DoS that workload. If you kill
the allocating task then it is DoSed by anybody allowed to write to that
shmem. All that without a graceful fallback.

I still have a very hard time seeing how that can work reasonably except
for a very special case with a lot of other measures to ensure the
target memcg never hits the hard limit so the OOM simply is not a
problem.

Memory controller has always been used to enforce and balance memory
usage among resource domains and this goes against that principle.
I would be really curious what Johannes thinks about this.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16 10:17                   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-16 10:17 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Tue, Nov 16, 2021 at 1:28 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> > On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > > > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > >
> > > > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> > > > > > >
> > > > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > > > allocating process.
> > > > > > >
> > > > > > > This really begs for some justification.
> > > > > > >
> > > > > >
> > > > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > > > the failure to remote charge and continue operation.
> > > > > >
> > > > > > For example, in the network service use case that I mentioned in the
> > > > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > > > and continue to service network requests for other clients running on
> > > > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > > > limit. But, this is not a hard requirement, the network service could
> > > > > > fork a process that does the remote charging to guard against the
> > > > > > remote charge bringing down the entire process.
> > > > >
> > > > > This all belongs to the changelog so that we can discuss all potential
> > > > > implication and do not rely on any implicit assumptions.
> > > >
> > > > Understood. Maybe I'll wait to collect more feedback and upload v4
> > > > with a thorough explanation of the thought process.
> > > >
> > > > > E.g. why does
> > > > > it even make sense to kill a task in the origin cgroup?
> > > > >
> > > >
> > > > The behavior I saw returning ENOMEM for this edge case was that the
> > > > code was forever looping the pagefault, and I was (seemingly
> > > > incorrectly) under the impression that a suggestion to forever loop
> > > > the pagefault would be completely fundamentally unacceptable.
> > >
> > > Well, I have to say I am not entirely sure what is the best way to
> > > handle this situation. Another option would be to treat this similarly to
> > > the ENOSPC situation. This would result in SIGBUS IIRC.
> > >
> > > The main problem with OOM killer is that it will not resolve the
> > > underlying problem in most situations. Shmem files would likely stay
> > > lying around and their charge along with them. Killing the allocating
> > > task has problems on its own because this could be just a DoS vector by
> > > other unrelated tasks sharing the shmem mount point without a graceful
> > > fallback. Retrying the page fault is hard to detect. SIGBUS might be
> > > something that helps with the latter. The question is how to communicate
> > > this requirement down to the memcg code to know that the memory reclaim
> > > should happen (Should it? How hard should we try?) but do not invoke the
> > > oom killer. The more I think about this the nastier this is.
> >
> > So actually I thought the ENOSPC suggestion was interesting so I took
> > the liberty to prototype it. The changes required:
> >
> > 1. In out_of_memory() we return false if !oc->chosen &&
> > is_remote_oom(). This gets bubbled up to try_charge_memcg() as
> > mem_cgroup_oom() returning OOM_FAILED.
> > 2. In try_charge_memcg(), if we get an OOM_FAILED we again check
> > is_remote_oom(), if it is a remote oom, return ENOSPC.
> > 3. The calling code would return ENOSPC to the user in the no-fault
> > path, and SIGBUS the user in the fault path with no changes.
>
> I think this should be implemented at the caller side rather than
> somehow hacked into the memcg core. It is the caller to know what to do.
> The caller can use gfp flags to control the reclaim behavior.
>

Hmm I'm a bit struggling to envision this.  So would it be acceptable
at the call sites where we're doing a remote charge, such as
shmem_add_to_page_cache(), if we get ENOMEM from the
mem_cgroup_charge(), and we know we're doing a remote charge (because
current's memcg != the super block memcg), then we return ENOSPC from
shmem_add_to_page_cache()? I believe that will return ENOSPC to the
userspace in the non-pagefault path and SIGBUS in the pagefault path.
Or you had something else in mind?

> > To be honest I think this is very workable, as is Shakeel's suggestion
> > of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> > document the behavior and if the userspace doesn't want to get killed
> > they can catch the sigbus and handle it gracefully. If not, the
> > userspace just gets killed if we hit this edge case.
>
> I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
> hackish to me. I will get back to Shakeel's email as time permits. The
> primary problem I have with this, though, is that the kernel oom killer
> cannot really do anything sensible if the limit is reached and there
> is nothing reclaimable left in this case. The tmpfs backed memory will
> simply stay around and there are no means to recover without userspace
> intervention.
> --
> Michal Hocko
> SUSE Labs

On Tue, Nov 16, 2021 at 1:39 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-11-21 10:28:25, Michal Hocko wrote:
> > On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> [...]
> > > To be honest I think this is very workable, as is Shakeel's suggestion
> > > of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> > > document the behavior and if the userspace doesn't want to get killed
> > > they can catch the sigbus and handle it gracefully. If not, the
> > > userspace just gets killed if we hit this edge case.
> >
> > I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
> > hackish to me. I will get back to Shakeel's email as time permits. The
> > primary problem I have with this, though, is that the kernel oom killer
> > cannot really do anything sensible if the limit is reached and there
> > is nothing reclaimable left in this case. The tmpfs backed memory will
> > simply stay around and there are no means to recover without userspace
> > intervention.
>
> And just a small clarification. Tmpfs is fundamentally problematic from
> the OOM handling POV. The nuance here is that the OOM happens in a
> different memcg and thus a different resource domain. If you kill a task
> in the target memcg then you effectively DoS that workload. If you kill
> the allocating task then it is DoSed by anybody allowed to write to that
> shmem. All that without a graceful fallback.

I don't know if this addresses your concern, but I'm limiting the
memcg= use to processes that can enter that memcg. Therefore they
would be able to allocate memory in that memcg anyway by entering it.
So if they wanted to intentionally DoS that memcg they can already do
it without this feature.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16 10:17                   ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-16 10:17 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Tue, Nov 16, 2021 at 1:28 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
> On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> > On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > >
> > > On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > > > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > > >
> > > > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > > > > >
> > > > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > > > allocating process.
> > > > > > >
> > > > > > > This really begs for some justification.
> > > > > > >
> > > > > >
> > > > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > > > the failure to remote charge and continue operation.
> > > > > >
> > > > > > For example, in the network service use case that I mentioned in the
> > > > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > > > and continue to service network requests for other clients running on
> > > > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > > > limit. But, this is not a hard requirement, the network service could
> > > > > > fork a process that does the remote charging to guard against the
> > > > > > remote charge bringing down the entire process.
> > > > >
> > > > > This all belongs to the changelog so that we can discuss all potential
> > > > > implication and do not rely on any implicit assumptions.
> > > >
> > > > Understood. Maybe I'll wait to collect more feedback and upload v4
> > > > with a thorough explanation of the thought process.
> > > >
> > > > > E.g. why does
> > > > > it even make sense to kill a task in the origin cgroup?
> > > > >
> > > >
> > > > The behavior I saw returning ENOMEM for this edge case was that the
> > > > code was forever looping the pagefault, and I was (seemingly
> > > > incorrectly) under the impression that a suggestion to forever loop
> > > > the pagefault would be completely fundamentally unacceptable.
> > >
> > > Well, I have to say I am not entirely sure what is the best way to
> > > handle this situation. Another option would be to treat this similarly to
> > > the ENOSPC situation. This would result in SIGBUS IIRC.
> > >
> > > The main problem with OOM killer is that it will not resolve the
> > > underlying problem in most situations. Shmem files would likely stay
> > > lying around and their charge along with them. Killing the allocating
> > > task has problems on its own because this could be just a DoS vector by
> > > other unrelated tasks sharing the shmem mount point without a graceful
> > > fallback. Retrying the page fault is hard to detect. SIGBUS might be
> > > something that helps with the latter. The question is how to communicate
> > > this requirement down to the memcg code to know that the memory reclaim
> > > should happen (Should it? How hard should we try?) but do not invoke the
> > > oom killer. The more I think about this the nastier this is.
> >
> > So actually I thought the ENOSPC suggestion was interesting so I took
> > the liberty to prototype it. The changes required:
> >
> > 1. In out_of_memory() we return false if !oc->chosen &&
> > is_remote_oom(). This gets bubbled up to try_charge_memcg() as
> > mem_cgroup_oom() returning OOM_FAILED.
> > 2. In try_charge_memcg(), if we get an OOM_FAILED we again check
> > is_remote_oom(), if it is a remote oom, return ENOSPC.
> > 3. The calling code would return ENOSPC to the user in the no-fault
> > path, and SIGBUS the user in the fault path with no changes.
>
> I think this should be implemented at the caller side rather than
> somehow hacked into the memcg core. It is the caller to know what to do.
> The caller can use gfp flags to control the reclaim behavior.
>

Hmm I'm a bit struggling to envision this.  So would it be acceptable
at the call sites where we're doing a remote charge, such as
shmem_add_to_page_cache(), if we get ENOMEM from the
mem_cgroup_charge(), and we know we're doing a remote charge (because
current's memcg != the super block memcg), then we return ENOSPC from
shmem_add_to_page_cache()? I believe that will return ENOSPC to the
userspace in the non-pagefault path and SIGBUS in the pagefault path.
Or you had something else in mind?

> > To be honest I think this is very workable, as is Shakeel's suggestion
> > of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> > document the behavior and if the userspace doesn't want to get killed
> > they can catch the sigbus and handle it gracefully. If not, the
> > userspace just gets killed if we hit this edge case.
>
> I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
> hackish to me. I will get back to Shakeel's email as time permits. The
> primary problem I have with this, though, is that the kernel oom killer
> cannot really do anything sensible if the limit is reached and there
> is nothing reclaimable left in this case. The tmpfs backed memory will
> simply stay around and there are no means to recover without userspace
> intervention.
> --
> Michal Hocko
> SUSE Labs

On Tue, Nov 16, 2021 at 1:39 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
> On Tue 16-11-21 10:28:25, Michal Hocko wrote:
> > On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> [...]
> > > To be honest I think this is very workable, as is Shakeel's suggestion
> > > of MEMCG_OOM_NO_VICTIM. Since this is an opt-in feature, we can
> > > document the behavior and if the userspace doesn't want to get killed
> > > they can catch the sigbus and handle it gracefully. If not, the
> > > userspace just gets killed if we hit this edge case.
> >
> > I am not sure about the MEMCG_OOM_NO_VICTIM approach. It sounds really
> > hackish to me. I will get back to Shakeel's email as time permits. The
> > primary problem I have with this, though, is that the kernel oom killer
> > cannot really do anything sensible if the limit is reached and there
> > is nothing reclaimable left in this case. The tmpfs backed memory will
> > simply stay around and there are no means to recover without userspace
> > intervention.
>
> And just a small clarification. Tmpfs is fundamentally problematic from
> the OOM handling POV. The nuance here is that the OOM happens in a
> different memcg and thus a different resource domain. If you kill a task
> in the target memcg then you effectively DoS that workload. If you kill
> the allocating task then it is DoSed by anybody allowed to write to that
> shmem. All that without a graceful fallback.

I don't know if this addresses your concern, but I'm limiting the
memcg= use to processes that can enter that memcg. Therefore they
would be able to allocate memory in that memcg anyway by entering it.
So if they wanted to intentionally DoS that memcg they can already do
it without this feature.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16 11:29                     ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-16 11:29 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Tue 16-11-21 02:17:09, Mina Almasry wrote:
> On Tue, Nov 16, 2021 at 1:28 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> > > On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko@suse.com> wrote:
> > > >
> > > > On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > > > > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > > >
> > > > > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> > > > > > > >
> > > > > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > > > > allocating process.
> > > > > > > >
> > > > > > > > This really begs for some justification.
> > > > > > > >
> > > > > > >
> > > > > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > > > > the failure to remote charge and continue operation.
> > > > > > >
> > > > > > > For example, in the network service use case that I mentioned in the
> > > > > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > > > > and continue to service network requests for other clients running on
> > > > > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > > > > limit. But, this is not a hard requirement, the network service could
> > > > > > > fork a process that does the remote charging to guard against the
> > > > > > > remote charge bringing down the entire process.
> > > > > >
> > > > > > This all belongs to the changelog so that we can discuss all potential
> > > > > > implication and do not rely on any implicit assumptions.
> > > > >
> > > > > Understood. Maybe I'll wait to collect more feedback and upload v4
> > > > > with a thorough explanation of the thought process.
> > > > >
> > > > > > E.g. why does
> > > > > > it even make sense to kill a task in the origin cgroup?
> > > > > >
> > > > >
> > > > > The behavior I saw returning ENOMEM for this edge case was that the
> > > > > code was forever looping the pagefault, and I was (seemingly
> > > > > incorrectly) under the impression that a suggestion to forever loop
> > > > > the pagefault would be completely fundamentally unacceptable.
> > > >
> > > > Well, I have to say I am not entirely sure what is the best way to
> > > > handle this situation. Another option would be to treat this similarly to
> > > > the ENOSPC situation. This would result in SIGBUS IIRC.
> > > >
> > > > The main problem with OOM killer is that it will not resolve the
> > > > underlying problem in most situations. Shmem files would likely stay
> > > > lying around and their charge along with them. Killing the allocating
> > > > task has problems on its own because this could be just a DoS vector by
> > > > other unrelated tasks sharing the shmem mount point without a graceful
> > > > fallback. Retrying the page fault is hard to detect. SIGBUS might be
> > > > something that helps with the latter. The question is how to communicate
> > > > this requirement down to the memcg code to know that the memory reclaim
> > > > should happen (Should it? How hard should we try?) but do not invoke the
> > > > oom killer. The more I think about this the nastier this is.
> > >
> > > So actually I thought the ENOSPC suggestion was interesting so I took
> > > the liberty to prototype it. The changes required:
> > >
> > > 1. In out_of_memory() we return false if !oc->chosen &&
> > > is_remote_oom(). This gets bubbled up to try_charge_memcg() as
> > > mem_cgroup_oom() returning OOM_FAILED.
> > > 2. In try_charge_memcg(), if we get an OOM_FAILED we again check
> > > is_remote_oom(), if it is a remote oom, return ENOSPC.
> > > 3. The calling code would return ENOSPC to the user in the no-fault
> > > path, and SIGBUS the user in the fault path with no changes.
> >
> > I think this should be implemented at the caller side rather than
> > somehow hacked into the memcg core. It is the caller to know what to do.
> > The caller can use gfp flags to control the reclaim behavior.
> >
> 
> Hmm I'm a bit struggling to envision this.  So would it be acceptable
> at the call sites where we're doing a remote charge, such as
> shmem_add_to_page_cache(), if we get ENOMEM from the
> mem_cgroup_charge(), and we know we're doing a remote charge (because
> current's memcg != the super block memcg), then we return ENOSPC from
> shmem_add_to_page_cache()? I believe that will return ENOSPC to the
> userspace in the non-pagefault path and SIGBUS in the pagefault path.
> Or you had something else in mind?

Yes, exactly. I meant that all this special casing would be done at the
shmem layer as it knows how to communicate this usecase.

[...]

> > And just a small clarification. Tmpfs is fundamentally problematic from
> > the OOM handling POV. The nuance here is that the OOM happens in a
> > different memcg and thus a different resource domain. If you kill a task
> > in the target memcg then you effectively DoS that workload. If you kill
> > the allocating task then it is DoSed by anybody allowed to write to that
> > shmem. All that without a graceful fallback.
> 
> I don't know if this addresses your concern, but I'm limiting the
> memcg= use to processes that can enter that memcg. Therefore they
> would be able to allocate memory in that memcg anyway by entering it.
> So if they wanted to intentionally DoS that memcg they can already do
> it without this feature.

Can you elaborate some more? How do you enforce that the mount point
cannot be accessed by anybody outside of that constraint?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-16 11:29                     ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-16 11:29 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Tue 16-11-21 02:17:09, Mina Almasry wrote:
> On Tue, Nov 16, 2021 at 1:28 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> >
> > On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> > > On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > >
> > > > On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > > > > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > > > >
> > > > > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > > > > > > >
> > > > > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > > > > allocating process.
> > > > > > > >
> > > > > > > > This really begs for some justification.
> > > > > > > >
> > > > > > >
> > > > > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > > > > the failure to remote charge and continue operation.
> > > > > > >
> > > > > > > For example, in the network service use case that I mentioned in the
> > > > > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > > > > and continue to service network requests for other clients running on
> > > > > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > > > > limit. But, this is not a hard requirement, the network service could
> > > > > > > fork a process that does the remote charging to guard against the
> > > > > > > remote charge bringing down the entire process.
> > > > > >
> > > > > > This all belongs to the changelog so that we can discuss all potential
> > > > > > implication and do not rely on any implicit assumptions.
> > > > >
> > > > > Understood. Maybe I'll wait to collect more feedback and upload v4
> > > > > with a thorough explanation of the thought process.
> > > > >
> > > > > > E.g. why does
> > > > > > it even make sense to kill a task in the origin cgroup?
> > > > > >
> > > > >
> > > > > The behavior I saw returning ENOMEM for this edge case was that the
> > > > > code was forever looping the pagefault, and I was (seemingly
> > > > > incorrectly) under the impression that a suggestion to forever loop
> > > > > the pagefault would be completely fundamentally unacceptable.
> > > >
> > > > Well, I have to say I am not entirely sure what is the best way to
> > > > handle this situation. Another option would be to treat this similarly to
> > > > the ENOSPC situation. This would result in SIGBUS IIRC.
> > > >
> > > > The main problem with OOM killer is that it will not resolve the
> > > > underlying problem in most situations. Shmem files would likely stay
> > > > lying around and their charge along with them. Killing the allocating
> > > > task has problems on its own because this could be just a DoS vector by
> > > > other unrelated tasks sharing the shmem mount point without a graceful
> > > > fallback. Retrying the page fault is hard to detect. SIGBUS might be
> > > > something that helps with the latter. The question is how to communicate
> > > > this requirement down to the memcg code to know that the memory reclaim
> > > > should happen (Should it? How hard should we try?) but do not invoke the
> > > > oom killer. The more I think about this the nastier this is.
> > >
> > > So actually I thought the ENOSPC suggestion was interesting so I took
> > > the liberty to prototype it. The changes required:
> > >
> > > 1. In out_of_memory() we return false if !oc->chosen &&
> > > is_remote_oom(). This gets bubbled up to try_charge_memcg() as
> > > mem_cgroup_oom() returning OOM_FAILED.
> > > 2. In try_charge_memcg(), if we get an OOM_FAILED we again check
> > > is_remote_oom(), if it is a remote oom, return ENOSPC.
> > > 3. The calling code would return ENOSPC to the user in the no-fault
> > > path, and SIGBUS the user in the fault path with no changes.
> >
> > I think this should be implemented at the caller side rather than
> > somehow hacked into the memcg core. It is the caller to know what to do.
> > The caller can use gfp flags to control the reclaim behavior.
> >
> 
> Hmm I'm a bit struggling to envision this.  So would it be acceptable
> at the call sites where we're doing a remote charge, such as
> shmem_add_to_page_cache(), if we get ENOMEM from the
> mem_cgroup_charge(), and we know we're doing a remote charge (because
> current's memcg != the super block memcg), then we return ENOSPC from
> shmem_add_to_page_cache()? I believe that will return ENOSPC to the
> userspace in the non-pagefault path and SIGBUS in the pagefault path.
> Or you had something else in mind?

Yes, exactly. I meant that all this special casing would be done at the
shmem layer as it knows how to communicate this usecase.

[...]

> > And just a small clarification. Tmpfs is fundamentally problematic from
> > the OOM handling POV. The nuance here is that the OOM happens in a
> > different memcg and thus a different resource domain. If you kill a task
> > in the target memcg then you effectively DoS that workload. If you kill
> > the allocating task then it is DoSed by anybody allowed to write to that
> > shmem. All that without a graceful fallback.
> 
> I don't know if this addresses your concern, but I'm limiting the
> memcg= use to processes that can enter that memcg. Therefore they
> would be able to allocate memory in that memcg anyway by entering it.
> So if they wanted to intentionally DoS that memcg they can already do
> it without this feature.

Can you elaborate some more? How do you enforce that the mount point
cannot be accessed by anybody outside of that constraint?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
  2021-11-16 11:29                     ` Michal Hocko
  (?)
@ 2021-11-16 21:27                     ` Mina Almasry
  2021-11-16 21:55                       ` Shakeel Butt
  2021-11-18  8:47                         ` Michal Hocko
  -1 siblings, 2 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-16 21:27 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-11-21 02:17:09, Mina Almasry wrote:
> > On Tue, Nov 16, 2021 at 1:28 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Mon 15-11-21 16:58:19, Mina Almasry wrote:
> > > > On Mon, Nov 15, 2021 at 2:58 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > >
> > > > > On Fri 12-11-21 09:59:22, Mina Almasry wrote:
> > > > > > On Fri, Nov 12, 2021 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > > > >
> > > > > > > On Fri 12-11-21 00:12:52, Mina Almasry wrote:
> > > > > > > > On Thu, Nov 11, 2021 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> > > > > > > > >
> > > > > > > > > On Thu 11-11-21 15:42:01, Mina Almasry wrote:
> > > > > > > > > > On remote ooms (OOMs due to remote charging), the oom-killer will attempt
> > > > > > > > > > to find a task to kill in the memcg under oom, if the oom-killer
> > > > > > > > > > is unable to find one, the oom-killer should simply return ENOMEM to the
> > > > > > > > > > allocating process.
> > > > > > > > >
> > > > > > > > > This really begs for some justification.
> > > > > > > > >
> > > > > > > >
> > > > > > > > I'm thinking (and I can add to the commit message in v4) that we have
> > > > > > > > 2 reasonable options when the oom-killer gets invoked and finds
> > > > > > > > nothing to kill: (1) return ENOMEM, (2) kill the allocating task. I'm
> > > > > > > > thinking returning ENOMEM allows the application to gracefully handle
> > > > > > > > the failure to remote charge and continue operation.
> > > > > > > >
> > > > > > > > For example, in the network service use case that I mentioned in the
> > > > > > > > RFC proposal, it's beneficial for the network service to get an ENOMEM
> > > > > > > > and continue to service network requests for other clients running on
> > > > > > > > the machine, rather than get oom-killed when hitting the remote memcg
> > > > > > > > limit. But, this is not a hard requirement, the network service could
> > > > > > > > fork a process that does the remote charging to guard against the
> > > > > > > > remote charge bringing down the entire process.
> > > > > > >
> > > > > > > This all belongs to the changelog so that we can discuss all potential
> > > > > > > implication and do not rely on any implicit assumptions.
> > > > > >
> > > > > > Understood. Maybe I'll wait to collect more feedback and upload v4
> > > > > > with a thorough explanation of the thought process.
> > > > > >
> > > > > > > E.g. why does
> > > > > > > it even make sense to kill a task in the origin cgroup?
> > > > > > >
> > > > > >
> > > > > > The behavior I saw returning ENOMEM for this edge case was that the
> > > > > > code was forever looping the pagefault, and I was (seemingly
> > > > > > incorrectly) under the impression that a suggestion to forever loop
> > > > > > the pagefault would be completely fundamentally unacceptable.
> > > > >
> > > > > Well, I have to say I am not entirely sure what is the best way to
> > > > > handle this situation. Another option would be to treat this similar to
> > > > > ENOSPC situation. This would result in SIGBUS IIRC.
> > > > >
> > > > > The main problem with OOM killer is that it will not resolve the
> > > > > underlying problem in most situations. Shmem files would likely stay
> > > > > laying around and their charge along with them. Killing the allocating
> > > > > task has problems on its own because this could be just a DoS vector by
> > > > > other unrelated tasks sharing the shmem mount point without a graceful
> > > > > fallback. Retrying the page fault is hard to detect. SIGBUS might be
> > > > > something that helps with the latter. The question is how to communicate
> > > > > this requirement down to the memcg code to know that the memory reclaim
> > > > > should happen (Should it? How hard we should try?) but do not invoke the
> > > > > oom killer. The more I think about this the nastier this is.
> > > >
> > > > So actually I thought the ENOSPC suggestion was interesting so I took
> > > > the liberty to prototype it. The changes required:
> > > >
> > > > 1. In out_of_memory() we return false if !oc->chosen &&
> > > > is_remote_oom(). This gets bubbled up to try_charge_memcg() as
> > > > mem_cgroup_oom() returning OOM_FAILED.
> > > > 2. In try_charge_memcg(), if we get an OOM_FAILED we again check
> > > > is_remote_oom(), if it is a remote oom, return ENOSPC.
> > > > 3. The calling code would return ENOSPC to the user in the no-fault
> > > > path, and SIGBUS the user in the fault path with no changes.
> > >
> > > I think this should be implemented at the caller side rather than
> > > somehow hacked into the memcg core. It is the caller to know what to do.
> > > The caller can use gfp flags to control the reclaim behavior.
> > >
> >
> > Hmm I'm a bit struggling to envision this.  So would it be acceptable
> > at the call sites where we're doing a remote charge, such as
> > shmem_add_to_page_cache(), if we get ENOMEM from the
> > mem_cgroup_charge(), and we know we're doing a remote charge (because
> > current's memcg != the super block memcg), then we return ENOSPC from
> > shmem_add_to_page_cache()? I believe that will return ENOSPC to the
> > userspace in the non-pagefault path and SIGBUS in the pagefault path.
> > Or you had something else in mind?
>
> Yes, exactly. I meant that all this special casing would be done at the
> shmem layer as it knows how to communicate this usecase.
>

Awesome. The more I think of it I think the ENOSPC handling is perfect
for this use case, because it gives all users of the shared memory and
remote chargers a chance to gracefully handle the ENOSPC or the SIGBUS
when we hit the nothing to kill case. The only issue is finding a
clean implementation, and if the implementation I just proposed sounds
good to you then I see no issues and I'm happy to submit this in the
next version. Shakeel and others I would love to know what you think
either now or when I post the next version.

> [...]
>
> > > And just a small clarification. Tmpfs is fundamentally problematic from
> > > the OOM handling POV. The nuance here is that the OOM happens in a
> > > different memcg and thus a different resource domain. If you kill a task
> > > in the target memcg then you effectively DoS that workload. If you kill
> > > the allocating task then it is DoSed by anybody allowed to write to that
> > > shmem. All that without a graceful fallback.
> >
> > I don't know if this addresses your concern, but I'm limiting the
> > memcg= use to processes that can enter that memcg. Therefore they
> > would be able to allocate memory in that memcg anyway by entering it.
> > So if they wanted to intentionally DoS that memcg they can already do
> > it without this feature.
>
> Can you elaborate some more? How do you enforce that the mount point
> cannot be accessed by anybody outside of that constraint?

So if I'm a bad actor that wants to intentionally DoS random memcgs on
the system I can:

mount -t tmpfs -o memcg=/sys/fs/cgroup/unified/memcg-to-dos tmpfs /mnt/tmpfs
cat /dev/random > /mnt/tmpfs

That will reliably DoS the container. But we only allow you to mount
with memcg=/sys/fs/cgroup/unified/memcg-to-dos if you're able to enter
that memcg, so I can just do:

echo $$ > /sys/fs/cgroup/unified/memcg-to-dos/cgroup.procs
allocate_infinite_memory()

So we haven't added an attack vector really. More reasonably a sys
admin will set up a tmpfs mount with
memcg=/sys/fs/cgroup/unified/shared-memory-owner, and set the limit of
the shared-memory-owner to be big enough to handle the tasks running
in that memcg _and_ all the shared memory. The sys admin can also
limit the tmpfs with the size= option to limit the total size of the
shared memory. I think the sys admin could also set permissions on the
mount so only the users that share the memory can read/write, etc.

I'm sorry if this wasn't clear before and I'll take a good look at the
commit messages I'm writing and put as much info as possible in each.

As always thank you very much for your review and feedback.

> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
  2021-11-16 21:27                     ` Mina Almasry
@ 2021-11-16 21:55                       ` Shakeel Butt
  2021-11-18  8:48                           ` Michal Hocko
  2021-11-18  8:47                         ` Michal Hocko
  1 sibling, 1 reply; 42+ messages in thread
From: Shakeel Butt @ 2021-11-16 21:55 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Michal Hocko, Theodore Ts'o, Greg Thelen, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Tue, Nov 16, 2021 at 1:27 PM Mina Almasry <almasrymina@google.com> wrote:
>
> On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko@suse.com> wrote:
[...]
> > Yes, exactly. I meant that all this special casing would be done at the
> > shmem layer as it knows how to communicate this usecase.
> >
>
> Awesome. The more I think of it I think the ENOSPC handling is perfect
> for this use case, because it gives all users of the shared memory and
> remote chargers a chance to gracefully handle the ENOSPC or the SIGBUS
> when we hit the nothing to kill case. The only issue is finding a
> clean implementation, and if the implementation I just proposed sounds
> good to you then I see no issues and I'm happy to submit this in the
> next version. Shakeel and others I would love to know what you think
> either now or when I post the next version.
>

The direction seems reasonable to me. I would have more comments on
the actual code. At the high level I would prefer not to expose these
cases in the filesystem code (shmem or others) and instead be done in
a new memcg interface for filesystem users.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-18  8:47                         ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-18  8:47 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Tue 16-11-21 13:27:34, Mina Almasry wrote:
> On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko@suse.com> wrote:
[...]
> > Can you elaborate some more? How do you enforce that the mount point
> > cannot be accessed by anybody outside of that constraint?
> 
> So if I'm a bad actor that wants to intentionally DoS random memcgs on
> the system I can:
> 
> mount -t tmpfs -o memcg=/sys/fs/cgroup/unified/memcg-to-dos tmpfs /mnt/tmpfs
> cat /dev/random > /mnt/tmpfs

If you can mount tmpfs then you do not need to fiddle with memcgs at
all. You just DoS the whole machine. That is not what I was asking
though.

My question was more towards a different scenario. How do you
prevent random processes from _writing_ to those mount points? User/group
permissions might be just too coarse to describe memcg relation. Without
memcg in place somebody could cause ENOSPC to the mount point users
and that is not great either but that should be recoverable to some
degree. With memcg configuration this would cause the memcg OOM which
would be harder to recover from because it affects all memcg charges in
that cgroup - not just that specific fs access. See what I mean? This is
a completely new failure mode. 

The only reasonable way would be to reduce the visibility of that mount
point. This is certainly possible but it seems rather awkward when it
should be accessible from multiple resource domains.

I cannot really shake off feeling that this is potentially adding more
problems than it solves.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-18  8:47                         ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-18  8:47 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Theodore Ts'o, Greg Thelen, Shakeel Butt, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Tue 16-11-21 13:27:34, Mina Almasry wrote:
> On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
[...]
> > Can you elaborate some more? How do you enforce that the mount point
> > cannot be accessed by anybody outside of that constraint?
> 
> So if I'm a bad actor that wants to intentionally DoS random memcgs on
> the system I can:
> 
> mount -t tmpfs -o memcg=/sys/fs/cgroup/unified/memcg-to-dos tmpfs /mnt/tmpfs
> cat /dev/random > /mnt/tmpfs

If you can mount tmpfs then you do not need to fiddle with memcgs at
all. You just DoS the whole machine. That is not what I was asking
though.

My question was more towards a different scenario. How do you
prevent random processes from _writing_ to those mount points? User/group
permissions might be just too coarse to describe memcg relation. Without
memcg in place somebody could cause ENOSPC to the mount point users
and that is not great either but that should be recoverable to some
degree. With memcg configuration this would cause the memcg OOM which
would be harder to recover from because it affects all memcg charges in
that cgroup - not just that specific fs access. See what I mean? This is
a completely new failure mode. 

The only reasonable way would be to reduce the visibility of that mount
point. This is certainly possible but it seems rather awkward when it
should be accessible from multiple resource domains.

I cannot really shake off feeling that this is potentially adding more
problems than it solves.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-18  8:48                           ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-18  8:48 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Mina Almasry, Theodore Ts'o, Greg Thelen, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Tue 16-11-21 13:55:54, Shakeel Butt wrote:
> On Tue, Nov 16, 2021 at 1:27 PM Mina Almasry <almasrymina@google.com> wrote:
> >
> > On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > > Yes, exactly. I meant that all this special casing would be done at the
> > > shmem layer as it knows how to communicate this usecase.
> > >
> >
> > Awesome. The more I think of it I think the ENOSPC handling is perfect
> > for this use case, because it gives all users of the shared memory and
> > remote chargers a chance to gracefully handle the ENOSPC or the SIGBUS
> > when we hit the nothing to kill case. The only issue is finding a
> > clean implementation, and if the implementation I just proposed sounds
> > good to you then I see no issues and I'm happy to submit this in the
> > next version. Shakeel and others I would love to know what you think
> > either now or when I post the next version.
> >
> 
> The direction seems reasonable to me. I would have more comments on
> the actual code. At the high level I would prefer not to expose these
> cases in the filesystem code (shmem or others) and instead be done in
> a new memcg interface for filesystem users.

A library like function in the memcg proper sounds good to me I just
want to avoid any special casing in the core of the memcg charging and
special casing there.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-18  8:48                           ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2021-11-18  8:48 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Mina Almasry, Theodore Ts'o, Greg Thelen, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Tue 16-11-21 13:55:54, Shakeel Butt wrote:
> On Tue, Nov 16, 2021 at 1:27 PM Mina Almasry <almasrymina-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> >
> > On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> [...]
> > > Yes, exactly. I meant that all this special casing would be done at the
> > > shmem layer as it knows how to communicate this usecase.
> > >
> >
> > Awesome. The more I think of it I think the ENOSPC handling is perfect
> > for this use case, because it gives all users of the shared memory and
> > remote chargers a chance to gracefully handle the ENOSPC or the SIGBUS
> > when we hit the nothing to kill case. The only issue is finding a
> > clean implementation, and if the implementation I just proposed sounds
> > good to you then I see no issues and I'm happy to submit this in the
> > next version. Shakeel and others I would love to know what you think
> > either now or when I post the next version.
> >
> 
> The direction seems reasonable to me. I would have more comments on
> the actual code. At the high level I would prefer not to expose these
> cases in the filesystem code (shmem or others) and instead be done in
> a new memcg interface for filesystem users.

A library like function in the memcg proper sounds good to me I just
want to avoid any special casing in the core of the memcg charging and
special casing there.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-19 22:32                             ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-19 22:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Shakeel Butt, Theodore Ts'o, Greg Thelen, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel, linux-mm, linux-fsdevel,
	cgroups

On Thu, Nov 18, 2021 at 12:47 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-11-21 13:27:34, Mina Almasry wrote:
> > On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > > Can you elaborate some more? How do you enforce that the mount point
> > > cannot be accessed by anybody outside of that constraint?
> >
> > So if I'm a bad actor that wants to intentionally DoS random memcgs on
> > the system I can:
> >
> > mount -t tmpfs -o memcg=/sys/fs/cgroup/unified/memcg-to-dos tmpfs /mnt/tmpfs
> > cat /dev/random > /mnt/tmpfs
>
> If you can mount tmpfs then you do not need to fiddle with memcgs at
> all. You just DoS the whole machine. That is not what I was asking
> though.
>
> My question was more towards a different scenario. How do you
> prevent random processes from _writing_ to those mount points? User/group
> permissions might be just too coarse to describe memcg relation. Without
> memcg in place somebody could cause ENOSPC to the mount point users
> and that is not great either but that should be recoverable to some
> degree. With memcg configuration this would cause the memcg OOM which
> would be harder to recover from because it affects all memcg charges in
> that cgroup - not just that specific fs access. See what I mean? This is
> a completely new failure mode.
>
> The only reasonable way would be to reduce the visibility of that mount
> point. This is certainly possible but it seems rather awkward when it
> should be accessible from multiple resource domains.
>

So the problem of preventing random processes from writing to a mount
point is a generic problem on machine configurations where you have
untrusted code running on the machine, which is a very common case.
For us we have any number of random workloads or VMs running on the
machine and it's critical to limit their credentials to exactly what
these workloads need. Because of this, regardless of whether the
filesystem is mounted with memcg= or not, the write/execute/read
permissions are only given to those that need access to the mount
point. If this is not done correctly, there are potentially even more
serious problems than causing OOMs or SIGBUSes to users of the mount
point.

Because this is a generic problem, it's addressed 'elsewhere'. I'm
honestly not extremely familiar but my rough understanding is that
there are linux filesystem permissions and user namespaces to address
this, and there are also higher level constructs like containerd
which limits the visibility of jobs running on the system. My
understanding is that there are also sandboxes which go well beyond
limiting file access permissions.

To speak more concretely, for the 3 use cases I mention in the RFC
proposal (I'll attach that as cover letter in the next version):
1. For services running on the system, the shared tmpfs mount is only
visible and accessible (write/read) to the network service and its
client.
2. For large jobs with subprocesses that share memory like kubernetes,
the shared tmpfs is again only visible and accessible to the processes
in this job.
3. For filesystems that host shared libraries, it's a big no-no to
give anyone on the machine write permissions to the runtime AFAIU, so
I expect the mount point to be read-only.

Note that all these restrictions should and would be in place
regardless of whether the kernel supports the memcg= option or the
filesystem is mounted with memcg=. I'm not extremely familiar with the
implementation details on these restrictions, but I can grab them.

> I cannot really shake off feeling that this is potentially adding more
> problems than it solves.
> --
> Michal Hocko
> SUSE Labs

On Thu, Nov 18, 2021 at 12:48 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-11-21 13:55:54, Shakeel Butt wrote:
> > On Tue, Nov 16, 2021 at 1:27 PM Mina Almasry <almasrymina@google.com> wrote:
> > >
> > > On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko@suse.com> wrote:
> > [...]
> > > > Yes, exactly. I meant that all this special casing would be done at the
> > > > shmem layer as it knows how to communicate this usecase.
> > > >
> > >
> > > Awesome. The more I think of it I think the ENOSPC handling is perfect
> > > for this use case, because it gives all users of the shared memory and
> > > remote chargers a chance to gracefully handle the ENOSPC or the SIGBUS
> > > when we hit the nothing to kill case. The only issue is finding a
> > > clean implementation, and if the implementation I just proposed sounds
> > > good to you then I see no issues and I'm happy to submit this in the
> > > next version. Shakeel and others I would love to know what you think
> > > either now or when I post the next version.
> > >
> >
> > The direction seems reasonable to me. I would have more comments on
> > the actual code. At the high level I would prefer not to expose these
> > cases in the filesystem code (shmem or others) and instead be done in
> > a new memcg interface for filesystem users.
>
> A library like function in the memcg proper sounds good to me I just
> want to avoid any special casing in the core of the memcg charging and
> special casing there.
>

Yes, this is the implementation I'm working on and I'll submit another version.


> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/4] mm/oom: handle remote ooms
@ 2021-11-19 22:32                             ` Mina Almasry
  0 siblings, 0 replies; 42+ messages in thread
From: Mina Almasry @ 2021-11-19 22:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Shakeel Butt, Theodore Ts'o, Greg Thelen, Andrew Morton,
	Hugh Dickins, Roman Gushchin, Johannes Weiner, Tejun Heo,
	Vladimir Davydov, Muchun Song, riel-ebMLmSuQjDVBDgjK7y7TUQ,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Thu, Nov 18, 2021 at 12:47 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
> On Tue 16-11-21 13:27:34, Mina Almasry wrote:
> > On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> [...]
> > > Can you elaborate some more? How do you enforce that the mount point
> > > cannot be accessed by anybody outside of that constraint?
> >
> > So if I'm a bad actor that wants to intentionally DoS random memcgs on
> > the system I can:
> >
> > mount -t tmpfs -o memcg=/sys/fs/cgroup/unified/memcg-to-dos tmpfs /mnt/tmpfs
> > cat /dev/random > /mnt/tmpfs
>
> If you can mount tmpfs then you do not need to fiddle with memcgs at
> all. You just DoS the whole machine. That is not what I was asking
> though.
>
> My question was more towards a different scenario. How do you
> prevent random processes to _write_ to those mount points? User/group
> permissions might be just too coarse to describe memcg relation. Without
> memcg in place somebody could cause ENOSPC to the mount point users
> and that is not great either but that should be recoverable to some
> degree. With memcg configuration this would cause the memcg OOM which
> would be harder to recover from because it affects all memcg charges in
> that cgroup - not just that specific fs access. See what I mean? This is
> a completely new failure mode.
>
> The only reasonable way would be to reduce the visibility of that mount
> point. This is certainly possible but it seems rather awkward when it
> should be accessible from multiple resource domains.
>

So the problem of preventing random processes from writing to a mount
point is a generic problem on machine configurations where you have
untrusted code running on the machine, which is a very common case.
For us we have any number of random workloads or VMs running on the
machine and it's critical to limit their credentials to exactly what
these workloads need. Because of this, regardless of whether the
filesystem is mounted with memcg= or not, the write/execute/read
permissions are only given to those that need access to the mount
point. If this is not done correctly, there are potentially even more
serious problems than causing OOMs or SIGBUSes to users of the mount
point.

Because this is a generic problem, it's addressed 'elsewhere'. I'm
honestly not extremely familiar but my rough understanding is that
there are linux filesystem permissions and user namespaces to address
this, and there are also higher level constructs like containerd
which limits the visibility of jobs running on the system. My
understanding is that there are also sandboxes which go well beyond
limiting file access permissions.

To speak more concretely, for the 3 use cases I mention in the RFC
proposal (I'll attach that as cover letter in the next version):
1. For services running on the system, the shared tmpfs mount is only
visible and accessible (write/read) to the network service and its
client.
2. For large jobs with subprocesses that share memory like kubernetes,
the shared tmpfs is again only visible and accessible to the processes
in this job.
3. For filesystems that host shared libraries, it's a big no-no to
give anyone on the machine write permissions to the runtime AFAIU, so
I expect the mount point to be read-only.

Note that all these restrictions should and would be in place
regardless of whether the kernel supports the memcg= option or the
filesystem is mounted with memcg=. I'm not extremely familiar with the
implementation details on these restrictions, but I can grab them.

> I cannot really shake off feeling that this is potentially adding more
> problems than it solves.
> --
> Michal Hocko
> SUSE Labs

On Thu, Nov 18, 2021 at 12:48 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
>
> On Tue 16-11-21 13:55:54, Shakeel Butt wrote:
> > On Tue, Nov 16, 2021 at 1:27 PM Mina Almasry <almasrymina-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> > >
> > > On Tue, Nov 16, 2021 at 3:29 AM Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org> wrote:
> > [...]
> > > > Yes, exactly. I meant that all this special casing would be done at the
> > > > shmem layer as it knows how to communicate this usecase.
> > > >
> > >
> > > Awesome. The more I think of it I think the ENOSPC handling is perfect
> > > for this use case, because it gives all users of the shared memory and
> > > remote chargers a chance to gracefully handle the ENOSPC or the SIGBUS
> > > when we hit the nothing to kill case. The only issue is finding a
> > > clean implementation, and if the implementation I just proposed sounds
> > > good to you then I see no issues and I'm happy to submit this in the
> > > next version. Shakeel and others I would love to know what you think
> > > either now or when I post the next version.
> > >
> >
> > The direction seems reasonable to me. I would have more comments on
> > the actual code. At the high level I would prefer not to expose these
> > cases in the filesystem code (shmem or others) and instead be done in
> > a new memcg interface for filesystem users.
>
> A library-like function in the memcg proper sounds good to me; I just
> want to avoid any special casing in the core of the memcg charging and
> special casing there.
>

Yes, this is the implementation I'm working on and I'll submit another version.


> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2021-11-19 22:32 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20211111234203.1824138-1-almasrymina@google.com>
2021-11-11 23:42 ` [PATCH v3 1/4] mm/shmem: support deterministic charging of tmpfs Mina Almasry
2021-11-11 23:42   ` Mina Almasry
2021-11-11 23:42   ` Mina Almasry
2021-11-12  2:41   ` kernel test robot
2021-11-12  2:41     ` kernel test robot
2021-11-11 23:42 ` [PATCH v3 2/4] mm/oom: handle remote ooms Mina Almasry
2021-11-11 23:42   ` Mina Almasry
2021-11-11 23:42   ` Mina Almasry
2021-11-12  7:51   ` Michal Hocko
2021-11-12  7:51     ` Michal Hocko
2021-11-12  8:12     ` Mina Almasry
2021-11-12  8:12       ` Mina Almasry
2021-11-12  8:36       ` Michal Hocko
2021-11-12 17:59         ` Mina Almasry
2021-11-12 17:59           ` Mina Almasry
2021-11-15 10:58           ` Michal Hocko
2021-11-15 17:32             ` Shakeel Butt
2021-11-15 17:32               ` Shakeel Butt
2021-11-16  0:58             ` Mina Almasry
2021-11-16  0:58               ` Mina Almasry
2021-11-16  9:28               ` Michal Hocko
2021-11-16  9:28                 ` Michal Hocko
2021-11-16  9:39                 ` Michal Hocko
2021-11-16  9:39                   ` Michal Hocko
2021-11-16 10:17                 ` Mina Almasry
2021-11-16 10:17                   ` Mina Almasry
2021-11-16 11:29                   ` Michal Hocko
2021-11-16 11:29                     ` Michal Hocko
2021-11-16 21:27                     ` Mina Almasry
2021-11-16 21:55                       ` Shakeel Butt
2021-11-18  8:48                         ` Michal Hocko
2021-11-18  8:48                           ` Michal Hocko
2021-11-19 22:32                           ` Mina Almasry
2021-11-19 22:32                             ` Mina Almasry
2021-11-18  8:47                       ` Michal Hocko
2021-11-18  8:47                         ` Michal Hocko
2021-11-11 23:42 ` [PATCH v3 3/4] mm, shmem: add tmpfs memcg= option documentation Mina Almasry
2021-11-11 23:42   ` Mina Almasry
2021-11-11 23:42   ` Mina Almasry
2021-11-11 23:42 ` [PATCH v3 4/4] mm, shmem, selftests: add tmpfs memcg= mount option tests Mina Almasry
2021-11-11 23:42   ` Mina Almasry
2021-11-11 23:42   ` Mina Almasry

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.