linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
@ 2021-07-22 15:20 brookxu
  2021-07-22 15:20 ` [RFC PATCH v2 2/3] misc_cgroup: add failcnt counter brookxu
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: brookxu @ 2021-07-22 15:20 UTC (permalink / raw)
  To: viro, tj, lizefan.x, hannes; +Cc: linux-kernel, linux-fsdevel, cgroups

From: Chunguang Xu <brookxu@tencent.com>

Since the global open files are limited, in order to avoid the
abnormal behavior of some containers from generating too many
files, causing other containers to be unavailable, we need to
limit the open files of some containers.

v2: fix compile error while CONFIG_CGROUP_MISC not set.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reported-by: kernel test robot <lkp@intel.com>
---
 fs/file_table.c             | 28 ++++++++++++++++++++++++++--
 include/linux/fs.h          |  4 +++-
 include/linux/misc_cgroup.h |  1 +
 kernel/cgroup/misc.c        |  1 +
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 45437f8e1003..5957b2de9701 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -29,6 +29,7 @@
 #include <linux/swap.h>
 
 #include <linux/atomic.h>
+#include <linux/misc_cgroup.h>
 
 #include "internal.h"
 
@@ -53,8 +54,16 @@ static void file_free_rcu(struct rcu_head *head)
 static inline void file_free(struct file *f)
 {
 	security_file_free(f);
-	if (!(f->f_mode & FMODE_NOACCOUNT))
+	if (!(f->f_mode & FMODE_NOACCOUNT)) {
+#ifdef CONFIG_CGROUP_MISC
+		struct misc_cg *misc_cg = css_misc(f->f_css);
+
+		misc_cg_uncharge(MISC_CG_RES_NOFILE, misc_cg, 1);
+		put_misc_cg(misc_cg);
+#endif
+
 		percpu_counter_dec(&nr_files);
+	}
 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
@@ -148,8 +157,22 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
 	}
 
 	f = __alloc_file(flags, cred);
-	if (!IS_ERR(f))
+	if (!IS_ERR(f)) {
+#ifdef CONFIG_CGROUP_MISC
+		struct misc_cg *misc_cg = get_current_misc_cg();
+		int ret;
+
+		ret = misc_cg_try_charge(MISC_CG_RES_NOFILE, misc_cg, 1);
+		if (ret < 0) {
+			file_free(f);
+			put_misc_cg(misc_cg);
+			return ERR_PTR(-ENFILE);
+		}
+		f->f_css = &misc_cg->css;
+#endif
+
 		percpu_counter_inc(&nr_files);
+	}
 
 	return f;
 
@@ -397,4 +420,5 @@ void __init files_maxfiles_init(void)
 	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
+	misc_cg_set_capacity(MISC_CG_RES_NOFILE, files_stat.max_files);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fad6663cd1b0..9ef3dd579ed6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -947,7 +947,9 @@ struct file {
 #endif
 	/* needed for tty driver, and maybe others */
 	void			*private_data;
-
+#ifdef CONFIG_CGROUP_MISC
+	struct cgroup_subsys_state *f_css;
+#endif
 #ifdef CONFIG_EPOLL
 	/* Used by fs/eventpoll.c to link all the hooks to this file */
 	struct hlist_head	*f_ep;
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index da2367e2ac1e..8450a5e66de0 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -18,6 +18,7 @@ enum misc_res_type {
 	/* AMD SEV-ES ASIDs resource */
 	MISC_CG_RES_SEV_ES,
 #endif
+	MISC_CG_RES_NOFILE,
 	MISC_CG_RES_TYPES
 };
 
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index ec02d963cad1..5d51b8eeece6 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -24,6 +24,7 @@ static const char *const misc_res_name[] = {
 	/* AMD SEV-ES ASIDs resource */
 	"sev_es",
 #endif
+	"nofile"
 };
 
 /* Root misc cgroup */
-- 
2.30.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 2/3] misc_cgroup: add failcnt counter
  2021-07-22 15:20 [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit brookxu
@ 2021-07-22 15:20 ` brookxu
  2021-07-22 15:20 ` [RFC PATCH v2 3/3] misc_cgroup: delete failed logs to avoid log flooding brookxu
  2021-07-26 21:27 ` [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit Tejun Heo
  2 siblings, 0 replies; 11+ messages in thread
From: brookxu @ 2021-07-22 15:20 UTC (permalink / raw)
  To: viro, tj, lizefan.x, hannes; +Cc: linux-kernel, linux-fsdevel, cgroups

From: Chunguang Xu <brookxu@tencent.com>

Instead of printing logs, we should probably track failures through
a failcnt counter, similar to mem_cgroup.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
---
 include/linux/misc_cgroup.h |  1 +
 kernel/cgroup/misc.c        | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index 8450a5e66de0..dd1a786f39b8 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -37,6 +37,7 @@ struct misc_cg;
 struct misc_res {
 	unsigned long max;
 	atomic_long_t usage;
+	atomic_long_t failcnt;
 	bool failed;
 };
 
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 5d51b8eeece6..7c568b619f82 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -165,6 +165,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
 				pr_cont("\n");
 				res->failed = true;
 			}
+			atomic_long_inc(&res->failcnt);
 			ret = -EBUSY;
 			goto err_charge;
 		}
@@ -312,6 +313,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
 	return 0;
 }
 
+/**
+ * misc_cg_failcnt_show() - Show the fail count of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_failcnt_show(struct seq_file *sf, void *v)
+{
+	int i;
+	unsigned long failcnt;
+	struct misc_cg *cg = css_misc(seq_css(sf));
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		failcnt = atomic_long_read(&cg->res[i].failcnt);
+		if (READ_ONCE(misc_res_capacity[i]) || failcnt)
+			seq_printf(sf, "%s %lu\n", misc_res_name[i], failcnt);
+	}
+
+	return 0;
+}
+
 /**
  * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
  * @sf: Interface file
@@ -349,6 +373,11 @@ static struct cftype misc_cg_files[] = {
 		.seq_show = misc_cg_current_show,
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
+	{
+		.name = "failcnt",
+		.seq_show = misc_cg_failcnt_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{
 		.name = "capacity",
 		.seq_show = misc_cg_capacity_show,
@@ -383,6 +412,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
 		WRITE_ONCE(cg->res[i].max, MAX_NUM);
 		atomic_long_set(&cg->res[i].usage, 0);
+		atomic_long_set(&cg->res[i].failcnt, 0);
 	}
 
 	return &cg->css;
-- 
2.30.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 3/3] misc_cgroup: delete failed logs to avoid log flooding
  2021-07-22 15:20 [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit brookxu
  2021-07-22 15:20 ` [RFC PATCH v2 2/3] misc_cgroup: add failcnt counter brookxu
@ 2021-07-22 15:20 ` brookxu
  2021-07-26 21:27 ` [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit Tejun Heo
  2 siblings, 0 replies; 11+ messages in thread
From: brookxu @ 2021-07-22 15:20 UTC (permalink / raw)
  To: viro, tj, lizefan.x, hannes; +Cc: linux-kernel, linux-fsdevel, cgroups

From: Chunguang Xu <brookxu@tencent.com>

Since the upper-level logic will constantly retry when it fails, in
high-stress scenarios, a large number of failure logs may affect
performance. Therefore, we can replace it with the failcnt counter.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
---
 kernel/cgroup/misc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 7c568b619f82..b7de0fafa48a 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -159,8 +159,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
 		if (new_usage > READ_ONCE(res->max) ||
 		    new_usage > READ_ONCE(misc_res_capacity[type])) {
 			if (!res->failed) {
-				pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
-					misc_res_name[type]);
 				pr_cont_cgroup_path(i->css.cgroup);
 				pr_cont("\n");
 				res->failed = true;
-- 
2.30.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-22 15:20 [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit brookxu
  2021-07-22 15:20 ` [RFC PATCH v2 2/3] misc_cgroup: add failcnt counter brookxu
  2021-07-22 15:20 ` [RFC PATCH v2 3/3] misc_cgroup: delete failed logs to avoid log flooding brookxu
@ 2021-07-26 21:27 ` Tejun Heo
  2021-07-27  3:18   ` brookxu
  2 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2021-07-26 21:27 UTC (permalink / raw)
  To: brookxu; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups

On Thu, Jul 22, 2021 at 11:20:17PM +0800, brookxu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> Since the global open files are limited, in order to avoid the
> abnormal behavior of some containers from generating too many
> files, causing other containers to be unavailable, we need to
> limit the open files of some containers.
> 
> v2: fix compile error while CONFIG_CGROUP_MISC not set.
> 
> Signed-off-by: Chunguang Xu <brookxu@tencent.com>
> Reported-by: kernel test robot <lkp@intel.com>

This is different from pid in that there's no actual limit on how many open
files there can be in the system other than the total amount of available
memory. I don't see why this would need a separate limit outside of memory
control. A couple machines I looked at all have file-max at LONG_MAX by
default too.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-26 21:27 ` [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit Tejun Heo
@ 2021-07-27  3:18   ` brookxu
  2021-07-27 16:32     ` Tejun Heo
  0 siblings, 1 reply; 11+ messages in thread
From: brookxu @ 2021-07-27  3:18 UTC (permalink / raw)
  To: Tejun Heo; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups


Thanks for your time.

Tejun Heo wrote on 2021/7/27 5:27:
> On Thu, Jul 22, 2021 at 11:20:17PM +0800, brookxu wrote:
>> From: Chunguang Xu <brookxu@tencent.com>
>>
>> Since the global open files are limited, in order to avoid the
>> abnormal behavior of some containers from generating too many
>> files, causing other containers to be unavailable, we need to
>> limit the open files of some containers.
>>
>> v2: fix compile error while CONFIG_CGROUP_MISC not set.
>>
>> Signed-off-by: Chunguang Xu <brookxu@tencent.com>
>> Reported-by: kernel test robot <lkp@intel.com>
> 
> This is different from pid in that there's no actual limit on how many open
> files there can be in the system other than the total amount of available
> memory. I don't see why this would need a separate limit outside of memory
> control. A couple machines I looked at all have file-max at LONG_MAX by
> default too.

According to files_maxfiles_init(), we only allow about 10% of free memory to
create filps, and each filp occupies about 1K of cache. In this way, on a 16G
memory machine, the maximum usable filp is about 1,604,644. In general
scenarios, this may not be a big problem, but if the task is abnormal, it will
very likely become a bottleneck and affect other modules. 

> Thanks.
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-27  3:18   ` brookxu
@ 2021-07-27 16:32     ` Tejun Heo
  2021-07-28  3:17       ` brookxu
  0 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2021-07-27 16:32 UTC (permalink / raw)
  To: brookxu; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups

Hello,

On Tue, Jul 27, 2021 at 11:18:00AM +0800, brookxu wrote:
> According to files_maxfiles_init(), we only allow about 10% of free memory to
> create filps, and each filp occupies about 1K of cache. In this way, on a 16G
> memory machine, the maximum usable filp is about 1,604,644. In general
> scenarios, this may not be a big problem, but if the task is abnormal, it will
> very likely become a bottleneck and affect other modules. 

Yeah but that can be configured trivially through sysfs. The reason why the
default limit is lowered is because we wanna prevent a part of system to
consume all the memory through fds. With cgroups, we already have that
protection and at least some systems already configure file-max to maximum,
so I don't see a point in adding another interface to subdivide the
artificial limit.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-27 16:32     ` Tejun Heo
@ 2021-07-28  3:17       ` brookxu
  2021-07-28  7:41         ` Tejun Heo
  0 siblings, 1 reply; 11+ messages in thread
From: brookxu @ 2021-07-28  3:17 UTC (permalink / raw)
  To: Tejun Heo; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups



Tejun Heo wrote on 2021/7/28 12:32 上午:
> Hello,
> 
> On Tue, Jul 27, 2021 at 11:18:00AM +0800, brookxu wrote:
>> According to files_maxfiles_init(), we only allow about 10% of free memory to
>> create filps, and each filp occupies about 1K of cache. In this way, on a 16G
>> memory machine, the maximum usable filp is about 1,604,644. In general
>> scenarios, this may not be a big problem, but if the task is abnormal, it will
>> very likely become a bottleneck and affect other modules. 
> 
> Yeah but that can be configured trivially through sysfs. The reason why the
> default limit is lowered is because we wanna prevent a part of system to
> consume all the memory through fds. With cgroups, we already have that
> protection and at least some systems already configure file-max to maximum,
> so I don't see a point in adding another interface to subdivide the
> artificial limit.
> 

Yeah we can adjust file-max through sysctl, but in many cases we adjust it according
to the actual load of the machine, not for abnormal tasks. Another problem is that in
practical applications, kmem_limit will cause some minor problems. In many cases,
kmem_limit is disabled. Limit_in_bytes mainly counts user pages and pagecache, which
may cause files_cache to be out of control. In this case, if file-max is set to MAX,
we may have a risk in the abnormal scene, which prevents us from recovering from the
abnormal scene. Maybe I missed something.

> Thanks.
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-28  3:17       ` brookxu
@ 2021-07-28  7:41         ` Tejun Heo
  2021-07-28  9:47           ` brookxu
  0 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2021-07-28  7:41 UTC (permalink / raw)
  To: brookxu; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups

On Wed, Jul 28, 2021 at 11:17:08AM +0800, brookxu wrote:
> Yeah we can adjust file-max through sysctl, but in many cases we adjust it according
> to the actual load of the machine, not for abnormal tasks. Another problem is that in
> practical applications, kmem_limit will cause some minor problems. In many cases,
> kmem_limit is disabled. Limit_in_bytes mainly counts user pages and pagecache, which
> may cause files_cache to be out of control. In this case, if file-max is set to MAX,
> we may have a risk in the abnormal scene, which prevents us from recovering from the
> abnormal scene. Maybe I missed something.

Kmem control is always on in cgroup2 and has been in wide production use for
years now. If there are problems with it, we need to fix them. That really
doesn't justify adding another feature.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-28  7:41         ` Tejun Heo
@ 2021-07-28  9:47           ` brookxu
  2021-07-28 15:38             ` Tejun Heo
  0 siblings, 1 reply; 11+ messages in thread
From: brookxu @ 2021-07-28  9:47 UTC (permalink / raw)
  To: Tejun Heo; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups



Tejun Heo wrote on 2021/7/28 3:41 下午:
> On Wed, Jul 28, 2021 at 11:17:08AM +0800, brookxu wrote:
>> Yeah we can adjust file-max through sysctl, but in many cases we adjust it according
>> to the actual load of the machine, not for abnormal tasks. Another problem is that in
>> practical applications, kmem_limit will cause some minor problems. In many cases,
>> kmem_limit is disabled. Limit_in_bytes mainly counts user pages and pagecache, which
>> may cause files_cache to be out of control. In this case, if file-max is set to MAX,
>> we may have a risk in the abnormal scene, which prevents us from recovering from the
>> abnormal scene. Maybe I missed something.
> 
> Kmem control is always on in cgroup2 and has been in wide production use for
> years now. If there are problems with it, we need to fix them. That really
> doesn't justify adding another feature.

But considering stability issues(k8s), There are still many production environments use
cgroup v1 without kmem. If kmem is enabled, due to the relatively large granularity
of kmem, this feature can also prevent the abnormal open behavior from making the entire
container unavailable? but I currently do not have this scenario.

Thanks for your time.

> Thanks.
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-28  9:47           ` brookxu
@ 2021-07-28 15:38             ` Tejun Heo
  2021-07-29  6:37               ` brookxu
  0 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2021-07-28 15:38 UTC (permalink / raw)
  To: brookxu; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups

Hello,

On Wed, Jul 28, 2021 at 05:47:05PM +0800, brookxu wrote:
> But considering stability issues(k8s), There are still many production environments use
> cgroup v1 without kmem. If kmem is enabled, due to the relatively large granularity
> of kmem, this feature can also prevent the abnormal open behavior from making the entire
> container unavailable? but I currently do not have this scenario.

Now we are repeating the same points. This simply doesn't justify adding a
user-facing feature that we have to maintain for eternity.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit
  2021-07-28 15:38             ` Tejun Heo
@ 2021-07-29  6:37               ` brookxu
  0 siblings, 0 replies; 11+ messages in thread
From: brookxu @ 2021-07-29  6:37 UTC (permalink / raw)
  To: Tejun Heo; +Cc: viro, lizefan.x, hannes, linux-kernel, linux-fsdevel, cgroups


Thanks for your time.

Tejun Heo wrote on 2021/7/28 11:38 下午:
> Hello,
> 
> On Wed, Jul 28, 2021 at 05:47:05PM +0800, brookxu wrote:
>> But considering stability issues(k8s), There are still many production environments use
>> cgroup v1 without kmem. If kmem is enabled, due to the relatively large granularity
>> of kmem, this feature can also prevent the abnormal open behavior from making the entire
>> container unavailable? but I currently do not have this scenario.
> 
> Now we are repeating the same points. This simply doesn't justify adding a
> user-facing feature that we have to maintain for eternity.

Ok, thanks you for your patient reply.

> Thanks.
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-07-29  6:37 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-22 15:20 [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit brookxu
2021-07-22 15:20 ` [RFC PATCH v2 2/3] misc_cgroup: add failcnt counter brookxu
2021-07-22 15:20 ` [RFC PATCH v2 3/3] misc_cgroup: delete failed logs to avoid log flooding brookxu
2021-07-26 21:27 ` [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit Tejun Heo
2021-07-27  3:18   ` brookxu
2021-07-27 16:32     ` Tejun Heo
2021-07-28  3:17       ` brookxu
2021-07-28  7:41         ` Tejun Heo
2021-07-28  9:47           ` brookxu
2021-07-28 15:38             ` Tejun Heo
2021-07-29  6:37               ` brookxu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).