linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/14] resource limits: foundation for resource highwater tracking
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 12:12   ` kbuild test robot
  2016-07-15 12:49   ` Nicolas Dichtel
  2016-07-15 10:35 ` [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level Topi Miettinen
                   ` (13 subsequent siblings)
  14 siblings, 2 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Jonathan Corbet, Ingo Molnar, Peter Zijlstra,
	Balbir Singh, David S. Miller, Nicolas Dichtel, Markus Elfring,
	Thomas Gleixner, Rik van Riel, open list:DOCUMENTATION

There are many basic ways to control processes, including capabilities,
cgroups and resource limits. However, there are far fewer ways to find out
useful values for the limits, except blind trial and error.

Prepare a foundation for resource highwater tracking.

The collected highwater marks for the resources can be read through
the taskstats netlink interface.

This depends on CONFIG_TASK_XACCT.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 Documentation/accounting/getdelays.c | 52 +++++++++++++++++++++++++++++++++---
 include/linux/sched.h                | 31 +++++++++++++++++++++
 include/linux/tsacct_kern.h          |  3 +++
 include/uapi/linux/taskstats.h       | 10 ++++++-
 kernel/taskstats.c                   |  4 +++
 kernel/tsacct.c                      | 26 ++++++++++++++++++
 6 files changed, 122 insertions(+), 4 deletions(-)

diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index b5ca536..489f1b7 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -8,7 +8,7 @@
  * Copyright (c) Jay Lan, SGI. 2006
  *
  * Compile with
- *	gcc -I/usr/src/linux/include getdelays.c -o getdelays
+ *	gcc -I/usr/src/linux getdelays.c -o getdelays
  */
 
 #include <stdio.h>
@@ -22,10 +22,11 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/wait.h>
+#include <sys/resource.h>
 #include <signal.h>
 
 #include <linux/genetlink.h>
-#include <linux/taskstats.h>
+#include "include/uapi/linux/taskstats.h"
 #include <linux/cgroupstats.h>
 
 /*
@@ -50,6 +51,7 @@ char name[100];
 int dbg;
 int print_delays;
 int print_io_accounting;
+int print_resource_accounting;
 int print_task_context_switch_counts;
 
 #define PRINTF(fmt, arg...) {			\
@@ -63,6 +65,8 @@ int print_task_context_switch_counts;
 /* Maximum number of cpus expected to be specified in a cpumask */
 #define MAX_CPUS	32
 
+#define TASKSTATS_VERSION_WITH_RESOURCE	9
+
 struct msgtemplate {
 	struct nlmsghdr n;
 	struct genlmsghdr g;
@@ -77,6 +81,7 @@ static void usage(void)
 			"[-m cpumask] [-t tgid] [-p pid]\n");
 	fprintf(stderr, "  -d: print delayacct stats\n");
 	fprintf(stderr, "  -i: print IO accounting (works only with -p)\n");
+	fprintf(stderr, "  -R: print resource accounting stats\n");
 	fprintf(stderr, "  -l: listen forever\n");
 	fprintf(stderr, "  -v: debug on\n");
 	fprintf(stderr, "  -C: container path\n");
@@ -232,6 +237,25 @@ static void task_context_switch_counts(struct taskstats *t)
 	       (unsigned long long)t->nvcsw, (unsigned long long)t->nivcsw);
 }
 
+static const char *const rlimit_names[] = {
+	[RLIMIT_CPU] = "RLIMIT_CPU",
+	[RLIMIT_FSIZE] = "RLIMIT_FSIZE",
+	[RLIMIT_DATA] = "RLIMIT_DATA",
+	[RLIMIT_STACK] = "RLIMIT_STACK",
+	[RLIMIT_CORE] = "RLIMIT_CORE",
+	[RLIMIT_RSS] = "RLIMIT_RSS",
+	[RLIMIT_NPROC] = "RLIMIT_NPROC",
+	[RLIMIT_NOFILE] = "RLIMIT_NOFILE",
+	[RLIMIT_MEMLOCK] = "RLIMIT_MEMLOCK",
+	[RLIMIT_AS] = "RLIMIT_AS",
+	[RLIMIT_LOCKS] = "RLIMIT_LOCKS",
+	[RLIMIT_SIGPENDING] = "RLIMIT_SIGPENDING",
+	[RLIMIT_MSGQUEUE] = "RLIMIT_MSGQUEUE",
+	[RLIMIT_NICE] = "RLIMIT_NICE",
+	[RLIMIT_RTPRIO] = "RLIMIT_RTPRIO",
+	[RLIMIT_RTTIME] = "RLIMIT_RTTIME",
+};
+
 static void print_cgroupstats(struct cgroupstats *c)
 {
 	printf("sleeping %llu, blocked %llu, running %llu, stopped %llu, "
@@ -252,6 +276,22 @@ static void print_ioacct(struct taskstats *t)
 		(unsigned long long)t->cancelled_write_bytes);
 }
 
+static void print_racct(const struct taskstats *t)
+{
+	int i;
+
+	if (t->version < TASKSTATS_VERSION_WITH_RESOURCE) {
+		printf("kernel too old (%d < %d)\n", t->version,
+		       TASKSTATS_VERSION_WITH_RESOURCE);
+		return;
+	}
+
+	for (i = 0; i < RLIM_NLIMITS; i++)
+		printf("%s=%llu\n",
+		       rlimit_names[i],
+		       (unsigned long long)t->resource_hiwater[i]);
+}
+
 int main(int argc, char *argv[])
 {
 	int c, rc, rep_len, aggr_len, len2;
@@ -280,7 +320,7 @@ int main(int argc, char *argv[])
 	struct msgtemplate msg;
 
 	while (!forking) {
-		c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:c:");
+		c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:c:R");
 		if (c < 0)
 			break;
 
@@ -297,6 +337,10 @@ int main(int argc, char *argv[])
 			printf("printing task/process context switch rates\n");
 			print_task_context_switch_counts = 1;
 			break;
+		case 'R':
+			printf("printing resource accounting\n");
+			print_resource_accounting = 1;
+			break;
 		case 'C':
 			containerset = 1;
 			containerpath = optarg;
@@ -497,6 +541,8 @@ int main(int argc, char *argv[])
 							print_ioacct((struct taskstats *) NLA_DATA(na));
 						if (print_task_context_switch_counts)
 							task_context_switch_counts((struct taskstats *) NLA_DATA(na));
+						if (print_resource_accounting)
+							print_racct((struct taskstats *) NLA_DATA(na));
 						if (fd) {
 							if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
 								err(1,"write error\n");
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 253538f..e4d7482 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -763,6 +763,9 @@ struct signal_struct {
 	unsigned long inblock, oublock, cinblock, coublock;
 	unsigned long maxrss, cmaxrss;
 	struct task_io_accounting ioac;
+#ifdef CONFIG_TASK_XACCT
+	unsigned long resource_highwatermark[RLIM_NLIMITS];
+#endif
 
 	/*
 	 * Cumulative ns of schedule CPU time fo dead threads in the
@@ -3323,6 +3326,24 @@ static inline void inc_syscw(struct task_struct *tsk)
 {
 	tsk->ioac.syscw++;
 }
+
+static inline void task_update_resource_highwatermark(struct task_struct *tsk,
+						      unsigned int limit,
+						      unsigned long r)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	write_seqlock(&sig->stats_lock);
+	if ((sig->resource_highwatermark[limit]) < r)
+		sig->resource_highwatermark[limit] = r;
+	write_sequnlock(&sig->stats_lock);
+}
+
+static inline void update_resource_highwatermark(unsigned int limit,
+						 unsigned long r)
+{
+	task_update_resource_highwatermark(current, limit, r);
+}
 #else
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
@@ -3339,6 +3360,16 @@ static inline void inc_syscr(struct task_struct *tsk)
 static inline void inc_syscw(struct task_struct *tsk)
 {
 }
+static inline void task_update_resource_highwatermark(struct task_struct *tsk,
+						      unsigned int limit,
+						      unsigned long r)
+{
+}
+
+static inline void update_resource_highwatermark(unsigned int limit,
+						 unsigned long r)
+{
+}
 #endif
 
 #ifndef TASK_SIZE_OF
diff --git a/include/linux/tsacct_kern.h b/include/linux/tsacct_kern.h
index 3251965..bcf1301 100644
--- a/include/linux/tsacct_kern.h
+++ b/include/linux/tsacct_kern.h
@@ -25,6 +25,7 @@ extern void xacct_add_tsk(struct taskstats *stats, struct task_struct *p);
 extern void acct_update_integrals(struct task_struct *tsk);
 extern void acct_account_cputime(struct task_struct *tsk);
 extern void acct_clear_integrals(struct task_struct *tsk);
+extern void racct_add_tsk(struct taskstats *stats, struct task_struct *tsk);
 #else
 static inline void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {}
@@ -34,6 +35,8 @@ static inline void acct_account_cputime(struct task_struct *tsk)
 {}
 static inline void acct_clear_integrals(struct task_struct *tsk)
 {}
+static inline void racct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{}
 #endif /* CONFIG_TASK_XACCT */
 
 #endif
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 2466e55..8c65194 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -33,7 +33,7 @@
  */
 
 
-#define TASKSTATS_VERSION	8
+#define TASKSTATS_VERSION	9
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -163,6 +163,14 @@ struct taskstats {
 	/* Delay waiting for memory reclaim */
 	__u64	freepages_count;
 	__u64	freepages_delay_total;
+	/* Per-task storage I/O accounting ends */
+
+#define TASKSTATS_HAS_LIMIT_ACCOUNTING
+	/* Per-task resource accounting starts */
+	__u64   resource_hiwater[RLIM_NLIMITS]; /* high-watermark of
+						     RLIMIT
+						     resources */
+	/* Per-task resource accounting ends */
 };
 
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee..9a03e6b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/resource.h>
 #include <linux/taskstats_kern.h>
 #include <linux/tsacct_kern.h>
 #include <linux/delayacct.h>
@@ -188,6 +189,9 @@ static void fill_stats(struct user_namespace *user_ns,
 
 	/* fill in extended acct fields */
 	xacct_add_tsk(stats, tsk);
+
+	/* fill in resource acct fields */
+	racct_add_tsk(stats, tsk);
 }
 
 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f8e26ab..231bae3 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -183,4 +183,30 @@ void acct_clear_integrals(struct task_struct *tsk)
 	tsk->acct_rss_mem1 = 0;
 	tsk->acct_vm_mem1 = 0;
 }
+
+/*
+ * fill in resource accounting fields
+ */
+void racct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	int i;
+	unsigned int seq, nextseq;
+	unsigned long flags;
+
+	rcu_read_lock();
+	/* Attempt a lockless read on the first round. */
+	nextseq = 0;
+	do {
+		seq = nextseq;
+		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+		for (i = 0; i < RLIM_NLIMITS; i++)
+			stats->resource_hiwater[i] = (__u64)sig->resource_highwatermark[i];
+
+		/* If lockless access failed, take the lock. */
+		nextseq = 1;
+	} while (need_seqretry(&sig->stats_lock, seq));
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+	rcu_read_unlock();
+}
 #endif
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
  2016-07-15 10:35 ` [PATCH 01/14] resource limits: foundation for resource highwater tracking Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 12:38   ` kbuild test robot
  2016-07-15 14:10   ` Tejun Heo
  2016-07-15 10:35 ` [PATCH 03/14] resource limits: track highwater mark of file sizes Topi Miettinen
                   ` (12 subsequent siblings)
  14 siblings, 2 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Jonathan Corbet, Tejun Heo, Li Zefan,
	Johannes Weiner, Markus Elfring, David S. Miller,
	Nicolas Dichtel, open list:DOCUMENTATION,
	open list:CONTROL GROUP (CGROUP)

Aggregate the resource usage highwater marks of a task into the
cgroup statistics when the task exits.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 Documentation/accounting/getdelays.c | 10 ++++++-
 include/linux/cgroup-defs.h          |  5 ++++
 include/uapi/linux/cgroupstats.h     |  3 ++
 kernel/cgroup.c                      | 55 ++++++++++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index 489f1b7..7c86279 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -27,7 +27,7 @@
 
 #include <linux/genetlink.h>
 #include "include/uapi/linux/taskstats.h"
-#include <linux/cgroupstats.h>
+#include "include/uapi/linux/cgroupstats.h"
 
 /*
  * Generic macros for dealing with netlink sockets. Might be duplicated
@@ -258,12 +258,20 @@ static const char *const rlimit_names[] = {
 
 static void print_cgroupstats(struct cgroupstats *c)
 {
+	int i;
+
 	printf("sleeping %llu, blocked %llu, running %llu, stopped %llu, "
 		"uninterruptible %llu\n", (unsigned long long)c->nr_sleeping,
 		(unsigned long long)c->nr_io_wait,
 		(unsigned long long)c->nr_running,
 		(unsigned long long)c->nr_stopped,
 		(unsigned long long)c->nr_uninterruptible);
+
+	if (print_resource_accounting)
+		for (i = 0; i < RLIM_NLIMITS; i++)
+			printf("%s=%llu\n",
+			       rlimit_names[i],
+			       (unsigned long long)c->resource_hiwater[i]);
 }
 
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5b17de6..86bbc08 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
+#include <linux/cgroupstats.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -300,6 +301,10 @@ struct cgroup {
 	/* used to schedule release agent */
 	struct work_struct release_agent_work;
 
+#ifdef CONFIG_TASK_XACCT
+	struct cgroupstats stats;
+#endif
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/include/uapi/linux/cgroupstats.h b/include/uapi/linux/cgroupstats.h
index 3753c33..18b5b11 100644
--- a/include/uapi/linux/cgroupstats.h
+++ b/include/uapi/linux/cgroupstats.h
@@ -35,6 +35,9 @@ struct cgroupstats {
 	__u64	nr_uninterruptible;	/* Number of tasks in uninterruptible */
 					/* state */
 	__u64	nr_io_wait;		/* Number of tasks waiting on IO */
+	__u64   resource_hiwater[RLIM_NLIMITS]; /* high-watermark of
+						     RLIMIT
+						     resources */
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff0..9b2d805 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -247,6 +247,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_update_stats(void);
 
 /**
  * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
@@ -2609,6 +2610,8 @@ out_release_tset:
 		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 		list_del_init(&cset->mg_node);
 	}
+	cgroup_update_stats();
+
 	spin_unlock_irq(&css_set_lock);
 	return ret;
 }
@@ -4657,6 +4660,53 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	return 0;
 }
 
+/*
+ * Update cgroupstats based on the stats from exiting task
+ */
+static void cgroup_update_stats_from_task(struct cgroup *cgrp,
+					  struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	int i;
+	unsigned int seq, nextseq;
+	unsigned long flags;
+
+	rcu_read_lock();
+	/* Attempt a lockless read on the first round. */
+	nextseq = 0;
+	do {
+		seq = nextseq;
+		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+		for (i = 0; i < RLIM_NLIMITS; i++)
+			if (cgrp->stats.resource_hiwater[i] <
+			    sig->resource_highwatermark[i])
+				cgrp->stats.resource_hiwater[i] =
+					sig->resource_highwatermark[i];
+
+		/* If lockless access failed, take the lock. */
+		nextseq = 1;
+	} while (need_seqretry(&sig->stats_lock, seq));
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+	rcu_read_unlock();
+}
+
+static void cgroup_update_stats(void)
+{
+	struct cgroup_root *root;
+
+	for_each_root(root) {
+		struct cgroup *cgrp;
+
+		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+			continue;
+
+		cgrp = task_cgroup_from_root(current, root);
+
+		if (cgroup_on_dfl(cgrp))
+			cgroup_update_stats_from_task(cgrp, current);
+	}
+}
+
 /**
  * cgroupstats_build - build and fill cgroupstats
  * @stats: cgroupstats to fill information into
@@ -4672,6 +4722,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 	struct cgroup *cgrp;
 	struct css_task_iter it;
 	struct task_struct *tsk;
+	int i;
 
 	/* it should be kernfs_node belonging to cgroupfs and is a directory */
 	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
@@ -4714,9 +4765,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 				stats->nr_io_wait++;
 			break;
 		}
+		cgroup_update_stats_from_task(cgrp, tsk);
 	}
 	css_task_iter_end(&it);
 
+	for (i = 0; i < RLIM_NLIMITS; i++)
+		stats->resource_hiwater[i] = cgrp->stats.resource_hiwater[i];
+
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 03/14] resource limits: track highwater mark of file sizes
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
  2016-07-15 10:35 ` [PATCH 01/14] resource limits: foundation for resource highwater tracking Topi Miettinen
  2016-07-15 10:35 ` [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 04/14] resource limits: track highwater mark of VM data segment Topi Miettinen
                   ` (11 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Alexander Viro,
	open list:FILESYSTEMS (VFS and infrastructure)

Track the maximum size of files created, so that RLIMIT_FSIZE
resource limits can be configured. The information is available
through the taskstats and cgroupstats netlink interfaces.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 fs/attr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/attr.c b/fs/attr.c
index 25b24d0..546f4f9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -116,6 +116,8 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset)
 			return -ETXTBSY;
 	}
 
+	update_resource_highwatermark(RLIMIT_FSIZE, offset);
+
 	return 0;
 out_sig:
 	send_sig(SIGXFSZ, current, 0);
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 04/14] resource limits: track highwater mark of VM data segment
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (2 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 03/14] resource limits: track highwater mark of file sizes Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 05/14] resource limits: track highwater mark of stack size Topi Miettinen
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	Alexander Viro, Andrew Morton, Michal Hocko, Vlastimil Babka,
	Ben Segall, Alex Thorlton, Mateusz Guzik, John Stultz,
	Kirill A. Shutemov, Oleg Nesterov, Chen Gang,
	Konstantin Khlebnikov, Andrea Arcangeli, Andrey Ryabinin,
	open list:FILESYSTEMS (VFS and infrastructure),
	open list:MEMORY MANAGEMENT

Track the maximum size of the data VM, so that RLIMIT_DATA
resource limits can be configured. The information is available
through the taskstats and cgroupstats netlink interfaces.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 arch/x86/ia32/ia32_aout.c | 2 ++
 fs/binfmt_aout.c          | 2 ++
 fs/binfmt_flat.c          | 2 ++
 kernel/sys.c              | 3 +++
 mm/mmap.c                 | 7 ++++++-
 5 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index cb26f18..9236254 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/jiffies.h>
 #include <linux/perf_event.h>
+#include <linux/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
@@ -398,6 +399,7 @@ beyond_if:
 	regs->r8 = regs->r9 = regs->r10 = regs->r11 =
 	regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
 	set_fs(USER_DS);
+	update_resource_highwatermark(RLIMIT_DATA, ex.a_data + ex.a_bss);
 	return 0;
 }
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ae1b540..49216f4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/coredump.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -330,6 +331,7 @@ beyond_if:
 	regs->gp = ex.a_gpvalue;
 #endif
 	start_thread(regs, ex.a_entry, current->mm->start_stack);
+	update_resource_highwatermark(RLIMIT_DATA, ex.a_data + ex.a_bss);
 	return 0;
 }
 
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index caf9e39..19c2212 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/flat.h>
 #include <linux/syscalls.h>
+#include <linux/sched.h>
 
 #include <asm/byteorder.h>
 #include <asm/uaccess.h>
@@ -792,6 +793,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			libinfo->lib_list[id].start_brk) +	/* start brk */
 			stack_len);
 
+	update_resource_highwatermark(RLIMIT_DATA, data_len + bss_len);
 	return 0;
 err:
 	return ret;
diff --git a/kernel/sys.c b/kernel/sys.c
index 89d5be4..d84c87e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1896,6 +1896,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
 	if (prctl_map.auxv_size)
 		memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
 
+	update_resource_highwatermark(RLIMIT_DATA, mm->end_data -
+				      mm->start_data);
+
 	up_write(&mm->mmap_sem);
 	return 0;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index de2c176..0b10f56 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -228,6 +228,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 set_brk:
+	update_resource_highwatermark(RLIMIT_DATA, (brk - mm->start_brk) +
+				      (mm->end_data - mm->start_data));
 	mm->brk = brk;
 	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
 	up_write(&mm->mmap_sem);
@@ -2924,8 +2926,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 		mm->exec_vm += npages;
 	else if (is_stack_mapping(flags))
 		mm->stack_vm += npages;
-	else if (is_data_mapping(flags))
+	else if (is_data_mapping(flags)) {
 		mm->data_vm += npages;
+		update_resource_highwatermark(RLIMIT_DATA,
+					      mm->data_vm << PAGE_SHIFT);
+	}
 }
 
 static int special_mapping_fault(struct vm_area_struct *vma,
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 05/14] resource limits: track highwater mark of stack size
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (3 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 04/14] resource limits: track highwater mark of VM data segment Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 06/14] resource limits: track highwater mark of cores dumped Topi Miettinen
                   ` (9 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Chen Gang, Michal Hocko, Konstantin Khlebnikov, Andrea Arcangeli,
	Andrey Ryabinin, open list:MEMORY MANAGEMENT

Track the maximum stack size, so that RLIMIT_STACK
resource limits can be configured. The information is available
through the taskstats and cgroupstats netlink interfaces.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 mm/mmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/mmap.c b/mm/mmap.c
index 0b10f56..305c456 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2019,6 +2019,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 	if (security_vm_enough_memory_mm(mm, grow))
 		return -ENOMEM;
 
+	update_resource_highwatermark(RLIMIT_STACK, actual_size);
+
 	return 0;
 }
 
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 06/14] resource limits: track highwater mark of cores dumped
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (4 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 05/14] resource limits: track highwater mark of stack size Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 07/14] resource limits: track highwater mark of user processes Topi Miettinen
                   ` (8 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Alexander Viro,
	open list:FILESYSTEMS (VFS and infrastructure)

Track the maximum size of the core dump written, so that RLIMIT_CORE
resource limits can be configured. The information is available
through the taskstats and cgroupstats netlink interfaces.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 fs/coredump.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 281b768..a0ace88 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -784,20 +784,25 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 	struct file *file = cprm->file;
 	loff_t pos = file->f_pos;
 	ssize_t n;
+	int r = 0;
+
 	if (cprm->written + nr > cprm->limit)
 		return 0;
 	while (nr) {
 		if (dump_interrupted())
-			return 0;
+			goto err;
 		n = __kernel_write(file, addr, nr, &pos);
 		if (n <= 0)
-			return 0;
+			goto err;
 		file->f_pos = pos;
 		cprm->written += n;
 		cprm->pos += n;
 		nr -= n;
 	}
-	return 1;
+	r = 1;
+ err:
+	update_resource_highwatermark(RLIMIT_CORE, cprm->written);
+	return r;
 }
 EXPORT_SYMBOL(dump_emit);
 
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 07/14] resource limits: track highwater mark of user processes
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (5 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 06/14] resource limits: track highwater mark of cores dumped Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 08/14] resource limits: track highwater mark of number of files Topi Miettinen
                   ` (7 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Ingo Molnar, Peter Zijlstra, Tejun Heo, Li Zefan,
	Johannes Weiner, Andrew Morton, Michal Hocko, Vladimir Davydov,
	Joe Perches, Frederic Weisbecker, Andrea Arcangeli,
	Eric W. Biederman, Andi Kleen, Oleg Nesterov, Cyrill Gorcunov,
	Mateusz Guzik, John Stultz, Ben Segall, Rik van Riel,
	Thomas Gleixner, open list:CONTROL GROUP (CGROUP)

Track the maximum number of processes per user, so that RLIMIT_NPROC
resource limits can be configured. The information is available
through the taskstats and cgroupstats netlink interfaces.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 include/linux/sched.h | 30 ++++++++++++++++++++++++++++++
 kernel/cgroup.c       | 31 +++++++++++++++++++++++++++----
 kernel/cred.c         |  1 +
 kernel/fork.c         |  2 ++
 kernel/sys.c          |  2 ++
 kernel/tsacct.c       | 23 ++++++++++++++++++++++-
 6 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e4d7482..d6af49b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -840,6 +840,7 @@ static inline int signal_group_exit(const struct signal_struct *sig)
 struct user_struct {
 	atomic_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
+	atomic_t max_processes;	/* How many processes has this user had at the same time? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
 #ifdef CONFIG_INOTIFY_USER
 	atomic_t inotify_watches; /* How many inotify watches does this user have? */
@@ -3344,6 +3345,27 @@ static inline void update_resource_highwatermark(unsigned int limit,
 {
 	task_update_resource_highwatermark(current, limit, r);
 }
+
+static inline void user_update_maxproc_highwatermark(struct user_struct *u)
+{
+	int processes;
+
+	processes = atomic_read(&u->processes);
+	if (atomic_read(&u->max_processes) < processes)
+		atomic_set(&u->max_processes, processes);
+}
+
+static inline void task_update_maxproc_highwatermark(struct task_struct *t)
+{
+	const struct cred *tcred;
+
+	rcu_read_lock();
+	tcred = __task_cred(t);
+
+	user_update_maxproc_highwatermark(tcred->user);
+
+	rcu_read_unlock();
+}
 #else
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
@@ -3370,6 +3392,14 @@ static inline void update_resource_highwatermark(unsigned int limit,
 						 unsigned long r)
 {
 }
+
+static inline void user_update_maxproc_highwatermark(struct user_struct *u)
+{
+}
+
+static inline void task_update_maxproc_highwatermark(struct task_struct *t)
+{
+}
 #endif
 
 #ifndef TASK_SIZE_OF
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9b2d805..38a272f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4660,6 +4660,23 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	return 0;
 }
 
+static void cgroup_update_maxproc_highwatermark(struct cgroup *cgrp,
+						struct task_struct *t)
+{
+	const struct cred *tcred;
+	struct user_struct *u;
+	int max_processes;
+
+	tcred = __task_cred(t);
+	u = tcred->user;
+
+	user_update_maxproc_highwatermark(u);
+
+	max_processes = atomic_read(&u->max_processes);
+	if (cgrp->stats.resource_hiwater[RLIMIT_NPROC] < max_processes)
+		cgrp->stats.resource_hiwater[RLIMIT_NPROC] = max_processes;
+}
+
 /*
  * Update cgroupstats based on the stats from exiting task
  */
@@ -4678,10 +4695,16 @@ static void cgroup_update_stats_from_task(struct cgroup *cgrp,
 		seq = nextseq;
 		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 		for (i = 0; i < RLIM_NLIMITS; i++)
-			if (cgrp->stats.resource_hiwater[i] <
-			    sig->resource_highwatermark[i])
-				cgrp->stats.resource_hiwater[i] =
-					sig->resource_highwatermark[i];
+			switch(i) {
+			case RLIMIT_NPROC:
+				cgroup_update_maxproc_highwatermark(cgrp, tsk);
+				break;
+			default:
+				if (cgrp->stats.resource_hiwater[i] <
+				    sig->resource_highwatermark[i])
+					cgrp->stats.resource_hiwater[i] =
+						sig->resource_highwatermark[i];
+			}
 
 		/* If lockless access failed, take the lock. */
 		nextseq = 1;
diff --git a/kernel/cred.c b/kernel/cred.c
index 0c0cd8a..e12ab6e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -467,6 +467,7 @@ int commit_creds(struct cred *new)
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user)
 		atomic_dec(&old->user->processes);
+	user_update_maxproc_highwatermark(new->user);
 	alter_cred_subscribers(old, -2);
 
 	/* send notifications */
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a7ec0c..3f636c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1640,6 +1640,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		nr_threads++;
 	}
 
+	task_update_maxproc_highwatermark(p);
+
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	syscall_tracepoint_update(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index d84c87e..f1def17 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -439,6 +439,8 @@ static int set_user(struct cred *new)
 	else
 		current->flags &= ~PF_NPROC_EXCEEDED;
 
+	user_update_maxproc_highwatermark(new_user);
+
 	free_uid(new->user);
 	new->user = new_user;
 	return 0;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 231bae3..9fd4cef 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -184,6 +184,19 @@ void acct_clear_integrals(struct task_struct *tsk)
 	tsk->acct_vm_mem1 = 0;
 }
 
+static __u64 task_get_maxproc_highwatermark(struct task_struct *t)
+{
+	const struct cred *tcred;
+	struct user_struct *u;
+
+	tcred = __task_cred(t);
+	u = tcred->user;
+
+	user_update_maxproc_highwatermark(u);
+
+	return (__u64)atomic_read(&u->max_processes);
+}
+
 /*
  * fill in resource accounting fields
  */
@@ -201,7 +214,15 @@ void racct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 		seq = nextseq;
 		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 		for (i = 0; i < RLIM_NLIMITS; i++)
-			stats->resource_hiwater[i] = (__u64)sig->resource_highwatermark[i];
+			switch(i) {
+			case RLIMIT_NPROC:
+				stats->resource_hiwater[i] =
+					task_get_maxproc_highwatermark(tsk);
+				break;
+			default:
+				stats->resource_hiwater[i] =
+					(__u64)sig->resource_highwatermark[i];
+			}
 
 		/* If lockless access failed, take the lock. */
 		nextseq = 1;
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 08/14] resource limits: track highwater mark of number of files
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (6 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 07/14] resource limits: track highwater mark of user processes Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 10/14] resource limits: track highwater mark of address space size Topi Miettinen
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Alexander Viro,
	open list:FILESYSTEMS (VFS and infrastructure)

Track maximum number of files for the process, to be able to configure
RLIMIT_NOFILE resource limits. The information is available
with taskstats and cgroupstats netlink socket.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 fs/file.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/file.c b/fs/file.c
index 6b1acdf..9de37c9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -547,6 +547,8 @@ repeat:
 	}
 #endif
 
+	update_resource_highwatermark(RLIMIT_NOFILE, fd);
+
 out:
 	spin_unlock(&files->file_lock);
 	return error;
@@ -857,6 +859,8 @@ __releases(&files->file_lock)
 	if (tofree)
 		filp_close(tofree, files);
 
+	update_resource_highwatermark(RLIMIT_NOFILE, fd);
+
 	return fd;
 
 Ebusy:
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 10/14] resource limits: track highwater mark of address space size
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (7 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 08/14] resource limits: track highwater mark of number of files Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 11/14] resource limits: track highwater mark of number of pending signals Topi Miettinen
                   ` (5 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Chen Gang, Michal Hocko, Konstantin Khlebnikov, Andrea Arcangeli,
	Andrey Ryabinin, David Rientjes, Vlastimil Babka, Hugh Dickins,
	Alexander Kuleshov, open list:MEMORY MANAGEMENT

Track maximum size of address space, to be able to configure
RLIMIT_AS resource limits. The information is available
with taskstats and cgroupstats netlink socket.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 mm/mmap.c   | 4 ++++
 mm/mremap.c | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/mm/mmap.c b/mm/mmap.c
index c37f599..ded2f8d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2707,6 +2707,9 @@ static int do_brk(unsigned long addr, unsigned long len)
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
+
+	update_resource_highwatermark(RLIMIT_AS, mm->total_vm << PAGE_SHIFT);
+
 	mm->data_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
@@ -2927,6 +2930,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
 void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 {
 	mm->total_vm += npages;
+	update_resource_highwatermark(RLIMIT_AS, mm->total_vm << PAGE_SHIFT);
 
 	if (is_exec_mapping(flags))
 		mm->exec_vm += npages;
diff --git a/mm/mremap.c b/mm/mremap.c
index f1821335..aa717d0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -398,6 +398,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 		update_resource_highwatermark(RLIMIT_MEMLOCK,
 					      (mm->locked_vm << PAGE_SHIFT) +
 					      new_len - old_len);
+	update_resource_highwatermark(RLIMIT_AS, (mm->total_vm << PAGE_SHIFT) +
+				      new_len - old_len);
+
 	return vma;
 }
 
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 11/14] resource limits: track highwater mark of number of pending signals
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (8 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 10/14] resource limits: track highwater mark of address space size Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:35 ` [PATCH 12/14] resource limits: track highwater mark of size of message queues Topi Miettinen
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Andrew Morton, Ingo Molnar, Oleg Nesterov,
	Amanieu d'Antras, Stas Sergeev, Andy Lutomirski,
	Wang Xiaoqiang, Sasha Levin, Dave Hansen

Track maximum number of pending signals, to be able to configure
RLIMIT_SIGPENDING resource limits. The information is available
with taskstats and cgroupstats netlink socket.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 kernel/signal.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/signal.c b/kernel/signal.c
index 96e9bc4..670d609 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -387,6 +387,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
 		q->user = user;
+		task_update_resource_highwatermark(t, RLIMIT_SIGPENDING,
+						   atomic_read(&user->sigpending));
 	}
 
 	return q;
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 12/14] resource limits: track highwater mark of size of message queues
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (9 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 11/14] resource limits: track highwater mark of number of pending signals Topi Miettinen
@ 2016-07-15 10:35 ` Topi Miettinen
  2016-07-15 10:36 ` [PATCH 13/14] resource limits: track highwater mark of niceness Topi Miettinen
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Topi Miettinen, Michal Hocko, Andrew Morton, Marcus Gelderie,
	Doug Ledford, Al Viro, Kirill A. Shutemov, Vladimir Davydov

Track maximum size of message queues, to be able to configure
RLIMIT_MSGQUEUE resource limits. The information is available
with taskstats and cgroupstats netlink socket.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 ipc/mqueue.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ade739f..037ea47 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -287,6 +287,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 
 		/* all is ok */
 		info->user = get_uid(u);
+		update_resource_highwatermark(RLIMIT_MSGQUEUE, u->mq_bytes);
 	} else if (S_ISDIR(mode)) {
 		inc_nlink(inode);
 		/* Some things misbehave if size == 0 on a directory */
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 13/14] resource limits: track highwater mark of niceness
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (10 preceding siblings ...)
  2016-07-15 10:35 ` [PATCH 12/14] resource limits: track highwater mark of size of message queues Topi Miettinen
@ 2016-07-15 10:36 ` Topi Miettinen
  2016-07-15 10:36 ` [PATCH 14/14] resource limits: track highwater mark of RT priority Topi Miettinen
                   ` (2 subsequent siblings)
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:36 UTC (permalink / raw)
  To: linux-kernel; +Cc: Topi Miettinen, Ingo Molnar, Peter Zijlstra

Track maximum nice priority, to be able to configure
RLIMIT_NICE resource limits. The information is available
with taskstats and cgroupstats netlink socket.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97ee9ac..da32bcd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3693,6 +3693,9 @@ void set_user_nice(struct task_struct *p, long nice)
 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
 			resched_curr(rq);
 	}
+	task_update_resource_highwatermark(p, RLIMIT_NICE,
+					   nice_to_rlimit(nice));
+
 out_unlock:
 	task_rq_unlock(rq, p, &rf);
 }
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 14/14] resource limits: track highwater mark of RT priority
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (11 preceding siblings ...)
  2016-07-15 10:36 ` [PATCH 13/14] resource limits: track highwater mark of niceness Topi Miettinen
@ 2016-07-15 10:36 ` Topi Miettinen
  2016-07-15 17:42 ` [PATCH 00/14] Present useful limits to user (v2) Topi Miettinen
       [not found] ` <20160715124330.GR30154@twins.programming.kicks-ass.net>
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 10:36 UTC (permalink / raw)
  To: linux-kernel; +Cc: Topi Miettinen, Ingo Molnar, Peter Zijlstra

Track maximum RT priority, to be able to configure
RLIMIT_RTPRIO resource limits. The information is available
with taskstats and cgroupstats netlink socket.

Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da32bcd..bb08b43 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4221,6 +4221,9 @@ change:
 	balance_callback(rq);
 	preempt_enable();
 
+	task_update_resource_highwatermark(p, RLIMIT_RTPRIO,
+					   attr->sched_priority);
+
 	return 0;
 }
 
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 01/14] resource limits: foundation for resource highwater tracking
  2016-07-15 10:35 ` [PATCH 01/14] resource limits: foundation for resource highwater tracking Topi Miettinen
@ 2016-07-15 12:12   ` kbuild test robot
  2016-07-15 12:49   ` Nicolas Dichtel
  1 sibling, 0 replies; 26+ messages in thread
From: kbuild test robot @ 2016-07-15 12:12 UTC (permalink / raw)
  To: Topi Miettinen
  Cc: kbuild-all, linux-kernel, Topi Miettinen, Jonathan Corbet,
	Ingo Molnar, Peter Zijlstra, Balbir Singh, David S. Miller,
	Nicolas Dichtel, Markus Elfring, Thomas Gleixner, Rik van Riel,
	open list:DOCUMENTATION

[-- Attachment #1: Type: text/plain, Size: 1280 bytes --]

Hi,

[auto build test ERROR on v4.7-rc7]
[also build test ERROR on next-20160715]
[cannot apply to tip/sched/core rdma/master]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Topi-Miettinen/Present-useful-limits-to-user-v2/20160715-194333
config: i386-tinyconfig (attached as .config)
compiler: gcc-6 (Debian 6.1.1-1) 6.1.1 20160430
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

>> Documentation/accounting/getdelays.c:29:42: fatal error: include/uapi/linux/taskstats.h: No such file or directory
    #include "include/uapi/linux/taskstats.h"
                                             ^
   compilation terminated.

vim +29 Documentation/accounting/getdelays.c

    23	#include <sys/socket.h>
    24	#include <sys/wait.h>
    25	#include <sys/resource.h>
    26	#include <signal.h>
    27	
    28	#include <linux/genetlink.h>
  > 29	#include "include/uapi/linux/taskstats.h"
    30	#include <linux/cgroupstats.h>
    31	
    32	/*

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 6334 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level
  2016-07-15 10:35 ` [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level Topi Miettinen
@ 2016-07-15 12:38   ` kbuild test robot
  2016-07-15 14:10   ` Tejun Heo
  1 sibling, 0 replies; 26+ messages in thread
From: kbuild test robot @ 2016-07-15 12:38 UTC (permalink / raw)
  To: Topi Miettinen
  Cc: kbuild-all, linux-kernel, Topi Miettinen, Jonathan Corbet,
	Tejun Heo, Li Zefan, Johannes Weiner, Markus Elfring,
	David S. Miller, Nicolas Dichtel, open list:DOCUMENTATION,
	open list:CONTROL GROUP (CGROUP)

[-- Attachment #1: Type: text/plain, Size: 2092 bytes --]

Hi,

[auto build test ERROR on v4.7-rc7]
[also build test ERROR on next-20160715]
[cannot apply to tip/sched/core rdma/master]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Topi-Miettinen/Present-useful-limits-to-user-v2/20160715-194333
config: i386-randconfig-s1-201628 (attached as .config)
compiler: gcc-6 (Debian 6.1.1-1) 6.1.1 20160430
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   kernel/cgroup.c: In function 'cgroup_update_stats_from_task':
>> kernel/cgroup.c:4681:12: error: 'struct cgroup' has no member named 'stats'
       if (cgrp->stats.resource_hiwater[i] <
               ^~
>> kernel/cgroup.c:4682:11: error: 'struct signal_struct' has no member named 'resource_highwatermark'
           sig->resource_highwatermark[i])
              ^~
   kernel/cgroup.c:4683:9: error: 'struct cgroup' has no member named 'stats'
        cgrp->stats.resource_hiwater[i] =
            ^~
   kernel/cgroup.c:4684:9: error: 'struct signal_struct' has no member named 'resource_highwatermark'
         sig->resource_highwatermark[i];
            ^~
   kernel/cgroup.c: In function 'cgroupstats_build':
   kernel/cgroup.c:4773:36: error: 'struct cgroup' has no member named 'stats'
      stats->resource_hiwater[i] = cgrp->stats.resource_hiwater[i];
                                       ^~

vim +4681 kernel/cgroup.c

  4675		/* Attempt a lockless read on the first round. */
  4676		nextseq = 0;
  4677		do {
  4678			seq = nextseq;
  4679			flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
  4680			for (i = 0; i < RLIM_NLIMITS; i++)
> 4681				if (cgrp->stats.resource_hiwater[i] <
> 4682				    sig->resource_highwatermark[i])
  4683					cgrp->stats.resource_hiwater[i] =
  4684						sig->resource_highwatermark[i];
  4685	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 22648 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 01/14] resource limits: foundation for resource highwater tracking
  2016-07-15 10:35 ` [PATCH 01/14] resource limits: foundation for resource highwater tracking Topi Miettinen
  2016-07-15 12:12   ` kbuild test robot
@ 2016-07-15 12:49   ` Nicolas Dichtel
  2016-07-15 16:27     ` Topi Miettinen
  1 sibling, 1 reply; 26+ messages in thread
From: Nicolas Dichtel @ 2016-07-15 12:49 UTC (permalink / raw)
  To: Topi Miettinen, linux-kernel
  Cc: Jonathan Corbet, Ingo Molnar, Peter Zijlstra, Balbir Singh,
	David S. Miller, Markus Elfring, Thomas Gleixner, Rik van Riel,
	open list:DOCUMENTATION

Le 15/07/2016 12:35, Topi Miettinen a écrit :
> There are many basic ways to control processes, including capabilities,
> cgroups and resource limits. However, there are far fewer ways to find out
> useful values for the limits, except blind trial and error.
> 
> Prepare a foundation for resource highwater tracking.
> 
> The collected highwater marks for the resources can be seen using
> taskstats netlink interface.
> 
> This depends on CONFIG_TASK_XACCT.
> 
> Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
> ---
[snip]
> @@ -63,6 +65,8 @@ int print_task_context_switch_counts;
>  /* Maximum number of cpus expected to be specified in a cpumask */
>  #define MAX_CPUS	32
>  
> +#define TASKSTATS_VERSION_WITH_RESOURCE	9
> +
>  struct msgtemplate {
>  	struct nlmsghdr n;
>  	struct genlmsghdr g;
[snip]
> @@ -252,6 +276,22 @@ static void print_ioacct(struct taskstats *t)
>  		(unsigned long long)t->cancelled_write_bytes);
>  }
>  
> +static void print_racct(const struct taskstats *t)
> +{
> +	int i;
> +
> +	if (t->version < TASKSTATS_VERSION_WITH_RESOURCE) {
> +		printf("kernel too old (%d < %d)\n", t->version,
> +		       TASKSTATS_VERSION_WITH_RESOURCE);
> +		return;
> +	}

[snip]
> diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
> index 2466e55..8c65194 100644
> --- a/include/uapi/linux/taskstats.h
> +++ b/include/uapi/linux/taskstats.h
> @@ -33,7 +33,7 @@
>   */
>  
>  
> -#define TASKSTATS_VERSION	8
> +#define TASKSTATS_VERSION	9
>  #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
>  					 * in linux/sched.h */
>  
> @@ -163,6 +163,14 @@ struct taskstats {
>  	/* Delay waiting for memory reclaim */
>  	__u64	freepages_count;
>  	__u64	freepages_delay_total;
> +	/* Per-task storage I/O accounting ends */
> +
> +#define TASKSTATS_HAS_LIMIT_ACCOUNTING
> +	/* Per-task resource accounting starts */
> +	__u64   resource_hiwater[RLIM_NLIMITS]; /* high-watermark of
> +						     RLIMIT
> +						     resources */
> +	/* Per-task resource accounting ends */
>  };
Why playing with version number? It complexifies the (userland) code and
existing applications break when the kernel is updated.
Goal of netlink is to be easily extensible. By adding a new attribute, existing
userspace tools won't break.


Regards,
Nicolas

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level
  2016-07-15 10:35 ` [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level Topi Miettinen
  2016-07-15 12:38   ` kbuild test robot
@ 2016-07-15 14:10   ` Tejun Heo
  2016-07-15 17:15     ` Topi Miettinen
  1 sibling, 1 reply; 26+ messages in thread
From: Tejun Heo @ 2016-07-15 14:10 UTC (permalink / raw)
  To: Topi Miettinen
  Cc: linux-kernel, Jonathan Corbet, Li Zefan, Johannes Weiner,
	Markus Elfring, David S. Miller, Nicolas Dichtel,
	open list:DOCUMENTATION, open list:CONTROL GROUP (CGROUP)

Hello, Topi.

On Fri, Jul 15, 2016 at 01:35:49PM +0300, Topi Miettinen wrote:
> Collect resource usage highwater marks of a task to cgroup
> statistics when the task exits.

I'm not sure how this makes sense.  The limits are enforced and
collected per user or along the process hierarchy which can be very
different from cgroup organization.  What does collecting high
watermarks from orthogonal structure, sometimes even combining
per-user numbers from different users, even mean?  These are numbers
without clear semantics.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 01/14] resource limits: foundation for resource highwater tracking
  2016-07-15 12:49   ` Nicolas Dichtel
@ 2016-07-15 16:27     ` Topi Miettinen
  2016-07-15 17:57       ` Nicolas Dichtel
  0 siblings, 1 reply; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 16:27 UTC (permalink / raw)
  To: nicolas.dichtel, linux-kernel
  Cc: Jonathan Corbet, Ingo Molnar, Peter Zijlstra, Balbir Singh,
	David S. Miller, Markus Elfring, Thomas Gleixner, Rik van Riel,
	open list:DOCUMENTATION

On 07/15/16 12:49, Nicolas Dichtel wrote:
> Le 15/07/2016 12:35, Topi Miettinen a écrit :
>> There are many basic ways to control processes, including capabilities,
>> cgroups and resource limits. However, there are far fewer ways to find out
>> useful values for the limits, except blind trial and error.
>>
>> Prepare a foundation for resource highwater tracking.
>>
>> The collected highwater marks for the resources can be seen using
>> taskstats netlink interface.
>>
>> This depends on CONFIG_TASK_XACCT.
>>
>> Signed-off-by: Topi Miettinen <toiwoton@gmail.com>
>> ---
> [snip]
>> @@ -63,6 +65,8 @@ int print_task_context_switch_counts;
>>  /* Maximum number of cpus expected to be specified in a cpumask */
>>  #define MAX_CPUS	32
>>  
>> +#define TASKSTATS_VERSION_WITH_RESOURCE	9
>> +
>>  struct msgtemplate {
>>  	struct nlmsghdr n;
>>  	struct genlmsghdr g;
> [snip]
>> @@ -252,6 +276,22 @@ static void print_ioacct(struct taskstats *t)
>>  		(unsigned long long)t->cancelled_write_bytes);
>>  }
>>  
>> +static void print_racct(const struct taskstats *t)
>> +{
>> +	int i;
>> +
>> +	if (t->version < TASKSTATS_VERSION_WITH_RESOURCE) {
>> +		printf("kernel too old (%d < %d)\n", t->version,
>> +		       TASKSTATS_VERSION_WITH_RESOURCE);
>> +		return;
>> +	}
> 
> [snip]
>> diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
>> index 2466e55..8c65194 100644
>> --- a/include/uapi/linux/taskstats.h
>> +++ b/include/uapi/linux/taskstats.h
>> @@ -33,7 +33,7 @@
>>   */
>>  
>>  
>> -#define TASKSTATS_VERSION	8
>> +#define TASKSTATS_VERSION	9
>>  #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
>>  					 * in linux/sched.h */
>>  
>> @@ -163,6 +163,14 @@ struct taskstats {
>>  	/* Delay waiting for memory reclaim */
>>  	__u64	freepages_count;
>>  	__u64	freepages_delay_total;
>> +	/* Per-task storage I/O accounting ends */
>> +
>> +#define TASKSTATS_HAS_LIMIT_ACCOUNTING
>> +	/* Per-task resource accounting starts */
>> +	__u64   resource_hiwater[RLIM_NLIMITS]; /* high-watermark of
>> +						     RLIMIT
>> +						     resources */
>> +	/* Per-task resource accounting ends */
>>  };
> Why playing with version number? It complexifies the (userland) code and
> existing applications break when the kernel is updated.
> Goal of netlink is to be easily extensible. By adding a new attribute, existing
> userspace tools won't break.

I just followed this text in taskstats.h. Does that give wrong advice?

 * The struct is versioned. Newer versions should only add fields to
 * the bottom of the struct to maintain backward compatibility.
 *
 *
 * To add new fields
 *	a) bump up TASKSTATS_VERSION
 *	b) add comment indicating new version number at end of struct
 *	c) add new fields after version comment; maintain 64-bit alignment

-Topi

> 
> 
> Regards,
> Nicolas
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level
  2016-07-15 14:10   ` Tejun Heo
@ 2016-07-15 17:15     ` Topi Miettinen
  2016-07-18 22:52       ` Tejun Heo
  0 siblings, 1 reply; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 17:15 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, Jonathan Corbet, Li Zefan, Johannes Weiner,
	Markus Elfring, David S. Miller, Nicolas Dichtel,
	open list:DOCUMENTATION, open list:CONTROL GROUP (CGROUP)

On 07/15/16 14:10, Tejun Heo wrote:
> Hello, Topi.
> 
> On Fri, Jul 15, 2016 at 01:35:49PM +0300, Topi Miettinen wrote:
>> Collect resource usage highwater marks of a task to cgroup
>> statistics when the task exits.
> 
> I'm not sure how this makes sense.  The limits are enforced and
> collected per user or along the process hierarchy which can be very
> different from cgroup organization.  What does collecting high
> watermarks from orthogonal structure, sometimes even combining
> per-user numbers from different users, even mean?  These are numbers
> without clear semantics.

There are clear semantics for the limits themselves, either they apply
per task or per user. It makes sense to gather values according to these
semantics. Then with systemd or other tools you can use the values to
set the limits for a service regardless if the limit applies per task or
per user and it works according to each limit's semantics.

cgroups are used to aggregate values from a group of tasks, which still
are related to one service. Because with systemd the services also are
given a cgroup context, the values will completely make sense there too.

It could be useful to introduce a new set of limits that apply only at
the cgroup level. It would not remove the need to aggregate values from a
group of tasks.

-Topi

> 
> Thanks.
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 00/14] Present useful limits to user (v2)
       [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
                   ` (12 preceding siblings ...)
  2016-07-15 10:36 ` [PATCH 14/14] resource limits: track highwater mark of RT priority Topi Miettinen
@ 2016-07-15 17:42 ` Topi Miettinen
       [not found] ` <20160715124330.GR30154@twins.programming.kicks-ass.net>
  14 siblings, 0 replies; 26+ messages in thread
From: Topi Miettinen @ 2016-07-15 17:42 UTC (permalink / raw)
  To: linux-kernel

(Resending just to lkml due to message header limits)

Hello,

There are many basic ways to control processes, including capabilities,
cgroups and resource limits. However, there are far fewer ways to find out
useful values for the limits, except blind trial and error.

This patch series attempts to fix that by giving at least a nice starting
point from the highwater mark values of the resources in question.
I looked where each limit is checked and added a call to update the mark
nearby.

Example run of program from Documentation/accounting/getdelays.c:

./getdelays -R -p `pidof smartd`
printing resource accounting
RLIMIT_CPU=0
RLIMIT_FSIZE=0
RLIMIT_DATA=18198528
RLIMIT_STACK=135168
RLIMIT_CORE=0
RLIMIT_RSS=0
RLIMIT_NPROC=1
RLIMIT_NOFILE=55
RLIMIT_MEMLOCK=0
RLIMIT_AS=130879488
RLIMIT_LOCKS=0
RLIMIT_SIGPENDING=0
RLIMIT_MSGQUEUE=0
RLIMIT_NICE=0
RLIMIT_RTPRIO=0
RLIMIT_RTTIME=0

./getdelays -R -C /sys/fs/cgroup/systemd/system.slice/smartd.service/
printing resource accounting
sleeping 1, blocked 0, running 0, stopped 0, uninterruptible 0
RLIMIT_CPU=0
RLIMIT_FSIZE=0
RLIMIT_DATA=18198528
RLIMIT_STACK=135168
RLIMIT_CORE=0
RLIMIT_RSS=0
RLIMIT_NPROC=1
RLIMIT_NOFILE=55
RLIMIT_MEMLOCK=0
RLIMIT_AS=130879488
RLIMIT_LOCKS=0
RLIMIT_SIGPENDING=0
RLIMIT_MSGQUEUE=0
RLIMIT_NICE=0
RLIMIT_RTPRIO=0
RLIMIT_RTTIME=0

In this example, smartd is running as a non-root user. The presented
values can be used as a starting point for giving new limits to the
service.

There's one problem with the patch 07/13: kernel initialization calls
create_worker(), which seems to use a different locking model or something:

[    0.145410] =========================================================
[    0.148000] [ INFO: possible irq lock inversion dependency detected ]
[    0.148000] 4.7.0-rc7+ #155 Not tainted
[    0.148000] ---------------------------------------------------------
[    0.148000] swapper/0/1 just changed the state of lock:
[    0.148000]  (&(&(&sig->stats_lock)->lock)->rlock){+.....}, at:
[<ffffffff810bf769>] __sched_setscheduler+0x339/0xbd0
[    0.148000] but this lock was taken by another, HARDIRQ-safe lock in
the past:
[    0.148000]  (&rq->lock){-.....}

and interrupts could create inverse lock ordering between them.

[    0.148000] [    0.148000] other info that might help us debug this:
[    0.148000]  Possible interrupt unsafe locking scenario:
[    0.148000] [    0.148000]        CPU0                    CPU1
[    0.148000]        ----                    ----
[    0.148000]   lock(&(&(&sig->stats_lock)->lock)->rlock);
[    0.148000]                                local_irq_disable();
[    0.148000]                                lock(&rq->lock);
[    0.148000]
lock(&(&(&sig->stats_lock)->lock)->rlock);
[    0.148000]   <Interrupt>
[    0.148000]     lock(&rq->lock);
[    0.148000] [    0.148000]  *** DEADLOCK ***
[    0.148000] [    0.148000] 2 locks held by swapper/0/1:
[    0.148000]  #0:  (cpu_hotplug.lock){.+.+.+}, at:
[<ffffffff81092824>] get_online_cpus+0x24/0x70
[    0.148000]  #1:  (smpboot_threads_lock){+.+.+.}, at:
[<ffffffff810ba517>] smpboot_register_percpu_thread_cpumask+0x37/0xf0
[    0.148000] [    0.148000] the shortest dependencies between 2nd lock
and 1st lock:
[    0.148000]  -> (&rq->lock){-.....} ops: 181 {
[    0.148000]     IN-HARDIRQ-W at:
[    0.148000]                       [<ffffffff810e8439>]
__lock_acquire+0x6e9/0x1440
[    0.148000]                       [<ffffffff810e95d3>]
lock_acquire+0xe3/0x1c0
[    0.148000]                       [<ffffffff818cf661>]
_raw_spin_lock+0x31/0x40
[    0.148000]                       [<ffffffff810c3a41>]
scheduler_tick+0x41/0xd0
[    0.148000]                       [<ffffffff81110471>]
update_process_times+0x51/0x60
[    0.148000]                       [<ffffffff8111fa4f>]
tick_periodic+0x2f/0xc0
[    0.148000]                       [<ffffffff8111fb05>]
tick_handle_periodic+0x25/0x70
[    0.148000]                       [<ffffffff8101ebf5>]
timer_interrupt+0x15/0x20
[    0.148000]                       [<ffffffff810fc731>]
handle_irq_event_percpu+0x41/0x320
[    0.148000]                       [<ffffffff810fca49>]
handle_irq_event+0x39/0x60
[    0.148000]                       [<ffffffff810ffe08>]
handle_level_irq+0x88/0x110
[    0.148000]                       [<ffffffff8101e58a>]
handle_irq+0x1a/0x30
[    0.148000]                       [<ffffffff818d2281>] do_IRQ+0x61/0x120
[    0.148000]                       [<ffffffff818d0949>]
ret_from_intr+0x0/0x19
[    0.148000]                       [<ffffffff810fe969>]
__setup_irq+0x3f9/0x5e0
[    0.148000]                       [<ffffffff810feb96>]
setup_irq+0x46/0xa0
[    0.148000]                       [<ffffffff821878e2>]
setup_default_timer_irq+0x1e/0x20
[    0.148000]                       [<ffffffff821878fb>]
hpet_time_init+0x17/0x19
[    0.148000]                       [<ffffffff821878bd>]
x86_late_time_init+0xa/0x11
[    0.148000]                       [<ffffffff82181ef9>]
start_kernel+0x39d/0x465
[    0.148000]                       [<ffffffff82181294>]
x86_64_start_reservations+0x2f/0x31
[    0.148000]                       [<ffffffff8218140e>]
x86_64_start_kernel+0x178/0x18b
[    0.148000]     INITIAL USE at:
[    0.148000]                      [<ffffffff810e7f90>]
__lock_acquire+0x240/0x1440
[    0.148000]                      [<ffffffff810e95d3>]
lock_acquire+0xe3/0x1c0
[    0.148000]                      [<ffffffff818cf82c>]
_raw_spin_lock_irqsave+0x3c/0x50
[    0.148000]                      [<ffffffff810bdc9d>]
rq_attach_root+0x1d/0x100
[    0.148000]                      [<ffffffff8219deab>]
sched_init+0x2f5/0x44c
[    0.148000]                      [<ffffffff82181d9d>]
start_kernel+0x241/0x465
[    0.148000]                      [<ffffffff82181294>]
x86_64_start_reservations+0x2f/0x31
[    0.148000]                      [<ffffffff8218140e>]
x86_64_start_kernel+0x178/0x18b
[    0.148000]   }
[    0.148000]   ... key      at: [<ffffffff822f3ad0>] __key.60059+0x0/0x8
[    0.148000]   ... acquired at:
[    0.148000]    [<ffffffff810e95d3>] lock_acquire+0xe3/0x1c0
[    0.148000]    [<ffffffff818cf661>] _raw_spin_lock+0x31/0x40
[    0.148000]    [<ffffffff810c0514>] set_user_nice.part.92+0xf4/0x270
[    0.148000]    [<ffffffff810c06b6>] set_user_nice+0x26/0x30
[    0.148000]    [<ffffffff810aee10>] create_worker+0xf0/0x1a0
[    0.148000]    [<ffffffff8219c195>] init_workqueues+0x317/0x51e
[    0.148000]    [<ffffffff81000450>] do_one_initcall+0x50/0x180
[    0.148000]    [<ffffffff821820d2>] kernel_init_freeable+0x111/0x25d
[    0.148000]    [<ffffffff818c206e>] kernel_init+0xe/0x100
[    0.148000]    [<ffffffff818d01ff>] ret_from_fork+0x1f/0x40
[    0.148000] [    0.148000] ->
(&(&(&sig->stats_lock)->lock)->rlock){+.....} ops: 2 {
[    0.148000]    HARDIRQ-ON-W at:
[    0.148000]                     [<ffffffff810e82e0>]
__lock_acquire+0x590/0x1440
[    0.148000]                     [<ffffffff810e95d3>]
lock_acquire+0xe3/0x1c0
[    0.148000]                     [<ffffffff818cf661>]
_raw_spin_lock+0x31/0x40
[    0.148000]                     [<ffffffff810bf769>]
__sched_setscheduler+0x339/0xbd0
[    0.148000]                     [<ffffffff810c0076>]
_sched_setscheduler+0x76/0x90
[    0.148000]                     [<ffffffff810c1012>]
sched_set_stop_task+0x62/0xb0
[    0.148000]                     [<ffffffff81143983>]
cpu_stop_create+0x23/0x30
[    0.148000]                     [<ffffffff810ba48d>]
__smpboot_create_thread.part.2+0xad/0x100
[    0.148000]                     [<ffffffff810ba57f>]
smpboot_register_percpu_thread_cpumask+0x9f/0xf0
[    0.148000]                     [<ffffffff821a1708>]
cpu_stop_init+0x7d/0xb8
[    0.148000]                     [<ffffffff81000450>]
do_one_initcall+0x50/0x180
[    0.148000]                     [<ffffffff821820d2>]
kernel_init_freeable+0x111/0x25d
[    0.148000]                     [<ffffffff818c206e>]
kernel_init+0xe/0x100
[    0.148000]                     [<ffffffff818d01ff>]
ret_from_fork+0x1f/0x40
[    0.148000]    INITIAL USE at:
[    0.148000]                    [<ffffffff810e7f90>]
__lock_acquire+0x240/0x1440
[    0.148000]                    [<ffffffff810e95d3>]
lock_acquire+0xe3/0x1c0
[    0.148000]                    [<ffffffff818cf661>]
_raw_spin_lock+0x31/0x40
[    0.148000]                    [<ffffffff810c0514>]
set_user_nice.part.92+0xf4/0x270
[    0.148000]                    [<ffffffff810c06b6>]
set_user_nice+0x26/0x30
[    0.148000]                    [<ffffffff810aee10>]
create_worker+0xf0/0x1a0
[    0.148000]                    [<ffffffff8219c195>]
init_workqueues+0x317/0x51e
[    0.148000]                    [<ffffffff81000450>]
do_one_initcall+0x50/0x180
[    0.148000]                    [<ffffffff821820d2>]
kernel_init_freeable+0x111/0x25d
[    0.148000]                    [<ffffffff818c206e>] kernel_init+0xe/0x100
[    0.148000]                    [<ffffffff818d01ff>]
ret_from_fork+0x1f/0x40
[    0.148000]  }
[    0.148000]  ... key      at: [<ffffffff822f2190>] __key.55894+0x0/0x8
[    0.148000]  ... acquired at:
[    0.148000]    [<ffffffff810e6885>] check_usage_backwards+0x155/0x160
[    0.148000]    [<ffffffff810e7533>] mark_lock+0x333/0x610
[    0.148000]    [<ffffffff810e82e0>] __lock_acquire+0x590/0x1440
[    0.148000]    [<ffffffff810e95d3>] lock_acquire+0xe3/0x1c0
[    0.148000]    [<ffffffff818cf661>] _raw_spin_lock+0x31/0x40
[    0.148000]    [<ffffffff810bf769>] __sched_setscheduler+0x339/0xbd0
[    0.148000]    [<ffffffff810c0076>] _sched_setscheduler+0x76/0x90
[    0.148000]    [<ffffffff810c1012>] sched_set_stop_task+0x62/0xb0
[    0.148000]    [<ffffffff81143983>] cpu_stop_create+0x23/0x30
[    0.148000]    [<ffffffff810ba48d>]
__smpboot_create_thread.part.2+0xad/0x100
[    0.148000]    [<ffffffff810ba57f>]
smpboot_register_percpu_thread_cpumask+0x9f/0xf0
[    0.148000]    [<ffffffff821a1708>] cpu_stop_init+0x7d/0xb8
[    0.148000]    [<ffffffff81000450>] do_one_initcall+0x50/0x180
[    0.148000]    [<ffffffff821820d2>] kernel_init_freeable+0x111/0x25d
[    0.148000]    [<ffffffff818c206e>] kernel_init+0xe/0x100
[    0.148000]    [<ffffffff818d01ff>] ret_from_fork+0x1f/0x40
[    0.148000] [    0.148000] [    0.148000] stack backtrace:
[    0.148000] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.7.0-rc7+ #155
[    0.148000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS Debian-1.8.2-1 04/01/2014
[    0.148000]  0000000000000086 00000000aea03eae ffff88003de6ba60
ffffffff813cb2d5
[    0.148000]  ffffffff82d48e60 ffff88003de6bac0 ffff88003de6baa0
ffffffff811a6b05
[    0.148000]  ffff88003de647d8 ffff88003de647d8 ffff88003de64040
ffffffff81d531a7
[    0.148000] Call Trace:
[    0.148000]  [<ffffffff813cb2d5>] dump_stack+0x67/0x92
[    0.148000]  [<ffffffff811a6b05>]
print_irq_inversion_bug.part.38+0x1a4/0x1b0
[    0.148000]  [<ffffffff810e6885>] check_usage_backwards+0x155/0x160
[    0.148000]  [<ffffffff810e7533>] mark_lock+0x333/0x610
[    0.148000]  [<ffffffff810e6730>] ? check_usage_forwards+0x160/0x160
[    0.148000]  [<ffffffff810e82e0>] __lock_acquire+0x590/0x1440
[    0.148000]  [<ffffffff810e7a6d>] ? trace_hardirqs_on+0xd/0x10
[    0.148000]  [<ffffffff81104aad>] ? debug_lockdep_rcu_enabled+0x1d/0x20
[    0.148000]  [<ffffffff810e95d3>] lock_acquire+0xe3/0x1c0
[    0.148000]  [<ffffffff810bf769>] ? __sched_setscheduler+0x339/0xbd0
[    0.148000]  [<ffffffff818cf661>] _raw_spin_lock+0x31/0x40
[    0.148000]  [<ffffffff810bf769>] ? __sched_setscheduler+0x339/0xbd0
[    0.148000]  [<ffffffff810bf769>] __sched_setscheduler+0x339/0xbd0
[    0.148000]  [<ffffffff810c0076>] _sched_setscheduler+0x76/0x90
[    0.148000]  [<ffffffff810c1012>] sched_set_stop_task+0x62/0xb0
[    0.148000]  [<ffffffff81143983>] cpu_stop_create+0x23/0x30
[    0.148000]  [<ffffffff810ba48d>]
__smpboot_create_thread.part.2+0xad/0x100
[    0.148000]  [<ffffffff810ba57f>]
smpboot_register_percpu_thread_cpumask+0x9f/0xf0
[    0.148000]  [<ffffffff821a1708>] cpu_stop_init+0x7d/0xb8
[    0.148000]  [<ffffffff821a168b>] ? pid_namespaces_init+0x40/0x40
[    0.148000]  [<ffffffff81000450>] do_one_initcall+0x50/0x180
[    0.148000]  [<ffffffff8102c24d>] ? print_cpu_info+0x7d/0xe0
[    0.148000]  [<ffffffff821820d2>] kernel_init_freeable+0x111/0x25d
[    0.148000]  [<ffffffff818c206e>] kernel_init+0xe/0x100
[    0.148000]  [<ffffffff818d01ff>] ret_from_fork+0x1f/0x40
[    0.148000]  [<ffffffff818c2060>] ? rest_init+0x130/0x130

In this v2, I tried to address all comments, thanks for reviews.

-Topi

Topi Miettinen (14):
  resource limits: foundation for resource highwater tracking
  resource limits: aggregate task highwater marks to cgroup level
  resource limits: track highwater mark of file sizes
  resource limits: track highwater mark of VM data segment
  resource limits: track highwater mark of stack size
  resource limits: track highwater mark of cores dumped
  resource limits: track highwater mark of user processes
  resource limits: track highwater mark of number of files
  resource limits: track highwater mark of locked memory
  resource limits: track highwater mark of address space size
  resource limits: track highwater mark of number of pending signals
  resource limits: track highwater mark of size of message queues
  resource limits: track highwater mark of niceness
  resource limits: track highwater mark of RT priority

 Documentation/accounting/getdelays.c       | 62 ++++++++++++++++++++++--
 arch/ia64/kernel/perfmon.c                 |  1 +
 arch/powerpc/kvm/book3s_64_vio.c           |  2 +
 arch/powerpc/mm/mmu_context_iommu.c        |  2 +
 arch/x86/ia32/ia32_aout.c                  |  2 +
 drivers/infiniband/core/umem.c             |  1 +
 drivers/infiniband/hw/hfi1/user_pages.c    |  2 +
 drivers/infiniband/hw/qib/qib_user_pages.c |  2 +
 drivers/infiniband/hw/usnic/usnic_uiom.c   |  2 +
 drivers/misc/mic/scif/scif_rma.c           |  1 +
 drivers/vfio/vfio_iommu_spapr_tce.c        |  2 +
 drivers/vfio/vfio_iommu_type1.c            |  5 ++
 fs/attr.c                                  |  2 +
 fs/binfmt_aout.c                           |  2 +
 fs/binfmt_flat.c                           |  2 +
 fs/coredump.c                              | 11 +++--
 fs/file.c                                  |  4 ++
 include/linux/cgroup-defs.h                |  5 ++
 include/linux/sched.h                      | 61 +++++++++++++++++++++++
 include/linux/tsacct_kern.h                |  3 ++
 include/uapi/linux/cgroupstats.h           |  3 ++
 include/uapi/linux/taskstats.h             | 10 +++-
 ipc/mqueue.c                               |  1 +
 kernel/bpf/syscall.c                       |  8 +++
 kernel/cgroup.c                            | 78
++++++++++++++++++++++++++++++
 kernel/cred.c                              |  1 +
 kernel/events/core.c                       |  1 +
 kernel/fork.c                              |  2 +
 kernel/sched/core.c                        |  6 +++
 kernel/signal.c                            |  2 +
 kernel/sys.c                               |  5 ++
 kernel/taskstats.c                         |  4 ++
 kernel/tsacct.c                            | 47 ++++++++++++++++++
 mm/mlock.c                                 |  8 +++
 mm/mmap.c                                  | 17 ++++++-
 mm/mremap.c                                |  7 +++
 36 files changed, 365 insertions(+), 9 deletions(-)

-- 
2.8.1

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 01/14] resource limits: foundation for resource highwater tracking
  2016-07-15 16:27     ` Topi Miettinen
@ 2016-07-15 17:57       ` Nicolas Dichtel
  0 siblings, 0 replies; 26+ messages in thread
From: Nicolas Dichtel @ 2016-07-15 17:57 UTC (permalink / raw)
  To: Topi Miettinen, linux-kernel
  Cc: Jonathan Corbet, Ingo Molnar, Peter Zijlstra, Balbir Singh,
	David S. Miller, Markus Elfring, Thomas Gleixner, Rik van Riel,
	open list:DOCUMENTATION

Le 15/07/2016 18:27, Topi Miettinen a écrit :
[snip]
>> Why playing with version number? It complexifies the (userland) code and
>> existing applications break when the kernel is updated.
>> Goal of netlink is to be easily extensible. By adding a new attribute, existing
>> userspace tools won't break.
> 
> I just followed this text in taskstats.h. Does that give wrong advice?
> 
>  * The struct is versioned. Newer versions should only add fields to
>  * the bottom of the struct to maintain backward compatibility.
>  *
>  *
>  * To add new fields
>  *	a) bump up TASKSTATS_VERSION
>  *	b) add comment indicating new version number at end of struct
>  *	c) add new fields after version comment; maintain 64-bit alignment
I don't know taskstats well, but that is not how netlink works. There is no need
to manage a version with netlink, just add new attributes.


Regards,
Nicolas

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 00/14] Present useful limits to user (v2)
       [not found]     ` <20160715135956.GA3115@twins.programming.kicks-ass.net>
@ 2016-07-15 20:54       ` H. Peter Anvin
  2016-07-18 13:00         ` Austin S. Hemmelgarn
  0 siblings, 1 reply; 26+ messages in thread
From: H. Peter Anvin @ 2016-07-15 20:54 UTC (permalink / raw)
  To: Peter Zijlstra, Topi Miettinen
  Cc: linux-kernel, Jonathan Corbet, Tony Luck, Fenghua Yu,
	Alexander Graf, Paolo Bonzini, Radim Krčmář,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Thomas Gleixner, Ingo Molnar,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	Doug Ledford, Sean Hefty, Hal Rosenstock, Mike Marciniszyn,
	Dennis Dalessandro, Christian Benvenuti, Dave Goodell,
	Sudeep Dutt, Ashutosh Dixit, Alex Williamson, Alexander Viro,
	Tejun Heo, Li.Zefan

<lizefan@huawei.com>,Johannes Weiner <hannes@cmpxchg.org>,Alexei Starovoitov <ast@kernel.org>,Arnaldo Carvalho de Melo <acme@kernel.org>,Alexander Shishkin <alexander.shishkin@linux.intel.com>,Balbir Singh <bsingharora@gmail.com>,Markus Elfring <elfring@users.sourceforge.net>,"David S. Miller" <davem@davemloft.net>,Nicolas Dichtel <nicolas.dichtel@6wind.com>,Andrew Morton <akpm@linux-foundation.org>,Konstantin Khlebnikov <koct9i@gmail.com>,Jiri Slaby <jslaby@suse.cz>,Cyrill Gorcunov <gorcunov@openvz.org>,Michal Hocko <mhocko@suse.com>,Vlastimil Babka <vbabka@suse.cz>,Dave Hansen <dave.hansen@linux.intel.com>,Greg Kroah-Hartman <gregkh@linuxfoundation.org>,Dan Carpenter <dan.carpenter@oracle.com>,Michael Kerrisk <mtk.manpages@gmail.com>,"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,Marcus Gelderie <redmnic@gmail.com>,Vladimir Davydov <vdavydov@virtuozzo.com>,Joe Perches <joe@perches.com>,Frederic Weisbecker <fweisbec@gmail.com>,Andrea Arcangeli <aarcange@redhat.com>,!
 "Eric W.
Biederman" <ebiederm@xmission.com>,Andi Kleen <ak@linux.intel.com>,Oleg Nesterov <oleg@redhat.com>,Stas Sergeev <stsp@list.ru>,Amanieu d'Antras <amanieu@gmail.com>,Richard Weinberger <richard@nod.at>,Wang Xiaoqiang <wangxq10@lzu.edu.cn>,Helge Deller <deller@gmx.de>,Mateusz Guzik <mguzik@redhat.com>,Alex Thorlton <athorlton@sgi.com>,Ben Segall <bsegall@google.com>,John Stultz <john.stultz@linaro.org>,Rik van Riel <riel@redhat.com>,Eric B Munson <emunson@akamai.com>,Alexey Klimov <klimov.linux@gmail.com>,Chen Gang <gang.chen.5i5j@gmail.com>,Andrey Ryabinin <aryabinin@virtuozzo.com>,David Rientjes <rientjes@google.com>,Hugh Dickins <hughd@google.com>,Alexander Kuleshov <kuleshovmail@gmail.com>,"open list:DOCUMENTATION" <linux-doc@vger.kernel.org>,"open list:IA64 (Itanium) PLATFORM" <linux-ia64@vger.kernel.org>,"open list:KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC" <kvm-ppc@vger.kernel.org>,"open list:KERNEL VIRTUAL MACHINE (KVM)" <kvm@vger.kernel.org>,"open list:LINUX FOR POWERPC!
  (32-BIT
AND 64-BIT)" <linuxppc-dev@lists.ozlabs.org>,"open list:INFINIBAND SUBSYSTEM" <linux-rdma@vger.kernel.org>,"open list:FILESYSTEMS (VFS and infrastructure)" <linux-fsdevel@vger.kernel.org>,"open list:CONTROL GROUP (CGROUP)" <cgroups@vger.kernel.org>,"open list:BPF (Safe dynamic programs and tools)" <netdev@vger.kernel.org>,"open list:MEMORY MANAGEMENT" <linux-mm@kvack.org>
Message-ID: <D79806FE-E6B9-481B-8AA2-A1800419D9B5@zytor.com>

On July 15, 2016 6:59:56 AM PDT, Peter Zijlstra <peterz@infradead.org> wrote:
>On Fri, Jul 15, 2016 at 01:52:48PM +0000, Topi Miettinen wrote:
>> On 07/15/16 12:43, Peter Zijlstra wrote:
>> > On Fri, Jul 15, 2016 at 01:35:47PM +0300, Topi Miettinen wrote:
>> >> Hello,
>> >>
>> >> There are many basic ways to control processes, including
>capabilities,
>> >> cgroups and resource limits. However, there are far fewer ways to
>find out
>> >> useful values for the limits, except blind trial and error.
>> >>
>> >> This patch series attempts to fix that by giving at least a nice
>starting
>> >> point from the highwater mark values of the resources in question.
>> >> I looked where each limit is checked and added a call to update
>the mark
>> >> nearby.
>> > 
>> > And how is that useful? Setting things to the high watermark is
>> > basically the same as not setting the limit at all.
>> 
>> What else would you use, too small limits?
>
>That question doesn't make sense.
>
>What's the point of setting a limit if it ends up being the same as
>no-limit (aka unlimited).
>
>If you cannot explain; and you have not so far; what use these values
>are, why would we look at the patches.

One reason is to catch a malfunctioning process rather than dragging the whole system down with it.  It could also be useful for development.
-- 
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 00/14] Present useful limits to user (v2)
  2016-07-15 20:54       ` H. Peter Anvin
@ 2016-07-18 13:00         ` Austin S. Hemmelgarn
  0 siblings, 0 replies; 26+ messages in thread
From: Austin S. Hemmelgarn @ 2016-07-18 13:00 UTC (permalink / raw)
  To: H. Peter Anvin, Peter Zijlstra, Topi Miettinen
  Cc: linux-kernel, Jonathan Corbet, Tony Luck, Fenghua Yu,
	Alexander Graf, Paolo Bonzini, Radim Krčmář,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Thomas Gleixner, Ingo Molnar,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	Doug Ledford, Sean Hefty, Hal Rosenstock, Mike Marciniszyn,
	Dennis Dalessandro, Christian Benvenuti, Dave Goodell,
	Sudeep Dutt, Ashutosh Dixit, Alex Williamson, Alexander Viro,
	Tejun Heo, Li.Zefan

On 2016-07-15 16:54, H. Peter Anvin wrote:
> On July 15, 2016 6:59:56 AM PDT, Peter Zijlstra <peterz@infradead.org> wrote:
>> On Fri, Jul 15, 2016 at 01:52:48PM +0000, Topi Miettinen wrote:
>>> On 07/15/16 12:43, Peter Zijlstra wrote:
>>>> On Fri, Jul 15, 2016 at 01:35:47PM +0300, Topi Miettinen wrote:
>>>>> Hello,
>>>>>
>>>>> There are many basic ways to control processes, including
>> capabilities,
>>>>> cgroups and resource limits. However, there are far fewer ways to
>> find out
>>>>> useful values for the limits, except blind trial and error.
>>>>>
>>>>> This patch series attempts to fix that by giving at least a nice
>> starting
>>>>> point from the highwater mark values of the resources in question.
>>>>> I looked where each limit is checked and added a call to update
>> the mark
>>>>> nearby.
>>>>
>>>> And how is that useful? Setting things to the high watermark is
>>>> basically the same as not setting the limit at all.
>>>
>>> What else would you use, too small limits?
>>
>> That question doesn't make sense.
>>
>> What's the point of setting a limit if it ends up being the same as
>> no-limit (aka unlimited).
>>
>> If you cannot explain; and you have not so far; what use these values
>> are, why would we look at the patches.
>
> One reason is to catch a malfunctioning process rather than dragging the whole system down with it.  It could also be useful for development.
>
Additionally, there are quite a few applications which don't gracefully 
handle memory allocation or process creation failures, either hanging, 
constantly retrying, or just dying when this happens.  For such an 
application, you have to set the limit to the high watermark if you want 
them limited at all, otherwise they don't work.  A classic example of 
this is the official client for Dropbox.  If it can't start up all the 
insane number of threads it thinks it needs, then it just hangs. 
However, it's also a network service, and therefore is a reasonable 
target for hackers, so it makes sense to try and limit it.  I've run 
into similar issues with quite a few 'desktop' services, both open and 
closed source.

Looking at this another way, this is most useful for things that have a 
deterministic maximum resource usage under regular use, not something 
like a forking server which has a functionally unbounded maximum 
resource usage.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level
  2016-07-15 17:15     ` Topi Miettinen
@ 2016-07-18 22:52       ` Tejun Heo
  2016-07-19 16:57         ` Topi Miettinen
  0 siblings, 1 reply; 26+ messages in thread
From: Tejun Heo @ 2016-07-18 22:52 UTC (permalink / raw)
  To: Topi Miettinen
  Cc: linux-kernel, Jonathan Corbet, Li Zefan, Johannes Weiner,
	Markus Elfring, David S. Miller, Nicolas Dichtel,
	open list:DOCUMENTATION, open list:CONTROL GROUP (CGROUP)

On Fri, Jul 15, 2016 at 05:15:41PM +0000, Topi Miettinen wrote:
> There are clear semantics for the limits themselves, either they apply
> per task or per user. It makes sense to gather values according to these
> semantics. Then with systemd or other tools you can use the values to
> set the limits for a service regardless if the limit applies per task or
> per user and it works according to each limit's semantics.

What does it mean to collect the maximum of the high watermarks of
multiple users or the high water marks along process hierarchy which
is spread across multiple cgroups?  These are non-sensical numbers.
If you want to collect high watermarks per-cgroup, the numbers have to
be per-cgroup - how many fds are being used in that particular cgroup
and what's the high watermark of that number and so on.  You can't
just take maximum from process hierarchy or user watermarks.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level
  2016-07-18 22:52       ` Tejun Heo
@ 2016-07-19 16:57         ` Topi Miettinen
  2016-07-19 18:18           ` Tejun Heo
  0 siblings, 1 reply; 26+ messages in thread
From: Topi Miettinen @ 2016-07-19 16:57 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, Jonathan Corbet, Li Zefan, Johannes Weiner,
	Markus Elfring, David S. Miller, Nicolas Dichtel,
	open list:DOCUMENTATION, open list:CONTROL GROUP (CGROUP)

On 07/18/16 22:52, Tejun Heo wrote:
> On Fri, Jul 15, 2016 at 05:15:41PM +0000, Topi Miettinen wrote:
>> There are clear semantics for the limits themselves, either they apply
>> per task or per user. It makes sense to gather values according to these
>> semantics. Then with systemd or other tools you can use the values to
>> set the limits for a service regardless if the limit applies per task or
>> per user and it works according to each limit's semantics.
> 
> What does it mean to collect the maximum of the high watermarks of
> multiple users or the high water marks along process hierarchy which
> is spread across multiple cgroups?  These are non-sensical numbers.
> If you want to collect high watermarks per-cgroup, the numbers have to
> be per-cgroup - how many fds are being used in that particular cgroup
> and what's the high watermark of that number and so on.  You can't
> just take maximum from process hierarchy or user watermarks.

Then there would need to be new limit checks at cgroup level. Would you
see problems with that approach?

-Topi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level
  2016-07-19 16:57         ` Topi Miettinen
@ 2016-07-19 18:18           ` Tejun Heo
  0 siblings, 0 replies; 26+ messages in thread
From: Tejun Heo @ 2016-07-19 18:18 UTC (permalink / raw)
  To: Topi Miettinen
  Cc: linux-kernel, Jonathan Corbet, Li Zefan, Johannes Weiner,
	Markus Elfring, David S. Miller, Nicolas Dichtel,
	open list:DOCUMENTATION, open list:CONTROL GROUP (CGROUP)

Hello, Topi.

On Tue, Jul 19, 2016 at 04:57:10PM +0000, Topi Miettinen wrote:
> Then there would need to be new limit checks at cgroup level. Would you
> see problems with that approach?

I'm worried that you're rushing this feature without thinking through
it.  You were mixing up completely orthogonal planes of accounting and
control without too much thought and are now suggesting something
which is also strange.  What do you mean by "new limit checks at
cgroup level"?  How would this be different from the resource
accounting and control implemented in the existing controllers?

Please take a step back and think through the overall design before
proposing changes to userland visible interface.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2016-07-19 18:18 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1468578983-28229-1-git-send-email-toiwoton@gmail.com>
2016-07-15 10:35 ` [PATCH 01/14] resource limits: foundation for resource highwater tracking Topi Miettinen
2016-07-15 12:12   ` kbuild test robot
2016-07-15 12:49   ` Nicolas Dichtel
2016-07-15 16:27     ` Topi Miettinen
2016-07-15 17:57       ` Nicolas Dichtel
2016-07-15 10:35 ` [PATCH 02/14] resource limits: aggregate task highwater marks to cgroup level Topi Miettinen
2016-07-15 12:38   ` kbuild test robot
2016-07-15 14:10   ` Tejun Heo
2016-07-15 17:15     ` Topi Miettinen
2016-07-18 22:52       ` Tejun Heo
2016-07-19 16:57         ` Topi Miettinen
2016-07-19 18:18           ` Tejun Heo
2016-07-15 10:35 ` [PATCH 03/14] resource limits: track highwater mark of file sizes Topi Miettinen
2016-07-15 10:35 ` [PATCH 04/14] resource limits: track highwater mark of VM data segment Topi Miettinen
2016-07-15 10:35 ` [PATCH 05/14] resource limits: track highwater mark of stack size Topi Miettinen
2016-07-15 10:35 ` [PATCH 06/14] resource limits: track highwater mark of cores dumped Topi Miettinen
2016-07-15 10:35 ` [PATCH 07/14] resource limits: track highwater mark of user processes Topi Miettinen
2016-07-15 10:35 ` [PATCH 08/14] resource limits: track highwater mark of number of files Topi Miettinen
2016-07-15 10:35 ` [PATCH 10/14] resource limits: track highwater mark of address space size Topi Miettinen
2016-07-15 10:35 ` [PATCH 11/14] resource limits: track highwater mark of number of pending signals Topi Miettinen
2016-07-15 10:35 ` [PATCH 12/14] resource limits: track highwater mark of size of message queues Topi Miettinen
2016-07-15 10:36 ` [PATCH 13/14] resource limits: track highwater mark of niceness Topi Miettinen
2016-07-15 10:36 ` [PATCH 14/14] resource limits: track highwater mark of RT priority Topi Miettinen
2016-07-15 17:42 ` [PATCH 00/14] Present useful limits to user (v2) Topi Miettinen
     [not found] ` <20160715124330.GR30154@twins.programming.kicks-ass.net>
     [not found]   ` <28b4b919-4f50-d9f6-c5e1-d1e92ea1ba1c@gmail.com>
     [not found]     ` <20160715135956.GA3115@twins.programming.kicks-ass.net>
2016-07-15 20:54       ` H. Peter Anvin
2016-07-18 13:00         ` Austin S. Hemmelgarn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).