* [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
@ 2020-08-10 14:58 Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 1/5] fs/proc: Introduce /proc/all/stat Eugene Lubarsky
                   ` (6 more replies)
  0 siblings, 7 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-10 14:58 UTC (permalink / raw)
  To: linux-api, linux-fsdevel; +Cc: linux-kernel, adobriyan, avagin, dsahern

This is an idea for substantially reducing the number of syscalls needed
by monitoring tools whilst mostly re-using the existing API.

The proposed files in this proof-of-concept patch set are:

* /proc/all/stat
      A stat line for each process in the existing format.

* /proc/all/statm
      statm lines but starting with a PID column.

* /proc/all/status
      status info for all processes in the existing format.

* /proc/all/io
      The existing /proc/pid/io data but formatted as a single line for
      each process, similarly to stat/statm, with a PID column added.

* /proc/all/statx
      Gathers info from stat, statm and io; the purpose is actually
      not so much to reduce syscalls as to help userspace be more
      efficient by not having to store data in e.g. hashtables in order
      to gather it from separate /proc/all/ files.

      The format proposed here starts with the unchanged stat line
      and begins the other info with a few characters, repeating for
      each process:

      ...
      25 (cat) R 1 1 0 0 -1 4194304 185 0 16 0 2 0 0 0 20 ...
      m 662 188 167 5 0 112 0
      io 4292 0 12 0 0 0 0
      ...
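
For illustration, a consumer of this combined stream could dispatch on
the line prefix roughly as in the sketch below (field parsing and error
handling omitted; this only shows the shape of the stream, not a
finished tool):

#include <stdio.h>
#include <string.h>

/* Minimal sketch: walk /proc/all/statx and dispatch on the line prefix.
 * A real parser would split out the stat/statm/io fields; this only
 * demonstrates the layout described above.
 */
int main(void)
{
	FILE *f = fopen("/proc/all/statx", "r");
	char line[4096];

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "m ", 2) == 0)
			;	/* statm fields for the current process */
		else if (strncmp(line, "io ", 3) == 0)
			;	/* io fields for the current process */
		else
			;	/* a new process: unchanged stat line, PID first */
	}

	fclose(f);
	return 0;
}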


There has been a proposal with some overlapping goals: /proc/task-diag
(https://github.com/avagin/linux-task-diag), but I'm not sure about
its current status.



Best Wishes,

Eugene


Eugene Lubarsky (5):
  fs/proc: Introduce /proc/all/stat
  fs/proc: Introduce /proc/all/statm
  fs/proc: Introduce /proc/all/status
  fs/proc: Introduce /proc/all/io
  fs/proc: Introduce /proc/all/statx

 fs/proc/base.c     | 215 +++++++++++++++++++++++++++++++++++++++++++--
 fs/proc/internal.h |   1 +
 fs/proc/root.c     |   1 +
 3 files changed, 210 insertions(+), 7 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [RFC PATCH 1/5] fs/proc: Introduce /proc/all/stat
  2020-08-10 14:58 [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Eugene Lubarsky
@ 2020-08-10 14:58 ` Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 2/5] fs/proc: Introduce /proc/all/statm Eugene Lubarsky
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-10 14:58 UTC (permalink / raw)
  To: linux-api, linux-fsdevel; +Cc: linux-kernel, adobriyan, avagin, dsahern

Returns stat lines for all visible processes in the existing format,
aiming to substantially reduce the number of syscalls that are needed 
for this common task.

Signed-off-by: Eugene Lubarsky <elubarsky.linux@gmail.com>
---
 fs/proc/base.c     | 98 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/internal.h |  1 +
 fs/proc/root.c     |  1 +
 3 files changed, 100 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a333caeca291..e0f60a1528b7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3811,3 +3811,101 @@ void __init set_proc_pid_nlink(void)
 	nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
 	nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
 }
+
+
+/*
+ * /proc/all/
+ */
+
+struct all_iter {
+	struct tgid_iter tgid_iter;
+	struct proc_fs_info *fs_info;
+	struct pid_namespace *ns;
+};
+
+static void *proc_all_start(struct seq_file *m, loff_t *pos)
+{
+	struct all_iter *iter = kmalloc(sizeof(struct all_iter), GFP_KERNEL);
+
+	iter->fs_info = proc_sb_info(file_inode(m->file)->i_sb);
+	iter->ns = proc_pid_ns(file_inode(m->file)->i_sb);
+
+	iter->tgid_iter.tgid = *pos;
+	iter->tgid_iter.task = NULL;
+	iter->tgid_iter = next_tgid(iter->ns, iter->tgid_iter);
+
+	if (!iter->tgid_iter.task) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void *proc_all_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+	struct proc_fs_info *fs_info = iter->fs_info;
+	struct tgid_iter *tgid_iter = &iter->tgid_iter;
+
+	do {
+		tgid_iter->tgid += 1;
+		*tgid_iter = next_tgid(iter->ns, *tgid_iter);
+	} while (tgid_iter->task &&
+				!has_pid_permissions(fs_info, tgid_iter->task, HIDEPID_INVISIBLE));
+
+	*pos = tgid_iter->tgid;
+
+	if (!tgid_iter->task) {
+		kfree(v);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void proc_all_stop(struct seq_file *m, void *v)
+{
+	if (v) {
+		struct all_iter *iter = (struct all_iter *) v;
+		struct task_struct *task = iter->tgid_iter.task;
+
+		if (task)
+			put_task_struct(task);
+
+		kfree(v);
+	}
+}
+
+static int proc_all_stat(struct seq_file *m, void *v)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+
+	return proc_tgid_stat(m, iter->ns, iter->tgid_iter.task->thread_pid, iter->tgid_iter.task);
+}
+
+
+#define PROC_ALL_OPS(NAME) static const struct seq_operations proc_all_##NAME##_ops = { \
+	.start	= proc_all_start, \
+	.next	= proc_all_next, \
+	.stop	= proc_all_stop, \
+	.show	= proc_all_##NAME \
+}
+
+PROC_ALL_OPS(stat);
+
+#define PROC_ALL_CREATE(NAME) \
+	do { \
+		if (!proc_create_seq(#NAME, 0, all_dir, &proc_all_##NAME##_ops)) \
+			return; \
+	} while (0)
+
+void __init proc_all_init(void)
+{
+	struct proc_dir_entry *all_dir = proc_mkdir("all", NULL);
+
+	if (!all_dir)
+		return;
+
+	PROC_ALL_CREATE(stat);
+}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 917cc85e3466..b22d9cb619bf 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -171,6 +171,7 @@ extern int pid_delete_dentry(const struct dentry *);
 extern int proc_pid_readdir(struct file *, struct dir_context *);
 struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
 extern loff_t mem_lseek(struct file *, loff_t, int);
+extern void proc_all_init(void);
 
 /* Lookups */
 typedef struct dentry *instantiate_t(struct dentry *,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5e444d4f9717..4b5cfd2cdc0a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -291,6 +291,7 @@ void __init proc_root_init(void)
 	set_proc_pid_nlink();
 	proc_self_init();
 	proc_thread_self_init();
+	proc_all_init();
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [RFC PATCH 2/5] fs/proc: Introduce /proc/all/statm
  2020-08-10 14:58 [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 1/5] fs/proc: Introduce /proc/all/stat Eugene Lubarsky
@ 2020-08-10 14:58 ` Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 3/5] fs/proc: Introduce /proc/all/status Eugene Lubarsky
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-10 14:58 UTC (permalink / raw)
  To: linux-api, linux-fsdevel; +Cc: linux-kernel, adobriyan, avagin, dsahern

Returns statm lines for all visible processes with prepended PIDs.

Signed-off-by: Eugene Lubarsky <elubarsky.linux@gmail.com>
---
 fs/proc/base.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0f60a1528b7..8396a38ba7d2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3884,6 +3884,18 @@ static int proc_all_stat(struct seq_file *m, void *v)
 	return proc_tgid_stat(m, iter->ns, iter->tgid_iter.task->thread_pid, iter->tgid_iter.task);
 }
 
+static int proc_all_statm(struct seq_file *m, void *v)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+	struct pid_namespace *ns = iter->ns;
+	struct task_struct *task = iter->tgid_iter.task;
+	struct pid *pid = task->thread_pid;
+
+	seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+	seq_puts(m, " ");
+	return proc_pid_statm(m, ns, pid, task);
+}
+
 
 #define PROC_ALL_OPS(NAME) static const struct seq_operations proc_all_##NAME##_ops = { \
 	.start	= proc_all_start, \
@@ -3893,6 +3905,7 @@ static int proc_all_stat(struct seq_file *m, void *v)
 }
 
 PROC_ALL_OPS(stat);
+PROC_ALL_OPS(statm);
 
 #define PROC_ALL_CREATE(NAME) \
 	do { \
@@ -3908,4 +3921,5 @@ void __init proc_all_init(void)
 		return;
 
 	PROC_ALL_CREATE(stat);
+	PROC_ALL_CREATE(statm);
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [RFC PATCH 3/5] fs/proc: Introduce /proc/all/status
  2020-08-10 14:58 [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 1/5] fs/proc: Introduce /proc/all/stat Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 2/5] fs/proc: Introduce /proc/all/statm Eugene Lubarsky
@ 2020-08-10 14:58 ` Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 4/5] fs/proc: Introduce /proc/all/io Eugene Lubarsky
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-10 14:58 UTC (permalink / raw)
  To: linux-api, linux-fsdevel; +Cc: linux-kernel, adobriyan, avagin, dsahern

Returns status lines for all visible processes in the existing format.

Signed-off-by: Eugene Lubarsky <elubarsky.linux@gmail.com>
---
 fs/proc/base.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8396a38ba7d2..5982fd43dd21 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3897,6 +3897,14 @@ static int proc_all_statm(struct seq_file *m, void *v)
 }
 
 
+
+static int proc_all_status(struct seq_file *m, void *v)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+
+	return proc_pid_status(m, iter->ns, iter->tgid_iter.task->thread_pid, iter->tgid_iter.task);
+}
+
 #define PROC_ALL_OPS(NAME) static const struct seq_operations proc_all_##NAME##_ops = { \
 	.start	= proc_all_start, \
 	.next	= proc_all_next, \
@@ -3906,6 +3914,7 @@ static int proc_all_statm(struct seq_file *m, void *v)
 
 PROC_ALL_OPS(stat);
 PROC_ALL_OPS(statm);
+PROC_ALL_OPS(status);
 
 #define PROC_ALL_CREATE(NAME) \
 	do { \
@@ -3922,4 +3931,5 @@ void __init proc_all_init(void)
 
 	PROC_ALL_CREATE(stat);
 	PROC_ALL_CREATE(statm);
+	PROC_ALL_CREATE(status);
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [RFC PATCH 4/5] fs/proc: Introduce /proc/all/io
  2020-08-10 14:58 [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Eugene Lubarsky
                   ` (2 preceding siblings ...)
  2020-08-10 14:58 ` [RFC PATCH 3/5] fs/proc: Introduce /proc/all/status Eugene Lubarsky
@ 2020-08-10 14:58 ` Eugene Lubarsky
  2020-08-10 14:58 ` [RFC PATCH 5/5] fs/proc: Introduce /proc/all/statx Eugene Lubarsky
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-10 14:58 UTC (permalink / raw)
  To: linux-api, linux-fsdevel; +Cc: linux-kernel, adobriyan, avagin, dsahern

Returns io info for all visible processes.

The data is the same as /proc/[pid]/io but formatted as a series
of numbers on a single line. A PID column is also prepended.

Signed-off-by: Eugene Lubarsky <elubarsky.linux@gmail.com>
---
 fs/proc/base.c | 66 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 7 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5982fd43dd21..03d48225b6d1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2910,9 +2910,8 @@ static const struct file_operations proc_coredump_filter_operations = {
 #endif
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
+static int calc_io_accounting(struct task_struct *task, struct task_io_accounting *acct, int whole)
 {
-	struct task_io_accounting acct = task->ioac;
 	unsigned long flags;
 	int result;
 
@@ -2928,12 +2927,27 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
 	if (whole && lock_task_sighand(task, &flags)) {
 		struct task_struct *t = task;
 
-		task_io_accounting_add(&acct, &task->signal->ioac);
+		task_io_accounting_add(acct, &task->signal->ioac);
 		while_each_thread(task, t)
-			task_io_accounting_add(&acct, &t->ioac);
+			task_io_accounting_add(acct, &t->ioac);
 
 		unlock_task_sighand(task, &flags);
 	}
+	result = 0;
+
+out_unlock:
+	mutex_unlock(&task->signal->exec_update_mutex);
+	return result;
+}
+static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
+{
+	struct task_io_accounting acct = task->ioac;
+	int result;
+
+	result = calc_io_accounting(task, &acct, whole);
+	if (result)
+		return result;
+
 	seq_printf(m,
 		   "rchar: %llu\n"
 		   "wchar: %llu\n"
@@ -2949,10 +2963,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
 		   (unsigned long long)acct.read_bytes,
 		   (unsigned long long)acct.write_bytes,
 		   (unsigned long long)acct.cancelled_write_bytes);
-	result = 0;
 
-out_unlock:
-	mutex_unlock(&task->signal->exec_update_mutex);
 	return result;
 }
 
@@ -3896,7 +3907,42 @@ static int proc_all_statm(struct seq_file *m, void *v)
 	return proc_pid_statm(m, ns, pid, task);
 }
 
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+static int proc_all_io_print_one(struct seq_file *m, struct task_struct *task)
+{
+	struct task_io_accounting acct = task->ioac;
+	int result;
 
+	result = calc_io_accounting(task, &acct, 1);
+	if (result)
+		return result;
+
+	seq_printf(m,
+		   "%llu %llu %llu %llu %llu %llu %llu\n",
+		   (unsigned long long)acct.rchar,
+		   (unsigned long long)acct.wchar,
+		   (unsigned long long)acct.syscr,
+		   (unsigned long long)acct.syscw,
+		   (unsigned long long)acct.read_bytes,
+		   (unsigned long long)acct.write_bytes,
+		   (unsigned long long)acct.cancelled_write_bytes);
+
+	return result;
+}
+
+static int proc_all_io(struct seq_file *m, void *v)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+	struct pid_namespace *ns = iter->ns;
+	struct task_struct *task = iter->tgid_iter.task;
+	struct pid *pid = task->thread_pid;
+
+	seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+	seq_puts(m, " ");
+
+	return proc_all_io_print_one(m, task);
+}
+#endif
 
 static int proc_all_status(struct seq_file *m, void *v)
 {
@@ -3915,6 +3961,9 @@ static int proc_all_status(struct seq_file *m, void *v)
 PROC_ALL_OPS(stat);
 PROC_ALL_OPS(statm);
 PROC_ALL_OPS(status);
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	PROC_ALL_OPS(io);
+#endif
 
 #define PROC_ALL_CREATE(NAME) \
 	do { \
@@ -3932,4 +3981,7 @@ void __init proc_all_init(void)
 	PROC_ALL_CREATE(stat);
 	PROC_ALL_CREATE(statm);
 	PROC_ALL_CREATE(status);
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	PROC_ALL_CREATE(io);
+#endif
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [RFC PATCH 5/5] fs/proc: Introduce /proc/all/statx
  2020-08-10 14:58 [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Eugene Lubarsky
                   ` (3 preceding siblings ...)
  2020-08-10 14:58 ` [RFC PATCH 4/5] fs/proc: Introduce /proc/all/io Eugene Lubarsky
@ 2020-08-10 14:58 ` Eugene Lubarsky
  2020-08-10 15:04 ` [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Greg KH
  2020-08-12  7:51 ` Andrei Vagin
  6 siblings, 0 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-10 14:58 UTC (permalink / raw)
  To: linux-api, linux-fsdevel; +Cc: linux-kernel, adobriyan, avagin, dsahern

Gathers info from stat, statm and io files.

The purpose is not so much to reduce the number of syscalls as to help
userspace avoid having to store data in e.g. hashtables in order to
gather it from separate /proc/all files.

The format starts with an unchanged stat line and begins the
statm & io lines with "m" or "io", repeating these for each process.

e.g.
...
25 (cat) R 1 1 0 0 -1 4194304 185 0 16 0 2 0 0 0 20 ...
m 662 188 167 5 0 112 0
io 4292 0 12 0 0 0 0
...

Signed-off-by: Eugene Lubarsky <elubarsky.linux@gmail.com>
---
 fs/proc/base.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 03d48225b6d1..5c6010c2ea1c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3944,6 +3944,31 @@ static int proc_all_io(struct seq_file *m, void *v)
 }
 #endif
 
+static int proc_all_statx(struct seq_file *m, void *v)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+	struct pid_namespace *ns = iter->ns;
+	struct pid *pid = iter->tgid_iter.task->thread_pid;
+	struct task_struct *task = iter->tgid_iter.task;
+	int err;
+
+	err = proc_tgid_stat(m, ns, pid, task);
+	if (err)
+		return err;
+
+	seq_puts(m, "m ");
+	err = proc_pid_statm(m, ns, pid, task);
+	if (err)
+		return err;
+
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	seq_puts(m, "io ");
+	err = proc_all_io_print_one(m, task);
+#endif
+
+	return err;
+}
+
 static int proc_all_status(struct seq_file *m, void *v)
 {
 	struct all_iter *iter = (struct all_iter *) v;
@@ -3960,6 +3985,7 @@ static int proc_all_status(struct seq_file *m, void *v)
 
 PROC_ALL_OPS(stat);
 PROC_ALL_OPS(statm);
+PROC_ALL_OPS(statx);
 PROC_ALL_OPS(status);
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	PROC_ALL_OPS(io);
@@ -3980,6 +4006,7 @@ void __init proc_all_init(void)
 
 	PROC_ALL_CREATE(stat);
 	PROC_ALL_CREATE(statm);
+	PROC_ALL_CREATE(statx);
 	PROC_ALL_CREATE(status);
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	PROC_ALL_CREATE(io);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-10 14:58 [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Eugene Lubarsky
                   ` (4 preceding siblings ...)
  2020-08-10 14:58 ` [RFC PATCH 5/5] fs/proc: Introduce /proc/all/statx Eugene Lubarsky
@ 2020-08-10 15:04 ` Greg KH
  2020-08-10 15:27   ` Eugene Lubarsky
  2020-08-12  7:51 ` Andrei Vagin
  6 siblings, 1 reply; 16+ messages in thread
From: Greg KH @ 2020-08-10 15:04 UTC (permalink / raw)
  To: Eugene Lubarsky
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, avagin, dsahern

On Tue, Aug 11, 2020 at 12:58:47AM +1000, Eugene Lubarsky wrote:
> This is an idea for substantially reducing the number of syscalls needed
> by monitoring tools whilst mostly re-using the existing API.

How many syscalls does this save on?

Perhaps you want my proposed readfile(2) syscall:
	https://lore.kernel.org/r/20200704140250.423345-1-gregkh@linuxfoundation.org
to help out with things like this?  :)

> The proposed files in this proof-of-concept patch set are:
> 
> * /proc/all/stat

I think the problem will be defining "all" in the case of the specific
namespace you are dealing with, right?  How will this handle all of
those issues properly for all of these different statistics?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-10 15:04 ` [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Greg KH
@ 2020-08-10 15:27   ` Eugene Lubarsky
  2020-08-10 15:41     ` Greg KH
  0 siblings, 1 reply; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-10 15:27 UTC (permalink / raw)
  To: Greg KH
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, avagin, dsahern

On Mon, 10 Aug 2020 17:04:53 +0200
Greg KH <gregkh@linuxfoundation.org> wrote:
> How many syscalls does this save on?
> 
> Perhaps you want my proposed readfile(2) syscall:
> 	https://lore.kernel.org/r/20200704140250.423345-1-gregkh@linuxfoundation.org
> to help out with things like this?  :)

The proposed readfile sounds great and would help, but if there are
1000 processes wouldn't that require 1000 readfile calls to read their
proc files?

With something like this the stats for 1000 processes could be
retrieved with an open, a few reads and a close.
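
As a rough sketch of the intended access pattern (not a benchmark, just
to show the syscall count):

#include <fcntl.h>
#include <unistd.h>

/* Sketch: one open(), a few large read()s and one close() retrieve the
 * stat lines for every visible process.
 */
static long dump_all_stat(char *buf, size_t bufsz)
{
	int fd = open("/proc/all/stat", O_RDONLY);
	ssize_t n;
	long total = 0;

	if (fd < 0)
		return -1;

	while ((n = read(fd, buf, bufsz)) > 0)
		total += n;	/* parse or copy out each chunk here */

	close(fd);
	return n < 0 ? -1 : total;
}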

> 
> > The proposed files in this proof-of-concept patch set are:
> > 
> > * /proc/all/stat
> 
> I think the problem will be defining "all" in the case of the specific
> namespace you are dealing with, right?  How will this handle all of
> > those issues properly for all of these different statistics?
> 

Currently I'm trying to re-use the existing code in fs/proc that
controls which PIDs are visible, but may well be missing something..


Best Wishes,
Eugene

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-10 15:27   ` Eugene Lubarsky
@ 2020-08-10 15:41     ` Greg KH
  2020-08-25  9:59       ` Eugene Lubarsky
  0 siblings, 1 reply; 16+ messages in thread
From: Greg KH @ 2020-08-10 15:41 UTC (permalink / raw)
  To: Eugene Lubarsky
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, avagin, dsahern

On Tue, Aug 11, 2020 at 01:27:00AM +1000, Eugene Lubarsky wrote:
> On Mon, 10 Aug 2020 17:04:53 +0200
> Greg KH <gregkh@linuxfoundation.org> wrote:
> > How many syscalls does this save on?
> > 
> > Perhaps you want my proposed readfile(2) syscall:
> > 	https://lore.kernel.org/r/20200704140250.423345-1-gregkh@linuxfoundation.org
> > to help out with things like this?  :)
> 
> The proposed readfile sounds great and would help, but if there are
> 1000 processes wouldn't that require 1000 readfile calls to read their
> proc files?

Yes, but that should be better than 1000 open, 1000 read, and then 1000
close calls, right?  :)

> With something like this the stats for 1000 processes could be
> retrieved with an open, a few reads and a close.

And have you benchmarked any of this?  Try working with the common tools
that want this information and see if it actually is noticeable (hint, I
have been doing that with the readfile work and it's surprising what the
results are in places...)

> 
> > 
> > > The proposed files in this proof-of-concept patch set are:
> > > 
> > > * /proc/all/stat
> > 
> > I think the problem will be defining "all" in the case of the specific
> > namespace you are dealing with, right?  How will this handle all of
> > those issues properly for all of these different statistics?
> > 
> 
> Currently I'm trying to re-use the existing code in fs/proc that
> controls which PIDs are visible, but may well be missing something..

Try it out and see if it works correctly.  And pid namespaces are not
the only thing these days from what I recall :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-10 14:58 [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Eugene Lubarsky
                   ` (5 preceding siblings ...)
  2020-08-10 15:04 ` [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes Greg KH
@ 2020-08-12  7:51 ` Andrei Vagin
  2020-08-13  4:47   ` David Ahern
  2020-08-13 15:01   ` Eugene Lubarsky
  6 siblings, 2 replies; 16+ messages in thread
From: Andrei Vagin @ 2020-08-12  7:51 UTC (permalink / raw)
  To: Eugene Lubarsky
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, dsahern

On Tue, Aug 11, 2020 at 12:58:47AM +1000, Eugene Lubarsky wrote:
> This is an idea for substantially reducing the number of syscalls needed
> by monitoring tools whilst mostly re-using the existing API.
> 
> The proposed files in this proof-of-concept patch set are:
> 
> * /proc/all/stat
>       A stat line for each process in the existing format.
> 
> * /proc/all/statm
>       statm lines but starting with a PID column.
> 
> * /proc/all/status
>       status info for all processes in the existing format.
> 
> * /proc/all/io
>       The existing /proc/pid/io data but formatted as a single line for
>       each process, similarly to stat/statm, with a PID column added.
> 
> * /proc/all/statx
>       Gathers info from stat, statm and io; the purpose is actually
>       not so much to reduce syscalls but to help userspace be more
>       efficient by not having to store data in e.g. hashtables in order
>       to gather it from separate /proc/all/ files.
> 
>       The format proposed here starts with the unchanged stat line
>       and begins the other info with a few characters, repeating for
>       each process:
> 
>       ...
>       25 (cat) R 1 1 0 0 -1 4194304 185 0 16 0 2 0 0 0 20 ...
>       m 662 188 167 5 0 112 0
>       io 4292 0 12 0 0 0 0
>       ...
> 
> 
> There has been a proposal with some overlapping goals: /proc/task-diag
> (https://github.com/avagin/linux-task-diag), but I'm not sure about
> its current status.

I rebased the task_diag patches on top of v5.8:
https://github.com/avagin/linux-task-diag/tree/v5.8-task-diag

/proc/pid files have three major limitations:
* Requires at least three syscalls per process per file
  open(), read(), close()
* Variety of formats, mostly text based
  The kernel spends time encoding binary data into a text format, and
  then tools like top and ps spend time decoding it back into a binary
  format.
* Sometimes slow due to extra attributes
  For example, /proc/PID/smaps contains a lot of useful information
  about memory mappings and memory consumption for each of them. But
  even if we don't need the memory consumption fields, the kernel will
  still spend time collecting that information.
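
To make the first point concrete, the traditional pattern is roughly
this sketch (essentially what task_proc_all, linked below, does; error
handling trimmed):

#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: open()/read()/close() once per PID, so the number of syscalls
 * grows linearly with the number of processes.
 */
static void read_all_status(void)
{
	DIR *proc = opendir("/proc");
	struct dirent *de;
	char path[64], buf[4096];
	int fd;

	if (!proc)
		return;

	while ((de = readdir(proc))) {
		if (de->d_name[0] < '0' || de->d_name[0] > '9')
			continue;	/* not a PID directory */

		snprintf(path, sizeof(path), "/proc/%s/status", de->d_name);
		fd = open(path, O_RDONLY);
		if (fd < 0)
			continue;	/* the process may have exited */
		while (read(fd, buf, sizeof(buf)) > 0)
			;		/* consume the file */
		close(fd);
	}
	closedir(proc);
}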

More details and numbers are in this article:
https://avagin.github.io/how-fast-is-procfs

This new interface avoids only one of these limitations, whereas
task_diag avoids all of them.

And I compared how fast each of these interfaces is:

The test environment:
CPU: Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz
RAM: 16GB
kernel: v5.8 with task_diag and /proc/all patches.
100K processes:
$ ps ax | wc -l
10228

$ time cat /proc/all/status > /dev/null

real	0m0.577s
user	0m0.017s
sys	0m0.559s

task_proc_all is used to read /proc/pid/status for all tasks:
https://github.com/avagin/linux-task-diag/blob/master/tools/testing/selftests/task_diag/task_proc_all.c

$ time ./task_proc_all status
tasks: 100230

real	0m0.924s
user	0m0.054s
sys	0m0.858s


/proc/all/status is about 40% faster than /proc/*/status.

Now let's take a look at the perf output:

$ time perf record -g cat /proc/all/status > /dev/null
$ perf report
-   98.08%     1.38%  cat      [kernel.vmlinux]  [k] entry_SYSCALL_64
   - 96.70% entry_SYSCALL_64
      - do_syscall_64
         - 94.97% ksys_read
            - 94.80% vfs_read
               - 94.58% proc_reg_read
                  - seq_read
                     - 87.95% proc_pid_status
                        + 13.10% seq_put_decimal_ull_width
                        - 11.69% task_mem
                           + 9.48% seq_put_decimal_ull_width
                        + 10.63% seq_printf
                        - 10.35% cpuset_task_status_allowed
                           + seq_printf
                        - 9.84% render_sigset_t
                             1.61% seq_putc
                           + 1.61% seq_puts
                        + 4.99% proc_task_name
                        + 4.11% seq_puts
                        - 3.76% render_cap_t
                             2.38% seq_put_hex_ll
                           + 1.25% seq_puts
                          2.64% __task_pid_nr_ns
                        + 1.54% get_task_mm
                        + 1.34% __lock_task_sighand
                        + 0.70% from_kuid_munged
                          0.61% get_task_cred
                          0.56% seq_putc
                          0.52% hugetlb_report_usage
                          0.52% from_kgid_munged
                     + 4.30% proc_all_next
                     + 0.82% _copy_to_user 

We can see that the kernel spent more than 50% of the time encoding
binary data into a text format.

Now let's see how fast task_diag:

$ time ./task_diag_all all -c -q

real	0m0.087s
user	0m0.001s
sys	0m0.082s

Maybe we need to resurrect the task_diag series instead of inventing
another less-effective interface...

Thanks,
Andrei

> 
> 
> 
> Best Wishes,
> 
> Eugene
> 
> 
> Eugene Lubarsky (5):
>   fs/proc: Introduce /proc/all/stat
>   fs/proc: Introduce /proc/all/statm
>   fs/proc: Introduce /proc/all/status
>   fs/proc: Introduce /proc/all/io
>   fs/proc: Introduce /proc/all/statx
> 
>  fs/proc/base.c     | 215 +++++++++++++++++++++++++++++++++++++++++++--
>  fs/proc/internal.h |   1 +
>  fs/proc/root.c     |   1 +
>  3 files changed, 210 insertions(+), 7 deletions(-)
> 
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-12  7:51 ` Andrei Vagin
@ 2020-08-13  4:47   ` David Ahern
  2020-08-13  8:03     ` Andrei Vagin
  2020-08-13 15:01   ` Eugene Lubarsky
  1 sibling, 1 reply; 16+ messages in thread
From: David Ahern @ 2020-08-13  4:47 UTC (permalink / raw)
  To: Andrei Vagin, Eugene Lubarsky
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan

On 8/12/20 1:51 AM, Andrei Vagin wrote:
> 
> I rebased the task_diag patches on top of v5.8:
> https://github.com/avagin/linux-task-diag/tree/v5.8-task-diag

Thanks for updating the patches.

> 
> /proc/pid files have three major limitations:
> * Requires at least three syscalls per process per file
>   open(), read(), close()
> * Variety of formats, mostly text based
>   The kernel spent time to encode binary data into a text format and
>   then tools like top and ps spent time to decode them back to a binary
>   format.
> * Sometimes slow due to extra attributes
>   For example, /proc/PID/smaps contains a lot of useful informations
>   about memory mappings and memory consumption for each of them. But
>   even if we don't need memory consumption fields, the kernel will
>   spend time to collect this information.

that's what I recall as well.

> 
> More details and numbers are in this article:
> https://avagin.github.io/how-fast-is-procfs
> 
> This new interface doesn't have only one of these limitations, but
> task_diag doesn't have all of them.
> 
> And I compared how fast each of these interfaces:
> 
> The test environment:
> CPU: Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz
> RAM: 16GB
> kernel: v5.8 with task_diag and /proc/all patches.
> 100K processes:
> $ ps ax | wc -l
> 10228

100k processes but showing 10k here??

> 
> $ time cat /proc/all/status > /dev/null
> 
> real	0m0.577s
> user	0m0.017s
> sys	0m0.559s
> 
> task_proc_all is used to read /proc/pid/status for all tasks:
> https://github.com/avagin/linux-task-diag/blob/master/tools/testing/selftests/task_diag/task_proc_all.c
> 
> $ time ./task_proc_all status
> tasks: 100230
> 
> real	0m0.924s
> user	0m0.054s
> sys	0m0.858s
> 
> 
> /proc/all/status is about 40% faster than /proc/*/status.
> 
> Now let's take a look at the perf output:
> 
> $ time perf record -g cat /proc/all/status > /dev/null
> $ perf report
> -   98.08%     1.38%  cat      [kernel.vmlinux]  [k] entry_SYSCALL_64
>    - 96.70% entry_SYSCALL_64
>       - do_syscall_64
>          - 94.97% ksys_read
>             - 94.80% vfs_read
>                - 94.58% proc_reg_read
>                   - seq_read
>                      - 87.95% proc_pid_status
>                         + 13.10% seq_put_decimal_ull_width
>                         - 11.69% task_mem
>                            + 9.48% seq_put_decimal_ull_width
>                         + 10.63% seq_printf
>                         - 10.35% cpuset_task_status_allowed
>                            + seq_printf
>                         - 9.84% render_sigset_t
>                              1.61% seq_putc
>                            + 1.61% seq_puts
>                         + 4.99% proc_task_name
>                         + 4.11% seq_puts
>                         - 3.76% render_cap_t
>                              2.38% seq_put_hex_ll
>                            + 1.25% seq_puts
>                           2.64% __task_pid_nr_ns
>                         + 1.54% get_task_mm
>                         + 1.34% __lock_task_sighand
>                         + 0.70% from_kuid_munged
>                           0.61% get_task_cred
>                           0.56% seq_putc
>                           0.52% hugetlb_report_usage
>                           0.52% from_kgid_munged
>                      + 4.30% proc_all_next
>                      + 0.82% _copy_to_user 
> 
> We can see that the kernel spent more than 50% of the time to encode binary
> data into a text format.
> 
> Now let's see how fast task_diag:
> 
> $ time ./task_diag_all all -c -q
> 
> real	0m0.087s
> user	0m0.001s
> sys	0m0.082s
> 
> Maybe we need resurrect the task_diag series instead of inventing
> another less-effective interface...

I think the netlink message design is the better way to go. As system
sizes continue to increase (> 100 cpus is common now) you need to be
able to pass the right data to userspace as fast as possible to keep up
with what can be a very dynamic userspace and set of processes.

When you first proposed this idea I was working on systems with >= 1k
cpus and the netlink option was able to keep up with a 'make -j N' on
those systems. `perf record` walking /proc would never finish
initializing - I had to add a "done initializing" message to know when
to start a test. With the task_diag approach, perf could collect the
data in short order and move on to recording data.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-13  4:47   ` David Ahern
@ 2020-08-13  8:03     ` Andrei Vagin
  0 siblings, 0 replies; 16+ messages in thread
From: Andrei Vagin @ 2020-08-13  8:03 UTC (permalink / raw)
  To: David Ahern
  Cc: Eugene Lubarsky, linux-api, linux-fsdevel, linux-kernel, adobriyan

On Wed, Aug 12, 2020 at 10:47:32PM -0600, David Ahern wrote:
> On 8/12/20 1:51 AM, Andrei Vagin wrote:
> > 
> > I rebased the task_diag patches on top of v5.8:
> > https://github.com/avagin/linux-task-diag/tree/v5.8-task-diag
> 
> Thanks for updating the patches.
> 
> > 
> > /proc/pid files have three major limitations:
> > * Requires at least three syscalls per process per file
> >   open(), read(), close()
> > * Variety of formats, mostly text based
> >   The kernel spent time to encode binary data into a text format and
> >   then tools like top and ps spent time to decode them back to a binary
> >   format.
> > * Sometimes slow due to extra attributes
> >   For example, /proc/PID/smaps contains a lot of useful informations
> >   about memory mappings and memory consumption for each of them. But
> >   even if we don't need memory consumption fields, the kernel will
> >   spend time to collect this information.
> 
> that's what I recall as well.
> 
> > 
> > More details and numbers are in this article:
> > https://avagin.github.io/how-fast-is-procfs
> > 
> > This new interface doesn't have only one of these limitations, but
> > task_diag doesn't have all of them.
> > 
> > And I compared how fast each of these interfaces:
> > 
> > The test environment:
> > CPU: Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz
> > RAM: 16GB
> > kernel: v5.8 with task_diag and /proc/all patches.
> > 100K processes:
> > $ ps ax | wc -l
> > 10228
> 
> 100k processes but showing 10k here??

I'm sure that one zero has escaped from there. task_proc_all shows the
number of tasks too, and it reports 100230.

> 
> > 
> > $ time cat /proc/all/status > /dev/null
> > 
> > real	0m0.577s
> > user	0m0.017s
> > sys	0m0.559s
> > 
> > task_proc_all is used to read /proc/pid/status for all tasks:
> > https://github.com/avagin/linux-task-diag/blob/master/tools/testing/selftests/task_diag/task_proc_all.c
> > 
> > $ time ./task_proc_all status
> > tasks: 100230
> > 

Thanks,
Andrei

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-12  7:51 ` Andrei Vagin
  2020-08-13  4:47   ` David Ahern
@ 2020-08-13 15:01   ` Eugene Lubarsky
  2020-08-20 17:41     ` Andrei Vagin
  1 sibling, 1 reply; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-13 15:01 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, dsahern

On Wed, 12 Aug 2020 00:51:35 -0700
Andrei Vagin <avagin@gmail.com> wrote:

> > Maybe we need to resurrect the task_diag series instead of inventing
> another less-effective interface...

I would certainly welcome the resurrection of task_diag - it is clearly
more efficient than this /proc/all/ idea. It would be good to find out
if there's anything in particular that's currently blocking it.

This RFC is mainly meant to check whether such an addition would
be acceptable from an API point of view. It currently has an obvious
performance issue in that seq_file seems to only return one page at a
time so lots of read syscalls are still required. However I may not
have the time to figure out a proposed fix for this by myself.
Regardless, text-based formats can't match the efficiency of task_diag,
but binary ones are also possible.



Best Wishes,
Eugene

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-13 15:01   ` Eugene Lubarsky
@ 2020-08-20 17:41     ` Andrei Vagin
  2020-08-25 10:00       ` Eugene Lubarsky
  0 siblings, 1 reply; 16+ messages in thread
From: Andrei Vagin @ 2020-08-20 17:41 UTC (permalink / raw)
  To: Eugene Lubarsky
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, dsahern,
	Andy Lutomirski, Arnd Bergmann, Oleg Nesterov

On Fri, Aug 14, 2020 at 01:01:00AM +1000, Eugene Lubarsky wrote:
> On Wed, 12 Aug 2020 00:51:35 -0700
> Andrei Vagin <avagin@gmail.com> wrote:
> 
> > > Maybe we need to resurrect the task_diag series instead of inventing
> > another less-effective interface...
> 
> I would certainly welcome the resurrection of task_diag - it is clearly
> more efficient than this /proc/all/ idea. It would be good to find out
> if there's anything in particular that's currently blocking it.

Unfortunately, I don't have enough time to lead the process of pushing
task_diag upstream. So if you are interested, you can restart this
process and I am ready to help as much as time permits.

I think the main blocking issue was a lack of interest from a wider
audience. Slow procfs is a problem for only a few users, but task_diag
is a big subsystem that repeats the functionality of another subsystem,
with all the problems that brings, such as code duplication.

Another blocking issue was the new interface itself; there was no
consensus on it. Initially I suggested using netlink sockets, but
developers from non-networking subsystems objected, so the transaction
file interface was introduced. The main idea, similar to netlink
sockets, is that we write a request and read a response.
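
A rough sketch of that transaction-file pattern, with hypothetical
request/response layouts (the real structures and file name are defined
by the task_diag patches in the tree linked above, not by this sketch):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Hypothetical request layout, for illustration only; the real
 * task_diag structures live in the linked tree.
 */
struct diag_request {
	uint64_t show_flags;	/* which attribute groups to return */
	uint32_t dump_strategy;	/* e.g. all tasks vs. a single pid */
	uint32_t pid;
};

static void diag_sketch(void)
{
	struct diag_request req = { .show_flags = ~0ULL };
	char buf[65536];
	ssize_t n;
	int fd = open("/proc/task-diag", O_RDWR);	/* placeholder path */

	if (fd < 0)
		return;

	/* Write a request describing what to dump... */
	if (write(fd, &req, sizeof(req)) == sizeof(req)) {
		/* ...then read back a stream of binary records. */
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			;	/* decode records here */
	}

	close(fd);
}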

There were some security concerns but I think I fixed them.

> 
> This RFC is mainly meant to check whether such an addition would
> be acceptable from an API point of view. It currently has an obvious
> performance issue in that seq_file seems to only return one page at a
> time so lots of read syscalls are still required. However I may not
> have the time to figure out a proposed fix for this by myself.
> Regardless, text-based formats can't match the efficiency of task_diag,
> but binary ones are also possible.

I don't have objections to this series. It can be an option if we
will decide that we don't want to do a major rework here.


Thanks,
Andrei

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-10 15:41     ` Greg KH
@ 2020-08-25  9:59       ` Eugene Lubarsky
  0 siblings, 0 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-25  9:59 UTC (permalink / raw)
  To: Greg KH
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, avagin, dsahern

On Mon, 10 Aug 2020 17:41:32 +0200
Greg KH <gregkh@linuxfoundation.org> wrote:

> On Tue, Aug 11, 2020 at 01:27:00AM +1000, Eugene Lubarsky wrote:
> > On Mon, 10 Aug 2020 17:04:53 +0200
> > Greg KH <gregkh@linuxfoundation.org> wrote:  
> And have you benchmarked any of this?  Try working with the common
> tools that want this information and see if it actually is noticeable
> (hint, I have been doing that with the readfile work and it's
> surprising what the results are in places...)

Apologies for the delay. Here are some benchmarks with atop.

Patch to atop at: https://github.com/eug48/atop/commits/proc-all
Patch to add /proc/all/schedstat & cpuset below.
atop not collecting threads & cmdline as /proc/all/ doesn't support it.
10,000 processes, kernel 5.8, nested KVM, 2 cores of i7-6700HQ @ 2.60GHz

# USE_PROC_ALL=0 ./atop -w test 1 &
# pidstat -p $(pidof atop) 1

01:33:05   %usr %system  %guest   %wait    %CPU   CPU  Command
01:33:06  33.66   33.66    0.00    0.99   67.33     1  atop
01:33:07  33.00   32.00    0.00    2.00   65.00     0  atop
01:33:08  34.00   31.00    0.00    1.00   65.00     0  atop
...
Average:  33.15   32.79    0.00    1.09   65.94     -  atop


# USE_PROC_ALL=1 ./atop -w test 1 &
# pidstat -p $(pidof atop) 1

01:33:33   %usr %system  %guest   %wait    %CPU   CPU  Command
01:33:34  28.00   14.00    0.00    1.00   42.00     1  atop
01:33:35  28.00   14.00    0.00    0.00   42.00     1  atop
01:33:36  26.00   13.00    0.00    0.00   39.00     1  atop
...
Average:  27.08   12.86    0.00    0.35   39.94     -  atop

So CPU usage goes down from ~65% to ~40%.

Data collection times in milliseconds are:

# xsv cat columns proc.csv procall.csv \
> | xsv stats \
> | xsv select field,min,max,mean,stddev \
> | xsv table
field           min  max  mean     stddev
/proc time      558  625  586.59   18.29
/proc/all time  231  262  243.56   8.02

Much performance optimisation can still be done, e.g. the modified atop
uses fgets(), which reads 1KB at a time, and seq_file seems to return
only 4KB pages. task_diag should be much faster still.
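
For example, a sketch of what replacing the fgets() loop with larger raw
reads could look like, assuming the consumer does its own line splitting
and that no single line exceeds the carry-over space:

#define _GNU_SOURCE	/* for memrchr() */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Sketch: read /proc/all/stat in large chunks instead of 1KB fgets()
 * calls, carrying any partial trailing line over to the next read().
 */
static void chunked_read(void)
{
	char buf[256 * 1024];
	size_t carry = 0;
	ssize_t n;
	int fd = open("/proc/all/stat", O_RDONLY);

	if (fd < 0)
		return;

	while ((n = read(fd, buf + carry, sizeof(buf) - carry)) > 0) {
		char *end = memrchr(buf, '\n', carry + n);

		if (!end) {		/* no complete line yet */
			carry += n;
			continue;
		}
		/* parse the complete lines in [buf, end] here */
		carry = buf + carry + n - (end + 1);
		memmove(buf, end + 1, carry);
	}

	close(fd);
}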

I'd imagine this sort of thing would be useful for daemons monitoring
large numbers of processes. I don't run such systems myself; my initial
motivation was frustration with the Kubernetes kubelet having ~2-4% CPU
usage even with a couple of containers. Basic profiling suggests syscalls
have a lot to do with it - it's actually reading loads of tiny cgroup files
and enumerating many directories every 10 seconds, but /proc has similar
issues and seemed easier to start with.

Anyway, I've read that io_uring could also help here in the near future,
which would be really cool especially if there was a way to enumerate
directories and read many files regex-style in a single operation,
e.g. /proc/[0-9].*/(stat|statm|io)

> > Currently I'm trying to re-use the existing code in fs/proc that
> > controls which PIDs are visible, but may well be missing
> > something..  
> 
> Try it out and see if it works correctly.  And pid namespaces are not
> the only thing these days from what I recall :)
> 
I've tried `unshare --fork --pid --mount-proc cat /proc/all/stat`
which seems to behave correctly. ptrace flags are handled by the
existing code.


Best Wishes,
Eugene


From 2ffc2e388f7ce4e3f182c2442823e5f13bae03dd Mon Sep 17 00:00:00 2001
From: Eugene Lubarsky <elubarsky.linux@gmail.com>
Date: Tue, 25 Aug 2020 12:36:41 +1000
Subject: [RFC PATCH] fs/proc: /proc/all: add schedstat and cpuset

Signed-off-by: Eugene Lubarsky <elubarsky.linux@gmail.com>
---
 fs/proc/base.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0bba4b3a985e..44d73f1ade4a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3944,6 +3944,36 @@ static int proc_all_io(struct seq_file *m, void *v)
 }
 #endif
 
+#ifdef CONFIG_PROC_PID_CPUSET
+static int proc_all_cpuset(struct seq_file *m, void *v)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+	struct pid_namespace *ns = iter->ns;
+	struct task_struct *task = iter->tgid_iter.task;
+	struct pid *pid = task->thread_pid;
+
+	seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+	seq_puts(m, " ");
+
+	return proc_cpuset_show(m, ns, pid, task);
+}
+#endif
+
+#ifdef CONFIG_SCHED_INFO
+static int proc_all_schedstat(struct seq_file *m, void *v)
+{
+	struct all_iter *iter = (struct all_iter *) v;
+	struct pid_namespace *ns = iter->ns;
+	struct task_struct *task = iter->tgid_iter.task;
+	struct pid *pid = task->thread_pid;
+
+	seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+	seq_puts(m, " ");
+
+	return proc_pid_schedstat(m, ns, pid, task);
+}
+#endif
+
 static int proc_all_statx(struct seq_file *m, void *v)
 {
 	struct all_iter *iter = (struct all_iter *) v;
@@ -3990,6 +4020,12 @@ PROC_ALL_OPS(status);
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	PROC_ALL_OPS(io);
 #endif
+#ifdef CONFIG_SCHED_INFO
+	PROC_ALL_OPS(schedstat);
+#endif
+#ifdef CONFIG_PROC_PID_CPUSET
+	PROC_ALL_OPS(cpuset);
+#endif
 
 #define PROC_ALL_CREATE(NAME) \
 	do { \
@@ -4011,4 +4047,10 @@ void __init proc_all_init(void)
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	PROC_ALL_CREATE(io);
 #endif
+#ifdef CONFIG_SCHED_INFO
+	PROC_ALL_CREATE(schedstat);
+#endif
+#ifdef CONFIG_PROC_PID_CPUSET
+	PROC_ALL_CREATE(cpuset);
+#endif
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
  2020-08-20 17:41     ` Andrei Vagin
@ 2020-08-25 10:00       ` Eugene Lubarsky
  0 siblings, 0 replies; 16+ messages in thread
From: Eugene Lubarsky @ 2020-08-25 10:00 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-api, linux-fsdevel, linux-kernel, adobriyan, dsahern,
	Andy Lutomirski, Arnd Bergmann, Oleg Nesterov

On Thu, 20 Aug 2020 10:41:39 -0700
Andrei Vagin <avagin@gmail.com> wrote:
> Unfortunately, I don't have enough time to lead a process of pushing
> task_diag into the upstream. So if it is interesting for you, you can
> restart this process and I am ready to help as much as time will
> permit.
>
> I think the main blocking issue was a lack of interest from the wide
> audience to this. The slow proc is the problem just for a few users,
> but task_diag is a big subsystem that repeats functionality of another
> subsystem with all derived problems like code duplication.

Unfortunately I don't have much time either, and yes, it sounds like
upstreaming a new interface like this will require input & enthusiasm
from more of the people who are monitoring large numbers of processes,
which is not really me.

A related issue is that task_diag doesn't currently cover the cgroup
filesystem, which has the same issues as /proc and is accessed very
heavily by e.g. the Kubernetes kubelet's cadvisor. Perhaps more interest
in tackling this could come from the Kubernetes community.

> 
> Another blocking issue is a new interface. There was no consensus on
> this. Initially, I suggested to use netlink sockets, but developers
> from non-network subsystem objected on this, so the transaction file
> interface was introduced. The main idea similar to netlink sockets is
> that we write a request and read a response.
> 
> There were some security concerns but I think I fixed them.

There's currently a lot of momentum behind io_uring, which could not only
enable efficient enumeration and retrieval of small files, but might
also be a more natural place for an API like task_diag...



Best Wishes,
Eugene

^ permalink raw reply	[flat|nested] 16+ messages in thread
