linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files
@ 2018-08-13  6:58 Konstantin Khlebnikov
  2018-08-13  6:58 ` [PATCH RFC 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups Konstantin Khlebnikov
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Konstantin Khlebnikov @ 2018-08-13  6:58 UTC (permalink / raw)
  To: linux-mm, linux-kernel, cgroups
  Cc: Tejun Heo, Michal Hocko, Vladimir Davydov, Roman Gushchin,
	Johannes Weiner

After removing cgroup subsystem state could leak or live in background
forever because it is pinned by some reference. For example memory cgroup
could be pinned by pages in cache or tmpfs.

This patch adds common debugfs interface for listing basic state for each
controller. Controller could define callback for dumping own attributes.

In file /sys/kernel/debug/cgroup/<controller> each line shows state in
format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]

Common attributes:

css - css pointer
cgroup - cgroup pointer
id - css id
ino - cgroup inode
flags - css flags
refcnt - css atomic refcount, for online shows huge bias
path - cgroup path

This patch adds memcg attributes:

mem_id - 16-bit memory cgroup id
memory - charged pages
swap - charged swap

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 include/linux/cgroup-defs.h |    1 
 kernel/cgroup/cgroup.c      |   99 +++++++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c             |   12 +++++
 3 files changed, 112 insertions(+)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c0e68f903011..c828820e160f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -595,6 +595,7 @@ struct cgroup_subsys {
 	void (*exit)(struct task_struct *task);
 	void (*free)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
+	void (*css_dump)(struct cgroup_subsys_state *css, struct seq_file *);
 
 	bool early_init:1;
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 077370bf8964..b7be190daffe 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -39,6 +39,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
@@ -5978,3 +5979,101 @@ static int __init cgroup_sysfs_init(void)
 }
 subsys_initcall(cgroup_sysfs_init);
 #endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_DEBUG_FS
+void *css_debugfs_seqfile_start(struct seq_file *m, loff_t *pos)
+{
+	struct cgroup_subsys *ss = m->private;
+	struct cgroup_subsys_state *css;
+	int id = *pos;
+
+	rcu_read_lock();
+	css = idr_get_next(&ss->css_idr, &id);
+	*pos = id;
+	return css;
+}
+
+void *css_debugfs_seqfile_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct cgroup_subsys *ss = m->private;
+	struct cgroup_subsys_state *css;
+	int id = *pos + 1;
+
+	css = idr_get_next(&ss->css_idr, &id);
+	*pos = id;
+	return css;
+}
+
+void css_debugfs_seqfile_stop(struct seq_file *m, void *v)
+{
+	rcu_read_unlock();
+}
+
+int css_debugfs_seqfile_show(struct seq_file *m, void *v)
+{
+	struct cgroup_subsys *ss = m->private;
+	struct cgroup_subsys_state *css = v;
+	size_t buflen;
+	char *buf;
+	int len;
+
+	seq_printf(m, "css=%pK cgroup=%pK id=%d ino=%lu flags=%#x refcnt=%lu path=",
+		   css, css->cgroup, css->id, cgroup_ino(css->cgroup),
+		   css->flags, atomic_long_read(&css->refcnt.count));
+
+	buflen = seq_get_buf(m, &buf);
+	if (buf) {
+		len = cgroup_path(css->cgroup, buf, buflen);
+		seq_commit(m, len < buflen ? len : -1);
+	}
+
+	if (ss->css_dump) {
+		seq_puts(m, " -- ");
+		ss->css_dump(css, m);
+	}
+
+	seq_puts(m, "\n");
+	return 0;
+}
+
+static const struct seq_operations css_debug_seq_ops = {
+	.start = css_debugfs_seqfile_start,
+	.next = css_debugfs_seqfile_next,
+	.stop = css_debugfs_seqfile_stop,
+	.show = css_debugfs_seqfile_show,
+};
+
+static int css_debugfs_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &css_debug_seq_ops);
+	struct seq_file *m = file->private_data;
+
+	if (!ret)
+		m->private = inode->i_private;
+	return ret;
+}
+
+static const struct file_operations css_debugfs_fops = {
+	.open = css_debugfs_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static int __init css_debugfs_init(void)
+{
+	struct cgroup_subsys *ss;
+	struct dentry *dir;
+	int ssid;
+
+	dir = debugfs_create_dir("cgroup", NULL);
+	if (dir) {
+		for_each_subsys(ss, ssid)
+			debugfs_create_file(ss->name, 0644, dir, ss,
+					    &css_debugfs_fops);
+	}
+
+	return 0;
+}
+late_initcall(css_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2173f7e5164..19a4348974a4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4345,6 +4345,17 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	memcg_wb_domain_size_changed(memcg);
 }
 
+static void mem_cgroup_css_dump(struct cgroup_subsys_state *css,
+				struct seq_file *m)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	seq_printf(m, "mem_id=%u memory=%lu swap=%lu",
+		   mem_cgroup_id(memcg),
+		   page_counter_read(&memcg->memory),
+		   page_counter_read(&memcg->swap));
+}
+
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 static int mem_cgroup_do_precharge(unsigned long count)
@@ -5386,6 +5397,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.css_released = mem_cgroup_css_released,
 	.css_free = mem_cgroup_css_free,
 	.css_reset = mem_cgroup_css_reset,
+	.css_dump = mem_cgroup_css_dump,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.post_attach = mem_cgroup_move_task,


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH RFC 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups
  2018-08-13  6:58 [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Konstantin Khlebnikov
@ 2018-08-13  6:58 ` Konstantin Khlebnikov
  2018-08-22 14:58   ` Tejun Heo
  2018-08-13  6:58 ` [PATCH RFC 3/3] tools/vm/page-types: add flag for showing inodes " Konstantin Khlebnikov
  2018-08-13 13:48 ` [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Tejun Heo
  2 siblings, 1 reply; 8+ messages in thread
From: Konstantin Khlebnikov @ 2018-08-13  6:58 UTC (permalink / raw)
  To: linux-mm, linux-kernel, cgroups
  Cc: Tejun Heo, Michal Hocko, Vladimir Davydov, Roman Gushchin,
	Johannes Weiner

By default this interface reports inode number of closest online ancestor
if cgroups is offline (removed). Information about real owner is required
for detecting which pages keep removed cgroup.

This patch adds per-file mode which is changed by writing 64-bit flags
into opened /proc/kpagecgroup. For now only first bit is used.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 Documentation/admin-guide/mm/pagemap.rst |    3 +++
 fs/proc/page.c                           |   24 ++++++++++++++++++++++--
 include/linux/memcontrol.h               |    2 +-
 mm/memcontrol.c                          |    5 +++--
 mm/memory-failure.c                      |    2 +-
 5 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 577af85beb41..b39d841ac560 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -80,6 +80,9 @@ There are four components to pagemap:
    memory cgroup each page is charged to, indexed by PFN. Only available when
    CONFIG_MEMCG is set.
 
+   For offline (removed) cgroup this returnes inode number of closest online
+   ancestor. Write 64-bit flag 1 into opened file for getting real owners.
+
 Short descriptions to the page flags
 ====================================
 
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 792c78a49174..337f526fcc27 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -248,6 +248,7 @@ static const struct file_operations proc_kpageflags_operations = {
 static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 				size_t count, loff_t *ppos)
 {
+	unsigned long flags = (unsigned long)file->private_data;
 	u64 __user *out = (u64 __user *)buf;
 	struct page *ppage;
 	unsigned long src = *ppos;
@@ -267,7 +268,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 			ppage = NULL;
 
 		if (ppage)
-			ino = page_cgroup_ino(ppage);
+			ino = page_cgroup_ino(ppage, !(flags & 1));
 		else
 			ino = 0;
 
@@ -289,9 +290,28 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 	return ret;
 }
 
+static ssize_t kpagecgroup_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	u64 flags;
+
+	if (count != 8)
+		return -EINVAL;
+
+	if (get_user(flags, buf))
+		return -EFAULT;
+
+	if (flags > 1)
+		return -EINVAL;
+
+	file->private_data = (void *)(unsigned long)flags;
+	return count;
+}
+
 static const struct file_operations proc_kpagecgroup_operations = {
 	.llseek = mem_lseek,
 	.read = kpagecgroup_read,
+	.write = kpagecgroup_write,
 };
 #endif /* CONFIG_MEMCG */
 
@@ -300,7 +320,7 @@ static int __init proc_page_init(void)
 	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
 #ifdef CONFIG_MEMCG
-	proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+	proc_create("kpagecgroup", 0600, NULL, &proc_kpagecgroup_operations);
 #endif
 	return 0;
 }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..a7c40522bef0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -444,7 +444,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
-ino_t page_cgroup_ino(struct page *page);
+ino_t page_cgroup_ino(struct page *page, bool online);
 
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19a4348974a4..7ef6ea9d5e4a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -333,6 +333,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 /**
  * page_cgroup_ino - return inode number of the memcg a page is charged to
  * @page: the page
+ * @online: return closest online ancestor
  *
  * Look up the closest online ancestor of the memory cgroup @page is charged to
  * and return its inode number or 0 if @page is not charged to any cgroup. It
@@ -343,14 +344,14 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
  * after page_cgroup_ino() returns, so it only should be used by callers that
  * do not care (such as procfs interfaces).
  */
-ino_t page_cgroup_ino(struct page *page)
+ino_t page_cgroup_ino(struct page *page, bool online)
 {
 	struct mem_cgroup *memcg;
 	unsigned long ino = 0;
 
 	rcu_read_lock();
 	memcg = READ_ONCE(page->mem_cgroup);
-	while (memcg && !(memcg->css.flags & CSS_ONLINE))
+	while (memcg && online && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
 		ino = cgroup_ino(memcg->css.cgroup);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d142b9b86dc..bd09c447e0ec 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -139,7 +139,7 @@ static int hwpoison_filter_task(struct page *p)
 	if (!hwpoison_filter_memcg)
 		return 0;
 
-	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
+	if (page_cgroup_ino(p, true) != hwpoison_filter_memcg)
 		return -EINVAL;
 
 	return 0;


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH RFC 3/3] tools/vm/page-types: add flag for showing inodes of offline cgroups
  2018-08-13  6:58 [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Konstantin Khlebnikov
  2018-08-13  6:58 ` [PATCH RFC 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups Konstantin Khlebnikov
@ 2018-08-13  6:58 ` Konstantin Khlebnikov
  2018-08-13 13:48 ` [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Tejun Heo
  2 siblings, 0 replies; 8+ messages in thread
From: Konstantin Khlebnikov @ 2018-08-13  6:58 UTC (permalink / raw)
  To: linux-mm, linux-kernel, cgroups
  Cc: Tejun Heo, Michal Hocko, Vladimir Davydov, Roman Gushchin,
	Johannes Weiner

With flag -R|--real-cgroup page-types will report real owner.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 tools/vm/page-types.c |   18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index cce853dca691..453dbbb9fe8b 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -173,6 +173,7 @@ static pid_t		opt_pid;	/* process to walk */
 const char		*opt_file;	/* file or directory path */
 static uint64_t		opt_cgroup;	/* cgroup inode */
 static int		opt_list_cgroup;/* list page cgroup */
+static int		opt_real_cgroup;/* real offline cgroup */
 static const char	*opt_kpageflags;/* kpageflags file to parse */
 
 #define MAX_ADDR_RANGES	1024
@@ -789,6 +790,7 @@ static void usage(void)
 "            -l|--list                  Show page details in ranges\n"
 "            -L|--list-each             Show page details one by one\n"
 "            -C|--list-cgroup           Show cgroup inode for pages\n"
+"            -R|--real-cgroup           Show real offline cgroups\n"
 "            -N|--no-summary            Don't show summary info\n"
 "            -X|--hwpoison              hwpoison pages\n"
 "            -x|--unpoison              unpoison pages\n"
@@ -1193,6 +1195,7 @@ static const struct option opts[] = {
 	{ "list"      , 0, NULL, 'l' },
 	{ "list-each" , 0, NULL, 'L' },
 	{ "list-cgroup", 0, NULL, 'C' },
+	{ "real-cgroup", 0, NULL, 'R' },
 	{ "no-summary", 0, NULL, 'N' },
 	{ "hwpoison"  , 0, NULL, 'X' },
 	{ "unpoison"  , 0, NULL, 'x' },
@@ -1208,7 +1211,7 @@ int main(int argc, char *argv[])
 	page_size = getpagesize();
 
 	while ((c = getopt_long(argc, argv,
-				"rp:f:a:b:d:c:ClLNXxF:h", opts, NULL)) != -1) {
+				"rp:f:a:b:d:c:CRlLNXxF:h", opts, NULL)) != -1) {
 		switch (c) {
 		case 'r':
 			opt_raw = 1;
@@ -1231,6 +1234,9 @@ int main(int argc, char *argv[])
 		case 'C':
 			opt_list_cgroup = 1;
 			break;
+		case 'R':
+			opt_real_cgroup = 1;
+			break;
 		case 'd':
 			describe_flags(optarg);
 			exit(0);
@@ -1266,7 +1272,15 @@ int main(int argc, char *argv[])
 	if (!opt_kpageflags)
 		opt_kpageflags = PROC_KPAGEFLAGS;
 
-	if (opt_cgroup || opt_list_cgroup)
+	if (opt_real_cgroup) {
+		uint64_t flags = 1;
+
+		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDWR);
+		if (write(kpagecgroup_fd, &flags, sizeof(flags)) < 0) {
+			perror(PROC_KPAGECGROUP);
+			exit(EXIT_FAILURE);
+		}
+	} else if (opt_cgroup || opt_list_cgroup)
 		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
 
 	if (opt_list && opt_pid)


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files
  2018-08-13  6:58 [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Konstantin Khlebnikov
  2018-08-13  6:58 ` [PATCH RFC 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups Konstantin Khlebnikov
  2018-08-13  6:58 ` [PATCH RFC 3/3] tools/vm/page-types: add flag for showing inodes " Konstantin Khlebnikov
@ 2018-08-13 13:48 ` Tejun Heo
  2018-08-13 17:11   ` Johannes Weiner
  2 siblings, 1 reply; 8+ messages in thread
From: Tejun Heo @ 2018-08-13 13:48 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-mm, linux-kernel, cgroups, Michal Hocko, Vladimir Davydov,
	Roman Gushchin, Johannes Weiner

Hello, Konstantin.

On Mon, Aug 13, 2018 at 09:58:05AM +0300, Konstantin Khlebnikov wrote:
> After removing cgroup subsystem state could leak or live in background
> forever because it is pinned by some reference. For example memory cgroup
> could be pinned by pages in cache or tmpfs.
> 
> This patch adds common debugfs interface for listing basic state for each
> controller. Controller could define callback for dumping own attributes.
> 
> In file /sys/kernel/debug/cgroup/<controller> each line shows state in
> format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]

Seems pretty useful to me.  Roman, Johannes, what do you guys think?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files
  2018-08-13 13:48 ` [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Tejun Heo
@ 2018-08-13 17:11   ` Johannes Weiner
  2018-08-13 17:53     ` Roman Gushchin
  0 siblings, 1 reply; 8+ messages in thread
From: Johannes Weiner @ 2018-08-13 17:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Konstantin Khlebnikov, linux-mm, linux-kernel, cgroups,
	Michal Hocko, Vladimir Davydov, Roman Gushchin

On Mon, Aug 13, 2018 at 06:48:42AM -0700, Tejun Heo wrote:
> Hello, Konstantin.
> 
> On Mon, Aug 13, 2018 at 09:58:05AM +0300, Konstantin Khlebnikov wrote:
> > After removing cgroup subsystem state could leak or live in background
> > forever because it is pinned by some reference. For example memory cgroup
> > could be pinned by pages in cache or tmpfs.
> > 
> > This patch adds common debugfs interface for listing basic state for each
> > controller. Controller could define callback for dumping own attributes.
> > 
> > In file /sys/kernel/debug/cgroup/<controller> each line shows state in
> > format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]
> 
> Seems pretty useful to me.  Roman, Johannes, what do you guys think?

Generally I like the idea of having more introspection into offlined
cgroups, but I wonder if having only memory= and swap= could be a
little too terse to track down what exactly is pinning the groups.

Roman has more experience debugging these pileups, but it seems to me
that unless we add a breakdown off memory, and maybe make slabinfo
available for these groups, that in practice this might not provide
that much more insight than per-cgroup stat counters of dead children.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files
  2018-08-13 17:11   ` Johannes Weiner
@ 2018-08-13 17:53     ` Roman Gushchin
  2018-08-14  9:40       ` Konstantin Khlebnikov
  0 siblings, 1 reply; 8+ messages in thread
From: Roman Gushchin @ 2018-08-13 17:53 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Tejun Heo, Konstantin Khlebnikov, linux-mm, linux-kernel,
	cgroups, Michal Hocko, Vladimir Davydov

On Mon, Aug 13, 2018 at 01:11:19PM -0400, Johannes Weiner wrote:
> On Mon, Aug 13, 2018 at 06:48:42AM -0700, Tejun Heo wrote:
> > Hello, Konstantin.
> > 
> > On Mon, Aug 13, 2018 at 09:58:05AM +0300, Konstantin Khlebnikov wrote:
> > > After removing cgroup subsystem state could leak or live in background
> > > forever because it is pinned by some reference. For example memory cgroup
> > > could be pinned by pages in cache or tmpfs.
> > > 
> > > This patch adds common debugfs interface for listing basic state for each
> > > controller. Controller could define callback for dumping own attributes.
> > > 
> > > In file /sys/kernel/debug/cgroup/<controller> each line shows state in
> > > format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]
> > 
> > Seems pretty useful to me.  Roman, Johannes, what do you guys think?

Totally agree with the idea and was about to suggest something similar.

> Generally I like the idea of having more introspection into offlined
> cgroups, but I wonder if having only memory= and swap= could be a
> little too terse to track down what exactly is pinning the groups.
> 
> Roman has more experience debugging these pileups, but it seems to me
> that unless we add a breakdown off memory, and maybe make slabinfo
> available for these groups, that in practice this might not provide
> that much more insight than per-cgroup stat counters of dead children.

I agree here.

It's hard to say in advance what numbers are useful, so let's export
these numbers, but also make the format more extendable, so we can
easily add new information later. Maybe, something like:

cgroup {
  path = ...
  ino = ...
  main css {
    refcnt = ...
    key = value
    ...
  }
  memcg css {
    refcnt = ...
    ...
  }
  some other controller css {
  }
  ...
}

Also, because we do batch charges, printing numbers without draining stocks
is not that useful. All stats are also per-cpu cached, what adds some
inaccuracy.

Thanks!

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files
  2018-08-13 17:53     ` Roman Gushchin
@ 2018-08-14  9:40       ` Konstantin Khlebnikov
  0 siblings, 0 replies; 8+ messages in thread
From: Konstantin Khlebnikov @ 2018-08-14  9:40 UTC (permalink / raw)
  To: Roman Gushchin, Johannes Weiner
  Cc: Tejun Heo, linux-mm, linux-kernel, cgroups, Michal Hocko,
	Vladimir Davydov

On 13.08.2018 20:53, Roman Gushchin wrote:
> On Mon, Aug 13, 2018 at 01:11:19PM -0400, Johannes Weiner wrote:
>> On Mon, Aug 13, 2018 at 06:48:42AM -0700, Tejun Heo wrote:
>>> Hello, Konstantin.
>>>
>>> On Mon, Aug 13, 2018 at 09:58:05AM +0300, Konstantin Khlebnikov wrote:
>>>> After removing cgroup subsystem state could leak or live in background
>>>> forever because it is pinned by some reference. For example memory cgroup
>>>> could be pinned by pages in cache or tmpfs.
>>>>
>>>> This patch adds common debugfs interface for listing basic state for each
>>>> controller. Controller could define callback for dumping own attributes.
>>>>
>>>> In file /sys/kernel/debug/cgroup/<controller> each line shows state in
>>>> format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]
>>>
>>> Seems pretty useful to me.  Roman, Johannes, what do you guys think?
> 
> Totally agree with the idea and was about to suggest something similar.
> 
>> Generally I like the idea of having more introspection into offlined
>> cgroups, but I wonder if having only memory= and swap= could be a
>> little too terse to track down what exactly is pinning the groups.
>>
>> Roman has more experience debugging these pileups, but it seems to me
>> that unless we add a breakdown off memory, and maybe make slabinfo
>> available for these groups, that in practice this might not provide
>> that much more insight than per-cgroup stat counters of dead children.
> 
> I agree here.

I don't think that we could cover all cases with single interface.

This debugfs just gives simple entry point for debugging:
- paths for guessing user
- pointers for looking with gdb via kcore
- inode numbers for page-types - see second and third patch

For slab: this could show one of remaining slab. Anyway each of them pins css.

> 
> It's hard to say in advance what numbers are useful, so let's export
> these numbers, but also make the format more extendable, so we can
> easily add new information later. Maybe, something like:
> 
> cgroup {
>    path = ...
>    ino = ...
>    main css {
>      refcnt = ...
>      key = value
>      ...
>    }
>    memcg css {
>      refcnt = ...
>      ...
>    }
>    some other controller css {
>    }
>    ...
> }
> 
> Also, because we do batch charges, printing numbers without draining stocks
> is not that useful. All stats are also per-cpu cached, what adds some
> inaccuracy.

Seems too verbose. And one-line key=value format is more grep/awk friendly.

Anyway such extended debugging could be implemented as gdb plugin.
While simple list of pointers gives enough information for dumping structures
with gdb alone without extra plugins.

> 
> Thanks!
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH RFC 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups
  2018-08-13  6:58 ` [PATCH RFC 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups Konstantin Khlebnikov
@ 2018-08-22 14:58   ` Tejun Heo
  0 siblings, 0 replies; 8+ messages in thread
From: Tejun Heo @ 2018-08-22 14:58 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-mm, linux-kernel, cgroups, Michal Hocko, Vladimir Davydov,
	Roman Gushchin, Johannes Weiner

Hello,

On Mon, Aug 13, 2018 at 09:58:10AM +0300, Konstantin Khlebnikov wrote:
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 19a4348974a4..7ef6ea9d5e4a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -333,6 +333,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
>  /**
>   * page_cgroup_ino - return inode number of the memcg a page is charged to
>   * @page: the page
> + * @online: return closest online ancestor
>   *
>   * Look up the closest online ancestor of the memory cgroup @page is charged to
>   * and return its inode number or 0 if @page is not charged to any cgroup. It
> @@ -343,14 +344,14 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
>   * after page_cgroup_ino() returns, so it only should be used by callers that
>   * do not care (such as procfs interfaces).
>   */
> -ino_t page_cgroup_ino(struct page *page)
> +ino_t page_cgroup_ino(struct page *page, bool online)
>  {
>  	struct mem_cgroup *memcg;
>  	unsigned long ino = 0;
>  
>  	rcu_read_lock();
>  	memcg = READ_ONCE(page->mem_cgroup);
> -	while (memcg && !(memcg->css.flags & CSS_ONLINE))
> +	while (memcg && online && !(memcg->css.flags & CSS_ONLINE))
>  		memcg = parent_mem_cgroup(memcg);
>  	if (memcg)
>  		ino = cgroup_ino(memcg->css.cgroup);

We pin the ino till the cgroup is actually released now but that's an
implementation detail which may change in the future, so I'm not sure
this is a good idea.  Can you instead use the 64bit filehandle exposed
by kernfs?  That's currently also based on ino (+gen) but it's
something guarnateed to stay unique per cgroup and you can easily get
to the cgroup using the fh too.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-08-22 14:58 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-13  6:58 [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Konstantin Khlebnikov
2018-08-13  6:58 ` [PATCH RFC 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups Konstantin Khlebnikov
2018-08-22 14:58   ` Tejun Heo
2018-08-13  6:58 ` [PATCH RFC 3/3] tools/vm/page-types: add flag for showing inodes " Konstantin Khlebnikov
2018-08-13 13:48 ` [PATCH RFC 1/3] cgroup: list all subsystem states in debugfs files Tejun Heo
2018-08-13 17:11   ` Johannes Weiner
2018-08-13 17:53     ` Roman Gushchin
2018-08-14  9:40       ` Konstantin Khlebnikov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).