* [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-08-28 22:01 ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-08-28 22:01 UTC (permalink / raw)
  To: hannes, mhocko, akpm; +Cc: cgroups, linux-mm, vdavydov, kernel-team

task_struct->memcg_oom is a sub-struct containing fields which are
used for async memcg oom handling.  Most task_struct fields aren't
packaged this way, and the sub-struct can lead to unnecessary
alignment padding.  This patch flattens it.

* task.memcg_oom.memcg          -> task.memcg_in_oom
* task.memcg_oom.gfp_mask       -> task.memcg_oom_gfp_mask
* task.memcg_oom.order          -> task.memcg_oom_order
* task.memcg_oom.may_oom        -> task.memcg_may_oom

In addition, task.memcg_may_oom is relocated to where the other
bitfields are, which reduces the size of task_struct.
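
For illustration, a minimal stand-alone C sketch of the padding
argument (not kernel code; the struct and field names below are made
up and only mimic the task_struct situation):

  #include <stdio.h>

  struct nested_layout {			/* roughly the old layout */
  	unsigned flag_a:1;			/* existing task_struct bitfields */
  	unsigned flag_b:1;
  	struct {				/* the old memcg_oom sub-struct */
  		void *memcg;
  		unsigned int gfp_mask;
  		int order;
  		unsigned int may_oom:1;		/* needs its own word plus tail padding */
  	} memcg_oom;
  };

  struct flat_layout {				/* roughly the new layout */
  	unsigned flag_a:1;
  	unsigned flag_b:1;
  	unsigned memcg_may_oom:1;		/* shares the existing bitfield word */
  	void *memcg_in_oom;
  	unsigned int memcg_oom_gfp_mask;
  	int memcg_oom_order;
  };

  int main(void)
  {
  	printf("nested:    %zu bytes\n", sizeof(struct nested_layout));
  	printf("flattened: %zu bytes\n", sizeof(struct flat_layout));
  	return 0;
  }

On a typical LP64 build this prints 32 vs 24 bytes: buried in its own
sub-struct the one-bit flag costs a padded word, while packed next to
the other bitfields it costs nothing extra.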

Signed-off-by: Tejun Heo <tj@kernel.org>
---
Hello,

These two patches are what survived from the following patchset.

 http://lkml.kernel.org/g/1440775530-18630-1-git-send-email-tj@kernel.org

Thanks.

 include/linux/memcontrol.h |   10 +++++-----
 include/linux/sched.h      |   13 ++++++-------
 mm/memcontrol.c            |   16 ++++++++--------
 3 files changed, 19 insertions(+), 20 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -407,19 +407,19 @@ void mem_cgroup_print_oom_info(struct me
 
 static inline void mem_cgroup_oom_enable(void)
 {
-	WARN_ON(current->memcg_oom.may_oom);
-	current->memcg_oom.may_oom = 1;
+	WARN_ON(current->memcg_may_oom);
+	current->memcg_may_oom = 1;
 }
 
 static inline void mem_cgroup_oom_disable(void)
 {
-	WARN_ON(!current->memcg_oom.may_oom);
-	current->memcg_oom.may_oom = 0;
+	WARN_ON(!current->memcg_may_oom);
+	current->memcg_may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-	return p->memcg_oom.memcg;
+	return p->memcg_in_oom;
 }
 
 bool mem_cgroup_oom_synchronize(bool wait);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1451,7 +1451,9 @@ struct task_struct {
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 	unsigned sched_migrated:1;
-
+#ifdef CONFIG_MEMCG
+	unsigned memcg_may_oom:1;
+#endif
 #ifdef CONFIG_MEMCG_KMEM
 	unsigned memcg_kmem_skip_account:1;
 #endif
@@ -1782,12 +1784,9 @@ struct task_struct {
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 #ifdef CONFIG_MEMCG
-	struct memcg_oom_info {
-		struct mem_cgroup *memcg;
-		gfp_t gfp_mask;
-		int order;
-		unsigned int may_oom:1;
-	} memcg_oom;
+	struct mem_cgroup *memcg_in_oom;
+	gfp_t memcg_oom_gfp_mask;
+	int memcg_oom_order;
 #endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1652,7 +1652,7 @@ static void memcg_oom_recover(struct mem
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	if (!current->memcg_oom.may_oom)
+	if (!current->memcg_may_oom)
 		return;
 	/*
 	 * We are in the middle of the charge context here, so we
@@ -1669,9 +1669,9 @@ static void mem_cgroup_oom(struct mem_cg
 	 * and when we know whether the fault was overall successful.
 	 */
 	css_get(&memcg->css);
-	current->memcg_oom.memcg = memcg;
-	current->memcg_oom.gfp_mask = mask;
-	current->memcg_oom.order = order;
+	current->memcg_in_oom = memcg;
+	current->memcg_oom_gfp_mask = mask;
+	current->memcg_oom_order = order;
 }
 
 /**
@@ -1693,7 +1693,7 @@ static void mem_cgroup_oom(struct mem_cg
  */
 bool mem_cgroup_oom_synchronize(bool handle)
 {
-	struct mem_cgroup *memcg = current->memcg_oom.memcg;
+	struct mem_cgroup *memcg = current->memcg_in_oom;
 	struct oom_wait_info owait;
 	bool locked;
 
@@ -1721,8 +1721,8 @@ bool mem_cgroup_oom_synchronize(bool han
 	if (locked && !memcg->oom_kill_disable) {
 		mem_cgroup_unmark_under_oom(memcg);
 		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-					 current->memcg_oom.order);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+					 current->memcg_oom_order);
 	} else {
 		schedule();
 		mem_cgroup_unmark_under_oom(memcg);
@@ -1739,7 +1739,7 @@ bool mem_cgroup_oom_synchronize(bool han
 		memcg_oom_recover(memcg);
 	}
 cleanup:
-	current->memcg_oom.memcg = NULL;
+	current->memcg_in_oom = NULL;
 	css_put(&memcg->css);
 	return true;
 }


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH 2/2] memcg: always enable kmemcg on the default hierarchy
@ 2015-08-28 22:02   ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-08-28 22:02 UTC (permalink / raw)
  To: hannes, mhocko, akpm; +Cc: cgroups, linux-mm, vdavydov, kernel-team

On the default hierarchy, all memory consumption will be accounted
together and controlled by the same set of limits.  Enable kmemcg on
the default hierarchy by default.  Boot parameter "disable_kmemcg" can
be specified to turn it off.
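
For illustration (the surrounding options are made-up examples), the
parameter would simply be appended to the kernel command line:

  root=/dev/sda1 ro disable_kmemcg

which, as far as this patch goes, only suppresses the automatic kmem
activation on the default hierarchy; kmem limits configured explicitly
on legacy hierarchies are unaffected.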

v2: - v1 triggered an oops on nested cgroup creation.  Moved the
      enabling mechanism to memcg_propagate_kmem().
    - Bypass the busy test on kmem activation as it's unnecessary and
      gets confused by the controller being enabled on a cgroup which
      already has processes.
    - "disable_kmemcg" boot param added.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/memcontrol.c |   43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -346,6 +346,17 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
+
+static bool kmemcg_disabled;
+
+static int __init disable_kmemcg(char *s)
+{
+	kmemcg_disabled = true;
+	pr_info("memcg: kernel memory support disabled on cgroup2");
+	return 0;
+}
+__setup("disable_kmemcg", disable_kmemcg);
+
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  * The main reason for not using cgroup id for this:
@@ -2908,9 +2919,9 @@ static int memcg_activate_kmem(struct me
 	BUG_ON(memcg->kmem_acct_active);
 
 	/*
-	 * For simplicity, we won't allow this to be disabled.  It also can't
-	 * be changed if the cgroup has children already, or if tasks had
-	 * already joined.
+	 * On traditional hierarchies, for simplicity, we won't allow this
+	 * to be disabled.  It also can't be changed if the cgroup has
+	 * children already, or if tasks had already joined.
 	 *
 	 * If tasks join before we set the limit, a person looking at
 	 * kmem.usage_in_bytes will have no way to determine when it took
@@ -2919,13 +2930,15 @@ static int memcg_activate_kmem(struct me
 	 * After it first became limited, changes in the value of the limit are
 	 * of course permitted.
 	 */
-	mutex_lock(&memcg_create_mutex);
-	if (cgroup_has_tasks(memcg->css.cgroup) ||
-	    (memcg->use_hierarchy && memcg_has_children(memcg)))
-		err = -EBUSY;
-	mutex_unlock(&memcg_create_mutex);
-	if (err)
-		goto out;
+	if (!cgroup_on_dfl(memcg->css.cgroup)) {
+		mutex_lock(&memcg_create_mutex);
+		if (cgroup_has_tasks(memcg->css.cgroup) ||
+		    (memcg->use_hierarchy && memcg_has_children(memcg)))
+			err = -EBUSY;
+		mutex_unlock(&memcg_create_mutex);
+		if (err)
+			goto out;
+	}
 
 	memcg_id = memcg_alloc_cache_id();
 	if (memcg_id < 0) {
@@ -2978,10 +2991,14 @@ static int memcg_propagate_kmem(struct m
 
 	mutex_lock(&memcg_limit_mutex);
 	/*
-	 * If the parent cgroup is not kmem-active now, it cannot be activated
-	 * after this point, because it has at least one child already.
+	 * On the default hierarchy, automatically enable kmemcg unless
+	 * explicitly disabled by the boot param.  On traditional
+	 * hierarchies, inherit from the parent.  If the parent cgroup is
+	 * not kmem-active now, it cannot be activated after this point,
+	 * because it has at least one child already.
 	 */
-	if (memcg_kmem_is_active(parent))
+	if ((!kmemcg_disabled && cgroup_on_dfl(memcg->css.cgroup)) ||
+	    memcg_kmem_is_active(parent))
 		ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
 	mutex_unlock(&memcg_limit_mutex);
 	return ret;


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/2] memcg: always enable kmemcg on the default hierarchy
  2015-08-28 22:02   ` Tejun Heo
@ 2015-08-31 22:54   ` Andrew Morton
  -1 siblings, 0 replies; 41+ messages in thread
From: Andrew Morton @ 2015-08-31 22:54 UTC (permalink / raw)
  To: Tejun Heo; +Cc: hannes, mhocko, cgroups, linux-mm, vdavydov, kernel-team

On Fri, 28 Aug 2015 18:02:37 -0400 Tejun Heo <tj@kernel.org> wrote:

> On the default hierarchy, all memory consumption will be accounted
> together and controlled by the same set of limits.  Enable kmemcg on
> the default hierarchy by default.  Boot parameter "disable_kmemcg" can
> be specified to turn it off.
> 
> ...
>
>  mm/memcontrol.c |   43 ++++++++++++++++++++++++++++++-------------
>  1 file changed, 30 insertions(+), 13 deletions(-)

Some documentation updates will be needed?
 
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -346,6 +346,17 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
>  #endif
>  
>  #ifdef CONFIG_MEMCG_KMEM
> +
> +static bool kmemcg_disabled;
> +
> +static int __init disable_kmemcg(char *s)
> +{
> +	kmemcg_disabled = true;
> +	pr_info("memcg: kernel memory support disabled on cgroup2");

typo?

> +	return 0;
> +}
> +__setup("disable_kmemcg", disable_kmemcg);


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-01 15:25   ` Michal Hocko
  0 siblings, 0 replies; 41+ messages in thread
From: Michal Hocko @ 2015-09-01 15:25 UTC (permalink / raw)
  To: Tejun Heo; +Cc: hannes, akpm, cgroups, linux-mm, vdavydov, kernel-team

On Fri 28-08-15 18:01:58, Tejun Heo wrote:
> task_struct->memcg_oom is a sub-struct containing fields which are
> used for async memcg oom handling.  Most task_struct fields aren't
> packaged this way and it can lead to unnecessary alignment paddings.
> This patch flattens it.
> 
> * task.memcg_oom.memcg          -> task.memcg_in_oom
> * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> * task.memcg_oom.order          -> task.memcg_oom_order
> * task.memcg_oom.may_oom        -> task.memcg_may_oom
> 
> In addition, task.memcg_may_oom is relocated to where other bitfields
> are which reduces the size of task_struct.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
> Hello,
> 
> These two patches are what survived from the following patchset.
> 
>  http://lkml.kernel.org/g/1440775530-18630-1-git-send-email-tj@kernel.org
> 
> Thanks.
> 
>  include/linux/memcontrol.h |   10 +++++-----
>  include/linux/sched.h      |   13 ++++++-------
>  mm/memcontrol.c            |   16 ++++++++--------
>  3 files changed, 19 insertions(+), 20 deletions(-)
> 
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -407,19 +407,19 @@ void mem_cgroup_print_oom_info(struct me
>  
>  static inline void mem_cgroup_oom_enable(void)
>  {
> -	WARN_ON(current->memcg_oom.may_oom);
> -	current->memcg_oom.may_oom = 1;
> +	WARN_ON(current->memcg_may_oom);
> +	current->memcg_may_oom = 1;
>  }
>  
>  static inline void mem_cgroup_oom_disable(void)
>  {
> -	WARN_ON(!current->memcg_oom.may_oom);
> -	current->memcg_oom.may_oom = 0;
> +	WARN_ON(!current->memcg_may_oom);
> +	current->memcg_may_oom = 0;
>  }
>  
>  static inline bool task_in_memcg_oom(struct task_struct *p)
>  {
> -	return p->memcg_oom.memcg;
> +	return p->memcg_in_oom;
>  }
>  
>  bool mem_cgroup_oom_synchronize(bool wait);
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1451,7 +1451,9 @@ struct task_struct {
>  	unsigned sched_reset_on_fork:1;
>  	unsigned sched_contributes_to_load:1;
>  	unsigned sched_migrated:1;
> -
> +#ifdef CONFIG_MEMCG
> +	unsigned memcg_may_oom:1;
> +#endif
>  #ifdef CONFIG_MEMCG_KMEM
>  	unsigned memcg_kmem_skip_account:1;
>  #endif
> @@ -1782,12 +1784,9 @@ struct task_struct {
>  	unsigned long trace_recursion;
>  #endif /* CONFIG_TRACING */
>  #ifdef CONFIG_MEMCG
> -	struct memcg_oom_info {
> -		struct mem_cgroup *memcg;
> -		gfp_t gfp_mask;
> -		int order;
> -		unsigned int may_oom:1;
> -	} memcg_oom;
> +	struct mem_cgroup *memcg_in_oom;
> +	gfp_t memcg_oom_gfp_mask;
> +	int memcg_oom_order;
>  #endif
>  #ifdef CONFIG_UPROBES
>  	struct uprobe_task *utask;
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1652,7 +1652,7 @@ static void memcg_oom_recover(struct mem
>  
>  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
>  {
> -	if (!current->memcg_oom.may_oom)
> +	if (!current->memcg_may_oom)
>  		return;
>  	/*
>  	 * We are in the middle of the charge context here, so we
> @@ -1669,9 +1669,9 @@ static void mem_cgroup_oom(struct mem_cg
>  	 * and when we know whether the fault was overall successful.
>  	 */
>  	css_get(&memcg->css);
> -	current->memcg_oom.memcg = memcg;
> -	current->memcg_oom.gfp_mask = mask;
> -	current->memcg_oom.order = order;
> +	current->memcg_in_oom = memcg;
> +	current->memcg_oom_gfp_mask = mask;
> +	current->memcg_oom_order = order;
>  }
>  
>  /**
> @@ -1693,7 +1693,7 @@ static void mem_cgroup_oom(struct mem_cg
>   */
>  bool mem_cgroup_oom_synchronize(bool handle)
>  {
> -	struct mem_cgroup *memcg = current->memcg_oom.memcg;
> +	struct mem_cgroup *memcg = current->memcg_in_oom;
>  	struct oom_wait_info owait;
>  	bool locked;
>  
> @@ -1721,8 +1721,8 @@ bool mem_cgroup_oom_synchronize(bool han
>  	if (locked && !memcg->oom_kill_disable) {
>  		mem_cgroup_unmark_under_oom(memcg);
>  		finish_wait(&memcg_oom_waitq, &owait.wait);
> -		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
> -					 current->memcg_oom.order);
> +		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
> +					 current->memcg_oom_order);
>  	} else {
>  		schedule();
>  		mem_cgroup_unmark_under_oom(memcg);
> @@ -1739,7 +1739,7 @@ bool mem_cgroup_oom_synchronize(bool han
>  		memcg_oom_recover(memcg);
>  	}
>  cleanup:
> -	current->memcg_oom.memcg = NULL;
> +	current->memcg_in_oom = NULL;
>  	css_put(&memcg->css);
>  	return true;
>  }

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-02 11:45   ` Vladimir Davydov
  0 siblings, 0 replies; 41+ messages in thread
From: Vladimir Davydov @ 2015-09-02 11:45 UTC (permalink / raw)
  To: Tejun Heo; +Cc: hannes, mhocko, akpm, cgroups, linux-mm, kernel-team

On Fri, Aug 28, 2015 at 06:01:58PM -0400, Tejun Heo wrote:
> task_struct->memcg_oom is a sub-struct containing fields which are
> used for async memcg oom handling.  Most task_struct fields aren't
> packaged this way and it can lead to unnecessary alignment paddings.
> This patch flattens it.
> 
> * task.memcg_oom.memcg          -> task.memcg_in_oom
> * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> * task.memcg_oom.order          -> task.memcg_oom_order
> * task.memcg_oom.may_oom        -> task.memcg_may_oom
> 
> In addition, task.memcg_may_oom is relocated to where other bitfields
> are which reduces the size of task_struct.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>

Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 3/2] memcg: punt high overage reclaim to return-to-userland path
  2015-08-28 22:02   ` Tejun Heo
@ 2015-09-04 21:00   ` Tejun Heo
  2015-09-07  9:23       ` Michal Hocko
  2015-09-07 11:38       ` Vladimir Davydov
  -1 siblings, 2 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-04 21:00 UTC (permalink / raw)
  To: hannes, mhocko, akpm; +Cc: cgroups, linux-mm, vdavydov, kernel-team

Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped.  If a process performs
only speculative allocations, it can blow way past the high limit.
This is easily reproducible by simply doing "find /".  The slab/slub
allocator tries speculative allocations first, so as long as there is
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.

This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path.  If try_charge() detects that the high limit
is breached, it adds the overage to current->memcg_nr_pages_over_high
and schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.

As long as the kernel doesn't have a runaway allocation spree, this
should provide enough protection while making kmemcg behave more
consistently.
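
As a stand-alone sketch of the pattern (not kernel code; all names
below are invented stand-ins for the ones the patch adds), the charge
path only records the overage and flags pending work, and the reclaim
runs later at a safe point:

  #include <stdio.h>

  static unsigned int usage, high = 100;	/* page_counter stand-ins */
  static unsigned int nr_pages_over_high;	/* like current->memcg_nr_pages_over_high */
  static int notify_resume_pending;		/* like TIF_NOTIFY_RESUME */

  static void try_charge(unsigned int nr_pages)
  {
  	usage += nr_pages;			/* the charge itself goes through */
  	if (usage > high) {
  		nr_pages_over_high += nr_pages;	/* record only current's overage */
  		notify_resume_pending = 1;	/* punt, even for !__GFP_WAIT charges */
  	}
  }

  static void handle_over_high(void)		/* runs on "return to userland" */
  {
  	if (!nr_pages_over_high)
  		return;
  	/* stand-in for try_to_free_mem_cgroup_pages(..., GFP_KERNEL, ...) */
  	usage -= nr_pages_over_high;
  	nr_pages_over_high = 0;
  }

  int main(void)
  {
  	try_charge(60);
  	try_charge(60);				/* second charge crosses the high limit */
  	if (notify_resume_pending)
  		handle_over_high();		/* reclaim happens here, not in try_charge() */
  	printf("usage=%u high=%u\n", usage, high);
  	return 0;
  }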

v2: - Switched to reclaiming only the overage caused by current rather
      than the difference between usage and high as suggested by
      Michal.
    - Don't record the memcg which went over high limit.  This makes
      exit path handling unnecessary.  Dropped.
    - Drop mentions of avoiding high stack usage from description as
      suggested by Vladimir.  max limit still triggers direct reclaim.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
---
Hello,

Updated per previous discussions.

Thanks.

 include/linux/memcontrol.h |    6 +++++
 include/linux/sched.h      |    3 ++
 include/linux/tracehook.h  |    3 ++
 mm/memcontrol.c            |   47 +++++++++++++++++++++++++++++++++++++--------
 4 files changed, 51 insertions(+), 8 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -402,6 +402,8 @@ static inline int mem_cgroup_inactive_an
 	return inactive * inactive_ratio < active;
 }
 
+void mem_cgroup_handle_over_high(void);
+
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 				struct task_struct *p);
 
@@ -621,6 +623,10 @@ static inline void mem_cgroup_end_page_s
 {
 }
 
+static inline void mem_cgroup_handle_over_high(void)
+{
+}
+
 static inline void mem_cgroup_oom_enable(void)
 {
 }
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1787,6 +1787,9 @@ struct task_struct {
 	struct mem_cgroup *memcg_in_oom;
 	gfp_t memcg_oom_gfp_mask;
 	int memcg_oom_order;
+
+	/* number of pages to reclaim on returning to userland */
+	unsigned int memcg_nr_pages_over_high;
 #endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -50,6 +50,7 @@
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/task_work.h>
+#include <linux/memcontrol.h>
 struct linux_binprm;
 
 /*
@@ -188,6 +189,8 @@ static inline void tracehook_notify_resu
 	smp_mb__after_atomic();
 	if (unlikely(current->task_works))
 		task_work_run();
+
+	mem_cgroup_handle_over_high();
 }
 
 #endif	/* <linux/tracehook.h> */
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
 #include <linux/oom.h>
 #include <linux/lockdep.h>
 #include <linux/file.h>
+#include <linux/tracehook.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -1974,6 +1975,31 @@ static int memcg_cpu_hotplug_callback(st
 	return NOTIFY_OK;
 }
 
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+	unsigned int nr_pages = current->memcg_nr_pages_over_high;
+	struct mem_cgroup *memcg, *pos;
+
+	if (likely(!nr_pages))
+		return;
+
+	pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+	do {
+		if (page_counter_read(&pos->memory) <= pos->high)
+			continue;
+		mem_cgroup_events(pos, MEMCG_HIGH, 1);
+		try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+	} while ((pos = parent_mem_cgroup(pos)));
+
+	css_put(&memcg->css);
+	current->memcg_nr_pages_over_high = 0;
+}
+
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		      unsigned int nr_pages)
 {
@@ -2082,17 +2108,22 @@ done_restock:
 	css_get_many(&memcg->css, batch);
 	if (batch > nr_pages)
 		refill_stock(memcg, batch - nr_pages);
-	if (!(gfp_mask & __GFP_WAIT))
-		goto done;
+
 	/*
-	 * If the hierarchy is above the normal consumption range,
-	 * make the charging task trim their excess contribution.
+	 * If the hierarchy is above the normal consumption range, schedule
+	 * reclaim on returning to userland.  We can perform reclaim here
+	 * if __GFP_WAIT but let's always punt for simplicity and so that
+	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
+	 * not recorded as it most likely matches current's and won't
+	 * change in the meantime.  As high limit is checked again before
+	 * reclaim, the cost of mismatch is negligible.
 	 */
 	do {
-		if (page_counter_read(&memcg->memory) <= memcg->high)
-			continue;
-		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+		if (page_counter_read(&memcg->memory) > memcg->high) {
+			current->memcg_nr_pages_over_high += nr_pages;
+			set_notify_resume(current);
+			break;
+		}
 	} while ((memcg = parent_mem_cgroup(memcg)));
 done:
 	return ret;


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 3/2] memcg: punt high overage reclaim to return-to-userland path
@ 2015-09-07  9:23       ` Michal Hocko
  0 siblings, 0 replies; 41+ messages in thread
From: Michal Hocko @ 2015-09-07  9:23 UTC (permalink / raw)
  To: Tejun Heo; +Cc: hannes, akpm, cgroups, linux-mm, vdavydov, kernel-team

On Fri 04-09-15 17:00:11, Tejun Heo wrote:
> Currently, try_charge() tries to reclaim memory synchronously when the
> high limit is breached; however, if the allocation doesn't have
> __GFP_WAIT, synchronous reclaim is skipped.  If a process performs
> only speculative allocations, it can blow way past the high limit.
> This is actually easily reproducible by simply doing "find /".
> slab/slub allocator tries speculative allocations first, so as long as
> there's memory which can be consumed without blocking, it can keep
> allocating memory regardless of the high limit.
> 
> This patch makes try_charge() always punt the over-high reclaim to the
> return-to-userland path.  If try_charge() detects that high limit is
> breached, it adds the overage to current->memcg_nr_pages_over_high and
> schedules execution of mem_cgroup_handle_over_high() which performs
> synchronous reclaim from the return-to-userland path.

This also means that a killed task will not reclaim before it dies. This
shouldn't be a big deal, though, because the task should uncharge its
memory which will most likely belong to the memcg. More on that below.

> As long as kernel doesn't have a run-away allocation spree, this
> should provide enough protection while making kmemcg behave more
> consistently.

I would also point out that this approach allows for better reclaim
opportunities for GFP_NOFS charges, which are quite common with kmem
enabled.

> v2: - Switched to reclaiming only the overage caused by current rather
>       than the difference between usage and high as suggested by
>       Michal.
>     - Don't record the memcg which went over high limit.  This makes
>       exit path handling unnecessary.  Dropped.

Hmm, this makes it possible to leave a memcg in high limit excess. I
guess you are right that this is not something to lose sleep over.
Nevertheless, a nasty user could move the task away from within the
signal handler context, which runs before the reclaim. This looks like
a potential runaway, but migration outside of the restricted hierarchy
is a problem in itself, so I wouldn't consider this a problem.

>     - Drop mentions of avoiding high stack usage from description as
>       suggested by Vladimir.  max limit still triggers direct reclaim.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Vladimir Davydov <vdavydov@parallels.com>

Acked-by: Michal Hocko <mhocko@suse.com>

> +/*
> + * Scheduled by try_charge() to be executed from the userland return path
> + * and reclaims memory over the high limit.
> + */
> +void mem_cgroup_handle_over_high(void)
> +{
> +	unsigned int nr_pages = current->memcg_nr_pages_over_high;
> +	struct mem_cgroup *memcg, *pos;
> +
> +	if (likely(!nr_pages))
> +		return;

This is hooking into a hot path, so I guess it would be better to make
this part inline and have the rest go via a function call.
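
One way to read that suggestion, as a sketch rather than anything from
the patch (the helper name below is invented): keep only the cheap
check inline in the header and move the slow path out of line.

  /* include/linux/memcontrol.h */
  void mem_cgroup_do_handle_over_high(void);	/* out-of-line slow path */

  static inline void mem_cgroup_handle_over_high(void)
  {
  	if (likely(!current->memcg_nr_pages_over_high))
  		return;
  	mem_cgroup_do_handle_over_high();
  }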

> +
> +	pos = memcg = get_mem_cgroup_from_mm(current->mm);
> +
> +	do {
> +		if (page_counter_read(&pos->memory) <= pos->high)
> +			continue;
> +		mem_cgroup_events(pos, MEMCG_HIGH, 1);

I was thinking about when to emit the event when I realized we haven't
specified the semantics anywhere. It sounds more logical to emit the
event when the limit is breached rather than when we reclaim for it.  On
the other hand, we have only been doing it on reclaim, and GFP_NOWAIT
charges were not accounted. So this mimics the previous behavior. Same
for the max limit when we skip the charge. So I guess this is OK as
well.


> +		try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
> +	} while ((pos = parent_mem_cgroup(pos)));
> +
> +	css_put(&memcg->css);
> +	current->memcg_nr_pages_over_high = 0;
> +}
> +
>  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  		      unsigned int nr_pages)
>  {
> @@ -2082,17 +2108,22 @@ done_restock:

JFYI you can get rid of labels in the hunk headers of the patch format
by adding this to your git config:

[diff "default"]
        xfuncname = "^[[:alpha:]$_].*[^:]$"

I find it really nice and easier to read.
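
For example, with that setting the "@@ -2082,17 +2108,22 @@ done_restock:"
hunk header above would presumably show the enclosing function instead,
something like:

  @@ -2082,17 +2108,22 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,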
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 3/2] memcg: punt high overage reclaim to return-to-userland path
@ 2015-09-07 11:38       ` Vladimir Davydov
  0 siblings, 0 replies; 41+ messages in thread
From: Vladimir Davydov @ 2015-09-07 11:38 UTC (permalink / raw)
  To: Tejun Heo; +Cc: hannes, mhocko, akpm, cgroups, linux-mm, kernel-team

On Fri, Sep 04, 2015 at 05:00:11PM -0400, Tejun Heo wrote:
> Currently, try_charge() tries to reclaim memory synchronously when the
> high limit is breached; however, if the allocation doesn't have
> __GFP_WAIT, synchronous reclaim is skipped.  If a process performs
> only speculative allocations, it can blow way past the high limit.
> This is actually easily reproducible by simply doing "find /".
> slab/slub allocator tries speculative allocations first, so as long as
> there's memory which can be consumed without blocking, it can keep
> allocating memory regardless of the high limit.
> 
> This patch makes try_charge() always punt the over-high reclaim to the
> return-to-userland path.  If try_charge() detects that high limit is
> breached, it adds the overage to current->memcg_nr_pages_over_high and
> schedules execution of mem_cgroup_handle_over_high() which performs
> synchronous reclaim from the return-to-userland path.
> 
> As long as kernel doesn't have a run-away allocation spree, this
> should provide enough protection while making kmemcg behave more
> consistently.

Another good thing about such an approach is that it copes with
priority inversion. Currently, a task with a small memory.high might
issue memory.high reclaim on a kmem charge with a bunch of various
locks held. If a task with a big value of memory.high needs any of
these locks, it'll have to wait until the low prio task finishes
reclaim and releases the locks. By handing over reclaim to task_work
whenever possible we might avoid this issue and improve overall
performance.

> 
> v2: - Switched to reclaiming only the overage caused by current rather
>       than the difference between usage and high as suggested by
>       Michal.
>     - Don't record the memcg which went over high limit.  This makes
>       exit path handling unnecessary.  Dropped.
>     - Drop mentions of avoiding high stack usage from description as
>       suggested by Vladimir.  max limit still triggers direct reclaim.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Vladimir Davydov <vdavydov@parallels.com>

Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 3/2] memcg: punt high overage reclaim to return-to-userland path
@ 2015-09-08 16:59         ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-08 16:59 UTC (permalink / raw)
  To: Michal Hocko; +Cc: hannes, akpm, cgroups, linux-mm, vdavydov, kernel-team

Hello, Michal.

On Mon, Sep 07, 2015 at 11:23:46AM +0200, Michal Hocko wrote:
> > As long as kernel doesn't have a run-away allocation spree, this
> > should provide enough protection while making kmemcg behave more
> > consistently.
> 
> I would also point out that this approach allows for a better reclaim
> opportunities for GFP_NOFS charges which are quite common with kmem
> enabled.

Will update the description.

> > v2: - Switched to reclaiming only the overage caused by current rather
> >       than the difference between usage and high as suggested by
> >       Michal.
> >     - Don't record the memcg which went over high limit.  This makes
> >       exit path handling unnecessary.  Dropped.
> 
> Hmm, this allows to leave a memcg in a high limit excess. I guess
> you are right that this is not that likely to lose sleep over
> it. Nevertheless, a nasty user could move away from within signal
> handler context which runs before. This looks like a potential runaway
> but the migration outside of the restricted hierarchy is a problem in
> itself so I wouldn't consider this a problem.

Yeah, it's difficult to say that there isn't an exploitable sequence
which allows userland to breach the high limit from an exiting task.
That said, I think it'd be pretty difficult to make the overage large
enough to matter, given that the exiting task is losing whatever it's
pinning anyway and the excess it can leave behind is confined to what
it allocates in the kernel execution leading to the exit.  The only
reason I added the exit handling in the first place was to put the
memcg ref.

> > +void mem_cgroup_handle_over_high(void)
> > +{
> > +	unsigned int nr_pages = current->memcg_nr_pages_over_high;
> > +	struct mem_cgroup *memcg, *pos;
> > +
> > +	if (likely(!nr_pages))
> > +		return;
> 
> This is hooking into a hot path so I guess it would be better to make
> this part inline and the rest can go via function call.

Thought about that but it's already guarded by TIF_NOTIFY_RESUME.  It
shouldn't be a hot path.

> > @@ -2082,17 +2108,22 @@ done_restock:
> 
> JFYI you can get rid of labels in the patch format by
> [diff "default"]
>         xfuncname = "^[[:alpha:]$_].*[^:]$"

Heh, I had that in my config but apparently lost it while migrating to
a new machine.  Reinstating...

Thanks.

-- 
tejun


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 3/2] memcg: punt high overage reclaim to return-to-userland path
  2015-09-07 11:38       ` Vladimir Davydov
@ 2015-09-08 17:00         ` Tejun Heo
  -1 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-08 17:00 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, cgroups, linux-mm, kernel-team

Hello, Vladimir.

On Mon, Sep 07, 2015 at 02:38:22PM +0300, Vladimir Davydov wrote:
> > As long as kernel doesn't have a run-away allocation spree, this
> > should provide enough protection while making kmemcg behave more
> > consistently.
> 
> Another good thing about such an approach is that it copes with prio
> inversion. Currently, a task with small memory.high might issue
> memory.high reclaim on kmem charge with a bunch of various locks held.
> If a task with a big value of memory.high needs any of these locks,
> it'll have to wait until the low prio task finishes reclaim and releases
> the locks. By handing over reclaim to task_work whenever possible we
> might avoid this issue and improve overall performance.

Indeed, will update the patch accordingly.
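
To make the hand-off concrete, the mechanism under discussion works
roughly as sketched below (a simplified illustration based on this
thread, not the literal patch text; names follow the quoted code where
possible):

/* charge side, e.g. at the end of try_charge(); may be reached with
 * fs/mm locks held when the charge comes from a kmem allocation */
if (page_counter_read(&memcg->memory) > memcg->high) {
	current->memcg_nr_pages_over_high += nr_pages;
	set_notify_resume(current);	/* raise TIF_NOTIFY_RESUME */
}

/* return-to-userland side: the notify-resume hook then calls
 * mem_cgroup_handle_over_high(), which reclaims the recorded overage
 * and is free to sleep because nothing from the charge path is held */

Because the actual reclaim runs with no locks from the charging context,
a task with a small memory.high can no longer stall a task with a large
memory.high on a shared lock while it reclaims, which is exactly the
inversion described above.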

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 3/2] memcg: punt high overage reclaim to return-to-userland path
@ 2015-09-08 17:00         ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-08 17:00 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: hannes-druUgvl0LCNAfugRpC6u6w, mhocko-DgEjT+Ai2ygdnm+yROfE0A,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	kernel-team-b10kYP2dOMg

Hello, Vladimir.

On Mon, Sep 07, 2015 at 02:38:22PM +0300, Vladimir Davydov wrote:
> > As long as kernel doesn't have a run-away allocation spree, this
> > should provide enough protection while making kmemcg behave more
> > consistently.
> 
> Another good thing about such an approach is that it copes with prio
> inversion. Currently, a task with small memory.high might issue
> memory.high reclaim on kmem charge with a bunch of various locks held.
> If a task with a big value of memory.high needs any of these locks,
> it'll have to wait until the low prio task finishes reclaim and releases
> the locks. By handing over reclaim to task_work whenever possible we
> might avoid this issue and improve overall performance.

Indeed, will update the patch accordingly.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/2] memcg: always enable kmemcg on the default hierarchy
@ 2015-09-15  9:20     ` Johannes Weiner
  0 siblings, 0 replies; 41+ messages in thread
From: Johannes Weiner @ 2015-09-15  9:20 UTC (permalink / raw)
  To: Tejun Heo; +Cc: mhocko, akpm, cgroups, linux-mm, vdavydov, kernel-team

On Fri, Aug 28, 2015 at 06:02:37PM -0400, Tejun Heo wrote:
> On the default hierarchy, all memory consumption will be accounted
> together and controlled by the same set of limits.  Enable kmemcg on
> the default hierarchy by default.  Boot parameter "disable_kmemcg" can
> be specified to turn it off.
> 
> v2: - v1 triggered oops on nested cgroup creations.  Moved enabling
>       mechanism to memcg_propagate_kmem().
>     - Bypass busy test on kmem activation as it's unnecessary and gets
>       confused by controller being enabled on a cgroup which already
>       has processes.
>     - "disable_kmemcg" boot param added.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

The old distinction between kernel and user memory really doesn't make
sense and should not be maintained. The dentry and inode caches are a
significant share of overall memory consumed in common workloads, and
that is memory unambiguously coupled to userspace activity. I'd go as
far as removing CONFIG_MEMCG_KMEM altogether because it strikes me as
a completely unreasonable choice to give to the user (outside of maybe
CONFIG_EXPERT).

What CONFIG_MEMCG should really capture is all memory that can grow
significantly in size and can be associated directly with userspace
behavior. If there are types of memory that turn out to be very costly
to account and track, we can still go back and conceive an interface
that lets the user select the types of memory he doesn't need tracked.

But the KMEMCG differentiation is an arbitrary and mostly historical
distinction that we shouldn't continue to present to users.
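
As a side note on the "disable_kmemcg" parameter in the quoted changelog:
the patch body isn't included here, but a kill switch like that is
conventionally wired up with a __setup() handler, roughly as below (names
invented for the sketch; the check would live wherever kmem activation
happens, e.g. memcg_propagate_kmem()):

static bool disable_kmemcg __initdata;

static int __init disable_kmemcg_setup(char *str)
{
	disable_kmemcg = true;
	return 1;	/* parameter handled */
}
__setup("disable_kmemcg", disable_kmemcg_setup);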

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/2] memcg: always enable kmemcg on the default hierarchy
@ 2015-09-15  9:20     ` Johannes Weiner
  0 siblings, 0 replies; 41+ messages in thread
From: Johannes Weiner @ 2015-09-15  9:20 UTC (permalink / raw)
  To: Tejun Heo
  Cc: mhocko-DgEjT+Ai2ygdnm+yROfE0A,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg

On Fri, Aug 28, 2015 at 06:02:37PM -0400, Tejun Heo wrote:
> On the default hierarchy, all memory consumption will be accounted
> together and controlled by the same set of limits.  Enable kmemcg on
> the default hierarchy by default.  Boot parameter "disable_kmemcg" can
> be specified to turn it off.
> 
> v2: - v1 triggered oops on nested cgroup creations.  Moved enabling
>       mechanism to memcg_propagate_kmem().
>     - Bypass busy test on kmem activation as it's unnecessary and gets
>       confused by controller being enabled on a cgroup which already
>       has processes.
>     - "disable_kmemcg" boot param added.
> 
> Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

Acked-by: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>

The old distinction between kernel and user memory really doesn't make
sense and should not be maintained. The dentry and inode caches are a
significant share of overall memory consumed in common workloads, and
that is memory unambiguously coupled to userspace activity. I'd go as
far as removing CONFIG_MEMCG_KMEM altogether because it strikes me as
a completely unreasonable choice to give to the user (outside of maybe
CONFIG_EXPERT).

What CONFIG_MEMCG should really capture is all memory that can grow
significantly in size and can be associated directly with userspace
behavior. If there are types of memory that turn out to be very costly
to account and track, we can still go back and conceive an interface
that lets the user select the types of memory he doesn't need tracked.

But the KMEMCG differentiation is an arbitrary and mostly historical
distinction that we shouldn't continue to present to users.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
  2015-12-11 16:25               ` Tejun Heo
  (?)
@ 2015-12-15 19:22               ` Peter Zijlstra
  -1 siblings, 0 replies; 41+ messages in thread
From: Peter Zijlstra @ 2015-12-15 19:22 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrey Ryabinin, Ingo Molnar, Sasha Levin, Andrew Morton,
	Johannes Weiner, mhocko, cgroups, linux-mm, vdavydov,
	kernel-team, Dmitry Vyukov

On Fri, Dec 11, 2015 at 11:25:54AM -0500, Tejun Heo wrote:
> Hello, Peter, Ingo.
> 
> On Wed, Nov 25, 2015 at 06:44:49PM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 25, 2015 at 06:31:41PM +0300, Andrey Ryabinin wrote:
> > > > +       /* scheduler bits, serialized by scheduler locks */
> > > >         unsigned sched_reset_on_fork:1;
> > > >         unsigned sched_contributes_to_load:1;
> > > >         unsigned sched_migrated:1;
> > > > +       unsigned __padding_sched:29;
> > > 
> > > AFAIK the order of bit fields is implementation defined, so GCC could
> > > sort all these bits as it wants.
> > 
> > We're relying on it doing DTRT in other places, so I'm fairly confident
> > this'll work, otoh
> > 
> > > You could use an unnamed zero-width bit-field to force padding:
> > > 
> > >          unsigned :0; // force alignment to the next boundary.
> > 
> > That's a nice trick I was not aware of, thanks!
> 
> Has this been fixed yet?  While I'm not completely sure and I don't
> think there's a way to be certain after the fact, we have a single
> report of a machine which is showing ~4G as loadavg and one plausible
> explanation could be that one of the ->nr_uninterruptible counters
> underflowed from sched_contributes_to_load getting corrupted, so it'd
> be great to get this one fixed soon.

Nope, lemme write a Changelog and queue it. Thanks for the reminder.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-12-11 16:25               ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-12-11 16:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrey Ryabinin, Ingo Molnar, Sasha Levin, Andrew Morton,
	Johannes Weiner, mhocko, cgroups, linux-mm, vdavydov,
	kernel-team, Dmitry Vyukov

Hello, Peter, Ingo.

On Wed, Nov 25, 2015 at 06:44:49PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 25, 2015 at 06:31:41PM +0300, Andrey Ryabinin wrote:
> > > +       /* scheduler bits, serialized by scheduler locks */
> > >         unsigned sched_reset_on_fork:1;
> > >         unsigned sched_contributes_to_load:1;
> > >         unsigned sched_migrated:1;
> > > +       unsigned __padding_sched:29;
> > 
> > AFAIK the order of bit fields is implementation defined, so GCC could
> > sort all these bits as it wants.
> 
> We're relying on it doing DTRT in other places, so I'm fairly confident
> this'll work, otoh
> 
> > You could use an unnamed zero-width bit-field to force padding:
> > 
> >          unsigned :0; // force alignment to the next boundary.
> 
> That's a nice trick I was not aware of, thanks!

Has this been fixed yet?  While I'm not completely sure and I don't
think there's a way to be certain after the fact, we have a single
report of a machine which is showing ~4G as loadavg and one plausible
explanation could be that one of the ->nr_uninterruptible counters
underflowed from sched_contributes_to_load getting corrupted, so it'd
be great to get this one fixed soon.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-12-11 16:25               ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-12-11 16:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrey Ryabinin, Ingo Molnar, Sasha Levin, Andrew Morton,
	Johannes Weiner, mhocko-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg,
	Dmitry Vyukov

Hello, Peter, Ingo.

On Wed, Nov 25, 2015 at 06:44:49PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 25, 2015 at 06:31:41PM +0300, Andrey Ryabinin wrote:
> > > +       /* scheduler bits, serialized by scheduler locks */
> > >         unsigned sched_reset_on_fork:1;
> > >         unsigned sched_contributes_to_load:1;
> > >         unsigned sched_migrated:1;
> > > +       unsigned __padding_sched:29;
> > 
> > AFAIK the order of bit fields is implementation defined, so GCC could
> > sort all these bits as it wants.
> 
> We're relying on it doing DTRT in other places, so I'm fairly confident
> this'll work, otoh
> 
> > You could use an unnamed zero-width bit-field to force padding:
> > 
> >          unsigned :0; // force alignment to the next boundary.
> 
> That's a nice trick I was not aware of, thanks!

Has this been fixed yet?  While I'm not completely sure and I don't
think there's a way to be certain after the fact, we have a single
report of a machine which is showing ~4G as loadavg and one plausible
explanation could be that one of the ->nr_uninterruptible counters
underflowed from sched_contributes_to_load getting corrupted, so it'd
be great to get this one fixed soon.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
  2015-11-25 15:31         ` Andrey Ryabinin
  2015-11-25 17:34             ` Dmitry Vyukov
@ 2015-11-25 17:44           ` Peter Zijlstra
  2015-12-11 16:25               ` Tejun Heo
  1 sibling, 1 reply; 41+ messages in thread
From: Peter Zijlstra @ 2015-11-25 17:44 UTC (permalink / raw)
  To: Andrey Ryabinin
  Cc: Tejun Heo, Ingo Molnar, Sasha Levin, Andrew Morton,
	Johannes Weiner, mhocko, cgroups, linux-mm, vdavydov,
	kernel-team, Dmitry Vyukov

On Wed, Nov 25, 2015 at 06:31:41PM +0300, Andrey Ryabinin wrote:
> > +       /* scheduler bits, serialized by scheduler locks */
> >         unsigned sched_reset_on_fork:1;
> >         unsigned sched_contributes_to_load:1;
> >         unsigned sched_migrated:1;
> > +       unsigned __padding_sched:29;
> 
> AFAIK the order of bit fields is implementation defined, so GCC could
> sort all these bits as it wants.

We're relying on it doing DTRT in other places, so I'm fairly confident
this'll work, otoh

> You could use an unnamed zero-width bit-field to force padding:
> 
>          unsigned :0; // force alignment to the next boundary.

That's a nice trick I was not aware of, thanks!
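
For anyone who hasn't seen the trick, a tiny userspace demonstration
(plain C, not kernel code) of what the unnamed ':0' member does: every
bit-field after it starts in a fresh allocation unit, so the two groups
can no longer share a word:

#include <stdio.h>

struct packed {
	unsigned a:1;
	unsigned b:1;	/* shares a storage unit with 'a' */
};

struct split {
	unsigned a:1;
	unsigned :0;	/* force the next bit-field into a new unit */
	unsigned b:1;
};

int main(void)
{
	/* with 4-byte ints this typically prints "4 8" */
	printf("%zu %zu\n", sizeof(struct packed), sizeof(struct split));
	return 0;
}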

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-11-25 17:34             ` Dmitry Vyukov
  0 siblings, 0 replies; 41+ messages in thread
From: Dmitry Vyukov @ 2015-11-25 17:34 UTC (permalink / raw)
  To: Andrey Ryabinin
  Cc: Peter Zijlstra, Tejun Heo, Ingo Molnar, Sasha Levin,
	Andrew Morton, Johannes Weiner, Michal Hocko, cgroups, linux-mm,
	vdavydov, kernel-team

On Wed, Nov 25, 2015 at 4:31 PM, Andrey Ryabinin <ryabinin.a.a@gmail.com> wrote:
> 2015-11-25 18:02 GMT+03:00 Peter Zijlstra <peterz@infradead.org>:
>> On Wed, Nov 25, 2015 at 03:43:54PM +0100, Peter Zijlstra wrote:
>>> On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
>>> > So, the only way the patch could have caused the above is if someone
>>> > who isn't the task itself is writing to the bitfields while the task
>>> > is running.  Looking through the fields, ->sched_reset_on_fork seems a
>>> > bit suspicious.  __sched_setscheduler() looks like it can modify the
>>> > bit while the target task is running.  Peter, am I misreading the
>>> > code?
>>>
>>> Nope, that's quite possible. Looks like we need to break up those
>>> bitfields a bit. All the scheduler ones should be serialized by
>>> scheduler locks, but the others are fair game.
>>
>> Maybe something like so; but my brain is a complete mess today.
>>
>> ---
>>  include/linux/sched.h | 11 ++++++-----
>>  1 file changed, 6 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index f425aac63317..b474e0f05327 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1455,14 +1455,15 @@ struct task_struct {
>>         /* Used for emulating ABI behavior of previous Linux versions */
>>         unsigned int personality;
>>
>> -       unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
>> -                                * execve */
>> -       unsigned in_iowait:1;
>> -
>> -       /* Revert to default priority/policy when forking */
>> +       /* scheduler bits, serialized by scheduler locks */
>>         unsigned sched_reset_on_fork:1;
>>         unsigned sched_contributes_to_load:1;
>>         unsigned sched_migrated:1;
>> +       unsigned __padding_sched:29;
>
> AFAIK the order of bit fields is implementation defined, so GCC could
> sort all these bits as it wants.
> You could use an unnamed zero-width bit-field to force padding:
>
>          unsigned :0; // force alignment to the next boundary.
>
>> +
>> +       /* unserialized, strictly 'current' */
>> +       unsigned in_execve:1; /* bit to tell LSMs we're in execve */
>> +       unsigned in_iowait:1;
>>  #ifdef CONFIG_MEMCG
>>         unsigned memcg_may_oom:1;
>>  #endif
>>

I've gathered some evidence that in my case the guilty bit is
sched_reset_on_fork:
https://groups.google.com/d/msg/syzkaller/o8VqvYNEu_I/I0pXGx79DQAJ

This patch should help.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-11-25 17:34             ` Dmitry Vyukov
  0 siblings, 0 replies; 41+ messages in thread
From: Dmitry Vyukov @ 2015-11-25 17:34 UTC (permalink / raw)
  To: Andrey Ryabinin
  Cc: Peter Zijlstra, Tejun Heo, Ingo Molnar, Sasha Levin,
	Andrew Morton, Johannes Weiner, Michal Hocko,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg

On Wed, Nov 25, 2015 at 4:31 PM, Andrey Ryabinin <ryabinin.a.a-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> 2015-11-25 18:02 GMT+03:00 Peter Zijlstra <peterz-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>:
>> On Wed, Nov 25, 2015 at 03:43:54PM +0100, Peter Zijlstra wrote:
>>> On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
>>> > So, the only way the patch could have caused the above is if someone
>>> > who isn't the task itself is writing to the bitfields while the task
>>> > is running.  Looking through the fields, ->sched_reset_on_fork seems a
>>> > bit suspicious.  __sched_setscheduler() looks like it can modify the
>>> > bit while the target task is running.  Peter, am I misreading the
>>> > code?
>>>
>>> Nope, that's quite possible. Looks like we need to break up those
>>> bitfields a bit. All the scheduler ones should be serialized by
>>> scheduler locks, but the others are fair game.
>>
>> Maybe something like so; but my brain is a complete mess today.
>>
>> ---
>>  include/linux/sched.h | 11 ++++++-----
>>  1 file changed, 6 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index f425aac63317..b474e0f05327 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1455,14 +1455,15 @@ struct task_struct {
>>         /* Used for emulating ABI behavior of previous Linux versions */
>>         unsigned int personality;
>>
>> -       unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
>> -                                * execve */
>> -       unsigned in_iowait:1;
>> -
>> -       /* Revert to default priority/policy when forking */
>> +       /* scheduler bits, serialized by scheduler locks */
>>         unsigned sched_reset_on_fork:1;
>>         unsigned sched_contributes_to_load:1;
>>         unsigned sched_migrated:1;
>> +       unsigned __padding_sched:29;
>
> AFAIK the order of bit fields is implementation defined, so GCC could
> sort all these bits as it wants.
> You could use an unnamed zero-width bit-field to force padding:
>
>          unsigned :0; // force alignment to the next boundary.
>
>> +
>> +       /* unserialized, strictly 'current' */
>> +       unsigned in_execve:1; /* bit to tell LSMs we're in execve */
>> +       unsigned in_iowait:1;
>>  #ifdef CONFIG_MEMCG
>>         unsigned memcg_may_oom:1;
>>  #endif
>>

I've gathered some evidence that in my case the guilty bit is
sched_reset_on_fork:
https://groups.google.com/d/msg/syzkaller/o8VqvYNEu_I/I0pXGx79DQAJ

This patch should help.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
  2015-11-25 15:02         ` Peter Zijlstra
  (?)
@ 2015-11-25 15:31         ` Andrey Ryabinin
  2015-11-25 17:34             ` Dmitry Vyukov
  2015-11-25 17:44           ` Peter Zijlstra
  -1 siblings, 2 replies; 41+ messages in thread
From: Andrey Ryabinin @ 2015-11-25 15:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, Ingo Molnar, Sasha Levin, Andrew Morton,
	Johannes Weiner, mhocko, cgroups, linux-mm, vdavydov,
	kernel-team, Dmitry Vyukov

2015-11-25 18:02 GMT+03:00 Peter Zijlstra <peterz@infradead.org>:
> On Wed, Nov 25, 2015 at 03:43:54PM +0100, Peter Zijlstra wrote:
>> On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
>> > So, the only way the patch could have caused the above is if someone
>> > who isn't the task itself is writing to the bitfields while the task
>> > is running.  Looking through the fields, ->sched_reset_on_fork seems a
>> > bit suspicious.  __sched_setscheduler() looks like it can modify the
>> > bit while the target task is running.  Peter, am I misreading the
>> > code?
>>
>> Nope, that's quite possible. Looks like we need to break up those
>> bitfields a bit. All the scheduler ones should be serialized by
>> scheduler locks, but the others are fair game.
>
> Maybe something like so; but my brain is a complete mess today.
>
> ---
>  include/linux/sched.h | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f425aac63317..b474e0f05327 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1455,14 +1455,15 @@ struct task_struct {
>         /* Used for emulating ABI behavior of previous Linux versions */
>         unsigned int personality;
>
> -       unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
> -                                * execve */
> -       unsigned in_iowait:1;
> -
> -       /* Revert to default priority/policy when forking */
> +       /* scheduler bits, serialized by scheduler locks */
>         unsigned sched_reset_on_fork:1;
>         unsigned sched_contributes_to_load:1;
>         unsigned sched_migrated:1;
> +       unsigned __padding_sched:29;

AFAIK the order of bit fields is implementation defined, so GCC could
sort all these bits as it wants.
You could use an unnamed zero-width bit-field to force padding:

         unsigned :0; // force alignment to the next boundary.

> +
> +       /* unserialized, strictly 'current' */
> +       unsigned in_execve:1; /* bit to tell LSMs we're in execve */
> +       unsigned in_iowait:1;
>  #ifdef CONFIG_MEMCG
>         unsigned memcg_may_oom:1;
>  #endif
>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-11-25 15:02         ` Peter Zijlstra
  0 siblings, 0 replies; 41+ messages in thread
From: Peter Zijlstra @ 2015-11-25 15:02 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Sasha Levin, akpm, hannes, mhocko, cgroups,
	linux-mm, vdavydov, kernel-team

On Wed, Nov 25, 2015 at 03:43:54PM +0100, Peter Zijlstra wrote:
> On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
> > So, the only way the patch could have caused the above is if someone
> > who isn't the task itself is writing to the bitfields while the task
> > is running.  Looking through the fields, ->sched_reset_on_fork seems a
> > bit suspicious.  __sched_setscheduler() looks like it can modify the
> > bit while the target task is running.  Peter, am I misreading the
> > code?
> 
> Nope, that's quite possible. Looks like we need to break up those
> bitfields a bit. All the scheduler ones should be serialized by
> scheduler locks, but the others are fair game.

Maybe something like so; but my brain is a complete mess today.

---
 include/linux/sched.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f425aac63317..b474e0f05327 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1455,14 +1455,15 @@ struct task_struct {
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
 
-	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
-				 * execve */
-	unsigned in_iowait:1;
-
-	/* Revert to default priority/policy when forking */
+	/* scheduler bits, serialized by scheduler locks */
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 	unsigned sched_migrated:1;
+	unsigned __padding_sched:29;
+
+	/* unserialized, strictly 'current' */
+	unsigned in_execve:1; /* bit to tell LSMs we're in execve */
+	unsigned in_iowait:1;
 #ifdef CONFIG_MEMCG
 	unsigned memcg_may_oom:1;
 #endif

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-11-25 15:02         ` Peter Zijlstra
  0 siblings, 0 replies; 41+ messages in thread
From: Peter Zijlstra @ 2015-11-25 15:02 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Sasha Levin, akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	hannes-druUgvl0LCNAfugRpC6u6w, mhocko-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg

On Wed, Nov 25, 2015 at 03:43:54PM +0100, Peter Zijlstra wrote:
> On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
> > So, the only way the patch could have caused the above is if someone
> > who isn't the task itself is writing to the bitfields while the task
> > is running.  Looking through the fields, ->sched_reset_on_fork seems a
> > bit suspicious.  __sched_setscheduler() looks like it can modify the
> > bit while the target task is running.  Peter, am I misreading the
> > code?
> 
> Nope, that's quite possible. Looks like we need to break up those
> bitfields a bit. All the scheduler ones should be serialized by
> scheduler locks, but the others are fair game.

Maybe something like so; but my brain is a complete mess today.

---
 include/linux/sched.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f425aac63317..b474e0f05327 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1455,14 +1455,15 @@ struct task_struct {
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
 
-	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
-				 * execve */
-	unsigned in_iowait:1;
-
-	/* Revert to default priority/policy when forking */
+	/* scheduler bits, serialized by scheduler locks */
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 	unsigned sched_migrated:1;
+	unsigned __padding_sched:29;
+
+	/* unserialized, strictly 'current' */
+	unsigned in_execve:1; /* bit to tell LSMs we're in execve */
+	unsigned in_iowait:1;
 #ifdef CONFIG_MEMCG
 	unsigned memcg_may_oom:1;
 #endif

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-11-25 14:43       ` Peter Zijlstra
  0 siblings, 0 replies; 41+ messages in thread
From: Peter Zijlstra @ 2015-11-25 14:43 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Sasha Levin, akpm, hannes, mhocko, cgroups,
	linux-mm, vdavydov, kernel-team

On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
> (cc'ing scheduler folks)
> 
> On Sun, Sep 20, 2015 at 10:45:25AM -0400, Sasha Levin wrote:
> > On 09/13/2015 02:59 PM, Tejun Heo wrote:
> > > task_struct->memcg_oom is a sub-struct containing fields which are
> > > used for async memcg oom handling.  Most task_struct fields aren't
> > > packaged this way and it can lead to unnecessary alignment paddings.
> > > This patch flattens it.
> > > 
> > > * task.memcg_oom.memcg          -> task.memcg_in_oom
> > > * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> > > * task.memcg_oom.order          -> task.memcg_oom_order
> > > * task.memcg_oom.may_oom        -> task.memcg_may_oom
> ...
> > I've started seeing these warnings:
> > 
> > [1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
> ...
> > [1598892.247256] dump_stack (lib/dump_stack.c:52)
> > [1598892.249105] warn_slowpath_common (kernel/panic.c:448)
> > [1598892.253202] warn_slowpath_null (kernel/panic.c:482)
> > [1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
> > [1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
> > [1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
> > [1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
> > [1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)
> > 
> > Not sure if it's because of this patch or not, but I haven't seen them before.
> 
> So, the only way the patch could have caused the above is if someone
> who isn't the task itself is writing to the bitfields while the task
> is running.  Looking through the fields, ->sched_reset_on_fork seems a
> bit suspicious.  __sched_setscheduler() looks like it can modify the
> bit while the target task is running.  Peter, am I misreading the
> code?

Nope, that's quite possible. Looks like we need to break up those
bitfields a bit. All the scheduler ones should be serialized by
scheduler locks, but the others are fair game.
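
The failure mode being described is easy to reproduce in userspace: two
writers updating *different* bit-fields of the same word still race,
because each update is a non-atomic read-modify-write of the whole word.
A minimal demonstration (plain C with pthreads, obviously not the kernel
code; build with -pthread):

#include <pthread.h>
#include <stdio.h>

/* volatile so the compiler actually performs every read-modify-write */
static volatile struct {
	unsigned sched_bit:1;	/* stands in for sched_reset_on_fork */
	unsigned memcg_bit:1;	/* stands in for memcg_may_oom */
} flags;

static void *flip_sched(void *arg)
{
	for (int i = 0; i < 10000000; i++)
		flags.sched_bit = !flags.sched_bit;	/* RMW of the shared word */
	return NULL;
}

static void *flip_memcg(void *arg)
{
	for (int i = 0; i < 10000000; i++)
		flags.memcg_bit = !flags.memcg_bit;	/* races with flip_sched */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, flip_sched, NULL);
	pthread_create(&b, NULL, flip_memcg, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	/* each bit is flipped an even number of times, so both "should" end
	 * up 0; lost updates from the race frequently leave one of them set */
	printf("sched_bit=%u memcg_bit=%u\n",
	       (unsigned)flags.sched_bit, (unsigned)flags.memcg_bit);
	return 0;
}

Splitting the two groups into separate words (the padding and ':0'
approaches discussed elsewhere in this thread) removes the shared
read-modify-write target; so does serializing all writers with one lock.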

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-11-25 14:43       ` Peter Zijlstra
  0 siblings, 0 replies; 41+ messages in thread
From: Peter Zijlstra @ 2015-11-25 14:43 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Sasha Levin, akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	hannes-druUgvl0LCNAfugRpC6u6w, mhocko-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg

On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
> (cc'ing scheduler folks)
> 
> On Sun, Sep 20, 2015 at 10:45:25AM -0400, Sasha Levin wrote:
> > On 09/13/2015 02:59 PM, Tejun Heo wrote:
> > > task_struct->memcg_oom is a sub-struct containing fields which are
> > > used for async memcg oom handling.  Most task_struct fields aren't
> > > packaged this way and it can lead to unnecessary alignment paddings.
> > > This patch flattens it.
> > > 
> > > * task.memcg_oom.memcg          -> task.memcg_in_oom
> > > * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> > > * task.memcg_oom.order          -> task.memcg_oom_order
> > > * task.memcg_oom.may_oom        -> task.memcg_may_oom
> ...
> > I've started seeing these warnings:
> > 
> > [1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
> ...
> > [1598892.247256] dump_stack (lib/dump_stack.c:52)
> > [1598892.249105] warn_slowpath_common (kernel/panic.c:448)
> > [1598892.253202] warn_slowpath_null (kernel/panic.c:482)
> > [1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
> > [1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
> > [1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
> > [1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
> > [1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)
> > 
> > Not sure if it's because of this patch or not, but I haven't seen them before.
> 
> So, the only way the patch could have caused the above is if someone
> who isn't the task itself is writing to the bitfields while the task
> is running.  Looking through the fields, ->sched_reset_on_fork seems a
> bit suspicious.  __sched_setscheduler() looks like it can modify the
> bit while the target task is running.  Peter, am I misreading the
> code?

Nope, that's quite possible. Looks like we need to break up those
bitfields a bit. All the scheduler ones should be serialized by
scheduler locks, but the others are fair game.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-30 18:54       ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-30 18:54 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Sasha Levin
  Cc: akpm, hannes, mhocko, cgroups, linux-mm, vdavydov, kernel-team

On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
> (cc'ing scheduler folks)
> 
> On Sun, Sep 20, 2015 at 10:45:25AM -0400, Sasha Levin wrote:
> > On 09/13/2015 02:59 PM, Tejun Heo wrote:
> > > task_struct->memcg_oom is a sub-struct containing fields which are
> > > used for async memcg oom handling.  Most task_struct fields aren't
> > > packaged this way and it can lead to unnecessary alignment paddings.
> > > This patch flattens it.
> > > 
> > > * task.memcg_oom.memcg          -> task.memcg_in_oom
> > > * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> > > * task.memcg_oom.order          -> task.memcg_oom_order
> > > * task.memcg_oom.may_oom        -> task.memcg_may_oom
> ...
> > I've started seeing these warnings:
> > 
> > [1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
> ...
> > [1598892.247256] dump_stack (lib/dump_stack.c:52)
> > [1598892.249105] warn_slowpath_common (kernel/panic.c:448)
> > [1598892.253202] warn_slowpath_null (kernel/panic.c:482)
> > [1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
> > [1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
> > [1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
> > [1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
> > [1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)
> > 
> > Not sure if it's because of this patch or not, but I haven't seen them before.
> 
> So, the only way the patch could have caused the above is if someone
> who isn't the task itself is writing to the bitfields while the task
> is running.  Looking through the fields, ->sched_reset_on_fork seems a
> bit suspicious.  __sched_setscheduler() looks like it can modify the
> bit while the target task is running.  Peter, am I misreading the
> code?

Ping?

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-30 18:54       ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-30 18:54 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Sasha Levin
  Cc: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	hannes-druUgvl0LCNAfugRpC6u6w, mhocko-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg

On Mon, Sep 21, 2015 at 04:01:41PM -0400, Tejun Heo wrote:
> (cc'ing scheduler folks)
> 
> On Sun, Sep 20, 2015 at 10:45:25AM -0400, Sasha Levin wrote:
> > On 09/13/2015 02:59 PM, Tejun Heo wrote:
> > > task_struct->memcg_oom is a sub-struct containing fields which are
> > > used for async memcg oom handling.  Most task_struct fields aren't
> > > packaged this way and it can lead to unnecessary alignment paddings.
> > > This patch flattens it.
> > > 
> > > * task.memcg_oom.memcg          -> task.memcg_in_oom
> > > * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> > > * task.memcg_oom.order          -> task.memcg_oom_order
> > > * task.memcg_oom.may_oom        -> task.memcg_may_oom
> ...
> > I've started seeing these warnings:
> > 
> > [1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
> ...
> > [1598892.247256] dump_stack (lib/dump_stack.c:52)
> > [1598892.249105] warn_slowpath_common (kernel/panic.c:448)
> > [1598892.253202] warn_slowpath_null (kernel/panic.c:482)
> > [1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
> > [1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
> > [1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
> > [1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
> > [1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)
> > 
> > Not sure if it's because of this patch or not, but I haven't seen them before.
> 
> So, the only way the patch could have caused the above is if someone
> who isn't the task itself is writing to the bitfields while the task
> is running.  Looking through the fields, ->sched_reset_on_fork seems a
> bit suspicious.  __sched_setscheduler() looks like it can modify the
> bit while the target task is running.  Peter, am I misreading the
> code?

Ping?

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-21 20:01     ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-21 20:01 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Sasha Levin
  Cc: akpm, hannes, mhocko, cgroups, linux-mm, vdavydov, kernel-team

(cc'ing scheduler folks)

On Sun, Sep 20, 2015 at 10:45:25AM -0400, Sasha Levin wrote:
> On 09/13/2015 02:59 PM, Tejun Heo wrote:
> > task_struct->memcg_oom is a sub-struct containing fields which are
> > used for async memcg oom handling.  Most task_struct fields aren't
> > packaged this way and it can lead to unnecessary alignment paddings.
> > This patch flattens it.
> > 
> > * task.memcg_oom.memcg          -> task.memcg_in_oom
> > * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> > * task.memcg_oom.order          -> task.memcg_oom_order
> > * task.memcg_oom.may_oom        -> task.memcg_may_oom
...
> I've started seeing these warnings:
> 
> [1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
...
> [1598892.247256] dump_stack (lib/dump_stack.c:52)
> [1598892.249105] warn_slowpath_common (kernel/panic.c:448)
> [1598892.253202] warn_slowpath_null (kernel/panic.c:482)
> [1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
> [1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
> [1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
> [1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
> [1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)
> 
> Not sure if it's because of this patch or not, but I haven't seen them before.

So, the only way the patch could have caused the above is if someone
who isn't the task itself is writing to the bitfields while the task
is running.  Looking through the fields, ->sched_reset_on_fork seems a
bit suspicious.  __sched_setscheduler() looks like it can modify the
bit while the target task is running.  Peter, am I misreading the
code?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-21 20:01     ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-21 20:01 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Sasha Levin
  Cc: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	hannes-druUgvl0LCNAfugRpC6u6w, mhocko-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg

(cc'ing scheduler folks)

On Sun, Sep 20, 2015 at 10:45:25AM -0400, Sasha Levin wrote:
> On 09/13/2015 02:59 PM, Tejun Heo wrote:
> > task_struct->memcg_oom is a sub-struct containing fields which are
> > used for async memcg oom handling.  Most task_struct fields aren't
> > packaged this way and it can lead to unnecessary alignment paddings.
> > This patch flattens it.
> > 
> > * task.memcg_oom.memcg          -> task.memcg_in_oom
> > * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> > * task.memcg_oom.order          -> task.memcg_oom_order
> > * task.memcg_oom.may_oom        -> task.memcg_may_oom
...
> I've started seeing these warnings:
> 
> [1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
...
> [1598892.247256] dump_stack (lib/dump_stack.c:52)
> [1598892.249105] warn_slowpath_common (kernel/panic.c:448)
> [1598892.253202] warn_slowpath_null (kernel/panic.c:482)
> [1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
> [1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
> [1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
> [1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
> [1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)
> 
> Not sure if it's because of this patch or not, but I haven't seen them before.

So, the only way the patch could have caused the above is if someone
who isn't the task itself is writing to the bitfields while the task
is running.  Looking through the fields, ->sched_reset_on_fork seems a
bit suspicious.  __sched_setscheduler() looks like it can modify the
bit while the target task is running.  Peter, am I misreading the
code?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-20 14:45   ` Sasha Levin
  0 siblings, 0 replies; 41+ messages in thread
From: Sasha Levin @ 2015-09-20 14:45 UTC (permalink / raw)
  To: Tejun Heo, akpm, hannes, mhocko; +Cc: cgroups, linux-mm, vdavydov, kernel-team

On 09/13/2015 02:59 PM, Tejun Heo wrote:
> task_struct->memcg_oom is a sub-struct containing fields which are
> used for async memcg oom handling.  Most task_struct fields aren't
> packaged this way and it can lead to unnecessary alignment paddings.
> This patch flattens it.
> 
> * task.memcg_oom.memcg          -> task.memcg_in_oom
> * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> * task.memcg_oom.order          -> task.memcg_oom_order
> * task.memcg_oom.may_oom        -> task.memcg_may_oom
> 
> In addition, task.memcg_may_oom is relocated to where other bitfields
> are which reduces the size of task_struct.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
> ---
> Hello,
> 
> Andrew, these are the two patches which got acked from the following
> thread.
> 
>  http://lkml.kernel.org/g/20150828220158.GD11089@htj.dyndns.org
> 
> Acks are added and the second patch's description is updated as
> suggested by Michal and Vladimir.
> 
> Can you please put them in -mm?

Hi Tejun,

I've started seeing these warnings:

[1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
[1598889.786891] Modules linked in:
[1598890.883223] Unable to find swap-space signature
[1598891.463736]
[1598892.236001] CPU: 3 PID: 11648 Comm: trinity-c10 Not tainted 4.3.0-rc1-next-20150918-sasha-00081-g4b7392a-dirty #2565
[1598892.239377]  ffffffffb4746d40 ffff8802edad7c70 ffffffffabfe97ba 0000000000000000
[1598892.241723]  ffff8802edad7cb0 ffffffffaa367466 ffffffffaa764140 ffff88037f00bec0
[1598892.244135]  ffff8802ecb94000 ffff88042420a000 0000000000b98fe8 ffff88042420a000
[1598892.246393] Call Trace:
[1598892.247256] dump_stack (lib/dump_stack.c:52)
[1598892.249105] warn_slowpath_common (kernel/panic.c:448)
[1598892.253202] warn_slowpath_null (kernel/panic.c:482)
[1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
[1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
[1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
[1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
[1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)

Not sure if it's because of this patch or not, but I haven't seen them before.


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-20 14:45   ` Sasha Levin
  0 siblings, 0 replies; 41+ messages in thread
From: Sasha Levin @ 2015-09-20 14:45 UTC (permalink / raw)
  To: Tejun Heo, akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	hannes-druUgvl0LCNAfugRpC6u6w, mhocko-DgEjT+Ai2ygdnm+yROfE0A
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	vdavydov-bzQdu9zFT3WakBO8gow8eQ, kernel-team-b10kYP2dOMg

On 09/13/2015 02:59 PM, Tejun Heo wrote:
> task_struct->memcg_oom is a sub-struct containing fields which are
> used for async memcg oom handling.  Most task_struct fields aren't
> packaged this way and it can lead to unnecessary alignment paddings.
> This patch flattens it.
> 
> * task.memcg_oom.memcg          -> task.memcg_in_oom
> * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> * task.memcg_oom.order          -> task.memcg_oom_order
> * task.memcg_oom.may_oom        -> task.memcg_may_oom
> 
> In addition, task.memcg_may_oom is relocated to where other bitfields
> are which reduces the size of task_struct.
> 
> Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Acked-by: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
> Reviewed-by: Vladimir Davydov <vdavydov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
> ---
> Hello,
> 
> Andrew, these are the two patches which got acked from the following
> thread.
> 
>  http://lkml.kernel.org/g/20150828220158.GD11089-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org
> 
> Acks are added and the second patch's description is updated as
> suggested by Michal and Vladimir.
> 
> Can you please put them in -mm?

Hi Tejun,

I've started seeing these warnings:

[1598889.250160] WARNING: CPU: 3 PID: 11648 at include/linux/memcontrol.h:414 handle_mm_fault+0x1020/0x3fa0()
[1598889.786891] Modules linked in:
[1598890.883223] Unable to find swap-space signature
[1598891.463736]
[1598892.236001] CPU: 3 PID: 11648 Comm: trinity-c10 Not tainted 4.3.0-rc1-next-20150918-sasha-00081-g4b7392a-dirty #2565
[1598892.239377]  ffffffffb4746d40 ffff8802edad7c70 ffffffffabfe97ba 0000000000000000
[1598892.241723]  ffff8802edad7cb0 ffffffffaa367466 ffffffffaa764140 ffff88037f00bec0
[1598892.244135]  ffff8802ecb94000 ffff88042420a000 0000000000b98fe8 ffff88042420a000
[1598892.246393] Call Trace:
[1598892.247256] dump_stack (lib/dump_stack.c:52)
[1598892.249105] warn_slowpath_common (kernel/panic.c:448)
[1598892.253202] warn_slowpath_null (kernel/panic.c:482)
[1598892.255148] handle_mm_fault (include/linux/memcontrol.h:414 mm/memory.c:3430)
[1598892.268151] __do_page_fault (arch/x86/mm/fault.c:1239)
[1598892.269022] trace_do_page_fault (arch/x86/mm/fault.c:1331 include/linux/jump_label.h:133 include/linux/context_tracking_state.h:30 include/linux/context_tracking.h:46 arch/x86/mm/fault.c:1332)
[1598892.269894] do_async_page_fault (arch/x86/kernel/kvm.c:280)
[1598892.270792] async_page_fault (arch/x86/entry/entry_64.S:989)

Not sure if it's because of this patch or not, but I haven't seen them before.


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-15  7:37   ` Johannes Weiner
  0 siblings, 0 replies; 41+ messages in thread
From: Johannes Weiner @ 2015-09-15  7:37 UTC (permalink / raw)
  To: Tejun Heo; +Cc: akpm, mhocko, cgroups, linux-mm, vdavydov, kernel-team

On Sun, Sep 13, 2015 at 02:59:40PM -0400, Tejun Heo wrote:
> task_struct->memcg_oom is a sub-struct containing fields which are
> used for async memcg oom handling.  Most task_struct fields aren't
> packaged this way and it can lead to unnecessary alignment paddings.
> This patch flattens it.
> 
> * task.memcg_oom.memcg          -> task.memcg_in_oom
> * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> * task.memcg_oom.order          -> task.memcg_oom_order
> * task.memcg_oom.may_oom        -> task.memcg_may_oom
> 
> In addition, task.memcg_may_oom is relocated to where other bitfields
> are which reduces the size of task_struct.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>

Looks good to me, thanks.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-15  7:37   ` Johannes Weiner
  0 siblings, 0 replies; 41+ messages in thread
From: Johannes Weiner @ 2015-09-15  7:37 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	mhocko-DgEjT+Ai2ygdnm+yROfE0A, cgroups-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, vdavydov-bzQdu9zFT3WakBO8gow8eQ,
	kernel-team-b10kYP2dOMg

On Sun, Sep 13, 2015 at 02:59:40PM -0400, Tejun Heo wrote:
> task_struct->memcg_oom is a sub-struct containing fields which are
> used for async memcg oom handling.  Most task_struct fields aren't
> packaged this way and it can lead to unnecessary alignment paddings.
> This patch flattens it.
> 
> * task.memcg_oom.memcg          -> task.memcg_in_oom
> * task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
> * task.memcg_oom.order          -> task.memcg_oom_order
> * task.memcg_oom.may_oom        -> task.memcg_may_oom
> 
> In addition, task.memcg_may_oom is relocated to where other bitfields
> are which reduces the size of task_struct.
> 
> Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Acked-by: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
> Reviewed-by: Vladimir Davydov <vdavydov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

Looks good to me, thanks.

Acked-by: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH 1/2] memcg: flatten task_struct->memcg_oom
@ 2015-09-13 18:59 ` Tejun Heo
  0 siblings, 0 replies; 41+ messages in thread
From: Tejun Heo @ 2015-09-13 18:59 UTC (permalink / raw)
  To: akpm, hannes, mhocko; +Cc: cgroups, linux-mm, vdavydov, kernel-team

task_struct->memcg_oom is a sub-struct containing fields which are
used for async memcg oom handling.  Most task_struct fields aren't
packaged this way and it can lead to unnecessary alignment paddings.
This patch flattens it.

* task.memcg_oom.memcg          -> task.memcg_in_oom
* task.memcg_oom.gfp_mask	-> task.memcg_oom_gfp_mask
* task.memcg_oom.order          -> task.memcg_oom_order
* task.memcg_oom.may_oom        -> task.memcg_may_oom

In addition, task.memcg_may_oom is relocated to where other bitfields
are which reduces the size of task_struct.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
---
Hello,

Andrew, these are the two patches which got acked from the following
thread.

 http://lkml.kernel.org/g/20150828220158.GD11089@htj.dyndns.org

Acks are added and the second patch's description is updated as
suggested by Michal and Vladimir.

Can you please put them in -mm?

Thanks!

 include/linux/memcontrol.h |   10 +++++-----
 include/linux/sched.h      |   13 ++++++-------
 mm/memcontrol.c            |   16 ++++++++--------
 3 files changed, 19 insertions(+), 20 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -407,19 +407,19 @@ void mem_cgroup_print_oom_info(struct me
 
 static inline void mem_cgroup_oom_enable(void)
 {
-	WARN_ON(current->memcg_oom.may_oom);
-	current->memcg_oom.may_oom = 1;
+	WARN_ON(current->memcg_may_oom);
+	current->memcg_may_oom = 1;
 }
 
 static inline void mem_cgroup_oom_disable(void)
 {
-	WARN_ON(!current->memcg_oom.may_oom);
-	current->memcg_oom.may_oom = 0;
+	WARN_ON(!current->memcg_may_oom);
+	current->memcg_may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-	return p->memcg_oom.memcg;
+	return p->memcg_in_oom;
 }
 
 bool mem_cgroup_oom_synchronize(bool wait);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1451,7 +1451,9 @@ struct task_struct {
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 	unsigned sched_migrated:1;
-
+#ifdef CONFIG_MEMCG
+	unsigned memcg_may_oom:1;
+#endif
 #ifdef CONFIG_MEMCG_KMEM
 	unsigned memcg_kmem_skip_account:1;
 #endif
@@ -1782,12 +1784,9 @@ struct task_struct {
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 #ifdef CONFIG_MEMCG
-	struct memcg_oom_info {
-		struct mem_cgroup *memcg;
-		gfp_t gfp_mask;
-		int order;
-		unsigned int may_oom:1;
-	} memcg_oom;
+	struct mem_cgroup *memcg_in_oom;
+	gfp_t memcg_oom_gfp_mask;
+	int memcg_oom_order;
 #endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1652,7 +1652,7 @@ static void memcg_oom_recover(struct mem
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	if (!current->memcg_oom.may_oom)
+	if (!current->memcg_may_oom)
 		return;
 	/*
 	 * We are in the middle of the charge context here, so we
@@ -1669,9 +1669,9 @@ static void mem_cgroup_oom(struct mem_cg
 	 * and when we know whether the fault was overall successful.
 	 */
 	css_get(&memcg->css);
-	current->memcg_oom.memcg = memcg;
-	current->memcg_oom.gfp_mask = mask;
-	current->memcg_oom.order = order;
+	current->memcg_in_oom = memcg;
+	current->memcg_oom_gfp_mask = mask;
+	current->memcg_oom_order = order;
 }
 
 /**
@@ -1693,7 +1693,7 @@ static void mem_cgroup_oom(struct mem_cg
  */
 bool mem_cgroup_oom_synchronize(bool handle)
 {
-	struct mem_cgroup *memcg = current->memcg_oom.memcg;
+	struct mem_cgroup *memcg = current->memcg_in_oom;
 	struct oom_wait_info owait;
 	bool locked;
 
@@ -1721,8 +1721,8 @@ bool mem_cgroup_oom_synchronize(bool han
 	if (locked && !memcg->oom_kill_disable) {
 		mem_cgroup_unmark_under_oom(memcg);
 		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-					 current->memcg_oom.order);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+					 current->memcg_oom_order);
 	} else {
 		schedule();
 		mem_cgroup_unmark_under_oom(memcg);
@@ -1739,7 +1739,7 @@ bool mem_cgroup_oom_synchronize(bool han
 		memcg_oom_recover(memcg);
 	}
 cleanup:
-	current->memcg_oom.memcg = NULL;
+	current->memcg_in_oom = NULL;
 	css_put(&memcg->css);
 	return true;
 }
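
For context, this is roughly how the helpers touched above are consumed
by the page-fault path around this series (a simplified sketch of the
mm/memory.c caller, not the exact upstream code):

        /* arm memcg OOM handling only for userspace-triggered faults */
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_oom_enable();    /* sets current->memcg_may_oom */

        ret = __handle_mm_fault(mm, vma, address, flags);

        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_oom_disable();   /* clears current->memcg_may_oom */
                /*
                 * mem_cgroup_oom() may have recorded an OOM situation in
                 * current->memcg_in_oom; if the fault nevertheless completed
                 * without VM_FAULT_OOM, just clean up that state instead of
                 * invoking the OOM killer.
                 */
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }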


^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2015-12-15 19:22 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-28 22:01 [PATCH 1/2] memcg: flatten task_struct->memcg_oom Tejun Heo
2015-08-28 22:01 ` Tejun Heo
2015-08-28 22:02 ` [PATCH 2/2] memcg: always enable kmemcg on the default hierarchy Tejun Heo
2015-08-28 22:02   ` Tejun Heo
2015-08-31 22:54   ` Andrew Morton
2015-09-04 21:00   ` [PATCH v2 3/2] memcg: punt high overage reclaim to return-to-userland path Tejun Heo
2015-09-07  9:23     ` Michal Hocko
2015-09-07  9:23       ` Michal Hocko
2015-09-08 16:59       ` Tejun Heo
2015-09-08 16:59         ` Tejun Heo
2015-09-07 11:38     ` Vladimir Davydov
2015-09-07 11:38       ` Vladimir Davydov
2015-09-08 17:00       ` Tejun Heo
2015-09-08 17:00         ` Tejun Heo
2015-09-15  9:20   ` [PATCH 2/2] memcg: always enable kmemcg on the default hierarchy Johannes Weiner
2015-09-15  9:20     ` Johannes Weiner
2015-09-01 15:25 ` [PATCH 1/2] memcg: flatten task_struct->memcg_oom Michal Hocko
2015-09-01 15:25   ` Michal Hocko
2015-09-02 11:45 ` Vladimir Davydov
2015-09-02 11:45   ` Vladimir Davydov
2015-09-13 18:59 Tejun Heo
2015-09-13 18:59 ` Tejun Heo
2015-09-15  7:37 ` Johannes Weiner
2015-09-15  7:37   ` Johannes Weiner
2015-09-20 14:45 ` Sasha Levin
2015-09-20 14:45   ` Sasha Levin
2015-09-21 20:01   ` Tejun Heo
2015-09-21 20:01     ` Tejun Heo
2015-09-30 18:54     ` Tejun Heo
2015-09-30 18:54       ` Tejun Heo
2015-11-25 14:43     ` Peter Zijlstra
2015-11-25 14:43       ` Peter Zijlstra
2015-11-25 15:02       ` Peter Zijlstra
2015-11-25 15:02         ` Peter Zijlstra
2015-11-25 15:31         ` Andrey Ryabinin
2015-11-25 17:34           ` Dmitry Vyukov
2015-11-25 17:34             ` Dmitry Vyukov
2015-11-25 17:44           ` Peter Zijlstra
2015-12-11 16:25             ` Tejun Heo
2015-12-11 16:25               ` Tejun Heo
2015-12-15 19:22               ` Peter Zijlstra
