All of lore.kernel.org
 help / color / mirror / Atom feed
From: Michal Hocko <mhocko@kernel.org>
To: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>,
	Petr Mladek <pmladek@suse.com>,
	cgroups@vger.kernel.org, Cyril Hrubis <chrubis@suse.cz>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 2/2] memcg: relocate charge moving from ->attach to ->post_attach
Date: Mon, 25 Apr 2016 10:25:42 +0200	[thread overview]
Message-ID: <20160425082542.GB23933@dhcp22.suse.cz> (raw)
In-Reply-To: <20160421230902.GJ4775@htj.duckdns.org>

On Thu 21-04-16 19:09:02, Tejun Heo wrote:
> Hello,
> 
> So, this ended up a lot simpler than I originally expected.  I tested
> it lightly and it seems to work fine.  Petr, can you please test these
> two patches w/o the lru drain drop patch and see whether the problem
> is gone?
> 
> Thanks.
> ------ 8< ------
> If charge moving is used, memcg performs relabeling of the affected
> pages from its ->attach callback which is called under both
> cgroup_threadgroup_rwsem and thus can't create new kthreads.  This is
> fragile as various operations may depend on workqueues making forward
> progress which relies on the ability to create new kthreads.
> 
> There's no reason to perform charge moving from ->attach which is deep
> in the task migration path.  Move it to ->post_attach which is called
> after the actual migration is finished and cgroup_threadgroup_rwsem is
> dropped.
> 
> * move_charge_struct->mm is added and ->can_attach is now responsible
>   for pinning and recording the target mm.  mem_cgroup_clear_mc() is
>   updated accordingly.  This also simplifies mem_cgroup_move_task().
> 
> * mem_cgroup_move_task() is now called from ->post_attach instead of
>   ->attach.

This looks much better than the previous quick&dirty workaround. Thanks
for taking an extra step!

Sorry for being so pushy, but shouldn't we at least document that those
callbacks which depend on cgroup_threadgroup_rwsem must never block
on a WQ, directly or indirectly? Maybe even enforce that they are
non-blocking. This would be out of scope of this particular patch of
course, but it would fit nicely into the series.
 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Debugged-by: Petr Mladek <pmladek@suse.com>
> Reported-by: Cyril Hrubis <chrubis@suse.cz>
> Reported-by: Johannes Weiner <hannes@cmpxchg.org>
> Fixes: 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem")
> Cc: <stable@vger.kernel.org> # 4.4+

Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> ---
>  mm/memcontrol.c |   37 +++++++++++++++++++------------------
>  1 file changed, 19 insertions(+), 18 deletions(-)
> 
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct
>  /* "mc" and its members are protected by cgroup_mutex */
>  static struct move_charge_struct {
>  	spinlock_t	  lock; /* for from, to */
> +	struct mm_struct  *mm;
>  	struct mem_cgroup *from;
>  	struct mem_cgroup *to;
>  	unsigned long flags;
> @@ -4667,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void)
>  
>  static void mem_cgroup_clear_mc(void)
>  {
> +	struct mm_struct *mm = mc.mm;
> +
>  	/*
>  	 * we must clear moving_task before waking up waiters at the end of
>  	 * task migration.
> @@ -4676,7 +4679,10 @@ static void mem_cgroup_clear_mc(void)
>  	spin_lock(&mc.lock);
>  	mc.from = NULL;
>  	mc.to = NULL;
> +	mc.mm = NULL;
>  	spin_unlock(&mc.lock);
> +
> +	mmput(mm);
>  }
>  
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4733,6 +4739,7 @@ static int mem_cgroup_can_attach(struct
>  		VM_BUG_ON(mc.moved_swap);
>  
>  		spin_lock(&mc.lock);
> +		mc.mm = mm;
>  		mc.from = from;
>  		mc.to = memcg;
>  		mc.flags = move_flags;
> @@ -4742,8 +4749,9 @@ static int mem_cgroup_can_attach(struct
>  		ret = mem_cgroup_precharge_mc(mm);
>  		if (ret)
>  			mem_cgroup_clear_mc();
> +	} else {
> +		mmput(mm);
>  	}
> -	mmput(mm);
>  	return ret;
>  }
>  
> @@ -4852,11 +4860,11 @@ put:			/* get_mctgt_type() gets the page
>  	return ret;
>  }
>  
> -static void mem_cgroup_move_charge(struct mm_struct *mm)
> +static void mem_cgroup_move_charge(void)
>  {
>  	struct mm_walk mem_cgroup_move_charge_walk = {
>  		.pmd_entry = mem_cgroup_move_charge_pte_range,
> -		.mm = mm,
> +		.mm = mc.mm,
>  	};
>  
>  	lru_add_drain_all();
> @@ -4868,7 +4876,7 @@ static void mem_cgroup_move_charge(struc
>  	atomic_inc(&mc.from->moving_account);
>  	synchronize_rcu();
>  retry:
> -	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
> +	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
>  		/*
>  		 * Someone who are holding the mmap_sem might be waiting in
>  		 * waitq. So we cancel all extra charges, wake up all waiters,
> @@ -4885,23 +4893,16 @@ retry:
>  	 * additional charge, the page walk just aborts.
>  	 */
>  	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
> -	up_read(&mm->mmap_sem);
> +	up_read(&mc.mm->mmap_sem);
>  	atomic_dec(&mc.from->moving_account);
>  }
>  
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
> -	struct cgroup_subsys_state *css;
> -	struct task_struct *p = cgroup_taskset_first(tset, &css);
> -	struct mm_struct *mm = get_task_mm(p);
> -
> -	if (mm) {
> -		if (mc.to)
> -			mem_cgroup_move_charge(mm);
> -		mmput(mm);
> -	}
> -	if (mc.to)
> +	if (mc.to) {
> +		mem_cgroup_move_charge();
>  		mem_cgroup_clear_mc();
> +	}
>  }
>  #else	/* !CONFIG_MMU */
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4911,7 +4912,7 @@ static int mem_cgroup_can_attach(struct
>  static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
>  {
>  }
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
>  }
>  #endif
> @@ -5195,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys
>  	.css_reset = mem_cgroup_css_reset,
>  	.can_attach = mem_cgroup_can_attach,
>  	.cancel_attach = mem_cgroup_cancel_attach,
> -	.attach = mem_cgroup_move_task,
> +	.post_attach = mem_cgroup_move_task,
>  	.bind = mem_cgroup_bind,
>  	.dfl_cftypes = memory_files,
>  	.legacy_cftypes = mem_cgroup_legacy_files,

-- 
Michal Hocko
SUSE Labs

WARNING: multiple messages have this Message-ID (diff)
From: Michal Hocko <mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
To: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>,
	Petr Mladek <pmladek-IBi9RG/b67k@public.gmane.org>,
	cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Cyril Hrubis <chrubis-AlSwsSmVLrQ@public.gmane.org>,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: Re: [PATCH 2/2] memcg: relocate charge moving from ->attach to ->post_attach
Date: Mon, 25 Apr 2016 10:25:42 +0200	[thread overview]
Message-ID: <20160425082542.GB23933@dhcp22.suse.cz> (raw)
In-Reply-To: <20160421230902.GJ4775-piEFEHQLUPpN0TnZuCh8vA@public.gmane.org>

On Thu 21-04-16 19:09:02, Tejun Heo wrote:
> Hello,
> 
> So, this ended up a lot simpler than I originally expected.  I tested
> it lightly and it seems to work fine.  Petr, can you please test these
> two patches w/o the lru drain drop patch and see whether the problem
> is gone?
> 
> Thanks.
> ------ 8< ------
> If charge moving is used, memcg performs relabeling of the affected
> pages from its ->attach callback which is called under both
> cgroup_threadgroup_rwsem and thus can't create new kthreads.  This is
> fragile as various operations may depend on workqueues making forward
> progress which relies on the ability to create new kthreads.
> 
> There's no reason to perform charge moving from ->attach which is deep
> in the task migration path.  Move it to ->post_attach which is called
> after the actual migration is finished and cgroup_threadgroup_rwsem is
> dropped.
> 
> * move_charge_struct->mm is added and ->can_attach is now responsible
>   for pinning and recording the target mm.  mem_cgroup_clear_mc() is
>   updated accordingly.  This also simplifies mem_cgroup_move_task().
> 
> * mem_cgroup_move_task() is now called from ->post_attach instead of
>   ->attach.

This looks much better than the previous quick&dirty workaround. Thanks
for taking an extra step!

Sorry for being so pushy, but shouldn't we at least document that those
callbacks which depend on cgroup_threadgroup_rwsem must never block
on a WQ, directly or indirectly? Maybe even enforce that they are
non-blocking. This would be out of scope of this particular patch of
course, but it would fit nicely into the series.
 
> Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> Cc: Michal Hocko <mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Debugged-by: Petr Mladek <pmladek-IBi9RG/b67k@public.gmane.org>
> Reported-by: Cyril Hrubis <chrubis-AlSwsSmVLrQ@public.gmane.org>
> Reported-by: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> Fixes: 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem")
> Cc: <stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org> # 4.4+

Acked-by: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>

Thanks!

> ---
>  mm/memcontrol.c |   37 +++++++++++++++++++------------------
>  1 file changed, 19 insertions(+), 18 deletions(-)
> 
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct
>  /* "mc" and its members are protected by cgroup_mutex */
>  static struct move_charge_struct {
>  	spinlock_t	  lock; /* for from, to */
> +	struct mm_struct  *mm;
>  	struct mem_cgroup *from;
>  	struct mem_cgroup *to;
>  	unsigned long flags;
> @@ -4667,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void)
>  
>  static void mem_cgroup_clear_mc(void)
>  {
> +	struct mm_struct *mm = mc.mm;
> +
>  	/*
>  	 * we must clear moving_task before waking up waiters at the end of
>  	 * task migration.
> @@ -4676,7 +4679,10 @@ static void mem_cgroup_clear_mc(void)
>  	spin_lock(&mc.lock);
>  	mc.from = NULL;
>  	mc.to = NULL;
> +	mc.mm = NULL;
>  	spin_unlock(&mc.lock);
> +
> +	mmput(mm);
>  }
>  
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4733,6 +4739,7 @@ static int mem_cgroup_can_attach(struct
>  		VM_BUG_ON(mc.moved_swap);
>  
>  		spin_lock(&mc.lock);
> +		mc.mm = mm;
>  		mc.from = from;
>  		mc.to = memcg;
>  		mc.flags = move_flags;
> @@ -4742,8 +4749,9 @@ static int mem_cgroup_can_attach(struct
>  		ret = mem_cgroup_precharge_mc(mm);
>  		if (ret)
>  			mem_cgroup_clear_mc();
> +	} else {
> +		mmput(mm);
>  	}
> -	mmput(mm);
>  	return ret;
>  }
>  
> @@ -4852,11 +4860,11 @@ put:			/* get_mctgt_type() gets the page
>  	return ret;
>  }
>  
> -static void mem_cgroup_move_charge(struct mm_struct *mm)
> +static void mem_cgroup_move_charge(void)
>  {
>  	struct mm_walk mem_cgroup_move_charge_walk = {
>  		.pmd_entry = mem_cgroup_move_charge_pte_range,
> -		.mm = mm,
> +		.mm = mc.mm,
>  	};
>  
>  	lru_add_drain_all();
> @@ -4868,7 +4876,7 @@ static void mem_cgroup_move_charge(struc
>  	atomic_inc(&mc.from->moving_account);
>  	synchronize_rcu();
>  retry:
> -	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
> +	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
>  		/*
>  		 * Someone who are holding the mmap_sem might be waiting in
>  		 * waitq. So we cancel all extra charges, wake up all waiters,
> @@ -4885,23 +4893,16 @@ retry:
>  	 * additional charge, the page walk just aborts.
>  	 */
>  	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
> -	up_read(&mm->mmap_sem);
> +	up_read(&mc.mm->mmap_sem);
>  	atomic_dec(&mc.from->moving_account);
>  }
>  
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
> -	struct cgroup_subsys_state *css;
> -	struct task_struct *p = cgroup_taskset_first(tset, &css);
> -	struct mm_struct *mm = get_task_mm(p);
> -
> -	if (mm) {
> -		if (mc.to)
> -			mem_cgroup_move_charge(mm);
> -		mmput(mm);
> -	}
> -	if (mc.to)
> +	if (mc.to) {
> +		mem_cgroup_move_charge();
>  		mem_cgroup_clear_mc();
> +	}
>  }
>  #else	/* !CONFIG_MMU */
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4911,7 +4912,7 @@ static int mem_cgroup_can_attach(struct
>  static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
>  {
>  }
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
>  }
>  #endif
> @@ -5195,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys
>  	.css_reset = mem_cgroup_css_reset,
>  	.can_attach = mem_cgroup_can_attach,
>  	.cancel_attach = mem_cgroup_cancel_attach,
> -	.attach = mem_cgroup_move_task,
> +	.post_attach = mem_cgroup_move_task,
>  	.bind = mem_cgroup_bind,
>  	.dfl_cftypes = memory_files,
>  	.legacy_cftypes = mem_cgroup_legacy_files,

-- 
Michal Hocko
SUSE Labs

  parent reply	other threads:[~2016-04-25  8:26 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-04-13  9:42 [BUG] cgroup/workques/fork: deadlock when moving cgroups Petr Mladek
2016-04-13 18:33 ` Tejun Heo
2016-04-13 18:33   ` Tejun Heo
2016-04-13 18:57   ` Tejun Heo
2016-04-13 18:57     ` Tejun Heo
2016-04-13 19:23   ` Michal Hocko
2016-04-13 19:23     ` Michal Hocko
2016-04-13 19:28     ` Michal Hocko
2016-04-13 19:28       ` Michal Hocko
2016-04-13 19:37     ` Tejun Heo
2016-04-13 19:48       ` Michal Hocko
2016-04-14  7:06         ` Michal Hocko
2016-04-14  7:06           ` Michal Hocko
2016-04-14 15:32           ` Tejun Heo
2016-04-14 15:32             ` Tejun Heo
2016-04-14 17:50     ` Johannes Weiner
2016-04-15  7:06       ` Michal Hocko
2016-04-15 14:38         ` Tejun Heo
2016-04-15 14:38           ` Tejun Heo
2016-04-15 15:08           ` Michal Hocko
2016-04-15 15:08             ` Michal Hocko
2016-04-15 15:25             ` Tejun Heo
2016-04-15 15:25               ` Tejun Heo
2016-04-17 12:00               ` Michal Hocko
2016-04-17 12:00                 ` Michal Hocko
2016-04-18 14:40           ` Petr Mladek
2016-04-18 14:40             ` Petr Mladek
2016-04-19 14:01             ` Michal Hocko
2016-04-19 14:01               ` Michal Hocko
2016-04-19 15:39               ` Petr Mladek
2016-04-15 19:17       ` [PATCH for-4.6-fixes] memcg: remove lru_add_drain_all() invocation from mem_cgroup_move_charge() Tejun Heo
2016-04-17 12:07         ` Michal Hocko
2016-04-17 12:07           ` Michal Hocko
2016-04-20 21:29           ` Tejun Heo
2016-04-20 21:29             ` Tejun Heo
2016-04-21  3:27             ` Michal Hocko
2016-04-21  3:27               ` Michal Hocko
2016-04-21 15:00               ` Petr Mladek
2016-04-21 15:00                 ` Petr Mladek
2016-04-21 15:51                 ` Tejun Heo
2016-04-21 23:06           ` [PATCH 1/2] cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback Tejun Heo
2016-04-21 23:06             ` Tejun Heo
2016-04-21 23:09             ` [PATCH 2/2] memcg: relocate charge moving from ->attach to ->post_attach Tejun Heo
2016-04-21 23:09               ` Tejun Heo
2016-04-22 13:57               ` Petr Mladek
2016-04-22 13:57                 ` Petr Mladek
2016-04-25  8:25               ` Michal Hocko [this message]
2016-04-25  8:25                 ` Michal Hocko
2016-04-25 19:42                 ` Tejun Heo
2016-04-25 19:42                   ` Tejun Heo
2016-04-25 19:44               ` Tejun Heo
2016-04-25 19:44                 ` Tejun Heo
2016-04-21 23:11             ` [PATCH 1/2] cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback Tejun Heo
2016-04-21 23:11               ` Tejun Heo
2016-04-21 15:56         ` [PATCH for-4.6-fixes] memcg: remove lru_add_drain_all() invocation from mem_cgroup_move_charge() Tejun Heo
2016-04-21 15:56           ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160425082542.GB23933@dhcp22.suse.cz \
    --to=mhocko@kernel.org \
    --cc=cgroups@vger.kernel.org \
    --cc=chrubis@suse.cz \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pmladek@suse.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.