* [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
@ 2019-10-29 23:47 Shakeel Butt
  2019-10-29 23:51 ` Roman Gushchin
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Shakeel Butt @ 2019-10-29 23:47 UTC (permalink / raw)
  To: Greg Thelen, Roman Gushchin, Johannes Weiner, Michal Hocko,
	Andrew Morton
  Cc: linux-mm, cgroups, linux-kernel, Shakeel Butt,
	syzbot+13f93c99c06988391efe

Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
between reclaimers"), the memcg reclaim does not bail out earlier based
on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
pages of the memcg on all the nodes will be scanned relative to the
reclaim priority. So, there is no need to maintain state regarding which
node to start the memcg reclaim from. Also, KCSAN complains about data races in
the code maintaining the state.

This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
memory from nodes in round-robin order") and the commit 453a9bf347f1
("memcg: fix numa scan information update to be triggered by memory
event").

Signed-off-by: Shakeel Butt <shakeelb@google.com>
Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>
---
 include/linux/memcontrol.h |   8 ---
 mm/memcontrol.c            | 112 -------------------------------------
 mm/vmscan.c                |  11 +---
 3 files changed, 1 insertion(+), 130 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e82928deea88..239e752a7817 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -80,7 +80,6 @@ struct mem_cgroup_id {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
-	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 
@@ -312,13 +311,6 @@ struct mem_cgroup {
 	struct list_head kmem_caches;
 #endif
 
-	int last_scanned_node;
-#if MAX_NUMNODES > 1
-	nodemask_t	scan_nodes;
-	atomic_t	numainfo_events;
-	atomic_t	numainfo_updating;
-#endif
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head cgwb_list;
 	struct wb_domain cgwb_domain;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ea085877c548..aaa19bf5cf0f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -100,7 +100,6 @@ static bool do_memsw_account(void)
 
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
-#define NUMAINFO_EVENTS_TARGET	1024
 
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -869,9 +868,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 		case MEM_CGROUP_TARGET_SOFTLIMIT:
 			next = val + SOFTLIMIT_EVENTS_TARGET;
 			break;
-		case MEM_CGROUP_TARGET_NUMAINFO:
-			next = val + NUMAINFO_EVENTS_TARGET;
-			break;
 		default:
 			break;
 		}
@@ -891,21 +887,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
 		bool do_softlimit;
-		bool do_numainfo __maybe_unused;
 
 		do_softlimit = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_SOFTLIMIT);
-#if MAX_NUMNODES > 1
-		do_numainfo = mem_cgroup_event_ratelimit(memcg,
-						MEM_CGROUP_TARGET_NUMAINFO);
-#endif
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
 			mem_cgroup_update_tree(memcg, page);
-#if MAX_NUMNODES > 1
-		if (unlikely(do_numainfo))
-			atomic_inc(&memcg->numainfo_events);
-#endif
 	}
 }
 
@@ -1590,104 +1577,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	return ret;
 }
 
-#if MAX_NUMNODES > 1
-
-/**
- * test_mem_cgroup_node_reclaimable
- * @memcg: the target memcg
- * @nid: the node ID to be checked.
- * @noswap : specify true here if the user wants flle only information.
- *
- * This function returns whether the specified memcg contains any
- * reclaimable pages on a node. Returns true if there are any reclaimable
- * pages in the node.
- */
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
-		int nid, bool noswap)
-{
-	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
-
-	if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
-	    lruvec_page_state(lruvec, NR_ACTIVE_FILE))
-		return true;
-	if (noswap || !total_swap_pages)
-		return false;
-	if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
-	    lruvec_page_state(lruvec, NR_ACTIVE_ANON))
-		return true;
-	return false;
-
-}
-
-/*
- * Always updating the nodemask is not very good - even if we have an empty
- * list or the wrong list here, we can start from some node and traverse all
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
- *
- */
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
-{
-	int nid;
-	/*
-	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
-	 * pagein/pageout changes since the last update.
-	 */
-	if (!atomic_read(&memcg->numainfo_events))
-		return;
-	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
-		return;
-
-	/* make a nodemask where this memcg uses memory from */
-	memcg->scan_nodes = node_states[N_MEMORY];
-
-	for_each_node_mask(nid, node_states[N_MEMORY]) {
-
-		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
-			node_clear(nid, memcg->scan_nodes);
-	}
-
-	atomic_set(&memcg->numainfo_events, 0);
-	atomic_set(&memcg->numainfo_updating, 0);
-}
-
-/*
- * Selecting a node where we start reclaim from. Because what we need is just
- * reducing usage counter, start from anywhere is O,K. Considering
- * memory reclaim from current node, there are pros. and cons.
- *
- * Freeing memory from current node means freeing memory from a node which
- * we'll use or we've used. So, it may make LRU bad. And if several threads
- * hit limits, it will see a contention on a node. But freeing from remote
- * node means more costs for memory reclaim because of memory latency.
- *
- * Now, we use round-robin. Better algorithm is welcomed.
- */
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
-	int node;
-
-	mem_cgroup_may_update_nodemask(memcg);
-	node = memcg->last_scanned_node;
-
-	node = next_node_in(node, memcg->scan_nodes);
-	/*
-	 * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
-	 * last time it really checked all the LRUs due to rate limiting.
-	 * Fallback to the current node in that case for simplicity.
-	 */
-	if (unlikely(node == MAX_NUMNODES))
-		node = numa_node_id();
-
-	memcg->last_scanned_node = node;
-	return node;
-}
-#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
-	return 0;
-}
-#endif
-
 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 				   pg_data_t *pgdat,
 				   gfp_t gfp_mask,
@@ -5056,7 +4945,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		goto fail;
 
 	INIT_WORK(&memcg->high_work, high_work_func);
-	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1154b3a2b637..cb4dc52cfb88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3344,10 +3344,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   gfp_t gfp_mask,
 					   bool may_swap)
 {
-	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
 	unsigned long pflags;
-	int nid;
 	unsigned int noreclaim_flag;
 	struct scan_control sc = {
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -3360,16 +3358,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = may_swap,
 	};
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 
 	set_task_reclaim_state(current, &sc.reclaim_state);
-	/*
-	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
-	 * take care of from where we get pages. So the node where we start the
-	 * scan does not need to be the current node.
-	 */
-	nid = mem_cgroup_select_victim_node(memcg);
-
-	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
 
 	trace_mm_vmscan_memcg_reclaim_begin(
 				cgroup_ino(memcg->css.cgroup),
-- 
2.24.0.rc0.303.g954a862665-goog



* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-29 23:47 [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Shakeel Butt
@ 2019-10-29 23:51 ` Roman Gushchin
  2019-10-30  0:47 ` Andrew Morton
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Roman Gushchin @ 2019-10-29 23:51 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Greg Thelen, Johannes Weiner, Michal Hocko, Andrew Morton,
	linux-mm, cgroups, linux-kernel, syzbot+13f93c99c06988391efe

On Tue, Oct 29, 2019 at 04:47:53PM -0700, Shakeel Butt wrote:
> Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> between reclaimers"), the memcg reclaim does not bail out earlier based
> on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
> pages of the memcg on all the nodes will be scanned relative to the
> reclaim priority. So, there is no need to maintain state regarding which
> node to start the memcg reclaim from. Also, KCSAN complains about data races in
> the code maintaining the state.
> 
> This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
> memory from nodes in round-robin order") and the commit 453a9bf347f1
> ("memcg: fix numa scan information update to be triggered by memory
> event").
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>

Acked-by: Roman Gushchin <guro@fb.com>

Thanks!

> [... remainder of the quoted patch trimmed; it is identical to the diff in the original posting above ...]


* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-29 23:47 [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Shakeel Butt
  2019-10-29 23:51 ` Roman Gushchin
@ 2019-10-30  0:47 ` Andrew Morton
  2019-10-30  0:54 ` Andrew Morton
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Andrew Morton @ 2019-10-30  0:47 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Greg Thelen, Roman Gushchin, Johannes Weiner, Michal Hocko,
	linux-mm, cgroups, linux-kernel, syzbot+13f93c99c06988391efe

On Tue, 29 Oct 2019 16:47:53 -0700 Shakeel Butt <shakeelb@google.com> wrote:

> Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> between reclaimers"), the memcg reclaim does not bail out earlier based
> on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
> pages of the memcg on all the nodes will be scanned relative to the
> reclaim priority. So, there is no need to maintain state regarding which
> node to start the memcg reclaim from. Also, KCSAN complains about data races in
> the code maintaining the state.
> 
> This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
> memory from nodes in round-robin order") and the commit 453a9bf347f1
> ("memcg: fix numa scan information update to be triggered by memory
> event").
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>

The syzbot report
(http://lkml.kernel.org/r/00000000000055aba7058af4d378@google.com) was
for a null pointer deref.  So won't we be needing a Fixes: and cc:stable?


* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-29 23:47 [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Shakeel Butt
  2019-10-29 23:51 ` Roman Gushchin
  2019-10-30  0:47 ` Andrew Morton
@ 2019-10-30  0:54 ` Andrew Morton
  2019-10-30  1:15   ` Shakeel Butt
  2019-10-30 11:49 ` Michal Hocko
  2019-10-30 17:44 ` Johannes Weiner
  4 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2019-10-30  0:54 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Greg Thelen, Roman Gushchin, Johannes Weiner, Michal Hocko,
	linux-mm, cgroups, linux-kernel, syzbot+13f93c99c06988391efe

On Tue, 29 Oct 2019 16:47:53 -0700 Shakeel Butt <shakeelb@google.com> wrote:

> Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> between reclaimers"), the memcg reclaim does not bail out earlier based
> on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
> pages of the memcg on all the nodes will be scanned relative to the
> reclaim priority. So, there is no need to maintain state regarding which
> node to start the memcg reclaim from. Also, KCSAN complains about data races in
> the code maintaining the state.
> 
> This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
> memory from nodes in round-robin order") and the commit 453a9bf347f1
> ("memcg: fix numa scan information update to be triggered by memory
> event").
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>

I can't find the original syzbot email.  Help?

iirc the incidentally-fixed issue is a rather theoretical data race and
the patch isn't a high priority thing?



* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-30  0:54 ` Andrew Morton
@ 2019-10-30  1:15   ` Shakeel Butt
  0 siblings, 0 replies; 9+ messages in thread
From: Shakeel Butt @ 2019-10-30  1:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Greg Thelen, Roman Gushchin, Johannes Weiner, Michal Hocko,
	Linux MM, Cgroups, LKML, syzbot+13f93c99c06988391efe

On Tue, Oct 29, 2019 at 5:55 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Tue, 29 Oct 2019 16:47:53 -0700 Shakeel Butt <shakeelb@google.com> wrote:
>
> > Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> > between reclaimers"), the memcg reclaim does not bail out earlier based
> > on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
> > pages of the memcg on all the nodes will be scanned relative to the
> > reclaim priority. So, there is no need to maintain state regarding which
> > node to start the memcg reclaim from. Also, KCSAN complains about data races in
> > the code maintaining the state.
> >
> > This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
> > memory from nodes in round-robin order") and the commit 453a9bf347f1
> > ("memcg: fix numa scan information update to be triggered by memory
> > event").
> >
> > Signed-off-by: Shakeel Butt <shakeelb@google.com>
> > Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>
>
> I can't find the original syzbot email.  Help?
>
> iirc the incidentally-fixed issue is a rather theoretical data race and
> the patch isn't a high priority thing?
>

Let me check why the link is not working. However, you are right that
the fix is for a theoretical data race and it does not need to be
backported to stable trees.

Shakeel


* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-29 23:47 [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Shakeel Butt
                   ` (2 preceding siblings ...)
  2019-10-30  0:54 ` Andrew Morton
@ 2019-10-30 11:49 ` Michal Hocko
  2019-10-30 17:44 ` Johannes Weiner
  4 siblings, 0 replies; 9+ messages in thread
From: Michal Hocko @ 2019-10-30 11:49 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Greg Thelen, Roman Gushchin, Johannes Weiner, Andrew Morton,
	linux-mm, cgroups, linux-kernel, syzbot+13f93c99c06988391efe

On Tue 29-10-19 16:47:53, Shakeel Butt wrote:
> Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> between reclaimers"), the memcg reclaim does not bail out earlier based
> on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
> pages of the memcg on all the nodes will be scanned relative to the
> reclaim priority. So, there is no need to maintain state regarding which
> node to start the memcg reclaim from. Also, KCSAN complains about data races in
> the code maintaining the state.

If you want to mention KCSAN, then it would be really good to describe
the race and explain why it is more theoretical than real. Otherwise it
just causes confusion, IMHO.
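
For reference, the state in question is maintained with plain,
unsynchronized accesses in the function the patch removes. Excerpting the
relevant lines from the diff above, with descriptive comments added, this is
presumably the kind of access pattern KCSAN flags:

	node = memcg->last_scanned_node;	/* plain read, no locking */
	node = next_node_in(node, memcg->scan_nodes);
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();
	memcg->last_scanned_node = node;	/* plain write, racing with the read */

Two reclaimers racing here can at worst pick the same starting node, which
only skews the round-robin choice rather than corrupting any state, which is
why the race is theoretical rather than harmful.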

> This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
> memory from nodes in round-robin order") and the commit 453a9bf347f1
> ("memcg: fix numa scan information update to be triggered by memory
> event").
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>

I like this very much. The code was dubious, to say the least. It was
really hard to make any educated guess about how the reclaim is going to
behave NUMA-wise. The diffstat is also encouraging.

Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> [... remainder of the quoted patch trimmed; it is identical to the diff in the original posting above ...]

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-29 23:47 [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Shakeel Butt
                   ` (3 preceding siblings ...)
  2019-10-30 11:49 ` Michal Hocko
@ 2019-10-30 17:44 ` Johannes Weiner
  2019-10-30 17:53   ` Michal Hocko
  4 siblings, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2019-10-30 17:44 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Greg Thelen, Roman Gushchin, Michal Hocko, Andrew Morton,
	linux-mm, cgroups, linux-kernel, syzbot+13f93c99c06988391efe

On Tue, Oct 29, 2019 at 04:47:53PM -0700, Shakeel Butt wrote:
> Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> between reclaimers"), the memcg reclaim does not bail out earlier based
> on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
> pages of the memcg on all the nodes will be scanned relative to the
> reclaim priority. So, there is no need to maintain state regarding which
> node to start the memcg reclaim from. Also, KCSAN complains about data races in
> the code maintaining the state.
> 
> This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
> memory from nodes in round-robin order") and the commit 453a9bf347f1
> ("memcg: fix numa scan information update to be triggered by memory
> event").
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>

Excellent, thanks Shakeel!
Acked-by: Johannes Weiner <hannes@cmpxchg.org>

Just a request on this bit:

> @@ -3360,16 +3358,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>  		.may_unmap = 1,
>  		.may_swap = may_swap,
>  	};
> +	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
>  
>  	set_task_reclaim_state(current, &sc.reclaim_state);
> -	/*
> -	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
> -	 * take care of from where we get pages. So the node where we start the
> -	 * scan does not need to be the current node.
> -	 */
> -	nid = mem_cgroup_select_victim_node(memcg);
> -
> -	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];

This works, but it *is* somewhat fragile if we decide to add bail-out
conditions to reclaim again. And some NUMA nodes receiving slightly
less pressure than others could be quite tricky to debug.

Can we add a comment here that points out the assumption that the
zonelist walk is comprehensive, and that all nodes receive equal
reclaim pressure?
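
As a rough illustration only (the wording here is hypothetical, not taken
from the posted patch), such a comment could look something like:

	/*
	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
	 * equal reclaim pressure on every node the memcg may have memory on.
	 * This assumes the zonelist walk is comprehensive and that reclaim
	 * does not bail out early; otherwise some nodes would see
	 * systematically less pressure than others.
	 */
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);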

Also, I think we should use sc.gfp_mask & ~__GFP_THISNODE, so that
allocations with a physical node preference still do node-agnostic
reclaim for the purpose of cgroup accounting.
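
A minimal sketch of that suggestion, applied to the zonelist line the patch
adds (as discussed further down in the thread, this turns out to be
unnecessary):

	struct zonelist *zonelist = node_zonelist(numa_node_id(),
						  sc.gfp_mask & ~__GFP_THISNODE);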


* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-30 17:44 ` Johannes Weiner
@ 2019-10-30 17:53   ` Michal Hocko
  2019-10-30 18:06     ` Johannes Weiner
  0 siblings, 1 reply; 9+ messages in thread
From: Michal Hocko @ 2019-10-30 17:53 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Shakeel Butt, Greg Thelen, Roman Gushchin, Andrew Morton,
	linux-mm, cgroups, linux-kernel, syzbot+13f93c99c06988391efe

On Wed 30-10-19 13:44:55, Johannes Weiner wrote:
> On Tue, Oct 29, 2019 at 04:47:53PM -0700, Shakeel Butt wrote:
> > Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> > between reclaimers"), the memcg reclaim does not bail out earlier based
> > on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable
> > pages of the memcg on all the nodes will be scanned relative to the
> > reclaim priority. So, there is no need to maintain state regarding which
> > node to start the memcg reclaim from. Also, KCSAN complains about data races in
> > the code maintaining the state.
> > 
> > This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim
> > memory from nodes in round-robin order") and the commit 453a9bf347f1
> > ("memcg: fix numa scan information update to be triggered by memory
> > event").
> > 
> > Signed-off-by: Shakeel Butt <shakeelb@google.com>
> > Reported-by: <syzbot+13f93c99c06988391efe@syzkaller.appspotmail.com>
> 
> Excellent, thanks Shakeel!
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> Just a request on this bit:
> 
> > @@ -3360,16 +3358,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> >  		.may_unmap = 1,
> >  		.may_swap = may_swap,
> >  	};
> > +	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
> >  
> >  	set_task_reclaim_state(current, &sc.reclaim_state);
> > -	/*
> > -	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
> > -	 * take care of from where we get pages. So the node where we start the
> > -	 * scan does not need to be the current node.
> > -	 */
> > -	nid = mem_cgroup_select_victim_node(memcg);
> > -
> > -	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
> 
> This works, but it *is* somewhat fragile if we decide to add bail-out
> conditions to reclaim again. And some numa nodes receiving slightly
> less pressure than others could be quite tricky to debug.
> 
> Can we add a comment here that points out the assumption that the
> zonelist walk is comprehensive, and that all nodes receive equal
> reclaim pressure?

Makes sense
 
> Also, I think we should use sc.gfp_mask & ~__GFP_THISNODE, so that
> allocations with a physical node preference still do node-agnostic
> reclaim for the purpose of cgroup accounting.

Don't we exclude that by GFP_RECLAIM_MASK already?

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()
  2019-10-30 17:53   ` Michal Hocko
@ 2019-10-30 18:06     ` Johannes Weiner
  0 siblings, 0 replies; 9+ messages in thread
From: Johannes Weiner @ 2019-10-30 18:06 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Shakeel Butt, Greg Thelen, Roman Gushchin, Andrew Morton,
	linux-mm, cgroups, linux-kernel, syzbot+13f93c99c06988391efe

On Wed, Oct 30, 2019 at 06:53:02PM +0100, Michal Hocko wrote:
> On Wed 30-10-19 13:44:55, Johannes Weiner wrote:
> > Also, I think we should use sc.gfp_mask & ~__GFP_THISNODE, so that
> > allocations with a physical node preference still do node-agnostic
> > reclaim for the purpose of cgroup accounting.
> 
> Don't we exclude that by GFP_RECLAIM_MASK already?

My bad, you're right. Scratch that, then. Thanks.
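
For context, sc.gfp_mask in try_to_free_mem_cgroup_pages() is derived from
the caller's mask roughly as below (paraphrased from mm/vmscan.c of that era,
not part of the patch under discussion), and GFP_RECLAIM_MASK does not carry
placement modifiers such as __GFP_THISNODE, so they are already stripped
before the zonelist lookup:

	struct scan_control sc = {
		...
		/* placement hints like __GFP_THISNODE are masked off here */
		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
		...
	};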

