linux-kernel.vger.kernel.org archive mirror
* [PATCHv2] memcg: reclaim memory from node in round-robin
@ 2011-04-27  7:51 KAMEZAWA Hiroyuki
  2011-04-27 17:33 ` Ying Han
  2011-05-09  2:20 ` [PATCHv2] " KOSAKI Motohiro
  0 siblings, 2 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-27  7:51 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel, akpm, nishimura, Ying Han, balbir

I changed the logic a little and added a filter for skipping nodes.
On large NUMA systems, tasks may run under cpuset or mempolicy, so memory
usage can be unbalanced across nodes. That is why I think a filter is required.

==
Now, memory cgroup's direct reclaim frees memory from the current node.
But this has some troubles. Usually, when a set of threads works in a
cooperative way, they tend to run on the same node. So, if they hit the
limit under memcg, they will reclaim memory from themselves, which may be
their active working set.

For example, assume a 2-node system which has Node 0 and Node 1
and a memcg which has a 1G limit. After some work, file cache remains and
the usages are
   Node 0:  1M
   Node 1:  998M.

If an application then runs on Node 0, it will eat its own working set
before freeing the unnecessary file caches on Node 1.

This patch adds round-robin node selection for NUMA and applies equal
reclaim pressure to each node. When using cpuset's memory spread feature,
this will work very well.
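
In outline, the selection works as follows. This is a standalone sketch of
the idea only; a plain bitmask and a made-up MAX_NODES stand in for
nodemask_t and the real nodemask helpers used in the patch below.

#include <stdio.h>

#define MAX_NODES 4	/* illustrative; the kernel uses MAX_NUMNODES */

/*
 * scan_mask has a bit set for each node where the memcg still has
 * reclaimable pages.  Pick the next set bit after *last_node, wrapping
 * around, and fall back to a default node when the mask is empty.
 */
static int select_victim_node(unsigned int scan_mask, int *last_node,
			      int fallback_node)
{
	int node = *last_node;
	int i;

	for (i = 0; i < MAX_NODES; i++) {
		node = (node + 1) % MAX_NODES;
		if (scan_mask & (1u << node)) {
			*last_node = node;
			return node;
		}
	}
	/* no node has reclaimable pages: use the fallback (current) node */
	*last_node = fallback_node;
	return fallback_node;
}

int main(void)
{
	unsigned int scan_mask = 0x3;	/* only nodes 0 and 1 hold pages */
	int last = MAX_NODES - 1;	/* nothing scanned yet */
	int i;

	for (i = 0; i < 4; i++)
		printf("reclaim starts from node %d\n",
		       select_victim_node(scan_mask, &last, 0));
	return 0;
}

Successive calls walk the nodes that actually hold the memcg's pages
(0, 1, 0, 1, ...), which is how the pressure gets spread instead of
always landing on the allocating node.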


From: Ying Han <yinghan@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Changelog v1->v2:
  - fixed comments.
  - added logic to avoid scanning unused nodes.

---
 include/linux/memcontrol.h |    1 
 mm/memcontrol.c            |   98 ++++++++++++++++++++++++++++++++++++++++++---
 mm/vmscan.c                |    9 +++-
 3 files changed, 101 insertions(+), 7 deletions(-)

Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 				       struct zone *zone,
 				       enum lru_list lru);
Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -237,6 +237,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long   next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct 
 	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
@@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good. Even if the mask here is
+ * empty or stale, we can still start from some node and traverse all nodes
+ * based on the zonelist. So, update the mask loosely, once every 10 seconds.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+
+}
+
+/*
+ * Select a node from which to start reclaim. Because what we need is just
+ * to reduce the usage counter, starting from anywhere is OK. Reclaiming
+ * from the current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node which
+ * we'll use or have used. So, it may make the LRU bad. And if several threads
+ * hit the limit, they will contend on one node. But freeing from a remote
+ * node means more cost for memory reclaim because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES) {
+		node = first_node(mem->scan_nodes);
+		if (unlikely(node == MAX_NUMNODES))
+			node = numa_node_id();
+	}
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4678,6 +4763,7 @@ mem_cgroup_create(struct cgroup_subsys *
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
 		.mem_cgroup = mem_cont,
 		.nodemask = NULL, /* we don't care the placement */
 	};
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim does not
+	 * care which node the pages come from. So, the node where we start
+	 * the scan does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 					    sc.may_writepage,



* Re: [PATCHv2] memcg: reclaim memory from node in round-robin
  2011-04-27  7:51 [PATCHv2] memcg: reclaim memory from node in round-robin KAMEZAWA Hiroyuki
@ 2011-04-27 17:33 ` Ying Han
  2011-04-27 23:57   ` KAMEZAWA Hiroyuki
  2011-04-28  0:35   ` [PATCHv3] " KAMEZAWA Hiroyuki
  2011-05-09  2:20 ` [PATCHv2] " KOSAKI Motohiro
  1 sibling, 2 replies; 15+ messages in thread
From: Ying Han @ 2011-04-27 17:33 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, linux-kernel, akpm, nishimura, balbir

On Wed, Apr 27, 2011 at 12:51 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> I changed the logic a little and add a filter for skipping nodes.
> With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> can be unbalanced. So, I think a filter is required.

Thank you.

>
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
>
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
>   Node 0:  1M
>   Node 1:  998M.
>
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
>
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.
>
>
> From: Ying Han <yinghan@google.com>
> Signed-off-by: Ying Han <yinghan@google.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Changelog v1->v2:
>  - fixed comments.
>  - added a logic to avoid scanning unused node.
>
> ---
>  include/linux/memcontrol.h |    1
>  mm/memcontrol.c            |   98 ++++++++++++++++++++++++++++++++++++++++++---
>  mm/vmscan.c                |    9 +++-
>  3 files changed, 101 insertions(+), 7 deletions(-)
>
> Index: memcg/include/linux/memcontrol.h
> ===================================================================
> --- memcg.orig/include/linux/memcontrol.h
> +++ memcg/include/linux/memcontrol.h
> @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
>  */
>  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
>  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
>  unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
>                                       struct zone *zone,
>                                       enum lru_list lru);
> Index: memcg/mm/memcontrol.c
> ===================================================================
> --- memcg.orig/mm/memcontrol.c
> +++ memcg/mm/memcontrol.c
> @@ -237,6 +237,11 @@ struct mem_cgroup {
>         * reclaimed from.
>         */
>        int last_scanned_child;
> +       int last_scanned_node;
> +#if MAX_NUMNODES > 1
> +       nodemask_t      scan_nodes;
> +       unsigned long   next_scan_node_update;
> +#endif
>        /*
>         * Should the accounting and control be hierarchical, per subtree?
>         */
> @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
>        this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
>  }
>
> +static unsigned long
> +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
> +{
> +       struct mem_cgroup_per_zone *mz;
> +       u64 total;
> +       int zid;
> +
> +       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> +               mz = mem_cgroup_zoneinfo(mem, nid, zid);
> +               total += MEM_CGROUP_ZSTAT(mz, idx);
> +       }
> +       return total;
> +}
>  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
>                                        enum lru_list idx)
>  {
> -       int nid, zid;
> -       struct mem_cgroup_per_zone *mz;
> +       int nid;
>        u64 total = 0;
>
>        for_each_online_node(nid)
> -               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> -                       mz = mem_cgroup_zoneinfo(mem, nid, zid);
> -                       total += MEM_CGROUP_ZSTAT(mz, idx);
> -               }
> +               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
>        return total;
>  }
>
> @@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
>        return ret;
>  }
>
> +#if MAX_NUMNODES > 1
> +
> +/*
> + * Update nodemask always is not very good. Even if we have empty
> + * list, or wrong list here, we can start from some node and traverse all nodes
> + * based on zonelist. So, update the list loosely once in 10 secs.
> + *
> + */
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +{
> +       int nid;
> +
> +       if (time_after(mem->next_scan_node_update, jiffies))
> +               return;
> +
> +       mem->next_scan_node_update = jiffies + 10*HZ;
> +       /* make a nodemask where this memcg uses memory from */
> +       mem->scan_nodes = node_states[N_HIGH_MEMORY];
> +
> +       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
> +
> +               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> +                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> +                       continue;
> +
> +               if (total_swap_pages &&
> +                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> +                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> +                       continue;
> +               node_clear(nid, mem->scan_nodes);
> +       }
> +
> +}
> +
> +/*
> + * Selecting a node where we start reclaim from. Because what we need is just
> + * reducing usage counter, start from anywhere is O,K. Considering
> + * memory reclaim from current node, there are pros. and cons.
> + *
> + * Freeing memory from current node means freeing memory from a node which
> + * we'll use or we've used. So, it may make LRU bad. And if several threads
> + * hit limits, it will see a contention on a node. But freeing from remote
> + * node means more costs for memory reclaim because of memory latency.
> + *
> + * Now, we use round-robin. Better algorithm is welcomed.
> + */
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> +       int node;
> +
> +       mem_cgroup_may_update_nodemask(mem);
> +       node = mem->last_scanned_node;
> +
> +       node = next_node(node, mem->scan_nodes);
> +       if (node == MAX_NUMNODES) {
> +               node = first_node(mem->scan_nodes);
> +               if (unlikely(node == MAX_NUMNODES))
> +                       node = numa_node_id();
Not sure about this logic: is it possible that we end up reclaiming from a
node whose pages are all "unreclaimable" (based on the
mem_cgroup_may_update_nodemask check)?
If I missed anything here, it would be helpful to add a comment.

--Ying

> +       }
> +
> +       mem->last_scanned_node = node;
> +       return node;
> +}
> +
> +#else
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> +       return 0;
> +}
> +#endif
> +
>  /*
>  * Scan the hierarchy if needed to reclaim memory. We remember the last child
>  * we reclaimed from, so that we don't end up penalizing one child extensively
> @@ -4678,6 +4763,7 @@ mem_cgroup_create(struct cgroup_subsys *
>                res_counter_init(&mem->memsw, NULL);
>        }
>        mem->last_scanned_child = 0;
> +       mem->last_scanned_node = MAX_NUMNODES;
>        INIT_LIST_HEAD(&mem->oom_notify);
>
>        if (parent)
> Index: memcg/mm/vmscan.c
> ===================================================================
> --- memcg.orig/mm/vmscan.c
> +++ memcg/mm/vmscan.c
> @@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
>  {
>        struct zonelist *zonelist;
>        unsigned long nr_reclaimed;
> +       int nid;
>        struct scan_control sc = {
>                .may_writepage = !laptop_mode,
>                .may_unmap = 1,
> @@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
>                .mem_cgroup = mem_cont,
>                .nodemask = NULL, /* we don't care the placement */
>        };
> +       /*
> +        * Unlike direct reclaim via alloc_pages(), memcg's reclaim
> +        * don't take care of from where we get pages . So, the node where
> +        * we start scan is not needed to be current node.
> +        */
> +       nid = mem_cgroup_select_victim_node(mem_cont);
>
>        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
>                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
> -       zonelist = NODE_DATA(numa_node_id())->node_zonelists;
> +       zonelist = NODE_DATA(nid)->node_zonelists;
>
>        trace_mm_vmscan_memcg_reclaim_begin(0,
>                                            sc.may_writepage,
>
>


* Re: [PATCHv2] memcg: reclaim memory from node in round-robin
  2011-04-27 17:33 ` Ying Han
@ 2011-04-27 23:57   ` KAMEZAWA Hiroyuki
  2011-04-28  2:49     ` Ying Han
  2011-04-28  0:35   ` [PATCHv3] " KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-27 23:57 UTC (permalink / raw)
  To: Ying Han; +Cc: linux-mm, linux-kernel, akpm, nishimura, balbir

On Wed, 27 Apr 2011 10:33:43 -0700
Ying Han <yinghan@google.com> wrote:

> On Wed, Apr 27, 2011 at 12:51 AM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > I changed the logic a little and add a filter for skipping nodes.
> > With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> > can be unbalanced. So, I think a filter is required.
> 
> Thank you.
> 
> >
> > ==
> > Now, memory cgroup's direct reclaim frees memory from the current node.
> > But this has some troubles. In usual, when a set of threads works in
> > cooperative way, they are tend to on the same node. So, if they hit
> > limits under memcg, it will reclaim memory from themselves, it may be
> > active working set.
> >
> > For example, assume 2 node system which has Node 0 and Node 1
> > and a memcg which has 1G limit. After some work, file cacne remains and
> > and usages are
> >   Node 0:  1M
> >   Node 1:  998M.
> >
> > and run an application on Node 0, it will eats its foot before freeing
> > unnecessary file caches.
> >
> > This patch adds round-robin for NUMA and adds equal pressure to each
> > node. When using cpuset's spread memory feature, this will work very well.
> >
> >
> > From: Ying Han <yinghan@google.com>
> > Signed-off-by: Ying Han <yinghan@google.com>
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > Changelog v1->v2:
> >  - fixed comments.
> >  - added a logic to avoid scanning unused node.
> >
> > ---
> >  include/linux/memcontrol.h |    1
> >  mm/memcontrol.c            |   98 ++++++++++++++++++++++++++++++++++++++++++---
> >  mm/vmscan.c                |    9 +++-
> >  3 files changed, 101 insertions(+), 7 deletions(-)
> >
> > Index: memcg/include/linux/memcontrol.h
> > ===================================================================
> > --- memcg.orig/include/linux/memcontrol.h
> > +++ memcg/include/linux/memcontrol.h
> > @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
> >  */
> >  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> >  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> > +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> >  unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> >                                       struct zone *zone,
> >                                       enum lru_list lru);
> > Index: memcg/mm/memcontrol.c
> > ===================================================================
> > --- memcg.orig/mm/memcontrol.c
> > +++ memcg/mm/memcontrol.c
> > @@ -237,6 +237,11 @@ struct mem_cgroup {
> >         * reclaimed from.
> >         */
> >        int last_scanned_child;
> > +       int last_scanned_node;
> > +#if MAX_NUMNODES > 1
> > +       nodemask_t      scan_nodes;
> > +       unsigned long   next_scan_node_update;
> > +#endif
> >        /*
> >         * Should the accounting and control be hierarchical, per subtree?
> >         */
> > @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
> >        this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
> >  }
> >
> > +static unsigned long
> > +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
> > +{
> > +       struct mem_cgroup_per_zone *mz;
> > +       u64 total;
> > +       int zid;
> > +
> > +       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> > +               mz = mem_cgroup_zoneinfo(mem, nid, zid);
> > +               total += MEM_CGROUP_ZSTAT(mz, idx);
> > +       }
> > +       return total;
> > +}
> >  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
> >                                        enum lru_list idx)
> >  {
> > -       int nid, zid;
> > -       struct mem_cgroup_per_zone *mz;
> > +       int nid;
> >        u64 total = 0;
> >
> >        for_each_online_node(nid)
> > -               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> > -                       mz = mem_cgroup_zoneinfo(mem, nid, zid);
> > -                       total += MEM_CGROUP_ZSTAT(mz, idx);
> > -               }
> > +               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
> >        return total;
> >  }
> >
> > @@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
> >        return ret;
> >  }
> >
> > +#if MAX_NUMNODES > 1
> > +
> > +/*
> > + * Update nodemask always is not very good. Even if we have empty
> > + * list, or wrong list here, we can start from some node and traverse all nodes
> > + * based on zonelist. So, update the list loosely once in 10 secs.
> > + *
> > + */
> > +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> > +{
> > +       int nid;
> > +
> > +       if (time_after(mem->next_scan_node_update, jiffies))
> > +               return;
> > +
> > +       mem->next_scan_node_update = jiffies + 10*HZ;
> > +       /* make a nodemask where this memcg uses memory from */
> > +       mem->scan_nodes = node_states[N_HIGH_MEMORY];
> > +
> > +       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
> > +
> > +               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> > +                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> > +                       continue;
> > +
> > +               if (total_swap_pages &&
> > +                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> > +                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> > +                       continue;
> > +               node_clear(nid, mem->scan_nodes);
> > +       }
> > +
> > +}
> > +
> > +/*
> > + * Selecting a node where we start reclaim from. Because what we need is just
> > + * reducing usage counter, start from anywhere is O,K. Considering
> > + * memory reclaim from current node, there are pros. and cons.
> > + *
> > + * Freeing memory from current node means freeing memory from a node which
> > + * we'll use or we've used. So, it may make LRU bad. And if several threads
> > + * hit limits, it will see a contention on a node. But freeing from remote
> > + * node means more costs for memory reclaim because of memory latency.
> > + *
> > + * Now, we use round-robin. Better algorithm is welcomed.
> > + */
> > +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> > +{
> > +       int node;
> > +
> > +       mem_cgroup_may_update_nodemask(mem);
> > +       node = mem->last_scanned_node;
> > +
> > +       node = next_node(node, mem->scan_nodes);
> > +       if (node == MAX_NUMNODES) {
> > +               node = first_node(mem->scan_nodes);
> > +               if (unlikely(node == MAX_NUMNODES))
> > +                       node = numa_node_id();
> not sure about this logic, is that possible we reclaim from a node
> with all "unreclaimable" pages (based on the
> mem_cgroup_may_update_nodemask check).
> If i missed anything here, it would be helpful to add comment.
> 

What I'm afraid of here is that when a user uses a very small memcg,
all pages on the LRU may be isolated, all of the usage may sit in memcg's
per-cpu caches, or, because of task migration between memcgs, the limit may
be hit before there are any pages on the LRU at all... I think there are
possible corner cases which could cause a hang.

OK, will add a comment.

Thanks,
-Kame








* [PATCHv3] memcg: reclaim memory from node in round-robin
  2011-04-27 17:33 ` Ying Han
  2011-04-27 23:57   ` KAMEZAWA Hiroyuki
@ 2011-04-28  0:35   ` KAMEZAWA Hiroyuki
  2011-04-28  1:37     ` Daisuke Nishimura
  1 sibling, 1 reply; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-28  0:35 UTC (permalink / raw)
  To: Ying Han; +Cc: linux-mm, linux-kernel, akpm, nishimura, balbir

Now, memory cgroup's direct reclaim frees memory from the current node.
But this has some troubles. Usually, when a set of threads works in a
cooperative way, they tend to run on the same node. So, if they hit the
limit under memcg, they will reclaim memory from themselves, which may be
their active working set.

For example, assume a 2-node system which has Node 0 and Node 1
and a memcg which has a 1G limit. After some work, file cache remains and
the usages are
   Node 0:  1M
   Node 1:  998M.

If an application then runs on Node 0, it will eat its own working set
before freeing the unnecessary file caches on Node 1.

This patch adds round-robin node selection for NUMA and applies equal
reclaim pressure to each node. When using cpuset's memory spread feature,
this will work very well.

But yes, a better algorithm is appreciated.

From: Ying Han <yinghan@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Changelog v2->v3
  - added comments on why we need the sanity check.

Changelog v1->v2:
  - fixed comments.
  - added logic to avoid scanning unused nodes.

---
 include/linux/memcontrol.h |    1 
 mm/memcontrol.c            |  102 ++++++++++++++++++++++++++++++++++++++++++---
 mm/vmscan.c                |    9 +++
 3 files changed, 105 insertions(+), 7 deletions(-)

Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 				       struct zone *zone,
 				       enum lru_list lru);
Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -237,6 +237,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long   next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct 
 	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
@@ -1471,6 +1485,81 @@ mem_cgroup_select_victim(struct mem_cgro
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good. Even if the mask here is
+ * empty or stale, we can still start from some node and traverse all nodes
+ * based on the zonelist. So, update the mask loosely, once every 10 seconds.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node from which to start reclaim. Because what we need is just
+ * to reduce the usage counter, starting from anywhere is OK. Reclaiming
+ * from the current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node which
+ * we'll use or have used. So, it may make the LRU bad. And if several threads
+ * hit the limit, they will contend on one node. But freeing from a remote
+ * node means more cost for memory reclaim because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to the
+	 * LRU. The LRUs may hold no pages because all pages are UNEVICTABLE,
+	 * or the memcg is too small and none of its pages are on an LRU yet.
+	 * In that case, we use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4678,6 +4767,7 @@ mem_cgroup_create(struct cgroup_subsys *
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
 		.mem_cgroup = mem_cont,
 		.nodemask = NULL, /* we don't care the placement */
 	};
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim does not
+	 * care which node the pages come from. So, the node where we start
+	 * the scan does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 					    sc.may_writepage,



* Re: [PATCHv3] memcg: reclaim memory from node in round-robin
  2011-04-28  0:35   ` [PATCHv3] " KAMEZAWA Hiroyuki
@ 2011-04-28  1:37     ` Daisuke Nishimura
  2011-04-28  1:49       ` [PATCHv4] " KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 15+ messages in thread
From: Daisuke Nishimura @ 2011-04-28  1:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Ying Han, linux-mm, linux-kernel, akpm, balbir, Daisuke Nishimura

On Thu, 28 Apr 2011 09:35:13 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
> 
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
                                                        ^^^^^
                                                        cache
> and usages are
>    Node 0:  1M
>    Node 1:  998M.
> 
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
> 
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. With using cpuset's spread memory feature, this will work very well.
> 
> But yes, better algorithm is appreciated.
> 
> From: Ying Han <yinghan@google.com>
> Signed-off-by: Ying Han <yinghan@google.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Changelog v2->v3
>   - added comments for why we need sanity check.
> 
> Changelog v1->v2:
>   - fixed comments.
>   - added a logic to avoid scanning unused node.
> 
> ---
>  include/linux/memcontrol.h |    1 
>  mm/memcontrol.c            |  102 ++++++++++++++++++++++++++++++++++++++++++---
>  mm/vmscan.c                |    9 +++
>  3 files changed, 105 insertions(+), 7 deletions(-)
> 
> Index: memcg/include/linux/memcontrol.h
> ===================================================================
> --- memcg.orig/include/linux/memcontrol.h
> +++ memcg/include/linux/memcontrol.h
> @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
>   */
>  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
>  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
>  unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
>  				       struct zone *zone,
>  				       enum lru_list lru);
> Index: memcg/mm/memcontrol.c
> ===================================================================
> --- memcg.orig/mm/memcontrol.c
> +++ memcg/mm/memcontrol.c
> @@ -237,6 +237,11 @@ struct mem_cgroup {
>  	 * reclaimed from.
>  	 */
>  	int last_scanned_child;
> +	int last_scanned_node;
> +#if MAX_NUMNODES > 1
> +	nodemask_t	scan_nodes;
> +	unsigned long   next_scan_node_update;
> +#endif
>  	/*
>  	 * Should the accounting and control be hierarchical, per subtree?
>  	 */
> @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct 
>  	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
>  }
>  
> +static unsigned long
> +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
> +{
> +	struct mem_cgroup_per_zone *mz;
> +	u64 total;
> +	int zid;
> +
> +	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> +		mz = mem_cgroup_zoneinfo(mem, nid, zid);
> +		total += MEM_CGROUP_ZSTAT(mz, idx);
> +	}
> +	return total;
> +}
>  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
>  					enum lru_list idx)
>  {
> -	int nid, zid;
> -	struct mem_cgroup_per_zone *mz;
> +	int nid;
>  	u64 total = 0;
>  
>  	for_each_online_node(nid)
> -		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> -			mz = mem_cgroup_zoneinfo(mem, nid, zid);
> -			total += MEM_CGROUP_ZSTAT(mz, idx);
> -		}
> +		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
>  	return total;
>  }
>  
> @@ -1471,6 +1485,81 @@ mem_cgroup_select_victim(struct mem_cgro
>  	return ret;
>  }
>  
> +#if MAX_NUMNODES > 1
> +
> +/*
> + * Update nodemask always is not very good. Even if we have empty
> + * list, or wrong list here, we can start from some node and traverse all nodes
> + * based on zonelist. So, update the list loosely once in 10 secs.
> + *
> + */
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +{
> +	int nid;
> +
> +	if (time_after(mem->next_scan_node_update, jiffies))
> +		return;
> +
Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?

Looks good to me, otherwise.

Thanks,
Daisuke Nishimura.

> +	mem->next_scan_node_update = jiffies + 10*HZ;
> +	/* make a nodemask where this memcg uses memory from */
> +	mem->scan_nodes = node_states[N_HIGH_MEMORY];
> +
> +	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
> +
> +		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> +		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> +			continue;
> +
> +		if (total_swap_pages &&
> +		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> +		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> +			continue;
> +		node_clear(nid, mem->scan_nodes);
> +	}
> +}
> +
> +/*
> + * Selecting a node where we start reclaim from. Because what we need is just
> + * reducing usage counter, start from anywhere is O,K. Considering
> + * memory reclaim from current node, there are pros. and cons.
> + *
> + * Freeing memory from current node means freeing memory from a node which
> + * we'll use or we've used. So, it may make LRU bad. And if several threads
> + * hit limits, it will see a contention on a node. But freeing from remote
> + * node means more costs for memory reclaim because of memory latency.
> + *
> + * Now, we use round-robin. Better algorithm is welcomed.
> + */
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> +	int node;
> +
> +	mem_cgroup_may_update_nodemask(mem);
> +	node = mem->last_scanned_node;
> +
> +	node = next_node(node, mem->scan_nodes);
> +	if (node == MAX_NUMNODES)
> +		node = first_node(mem->scan_nodes);
> +	/*
> +	 * We call this when we hit limit, not when pages are added to LRU.
> +	 * No LRU may hold pages because all pages are UNEVICTABLE or
> +	 * memcg is too small and all pages are not on LRU. In that case,
> +	 * we use curret node.
> +	 */
> +	if (unlikely(node == MAX_NUMNODES))
> +		node = numa_node_id();
> +
> +	mem->last_scanned_node = node;
> +	return node;
> +}
> +
> +#else
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> +	return 0;
> +}
> +#endif
> +
>  /*
>   * Scan the hierarchy if needed to reclaim memory. We remember the last child
>   * we reclaimed from, so that we don't end up penalizing one child extensively
> @@ -4678,6 +4767,7 @@ mem_cgroup_create(struct cgroup_subsys *
>  		res_counter_init(&mem->memsw, NULL);
>  	}
>  	mem->last_scanned_child = 0;
> +	mem->last_scanned_node = MAX_NUMNODES;
>  	INIT_LIST_HEAD(&mem->oom_notify);
>  
>  	if (parent)
> Index: memcg/mm/vmscan.c
> ===================================================================
> --- memcg.orig/mm/vmscan.c
> +++ memcg/mm/vmscan.c
> @@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
>  {
>  	struct zonelist *zonelist;
>  	unsigned long nr_reclaimed;
> +	int nid;
>  	struct scan_control sc = {
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
> @@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
>  		.mem_cgroup = mem_cont,
>  		.nodemask = NULL, /* we don't care the placement */
>  	};
> +	/*
> +	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim
> +	 * don't take care of from where we get pages . So, the node where
> +	 * we start scan is not needed to be current node.
> +	 */
> +	nid = mem_cgroup_select_victim_node(mem_cont);
>  
>  	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
>  			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
> -	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
> +	zonelist = NODE_DATA(nid)->node_zonelists;
>  
>  	trace_mm_vmscan_memcg_reclaim_begin(0,
>  					    sc.may_writepage,
> 


* [PATCHv4] memcg: reclaim memory from node in round-robin
  2011-04-28  1:37     ` Daisuke Nishimura
@ 2011-04-28  1:49       ` KAMEZAWA Hiroyuki
  2011-04-28  2:04         ` Daisuke Nishimura
  2011-05-04 21:26         ` Andrew Morton
  0 siblings, 2 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-28  1:49 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: Ying Han, linux-mm, linux-kernel, akpm, balbir

On Thu, 28 Apr 2011 10:37:05 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > +	if (time_after(mem->next_scan_node_update, jiffies))
> > +		return;
> > +
> Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
> 
> Looks good to me, otherwise.
> 

time_after(a, b) returns true when a is after b.....you're right.
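
For reference, the comparison semantics, as a standalone sketch modeled on
the include/linux/jiffies.h helpers (the real macros also typecheck both
arguments as unsigned long):

#include <stdio.h>

/* wrap-safe "a is later than b" on free-running unsigned counters */
#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)

int main(void)
{
	unsigned long now = 1000;
	unsigned long later = now + 10 * 100;	/* e.g. 10 seconds at HZ=100 */

	printf("time_after(later, now)  = %d\n", time_after(later, now));	/* 1 */
	printf("time_before(later, now) = %d\n", time_before(later, now));	/* 0 */
	return 0;
}
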
==
Now, memory cgroup's direct reclaim frees memory from the current node.
But this has some troubles. Usually, when a set of threads works in a
cooperative way, they tend to run on the same node. So, if they hit the
limit under memcg, they will reclaim memory from themselves, which may be
their active working set.

For example, assume a 2-node system which has Node 0 and Node 1
and a memcg which has a 1G limit. After some work, file cache remains and
the usages are
   Node 0:  1M
   Node 1:  998M.

If an application then runs on Node 0, it will eat its own working set
before freeing the unnecessary file caches on Node 1.

This patch adds round-robin node selection for NUMA and applies equal
reclaim pressure to each node. When using cpuset's memory spread feature,
this will work very well.

But yes, a better algorithm is appreciated.

From: Ying Han <yinghan@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Changelog v3->v4
  - fixed the time_after() bug, using time_before() instead.
Changelog v2->v3
  - added comments.

Changelog v1->v2:
  - fixed comments.
  - added logic to avoid scanning unused nodes.

---
 include/linux/memcontrol.h |    1 
 mm/memcontrol.c            |  102 ++++++++++++++++++++++++++++++++++++++++++---
 mm/vmscan.c                |    9 +++
 3 files changed, 105 insertions(+), 7 deletions(-)

Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 				       struct zone *zone,
 				       enum lru_list lru);
Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -237,6 +237,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long   next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct 
 	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
@@ -1471,6 +1485,81 @@ mem_cgroup_select_victim(struct mem_cgro
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good. Even if the mask here is
+ * empty or stale, we can still start from some node and traverse all nodes
+ * based on the zonelist. So, update the mask loosely, once every 10 seconds.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_before(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node from which to start reclaim. Because what we need is just
+ * to reduce the usage counter, starting from anywhere is OK. Reclaiming
+ * from the current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node which
+ * we'll use or have used. So, it may make the LRU bad. And if several threads
+ * hit the limit, they will contend on one node. But freeing from a remote
+ * node means more cost for memory reclaim because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to the
+	 * LRU. The LRUs may hold no pages because all pages are UNEVICTABLE,
+	 * or the memcg is too small and none of its pages are on an LRU yet.
+	 * In that case, we use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4678,6 +4767,7 @@ mem_cgroup_create(struct cgroup_subsys *
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
 		.mem_cgroup = mem_cont,
 		.nodemask = NULL, /* we don't care the placement */
 	};
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim does not
+	 * care which node the pages come from. So, the node where we start
+	 * the scan does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 					    sc.may_writepage,



* Re: [PATCHv4] memcg: reclaim memory from node in round-robin
  2011-04-28  1:49       ` [PATCHv4] " KAMEZAWA Hiroyuki
@ 2011-04-28  2:04         ` Daisuke Nishimura
  2011-05-04 21:26         ` Andrew Morton
  1 sibling, 0 replies; 15+ messages in thread
From: Daisuke Nishimura @ 2011-04-28  2:04 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, Ying Han, linux-mm, linux-kernel, akpm, balbir

On Thu, 28 Apr 2011 10:49:12 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 28 Apr 2011 10:37:05 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > +	if (time_after(mem->next_scan_node_update, jiffies))
> > > +		return;
> > > +
> > Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
> > 
> > Looks good to me, otherwise.
> > 
> 
> time_after(a, b) returns true when a is after b.....you're right.
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
> 
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
>    Node 0:  1M
>    Node 1:  998M.
> 
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
> 
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.
> 
> But yes, better algorithm is appreciated.
> 
> From: Ying Han <yinghan@google.com>
> Signed-off-by: Ying Han <yinghan@google.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
Acked-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>


* Re: [PATCHv2] memcg: reclaim memory from node in round-robin
  2011-04-27 23:57   ` KAMEZAWA Hiroyuki
@ 2011-04-28  2:49     ` Ying Han
  0 siblings, 0 replies; 15+ messages in thread
From: Ying Han @ 2011-04-28  2:49 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, linux-kernel, akpm, nishimura, balbir

On Wed, Apr 27, 2011 at 4:57 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Wed, 27 Apr 2011 10:33:43 -0700
> Ying Han <yinghan@google.com> wrote:
>
>> On Wed, Apr 27, 2011 at 12:51 AM, KAMEZAWA Hiroyuki
>> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> > I changed the logic a little and add a filter for skipping nodes.
>> > With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
>> > can be unbalanced. So, I think a filter is required.
>>
>> Thank you.
>>
>> >
>> > ==
>> > Now, memory cgroup's direct reclaim frees memory from the current node.
>> > But this has some troubles. In usual, when a set of threads works in
>> > cooperative way, they are tend to on the same node. So, if they hit
>> > limits under memcg, it will reclaim memory from themselves, it may be
>> > active working set.
>> >
>> > For example, assume 2 node system which has Node 0 and Node 1
>> > and a memcg which has 1G limit. After some work, file cacne remains and
>> > and usages are
>> >   Node 0:  1M
>> >   Node 1:  998M.
>> >
>> > and run an application on Node 0, it will eats its foot before freeing
>> > unnecessary file caches.
>> >
>> > This patch adds round-robin for NUMA and adds equal pressure to each
>> > node. When using cpuset's spread memory feature, this will work very well.
>> >
>> >
>> > From: Ying Han <yinghan@google.com>
>> > Signed-off-by: Ying Han <yinghan@google.com>
>> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>> >
>> > Changelog v1->v2:
>> >  - fixed comments.
>> >  - added a logic to avoid scanning unused node.
>> >
>> > ---
>> >  include/linux/memcontrol.h |    1
>> >  mm/memcontrol.c            |   98 ++++++++++++++++++++++++++++++++++++++++++---
>> >  mm/vmscan.c                |    9 +++-
>> >  3 files changed, 101 insertions(+), 7 deletions(-)
>> >
>> > Index: memcg/include/linux/memcontrol.h
>> > ===================================================================
>> > --- memcg.orig/include/linux/memcontrol.h
>> > +++ memcg/include/linux/memcontrol.h
>> > @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
>> >  */
>> >  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
>> >  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
>> > +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
>> >  unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
>> >                                       struct zone *zone,
>> >                                       enum lru_list lru);
>> > Index: memcg/mm/memcontrol.c
>> > ===================================================================
>> > --- memcg.orig/mm/memcontrol.c
>> > +++ memcg/mm/memcontrol.c
>> > @@ -237,6 +237,11 @@ struct mem_cgroup {
>> >         * reclaimed from.
>> >         */
>> >        int last_scanned_child;
>> > +       int last_scanned_node;
>> > +#if MAX_NUMNODES > 1
>> > +       nodemask_t      scan_nodes;
>> > +       unsigned long   next_scan_node_update;
>> > +#endif
>> >        /*
>> >         * Should the accounting and control be hierarchical, per subtree?
>> >         */
>> > @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
>> >        this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
>> >  }
>> >
>> > +static unsigned long
>> > +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
>> > +{
>> > +       struct mem_cgroup_per_zone *mz;
>> > +       u64 total;
>> > +       int zid;
>> > +
>> > +       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
>> > +               mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> > +               total += MEM_CGROUP_ZSTAT(mz, idx);
>> > +       }
>> > +       return total;
>> > +}
>> >  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
>> >                                        enum lru_list idx)
>> >  {
>> > -       int nid, zid;
>> > -       struct mem_cgroup_per_zone *mz;
>> > +       int nid;
>> >        u64 total = 0;
>> >
>> >        for_each_online_node(nid)
>> > -               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
>> > -                       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> > -                       total += MEM_CGROUP_ZSTAT(mz, idx);
>> > -               }
>> > +               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
>> >        return total;
>> >  }
>> >
>> > @@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
>> >        return ret;
>> >  }
>> >
>> > +#if MAX_NUMNODES > 1
>> > +
>> > +/*
>> > + * Update nodemask always is not very good. Even if we have empty
>> > + * list, or wrong list here, we can start from some node and traverse all nodes
>> > + * based on zonelist. So, update the list loosely once in 10 secs.
>> > + *
>> > + */
>> > +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
>> > +{
>> > +       int nid;
>> > +
>> > +       if (time_after(mem->next_scan_node_update, jiffies))
>> > +               return;
>> > +
>> > +       mem->next_scan_node_update = jiffies + 10*HZ;
>> > +       /* make a nodemask where this memcg uses memory from */
>> > +       mem->scan_nodes = node_states[N_HIGH_MEMORY];
>> > +
>> > +       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
>> > +
>> > +               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
>> > +                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
>> > +                       continue;
>> > +
>> > +               if (total_swap_pages &&
>> > +                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
>> > +                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
>> > +                       continue;
>> > +               node_clear(nid, mem->scan_nodes);
>> > +       }
>> > +
>> > +}
>> > +
>> > +/*
>> > + * Selecting a node where we start reclaim from. Because what we need is just
>> > + * reducing usage counter, start from anywhere is O,K. Considering
>> > + * memory reclaim from current node, there are pros. and cons.
>> > + *
>> > + * Freeing memory from current node means freeing memory from a node which
>> > + * we'll use or we've used. So, it may make LRU bad. And if several threads
>> > + * hit limits, it will see a contention on a node. But freeing from remote
>> > + * node means more costs for memory reclaim because of memory latency.
>> > + *
>> > + * Now, we use round-robin. Better algorithm is welcomed.
>> > + */
>> > +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
>> > +{
>> > +       int node;
>> > +
>> > +       mem_cgroup_may_update_nodemask(mem);
>> > +       node = mem->last_scanned_node;
>> > +
>> > +       node = next_node(node, mem->scan_nodes);
>> > +       if (node == MAX_NUMNODES) {
>> > +               node = first_node(mem->scan_nodes);
>> > +               if (unlikely(node == MAX_NUMNODES))
>> > +                       node = numa_node_id();
>> not sure about this logic, is that possible we reclaim from a node
>> with all "unreclaimable" pages (based on the
>> mem_cgroup_may_update_nodemask check).
>> If i missed anything here, it would be helpful to add comment.
>>
>
> What I'm afraid here is when a user uses very small memcg,
> all pages on the LRU may be isolated or all usages are in per-cpu cache
> of memcg or because of task-migration between memcg, it hits limit before
> having any pages on LRU.....I think there is possible corner cases which
> can cause hang.
>
> ok, will add comment.

Ok, thanks. Otherwise it looks good.

Acked-by: Ying Han <yinghan@google.com>

--Ying
>
> Thanks,
> -Kame
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHv4] memcg: reclaim memory from node in round-robin
  2011-04-28  1:49       ` [PATCHv4] " KAMEZAWA Hiroyuki
  2011-04-28  2:04         ` Daisuke Nishimura
@ 2011-05-04 21:26         ` Andrew Morton
  2011-05-06  6:13           ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2011-05-04 21:26 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, Ying Han, linux-mm, linux-kernel, balbir

On Thu, 28 Apr 2011 10:49:12 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 28 Apr 2011 10:37:05 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > +	if (time_after(mem->next_scan_node_update, jiffies))
> > > +		return;
> > > +
> > Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
> > 
> > Looks good to me, otherwise.
> > 
> 
> time_after(a, b) returns true when a is after b.....you're right.
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
> 
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
>    Node 0:  1M
>    Node 1:  998M.
> 
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
> 
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.
> 
> But yes, better algorithm is appreciated.

That ten-second thing is a gruesome and ghastly hack, but didn't even
get a mention in the patch description?

Talk to us about it.  Why is it there?  What are the implications of
getting it wrong?  What alternatives are there? 

It would be much better to work out the optimum time at which to rotate
the index via some deterministic means.

If we can't think of a way of doing that then we should at least pace
the rotation frequency via something saner than wall-time.  Such as
number-of-pages-scanned.
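
A minimal sketch of that kind of pacing, assuming a hypothetical per-memcg
counter (scanned_since_update) charged from the reclaim path; the names,
the changed function signature and the threshold are illustrative only:

	/* Sketch: rotate the victim nodemask by scan volume, not wall time. */
	#define MEMCG_NODE_SCAN_PERIOD	(16 * SWAP_CLUSTER_MAX)	/* arbitrary */

	static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem,
						   unsigned long nr_scanned)
	{
		mem->scanned_since_update += nr_scanned;	/* hypothetical field */
		if (mem->scanned_since_update < MEMCG_NODE_SCAN_PERIOD)
			return;
		mem->scanned_since_update = 0;
		/* ... rebuild mem->scan_nodes as the existing code does ... */
	}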


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHv4] memcg: reclaim memory from node in round-robin
  2011-05-04 21:26         ` Andrew Morton
@ 2011-05-06  6:13           ` KAMEZAWA Hiroyuki
  2011-05-26 19:52             ` Andrew Morton
  0 siblings, 1 reply; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-05-06  6:13 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Daisuke Nishimura, Ying Han, linux-mm, linux-kernel, balbir

On Wed, 4 May 2011 14:26:23 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Thu, 28 Apr 2011 10:49:12 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > On Thu, 28 Apr 2011 10:37:05 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > +	if (time_after(mem->next_scan_node_update, jiffies))
> > > > +		return;
> > > > +
> > > Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
> > > 
> > > Looks good to me, otherwise.
> > > 
> > 
> > time_after(a, b) returns true when a is after b.....you're right.
> > ==
> > Now, memory cgroup's direct reclaim frees memory from the current node.
> > But this has some troubles. In usual, when a set of threads works in
> > cooperative way, they are tend to on the same node. So, if they hit
> > limits under memcg, it will reclaim memory from themselves, it may be
> > active working set.
> > 
> > For example, assume 2 node system which has Node 0 and Node 1
> > and a memcg which has 1G limit. After some work, file cacne remains and
> > and usages are
> >    Node 0:  1M
> >    Node 1:  998M.
> > 
> > and run an application on Node 0, it will eats its foot before freeing
> > unnecessary file caches.
> > 
> > This patch adds round-robin for NUMA and adds equal pressure to each
> > node. When using cpuset's spread memory feature, this will work very well.
> > 
> > But yes, better algorithm is appreciated.
> 
> That ten-second thing is a gruesome and ghastly hack, but didn't even
> get a mention in the patch description?
> 
> Talk to us about it.  Why is it there?  What are the implications of
> getting it wrong?  What alternatives are there? 
> 

Ah, sorry, I couldn't think of a fix at that level, so I posted it as it was.

> It would be much better to work out the optimum time at which to rotate
> the index via some deterministic means.
> 
> If we can't think of a way of doing that then we should at least pace
> the rotation frequency via something saner than wall-time.  Such as
> number-of-pages-scanned.
> 


What I think now is using reclaim_stat, or some fairness based on the
ratio of inactive file caches. We can calculate the total sum of
reclaim_stat, which gives us a scan ratio for the whole memcg, and we can
calculate the LRU rotate/scan ratio per node. If a node's rotate/scan ratio
is small, it is a good candidate as a reclaim target. Hmm,

  - check which memory (anon or file) should be scanned.
    (If file is too small, the rotate/scan ratio of file is meaningless.)
  - check the rotate/scan ratio of each node.
  - calculate a weight for each node (by some logic ?)
  - give each node a fair scan w.r.t. its weight.

Hmm, I'll study this; a rough sketch of the idea follows.
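
A rough sketch of that weighting; mem_cgroup_node_recent_scanned()/_rotated()
are hypothetical helpers (they do not exist; they stand for summing
reclaim_stat over the memcg's zones on that node), and the scaling is arbitrary:

	static unsigned long
	mem_cgroup_node_scan_weight(struct mem_cgroup *mem, int nid)
	{
		unsigned long scanned = mem_cgroup_node_recent_scanned(mem, nid);
		unsigned long rotated = mem_cgroup_node_recent_rotated(mem, nid);
		unsigned long ratio;

		if (!scanned)
			return 1;	/* no history yet: minimal, non-zero weight */

		ratio = rotated * 1000 / scanned;
		if (ratio > 1000)
			ratio = 1000;
		/* a low rotate/scan ratio means the node's LRU reclaims easily */
		return 1001 - ratio;
	}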

Thanks.
-Kame

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHv2] memcg: reclaim memory from node in round-robin
  2011-04-27  7:51 [PATCHv2] memcg: reclaim memory from node in round-robin KAMEZAWA Hiroyuki
  2011-04-27 17:33 ` Ying Han
@ 2011-05-09  2:20 ` KOSAKI Motohiro
  2011-05-09  2:30   ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 15+ messages in thread
From: KOSAKI Motohiro @ 2011-05-09  2:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: kosaki.motohiro, linux-mm, linux-kernel, akpm, nishimura,
	Ying Han, balbir

> I changed the logic a little and add a filter for skipping nodes.
> With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> can be unbalanced. So, I think a filter is required.
> 
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
> 
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
>    Node 0:  1M
>    Node 1:  998M.
> 
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
> 
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.

Looks nice. And it would be even nicer if global reclaim had the same feature.
Do you have a plan to do that?



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHv2] memcg: reclaim memory from node in round-robin
  2011-05-09  2:20 ` [PATCHv2] " KOSAKI Motohiro
@ 2011-05-09  2:30   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-05-09  2:30 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: linux-mm, linux-kernel, akpm, nishimura, Ying Han, balbir

On Mon,  9 May 2011 11:20:31 +0900 (JST)
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:

> > I changed the logic a little and add a filter for skipping nodes.
> > With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> > can be unbalanced. So, I think a filter is required.
> > 
> > ==
> > Now, memory cgroup's direct reclaim frees memory from the current node.
> > But this has some troubles. In usual, when a set of threads works in
> > cooperative way, they are tend to on the same node. So, if they hit
> > limits under memcg, it will reclaim memory from themselves, it may be
> > active working set.
> > 
> > For example, assume 2 node system which has Node 0 and Node 1
> > and a memcg which has 1G limit. After some work, file cacne remains and
> > and usages are
> >    Node 0:  1M
> >    Node 1:  998M.
> > 
> > and run an application on Node 0, it will eats its foot before freeing
> > unnecessary file caches.
> > 
> > This patch adds round-robin for NUMA and adds equal pressure to each
> > node. When using cpuset's spread memory feature, this will work very well.
> 
> Looks nice. And it would be more nice if global reclaim has the same feature.
> Do you have a plan to do it?
> 

Hmm, IIUC, when allocating memory for file cache, we may be able to avoid always
starting from the current node. But isn't that already a feature of cpuset?
With cpuset.memory_spread_page==1, pages for file cache are allocated from the
allowed nodes in round-robin, and memory reclaim then proceeds in the same
manner (using a node-only zonelist fallback).

Do you mean the kernel should have a knob to allow non-local allocation for
file caches even without cpuset?
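
(For reference, assuming the cpuset controller is mounted at /dev/cpuset,
that spread behaviour is enabled per cpuset with something like
	echo 1 > /dev/cpuset/<name>/cpuset.memory_spread_page
where <name> is the cpuset in question.)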

Thanks,
-Kame




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHv4] memcg: reclaim memory from node in round-robin
  2011-05-06  6:13           ` KAMEZAWA Hiroyuki
@ 2011-05-26 19:52             ` Andrew Morton
  2011-05-26 23:54               ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2011-05-26 19:52 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, Ying Han, linux-mm, linux-kernel, balbir

On Fri, 6 May 2011 15:13:02 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> > It would be much better to work out the optimum time at which to rotate
> > the index via some deterministic means.
> > 
> > If we can't think of a way of doing that then we should at least pace
> > the rotation frequency via something saner than wall-time.  Such as
> > number-of-pages-scanned.
> > 
> 
> 
> What I think now is using reclaim_stat or usigng some fairness based on
> the ratio of inactive file caches. We can calculate the total sum of
> recalaim_stat which gives us a scan_ratio for a whole memcg. And we can
> calculate LRU rotate/scan ratio per node. If rotate/scan ratio is small,
> it will be a good candidate of reclaim target. Hmm,
> 
>   - check which memory(anon or file) should be scanned.
>     (If file is too small, rotate/scan ratio of file is meaningless.)
>   - check rotate/scan ratio of each nodes.
>   - calculate weights for each nodes (by some logic ?)
>   - give a fair scan w.r.t node's weight.
> 
> Hmm, I'll have a study on this.

How's the study coming along ;)

I'll send this in to Linus today, but I'll feel grumpy while doing so. 
We really should do something smarter here - the magic constant will
basically always be suboptimal for everyone and we end up tweaking its
value (if we don't, then the feature just wasn't valuable in the first
place) and then we add a tunable and then people try to tweak the
default setting of the tunable and then I deride them for not setting
the tunable in initscripts and then we have to maintain the stupid
tunable after we've changed the internal implementation and it's all
basically screwed up.

How do we automatically determine the optimum time at which to rotate,
at runtime?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHv4] memcg: reclaim memory from node in round-robin
  2011-05-26 19:52             ` Andrew Morton
@ 2011-05-26 23:54               ` KAMEZAWA Hiroyuki
  2011-05-27  2:39                 ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-05-26 23:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Daisuke Nishimura, Ying Han, linux-mm, linux-kernel, balbir

On Thu, 26 May 2011 12:52:07 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Fri, 6 May 2011 15:13:02 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > > It would be much better to work out the optimum time at which to rotate
> > > the index via some deterministic means.
> > > 
> > > If we can't think of a way of doing that then we should at least pace
> > > the rotation frequency via something saner than wall-time.  Such as
> > > number-of-pages-scanned.
> > > 
> > 
> > 
> > What I think now is using reclaim_stat or usigng some fairness based on
> > the ratio of inactive file caches. We can calculate the total sum of
> > recalaim_stat which gives us a scan_ratio for a whole memcg. And we can
> > calculate LRU rotate/scan ratio per node. If rotate/scan ratio is small,
> > it will be a good candidate of reclaim target. Hmm,
> > 
> >   - check which memory(anon or file) should be scanned.
> >     (If file is too small, rotate/scan ratio of file is meaningless.)
> >   - check rotate/scan ratio of each nodes.
> >   - calculate weights for each nodes (by some logic ?)
> >   - give a fair scan w.r.t node's weight.
> > 
> > Hmm, I'll have a study on this.
> 
> How's the study coming along ;)
> 
> I'll send this in to Linus today, but I'll feel grumpy while doing so. 
> We really should do something smarter here - the magic constant will
> basically always be suboptimal for everyone and we end up tweaking its
> value (if we don't, then the feature just wasn't valuable in the first
> place) and then we add a tunable and then people try to tweak the
> default setting of the tunable and then I deride them for not setting
> the tunable in initscripts and then we have to maintain the stupid
> tunable after we've changed the internal implementation and it's all
> basically screwed up.
> 
> How to we automatically determine the optimum time at which to rotate,
> at runtime?
> 

Ah, I think I should revisit this after dirty page accounting is merged, because
the ratio of dirty pages is important information.

Ok, what I think now is simply comparing the number of INACTIVE_FILE pages, or
the number of FILE CACHE pages, per node.

I think we can periodically update the per-node and total amounts of file cache
and record the per-node value
   node-file-cache * 100 / total-file-cache
in memcg's per-node structure.

Then, I think we can do some scheduling like lottery scheduling: a scan
proportional to each node's share of the memcg's file cache (a small worked
example is below). If it's better to also check INACTIVE_ANON, I think
swappiness can be used in the calculation above.

But yes, I or someone may be able to think of something much better.
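
(A made-up example of that proportional scan: if the memcg holds 300 file
pages on Node 0 and 100 on Node 1, the recorded shares are 75 and 25, so a
lottery draw over [0, 100) picks Node 0 about three times as often as Node 1,
and Node 0 gets roughly 3x the scan pressure.)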

Thanks,
-Kame






^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHv4] memcg: reclaim memory from node in round-robin
  2011-05-26 23:54               ` KAMEZAWA Hiroyuki
@ 2011-05-27  2:39                 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-05-27  2:39 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Daisuke Nishimura, Ying Han, linux-mm,
	linux-kernel, balbir

On Fri, 27 May 2011 08:54:40 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 26 May 2011 12:52:07 -0700
> Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > On Fri, 6 May 2011 15:13:02 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > 
> > > > It would be much better to work out the optimum time at which to rotate
> > > > the index via some deterministic means.
> > > > 
> > > > If we can't think of a way of doing that then we should at least pace
> > > > the rotation frequency via something saner than wall-time.  Such as
> > > > number-of-pages-scanned.
> > > > 
> > > 
> > > 
> > > What I think now is using reclaim_stat or usigng some fairness based on
> > > the ratio of inactive file caches. We can calculate the total sum of
> > > recalaim_stat which gives us a scan_ratio for a whole memcg. And we can
> > > calculate LRU rotate/scan ratio per node. If rotate/scan ratio is small,
> > > it will be a good candidate of reclaim target. Hmm,
> > > 
> > >   - check which memory(anon or file) should be scanned.
> > >     (If file is too small, rotate/scan ratio of file is meaningless.)
> > >   - check rotate/scan ratio of each nodes.
> > >   - calculate weights for each nodes (by some logic ?)
> > >   - give a fair scan w.r.t node's weight.
> > > 
> > > Hmm, I'll have a study on this.
> > 
> > How's the study coming along ;)
> > 
> > I'll send this in to Linus today, but I'll feel grumpy while doing so. 
> > We really should do something smarter here - the magic constant will
> > basically always be suboptimal for everyone and we end up tweaking its
> > value (if we don't, then the feature just wasn't valuable in the first
> > place) and then we add a tunable and then people try to tweak the
> > default setting of the tunable and then I deride them for not setting
> > the tunable in initscripts and then we have to maintain the stupid
> > tunable after we've changed the internal implementation and it's all
> > basically screwed up.
> > 
> > How to we automatically determine the optimum time at which to rotate,
> > at runtime?
> > 
> 
> Ah, I think I should check it after dirty page accounting comes...because
> ratio of dirty pages is an important information..
> 
> Ok, what I think now is just comparing the number of INACTIVE_FILE or the number
> of FILE CACHES per node. 
> 
> I think we can periodically update per-node and total amount of file caches
> and we can record per-node 
>    node-file-cache * 100/ total-file cache
> information into memcg's per-node structure.
> 

Hmmm.. something like this?
==
This will not apply to mmotm directly.
This patch is full of magic numbers....I need more study
and will then be able to write a simpler one.

At first, mem_cgroup can reclaim memory from anywhere; it only checks
the amount of memory. Now, the victim node to be reclaimed from is
determined just by round-robin.

This patch adds a scheduler similar to weighted fair share scanning
among nodes. We already periodically update mem->scan_nodes to know
which nodes have evictable memory; this patch gathers more information.

This patch calculates the "weight" of a node as

	(nr_inactive_file + nr_active_file/10) * (200 - swappiness)
	+ (nr_inactive_anon) * (swappiness)
	(see vmscan.c::get_scan_count() for the meaning of swappiness)

and selects some nodes in a fair way, proportional to the weight.
The selected nodes are cached in mem->victim_nodes, and victim_nodes
is visited in round-robin.
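
(Illustrative numbers only: with swappiness=60, a node holding 1000
inactive-file, 500 active-file and 2000 inactive-anon pages gets weight
(1000 + 500/10) * (200 - 60) + 2000 * 60 = 267000.)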

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memcontrol.c |  102 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 18 deletions(-)

Index: memcg_async/mm/memcontrol.c
===================================================================
--- memcg_async.orig/mm/memcontrol.c
+++ memcg_async/mm/memcontrol.c
@@ -48,6 +48,7 @@
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
+#include <linux/random.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -149,6 +150,7 @@ struct mem_cgroup_per_zone {
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
 
 struct mem_cgroup_per_node {
+	u64 scan_weight;
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
@@ -257,6 +259,7 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t	scan_nodes;
+	nodemask_t	victim_nodes;
 	unsigned long   next_scan_node_update;
 #endif
 	/*
@@ -1732,9 +1735,21 @@ u64 mem_cgroup_get_limit(struct mem_cgro
  * nodes based on the zonelist. So update the list loosely once per 10 secs.
  *
  */
+
+/*
+ * This is for selecting a victim node with lottery proportional share
+ * scheduling. This LOTTERY value can be arbitrary but must be higher
+ * than max number of nodes.
+ */
+#define NODE_SCAN_LOTTERY	(1 << 15)
+#define NODE_SCAN_LOTTERY_MASK	(NODE_SCAN_LOTTERY - 1)
+
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
 {
 	int nid;
+	u64 total_weight;
+	unsigned long swappiness;
+	int nr_selection;
 
 	if (!force && time_after(mem->next_scan_node_update, jiffies))
 		return;
@@ -1742,18 +1757,77 @@ static void mem_cgroup_may_update_nodema
 	mem->next_scan_node_update = jiffies + 10*HZ;
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+	nodes_clear(mem->victim_nodes);
+
+	swappiness = mem_cgroup_swappiness(mem);
+	total_weight = 0;
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+		u64 val, file_weight, anon_weight, pages;
+		int lru;
 
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
+		lru = LRU_INACTIVE_FILE;
+		val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+		file_weight = val;
+		pages = val;
 
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		lru = LRU_ACTIVE_FILE;
+		val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+		/*
+		 * This is a magic calculation. We add 10% of active file
+		 * to the weight. This should be tweaked.
+		 */
+		if (val)
+			file_weight += val/10;
+		pages += val;
+
+		if (total_swap_pages) {
+			lru = LRU_INACTIVE_ANON;
+			val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+			anon_weight = val;
+			pages += val;
+			lru = LRU_ACTIVE_ANON;
+			val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+			/*
+			 * Magic again. We don't want active_anon taken fully into
+			 * account, but cannot ignore it either....add +1.
+			 */
+			if (val)
+				anon_weight += 1;
+			pages += val;
+		} else
+			anon_weight = 0;
+		mem->info.nodeinfo[nid]->scan_weight =
+			file_weight * (200 - swappiness) +
+			anon_weight * swappiness;
+		if (!pages)
+			node_clear(nid, mem->scan_nodes);
+
+		total_weight += mem->info.nodeinfo[nid]->scan_weight;
+	}
+	/* NORMALIZE weight information.*/
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		mem->info.nodeinfo[nid]->scan_weight =
+			mem->info.nodeinfo[nid]->scan_weight
+				*  NODE_SCAN_LOTTERY/ total_weight;
+	}
+	/*
+	 * Because checking the lottery at every scan is heavy, we cache
+	 * some results. These victims will be used for the next 10 secs.
+	 * Even if scan_nodes is empty, the victim_nodes includes node 0
+	 * at least.
+	 */
+	nr_selection = int_sqrt(nodes_weight(mem->scan_nodes)) + 1;
+
+	while (nr_selection-- > 0) {
+		int lottery = random32() & NODE_SCAN_LOTTERY_MASK;
+		for_each_node_mask(nid, mem->scan_nodes) {
+			lottery -= mem->info.nodeinfo[nid]->scan_weight;
+			if (lottery <= 0)
+				break;
+		}
+		node_set(nid, mem->victim_nodes);
 	}
 }
 
@@ -1776,17 +1850,9 @@ int mem_cgroup_select_victim_node(struct
 	mem_cgroup_may_update_nodemask(mem, false);
 	node = mem->last_scanned_node;
 
-	node = next_node(node, mem->scan_nodes);
+	node = next_node(node, mem->victim_nodes);
 	if (node == MAX_NUMNODES)
-		node = first_node(mem->scan_nodes);
-	/*
-	 * We call this when we hit limit, not when pages are added to LRU.
-	 * No LRU may hold pages because all pages are UNEVICTABLE or
-	 * memcg is too small and all pages are not on LRU. In that case,
-	 * we use curret node.
-	 */
-	if (unlikely(node == MAX_NUMNODES))
-		node = numa_node_id();
+		node = first_node(mem->victim_nodes);
 
 	mem->last_scanned_node = node;
 	return node;





^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2011-05-27  2:46 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-04-27  7:51 [PATCHv2] memcg: reclaim memory from node in round-robin KAMEZAWA Hiroyuki
2011-04-27 17:33 ` Ying Han
2011-04-27 23:57   ` KAMEZAWA Hiroyuki
2011-04-28  2:49     ` Ying Han
2011-04-28  0:35   ` [PATCHv3] " KAMEZAWA Hiroyuki
2011-04-28  1:37     ` Daisuke Nishimura
2011-04-28  1:49       ` [PATCHv4] " KAMEZAWA Hiroyuki
2011-04-28  2:04         ` Daisuke Nishimura
2011-05-04 21:26         ` Andrew Morton
2011-05-06  6:13           ` KAMEZAWA Hiroyuki
2011-05-26 19:52             ` Andrew Morton
2011-05-26 23:54               ` KAMEZAWA Hiroyuki
2011-05-27  2:39                 ` KAMEZAWA Hiroyuki
2011-05-09  2:20 ` [PATCHv2] " KOSAKI Motohiro
2011-05-09  2:30   ` KAMEZAWA Hiroyuki

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).