All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ying Han <yinghan@google.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	Pavel Emelyanov <xemul@openvz.org>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
	Li Zefan <lizf@cn.fujitsu.com>, Mel Gorman <mel@csn.ul.ie>,
	Christoph Lameter <cl@linux.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Rik van Riel <riel@redhat.com>, Hugh Dickins <hughd@google.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Tejun Heo <tj@kernel.org>, Michal Hocko <mhocko@suse.cz>,
	Andrew Morton <akpm@linux-foundation.org>,
	Dave Hansen <dave@linux.vnet.ibm.com>
Cc: linux-mm@kvack.org
Subject: [PATCH V3 2/7] Add per memcg reclaim watermarks
Date: Wed, 13 Apr 2011 00:03:02 -0700	[thread overview]
Message-ID: <1302678187-24154-3-git-send-email-yinghan@google.com> (raw)
In-Reply-To: <1302678187-24154-1-git-send-email-yinghan@google.com>

There are two watermarks added per-memcg including "high_wmark" and "low_wmark".
The per-memcg kswapd is invoked when the memcg's memory usage(usage_in_bytes)
is higher than the low_wmark. Then the kswapd thread starts to reclaim pages
until the usage is lower than the high_wmark.

Each watermark is calculated based on the hard_limit(limit_in_bytes) for each
memcg. Each time the hard_limit is changed, the corresponding wmarks are
re-calculated. Since memory controller charges only user pages, there is
no need for a "min_wmark". The current calculation of wmarks is a function of
"wmark_ratio" which is set to 0 by default. When the value is 0, the watermarks
are equal to the hard_limit.

changelog v3..v2:
1. Add VM_BUG_ON() on couple of places.
2. Remove the spinlock on the min_free_kbytes since the consequence of reading
stale data.
3. Remove the "min_free_kbytes" API and replace it with wmark_ratio based on
hard_limit.

changelog v2..v1:
1. Remove the res_counter_charge on wmark due to performance concern.
2. Move the new APIs min_free_kbytes, reclaim_wmarks into seperate commit.
3. Calculate the min_free_kbytes automatically based on the limit_in_bytes.
4. make the wmark to be consistant with core VM which checks the free pages
instead of usage.
5. changed wmark to be boolean

Signed-off-by: Ying Han <yinghan@google.com>
---
 include/linux/memcontrol.h  |    1 +
 include/linux/res_counter.h |   80 +++++++++++++++++++++++++++++++++++++++++++
 kernel/res_counter.c        |    6 +++
 mm/memcontrol.c             |   52 ++++++++++++++++++++++++++++
 4 files changed, 139 insertions(+), 0 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5a5ce70..3ece36d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -82,6 +82,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+extern int mem_cgroup_watermark_ok(struct mem_cgroup *mem, int charge_flags);
 
 static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index c9d625c..fa4181b 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -39,6 +39,16 @@ struct res_counter {
 	 */
 	unsigned long long soft_limit;
 	/*
+	 * the limit that reclaim triggers. TODO: res_counter in mem
+	 * or wmark_limit.
+	 */
+	unsigned long long low_wmark_limit;
+	/*
+	 * the limit that reclaim stops. TODO: res_counter in mem or
+	 * wmark_limit.
+	 */
+	unsigned long long high_wmark_limit;
+	/*
 	 * the number of unsuccessful attempts to consume the resource
 	 */
 	unsigned long long failcnt;
@@ -55,6 +65,9 @@ struct res_counter {
 
 #define RESOURCE_MAX (unsigned long long)LLONG_MAX
 
+#define CHARGE_WMARK_LOW	0x01
+#define CHARGE_WMARK_HIGH	0x02
+
 /**
  * Helpers to interact with userspace
  * res_counter_read_u64() - returns the value of the specified member.
@@ -92,6 +105,8 @@ enum {
 	RES_LIMIT,
 	RES_FAILCNT,
 	RES_SOFT_LIMIT,
+	RES_LOW_WMARK_LIMIT,
+	RES_HIGH_WMARK_LIMIT
 };
 
 /*
@@ -147,6 +162,24 @@ static inline unsigned long long res_counter_margin(struct res_counter *cnt)
 	return margin;
 }
 
+static inline bool
+res_counter_high_wmark_limit_check_locked(struct res_counter *cnt)
+{
+	if (cnt->usage < cnt->high_wmark_limit)
+		return true;
+
+	return false;
+}
+
+static inline bool
+res_counter_low_wmark_limit_check_locked(struct res_counter *cnt)
+{
+	if (cnt->usage < cnt->low_wmark_limit)
+		return true;
+
+	return false;
+}
+
 /**
  * Get the difference between the usage and the soft limit
  * @cnt: The counter
@@ -169,6 +202,30 @@ res_counter_soft_limit_excess(struct res_counter *cnt)
 	return excess;
 }
 
+static inline bool
+res_counter_check_under_low_wmark_limit(struct res_counter *cnt)
+{
+	bool ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	ret = res_counter_low_wmark_limit_check_locked(cnt);
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return ret;
+}
+
+static inline bool
+res_counter_check_under_high_wmark_limit(struct res_counter *cnt)
+{
+	bool ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	ret = res_counter_high_wmark_limit_check_locked(cnt);
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return ret;
+}
+
 static inline void res_counter_reset_max(struct res_counter *cnt)
 {
 	unsigned long flags;
@@ -214,4 +271,27 @@ res_counter_set_soft_limit(struct res_counter *cnt,
 	return 0;
 }
 
+static inline int
+res_counter_set_high_wmark_limit(struct res_counter *cnt,
+				unsigned long long wmark_limit)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	cnt->high_wmark_limit = wmark_limit;
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return 0;
+}
+
+static inline int
+res_counter_set_low_wmark_limit(struct res_counter *cnt,
+				unsigned long long wmark_limit)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	cnt->low_wmark_limit = wmark_limit;
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return 0;
+}
 #endif
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683ef..206a724 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 	spin_lock_init(&counter->lock);
 	counter->limit = RESOURCE_MAX;
 	counter->soft_limit = RESOURCE_MAX;
+	counter->low_wmark_limit = RESOURCE_MAX;
+	counter->high_wmark_limit = RESOURCE_MAX;
 	counter->parent = parent;
 }
 
@@ -103,6 +105,10 @@ res_counter_member(struct res_counter *counter, int member)
 		return &counter->failcnt;
 	case RES_SOFT_LIMIT:
 		return &counter->soft_limit;
+	case RES_LOW_WMARK_LIMIT:
+		return &counter->low_wmark_limit;
+	case RES_HIGH_WMARK_LIMIT:
+		return &counter->high_wmark_limit;
 	};
 
 	BUG();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4407dd0..664cdc5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -272,6 +272,8 @@ struct mem_cgroup {
 	 */
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
+
+	int wmark_ratio;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -353,6 +355,7 @@ static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 static void drain_all_stock_async(void);
+static unsigned long get_wmark_ratio(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -813,6 +816,27 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 	return (mem == root_mem_cgroup);
 }
 
+static void setup_per_memcg_wmarks(struct mem_cgroup *mem)
+{
+	u64 limit;
+	unsigned long wmark_ratio;
+
+	wmark_ratio = get_wmark_ratio(mem);
+	limit = mem_cgroup_get_limit(mem);
+	if (wmark_ratio == 0) {
+		res_counter_set_low_wmark_limit(&mem->res, limit);
+		res_counter_set_high_wmark_limit(&mem->res, limit);
+	} else {
+		unsigned long low_wmark, high_wmark;
+		unsigned long long tmp = (wmark_ratio * limit) / 100;
+
+		low_wmark = tmp;
+		high_wmark = tmp - (tmp >> 8);
+		res_counter_set_low_wmark_limit(&mem->res, low_wmark);
+		res_counter_set_high_wmark_limit(&mem->res, high_wmark);
+	}
+}
+
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -1195,6 +1219,16 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return memcg->swappiness;
 }
 
+static unsigned long get_wmark_ratio(struct mem_cgroup *memcg)
+{
+	struct cgroup *cgrp = memcg->css.cgroup;
+
+	VM_BUG_ON(!cgrp);
+	VM_BUG_ON(!cgrp->parent);
+
+	return memcg->wmark_ratio;
+}
+
 static void mem_cgroup_start_move(struct mem_cgroup *mem)
 {
 	int cpu;
@@ -3205,6 +3239,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 			else
 				memcg->memsw_is_minimum = false;
 		}
+		setup_per_memcg_wmarks(memcg);
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
@@ -3264,6 +3299,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 			else
 				memcg->memsw_is_minimum = false;
 		}
+		setup_per_memcg_wmarks(memcg);
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
@@ -4521,6 +4557,22 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
+				int charge_flags)
+{
+	long ret = 0;
+	int flags = CHARGE_WMARK_LOW | CHARGE_WMARK_HIGH;
+
+	VM_BUG_ON((charge_flags & flags) == flags);
+
+	if (charge_flags & CHARGE_WMARK_LOW)
+		ret = res_counter_check_under_low_wmark_limit(&mem->res);
+	if (charge_flags & CHARGE_WMARK_HIGH)
+		ret = res_counter_check_under_high_wmark_limit(&mem->res);
+
+	return ret;
+}
+
 static int mem_cgroup_soft_limit_tree_init(void)
 {
 	struct mem_cgroup_tree_per_node *rtpn;
-- 
1.7.3.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2011-04-13  7:04 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-04-13  7:03 [PATCH V3 0/7] memcg: per cgroup background reclaim Ying Han
2011-04-13  7:03 ` [PATCH V3 1/7] Add kswapd descriptor Ying Han
2011-04-13  7:03 ` Ying Han [this message]
2011-04-13  8:25   ` [PATCH V3 2/7] Add per memcg reclaim watermarks KAMEZAWA Hiroyuki
2011-04-13 18:40     ` Ying Han
2011-04-14  0:27       ` KAMEZAWA Hiroyuki
2011-04-14  8:24   ` Zhu Yanhai
2011-04-14 17:43     ` Ying Han
2011-04-13  7:03 ` [PATCH V3 3/7] New APIs to adjust per-memcg wmarks Ying Han
2011-04-13  8:30   ` KAMEZAWA Hiroyuki
2011-04-13 18:46     ` Ying Han
2011-04-13  7:03 ` [PATCH V3 4/7] Infrastructure to support per-memcg reclaim Ying Han
2011-04-14  3:57   ` Zhu Yanhai
2011-04-14  6:32     ` Ying Han
2011-04-13  7:03 ` [PATCH V3 5/7] Per-memcg background reclaim Ying Han
2011-04-13  8:58   ` KAMEZAWA Hiroyuki
2011-04-13 22:45     ` Ying Han
2011-04-13  7:03 ` [PATCH V3 6/7] Enable per-memcg " Ying Han
2011-04-13  9:05   ` KAMEZAWA Hiroyuki
2011-04-13 21:20     ` Ying Han
2011-04-14  0:30       ` KAMEZAWA Hiroyuki
2011-04-13  7:03 ` [PATCH V3 7/7] Add some per-memcg stats Ying Han
2011-04-13  7:47 ` [PATCH V3 0/7] memcg: per cgroup background reclaim KAMEZAWA Hiroyuki
2011-04-13 17:53   ` Ying Han
2011-04-14  0:14     ` KAMEZAWA Hiroyuki
2011-04-14 17:38       ` Ying Han
2011-04-14 21:59         ` Ying Han

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1302678187-24154-3-git-send-email-yinghan@google.com \
    --to=yinghan@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=cl@linux.com \
    --cc=dave@linux.vnet.ibm.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=linux-mm@kvack.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=mel@csn.ul.ie \
    --cc=mhocko@suse.cz \
    --cc=nishimura@mxp.nes.nec.co.jp \
    --cc=riel@redhat.com \
    --cc=tj@kernel.org \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.