linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Roman Gushchin <guro@fb.com>
To: <linux-mm@kvack.org>
Cc: Michal Hocko <mhocko@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	<linux-kernel@vger.kernel.org>, <kernel-team@fb.com>,
	Shakeel Butt <shakeelb@google.com>,
	Vladimir Davydov <vdavydov.dev@gmail.com>,
	Waiman Long <longman@redhat.com>, Roman Gushchin <guro@fb.com>
Subject: [PATCH RFC 01/14] mm: memcg: subpage charging API
Date: Thu, 5 Sep 2019 14:45:45 -0700	[thread overview]
Message-ID: <20190905214553.1643060-2-guro@fb.com> (raw)
In-Reply-To: <20190905214553.1643060-1-guro@fb.com>

Introduce an API to charge subpage objects to the memory cgroup.
The API will be used by the new slab memory controller. Later it
can also be used to implement percpu memory accounting.
In both cases, a single page can be shared between multiple cgroups
(and in percpu case a single allocation is split over multiple pages),
so it's not possible to use page-based accounting.

The implementation is based on percpu stocks. Memory cgroups are still
charged in pages, and the residue is stored in perpcu stock, or on the
memcg itself, when it's necessary to flush the stock.

Please, note, that unlike the generic page charging API, a subpage
object is not holding a reference to the memory cgroup. It's because
a more complicated indirect scheme is required in order to implement
cheap reparenting. The percpu stock holds a single reference to the
cached cgroup though.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/memcontrol.h |   4 ++
 mm/memcontrol.c            | 129 +++++++++++++++++++++++++++++++++----
 2 files changed, 119 insertions(+), 14 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c762e8ca6a6..120d39066148 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -214,6 +214,7 @@ struct mem_cgroup {
 	/* Accounted resources */
 	struct page_counter memory;
 	struct page_counter swap;
+	atomic_t nr_stocked_bytes;
 
 	/* Legacy consumer-oriented counters */
 	struct page_counter memsw;
@@ -1370,6 +1371,9 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 			      struct mem_cgroup *memcg);
 void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
 				 unsigned int nr_pages);
+int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
+				gfp_t gfp);
+void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size);
 
 extern struct static_key_false memcg_kmem_enabled_key;
 extern struct workqueue_struct *memcg_kmem_cache_wq;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c4c08b45e44..effefcec47b3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2149,6 +2149,10 @@ EXPORT_SYMBOL(unlock_page_memcg);
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
+
+	struct mem_cgroup *subpage_cached;
+	unsigned int nr_bytes;
+
 	struct work_struct work;
 	unsigned long flags;
 #define FLUSHING_CACHED_CHARGE	0
@@ -2189,6 +2193,29 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	return ret;
 }
 
+static bool consume_subpage_stock(struct mem_cgroup *memcg,
+				  unsigned int nr_bytes)
+{
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+	bool ret = false;
+
+	if (nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
+		return ret;
+
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (memcg == stock->subpage_cached && stock->nr_bytes >= nr_bytes) {
+		stock->nr_bytes -= nr_bytes;
+		ret = true;
+	}
+
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 /*
  * Returns stocks cached in percpu and reset cached information.
  */
@@ -2206,6 +2233,27 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 	stock->cached = NULL;
 }
 
+static void drain_subpage_stock(struct memcg_stock_pcp *stock)
+{
+	struct mem_cgroup *old = stock->subpage_cached;
+
+	if (stock->nr_bytes) {
+		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+		page_counter_uncharge(&old->memory, nr_pages);
+		if (do_memsw_account())
+			page_counter_uncharge(&old->memsw, nr_pages);
+
+		atomic_add(nr_bytes, &old->nr_stocked_bytes);
+		stock->nr_bytes = 0;
+	}
+	if (stock->subpage_cached) {
+		css_put(&old->css);
+		stock->subpage_cached = NULL;
+	}
+}
+
 static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock;
@@ -2218,8 +2266,11 @@ static void drain_local_stock(struct work_struct *dummy)
 	local_irq_save(flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
-	drain_stock(stock);
-	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+	if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+		drain_stock(stock);
+		drain_subpage_stock(stock);
+		clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+	}
 
 	local_irq_restore(flags);
 }
@@ -2248,6 +2299,29 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	local_irq_restore(flags);
 }
 
+static void refill_subpage_stock(struct mem_cgroup *memcg,
+				 unsigned int nr_bytes)
+{
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (stock->subpage_cached != memcg) { /* reset if necessary */
+		drain_subpage_stock(stock);
+		css_get(&memcg->css);
+		stock->subpage_cached = memcg;
+		stock->nr_bytes = atomic_xchg(&memcg->nr_stocked_bytes, 0);
+	}
+	stock->nr_bytes += nr_bytes;
+
+	if (stock->nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
+		drain_subpage_stock(stock);
+
+	local_irq_restore(flags);
+}
+
 /*
  * Drains all per-CPU charge caches for given root_memcg resp. subtree
  * of the hierarchy under it.
@@ -2276,6 +2350,9 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 		if (memcg && stock->nr_pages &&
 		    mem_cgroup_is_descendant(memcg, root_memcg))
 			flush = true;
+		memcg = stock->subpage_cached;
+		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+			flush = true;
 		rcu_read_unlock();
 
 		if (flush &&
@@ -2500,8 +2577,9 @@ void mem_cgroup_handle_over_high(void)
 }
 
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-		      unsigned int nr_pages)
+		      unsigned int amount, bool subpage)
 {
+	unsigned int nr_pages = subpage ? ((amount >> PAGE_SHIFT) + 1) : amount;
 	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
@@ -2514,7 +2592,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_is_root(memcg))
 		return 0;
 retry:
-	if (consume_stock(memcg, nr_pages))
+	if (subpage && consume_subpage_stock(memcg, amount))
+		return 0;
+	else if (!subpage && consume_stock(memcg, nr_pages))
 		return 0;
 
 	if (!do_memsw_account() ||
@@ -2632,14 +2712,22 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	page_counter_charge(&memcg->memory, nr_pages);
 	if (do_memsw_account())
 		page_counter_charge(&memcg->memsw, nr_pages);
-	css_get_many(&memcg->css, nr_pages);
+
+	if (subpage)
+		refill_subpage_stock(memcg, (nr_pages << PAGE_SHIFT) - amount);
+	else
+		css_get_many(&memcg->css, nr_pages);
 
 	return 0;
 
 done_restock:
-	css_get_many(&memcg->css, batch);
-	if (batch > nr_pages)
-		refill_stock(memcg, batch - nr_pages);
+	if (subpage && (batch << PAGE_SHIFT) > amount) {
+		refill_subpage_stock(memcg, (batch << PAGE_SHIFT) - amount);
+	} else if (!subpage) {
+		css_get_many(&memcg->css, batch);
+		if (batch > nr_pages)
+			refill_stock(memcg, batch - nr_pages);
+	}
 
 	/*
 	 * If the hierarchy is above the normal consumption range, schedule
@@ -2942,7 +3030,7 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 	struct page_counter *counter;
 	int ret;
 
-	ret = try_charge(memcg, gfp, nr_pages);
+	ret = try_charge(memcg, gfp, nr_pages, false);
 	if (ret)
 		return ret;
 
@@ -3020,6 +3108,18 @@ void __memcg_kmem_uncharge(struct page *page, int order)
 
 	css_put_many(&memcg->css, nr_pages);
 }
+
+int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
+				gfp_t gfp)
+{
+	return try_charge(memcg, gfp, size, true);
+}
+
+void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size)
+{
+	refill_subpage_stock(memcg, size);
+}
+
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -5267,7 +5367,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	int ret;
 
 	/* Try a single bulk charge without reclaim first, kswapd may wake */
-	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
+	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count,
+			 false);
 	if (!ret) {
 		mc.precharge += count;
 		return ret;
@@ -5275,7 +5376,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
 
 	/* Try charges one by one with reclaim, but do not retry */
 	while (count--) {
-		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
+		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1, false);
 		if (ret)
 			return ret;
 		mc.precharge++;
@@ -6487,7 +6588,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 	if (!memcg)
 		memcg = get_mem_cgroup_from_mm(mm);
 
-	ret = try_charge(memcg, gfp_mask, nr_pages);
+	ret = try_charge(memcg, gfp_mask, nr_pages, false);
 
 	css_put(&memcg->css);
 out:
@@ -6866,10 +6967,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 
 	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 
-	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+	if (try_charge(memcg, gfp_mask, nr_pages, false) == 0)
 		return true;
 
-	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
+	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages, false);
 	return false;
 }
 
-- 
2.21.0


  reply	other threads:[~2019-09-05 21:46 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-05 21:45 [PATCH RFC 00/14] The new slab memory controller Roman Gushchin
2019-09-05 21:45 ` Roman Gushchin [this message]
2019-09-16 12:56   ` [PATCH RFC 01/14] mm: memcg: subpage charging API Johannes Weiner
2019-09-17  2:27     ` Roman Gushchin
2019-09-17  8:50       ` Johannes Weiner
2019-09-17 18:33         ` Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 02/14] mm: memcg: introduce mem_cgroup_ptr Roman Gushchin
2019-09-05 22:34   ` Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 03/14] mm: vmstat: use s32 for vm_node_stat_diff in struct per_cpu_nodestat Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 04/14] mm: vmstat: convert slab vmstat counter to bytes Roman Gushchin
2019-09-16 12:38   ` Johannes Weiner
2019-09-17  2:08     ` Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 05/14] mm: memcg/slab: allocate space for memcg ownership data for non-root slabs Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 06/14] mm: slub: implement SLUB version of obj_to_index() Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 07/14] mm: memcg/slab: save memcg ownership data for non-root slab objects Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 08/14] mm: memcg: move memcg_kmem_bypass() to memcontrol.h Roman Gushchin
2019-09-05 21:45 ` [PATCH RFC 09/14] mm: memcg: introduce __mod_lruvec_memcg_state() Roman Gushchin
2019-09-05 22:37 ` [PATCH RFC 02/14] mm: memcg: introduce mem_cgroup_ptr Roman Gushchin
2019-09-17 19:48 ` [PATCH RFC 00/14] The new slab memory controller Waiman Long
2019-09-17 21:24   ` Roman Gushchin
2019-09-19 13:39 ` Suleiman Souhlal
2019-09-19 16:22   ` Roman Gushchin
2019-09-19 21:10     ` Suleiman Souhlal
2019-09-19 21:40       ` Roman Gushchin
2019-10-01 15:12 ` Michal Koutný
2019-10-02  2:09   ` Roman Gushchin
2019-10-02 13:00     ` Suleiman Souhlal
2019-10-03 10:47       ` Michal Koutný
2019-10-03 15:52         ` Roman Gushchin
2019-12-09  9:17 ` [PATCH 00/16] " Bharata B Rao
2019-12-09 11:56   ` Bharata B Rao
2019-12-09 18:04     ` Roman Gushchin
2019-12-10  6:23       ` Bharata B Rao
2019-12-10 18:05         ` Roman Gushchin
2020-01-13  8:47           ` Bharata B Rao
2020-01-13 15:31             ` Roman Gushchin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190905214553.1643060-2-guro@fb.com \
    --to=guro@fb.com \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longman@redhat.com \
    --cc=mhocko@kernel.org \
    --cc=shakeelb@google.com \
    --cc=vdavydov.dev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).