* [PATCH v6 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c
2021-05-06 15:00 [PATCH v6 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
@ 2021-05-06 15:00 ` Waiman Long
2021-05-06 15:00 ` [PATCH v6 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp Waiman Long
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Waiman Long @ 2021-05-06 15:00 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
The mod_objcg_state() function is moved from mm/slab.h to mm/memcontrol.c
so that further optimization can be done to it in later patches without
exposing unnecessary details to other mm components.
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
---
mm/memcontrol.c | 13 +++++++++++++
mm/slab.h | 16 ++--------------
2 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c100265dc393..859f872b482e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -906,6 +906,19 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ rcu_read_unlock();
+}
+
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
diff --git a/mm/slab.h b/mm/slab.h
index 18c1927cd196..dcf964737d7e 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -239,6 +239,8 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page);
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
@@ -283,20 +285,6 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
return true;
}
-static inline void mod_objcg_state(struct obj_cgroup *objcg,
- struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
-{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
- rcu_read_unlock();
-}
-
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
struct obj_cgroup *objcg,
gfp_t flags, size_t size,
--
2.18.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v6 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp
2021-05-06 15:00 [PATCH v6 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
2021-05-06 15:00 ` [PATCH v6 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c Waiman Long
@ 2021-05-06 15:00 ` Waiman Long
2021-05-06 15:00 ` [PATCH v6 3/4] mm/memcg: Improve refill_obj_stock() performance Waiman Long
2021-05-06 15:00 ` [PATCH v6 4/4] mm/memcg: Optimize user context object stock access Waiman Long
3 siblings, 0 replies; 5+ messages in thread
From: Waiman Long @ 2021-05-06 15:00 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
Before the new slab memory controller with per object byte charging,
charging and vmstat data update happen only when new slab pages are
allocated or freed. Now they are done with every kmem_cache_alloc()
and kmem_cache_free(). This causes additional overhead for workloads
that generate a lot of alloc and free calls.
The memcg_stock_pcp is used to cache byte charge for a specific
obj_cgroup to reduce that overhead. To further reduce it, this patch
makes the vmstat data cached in the memcg_stock_pcp structure as well
until it accumulates a page size worth of update or when other cached
data change. Caching the vmstat data in the per-cpu stock replaces two
writes to non-hot cachelines for memcg specific as well as memcg-lruvecs
specific vmstat data with a write to a hot local stock cacheline.
On a 2-socket Cascade Lake server with instrumentation enabled and this
patch applied, it was found that only about 20% (634400 out of 3243830)
of the calls to mod_objcg_state() led to an actual call to
mod_objcg_mlstate() after initial boot. When doing parallel kernel
build, the figure was about 17% (24329265 out of 142512465). So caching
the vmstat data reduces the number of calls to mod_objcg_mlstate()
by more than 80%.
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
mm/memcontrol.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 87 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 859f872b482e..fbedfc55a248 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -906,8 +906,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
@@ -915,7 +916,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
+ __mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
}
@@ -2183,7 +2184,10 @@ struct memcg_stock_pcp {
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
+ struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
+ int nr_slab_reclaimable_b;
+ int nr_slab_unreclaimable_b;
#endif
struct work_struct work;
@@ -3132,6 +3136,67 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
obj_cgroup_put(objcg);
}
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ int *bytes;
+
+ local_irq_save(flags);
+ stock = this_cpu_ptr(&memcg_stock);
+
+ /*
+ * Save vmstat data in stock and skip vmstat array update unless
+ * accumulating over a page of vmstat data or when pgdat or idx
+ * changes.
+ */
+ if (stock->cached_objcg != objcg) {
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ stock->cached_objcg = objcg;
+ stock->cached_pgdat = pgdat;
+ } else if (stock->cached_pgdat != pgdat) {
+ /* Flush the existing cached vmstat data */
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = pgdat;
+ }
+
+ bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
+ : &stock->nr_slab_unreclaimable_b;
+ /*
+ * Even for large object >= PAGE_SIZE, the vmstat data will still be
+ * cached locally at least once before pushing it out.
+ */
+ if (!*bytes) {
+ *bytes = nr;
+ nr = 0;
+ } else {
+ *bytes += nr;
+ if (abs(*bytes) > PAGE_SIZE) {
+ nr = *bytes;
+ *bytes = 0;
+ } else {
+ nr = 0;
+ }
+ }
+ if (nr)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
+
+ local_irq_restore(flags);
+}
+
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
struct memcg_stock_pcp *stock;
@@ -3179,6 +3244,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
stock->nr_bytes = 0;
}
+ /*
+ * Flush the vmstat data in current stock
+ */
+ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = NULL;
+ }
+
obj_cgroup_put(old);
stock->cached_objcg = NULL;
}
--
2.18.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v6 3/4] mm/memcg: Improve refill_obj_stock() performance
2021-05-06 15:00 [PATCH v6 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
2021-05-06 15:00 ` [PATCH v6 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c Waiman Long
2021-05-06 15:00 ` [PATCH v6 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp Waiman Long
@ 2021-05-06 15:00 ` Waiman Long
2021-05-06 15:00 ` [PATCH v6 4/4] mm/memcg: Optimize user context object stock access Waiman Long
3 siblings, 0 replies; 5+ messages in thread
From: Waiman Long @ 2021-05-06 15:00 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
There are two issues with the current refill_obj_stock() code. First of
all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
and do an obj_cgroup_put(). It is likely that the same obj_cgroup will
be used again, which leads to another call to drain_obj_stock() and
obj_cgroup_get() as well as atomically retrieving the available bytes
from obj_cgroup. That is costly. Instead, we should just uncharge the
excess pages, reduce the stock bytes and be done with it. The
drain_obj_stock() function should only be called when obj_cgroup changes.
Secondly, when charging an object of size not less than a page in
obj_cgroup_charge(), it is possible that the remaining bytes to be
refilled to the stock will overflow a page and cause refill_obj_stock()
to uncharge 1 page. To avoid the additional uncharge in this case, a new
allow_uncharge flag is added to refill_obj_stock() which will be set to
false when called from obj_cgroup_charge() so that an uncharge_pages()
call won't be issued right after a charge_pages() call unless the objcg
changes.
A multithreaded kmalloc+kfree microbenchmark was run on a 2-socket
48-core 96-thread x86-64 system with 96 testing threads. Before this
patch, the total number of kilo kmalloc+kfree operations done for a 4k
large object by all the testing threads per second was 4,304 kops/s
(cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the
numbers were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This
represents a performance improvement of 1.10X (cgroup v1) and 49.3X
(cgroup v2).
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
mm/memcontrol.c | 48 +++++++++++++++++++++++++++++++++++-------------
1 file changed, 35 insertions(+), 13 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fbedfc55a248..513f3d56e89a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3281,10 +3281,12 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
return false;
}
-static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ bool allow_uncharge)
{
struct memcg_stock_pcp *stock;
unsigned long flags;
+ unsigned int nr_pages = 0;
local_irq_save(flags);
@@ -3293,14 +3295,21 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->cached_objcg = objcg;
- stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
- if (stock->nr_bytes > PAGE_SIZE)
- drain_obj_stock(stock);
+ if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
+ nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ stock->nr_bytes &= (PAGE_SIZE - 1);
+ }
local_irq_restore(flags);
+
+ if (nr_pages)
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
}
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
@@ -3312,14 +3321,27 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
return 0;
/*
- * In theory, memcg->nr_charged_bytes can have enough
+ * In theory, objcg->nr_charged_bytes can have enough
* pre-charged bytes to satisfy the allocation. However,
- * flushing memcg->nr_charged_bytes requires two atomic
- * operations, and memcg->nr_charged_bytes can't be big,
- * so it's better to ignore it and try grab some new pages.
- * memcg->nr_charged_bytes will be flushed in
- * refill_obj_stock(), called from this function or
- * independently later.
+ * flushing objcg->nr_charged_bytes requires two atomic
+ * operations, and objcg->nr_charged_bytes can't be big.
+ * The shared objcg->nr_charged_bytes can also become a
+ * performance bottleneck if all tasks of the same memcg are
+ * trying to update it. So it's better to ignore it and try
+ * grab some new pages. The stock's nr_bytes will be flushed to
+ * objcg->nr_charged_bytes later on when objcg changes.
+ *
+ * The stock's nr_bytes may contain enough pre-charged bytes
+ * to allow one less page from being charged, but we can't rely
+ * on the pre-charged bytes not being changed outside of
+ * consume_obj_stock() or refill_obj_stock(). So ignore those
+ * pre-charged bytes as well when charging pages. To avoid a
+ * page uncharge right after a page charge, we set the
+ * allow_uncharge flag to false when calling refill_obj_stock()
+ * to temporarily allow the pre-charged bytes to exceed the page
+ * size limit. The maximum reachable value of the pre-charged
+ * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
+ * race.
*/
nr_pages = size >> PAGE_SHIFT;
nr_bytes = size & (PAGE_SIZE - 1);
@@ -3329,14 +3351,14 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
return ret;
}
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size);
+ refill_obj_stock(objcg, size, true);
}
#endif /* CONFIG_MEMCG_KMEM */
--
2.18.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v6 4/4] mm/memcg: Optimize user context object stock access
2021-05-06 15:00 [PATCH v6 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
` (2 preceding siblings ...)
2021-05-06 15:00 ` [PATCH v6 3/4] mm/memcg: Improve refill_obj_stock() performance Waiman Long
@ 2021-05-06 15:00 ` Waiman Long
3 siblings, 0 replies; 5+ messages in thread
From: Waiman Long @ 2021-05-06 15:00 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
Most kmem_cache_alloc() calls are from user context. With instrumentation
enabled, the measured number of kmem_cache_alloc() calls from non-task
context was about 0.01% of the total.
The irq disable/enable sequence used in this case to access content
from object stock is slow. To optimize for user context access, there
are now two sets of object stocks (in the new obj_stock structure)
for task context and interrupt context access respectively.
The task context object stock can be accessed after disabling preemption
which is cheap in non-preempt kernel. The interrupt context object stock
can only be accessed after disabling interrupt. User context code can
access interrupt object stock, but not vice versa.
The downside of this change is that more data is stored in the local
object stocks without being reflected in the charge counter and the
vmstat arrays. However, this is a small price to pay for better performance.
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
mm/memcontrol.c | 94 +++++++++++++++++++++++++++++++++++--------------
1 file changed, 68 insertions(+), 26 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 513f3d56e89a..d8e90aa6e1ad 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -906,6 +906,10 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
+/*
+ * mod_objcg_mlstate() may be called with irq enabled, so
+ * mod_memcg_lruvec_state() should be used.
+ */
static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
@@ -916,7 +920,7 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
}
@@ -2178,17 +2182,23 @@ void unlock_page_memcg(struct page *page)
}
EXPORT_SYMBOL(unlock_page_memcg);
-struct memcg_stock_pcp {
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
-
+struct obj_stock {
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
+#else
+ int dummy[0];
#endif
+};
+
+struct memcg_stock_pcp {
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ unsigned int nr_pages;
+ struct obj_stock task_obj;
+ struct obj_stock irq_obj;
struct work_struct work;
unsigned long flags;
@@ -2198,12 +2208,12 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static void drain_obj_stock(struct obj_stock *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
#else
-static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+static inline void drain_obj_stock(struct obj_stock *stock)
{
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -2213,6 +2223,40 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
}
#endif
+/*
+ * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
+ * sequence used in this case to access content from object stock is slow.
+ * To optimize for user context access, there are now two object stocks for
+ * task context and interrupt context access respectively.
+ *
+ * The task context object stock can be accessed by disabling preemption only
+ * which is cheap in non-preempt kernel. The interrupt context object stock
+ * can only be accessed after disabling interrupt. User context code can
+ * access interrupt object stock, but not vice versa.
+ */
+static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
+{
+ struct memcg_stock_pcp *stock;
+
+ if (likely(in_task())) {
+ preempt_disable();
+ stock = this_cpu_ptr(&memcg_stock);
+ return &stock->task_obj;
+ } else {
+ local_irq_save(*pflags);
+ stock = this_cpu_ptr(&memcg_stock);
+ return &stock->irq_obj;
+ }
+}
+
+static inline void put_obj_stock(unsigned long flags)
+{
+ if (likely(in_task()))
+ preempt_enable();
+ else
+ local_irq_restore(flags);
+}
+
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
@@ -2279,7 +2323,9 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
- drain_obj_stock(stock);
+ drain_obj_stock(&stock->irq_obj);
+ if (in_task())
+ drain_obj_stock(&stock->task_obj);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -3139,13 +3185,10 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
int *bytes;
- local_irq_save(flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat or idx
@@ -3194,29 +3237,26 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
- local_irq_restore(flags);
+ put_obj_stock(flags);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
bool ret = false;
- local_irq_save(flags);
-
- stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
- local_irq_restore(flags);
+ put_obj_stock(flags);
return ret;
}
-static void drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
@@ -3272,8 +3312,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (in_task() && stock->task_obj.cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+ if (stock->irq_obj.cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3284,13 +3329,10 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool allow_uncharge)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
unsigned int nr_pages = 0;
- local_irq_save(flags);
-
- stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
drain_obj_stock(stock);
obj_cgroup_get(objcg);
@@ -3306,7 +3348,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_irq_restore(flags);
+ put_obj_stock(flags);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
--
2.18.1
^ permalink raw reply related [flat|nested] 5+ messages in thread