* [PATCH-next v5 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c
2021-04-20 19:29 [PATCH-next v5 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
@ 2021-04-20 19:29 ` Waiman Long
2021-04-21 15:26 ` Shakeel Butt
2021-04-21 23:08 ` Roman Gushchin
2021-04-20 19:29 ` [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp Waiman Long
` (2 subsequent siblings)
3 siblings, 2 replies; 16+ messages in thread
From: Waiman Long @ 2021-04-20 19:29 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
The mod_objcg_state() function is moved from mm/slab.h to mm/memcontrol.c
so that further optimization can be done to it in later patches without
exposing unnecessary details to other mm components.
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/memcontrol.c | 13 +++++++++++++
mm/slab.h | 16 ++--------------
2 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64ada9e650a5..7cd7187a017c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -782,6 +782,19 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ rcu_read_unlock();
+}
+
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
diff --git a/mm/slab.h b/mm/slab.h
index a7268072f017..f1f26f22a94a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -242,6 +242,8 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page);
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
@@ -286,20 +288,6 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
return true;
}
-static inline void mod_objcg_state(struct obj_cgroup *objcg,
- struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
-{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
- rcu_read_unlock();
-}
-
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
struct obj_cgroup *objcg,
gfp_t flags, size_t size,
--
2.18.1
* Re: [PATCH-next v5 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c
2021-04-20 19:29 ` [PATCH-next v5 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c Waiman Long
@ 2021-04-21 15:26 ` Shakeel Butt
2021-04-21 23:08 ` Roman Gushchin
1 sibling, 0 replies; 16+ messages in thread
From: Shakeel Butt @ 2021-04-21 15:26 UTC (permalink / raw)
To: Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin, LKML, Cgroups,
Linux MM, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On Tue, Apr 20, 2021 at 12:30 PM Waiman Long <longman@redhat.com> wrote:
>
> The mod_objcg_state() function is moved from mm/slab.h to mm/memcontrol.c
> so that further optimization can be done to it in later patches without
> exposing unnecessary details to other mm components.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
* Re: [PATCH-next v5 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c
2021-04-20 19:29 ` [PATCH-next v5 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c Waiman Long
2021-04-21 15:26 ` Shakeel Butt
@ 2021-04-21 23:08 ` Roman Gushchin
1 sibling, 0 replies; 16+ messages in thread
From: Roman Gushchin @ 2021-04-21 23:08 UTC (permalink / raw)
To: Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On Tue, Apr 20, 2021 at 03:29:04PM -0400, Waiman Long wrote:
> The mod_objcg_state() function is moved from mm/slab.h to mm/memcontrol.c
> so that further optimization can be done to it in later patches without
> exposing unnecessary details to other mm components.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Roman Gushchin <guro@fb.com>
* [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp
2021-04-20 19:29 [PATCH-next v5 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
2021-04-20 19:29 ` [PATCH-next v5 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c Waiman Long
@ 2021-04-20 19:29 ` Waiman Long
2021-04-21 23:28 ` Roman Gushchin
2021-04-20 19:29 ` [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance Waiman Long
2021-04-20 19:29 ` [PATCH-next v5 4/4] mm/memcg: Optimize user context object stock access Waiman Long
3 siblings, 1 reply; 16+ messages in thread
From: Waiman Long @ 2021-04-20 19:29 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
Before the new slab memory controller with per-object byte charging,
charging and vmstat data updates happened only when new slab pages were
allocated or freed. Now they are done with every kmem_cache_alloc()
and kmem_cache_free(). This causes additional overhead for workloads
that generate a lot of alloc and free calls.
The memcg_stock_pcp is used to cache the byte charge for a specific
obj_cgroup to reduce that overhead. To further reduce it, this patch
caches the vmstat data in the memcg_stock_pcp structure as well, until
a page-size worth of updates accumulates or other cached data change.
Caching the vmstat data in the per-cpu stock replaces two writes to
non-hot cachelines, for memcg-specific as well as memcg-lruvec-specific
vmstat data, with a single write to a hot local stock cacheline.
On a 2-socket Cascade Lake server with instrumentation enabled and this
patch applied, it was found that only about 20% (634400 out of 3243830)
of the calls to mod_objcg_state() after initial boot led to an actual
call to __mod_objcg_state(). When doing a parallel kernel build, the
figure was about 17% (24329265 out of 142512465). So caching the vmstat
data reduces the number of calls to __mod_objcg_state() by more than 80%.
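As an illustration of the batching arithmetic (a stand-alone user-space
model of the accumulate-and-flush logic, not kernel code; PAGE_SIZE and
the flush counter are stand-ins):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

static int cached_bytes;	/* models stock->nr_slab_unreclaimable_b */
static int flushes;		/* models calls to __mod_objcg_state() */

static void mod_state(int nr)
{
	if (!cached_bytes) {
		cached_bytes = nr;	/* cache the first update */
		return;
	}
	cached_bytes += nr;
	if (abs(cached_bytes) > PAGE_SIZE) {
		flushes++;		/* flush accumulated bytes */
		cached_bytes = 0;
	}
}

int main(void)
{
	int i;

	/* 1000 alloc/free pairs of a 512-byte object on one cpu/pgdat */
	for (i = 0; i < 1000; i++) {
		mod_state(512);
		mod_state(-512);
	}
	printf("%d flushes instead of 2000 uncached updates\n", flushes);
	return 0;
}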
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 83 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7cd7187a017c..292b4783b1a7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
@@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
+ __mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
}
@@ -2059,7 +2060,10 @@ struct memcg_stock_pcp {
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
+ struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
+ int nr_slab_reclaimable_b;
+ int nr_slab_unreclaimable_b;
#endif
struct work_struct work;
@@ -3008,6 +3012,63 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
obj_cgroup_put(objcg);
}
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ int *bytes;
+
+ local_irq_save(flags);
+ stock = this_cpu_ptr(&memcg_stock);
+
+ /*
+ * Save vmstat data in stock and skip vmstat array update unless
+ * accumulating over a page of vmstat data or when pgdat or idx
+ * changes.
+ */
+ if (stock->cached_objcg != objcg) {
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ stock->cached_objcg = objcg;
+ stock->cached_pgdat = pgdat;
+ } else if (stock->cached_pgdat != pgdat) {
+ /* Flush the existing cached vmstat data */
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = pgdat;
+ }
+
+ bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
+ : &stock->nr_slab_unreclaimable_b;
+ if (!*bytes) {
+ *bytes = nr;
+ nr = 0;
+ } else {
+ *bytes += nr;
+ if (abs(*bytes) > PAGE_SIZE) {
+ nr = *bytes;
+ *bytes = 0;
+ } else {
+ nr = 0;
+ }
+ }
+ if (nr)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
+
+ local_irq_restore(flags);
+}
+
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
struct memcg_stock_pcp *stock;
@@ -3055,6 +3116,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
stock->nr_bytes = 0;
}
+ /*
+ * Flush the vmstat data in current stock
+ */
+ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = NULL;
+ }
+
obj_cgroup_put(old);
stock->cached_objcg = NULL;
}
--
2.18.1
* Re: [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp
2021-04-20 19:29 ` [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp Waiman Long
@ 2021-04-21 23:28 ` Roman Gushchin
2021-04-22 16:58 ` Waiman Long
0 siblings, 1 reply; 16+ messages in thread
From: Roman Gushchin @ 2021-04-21 23:28 UTC (permalink / raw)
To: Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote:
> Before the new slab memory controller with per-object byte charging,
> charging and vmstat data updates happened only when new slab pages were
> allocated or freed. Now they are done with every kmem_cache_alloc()
> and kmem_cache_free(). This causes additional overhead for workloads
> that generate a lot of alloc and free calls.
>
> The memcg_stock_pcp is used to cache the byte charge for a specific
> obj_cgroup to reduce that overhead. To further reduce it, this patch
> caches the vmstat data in the memcg_stock_pcp structure as well, until
> a page-size worth of updates accumulates or other cached data change.
> Caching the vmstat data in the per-cpu stock replaces two writes to
> non-hot cachelines, for memcg-specific as well as memcg-lruvec-specific
> vmstat data, with a single write to a hot local stock cacheline.
>
> On a 2-socket Cascade Lake server with instrumentation enabled and this
> patch applied, it was found that only about 20% (634400 out of 3243830)
> of the calls to mod_objcg_state() after initial boot led to an actual
> call to __mod_objcg_state(). When doing a parallel kernel build, the
> figure was about 17% (24329265 out of 142512465). So caching the vmstat
> data reduces the number of calls to __mod_objcg_state() by more than 80%.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>
> ---
> mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 83 insertions(+), 3 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7cd7187a017c..292b4783b1a7 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
> rcu_read_unlock();
> }
>
> -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> - enum node_stat_item idx, int nr)
> +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
> + struct pglist_data *pgdat,
> + enum node_stat_item idx, int nr)
> {
> struct mem_cgroup *memcg;
> struct lruvec *lruvec;
> @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> rcu_read_lock();
> memcg = obj_cgroup_memcg(objcg);
> lruvec = mem_cgroup_lruvec(memcg, pgdat);
> - mod_memcg_lruvec_state(lruvec, idx, nr);
> + __mod_memcg_lruvec_state(lruvec, idx, nr);
> rcu_read_unlock();
> }
>
> @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp {
>
> #ifdef CONFIG_MEMCG_KMEM
> struct obj_cgroup *cached_objcg;
> + struct pglist_data *cached_pgdat;
I wonder if we want to have per-node counters instead?
That would complicate the initialization of pcp stocks a bit,
but might shave off some additional cpu time.
But we can do it later too.
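Something like this, purely to illustrate the idea (untested; the
MAX_NUMNODES-sized array and the field names are just assumptions):

struct obj_stock_node {
	int nr_slab_reclaimable_b;
	int nr_slab_unreclaimable_b;
};

struct memcg_stock_pcp {
	/* other fields omitted */
	struct obj_cgroup *cached_objcg;
	unsigned int nr_bytes;
	/* cached_pgdat goes away; a flush walks the populated nodes */
	struct obj_stock_node node[MAX_NUMNODES];
};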
> unsigned int nr_bytes;
> + int nr_slab_reclaimable_b;
> + int nr_slab_unreclaimable_b;
> #endif
>
> struct work_struct work;
> @@ -3008,6 +3012,63 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
> obj_cgroup_put(objcg);
> }
>
> +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> + enum node_stat_item idx, int nr)
> +{
> + struct memcg_stock_pcp *stock;
> + unsigned long flags;
> + int *bytes;
> +
> + local_irq_save(flags);
> + stock = this_cpu_ptr(&memcg_stock);
> +
> + /*
> + * Save vmstat data in stock and skip vmstat array update unless
> + * accumulating over a page of vmstat data or when pgdat or idx
> + * changes.
> + */
> + if (stock->cached_objcg != objcg) {
> + drain_obj_stock(stock);
> + obj_cgroup_get(objcg);
> + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
> + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
> + stock->cached_objcg = objcg;
> + stock->cached_pgdat = pgdat;
> + } else if (stock->cached_pgdat != pgdat) {
> + /* Flush the existing cached vmstat data */
> + if (stock->nr_slab_reclaimable_b) {
> + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
> + stock->nr_slab_reclaimable_b);
> + stock->nr_slab_reclaimable_b = 0;
> + }
> + if (stock->nr_slab_unreclaimable_b) {
> + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
> + stock->nr_slab_unreclaimable_b);
> + stock->nr_slab_unreclaimable_b = 0;
> + }
> + stock->cached_pgdat = pgdat;
> + }
> +
> + bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
> + : &stock->nr_slab_unreclaimable_b;
> + if (!*bytes) {
> + *bytes = nr;
> + nr = 0;
> + } else {
> + *bytes += nr;
> + if (abs(*bytes) > PAGE_SIZE) {
> + nr = *bytes;
> + *bytes = 0;
> + } else {
> + nr = 0;
> + }
> + }
This part is a little bit hard to follow; how about something like this
(completely untested):
{
stocked = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
: &stock->nr_slab_unreclaimable_b;
if (abs(*stocked + nr) > PAGE_SIZE) {
nr += *stocked;
*stocked = 0;
} else {
*stocked += nr;
nr = 0;
}
}
> + if (nr)
> + mod_objcg_mlstate(objcg, pgdat, idx, nr);
> +
> + local_irq_restore(flags);
> +}
> +
> static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> {
> struct memcg_stock_pcp *stock;
> @@ -3055,6 +3116,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
> stock->nr_bytes = 0;
> }
>
> + /*
> + * Flush the vmstat data in current stock
> + */
> + if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
> + if (stock->nr_slab_reclaimable_b) {
> + mod_objcg_mlstate(old, stock->cached_pgdat,
> + NR_SLAB_RECLAIMABLE_B,
> + stock->nr_slab_reclaimable_b);
> + stock->nr_slab_reclaimable_b = 0;
> + }
> + if (stock->nr_slab_unreclaimable_b) {
> + mod_objcg_mlstate(old, stock->cached_pgdat,
> + NR_SLAB_UNRECLAIMABLE_B,
> + stock->nr_slab_unreclaimable_b);
> + stock->nr_slab_unreclaimable_b = 0;
> + }
> + stock->cached_pgdat = NULL;
> + }
> +
> obj_cgroup_put(old);
> stock->cached_objcg = NULL;
> }
> --
> 2.18.1
>
* Re: [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp
2021-04-21 23:28 ` Roman Gushchin
@ 2021-04-22 16:58 ` Waiman Long
2021-04-23 1:56 ` Roman Gushchin
0 siblings, 1 reply; 16+ messages in thread
From: Waiman Long @ 2021-04-22 16:58 UTC (permalink / raw)
To: Roman Gushchin
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On 4/21/21 7:28 PM, Roman Gushchin wrote:
> On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote:
>> Before the new slab memory controller with per-object byte charging,
>> charging and vmstat data updates happened only when new slab pages were
>> allocated or freed. Now they are done with every kmem_cache_alloc()
>> and kmem_cache_free(). This causes additional overhead for workloads
>> that generate a lot of alloc and free calls.
>>
>> The memcg_stock_pcp is used to cache the byte charge for a specific
>> obj_cgroup to reduce that overhead. To further reduce it, this patch
>> caches the vmstat data in the memcg_stock_pcp structure as well, until
>> a page-size worth of updates accumulates or other cached data change.
>> Caching the vmstat data in the per-cpu stock replaces two writes to
>> non-hot cachelines, for memcg-specific as well as memcg-lruvec-specific
>> vmstat data, with a single write to a hot local stock cacheline.
>>
>> On a 2-socket Cascade Lake server with instrumentation enabled and this
>> patch applied, it was found that only about 20% (634400 out of 3243830)
>> of the calls to mod_objcg_state() after initial boot led to an actual
>> call to __mod_objcg_state(). When doing a parallel kernel build, the
>> figure was about 17% (24329265 out of 142512465). So caching the vmstat
>> data reduces the number of calls to __mod_objcg_state() by more than 80%.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> Reviewed-by: Shakeel Butt <shakeelb@google.com>
>> ---
>> mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++--
>> 1 file changed, 83 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 7cd7187a017c..292b4783b1a7 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
>> rcu_read_unlock();
>> }
>>
>> -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>> - enum node_stat_item idx, int nr)
>> +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
>> + struct pglist_data *pgdat,
>> + enum node_stat_item idx, int nr)
>> {
>> struct mem_cgroup *memcg;
>> struct lruvec *lruvec;
>> @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>> rcu_read_lock();
>> memcg = obj_cgroup_memcg(objcg);
>> lruvec = mem_cgroup_lruvec(memcg, pgdat);
>> - mod_memcg_lruvec_state(lruvec, idx, nr);
>> + __mod_memcg_lruvec_state(lruvec, idx, nr);
>> rcu_read_unlock();
>> }
>>
>> @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp {
>>
>> #ifdef CONFIG_MEMCG_KMEM
>> struct obj_cgroup *cached_objcg;
>> + struct pglist_data *cached_pgdat;
> I wonder if we want to have per-node counters instead?
> That would complicate the initialization of pcp stocks a bit,
> but might shave off some additional cpu time.
> But we can do it later too.
>
A per-node counter will certainly complicate the code and reduce the
performance benefit too. I got a pretty good hit rate of 80%+ with the
current code on a 2-socket system. The hit rate will probably drop when
there are more nodes. I will do some more investigation, but it will not
be for this patchset.
>> unsigned int nr_bytes;
>> + int nr_slab_reclaimable_b;
>> + int nr_slab_unreclaimable_b;
>> #endif
>>
>> struct work_struct work;
>> @@ -3008,6 +3012,63 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
>> obj_cgroup_put(objcg);
>> }
>>
>> +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>> + enum node_stat_item idx, int nr)
>> +{
>> + struct memcg_stock_pcp *stock;
>> + unsigned long flags;
>> + int *bytes;
>> +
>> + local_irq_save(flags);
>> + stock = this_cpu_ptr(&memcg_stock);
>> +
>> + /*
>> + * Save vmstat data in stock and skip vmstat array update unless
>> + * accumulating over a page of vmstat data or when pgdat or idx
>> + * changes.
>> + */
>> + if (stock->cached_objcg != objcg) {
>> + drain_obj_stock(stock);
>> + obj_cgroup_get(objcg);
>> + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
>> + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
>> + stock->cached_objcg = objcg;
>> + stock->cached_pgdat = pgdat;
>> + } else if (stock->cached_pgdat != pgdat) {
>> + /* Flush the existing cached vmstat data */
>> + if (stock->nr_slab_reclaimable_b) {
>> + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
>> + stock->nr_slab_reclaimable_b);
>> + stock->nr_slab_reclaimable_b = 0;
>> + }
>> + if (stock->nr_slab_unreclaimable_b) {
>> + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
>> + stock->nr_slab_unreclaimable_b);
>> + stock->nr_slab_unreclaimable_b = 0;
>> + }
>> + stock->cached_pgdat = pgdat;
>> + }
>> +
>> + bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
>> + : &stock->nr_slab_unreclaimable_b;
>> + if (!*bytes) {
>> + *bytes = nr;
>> + nr = 0;
>> + } else {
>> + *bytes += nr;
>> + if (abs(*bytes) > PAGE_SIZE) {
>> + nr = *bytes;
>> + *bytes = 0;
>> + } else {
>> + nr = 0;
>> + }
>> + }
> This part is a little bit hard to follow; how about something like this
> (completely untested):
>
> {
> stocked = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
> : &stock->nr_slab_unreclaimable_b;
> if (abs(*stocked + nr) > PAGE_SIZE) {
> nr += *stocked;
> *stocked = 0;
> } else {
> *stocked += nr;
> nr = 0;
> }
> }
That was done purposely to make sure that a large object (>= 4k) will
also be cached once before being flushed out. I should have made that
clearer by adding a comment. The vmstat data isn't as critical as the
memory charge, so I am allowing it to cache more than 4k in this case.
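For a single update larger than a page (say nr = 5000 against an empty
stock), the two variants behave differently (annotated fragments, for
illustration only):

	/* this patch: the first update is always cached, even if large */
	if (!*bytes) {			/* *bytes == 0 */
		*bytes = nr;		/* 5000 bytes cached, no flush */
		nr = 0;
	}

	/* suggested variant: an oversized update would flush immediately */
	if (abs(*stocked + nr) > PAGE_SIZE) {	/* |0 + 5000| > 4096 */
		nr += *stocked;			/* 5000 flushed right away */
		*stocked = 0;
	}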
Cheers,
Longman
* Re: [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp
2021-04-22 16:58 ` Waiman Long
@ 2021-04-23 1:56 ` Roman Gushchin
2021-04-23 16:52 ` Waiman Long
0 siblings, 1 reply; 16+ messages in thread
From: Roman Gushchin @ 2021-04-23 1:56 UTC (permalink / raw)
To: Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On Thu, Apr 22, 2021 at 12:58:52PM -0400, Waiman Long wrote:
> On 4/21/21 7:28 PM, Roman Gushchin wrote:
> > On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote:
> > > Before the new slab memory controller with per-object byte charging,
> > > charging and vmstat data updates happened only when new slab pages were
> > > allocated or freed. Now they are done with every kmem_cache_alloc()
> > > and kmem_cache_free(). This causes additional overhead for workloads
> > > that generate a lot of alloc and free calls.
> > >
> > > The memcg_stock_pcp is used to cache the byte charge for a specific
> > > obj_cgroup to reduce that overhead. To further reduce it, this patch
> > > caches the vmstat data in the memcg_stock_pcp structure as well, until
> > > a page-size worth of updates accumulates or other cached data change.
> > > Caching the vmstat data in the per-cpu stock replaces two writes to
> > > non-hot cachelines, for memcg-specific as well as memcg-lruvec-specific
> > > vmstat data, with a single write to a hot local stock cacheline.
> > >
> > > On a 2-socket Cascade Lake server with instrumentation enabled and this
> > > patch applied, it was found that only about 20% (634400 out of 3243830)
> > > of the calls to mod_objcg_state() after initial boot led to an actual
> > > call to __mod_objcg_state(). When doing a parallel kernel build, the
> > > figure was about 17% (24329265 out of 142512465). So caching the vmstat
> > > data reduces the number of calls to __mod_objcg_state() by more than 80%.
> > >
> > > Signed-off-by: Waiman Long <longman@redhat.com>
> > > Reviewed-by: Shakeel Butt <shakeelb@google.com>
> > > ---
> > > mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++--
> > > 1 file changed, 83 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 7cd7187a017c..292b4783b1a7 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
> > > rcu_read_unlock();
> > > }
> > > -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> > > - enum node_stat_item idx, int nr)
> > > +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
> > > + struct pglist_data *pgdat,
> > > + enum node_stat_item idx, int nr)
> > > {
> > > struct mem_cgroup *memcg;
> > > struct lruvec *lruvec;
> > > @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> > > rcu_read_lock();
> > > memcg = obj_cgroup_memcg(objcg);
> > > lruvec = mem_cgroup_lruvec(memcg, pgdat);
> > > - mod_memcg_lruvec_state(lruvec, idx, nr);
> > > + __mod_memcg_lruvec_state(lruvec, idx, nr);
> > > rcu_read_unlock();
> > > }
> > > @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp {
> > > #ifdef CONFIG_MEMCG_KMEM
> > > struct obj_cgroup *cached_objcg;
> > > + struct pglist_data *cached_pgdat;
> > I wonder if we want to have per-node counters instead?
> > That would complicate the initialization of pcp stocks a bit,
> > but might shave off some additional cpu time.
> > But we can do it later too.
> >
> A per-node counter will certainly complicate the code and reduce the
> performance benefit too.
Hm, why? We wouldn't need to flush the stock if the release happens
on some other cpu not matching the current pgdat.
> I got a pretty good hit rate of 80%+ with the
> current code on a 2-socket system. The hit rate will probably drop when
> there are more nodes. I will do some more investigation, but it will not be
> for this patchset.
Works for me!
Thanks!
* Re: [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp
2021-04-23 1:56 ` Roman Gushchin
@ 2021-04-23 16:52 ` Waiman Long
0 siblings, 0 replies; 16+ messages in thread
From: Waiman Long @ 2021-04-23 16:52 UTC (permalink / raw)
To: Roman Gushchin, Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On 4/22/21 9:56 PM, Roman Gushchin wrote:
> On Thu, Apr 22, 2021 at 12:58:52PM -0400, Waiman Long wrote:
>> On 4/21/21 7:28 PM, Roman Gushchin wrote:
>>> On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote:
>>>> Before the new slab memory controller with per-object byte charging,
>>>> charging and vmstat data updates happened only when new slab pages were
>>>> allocated or freed. Now they are done with every kmem_cache_alloc()
>>>> and kmem_cache_free(). This causes additional overhead for workloads
>>>> that generate a lot of alloc and free calls.
>>>>
>>>> The memcg_stock_pcp is used to cache the byte charge for a specific
>>>> obj_cgroup to reduce that overhead. To further reduce it, this patch
>>>> caches the vmstat data in the memcg_stock_pcp structure as well, until
>>>> a page-size worth of updates accumulates or other cached data change.
>>>> Caching the vmstat data in the per-cpu stock replaces two writes to
>>>> non-hot cachelines, for memcg-specific as well as memcg-lruvec-specific
>>>> vmstat data, with a single write to a hot local stock cacheline.
>>>>
>>>> On a 2-socket Cascade Lake server with instrumentation enabled and this
>>>> patch applied, it was found that only about 20% (634400 out of 3243830)
>>>> of the calls to mod_objcg_state() after initial boot led to an actual
>>>> call to __mod_objcg_state(). When doing a parallel kernel build, the
>>>> figure was about 17% (24329265 out of 142512465). So caching the vmstat
>>>> data reduces the number of calls to __mod_objcg_state() by more than 80%.
>>>>
>>>> Signed-off-by: Waiman Long <longman@redhat.com>
>>>> Reviewed-by: Shakeel Butt <shakeelb@google.com>
>>>> ---
>>>> mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++--
>>>> 1 file changed, 83 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 7cd7187a017c..292b4783b1a7 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
>>>> rcu_read_unlock();
>>>> }
>>>> -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>>>> - enum node_stat_item idx, int nr)
>>>> +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
>>>> + struct pglist_data *pgdat,
>>>> + enum node_stat_item idx, int nr)
>>>> {
>>>> struct mem_cgroup *memcg;
>>>> struct lruvec *lruvec;
>>>> @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>>>> rcu_read_lock();
>>>> memcg = obj_cgroup_memcg(objcg);
>>>> lruvec = mem_cgroup_lruvec(memcg, pgdat);
>>>> - mod_memcg_lruvec_state(lruvec, idx, nr);
>>>> + __mod_memcg_lruvec_state(lruvec, idx, nr);
>>>> rcu_read_unlock();
>>>> }
>>>> @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp {
>>>> #ifdef CONFIG_MEMCG_KMEM
>>>> struct obj_cgroup *cached_objcg;
>>>> + struct pglist_data *cached_pgdat;
>>> I wonder if we want to have per-node counters instead?
>>> That would complicate the initialization of pcp stocks a bit,
>>> but might shave off some additional cpu time.
>>> But we can do it later too.
>>>
>> A per-node counter will certainly complicate the code and reduce the
>> performance benefit too.
> Hm, why? We wouldn't need to flush the stock if the release happens
> on some other cpu not matching the current pgdat.
I had actually experimented with just caching vmstat data for the local
node only. It turned out the hit rate was a bit lower. That is why I
keep the current approach and will do further investigation on a
better approach.
Cheers,
Longman
* [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance
2021-04-20 19:29 [PATCH-next v5 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
2021-04-20 19:29 ` [PATCH-next v5 1/4] mm/memcg: Move mod_objcg_state() to memcontrol.c Waiman Long
2021-04-20 19:29 ` [PATCH-next v5 2/4] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp Waiman Long
@ 2021-04-20 19:29 ` Waiman Long
2021-04-21 23:55 ` Roman Gushchin
2021-04-26 19:24 ` Shakeel Butt
2021-04-20 19:29 ` [PATCH-next v5 4/4] mm/memcg: Optimize user context object stock access Waiman Long
3 siblings, 2 replies; 16+ messages in thread
From: Waiman Long @ 2021-04-20 19:29 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
There are two issues with the current refill_obj_stock() code. First of
all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
and do an obj_cgroup_put(). It is likely that the same obj_cgroup will
be used again, which leads to another call to drain_obj_stock() and
obj_cgroup_get(), as well as an atomic retrieval of the available bytes from
obj_cgroup. That is costly. Instead, we should just uncharge the excess
pages, reduce the stock bytes and be done with it. The drain_obj_stock()
function should only be called when obj_cgroup changes.
Secondly, when charging an object of size not less than a page in
obj_cgroup_charge(), it is possible that the remaining bytes to be
refilled to the stock will overflow a page and cause refill_obj_stock()
to uncharge 1 page. To avoid the additional uncharge in this case,
a new overfill flag is added to refill_obj_stock() which will be set
when called from obj_cgroup_charge().
A multithreaded kmalloc+kfree microbenchmark on a 2-socket 48-core
96-thread x86-64 system with 96 testing threads was run. Before this
patch, the aggregate rate of kmalloc+kfree operations done on a 4k
large object by all the testing threads was 4,304 kops/s
(cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the
numbers were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This
represents a performance improvement of 1.10X (cgroup v1) and 49.3X
(cgroup v2).
Signed-off-by: Waiman Long <longman@redhat.com>
---
mm/memcontrol.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 292b4783b1a7..2f87d0b05092 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3153,10 +3153,12 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
return false;
}
-static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ bool overfill)
{
struct memcg_stock_pcp *stock;
unsigned long flags;
+ unsigned int nr_pages = 0;
local_irq_save(flags);
@@ -3165,14 +3167,20 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->cached_objcg = objcg;
- stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
}
stock->nr_bytes += nr_bytes;
- if (stock->nr_bytes > PAGE_SIZE)
- drain_obj_stock(stock);
+ if (!overfill && (stock->nr_bytes > PAGE_SIZE)) {
+ nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ stock->nr_bytes &= (PAGE_SIZE - 1);
+ }
local_irq_restore(flags);
+
+ if (nr_pages)
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
}
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
@@ -3201,14 +3209,14 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, true);
return ret;
}
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size);
+ refill_obj_stock(objcg, size, false);
}
#endif /* CONFIG_MEMCG_KMEM */
--
2.18.1
* Re: [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance
2021-04-20 19:29 ` [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance Waiman Long
@ 2021-04-21 23:55 ` Roman Gushchin
2021-04-22 17:26 ` Waiman Long
2021-04-26 19:24 ` Shakeel Butt
1 sibling, 1 reply; 16+ messages in thread
From: Roman Gushchin @ 2021-04-21 23:55 UTC (permalink / raw)
To: Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On Tue, Apr 20, 2021 at 03:29:06PM -0400, Waiman Long wrote:
> There are two issues with the current refill_obj_stock() code. First of
> all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
> atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
> and do an obj_cgroup_put(). It is likely that the same obj_cgroup will
> be used again, which leads to another call to drain_obj_stock() and
> obj_cgroup_get(), as well as an atomic retrieval of the available bytes from
> obj_cgroup. That is costly. Instead, we should just uncharge the excess
> pages, reduce the stock bytes and be done with it. The drain_obj_stock()
> function should only be called when obj_cgroup changes.
I really like this idea! Thanks!
However, I wonder if it can be implemented more simply by splitting drain_obj_stock()
into two functions:
empty_obj_stock() will flush cached bytes, but not reset the objcg
drain_obj_stock() will call empty_obj_stock() and then reset objcg
Then we can simply replace the second drain_obj_stock() in
refill_obj_stock() with empty_obj_stock(). What do you think?
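For illustration, the split might look like this (untested; names as
suggested above):

static void empty_obj_stock(struct memcg_stock_pcp *stock)
{
	/* flush stock->nr_bytes and the cached vmstat data, but keep
	 * stock->cached_objcg and its reference */
}

static void drain_obj_stock(struct memcg_stock_pcp *stock)
{
	empty_obj_stock(stock);
	if (stock->cached_objcg) {
		obj_cgroup_put(stock->cached_objcg);
		stock->cached_objcg = NULL;
	}
}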
>
> Secondly, when charging an object of size not less than a page in
> obj_cgroup_charge(), it is possible that the remaining bytes to be
> refilled to the stock will overflow a page and cause refill_obj_stock()
> to uncharge 1 page. To avoid the additional uncharge in this case,
> a new overfill flag is added to refill_obj_stock() which will be set
> when called from obj_cgroup_charge().
>
> A multithreaded kmalloc+kfree microbenchmark on a 2-socket 48-core
> 96-thread x86-64 system with 96 testing threads was run. Before this
> patch, the aggregate rate of kmalloc+kfree operations done on a 4k
> large object by all the testing threads was 4,304 kops/s
> (cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the
> numbers were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This
> represents a performance improvement of 1.10X (cgroup v1) and 49.3X
> (cgroup v2).
This part looks more controversial. Basically if there are N consecutive
allocations of size (PAGE_SIZE + x), the stock will end up with (N * x)
cached bytes, right? It's not the end of the world, but do we really
need it given that uncharging a page is also cached?
Thanks!
* Re: [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance
2021-04-21 23:55 ` Roman Gushchin
@ 2021-04-22 17:26 ` Waiman Long
2021-04-23 2:28 ` Roman Gushchin
0 siblings, 1 reply; 16+ messages in thread
From: Waiman Long @ 2021-04-22 17:26 UTC (permalink / raw)
To: Roman Gushchin
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On 4/21/21 7:55 PM, Roman Gushchin wrote:
> On Tue, Apr 20, 2021 at 03:29:06PM -0400, Waiman Long wrote:
>> There are two issues with the current refill_obj_stock() code. First of
>> all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
>> atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
>> and do an obj_cgroup_put(). It is likely that the same obj_cgroup will
>> be used again, which leads to another call to drain_obj_stock() and
>> obj_cgroup_get(), as well as an atomic retrieval of the available bytes from
>> obj_cgroup. That is costly. Instead, we should just uncharge the excess
>> pages, reduce the stock bytes and be done with it. The drain_obj_stock()
>> function should only be called when obj_cgroup changes.
> I really like this idea! Thanks!
>
> However, I wonder if it can be implemented more simply by splitting drain_obj_stock()
> into two functions:
> empty_obj_stock() will flush cached bytes, but not reset the objcg
> drain_obj_stock() will call empty_obj_stock() and then reset objcg
>
> Then we can simply replace the second drain_obj_stock() in
> refill_obj_stock() with empty_obj_stock(). What do you think?
Actually the problem is that flushing the cached bytes to
objcg->nr_charged_bytes can become a performance bottleneck in a
multithreaded testing scenario. See my description in the latter half of
my cover letter.
For cgroup v2, updating the page charge will mostly update the per-cpu
page charge stock. Flushing the remaining byte charge, however, will
cause the objcg to become the single contended cacheline for all the
cpus that need to flush the byte charge. That is why I only update the
page charge and leave the remaining byte charge in the object stock.
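In code terms, the contended write being avoided is the atomic_add()
in drain_obj_stock(), while whole pages go through the cached uncharge
path (fragments for illustration):

	/* drain_obj_stock(): every cpu flushing its remainder writes
	 * to the same objcg cacheline */
	atomic_add(nr_bytes, &old->nr_charged_bytes);

	/* refill_obj_stock() with this patch: only whole pages are
	 * uncharged; for cgroup v2 that mostly hits the per-cpu page
	 * charge stock, and the sub-page remainder stays in the local
	 * object stock */
	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);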
>
>> Secondly, when charging an object of size not less than a page in
>> obj_cgroup_charge(), it is possible that the remaining bytes to be
>> refilled to the stock will overflow a page and cause refill_obj_stock()
>> to uncharge 1 page. To avoid the additional uncharge in this case,
>> a new overfill flag is added to refill_obj_stock() which will be set
>> when called from obj_cgroup_charge().
>>
>> A multithreaded kmalloc+kfree microbenchmark on a 2-socket 48-core
>> 96-thread x86-64 system with 96 testing threads was run. Before this
>> patch, the aggregate rate of kmalloc+kfree operations done on a 4k
>> large object by all the testing threads was 4,304 kops/s
>> (cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the
>> numbers were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This
>> represents a performance improvement of 1.10X (cgroup v1) and 49.3X
>> (cgroup v2).
> This part looks more controversial. Basically if there are N consecutive
> allocations of size (PAGE_SIZE + x), the stock will end up with (N * x)
> cached bytes, right? It's not the end of the world, but do we really
> need it given that uncharging a page is also cached?
Actually the maximum charge that can be accumulated is (2*PAGE_SIZE + x
- 1), since a following consume_obj_stock() will use those bytes once the
byte charge is not less than (PAGE_SIZE + x).
Yes, the page charge is cached for v2, but that is not the case for v1.
See the benchmark data in the cover letter.
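As a concrete trace with PAGE_SIZE = 4096 and x = 100 (object size
4196): the first obj_cgroup_charge() charges two pages and refills
PAGE_SIZE - x = 3996 bytes into the stock; the second charge cannot be
consumed from the stock (3996 < 4196), so it again charges two pages
and refills 3996 more, leaving 7992 bytes; the third charge is then
satisfied from the stock (7992 >= 4196). The cached bytes thus peak at
7992, below the 2*PAGE_SIZE + x - 1 = 8291 bound.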
Cheers,
Longman
* Re: [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance
2021-04-22 17:26 ` Waiman Long
@ 2021-04-23 2:28 ` Roman Gushchin
2021-04-23 20:06 ` Waiman Long
0 siblings, 1 reply; 16+ messages in thread
From: Roman Gushchin @ 2021-04-23 2:28 UTC (permalink / raw)
To: Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On Thu, Apr 22, 2021 at 01:26:08PM -0400, Waiman Long wrote:
> On 4/21/21 7:55 PM, Roman Gushchin wrote:
> > On Tue, Apr 20, 2021 at 03:29:06PM -0400, Waiman Long wrote:
> > > There are two issues with the current refill_obj_stock() code. First of
> > > all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
> > > atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
> > > and do an obj_cgroup_put(). It is likely that the same obj_cgroup will
> > > be used again, which leads to another call to drain_obj_stock() and
> > > obj_cgroup_get(), as well as an atomic retrieval of the available bytes from
> > > obj_cgroup. That is costly. Instead, we should just uncharge the excess
> > > pages, reduce the stock bytes and be done with it. The drain_obj_stock()
> > > function should only be called when obj_cgroup changes.
> > I really like this idea! Thanks!
> >
> > However, I wonder if it can be implemented more simply by splitting drain_obj_stock()
> > into two functions:
> > empty_obj_stock() will flush cached bytes, but not reset the objcg
> > drain_obj_stock() will call empty_obj_stock() and then reset objcg
> >
> > Then we can simply replace the second drain_obj_stock() in
> > refill_obj_stock() with empty_obj_stock(). What do you think?
>
> Actually the problem is that flushing the cached bytes to objcg->nr_charged_bytes
> can become a performance bottleneck in a multithreaded testing
> scenario. See my description in the latter half of my cover letter.
>
> For cgroup v2, updating the page charge will mostly update the per-cpu page
> charge stock. Flushing the remaining byte charge, however, will cause the
> objcg to become the single contended cacheline for all the cpus that need to
> flush the byte charge. That is why I only update the page charge and leave
> the remaining byte charge in the object stock.
>
> >
> > > Secondly, when charging an object of size not less than a page in
> > > obj_cgroup_charge(), it is possible that the remaining bytes to be
> > > refilled to the stock will overflow a page and cause refill_obj_stock()
> > > to uncharge 1 page. To avoid the additional uncharge in this case,
> > > a new overfill flag is added to refill_obj_stock() which will be set
> > > when called from obj_cgroup_charge().
> > >
> > > A multithreaded kmalloc+kfree microbenchmark on a 2-socket 48-core
> > > 96-thread x86-64 system with 96 testing threads was run. Before this
> > > patch, the aggregate rate of kmalloc+kfree operations done on a 4k
> > > large object by all the testing threads was 4,304 kops/s
> > > (cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the
> > > numbers were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This
> > > represents a performance improvement of 1.10X (cgroup v1) and 49.3X
> > > (cgroup v2).
> > This part looks more controversial. Basically if there are N consecutive
> > allocations of size (PAGE_SIZE + x), the stock will end up with (N * x)
> > cached bytes, right? It's not the end of the world, but do we really
> > need it given that uncharging a page is also cached?
>
> Actually the maximum charge that can be accumulated is (2*PAGE_SIZE + x - 1),
> since a following consume_obj_stock() will use those bytes once the byte
> charge is not less than (PAGE_SIZE + x).
Got it, thank you for the explanation!
Can you, please, add a comment explaining what the "overfill" parameter does
and why it has different values on the charge and uncharge paths?
Personally, I'd invert its meaning and rename it to something like "trim"
or just plain "bool charge".
I think the simple explanation is that during a charge we can't refill more
than PAGE_SIZE - 1 bytes, and the following allocation will likely use it, or
the following deallocation will trim it if necessary.
And on the uncharge path there are no bounds, and the following deallocation
can only increase the cached value.
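Sketched against the code in this patch (untested; "trim" as suggested
above, with the charge path passing trim = false and the uncharge path
trim = true):

	stock->nr_bytes += nr_bytes;

	/*
	 * The charge path (trim == false) is allowed to overfill the
	 * stock; the uncharge path trims any whole pages.
	 */
	if (trim && (stock->nr_bytes > PAGE_SIZE)) {
		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		stock->nr_bytes &= (PAGE_SIZE - 1);
	}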
Thanks!
* Re: [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance
2021-04-23 2:28 ` Roman Gushchin
@ 2021-04-23 20:06 ` Waiman Long
0 siblings, 0 replies; 16+ messages in thread
From: Waiman Long @ 2021-04-23 20:06 UTC (permalink / raw)
To: Roman Gushchin, Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, linux-kernel, cgroups, linux-mm,
Shakeel Butt, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On 4/22/21 10:28 PM, Roman Gushchin wrote:
> On Thu, Apr 22, 2021 at 01:26:08PM -0400, Waiman Long wrote:
>> On 4/21/21 7:55 PM, Roman Gushchin wrote:
>>> On Tue, Apr 20, 2021 at 03:29:06PM -0400, Waiman Long wrote:
>>>> There are two issues with the current refill_obj_stock() code. First of
>>>> all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
>>>> atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
>>>> and do an obj_cgroup_put(). It is likely that the same obj_cgroup will
>>>> be used again, which leads to another call to drain_obj_stock() and
>>>> obj_cgroup_get(), as well as an atomic retrieval of the available bytes from
>>>> obj_cgroup. That is costly. Instead, we should just uncharge the excess
>>>> pages, reduce the stock bytes and be done with it. The drain_obj_stock()
>>>> function should only be called when obj_cgroup changes.
>>> I really like this idea! Thanks!
>>>
>>> However, I wonder if it can be implemented more simply by splitting drain_obj_stock()
>>> into two functions:
>>> empty_obj_stock() will flush cached bytes, but not reset the objcg
>>> drain_obj_stock() will call empty_obj_stock() and then reset objcg
>>>
>>> Then we can simply replace the second drain_obj_stock() in
>>> refill_obj_stock() with empty_obj_stock(). What do you think?
>> Actually the problem is that flushing the cached bytes to objcg->nr_charged_bytes
>> can become a performance bottleneck in a multithreaded testing
>> scenario. See my description in the latter half of my cover letter.
>>
>> For cgroup v2, updating the page charge will mostly update the per-cpu page
>> charge stock. Flushing the remaining byte charge, however, will cause the
>> objcg to become the single contended cacheline for all the cpus that need to
>> flush the byte charge. That is why I only update the page charge and leave
>> the remaining byte charge in the object stock.
>>
>>>> Secondly, when charging an object of size not less than a page in
>>>> obj_cgroup_charge(), it is possible that the remaining bytes to be
>>>> refilled to the stock will overflow a page and cause refill_obj_stock()
>>>> to uncharge 1 page. To avoid the additional uncharge in this case,
>>>> a new overfill flag is added to refill_obj_stock() which will be set
>>>> when called from obj_cgroup_charge().
>>>>
>>>> A multithreaded kmalloc+kfree microbenchmark on a 2-socket 48-core
>>>> 96-thread x86-64 system with 96 testing threads was run. Before this
>>>> patch, the aggregate rate of kmalloc+kfree operations done on a 4k
>>>> large object by all the testing threads was 4,304 kops/s
>>>> (cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the
>>>> numbers were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This
>>>> represents a performance improvement of 1.10X (cgroup v1) and 49.3X
>>>> (cgroup v2).
>>> This part looks more controversial. Basically if there are N consecutive
>>> allocations of size (PAGE_SIZE + x), the stock will end up with (N * x)
>>> cached bytes, right? It's not the end of the world, but do we really
>>> need it given that uncharging a page is also cached?
>> Actually the maximum charge that can be accumulated is (2*PAGE_SIZE + x - 1),
>> since a following consume_obj_stock() will use those bytes once the byte
>> charge is not less than (PAGE_SIZE + x).
> Got it, thank you for the explanation!
>
> Can you, please, add a comment explaining what the "overfill" parameter does
> and why it has different values on the charge and uncharge paths?
> Personally, I'd invert its meaning and rename it to something like "trim"
> or just plain "bool charge".
> I think the simple explanation is that during a charge we can't refill more
> than PAGE_SIZE - 1 bytes, and the following allocation will likely use it, or
> the following deallocation will trim it if necessary.
> And on the uncharge path there are no bounds, and the following deallocation
> can only increase the cached value.
Yes, that is the intention. I will make the suggested change and put in
a comment about it.
Thanks,
Longman
* Re: [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance
2021-04-20 19:29 ` [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance Waiman Long
2021-04-21 23:55 ` Roman Gushchin
@ 2021-04-26 19:24 ` Shakeel Butt
1 sibling, 0 replies; 16+ messages in thread
From: Shakeel Butt @ 2021-04-26 19:24 UTC (permalink / raw)
To: Waiman Long
Cc: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin, LKML, Cgroups,
Linux MM, Muchun Song, Alex Shi, Chris Down, Yafang Shao,
Wei Yang, Masayoshi Mizuma, Xing Zhengjun, Matthew Wilcox
On Tue, Apr 20, 2021 at 12:30 PM Waiman Long <longman@redhat.com> wrote:
>
> There are two issues with the current refill_obj_stock() code. First of
> all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
> atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
> and do an obj_cgroup_put(). It is likely that the same obj_cgroup will
> be used again, which leads to another call to drain_obj_stock() and
> obj_cgroup_get(), as well as an atomic retrieval of the available bytes from
> obj_cgroup. That is costly. Instead, we should just uncharge the excess
> pages, reduce the stock bytes and be done with it. The drain_obj_stock()
> function should only be called when obj_cgroup changes.
>
> Secondly, when charging an object of size not less than a page in
> obj_cgroup_charge(), it is possible that the remaining bytes to be
> refilled to the stock will overflow a page and cause refill_obj_stock()
> to uncharge 1 page. To avoid the additional uncharge in this case,
> a new overfill flag is added to refill_obj_stock() which will be set
> when called from obj_cgroup_charge().
>
> A multithreaded kmalloc+kfree microbenchmark on a 2-socket 48-core
> 96-thread x86-64 system with 96 testing threads was run. Before this
> patch, the aggregate rate of kmalloc+kfree operations done on a 4k
> large object by all the testing threads was 4,304 kops/s
> (cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the
> numbers were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This
> represents a performance improvement of 1.10X (cgroup v1) and 49.3X
> (cgroup v2).
>
> Signed-off-by: Waiman Long <longman@redhat.com>
After incorporating Roman's suggestion, you can add:
Reviewed-by: Shakeel Butt <shakeelb@google.com>
* [PATCH-next v5 4/4] mm/memcg: Optimize user context object stock access
2021-04-20 19:29 [PATCH-next v5 0/4] mm/memcg: Reduce kmemcache memory accounting overhead Waiman Long
` (2 preceding siblings ...)
2021-04-20 19:29 ` [PATCH-next v5 3/4] mm/memcg: Improve refill_obj_stock() performance Waiman Long
@ 2021-04-20 19:29 ` Waiman Long
3 siblings, 0 replies; 16+ messages in thread
From: Waiman Long @ 2021-04-20 19:29 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko, Vladimir Davydov, Andrew Morton,
Tejun Heo, Christoph Lameter, Pekka Enberg, David Rientjes,
Joonsoo Kim, Vlastimil Babka, Roman Gushchin
Cc: linux-kernel, cgroups, linux-mm, Shakeel Butt, Muchun Song,
Alex Shi, Chris Down, Yafang Shao, Wei Yang, Masayoshi Mizuma,
Xing Zhengjun, Matthew Wilcox, Waiman Long
Most kmem_cache_alloc() calls are from user context. With instrumentation
enabled, the measured number of kmem_cache_alloc() calls made from
non-task context was only about 0.01% of the total.
The irq disable/enable sequence used in this case to access the object
stock is slow. To optimize for user context access, there are now two
sets of object stocks (in the new obj_stock structure) for task context
and interrupt context access respectively.
The task context object stock can be accessed after disabling preemption,
which is cheap in a non-preempt kernel. The interrupt context object
stock can only be accessed after disabling interrupts. User context code
can access the interrupt object stock, but not vice versa.
The downside of this change is that more data are stored in the local
object stocks without being reflected in the charge counter and the
vmstat arrays. However, this is a small price to pay for better
performance.
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
mm/memcontrol.c | 94 +++++++++++++++++++++++++++++++++++--------------
1 file changed, 68 insertions(+), 26 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f87d0b05092..c79b9837cb85 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -782,6 +782,10 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
+/*
+ * mod_objcg_mlstate() may be called with irq enabled, so
+ * mod_memcg_lruvec_state() should be used.
+ */
static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
@@ -792,7 +796,7 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
}
@@ -2054,17 +2058,23 @@ void unlock_page_memcg(struct page *page)
}
EXPORT_SYMBOL(unlock_page_memcg);
-struct memcg_stock_pcp {
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
-
+struct obj_stock {
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
+#else
+ int dummy[0];
#endif
+};
+
+struct memcg_stock_pcp {
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ unsigned int nr_pages;
+ struct obj_stock task_obj;
+ struct obj_stock irq_obj;
struct work_struct work;
unsigned long flags;
@@ -2074,12 +2084,12 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static void drain_obj_stock(struct obj_stock *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
#else
-static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+static inline void drain_obj_stock(struct obj_stock *stock)
{
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -2089,6 +2099,40 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
}
#endif
+/*
+ * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
+ * sequence used in this case to access content from object stock is slow.
+ * To optimize for user context access, there are now two object stocks for
+ * task context and interrupt context access respectively.
+ *
+ * The task context object stock can be accessed by disabling preemption only
+ * which is cheap in non-preempt kernel. The interrupt context object stock
+ * can only be accessed after disabling interrupt. User context code can
+ * access interrupt object stock, but not vice versa.
+ */
+static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
+{
+ struct memcg_stock_pcp *stock;
+
+ if (likely(in_task())) {
+ preempt_disable();
+ stock = this_cpu_ptr(&memcg_stock);
+ return &stock->task_obj;
+ } else {
+ local_irq_save(*pflags);
+ stock = this_cpu_ptr(&memcg_stock);
+ return &stock->irq_obj;
+ }
+}
+
+static inline void put_obj_stock(unsigned long flags)
+{
+ if (likely(in_task()))
+ preempt_enable();
+ else
+ local_irq_restore(flags);
+}
+
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
@@ -2155,7 +2199,9 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
- drain_obj_stock(stock);
+ drain_obj_stock(&stock->irq_obj);
+ if (in_task())
+ drain_obj_stock(&stock->task_obj);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -3015,13 +3061,10 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
int *bytes;
- local_irq_save(flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat or idx
@@ -3066,29 +3109,26 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
- local_irq_restore(flags);
+ put_obj_stock(flags);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
bool ret = false;
- local_irq_save(flags);
-
- stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
- local_irq_restore(flags);
+ put_obj_stock(flags);
return ret;
}
-static void drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
@@ -3144,8 +3184,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (in_task() && stock->task_obj.cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+ if (stock->irq_obj.cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3156,13 +3201,10 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool overfill)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
unsigned int nr_pages = 0;
- local_irq_save(flags);
-
- stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
drain_obj_stock(stock);
obj_cgroup_get(objcg);
@@ -3177,7 +3219,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_irq_restore(flags);
+ put_obj_stock(flags);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
--
2.18.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
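To illustrate the two-stock split introduced by this patch, here is a
minimal user-space model; in_task() is reduced to a boolean parameter,
and the struct and helper names are stand-ins for the kernel code above
rather than real APIs. Each context gets its own stock, so the task
path can use the cheaper guard without racing against the interrupt
path:

#include <stdbool.h>
#include <stdio.h>

struct obj_stock {
        unsigned int nr_bytes;
};

/* Models the task_obj/irq_obj pair in memcg_stock_pcp. */
static struct obj_stock task_obj, irq_obj;

static struct obj_stock *get_obj_stock(bool in_task)
{
        if (in_task) {
                /* Kernel: preempt_disable(), cheap in a non-preempt kernel. */
                return &task_obj;
        }
        /* Kernel: local_irq_save(), required in interrupt context. */
        return &irq_obj;
}

int main(void)
{
        get_obj_stock(true)->nr_bytes += 64;    /* task-context charge */
        get_obj_stock(false)->nr_bytes += 64;   /* irq-context charge */
        printf("task stock: %u bytes, irq stock: %u bytes\n",
               task_obj.nr_bytes, irq_obj.nr_bytes);
        return 0;
}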