linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq
@ 2023-05-25  4:35 Ming Lei
  2023-05-25 14:11 ` Michal Koutný
                   ` (3 more replies)
  0 siblings, 4 replies; 21+ messages in thread
From: Ming Lei @ 2023-05-25  4:35 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, stable, Jay Shin, Waiman Long, Tejun Heo,
	mkoutny, Yosry Ahmed

As noted by Michal, the blkg_iostat_set's in the lockless list hold
reference to blkg's to protect against their removal. Those blkg's
hold reference to blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn() which
is called when the blkcg reference count reaches 0. This circular
dependency will prevent blkcg and some blkgs from being freed after
they are made offline.

It is less a problem if the cgroup to be destroyed also has other
controllers like memory that will call cgroup_rstat_flush() which will
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed leaking more
and more memory over time.

To prevent this potential memory leak:

- flush blkcg per-cpu stats list in __blkg_release(), when no new stat
can be added

- add global blkg_stat_lock for covering concurrent parent blkg stat
update

- don't grab bio->bi_blkg reference when adding the stats into blkcg's
per-cpu stat list since all stats are guaranteed to be consumed before
releasing blkg instance, and grabbing blkg reference for stats was the
most fragile part of original patch

Based on Waiman's patch:

https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/

Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: stable@vger.kernel.org
Reported-by: Jay Shin <jaeshin@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: mkoutny@suse.com
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
V3:
	- add one global blkg_stat_lock for avoiding concurrent update on
	blkg stat; this way is easier for backport, also won't cause contention;

V2:
	- remove kernel/cgroup change, and call blkcg_rstat_flush()
	to flush stat directly

 block/blk-cgroup.c | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..f0b5c9c41cde 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -34,6 +34,8 @@
 #include "blk-ioprio.h"
 #include "blk-throttle.h"
 
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
  * blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -56,6 +58,8 @@ static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 bool blkcg_debug_stats = false;
 
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
 #define BLKG_DESTROY_BATCH_SIZE  64
 
 /*
@@ -163,10 +167,20 @@ static void blkg_free(struct blkcg_gq *blkg)
 static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+	struct blkcg *blkcg = blkg->blkcg;
+	int cpu;
 
 #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
 #endif
+	/*
+	 * Flush all the non-empty percpu lockless lists before releasing
+	 * us, given these stats belong to us.
+	 *
+	 * blkg_stat_lock is for serializing blkg stat update
+	 */
+	for_each_possible_cpu(cpu)
+		__blkcg_rstat_flush(blkcg, cpu);
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -951,23 +965,26 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 }
 
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
 	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
 	struct llist_node *lnode;
 	struct blkg_iostat_set *bisc, *next_bisc;
 
-	/* Root-level stats are sourced from system-wide IO stats */
-	if (!cgroup_parent(css->cgroup))
-		return;
-
 	rcu_read_lock();
 
 	lnode = llist_del_all(lhead);
 	if (!lnode)
 		goto out;
 
+	/*
+	 * For covering concurrent parent blkg update from blkg_release().
+	 *
+	 * When flushing from cgroup, cgroup_rstat_lock is always held, so
+	 * this lock won't cause contention most of time.
+	 */
+	raw_spin_lock(&blkg_stat_lock);
+
 	/*
 	 * Iterate only the iostat_cpu's queued in the lockless list.
 	 */
@@ -991,13 +1008,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
-		percpu_ref_put(&blkg->refcnt);
 	}
-
+	raw_spin_unlock(&blkg_stat_lock);
 out:
 	rcu_read_unlock();
 }
 
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	/* Root-level stats are sourced from system-wide IO stats */
+	if (cgroup_parent(css->cgroup))
+		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
 /*
  * We source root cgroup stats from the system-wide stats to avoid
  * tracking the same information twice and incurring overhead when no
@@ -2075,7 +2098,6 @@ void blk_cgroup_bio_start(struct bio *bio)
 
 		llist_add(&bis->lnode, lhead);
 		WRITE_ONCE(bis->lqueued, true);
-		percpu_ref_get(&bis->blkg->refcnt);
 	}
 
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25  4:35 [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq Ming Lei
@ 2023-05-25 14:11 ` Michal Koutný
  2023-05-25 15:21   ` Waiman Long
  2023-05-25 15:33   ` Ming Lei
  2023-05-25 16:01 ` [PATCH] " Waiman Long
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 21+ messages in thread
From: Michal Koutný @ 2023-05-25 14:11 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, stable, Jay Shin, Waiman Long,
	Tejun Heo, Yosry Ahmed

[-- Attachment #1: Type: text/plain, Size: 1650 bytes --]

On Thu, May 25, 2023 at 12:35:18PM +0800, Ming Lei <ming.lei@redhat.com> wrote:
> It is less a problem if the cgroup to be destroyed also has other
> controllers like memory that will call cgroup_rstat_flush() which will
> clean up the reference count. If block is the only controller that uses
> rstat, these offline blkcg and blkgs may never be freed leaking more
> and more memory over time.

On v2, io implies memory too.
Do you observe the leak on the v2 system too?

(Beware that (not only) dirty pages would pin offlined memcg, so the
actual mem_cgroup_css_release and cgroup_rstat_flush would be further
delayed.)

> To prevent this potential memory leak:
> 
> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> can be added
> 
> - add global blkg_stat_lock for covering concurrent parent blkg stat
> update

It's bit unfortunate yet another lock is added :-/

IIUC, even Waiman's patch (flush in blkcg_destroy_blkcfs) would need
synchronization for different CPU replicas flushes in
blkcg_iostat_update, right?

> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> per-cpu stat list since all stats are guaranteed to be consumed before
> releasing blkg instance, and grabbing blkg reference for stats was the
> most fragile part of original patch


At one moment, the lhead -> blkcg_gq reference seemed alright to me and
consequently blkcg_gq -> blkcg is the one that looks reversed (forming
the cycle). But changing its direction would be much more fundamental
change, it'd need also kind of blkcg_gq reparenting -- similarly to
memcg.


Thanks,
Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25 14:11 ` Michal Koutný
@ 2023-05-25 15:21   ` Waiman Long
  2023-05-25 15:33   ` Ming Lei
  1 sibling, 0 replies; 21+ messages in thread
From: Waiman Long @ 2023-05-25 15:21 UTC (permalink / raw)
  To: Michal Koutný, Ming Lei
  Cc: Jens Axboe, linux-block, stable, Jay Shin, Tejun Heo, Yosry Ahmed

On 5/25/23 10:11, Michal Koutný wrote:
> On Thu, May 25, 2023 at 12:35:18PM +0800, Ming Lei <ming.lei@redhat.com> wrote:
>> It is less a problem if the cgroup to be destroyed also has other
>> controllers like memory that will call cgroup_rstat_flush() which will
>> clean up the reference count. If block is the only controller that uses
>> rstat, these offline blkcg and blkgs may never be freed leaking more
>> and more memory over time.
> On v2, io implies memory too.
> Do you observe the leak on the v2 system too?
>
> (Beware that (not only) dirty pages would pin offlined memcg, so the
> actual mem_cgroup_css_release and cgroup_rstat_flush would be further
> delayed.)

The memory leak isn't observed on cgroup v2 as the cgroup_rstat_flush() 
call by the memory controller will help to flush the extra references 
holding back blkcgs. It is only happening with a cgroup v1 configuration 
where blkcg is standalone in a cgroup.

>
>> To prevent this potential memory leak:
>>
>> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
>> can be added
>>
>> - add global blkg_stat_lock for covering concurrent parent blkg stat
>> update
> It's bit unfortunate yet another lock is added :-/
>
> IIUC, even Waiman's patch (flush in blkcg_destroy_blkcfs) would need
> synchronization for different CPU replicas flushes in
> blkcg_iostat_update, right?

Right, the goal is to prevent concurrent call of blkcg_iostat_update() 
to the same blkg.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25 14:11 ` Michal Koutný
  2023-05-25 15:21   ` Waiman Long
@ 2023-05-25 15:33   ` Ming Lei
  1 sibling, 0 replies; 21+ messages in thread
From: Ming Lei @ 2023-05-25 15:33 UTC (permalink / raw)
  To: Michal Koutný
  Cc: Jens Axboe, linux-block, stable, Jay Shin, Waiman Long,
	Tejun Heo, Yosry Ahmed

On Thu, May 25, 2023 at 04:11:34PM +0200, Michal Koutný wrote:
> On Thu, May 25, 2023 at 12:35:18PM +0800, Ming Lei <ming.lei@redhat.com> wrote:
> > It is less a problem if the cgroup to be destroyed also has other
> > controllers like memory that will call cgroup_rstat_flush() which will
> > clean up the reference count. If block is the only controller that uses
> > rstat, these offline blkcg and blkgs may never be freed leaking more
> > and more memory over time.
> 
> On v2, io implies memory too.
> Do you observe the leak on the v2 system too?

blkg leak is only observed on v1.

> 
> (Beware that (not only) dirty pages would pin offlined memcg, so the
> actual mem_cgroup_css_release and cgroup_rstat_flush would be further
> delayed.)
> 
> > To prevent this potential memory leak:
> > 
> > - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> > can be added
> > 
> > - add global blkg_stat_lock for covering concurrent parent blkg stat
> > update
> 
> It's bit unfortunate yet another lock is added :-/

Cause it should be the simplest one for backport, :-)

> 
> IIUC, even Waiman's patch (flush in blkcg_destroy_blkcfs) would need
> synchronization for different CPU replicas flushes in
> blkcg_iostat_update, right?
> 
> > - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> > per-cpu stat list since all stats are guaranteed to be consumed before
> > releasing blkg instance, and grabbing blkg reference for stats was the
> > most fragile part of original patch
> 
> 
> At one moment, the lhead -> blkcg_gq reference seemed alright to me and

But bio->bi_blkg has grabbed one reference in blk_cgroup_bio_start
already.

> consequently blkcg_gq -> blkcg is the one that looks reversed (forming

IMO, this one is correct, cause blkcg_gq depends on blkcg.

> the cycle). But changing its direction would be much more fundamental
> change, it'd need also kind of blkcg_gq reparenting -- similarly to
> memcg.

Thanks, 
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25  4:35 [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq Ming Lei
  2023-05-25 14:11 ` Michal Koutný
@ 2023-05-25 16:01 ` Waiman Long
  2023-05-25 16:06   ` Waiman Long
  2023-06-06  3:49 ` [PATCH V3] " Ming Lei
  2023-06-08 23:10 ` Tejun Heo
  3 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2023-05-25 16:01 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo, Josef Bacik, Zefan Li, Johannes Weiner
  Cc: cgroups, linux-block, linux-kernel, Michal Koutný,
	Yosry Ahmed, Jay Shin, Waiman Long, stable, Ming Lei

As noted by Michal, the blkg_iostat_set's in the lockless list hold
reference to blkg's to protect against their removal. Those blkg's
hold reference to blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn() which
is called when the blkcg reference count reaches 0. This circular
dependency will prevent blkcg and some blkgs from being freed after
they are made offline.

It is less a problem if the cgroup to be destroyed also has other
controllers like memory that will call cgroup_rstat_flush() which will
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed leaking more
and more memory over time.

To prevent this potential memory leak:

- flush blkcg per-cpu stats list in __blkg_release(), when no new stat
  can be added to avoid use-after-free of the percpu blkg_iostat_set in
  future cgroup_rstat_flush*() calls.

- add a cgroup_rstat_flush_acquire() helper and call it to acquire
  cgroup_rstat_lock to block concurrent execution of other
  cgroup_rstat_flush*() calls

- don't grab bio->bi_blkg reference when adding the stats into blkcg's
  per-cpu stat list since all stats are guaranteed to be consumed before
  releasing blkg instance, and grabbing blkg reference for stats was
  the most fragile part of original patch

Based on Waiman's patch:

https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/

Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: stable@vger.kernel.org
Reported-by: Jay Shin <jaeshin@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: mkoutny@suse.com
Cc: Yosry Ahmed <yosryahmed@google.com>
Co-developed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Waiman Long <longman@redhat.com>
---
 block/blk-cgroup.c     | 57 +++++++++++++++++++++++++++++++-----------
 include/linux/cgroup.h |  1 +
 kernel/cgroup/rstat.c  | 15 ++++++++++-
 3 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..90c2efc3767f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -160,13 +160,39 @@ static void blkg_free(struct blkcg_gq *blkg)
 	schedule_work(&blkg->free_work);
 }
 
+static void __blkcg_rstat_flush(struct llist_node *lnode);
+
 static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+	struct blkcg *blkcg = blkg->blkcg;
+	int cpu;
 
 #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
 #endif
+	/*
+	 * Flush all the non-empty percpu lockless lists before releasing
+	 * us, given these stats belong to us.
+	 *
+	 * Hold the cgroup_rstat_lock before calling __blkcg_rstat_flush()
+	 * to block concurrent cgroup_rstat_flush*() calls.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+		struct llist_node *lnode;
+
+		if (llist_empty(lhead))
+			continue;
+
+		lnode = llist_del_all(lhead);
+		if (!lnode)
+			continue;
+
+		cgroup_rstat_flush_acquire();
+		__blkcg_rstat_flush(lnode);
+		cgroup_rstat_flush_release();
+	}
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -951,23 +977,12 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 }
 
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct llist_node *lnode)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
-	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
-	struct llist_node *lnode;
 	struct blkg_iostat_set *bisc, *next_bisc;
 
-	/* Root-level stats are sourced from system-wide IO stats */
-	if (!cgroup_parent(css->cgroup))
-		return;
-
 	rcu_read_lock();
 
-	lnode = llist_del_all(lhead);
-	if (!lnode)
-		goto out;
-
 	/*
 	 * Iterate only the iostat_cpu's queued in the lockless list.
 	 */
@@ -991,13 +1006,26 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
-		percpu_ref_put(&blkg->refcnt);
 	}
 
-out:
 	rcu_read_unlock();
 }
 
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+	struct llist_node *lnode;
+
+	/* Root-level stats are sourced from system-wide IO stats */
+	if (!cgroup_parent(css->cgroup))
+		return;
+
+	lnode = llist_del_all(lhead);
+	if (lnode)
+		__blkcg_rstat_flush(lnode);
+}
+
 /*
  * We source root cgroup stats from the system-wide stats to avoid
  * tracking the same information twice and incurring overhead when no
@@ -2075,7 +2103,6 @@ void blk_cgroup_bio_start(struct bio *bio)
 
 		llist_add(&bis->lnode, lhead);
 		WRITE_ONCE(bis->lqueued, true);
-		percpu_ref_get(&bis->blkg->refcnt);
 	}
 
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 885f5395fcd0..88e6647f49df 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -694,6 +694,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
 void cgroup_rstat_flush(struct cgroup *cgrp);
 void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
 void cgroup_rstat_flush_hold(struct cgroup *cgrp);
+void cgroup_rstat_flush_acquire(void);
 void cgroup_rstat_flush_release(void);
 
 /*
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9c4c55228567..b0fd4e27f466 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -273,7 +273,20 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ * cgroup_rstat_flush_acquire - acquire cgroup_rstat_lock
+ *
+ * Callers can acquire the internal cgroup_rstat_lock to prevent concurrent
+ * execution of cgroup_rstat_flush*() and the controller callbacks.
+ */
+void cgroup_rstat_flush_acquire(void)
+	__acquires(&cgroup_rstat_lock)
+{
+	spin_lock_irq(&cgroup_rstat_lock);
+}
+
+/**
+ * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() or
+ *				cgroup_rstat_flush_acquire()
  */
 void cgroup_rstat_flush_release(void)
 	__releases(&cgroup_rstat_lock)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25 16:01 ` [PATCH] " Waiman Long
@ 2023-05-25 16:06   ` Waiman Long
  2023-05-26  0:34     ` Ming Lei
  0 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2023-05-25 16:06 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo, Josef Bacik, Zefan Li, Johannes Weiner
  Cc: cgroups, linux-block, linux-kernel, Michal Koutný,
	Yosry Ahmed, Jay Shin, stable, Ming Lei

On 5/25/23 12:01, Waiman Long wrote:
> As noted by Michal, the blkg_iostat_set's in the lockless list hold
> reference to blkg's to protect against their removal. Those blkg's
> hold reference to blkcg. When a cgroup is being destroyed,
> cgroup_rstat_flush() is only called at css_release_work_fn() which
> is called when the blkcg reference count reaches 0. This circular
> dependency will prevent blkcg and some blkgs from being freed after
> they are made offline.
>
> It is less a problem if the cgroup to be destroyed also has other
> controllers like memory that will call cgroup_rstat_flush() which will
> clean up the reference count. If block is the only controller that uses
> rstat, these offline blkcg and blkgs may never be freed leaking more
> and more memory over time.
>
> To prevent this potential memory leak:
>
> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
>    can be added to avoid use-after-free of the percpu blkg_iostat_set in
> >    future cgroup_rstat_flush*() calls.
>
> - add a cgroup_rstat_flush_acquire() helper and call it to acquire
>    cgroup_rstat_lock to block concurrent execution of other
>    cgroup_rstat_flush*() calls
>
> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
>    per-cpu stat list since all stats are guaranteed to be consumed before
>    releasing blkg instance, and grabbing blkg reference for stats was
>    the most fragile part of original patch
>
> Based on Waiman's patch:
>
> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
>
> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> Cc: stable@vger.kernel.org
> Reported-by: Jay Shin <jaeshin@redhat.com>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: mkoutny@suse.com
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Co-developed-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>   block/blk-cgroup.c     | 57 +++++++++++++++++++++++++++++++-----------
>   include/linux/cgroup.h |  1 +
>   kernel/cgroup/rstat.c  | 15 ++++++++++-
>   3 files changed, 57 insertions(+), 16 deletions(-)

This is my counter-proposal to Ming's v3 patch. The major difference is 
that I used the existing cgroup_rstat_lock instead of adding a new 
internal lock. This minimizes performance impact to existing 
cgroup_rstat_flush*() call while achieving the same objective. I am fine 
with Ming current v3 patch if we decide to go that way.

Thanks,
Longman

>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 0ce64dd73cfe..90c2efc3767f 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -160,13 +160,39 @@ static void blkg_free(struct blkcg_gq *blkg)
>   	schedule_work(&blkg->free_work);
>   }
>   
> +static void __blkcg_rstat_flush(struct llist_node *lnode);
> +
>   static void __blkg_release(struct rcu_head *rcu)
>   {
>   	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
> +	struct blkcg *blkcg = blkg->blkcg;
> +	int cpu;
>   
>   #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>   	WARN_ON(!bio_list_empty(&blkg->async_bios));
>   #endif
> +	/*
> +	 * Flush all the non-empty percpu lockless lists before releasing
> +	 * us, given these stats belong to us.
> +	 *
> +	 * Hold the cgroup_rstat_lock before calling __blkcg_rstat_flush()
> +	 * to block concurrent cgroup_rstat_flush*() calls.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> +		struct llist_node *lnode;
> +
> +		if (llist_empty(lhead))
> +			continue;
> +
> +		lnode = llist_del_all(lhead);
> +		if (!lnode)
> +			continue;
> +
> +		cgroup_rstat_flush_acquire();
> +		__blkcg_rstat_flush(lnode);
> +		cgroup_rstat_flush_release();
> +	}
>   
>   	/* release the blkcg and parent blkg refs this blkg has been holding */
>   	css_put(&blkg->blkcg->css);
> @@ -951,23 +977,12 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
>   	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>   }
>   
> -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> +static void __blkcg_rstat_flush(struct llist_node *lnode)
>   {
> -	struct blkcg *blkcg = css_to_blkcg(css);
> -	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> -	struct llist_node *lnode;
>   	struct blkg_iostat_set *bisc, *next_bisc;
>   
> -	/* Root-level stats are sourced from system-wide IO stats */
> -	if (!cgroup_parent(css->cgroup))
> -		return;
> -
>   	rcu_read_lock();
>   
> -	lnode = llist_del_all(lhead);
> -	if (!lnode)
> -		goto out;
> -
>   	/*
>   	 * Iterate only the iostat_cpu's queued in the lockless list.
>   	 */
> @@ -991,13 +1006,26 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>   		if (parent && parent->parent)
>   			blkcg_iostat_update(parent, &blkg->iostat.cur,
>   					    &blkg->iostat.last);
> -		percpu_ref_put(&blkg->refcnt);
>   	}
>   
> -out:
>   	rcu_read_unlock();
>   }
>   
> +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> +{
> +	struct blkcg *blkcg = css_to_blkcg(css);
> +	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> +	struct llist_node *lnode;
> +
> +	/* Root-level stats are sourced from system-wide IO stats */
> +	if (!cgroup_parent(css->cgroup))
> +		return;
> +
> +	lnode = llist_del_all(lhead);
> +	if (lnode)
> +		__blkcg_rstat_flush(lnode);
> +}
> +
>   /*
>    * We source root cgroup stats from the system-wide stats to avoid
>    * tracking the same information twice and incurring overhead when no
> @@ -2075,7 +2103,6 @@ void blk_cgroup_bio_start(struct bio *bio)
>   
>   		llist_add(&bis->lnode, lhead);
>   		WRITE_ONCE(bis->lqueued, true);
> -		percpu_ref_get(&bis->blkg->refcnt);
>   	}
>   
>   	u64_stats_update_end_irqrestore(&bis->sync, flags);
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 885f5395fcd0..88e6647f49df 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -694,6 +694,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
>   void cgroup_rstat_flush(struct cgroup *cgrp);
>   void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
>   void cgroup_rstat_flush_hold(struct cgroup *cgrp);
> +void cgroup_rstat_flush_acquire(void);
>   void cgroup_rstat_flush_release(void);
>   
>   /*
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 9c4c55228567..b0fd4e27f466 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -273,7 +273,20 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
>   }
>   
>   /**
> - * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
> + * cgroup_rstat_flush_acquire - acquire cgroup_rstat_lock
> + *
> + * Callers can acquire the internal cgroup_rstat_lock to prevent concurrent
> + * execution of cgroup_rstat_flush*() and the controller callbacks.
> + */
> +void cgroup_rstat_flush_acquire(void)
> +	__acquires(&cgroup_rstat_lock)
> +{
> +	spin_lock_irq(&cgroup_rstat_lock);
> +}
> +
> +/**
> + * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() or
> + *				cgroup_rstat_flush_acquire()
>    */
>   void cgroup_rstat_flush_release(void)
>   	__releases(&cgroup_rstat_lock)


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25 16:06   ` Waiman Long
@ 2023-05-26  0:34     ` Ming Lei
  0 siblings, 0 replies; 21+ messages in thread
From: Ming Lei @ 2023-05-26  0:34 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jens Axboe, Tejun Heo, Josef Bacik, Zefan Li, Johannes Weiner,
	cgroups, linux-block, linux-kernel, Michal Koutný,
	Yosry Ahmed, Jay Shin, stable, ming.lei

On Thu, May 25, 2023 at 12:06:07PM -0400, Waiman Long wrote:
> On 5/25/23 12:01, Waiman Long wrote:
> > As noted by Michal, the blkg_iostat_set's in the lockless list hold
> > reference to blkg's to protect against their removal. Those blkg's
> > hold reference to blkcg. When a cgroup is being destroyed,
> > cgroup_rstat_flush() is only called at css_release_work_fn() which
> > is called when the blkcg reference count reaches 0. This circular
> > dependency will prevent blkcg and some blkgs from being freed after
> > they are made offline.
> > 
> > It is less a problem if the cgroup to be destroyed also has other
> > controllers like memory that will call cgroup_rstat_flush() which will
> > clean up the reference count. If block is the only controller that uses
> > rstat, these offline blkcg and blkgs may never be freed leaking more
> > and more memory over time.
> > 
> > To prevent this potential memory leak:
> > 
> > - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> >    can be added to avoid use-after-free of the percpu blkg_iostat_set in
> >    future cgroup_rstat_flush*() calls.
> > 
> > - add a cgroup_rstat_flush_acquire() helper and call it to acquire
> >    cgroup_rstat_lock to block concurrent execution of other
> >    cgroup_rstat_flush*() calls
> > 
> > - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> >    per-cpu stat list since all stats are guaranteed to be consumed before
> >    releasing blkg instance, and grabbing blkg reference for stats was
> >    the most fragile part of original patch
> > 
> > Based on Waiman's patch:
> > 
> > https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> > 
> > Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> > Cc: stable@vger.kernel.org
> > Reported-by: Jay Shin <jaeshin@redhat.com>
> > Cc: Waiman Long <longman@redhat.com>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: mkoutny@suse.com
> > Cc: Yosry Ahmed <yosryahmed@google.com>
> > Co-developed-by: Ming Lei <ming.lei@redhat.com>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > Signed-off-by: Waiman Long <longman@redhat.com>
> > ---
> >   block/blk-cgroup.c     | 57 +++++++++++++++++++++++++++++++-----------
> >   include/linux/cgroup.h |  1 +
> >   kernel/cgroup/rstat.c  | 15 ++++++++++-
> >   3 files changed, 57 insertions(+), 16 deletions(-)
> 
> This is my counter-proposal to Ming's v3 patch. The major difference is that
> I used the existing cgroup_rstat_lock instead of adding a new internal lock.
> This minimizes performance impact to existing cgroup_rstat_flush*() call

The added internal lock has ~zero perf impact on rstat flush cause
the lock won't be contended basically.

> while achieving the same objective. I am fine with Ming current v3 patch if
> we decide to go that way.

As I mentioned, the main motivation with internal lock is to make the fix as
simple as possible since cross-subsystem change isn't involved, and I am fine
with any following cleanup or improvement on current blkg rstat flush.

Another benefit with this internal lock is that race in blkcg_reset_stats()
can be avoided.


Thanks, 
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25  4:35 [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq Ming Lei
  2023-05-25 14:11 ` Michal Koutný
  2023-05-25 16:01 ` [PATCH] " Waiman Long
@ 2023-06-06  3:49 ` Ming Lei
  2023-06-08 23:07   ` Tejun Heo
  2023-06-08 23:10 ` Tejun Heo
  3 siblings, 1 reply; 21+ messages in thread
From: Ming Lei @ 2023-06-06  3:49 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: linux-block, stable, Jay Shin, Waiman Long, mkoutny, Yosry Ahmed,
	ming.lei

On Thu, May 25, 2023 at 12:35:18PM +0800, Ming Lei wrote:
> As noted by Michal, the blkg_iostat_set's in the lockless list hold
> reference to blkg's to protect against their removal. Those blkg's
> hold reference to blkcg. When a cgroup is being destroyed,
> cgroup_rstat_flush() is only called at css_release_work_fn() which
> is called when the blkcg reference count reaches 0. This circular
> dependency will prevent blkcg and some blkgs from being freed after
> they are made offline.
> 
> It is less a problem if the cgroup to be destroyed also has other
> controllers like memory that will call cgroup_rstat_flush() which will
> clean up the reference count. If block is the only controller that uses
> rstat, these offline blkcg and blkgs may never be freed leaking more
> and more memory over time.
> 
> To prevent this potential memory leak:
> 
> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> can be added
> 
> - add global blkg_stat_lock for covering concurrent parent blkg stat
> update
> 
> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> per-cpu stat list since all stats are guaranteed to be consumed before
> releasing blkg instance, and grabbing blkg reference for stats was the
> most fragile part of original patch
> 
> Based on Waiman's patch:
> 
> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> 
> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> Cc: stable@vger.kernel.org
> Reported-by: Jay Shin <jaeshin@redhat.com>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: mkoutny@suse.com
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> V3:
> 	- add one global blkg_stat_lock for avoiding concurrent update on
> 	blkg stat; this way is easier for backport, also won't cause contention;

Hello Jens and Tejun,

Can we move on with this patch or Waiman's version[1]?

I am fine with either one.

[1] https://lore.kernel.org/linux-block/20230525160105.1968749-1-longman@redhat.com/

Thanks, 
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-06-06  3:49 ` [PATCH V3] " Ming Lei
@ 2023-06-08 23:07   ` Tejun Heo
  0 siblings, 0 replies; 21+ messages in thread
From: Tejun Heo @ 2023-06-08 23:07 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, stable, Jay Shin, Waiman Long, mkoutny,
	Yosry Ahmed

On Tue, Jun 06, 2023 at 11:49:13AM +0800, Ming Lei wrote:
> > V3:
> > 	- add one global blkg_stat_lock for avoiding concurrent update on
> > 	blkg stat; this way is easier for backport, also won't cause contention;
> 
> Hello Jens and Tejun,
> 
> Can we move on with this patch or Waiman's version[1]?
> 
> I am fine with either one.
> 
> [1] https://lore.kernel.org/linux-block/20230525160105.1968749-1-longman@redhat.com/

I personally like Ming's version with a separate lock as it's a bit simpler.
Let's go with that one.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25  4:35 [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq Ming Lei
                   ` (2 preceding siblings ...)
  2023-06-06  3:49 ` [PATCH V3] " Ming Lei
@ 2023-06-08 23:10 ` Tejun Heo
  3 siblings, 0 replies; 21+ messages in thread
From: Tejun Heo @ 2023-06-08 23:10 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, stable, Jay Shin, Waiman Long, mkoutny,
	Yosry Ahmed

On Thu, May 25, 2023 at 12:35:18PM +0800, Ming Lei wrote:
> As noted by Michal, the blkg_iostat_set's in the lockless list hold
> reference to blkg's to protect against their removal. Those blkg's
> hold reference to blkcg. When a cgroup is being destroyed,
> cgroup_rstat_flush() is only called at css_release_work_fn() which
> is called when the blkcg reference count reaches 0. This circular
> dependency will prevent blkcg and some blkgs from being freed after
> they are made offline.
> 
> It is less a problem if the cgroup to be destroyed also has other
> controllers like memory that will call cgroup_rstat_flush() which will
> clean up the reference count. If block is the only controller that uses
> rstat, these offline blkcg and blkgs may never be freed leaking more
> and more memory over time.
> 
> To prevent this potential memory leak:
> 
> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> can be added
> 
> - add global blkg_stat_lock for covering concurrent parent blkg stat
> update
> 
> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> per-cpu stat list since all stats are guaranteed to be consumed before
> releasing blkg instance, and grabbing blkg reference for stats was the
> most fragile part of original patch
> 
> Based on Waiman's patch:
> 
> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> 
> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> Cc: stable@vger.kernel.org
> Reported-by: Jay Shin <jaeshin@redhat.com>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: mkoutny@suse.com
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25 15:25       ` Waiman Long
@ 2023-05-26 21:11         ` Michal Koutný
  0 siblings, 0 replies; 21+ messages in thread
From: Michal Koutný @ 2023-05-26 21:11 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ming Lei, Yosry Ahmed, Linux-MM, Michal Hocko, Shakeel Butt,
	Johannes Weiner, Roman Gushchin, Muchun Song, Jens Axboe,
	linux-block, cgroups, Tejun Heo

[-- Attachment #1: Type: text/plain, Size: 600 bytes --]

On Thu, May 25, 2023 at 11:25:05AM -0400, Waiman Long <longman@redhat.com> wrote:
> Since the percpu blkg_iostat_set's that are linked in the lockless list will
> be freed if the corresponding blkcg_gq is freed, we need to flush the
> lockless list to avoid potential use-after-free in a future
> cgroup_rstat_flush*() call.

Ah, so that was meant to the situation post-patch (that removes refcnt
of entries on list lockless).

(It sounded like an answer to Yosry's question about
cgroup_rstat_flush in offline_css in pre-patch version. Nevermind, this
would need other adjustments.)

Thanks,
Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-25 14:11     ` Michal Koutný
@ 2023-05-25 15:25       ` Waiman Long
  2023-05-26 21:11         ` Michal Koutný
  0 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2023-05-25 15:25 UTC (permalink / raw)
  To: Michal Koutný, Ming Lei
  Cc: Yosry Ahmed, Linux-MM, Michal Hocko, Shakeel Butt,
	Johannes Weiner, Roman Gushchin, Muchun Song, Jens Axboe,
	linux-block, cgroups, Tejun Heo


On 5/25/23 10:11, Michal Koutný wrote:
> On Wed, May 24, 2023 at 10:37:10AM +0800, Ming Lei <ming.lei@redhat.com> wrote:
>>> I am not at all familiar with blkcg, but does calling
>>> cgroup_rstat_flush() in offline_css() fix the problem?
>> Except for offline, this list needs to be flushed after the associated disk
>> is deleted.
> Why the second flush trigger?
> a) To break another ref-dependency cycle (like on the blkcg side)?
> b) To avoid stale data upon device removal?
>
> (Because b) should be unnecessary, a future reader would flush when
> needed.)

Since the percpu blkg_iostat_set's that are linked in the lockless list 
will be freed if the corresponding blkcg_gq is freed, we need to flush 
the lockless list to avoid potential use-after-free in a future 
cgroup_rstat_flush*() call.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  2:37   ` Ming Lei
  2023-05-24  2:43     ` Yosry Ahmed
  2023-05-24  4:10     ` Waiman Long
@ 2023-05-25 14:11     ` Michal Koutný
  2023-05-25 15:25       ` Waiman Long
  2 siblings, 1 reply; 21+ messages in thread
From: Michal Koutný @ 2023-05-25 14:11 UTC (permalink / raw)
  To: Ming Lei
  Cc: Yosry Ahmed, Linux-MM, Michal Hocko, Shakeel Butt,
	Johannes Weiner, Roman Gushchin, Muchun Song, Jens Axboe,
	linux-block, Waiman Long, cgroups, Tejun Heo

[-- Attachment #1: Type: text/plain, Size: 544 bytes --]

On Wed, May 24, 2023 at 10:37:10AM +0800, Ming Lei <ming.lei@redhat.com> wrote:
> > I am not at all familiar with blkcg, but does calling
> > cgroup_rstat_flush() in offline_css() fix the problem?
> 
> Except for offline, this list needs to be flushed after the associated disk
> is deleted.

Why the second flush trigger?
a) To break another ref-dependency cycle (like on the blkcg side)?
b) To avoid stale data upon device removal?

(Because b) should be unnecessary, a future reader would flush when
needed.)

Thanks,
Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  4:10     ` Waiman Long
@ 2023-05-24  4:21       ` Ming Lei
  0 siblings, 0 replies; 21+ messages in thread
From: Ming Lei @ 2023-05-24  4:21 UTC (permalink / raw)
  To: Waiman Long
  Cc: Yosry Ahmed, Linux-MM, Michal Hocko, Shakeel Butt,
	Johannes Weiner, Roman Gushchin, Muchun Song, Jens Axboe,
	linux-block, cgroups, Tejun Heo, mkoutny, ming.lei

On Wed, May 24, 2023 at 12:10:55AM -0400, Waiman Long wrote:
> 
> On 5/23/23 22:37, Ming Lei wrote:
> > Hi Yosry,
> > 
> > On Tue, May 23, 2023 at 07:06:38PM -0700, Yosry Ahmed wrote:
> > > Hi Ming,
> > > 
> > > On Tue, May 23, 2023 at 6:21 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > As noted by Michal, the blkg_iostat_set's in the lockless list
> > > > hold reference to blkg's to protect against their removal. Those
> > > > blkg's hold reference to blkcg. When a cgroup is being destroyed,
> > > > cgroup_rstat_flush() is only called at css_release_work_fn() which
> > > > is called when the blkcg reference count reaches 0. This circular
> > > > dependency will prevent blkcg and some blkgs from being freed after
> > > > they are made offline.
> > > I am not at all familiar with blkcg, but does calling
> > > cgroup_rstat_flush() in offline_css() fix the problem?
> > Except for offline, this list needs to be flushed after the associated disk
> > is deleted.
> > 
> > > or can items be
> > > added to the lockless list(s) after the blkcg is offlined?
> > Yeah.
> > 
> > percpu_ref_*get(&blkg->refcnt) still can succeed after the percpu refcnt
> > is killed in blkg_destroy() which is called from both offline css and
> > removing disk.
> 
> As suggested by Tejun, we can use percpu_ref_tryget(&blkg->refcnt) to make
> sure that we can only take a reference when the blkg is online. I think it
> is a bit safer to take a percpu refcnt to avoid use after free. My other

blkg_release() does guarantee that no new stat associated with this
blkg can be added any more, so what is the use-after-free?

> concern about your patch is that the per cpu list iterations will be done
> multiple times when a blkcg is destroyed if many blkgs are attached to the

Yeah, but is it a big deal? The percpu list should be flushed in just one
blkg's release handler.

> blkcg. I still prefer to do it once in blkcg_destroy_blkgs(). I am going to

The problem is that new stats may still be added to this list after
percpu_ref_kill() returns.

To be honest, I really hate grabbing/putting the blkg refcnt for
adding/consuming stats, and this way is too fragile.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  4:04   ` Waiman Long
@ 2023-05-24  4:13     ` Yosry Ahmed
  0 siblings, 0 replies; 21+ messages in thread
From: Yosry Ahmed @ 2023-05-24  4:13 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ming Lei, Linux-MM, Michal Hocko, Shakeel Butt, Johannes Weiner,
	Roman Gushchin, Muchun Song, Jens Axboe, linux-block, cgroups,
	Tejun Heo, mkoutny

On Tue, May 23, 2023 at 9:04 PM Waiman Long <longman@redhat.com> wrote:
>
> On 5/23/23 22:06, Yosry Ahmed wrote:
> > Hi Ming,
> >
> > On Tue, May 23, 2023 at 6:21 PM Ming Lei <ming.lei@redhat.com> wrote:
> >> As noted by Michal, the blkg_iostat_set's in the lockless list
> >> hold reference to blkg's to protect against their removal. Those
> >> blkg's hold reference to blkcg. When a cgroup is being destroyed,
> >> cgroup_rstat_flush() is only called at css_release_work_fn() which
> >> is called when the blkcg reference count reaches 0. This circular
> >> dependency will prevent blkcg and some blkgs from being freed after
> >> they are made offline.
> > I am not at all familiar with blkcg, but does calling
> > cgroup_rstat_flush() in offline_css() fix the problem? or can items be
> > added to the lockless list(s) after the blkcg is offlined?
> >
> >> It is less a problem if the cgroup to be destroyed also has other
> >> controllers like memory that will call cgroup_rstat_flush() which will
> >> clean up the reference count. If block is the only controller that uses
> >> rstat, these offline blkcg and blkgs may never be freed leaking more
> >> and more memory over time.
> >>
> >> To prevent this potential memory leak:
> >>
> >> - a new cgroup_rstat_css_cpu_flush() function is added to flush stats for
> >> a given css and cpu. This new function will be called in __blkg_release().
> >>
> >> - don't grab bio->bi_blkg when adding the stats into blkcg's per-cpu
> >> stat list, and this kind of handling is the most fragile part of
> >> original patch
> >>
> >> Based on Waiman's patch:
> >>
> >> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> >>
> >> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> >> Cc: Waiman Long <longman@redhat.com>
> >> Cc: cgroups@vger.kernel.org
> >> Cc: Tejun Heo <tj@kernel.org>
> >> Cc: mkoutny@suse.com
> >> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >> ---
> >>   block/blk-cgroup.c     | 15 +++++++++++++--
> >>   include/linux/cgroup.h |  1 +
> >>   kernel/cgroup/rstat.c  | 18 ++++++++++++++++++
> >>   3 files changed, 32 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> >> index 0ce64dd73cfe..5437b6af3955 100644
> >> --- a/block/blk-cgroup.c
> >> +++ b/block/blk-cgroup.c
> >> @@ -163,10 +163,23 @@ static void blkg_free(struct blkcg_gq *blkg)
> >>   static void __blkg_release(struct rcu_head *rcu)
> >>   {
> >>          struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
> >> +       struct blkcg *blkcg = blkg->blkcg;
> >> +       int cpu;
> >>
> >>   #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> >>          WARN_ON(!bio_list_empty(&blkg->async_bios));
> >>   #endif
> >> +       /*
> >> +        * Flush all the non-empty percpu lockless lists before releasing
> >> +        * us. Meantime no new bio can refer to this blkg any more given
> >> +        * the refcnt is killed.
> >> +        */
> >> +       for_each_possible_cpu(cpu) {
> >> +               struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> >> +
> >> +               if (!llist_empty(lhead))
> >> +                       cgroup_rstat_css_cpu_flush(&blkcg->css, cpu);
> >> +       }
> >>
> >>          /* release the blkcg and parent blkg refs this blkg has been holding */
> >>          css_put(&blkg->blkcg->css);
> >> @@ -991,7 +1004,6 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> >>                  if (parent && parent->parent)
> >>                          blkcg_iostat_update(parent, &blkg->iostat.cur,
> >>                                              &blkg->iostat.last);
> >> -               percpu_ref_put(&blkg->refcnt);
> >>          }
> >>
> >>   out:
> >> @@ -2075,7 +2087,6 @@ void blk_cgroup_bio_start(struct bio *bio)
> >>
> >>                  llist_add(&bis->lnode, lhead);
> >>                  WRITE_ONCE(bis->lqueued, true);
> >> -               percpu_ref_get(&bis->blkg->refcnt);
> >>          }
> >>
> >>          u64_stats_update_end_irqrestore(&bis->sync, flags);
> >> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> >> index 885f5395fcd0..97d4764d8e6a 100644
> >> --- a/include/linux/cgroup.h
> >> +++ b/include/linux/cgroup.h
> >> @@ -695,6 +695,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp);
> >>   void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
> >>   void cgroup_rstat_flush_hold(struct cgroup *cgrp);
> >>   void cgroup_rstat_flush_release(void);
> >> +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu);
> >>
> >>   /*
> >>    * Basic resource stats.
> >> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> >> index 9c4c55228567..96e7a4e6da72 100644
> >> --- a/kernel/cgroup/rstat.c
> >> +++ b/kernel/cgroup/rstat.c
> >> @@ -281,6 +281,24 @@ void cgroup_rstat_flush_release(void)
> >>          spin_unlock_irq(&cgroup_rstat_lock);
> >>   }
> >>
> >> +/**
> >> + * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu
> >> + * @css: target css to be flush
> >> + * @cpu: the cpu that holds the stats to be flush
> >> + *
> >> + * A lightweight rstat flush operation for a given css and cpu.
> >> + * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock
> >> + * isn't used.
> > (Adding linux-mm and memcg maintainers)
> > +Linux-MM +Michal Hocko +Shakeel Butt +Johannes Weiner +Roman Gushchin
> > +Muchun Song
> >
> > I don't think flushing the stats without holding cgroup_rstat_lock is
> > safe for memcg stats flushing. mem_cgroup_css_rstat_flush() modifies
> > some non-percpu data (e.g. memcg->vmstats->state,
> > memcg->vmstats->state_pending).
> >
> > Perhaps have this be a separate callback than css_rstat_flush() (e.g.
> > css_rstat_flush_cpu() or something), so that it's clear what
> > subsystems support this? In this case, only blkcg would implement this
> > callback.
>
That function is added to call blkcg_rstat_flush() only, which flushes the
stats in the blkcg, and it should be safe. I agree that we should note
that in the comment and list the preconditions for calling it.

I think it would be a better API if there's a separate callback for
flushing only protected by percpu locks, that would be implemented by
blkcg only in this case. Anyway, I see v2 dropped the cgroup changes
anyway and is calling blkcg_rstat_flush() directly.

>
> Cheers,
> Longman
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  2:37   ` Ming Lei
  2023-05-24  2:43     ` Yosry Ahmed
@ 2023-05-24  4:10     ` Waiman Long
  2023-05-24  4:21       ` Ming Lei
  2023-05-25 14:11     ` Michal Koutný
  2 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2023-05-24  4:10 UTC (permalink / raw)
  To: Ming Lei, Yosry Ahmed
  Cc: Linux-MM, Michal Hocko, Shakeel Butt, Johannes Weiner,
	Roman Gushchin, Muchun Song, Jens Axboe, linux-block, cgroups,
	Tejun Heo, mkoutny


On 5/23/23 22:37, Ming Lei wrote:
> Hi Yosry,
>
> On Tue, May 23, 2023 at 07:06:38PM -0700, Yosry Ahmed wrote:
>> Hi Ming,
>>
>> On Tue, May 23, 2023 at 6:21 PM Ming Lei <ming.lei@redhat.com> wrote:
>>> As noted by Michal, the blkg_iostat_set's in the lockless list
>>> hold reference to blkg's to protect against their removal. Those
>>> blkg's hold reference to blkcg. When a cgroup is being destroyed,
>>> cgroup_rstat_flush() is only called at css_release_work_fn() which
>>> is called when the blkcg reference count reaches 0. This circular
>>> dependency will prevent blkcg and some blkgs from being freed after
>>> they are made offline.
>> I am not at all familiar with blkcg, but does calling
>> cgroup_rstat_flush() in offline_css() fix the problem?
> Except for offline, this list needs to be flushed after the associated disk
> is deleted.
>
>> or can items be
>> added to the lockless list(s) after the blkcg is offlined?
> Yeah.
>
> percpu_ref_*get(&blkg->refcnt) still can succeed after the percpu refcnt
> is killed in blkg_destroy() which is called from both offline css and
> removing disk.

As suggested by Tejun, we can use percpu_ref_tryget(&blkg->refcnt) to 
make sure that we can only take a reference when the blkg is online. I 
think it is a bit safer to take a percpu refcnt to avoid use after free. 
My other concern about your patch is that the per cpu list iterations 
will be done multiple times when a blkcg is destroyed if many blkgs are 
attached to the blkcg. I still prefer to do it once in 
blkcg_destroy_blkgs(). I am going to post an updated version tomorrow 
after some more testings.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  2:06 ` Yosry Ahmed
  2023-05-24  2:37   ` Ming Lei
@ 2023-05-24  4:04   ` Waiman Long
  2023-05-24  4:13     ` Yosry Ahmed
  1 sibling, 1 reply; 21+ messages in thread
From: Waiman Long @ 2023-05-24  4:04 UTC (permalink / raw)
  To: Yosry Ahmed, Ming Lei, Linux-MM, Michal Hocko, Shakeel Butt,
	Johannes Weiner, Roman Gushchin, Muchun Song
  Cc: Jens Axboe, linux-block, cgroups, Tejun Heo, mkoutny

On 5/23/23 22:06, Yosry Ahmed wrote:
> Hi Ming,
>
> On Tue, May 23, 2023 at 6:21 PM Ming Lei <ming.lei@redhat.com> wrote:
>> As noted by Michal, the blkg_iostat_set's in the lockless list
>> hold reference to blkg's to protect against their removal. Those
>> blkg's hold reference to blkcg. When a cgroup is being destroyed,
>> cgroup_rstat_flush() is only called at css_release_work_fn() which
>> is called when the blkcg reference count reaches 0. This circular
>> dependency will prevent blkcg and some blkgs from being freed after
>> they are made offline.
> I am not at all familiar with blkcg, but does calling
> cgroup_rstat_flush() in offline_css() fix the problem? or can items be
> added to the lockless list(s) after the blkcg is offlined?
>
>> It is less a problem if the cgroup to be destroyed also has other
>> controllers like memory that will call cgroup_rstat_flush() which will
>> clean up the reference count. If block is the only controller that uses
>> rstat, these offline blkcg and blkgs may never be freed leaking more
>> and more memory over time.
>>
>> To prevent this potential memory leak:
>>
>> - a new cgroup_rstat_css_cpu_flush() function is added to flush stats for
>> a given css and cpu. This new function will be called in __blkg_release().
>>
>> - don't grab bio->bi_blkg when adding the stats into blkcg's per-cpu
>> stat list, and this kind of handling is the most fragile part of
>> original patch
>>
>> Based on Waiman's patch:
>>
>> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
>>
>> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
>> Cc: Waiman Long <longman@redhat.com>
>> Cc: cgroups@vger.kernel.org
>> Cc: Tejun Heo <tj@kernel.org>
>> Cc: mkoutny@suse.com
>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>> ---
>>   block/blk-cgroup.c     | 15 +++++++++++++--
>>   include/linux/cgroup.h |  1 +
>>   kernel/cgroup/rstat.c  | 18 ++++++++++++++++++
>>   3 files changed, 32 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 0ce64dd73cfe..5437b6af3955 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -163,10 +163,23 @@ static void blkg_free(struct blkcg_gq *blkg)
>>   static void __blkg_release(struct rcu_head *rcu)
>>   {
>>          struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
>> +       struct blkcg *blkcg = blkg->blkcg;
>> +       int cpu;
>>
>>   #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>>          WARN_ON(!bio_list_empty(&blkg->async_bios));
>>   #endif
>> +       /*
>> +        * Flush all the non-empty percpu lockless lists before releasing
>> +        * us. Meantime no new bio can refer to this blkg any more given
>> +        * the refcnt is killed.
>> +        */
>> +       for_each_possible_cpu(cpu) {
>> +               struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
>> +
>> +               if (!llist_empty(lhead))
>> +                       cgroup_rstat_css_cpu_flush(&blkcg->css, cpu);
>> +       }
>>
>>          /* release the blkcg and parent blkg refs this blkg has been holding */
>>          css_put(&blkg->blkcg->css);
>> @@ -991,7 +1004,6 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>>                  if (parent && parent->parent)
>>                          blkcg_iostat_update(parent, &blkg->iostat.cur,
>>                                              &blkg->iostat.last);
>> -               percpu_ref_put(&blkg->refcnt);
>>          }
>>
>>   out:
>> @@ -2075,7 +2087,6 @@ void blk_cgroup_bio_start(struct bio *bio)
>>
>>                  llist_add(&bis->lnode, lhead);
>>                  WRITE_ONCE(bis->lqueued, true);
>> -               percpu_ref_get(&bis->blkg->refcnt);
>>          }
>>
>>          u64_stats_update_end_irqrestore(&bis->sync, flags);
>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> index 885f5395fcd0..97d4764d8e6a 100644
>> --- a/include/linux/cgroup.h
>> +++ b/include/linux/cgroup.h
>> @@ -695,6 +695,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp);
>>   void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
>>   void cgroup_rstat_flush_hold(struct cgroup *cgrp);
>>   void cgroup_rstat_flush_release(void);
>> +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu);
>>
>>   /*
>>    * Basic resource stats.
>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>> index 9c4c55228567..96e7a4e6da72 100644
>> --- a/kernel/cgroup/rstat.c
>> +++ b/kernel/cgroup/rstat.c
>> @@ -281,6 +281,24 @@ void cgroup_rstat_flush_release(void)
>>          spin_unlock_irq(&cgroup_rstat_lock);
>>   }
>>
>> +/**
>> + * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu
>> + * @css: target css to be flush
>> + * @cpu: the cpu that holds the stats to be flush
>> + *
>> + * A lightweight rstat flush operation for a given css and cpu.
>> + * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock
>> + * isn't used.
> (Adding linux-mm and memcg maintainers)
> +Linux-MM +Michal Hocko +Shakeel Butt +Johannes Weiner +Roman Gushchin
> +Muchun Song
>
> I don't think flushing the stats without holding cgroup_rstat_lock is
> safe for memcg stats flushing. mem_cgroup_css_rstat_flush() modifies
> some non-percpu data (e.g. memcg->vmstats->state,
> memcg->vmstats->state_pending).
>
> Perhaps have this be a separate callback than css_rstat_flush() (e.g.
> css_rstat_flush_cpu() or something), so that it's clear what
> subsystems support this? In this case, only blkcg would implement this
> callback.

That function is added to call blkcg_rstat_flush() only, which flushes the 
stats in the blkcg, and it should be safe. I agree that we should note 
that in the comment and list the preconditions for calling it.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  2:37   ` Ming Lei
@ 2023-05-24  2:43     ` Yosry Ahmed
  2023-05-24  4:10     ` Waiman Long
  2023-05-25 14:11     ` Michal Koutný
  2 siblings, 0 replies; 21+ messages in thread
From: Yosry Ahmed @ 2023-05-24  2:43 UTC (permalink / raw)
  To: Ming Lei
  Cc: Linux-MM, Michal Hocko, Shakeel Butt, Johannes Weiner,
	Roman Gushchin, Muchun Song, Jens Axboe, linux-block,
	Waiman Long, cgroups, Tejun Heo, mkoutny

On Tue, May 23, 2023 at 7:37 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> Hi Yosry,
>
> On Tue, May 23, 2023 at 07:06:38PM -0700, Yosry Ahmed wrote:
> > Hi Ming,
> >
> > On Tue, May 23, 2023 at 6:21 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > As noted by Michal, the blkg_iostat_set's in the lockless list
> > > hold reference to blkg's to protect against their removal. Those
> > > blkg's hold reference to blkcg. When a cgroup is being destroyed,
> > > cgroup_rstat_flush() is only called at css_release_work_fn() which
> > > is called when the blkcg reference count reaches 0. This circular
> > > dependency will prevent blkcg and some blkgs from being freed after
> > > they are made offline.
> >
> > I am not at all familiar with blkcg, but does calling
> > cgroup_rstat_flush() in offline_css() fix the problem?
>
> Except for offline, this list needs to be flushed after the associated disk
> is deleted.
>
> > or can items be
> > added to the lockless list(s) after the blkcg is offlined?
>
> Yeah.
>
> percpu_ref_*get(&blkg->refcnt) still can succeed after the percpu refcnt
> is killed in blkg_destroy() which is called from both offline css and
> removing disk.

I see.

>
> >
> > >
> > > It is less a problem if the cgroup to be destroyed also has other
> > > controllers like memory that will call cgroup_rstat_flush() which will
> > > clean up the reference count. If block is the only controller that uses
> > > rstat, these offline blkcg and blkgs may never be freed leaking more
> > > and more memory over time.
> > >
> > > To prevent this potential memory leak:
> > >
> > > - a new cgroup_rstat_css_cpu_flush() function is added to flush stats for
> > > a given css and cpu. This new function will be called in __blkg_release().
> > >
> > > - don't grab bio->bi_blkg when adding the stats into blkcg's per-cpu
> > > stat list, and this kind of handling is the most fragile part of
> > > original patch
> > >
> > > Based on Waiman's patch:
> > >
> > > https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> > >
> > > Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> > > Cc: Waiman Long <longman@redhat.com>
> > > Cc: cgroups@vger.kernel.org
> > > Cc: Tejun Heo <tj@kernel.org>
> > > Cc: mkoutny@suse.com
> > > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > > ---
> > >  block/blk-cgroup.c     | 15 +++++++++++++--
> > >  include/linux/cgroup.h |  1 +
> > >  kernel/cgroup/rstat.c  | 18 ++++++++++++++++++
> > >  3 files changed, 32 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> > > index 0ce64dd73cfe..5437b6af3955 100644
> > > --- a/block/blk-cgroup.c
> > > +++ b/block/blk-cgroup.c
> > > @@ -163,10 +163,23 @@ static void blkg_free(struct blkcg_gq *blkg)
> > >  static void __blkg_release(struct rcu_head *rcu)
> > >  {
> > >         struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
> > > +       struct blkcg *blkcg = blkg->blkcg;
> > > +       int cpu;
> > >
> > >  #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> > >         WARN_ON(!bio_list_empty(&blkg->async_bios));
> > >  #endif
> > > +       /*
> > > +        * Flush all the non-empty percpu lockless lists before releasing
> > > +        * us. Meantime no new bio can refer to this blkg any more given
> > > +        * the refcnt is killed.
> > > +        */
> > > +       for_each_possible_cpu(cpu) {
> > > +               struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> > > +
> > > +               if (!llist_empty(lhead))
> > > +                       cgroup_rstat_css_cpu_flush(&blkcg->css, cpu);
> > > +       }
> > >
> > >         /* release the blkcg and parent blkg refs this blkg has been holding */
> > >         css_put(&blkg->blkcg->css);
> > > @@ -991,7 +1004,6 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> > >                 if (parent && parent->parent)
> > >                         blkcg_iostat_update(parent, &blkg->iostat.cur,
> > >                                             &blkg->iostat.last);
> > > -               percpu_ref_put(&blkg->refcnt);
> > >         }
> > >
> > >  out:
> > > @@ -2075,7 +2087,6 @@ void blk_cgroup_bio_start(struct bio *bio)
> > >
> > >                 llist_add(&bis->lnode, lhead);
> > >                 WRITE_ONCE(bis->lqueued, true);
> > > -               percpu_ref_get(&bis->blkg->refcnt);
> > >         }
> > >
> > >         u64_stats_update_end_irqrestore(&bis->sync, flags);
> > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> > > index 885f5395fcd0..97d4764d8e6a 100644
> > > --- a/include/linux/cgroup.h
> > > +++ b/include/linux/cgroup.h
> > > @@ -695,6 +695,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp);
> > >  void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
> > >  void cgroup_rstat_flush_hold(struct cgroup *cgrp);
> > >  void cgroup_rstat_flush_release(void);
> > > +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu);
> > >
> > >  /*
> > >   * Basic resource stats.
> > > diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> > > index 9c4c55228567..96e7a4e6da72 100644
> > > --- a/kernel/cgroup/rstat.c
> > > +++ b/kernel/cgroup/rstat.c
> > > @@ -281,6 +281,24 @@ void cgroup_rstat_flush_release(void)
> > >         spin_unlock_irq(&cgroup_rstat_lock);
> > >  }
> > >
> > > +/**
> > > + * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu
> > > + * @css: target css to be flush
> > > + * @cpu: the cpu that holds the stats to be flush
> > > + *
> > > + * A lightweight rstat flush operation for a given css and cpu.
> > > + * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock
> > > + * isn't used.
> >
> > (Adding linux-mm and memcg maintainers)
> > +Linux-MM +Michal Hocko +Shakeel Butt +Johannes Weiner +Roman Gushchin
> > +Muchun Song
> >
> > I don't think flushing the stats without holding cgroup_rstat_lock is
> > safe for memcg stats flushing. mem_cgroup_css_rstat_flush() modifies
> > some non-percpu data (e.g. memcg->vmstats->state,
> > memcg->vmstats->state_pending).
> >
> > Perhaps have this be a separate callback than css_rstat_flush() (e.g.
> > css_rstat_flush_cpu() or something), so that it's clear what
> > subsystems support this? In this case, only blkcg would implement this
> > callback.
>
> Also I guess cgroup_rstat_flush() can be used here too.

If we don't really care about flushing other subsystems, then yeah
that would be a simpler approach.

>
> BTW, cgroup_rstat_flush() is annotated as might_sleep(); however, it
> actually won't sleep, so can this might_sleep() be removed?

It will actually sleep if may_sleep=true, and I have a change in
Andrew's MM tree that removes that *_atomic() variant and makes it
always sleepable.

In cgroup_rstat_flush_locked(), if need_resched() or
spin_needbreak(&cgroup_rstat_lock), we release the lock, reschedule,
and then re-acquire it.

>
> >
> > > + */
> > > +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu)
> > > +{
> > > +       raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> > > +
> > > +       raw_spin_lock_irq(cpu_lock);
> > > +       css->ss->css_rstat_flush(css, cpu);
> >
> > I think we need to check that css_rstat_flush() (or a new callback) is
> > implemented before calling it here.
>
> Good catch!
>
>
>
> Thanks,
> Ming
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  2:06 ` Yosry Ahmed
@ 2023-05-24  2:37   ` Ming Lei
  2023-05-24  2:43     ` Yosry Ahmed
                       ` (2 more replies)
  2023-05-24  4:04   ` Waiman Long
  1 sibling, 3 replies; 21+ messages in thread
From: Ming Lei @ 2023-05-24  2:37 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Linux-MM, Michal Hocko, Shakeel Butt, Johannes Weiner,
	Roman Gushchin, Muchun Song, Jens Axboe, linux-block,
	Waiman Long, cgroups, Tejun Heo, mkoutny, ming.lei

Hi Yosry,

On Tue, May 23, 2023 at 07:06:38PM -0700, Yosry Ahmed wrote:
> Hi Ming,
> 
> On Tue, May 23, 2023 at 6:21 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > As noted by Michal, the blkg_iostat_set's in the lockless list
> > hold reference to blkg's to protect against their removal. Those
> > blkg's hold reference to blkcg. When a cgroup is being destroyed,
> > cgroup_rstat_flush() is only called at css_release_work_fn() which
> > is called when the blkcg reference count reaches 0. This circular
> > dependency will prevent blkcg and some blkgs from being freed after
> > they are made offline.
> 
> I am not at all familiar with blkcg, but does calling
> cgroup_rstat_flush() in offline_css() fix the problem?

Besides the offline case, this list also needs to be flushed after the associated disk
is deleted.

> or can items be
> added to the lockless list(s) after the blkcg is offlined?

Yeah.

percpu_ref_*get(&blkg->refcnt) still can succeed after the percpu refcnt
is killed in blkg_destroy() which is called from both offline css and
removing disk.

> 
> >
> > It is less a problem if the cgroup to be destroyed also has other
> > controllers like memory that will call cgroup_rstat_flush() which will
> > clean up the reference count. If block is the only controller that uses
> > rstat, these offline blkcg and blkgs may never be freed leaking more
> > and more memory over time.
> >
> > To prevent this potential memory leak:
> >
> > - a new cgroup_rstat_css_cpu_flush() function is added to flush stats for
> > a given css and cpu. This new function will be called in __blkg_release().
> >
> > - don't grab bio->bi_blkg when adding the stats into blkcg's per-cpu
> > stat list, and this kind of handling is the most fragile part of
> > original patch
> >
> > Based on Waiman's patch:
> >
> > https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> >
> > Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> > Cc: Waiman Long <longman@redhat.com>
> > Cc: cgroups@vger.kernel.org
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: mkoutny@suse.com
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-cgroup.c     | 15 +++++++++++++--
> >  include/linux/cgroup.h |  1 +
> >  kernel/cgroup/rstat.c  | 18 ++++++++++++++++++
> >  3 files changed, 32 insertions(+), 2 deletions(-)
> >
> > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> > index 0ce64dd73cfe..5437b6af3955 100644
> > --- a/block/blk-cgroup.c
> > +++ b/block/blk-cgroup.c
> > @@ -163,10 +163,23 @@ static void blkg_free(struct blkcg_gq *blkg)
> >  static void __blkg_release(struct rcu_head *rcu)
> >  {
> >         struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
> > +       struct blkcg *blkcg = blkg->blkcg;
> > +       int cpu;
> >
> >  #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> >         WARN_ON(!bio_list_empty(&blkg->async_bios));
> >  #endif
> > +       /*
> > +        * Flush all the non-empty percpu lockless lists before releasing
> > +        * us. Meantime no new bio can refer to this blkg any more given
> > +        * the refcnt is killed.
> > +        */
> > +       for_each_possible_cpu(cpu) {
> > +               struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> > +
> > +               if (!llist_empty(lhead))
> > +                       cgroup_rstat_css_cpu_flush(&blkcg->css, cpu);
> > +       }
> >
> >         /* release the blkcg and parent blkg refs this blkg has been holding */
> >         css_put(&blkg->blkcg->css);
> > @@ -991,7 +1004,6 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> >                 if (parent && parent->parent)
> >                         blkcg_iostat_update(parent, &blkg->iostat.cur,
> >                                             &blkg->iostat.last);
> > -               percpu_ref_put(&blkg->refcnt);
> >         }
> >
> >  out:
> > @@ -2075,7 +2087,6 @@ void blk_cgroup_bio_start(struct bio *bio)
> >
> >                 llist_add(&bis->lnode, lhead);
> >                 WRITE_ONCE(bis->lqueued, true);
> > -               percpu_ref_get(&bis->blkg->refcnt);
> >         }
> >
> >         u64_stats_update_end_irqrestore(&bis->sync, flags);
> > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> > index 885f5395fcd0..97d4764d8e6a 100644
> > --- a/include/linux/cgroup.h
> > +++ b/include/linux/cgroup.h
> > @@ -695,6 +695,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp);
> >  void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
> >  void cgroup_rstat_flush_hold(struct cgroup *cgrp);
> >  void cgroup_rstat_flush_release(void);
> > +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu);
> >
> >  /*
> >   * Basic resource stats.
> > diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> > index 9c4c55228567..96e7a4e6da72 100644
> > --- a/kernel/cgroup/rstat.c
> > +++ b/kernel/cgroup/rstat.c
> > @@ -281,6 +281,24 @@ void cgroup_rstat_flush_release(void)
> >         spin_unlock_irq(&cgroup_rstat_lock);
> >  }
> >
> > +/**
> > + * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu
> > + * @css: target css to be flush
> > + * @cpu: the cpu that holds the stats to be flush
> > + *
> > + * A lightweight rstat flush operation for a given css and cpu.
> > + * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock
> > + * isn't used.
> 
> (Adding linux-mm and memcg maintainers)
> +Linux-MM +Michal Hocko +Shakeel Butt +Johannes Weiner +Roman Gushchin
> +Muchun Song
> 
> I don't think flushing the stats without holding cgroup_rstat_lock is
> safe for memcg stats flushing. mem_cgroup_css_rstat_flush() modifies
> some non-percpu data (e.g. memcg->vmstats->state,
> memcg->vmstats->state_pending).
> 
> Perhaps have this be a separate callback than css_rstat_flush() (e.g.
> css_rstat_flush_cpu() or something), so that it's clear what
> subsystems support this? In this case, only blkcg would implement this
> callback.

Also I guess cgroup_rstat_flush() can be used here too.

BTW, cgroup_rstat_flush() is annotated as might_sleep(), however it won't
actually sleep, so can this might_sleep() be removed?

> 
> > + */
> > +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu)
> > +{
> > +       raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> > +
> > +       raw_spin_lock_irq(cpu_lock);
> > +       css->ss->css_rstat_flush(css, cpu);
> 
> I think we need to check that css_rstat_flush() (or a new callback) is
> implemented before calling it here.

Good catch!



Thanks,
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
  2023-05-24  1:19 [PATCH] " Ming Lei
@ 2023-05-24  2:06 ` Yosry Ahmed
  2023-05-24  2:37   ` Ming Lei
  2023-05-24  4:04   ` Waiman Long
  0 siblings, 2 replies; 21+ messages in thread
From: Yosry Ahmed @ 2023-05-24  2:06 UTC (permalink / raw)
  To: Ming Lei, Linux-MM, Michal Hocko, Shakeel Butt, Johannes Weiner,
	Roman Gushchin, Muchun Song
  Cc: Jens Axboe, linux-block, Waiman Long, cgroups, Tejun Heo, mkoutny

Hi Ming,

On Tue, May 23, 2023 at 6:21 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> As noted by Michal, the blkg_iostat_set's in the lockless list
> hold reference to blkg's to protect against their removal. Those
> blkg's hold reference to blkcg. When a cgroup is being destroyed,
> cgroup_rstat_flush() is only called at css_release_work_fn() which
> is called when the blkcg reference count reaches 0. This circular
> dependency will prevent blkcg and some blkgs from being freed after
> they are made offline.

I am not at all familiar with blkcg, but does calling
cgroup_rstat_flush() in offline_css() fix the problem? or can items be
added to the lockless list(s) after the blkcg is offlined?

>
> It is less a problem if the cgroup to be destroyed also has other
> controllers like memory that will call cgroup_rstat_flush() which will
> clean up the reference count. If block is the only controller that uses
> rstat, these offline blkcg and blkgs may never be freed leaking more
> and more memory over time.
>
> To prevent this potential memory leak:
>
> - a new cgroup_rstat_css_cpu_flush() function is added to flush stats for
> a given css and cpu. This new function will be called in __blkg_release().
>
> - don't grab bio->bi_blkg when adding the stats into blkcg's per-cpu
> stat list, and this kind of handling is the most fragile part of
> original patch
>
> Based on Waiman's patch:
>
> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
>
> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> Cc: Waiman Long <longman@redhat.com>
> Cc: cgroups@vger.kernel.org
> Cc: Tejun Heo <tj@kernel.org>
> Cc: mkoutny@suse.com
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-cgroup.c     | 15 +++++++++++++--
>  include/linux/cgroup.h |  1 +
>  kernel/cgroup/rstat.c  | 18 ++++++++++++++++++
>  3 files changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 0ce64dd73cfe..5437b6af3955 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -163,10 +163,23 @@ static void blkg_free(struct blkcg_gq *blkg)
>  static void __blkg_release(struct rcu_head *rcu)
>  {
>         struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
> +       struct blkcg *blkcg = blkg->blkcg;
> +       int cpu;
>
>  #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>         WARN_ON(!bio_list_empty(&blkg->async_bios));
>  #endif
> +       /*
> +        * Flush all the non-empty percpu lockless lists before releasing
> +        * us. Meantime no new bio can refer to this blkg any more given
> +        * the refcnt is killed.
> +        */
> +       for_each_possible_cpu(cpu) {
> +               struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> +
> +               if (!llist_empty(lhead))
> +                       cgroup_rstat_css_cpu_flush(&blkcg->css, cpu);
> +       }
>
>         /* release the blkcg and parent blkg refs this blkg has been holding */
>         css_put(&blkg->blkcg->css);
> @@ -991,7 +1004,6 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>                 if (parent && parent->parent)
>                         blkcg_iostat_update(parent, &blkg->iostat.cur,
>                                             &blkg->iostat.last);
> -               percpu_ref_put(&blkg->refcnt);
>         }
>
>  out:
> @@ -2075,7 +2087,6 @@ void blk_cgroup_bio_start(struct bio *bio)
>
>                 llist_add(&bis->lnode, lhead);
>                 WRITE_ONCE(bis->lqueued, true);
> -               percpu_ref_get(&bis->blkg->refcnt);
>         }
>
>         u64_stats_update_end_irqrestore(&bis->sync, flags);
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 885f5395fcd0..97d4764d8e6a 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -695,6 +695,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp);
>  void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
>  void cgroup_rstat_flush_hold(struct cgroup *cgrp);
>  void cgroup_rstat_flush_release(void);
> +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu);
>
>  /*
>   * Basic resource stats.
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 9c4c55228567..96e7a4e6da72 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -281,6 +281,24 @@ void cgroup_rstat_flush_release(void)
>         spin_unlock_irq(&cgroup_rstat_lock);
>  }
>
> +/**
> + * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu
> + * @css: target css to be flush
> + * @cpu: the cpu that holds the stats to be flush
> + *
> + * A lightweight rstat flush operation for a given css and cpu.
> + * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock
> + * isn't used.

(Adding linux-mm and memcg maintainers)
+Linux-MM +Michal Hocko +Shakeel Butt +Johannes Weiner +Roman Gushchin
+Muchun Song

I don't think flushing the stats without holding cgroup_rstat_lock is
safe for memcg stats flushing. mem_cgroup_css_rstat_flush() modifies
some non-percpu data (e.g. memcg->vmstats->state,
memcg->vmstats->state_pending).

Perhaps have this be a separate callback than css_rstat_flush() (e.g.
css_rstat_flush_cpu() or something), so that it's clear what
subsystems support this? In this case, only blkcg would implement this
callback.

> + */
> +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu)
> +{
> +       raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> +
> +       raw_spin_lock_irq(cpu_lock);
> +       css->ss->css_rstat_flush(css, cpu);

I think we need to check that css_rstat_flush() (or a new callback) is
implemented before calling it here.


> +       raw_spin_unlock_irq(cpu_lock);
> +}
> +
>  int cgroup_rstat_init(struct cgroup *cgrp)
>  {
>         int cpu;
> --
> 2.40.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
@ 2023-05-24  1:19 Ming Lei
  2023-05-24  2:06 ` Yosry Ahmed
  0 siblings, 1 reply; 21+ messages in thread
From: Ming Lei @ 2023-05-24  1:19 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, Waiman Long, cgroups, Tejun Heo, mkoutny

As noted by Michal, the blkg_iostat_set's in the lockless list
hold reference to blkg's to protect against their removal. Those
blkg's hold reference to blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn() which
is called when the blkcg reference count reaches 0. This circular
dependency will prevent blkcg and some blkgs from being freed after
they are made offline.

It is less of a problem if the cgroup to be destroyed also has other
controllers like memory that will call cgroup_rstat_flush() which will
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed leaking more
and more memory over time.

To prevent this potential memory leak:

- a new cgroup_rstat_css_cpu_flush() function is added to flush stats for
a given css and cpu. This new function will be called in __blkg_release().

- don't grab bio->bi_blkg when adding the stats into blkcg's per-cpu
stat list, and this kind of handling is the most fragile part of
original patch

Based on Waiman's patch:

https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/

Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: Waiman Long <longman@redhat.com>
Cc: cgroups@vger.kernel.org
Cc: Tejun Heo <tj@kernel.org>
Cc: mkoutny@suse.com
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-cgroup.c     | 15 +++++++++++++--
 include/linux/cgroup.h |  1 +
 kernel/cgroup/rstat.c  | 18 ++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..5437b6af3955 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -163,10 +163,23 @@ static void blkg_free(struct blkcg_gq *blkg)
 static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+	struct blkcg *blkcg = blkg->blkcg;
+	int cpu;
 
 #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
 #endif
+	/*
+	 * Flush all the non-empty percpu lockless lists before releasing
+	 * us. Meantime no new bio can refer to this blkg any more given
+	 * the refcnt is killed.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+
+		if (!llist_empty(lhead))
+			cgroup_rstat_css_cpu_flush(&blkcg->css, cpu);
+	}
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -991,7 +1004,6 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
-		percpu_ref_put(&blkg->refcnt);
 	}
 
 out:
@@ -2075,7 +2087,6 @@ void blk_cgroup_bio_start(struct bio *bio)
 
 		llist_add(&bis->lnode, lhead);
 		WRITE_ONCE(bis->lqueued, true);
-		percpu_ref_get(&bis->blkg->refcnt);
 	}
 
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 885f5395fcd0..97d4764d8e6a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -695,6 +695,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp);
 void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
 void cgroup_rstat_flush_hold(struct cgroup *cgrp);
 void cgroup_rstat_flush_release(void);
+void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu);
 
 /*
  * Basic resource stats.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9c4c55228567..96e7a4e6da72 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -281,6 +281,24 @@ void cgroup_rstat_flush_release(void)
 	spin_unlock_irq(&cgroup_rstat_lock);
 }
 
+/**
+ * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu
+ * @css: target css to be flushed
+ * @cpu: the cpu that holds the stats to be flushed
+ *
+ * A lightweight rstat flush operation for a given css and cpu.
+ * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock
+ * isn't used.
+ */
+void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+
+	raw_spin_lock_irq(cpu_lock);
+	css->ss->css_rstat_flush(css, cpu);
+	raw_spin_unlock_irq(cpu_lock);
+}
+
 int cgroup_rstat_init(struct cgroup *cgrp)
 {
 	int cpu;
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2023-06-08 23:10 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-25  4:35 [PATCH V3] blk-cgroup: Flush stats before releasing blkcg_gq Ming Lei
2023-05-25 14:11 ` Michal Koutný
2023-05-25 15:21   ` Waiman Long
2023-05-25 15:33   ` Ming Lei
2023-05-25 16:01 ` [PATCH] " Waiman Long
2023-05-25 16:06   ` Waiman Long
2023-05-26  0:34     ` Ming Lei
2023-06-06  3:49 ` [PATCH V3] " Ming Lei
2023-06-08 23:07   ` Tejun Heo
2023-06-08 23:10 ` Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2023-05-24  1:19 [PATCH] " Ming Lei
2023-05-24  2:06 ` Yosry Ahmed
2023-05-24  2:37   ` Ming Lei
2023-05-24  2:43     ` Yosry Ahmed
2023-05-24  4:10     ` Waiman Long
2023-05-24  4:21       ` Ming Lei
2023-05-25 14:11     ` Michal Koutný
2023-05-25 15:25       ` Waiman Long
2023-05-26 21:11         ` Michal Koutný
2023-05-24  4:04   ` Waiman Long
2023-05-24  4:13     ` Yosry Ahmed

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).