linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/4] block, bfq: minor cleanup and fix
@ 2021-12-21  3:21 Yu Kuai
  2021-12-21  3:21 ` [PATCH 1/4] block, bfq: cleanup bfq_bfqq_to_bfqg() Yu Kuai
                   ` (3 more replies)
  0 siblings, 4 replies; 18+ messages in thread
From: Yu Kuai @ 2021-12-21  3:21 UTC (permalink / raw)
  To: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna
  Cc: cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

Yu Kuai (4):
  block, bfq: cleanup bfq_bfqq_to_bfqg()
  block, bfq: avoid moving bfqq to it's parent bfqg
  block, bfq: don't move oom_bfqq
  block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()

 block/bfq-cgroup.c  | 25 ++++++++++++++++++++-----
 block/bfq-iosched.c |  4 ++--
 block/bfq-iosched.h |  1 -
 block/bfq-wf2q.c    | 15 ---------------
 4 files changed, 22 insertions(+), 23 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/4] block, bfq: cleanup bfq_bfqq_to_bfqg()
  2021-12-21  3:21 [PATCH 0/4] block, bfq: minor cleanup and fix Yu Kuai
@ 2021-12-21  3:21 ` Yu Kuai
  2021-12-21 10:15   ` Jan Kara
  2021-12-21  3:21 ` [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg Yu Kuai
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 18+ messages in thread
From: Yu Kuai @ 2021-12-21  3:21 UTC (permalink / raw)
  To: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna
  Cc: cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

Use bfqq_group() instead, which does the same thing.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-iosched.c |  4 ++--
 block/bfq-iosched.h |  1 -
 block/bfq-wf2q.c    | 15 ---------------
 3 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0c612a911696..2f2b97cad980 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	if (!bfqq->next_rq)
 		return;
 
-	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree;
 	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
 			blk_rq_pos(bfqq->next_rq), &parent, &p);
 	if (!__bfqq) {
@@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
 					 struct bfq_queue *bfqq,
 					 sector_t sector)
 {
-	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree;
 	struct rb_node *parent, *node;
 	struct bfq_queue *__bfqq;
 
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 07288b9da389..99949548896e 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -1051,7 +1051,6 @@ extern struct blkcg_policy blkcg_policy_bfq;
 	for (parent = NULL; entity ; entity = parent)
 #endif /* CONFIG_BFQ_GROUP_IOSCHED */
 
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
 struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
 unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
 struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index b74cc0da118e..e1f5ca5c1fdb 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
-	struct bfq_entity *group_entity = bfqq->entity.parent;
-
-	if (!group_entity)
-		group_entity = &bfqq->bfqd->root_group->entity;
-
-	return container_of(group_entity, struct bfq_group, entity);
-}
-
 /*
  * Returns true if this budget changes may let next_in_service->parent
  * become the next_in_service entity for its parent entity.
@@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
 
 #else /* CONFIG_BFQ_GROUP_IOSCHED */
 
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
-	return bfqq->bfqd->root_group;
-}
-
 static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
 {
 	return false;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg
  2021-12-21  3:21 [PATCH 0/4] block, bfq: minor cleanup and fix Yu Kuai
  2021-12-21  3:21 ` [PATCH 1/4] block, bfq: cleanup bfq_bfqq_to_bfqg() Yu Kuai
@ 2021-12-21  3:21 ` Yu Kuai
  2021-12-21 10:16   ` Jan Kara
  2021-12-21  3:21 ` [PATCH 3/4] block, bfq: don't move oom_bfqq Yu Kuai
  2021-12-21  3:21 ` [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move() Yu Kuai
  3 siblings, 1 reply; 18+ messages in thread
From: Yu Kuai @ 2021-12-21  3:21 UTC (permalink / raw)
  To: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna
  Cc: cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

Moving bfqq to its parent bfqg is pointless.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-cgroup.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..0f62546a72d4 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -645,6 +645,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		   struct bfq_group *bfqg)
 {
 	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_group *old_parent = bfq_group(bfqq);
+
+	/* No point to move bfqq to the same group */
+	if (old_parent == bfqg)
+		return;
 
 	/*
 	 * Get extra reference to prevent bfqq from being freed in
@@ -666,7 +671,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	else if (entity->on_st_or_in_serv)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
-	bfqg_and_blkg_put(bfqq_group(bfqq));
+	bfqg_and_blkg_put(old_parent);
 
 	if (entity->parent &&
 	    entity->parent->last_bfqq_created == bfqq)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/4] block, bfq: don't move oom_bfqq
  2021-12-21  3:21 [PATCH 0/4] block, bfq: minor cleanup and fix Yu Kuai
  2021-12-21  3:21 ` [PATCH 1/4] block, bfq: cleanup bfq_bfqq_to_bfqg() Yu Kuai
  2021-12-21  3:21 ` [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg Yu Kuai
@ 2021-12-21  3:21 ` Yu Kuai
  2021-12-21 10:18   ` Jan Kara
  2021-12-21  3:21 ` [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move() Yu Kuai
  3 siblings, 1 reply; 18+ messages in thread
From: Yu Kuai @ 2021-12-21  3:21 UTC (permalink / raw)
  To: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna
  Cc: cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

Our test reported a UAF:

[ 2073.019181] ==================================================================
[ 2073.019188] BUG: KASAN: use-after-free in __bfq_put_async_bfqq+0xa0/0x168
[ 2073.019191] Write of size 8 at addr ffff8000ccf64128 by task rmmod/72584
[ 2073.019192]
[ 2073.019196] CPU: 0 PID: 72584 Comm: rmmod Kdump: loaded Not tainted 4.19.90-yk #5
[ 2073.019198] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[ 2073.019200] Call trace:
[ 2073.019203]  dump_backtrace+0x0/0x310
[ 2073.019206]  show_stack+0x28/0x38
[ 2073.019210]  dump_stack+0xec/0x15c
[ 2073.019216]  print_address_description+0x68/0x2d0
[ 2073.019220]  kasan_report+0x238/0x2f0
[ 2073.019224]  __asan_store8+0x88/0xb0
[ 2073.019229]  __bfq_put_async_bfqq+0xa0/0x168
[ 2073.019233]  bfq_put_async_queues+0xbc/0x208
[ 2073.019236]  bfq_pd_offline+0x178/0x238
[ 2073.019240]  blkcg_deactivate_policy+0x1f0/0x420
[ 2073.019244]  bfq_exit_queue+0x128/0x178
[ 2073.019249]  blk_mq_exit_sched+0x12c/0x160
[ 2073.019252]  elevator_exit+0xc8/0xd0
[ 2073.019256]  blk_exit_queue+0x50/0x88
[ 2073.019259]  blk_cleanup_queue+0x228/0x3d8
[ 2073.019267]  null_del_dev+0xfc/0x1e0 [null_blk]
[ 2073.019274]  null_exit+0x90/0x114 [null_blk]
[ 2073.019278]  __arm64_sys_delete_module+0x358/0x5a0
[ 2073.019282]  el0_svc_common+0xc8/0x320
[ 2073.019287]  el0_svc_handler+0xf8/0x160
[ 2073.019290]  el0_svc+0x10/0x218
[ 2073.019291]
[ 2073.019294] Allocated by task 14163:
[ 2073.019301]  kasan_kmalloc+0xe0/0x190
[ 2073.019305]  kmem_cache_alloc_node_trace+0x1cc/0x418
[ 2073.019308]  bfq_pd_alloc+0x54/0x118
[ 2073.019313]  blkcg_activate_policy+0x250/0x460
[ 2073.019317]  bfq_create_group_hierarchy+0x38/0x110
[ 2073.019321]  bfq_init_queue+0x6d0/0x948
[ 2073.019325]  blk_mq_init_sched+0x1d8/0x390
[ 2073.019330]  elevator_switch_mq+0x88/0x170
[ 2073.019334]  elevator_switch+0x140/0x270
[ 2073.019338]  elv_iosched_store+0x1a4/0x2a0
[ 2073.019342]  queue_attr_store+0x90/0xe0
[ 2073.019348]  sysfs_kf_write+0xa8/0xe8
[ 2073.019351]  kernfs_fop_write+0x1f8/0x378
[ 2073.019359]  __vfs_write+0xe0/0x360
[ 2073.019363]  vfs_write+0xf0/0x270
[ 2073.019367]  ksys_write+0xdc/0x1b8
[ 2073.019371]  __arm64_sys_write+0x50/0x60
[ 2073.019375]  el0_svc_common+0xc8/0x320
[ 2073.019380]  el0_svc_handler+0xf8/0x160
[ 2073.019383]  el0_svc+0x10/0x218
[ 2073.019385]
[ 2073.019387] Freed by task 72584:
[ 2073.019391]  __kasan_slab_free+0x120/0x228
[ 2073.019394]  kasan_slab_free+0x10/0x18
[ 2073.019397]  kfree+0x94/0x368
[ 2073.019400]  bfqg_put+0x64/0xb0
[ 2073.019404]  bfqg_and_blkg_put+0x90/0xb0
[ 2073.019408]  bfq_put_queue+0x220/0x228
[ 2073.019413]  __bfq_put_async_bfqq+0x98/0x168
[ 2073.019416]  bfq_put_async_queues+0xbc/0x208
[ 2073.019420]  bfq_pd_offline+0x178/0x238
[ 2073.019424]  blkcg_deactivate_policy+0x1f0/0x420
[ 2073.019429]  bfq_exit_queue+0x128/0x178
[ 2073.019433]  blk_mq_exit_sched+0x12c/0x160
[ 2073.019437]  elevator_exit+0xc8/0xd0
[ 2073.019440]  blk_exit_queue+0x50/0x88
[ 2073.019443]  blk_cleanup_queue+0x228/0x3d8
[ 2073.019451]  null_del_dev+0xfc/0x1e0 [null_blk]
[ 2073.019459]  null_exit+0x90/0x114 [null_blk]
[ 2073.019462]  __arm64_sys_delete_module+0x358/0x5a0
[ 2073.019467]  el0_svc_common+0xc8/0x320
[ 2073.019471]  el0_svc_handler+0xf8/0x160
[ 2073.019474]  el0_svc+0x10/0x218
[ 2073.019475]
[ 2073.019479] The buggy address belongs to the object at ffff8000ccf63f00
 which belongs to the cache kmalloc-1024 of size 1024
[ 2073.019484] The buggy address is located 552 bytes inside of
 1024-byte region [ffff8000ccf63f00, ffff8000ccf64300)
[ 2073.019486] The buggy address belongs to the page:
[ 2073.019492] page:ffff7e000333d800 count:1 mapcount:0 mapping:ffff8000c0003a00 index:0x0 compound_mapcount: 0
[ 2073.020123] flags: 0x7ffff0000008100(slab|head)
[ 2073.020403] raw: 07ffff0000008100 ffff7e0003334c08 ffff7e00001f5a08 ffff8000c0003a00
[ 2073.020409] raw: 0000000000000000 00000000001c001c 00000001ffffffff 0000000000000000
[ 2073.020411] page dumped because: kasan: bad access detected
[ 2073.020412]
[ 2073.020414] Memory state around the buggy address:
[ 2073.020420]  ffff8000ccf64000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 2073.020424]  ffff8000ccf64080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 2073.020428] >ffff8000ccf64100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 2073.020430]                                   ^
[ 2073.020434]  ffff8000ccf64180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 2073.020438]  ffff8000ccf64200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 2073.020439] ==================================================================

The same problem exists in mainline as well.

This is because oom_bfqq is moved to a non-root group, thus root_group
is freed earlier.

Fix the problem by not moving oom_bfqq.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-cgroup.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 0f62546a72d4..8e8cf6b3d946 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -651,6 +651,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	if (old_parent == bfqg)
 		return;
 
+	/*
+	 * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group
+	 * until elevator exit.
+	 */
+	if (bfqq == &bfqd->oom_bfqq)
+		return;
 	/*
 	 * Get extra reference to prevent bfqq from being freed in
 	 * next possible expire or deactivate.
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-21  3:21 [PATCH 0/4] block, bfq: minor cleanup and fix Yu Kuai
                   ` (2 preceding siblings ...)
  2021-12-21  3:21 ` [PATCH 3/4] block, bfq: don't move oom_bfqq Yu Kuai
@ 2021-12-21  3:21 ` Yu Kuai
  2021-12-21 11:50   ` Jan Kara
  3 siblings, 1 reply; 18+ messages in thread
From: Yu Kuai @ 2021-12-21  3:21 UTC (permalink / raw)
  To: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna
  Cc: cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

During code review, we found that if bfqq is not busy in
bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
thus bfqq->pos_root still points to the old bfqg. However, the ref
that bfqq holds for the old bfqg will be released, so it's possible
that the old bfqg can be freed. This is problematic because the freed
bfqg can still be accessed by bfqq->pos_root.

Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
as well.

Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-cgroup.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 8e8cf6b3d946..822dd28ecf53 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -677,7 +677,6 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	else if (entity->on_st_or_in_serv)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
-	bfqg_and_blkg_put(old_parent);
 
 	if (entity->parent &&
 	    entity->parent->last_bfqq_created == bfqq)
@@ -690,11 +689,16 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	/* pin down bfqg and its associated blkg  */
 	bfqg_and_blkg_get(bfqg);
 
-	if (bfq_bfqq_busy(bfqq)) {
-		if (unlikely(!bfqd->nonrot_with_queueing))
-			bfq_pos_tree_add_move(bfqd, bfqq);
+	/*
+	 * Don't leave the pos_root to old bfqg, since the ref to old bfqg will
+	 * be released and the bfqg might be freed.
+	 */
+	if (unlikely(!bfqd->nonrot_with_queueing))
+		bfq_pos_tree_add_move(bfqd, bfqq);
+	bfqg_and_blkg_put(old_parent);
+
+	if (bfq_bfqq_busy(bfqq))
 		bfq_activate_bfqq(bfqd, bfqq);
-	}
 
 	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
 		bfq_schedule_dispatch(bfqd);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] block, bfq: cleanup bfq_bfqq_to_bfqg()
  2021-12-21  3:21 ` [PATCH 1/4] block, bfq: cleanup bfq_bfqq_to_bfqg() Yu Kuai
@ 2021-12-21 10:15   ` Jan Kara
  0 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2021-12-21 10:15 UTC (permalink / raw)
  To: Yu Kuai
  Cc: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna,
	cgroups, linux-block, linux-kernel, yi.zhang

On Tue 21-12-21 11:21:32, Yu Kuai wrote:
> Use bfq_group() instead, which do the same thing.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

Nice. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  block/bfq-iosched.c |  4 ++--
>  block/bfq-iosched.h |  1 -
>  block/bfq-wf2q.c    | 15 ---------------
>  3 files changed, 2 insertions(+), 18 deletions(-)
> 
> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
> index 0c612a911696..2f2b97cad980 100644
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
>  	if (!bfqq->next_rq)
>  		return;
>  
> -	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
> +	bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree;
>  	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
>  			blk_rq_pos(bfqq->next_rq), &parent, &p);
>  	if (!__bfqq) {
> @@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
>  					 struct bfq_queue *bfqq,
>  					 sector_t sector)
>  {
> -	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
> +	struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree;
>  	struct rb_node *parent, *node;
>  	struct bfq_queue *__bfqq;
>  
> diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
> index 07288b9da389..99949548896e 100644
> --- a/block/bfq-iosched.h
> +++ b/block/bfq-iosched.h
> @@ -1051,7 +1051,6 @@ extern struct blkcg_policy blkcg_policy_bfq;
>  	for (parent = NULL; entity ; entity = parent)
>  #endif /* CONFIG_BFQ_GROUP_IOSCHED */
>  
> -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
>  struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
>  unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
>  struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
> diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
> index b74cc0da118e..e1f5ca5c1fdb 100644
> --- a/block/bfq-wf2q.c
> +++ b/block/bfq-wf2q.c
> @@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
>  
>  #ifdef CONFIG_BFQ_GROUP_IOSCHED
>  
> -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
> -{
> -	struct bfq_entity *group_entity = bfqq->entity.parent;
> -
> -	if (!group_entity)
> -		group_entity = &bfqq->bfqd->root_group->entity;
> -
> -	return container_of(group_entity, struct bfq_group, entity);
> -}
> -
>  /*
>   * Returns true if this budget changes may let next_in_service->parent
>   * become the next_in_service entity for its parent entity.
> @@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
>  
>  #else /* CONFIG_BFQ_GROUP_IOSCHED */
>  
> -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
> -{
> -	return bfqq->bfqd->root_group;
> -}
> -
>  static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
>  {
>  	return false;
> -- 
> 2.31.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg
  2021-12-21  3:21 ` [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg Yu Kuai
@ 2021-12-21 10:16   ` Jan Kara
  2021-12-21 11:08     ` yukuai (C)
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2021-12-21 10:16 UTC (permalink / raw)
  To: Yu Kuai
  Cc: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna,
	cgroups, linux-block, linux-kernel, yi.zhang

On Tue 21-12-21 11:21:33, Yu Kuai wrote:
> Moving bfqq to it's parent bfqg is pointless.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

Did you notice that this is happening often enough that the check is worth
it? Where do we do this?

								Honza

> ---
>  block/bfq-cgroup.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
> index 24a5c5329bcd..0f62546a72d4 100644
> --- a/block/bfq-cgroup.c
> +++ b/block/bfq-cgroup.c
> @@ -645,6 +645,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>  		   struct bfq_group *bfqg)
>  {
>  	struct bfq_entity *entity = &bfqq->entity;
> +	struct bfq_group *old_parent = bfq_group(bfqq);
> +
> +	/* No point to move bfqq to the same group */
> +	if (old_parent == bfqg)
> +		return;
>  
>  	/*
>  	 * Get extra reference to prevent bfqq from being freed in
> @@ -666,7 +671,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>  		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
>  	else if (entity->on_st_or_in_serv)
>  		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
> -	bfqg_and_blkg_put(bfqq_group(bfqq));
> +	bfqg_and_blkg_put(old_parent);
>  
>  	if (entity->parent &&
>  	    entity->parent->last_bfqq_created == bfqq)
> -- 
> 2.31.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] block, bfq: don't move oom_bfqq
  2021-12-21  3:21 ` [PATCH 3/4] block, bfq: don't move oom_bfqq Yu Kuai
@ 2021-12-21 10:18   ` Jan Kara
  0 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2021-12-21 10:18 UTC (permalink / raw)
  To: Yu Kuai
  Cc: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna,
	cgroups, linux-block, linux-kernel, yi.zhang

On Tue 21-12-21 11:21:34, Yu Kuai wrote:
> Our test report a UAF:
> 
> [ 2073.019181] ==================================================================
> [ 2073.019188] BUG: KASAN: use-after-free in __bfq_put_async_bfqq+0xa0/0x168
> [ 2073.019191] Write of size 8 at addr ffff8000ccf64128 by task rmmod/72584
> [ 2073.019192]
> [ 2073.019196] CPU: 0 PID: 72584 Comm: rmmod Kdump: loaded Not tainted 4.19.90-yk #5
> [ 2073.019198] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
> [ 2073.019200] Call trace:
> [ 2073.019203]  dump_backtrace+0x0/0x310
> [ 2073.019206]  show_stack+0x28/0x38
> [ 2073.019210]  dump_stack+0xec/0x15c
> [ 2073.019216]  print_address_description+0x68/0x2d0
> [ 2073.019220]  kasan_report+0x238/0x2f0
> [ 2073.019224]  __asan_store8+0x88/0xb0
> [ 2073.019229]  __bfq_put_async_bfqq+0xa0/0x168
> [ 2073.019233]  bfq_put_async_queues+0xbc/0x208
> [ 2073.019236]  bfq_pd_offline+0x178/0x238
> [ 2073.019240]  blkcg_deactivate_policy+0x1f0/0x420
> [ 2073.019244]  bfq_exit_queue+0x128/0x178
> [ 2073.019249]  blk_mq_exit_sched+0x12c/0x160
> [ 2073.019252]  elevator_exit+0xc8/0xd0
> [ 2073.019256]  blk_exit_queue+0x50/0x88
> [ 2073.019259]  blk_cleanup_queue+0x228/0x3d8
> [ 2073.019267]  null_del_dev+0xfc/0x1e0 [null_blk]
> [ 2073.019274]  null_exit+0x90/0x114 [null_blk]
> [ 2073.019278]  __arm64_sys_delete_module+0x358/0x5a0
> [ 2073.019282]  el0_svc_common+0xc8/0x320
> [ 2073.019287]  el0_svc_handler+0xf8/0x160
> [ 2073.019290]  el0_svc+0x10/0x218
> [ 2073.019291]
> [ 2073.019294] Allocated by task 14163:
> [ 2073.019301]  kasan_kmalloc+0xe0/0x190
> [ 2073.019305]  kmem_cache_alloc_node_trace+0x1cc/0x418
> [ 2073.019308]  bfq_pd_alloc+0x54/0x118
> [ 2073.019313]  blkcg_activate_policy+0x250/0x460
> [ 2073.019317]  bfq_create_group_hierarchy+0x38/0x110
> [ 2073.019321]  bfq_init_queue+0x6d0/0x948
> [ 2073.019325]  blk_mq_init_sched+0x1d8/0x390
> [ 2073.019330]  elevator_switch_mq+0x88/0x170
> [ 2073.019334]  elevator_switch+0x140/0x270
> [ 2073.019338]  elv_iosched_store+0x1a4/0x2a0
> [ 2073.019342]  queue_attr_store+0x90/0xe0
> [ 2073.019348]  sysfs_kf_write+0xa8/0xe8
> [ 2073.019351]  kernfs_fop_write+0x1f8/0x378
> [ 2073.019359]  __vfs_write+0xe0/0x360
> [ 2073.019363]  vfs_write+0xf0/0x270
> [ 2073.019367]  ksys_write+0xdc/0x1b8
> [ 2073.019371]  __arm64_sys_write+0x50/0x60
> [ 2073.019375]  el0_svc_common+0xc8/0x320
> [ 2073.019380]  el0_svc_handler+0xf8/0x160
> [ 2073.019383]  el0_svc+0x10/0x218
> [ 2073.019385]
> [ 2073.019387] Freed by task 72584:
> [ 2073.019391]  __kasan_slab_free+0x120/0x228
> [ 2073.019394]  kasan_slab_free+0x10/0x18
> [ 2073.019397]  kfree+0x94/0x368
> [ 2073.019400]  bfqg_put+0x64/0xb0
> [ 2073.019404]  bfqg_and_blkg_put+0x90/0xb0
> [ 2073.019408]  bfq_put_queue+0x220/0x228
> [ 2073.019413]  __bfq_put_async_bfqq+0x98/0x168
> [ 2073.019416]  bfq_put_async_queues+0xbc/0x208
> [ 2073.019420]  bfq_pd_offline+0x178/0x238
> [ 2073.019424]  blkcg_deactivate_policy+0x1f0/0x420
> [ 2073.019429]  bfq_exit_queue+0x128/0x178
> [ 2073.019433]  blk_mq_exit_sched+0x12c/0x160
> [ 2073.019437]  elevator_exit+0xc8/0xd0
> [ 2073.019440]  blk_exit_queue+0x50/0x88
> [ 2073.019443]  blk_cleanup_queue+0x228/0x3d8
> [ 2073.019451]  null_del_dev+0xfc/0x1e0 [null_blk]
> [ 2073.019459]  null_exit+0x90/0x114 [null_blk]
> [ 2073.019462]  __arm64_sys_delete_module+0x358/0x5a0
> [ 2073.019467]  el0_svc_common+0xc8/0x320
> [ 2073.019471]  el0_svc_handler+0xf8/0x160
> [ 2073.019474]  el0_svc+0x10/0x218
> [ 2073.019475]
> [ 2073.019479] The buggy address belongs to the object at ffff8000ccf63f00
>  which belongs to the cache kmalloc-1024 of size 1024
> [ 2073.019484] The buggy address is located 552 bytes inside of
>  1024-byte region [ffff8000ccf63f00, ffff8000ccf64300)
> [ 2073.019486] The buggy address belongs to the page:
> [ 2073.019492] page:ffff7e000333d800 count:1 mapcount:0 mapping:ffff8000c0003a00 index:0x0 compound_mapcount: 0
> [ 2073.020123] flags: 0x7ffff0000008100(slab|head)
> [ 2073.020403] raw: 07ffff0000008100 ffff7e0003334c08 ffff7e00001f5a08 ffff8000c0003a00
> [ 2073.020409] raw: 0000000000000000 00000000001c001c 00000001ffffffff 0000000000000000
> [ 2073.020411] page dumped because: kasan: bad access detected
> [ 2073.020412]
> [ 2073.020414] Memory state around the buggy address:
> [ 2073.020420]  ffff8000ccf64000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [ 2073.020424]  ffff8000ccf64080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [ 2073.020428] >ffff8000ccf64100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [ 2073.020430]                                   ^
> [ 2073.020434]  ffff8000ccf64180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [ 2073.020438]  ffff8000ccf64200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [ 2073.020439] ==================================================================
> 
> The same problem exist in mainline as well.
> 
> This is because oom_bfqq is moved to a non-root group, thus root_group
> is freed earlier.
> 
> Thus fix the problem by don't move oom_bfqq.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

Makes sense. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

									Honza

> ---
>  block/bfq-cgroup.c | 6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
> index 0f62546a72d4..8e8cf6b3d946 100644
> --- a/block/bfq-cgroup.c
> +++ b/block/bfq-cgroup.c
> @@ -651,6 +651,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>  	if (old_parent == bfqg)
>  		return;
>  
> +	/*
> +	 * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group
> +	 * until elevator exit.
> +	 */
> +	if (bfqq == &bfqd->oom_bfqq)
> +		return;
>  	/*
>  	 * Get extra reference to prevent bfqq from being freed in
>  	 * next possible expire or deactivate.
> -- 
> 2.31.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg
  2021-12-21 10:16   ` Jan Kara
@ 2021-12-21 11:08     ` yukuai (C)
  2021-12-21 11:52       ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: yukuai (C) @ 2021-12-21 11:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: tj, axboe, paolo.valente, fchecconi, avanzini.arianna, cgroups,
	linux-block, linux-kernel, yi.zhang

在 2021/12/21 18:16, Jan Kara 写道:
> On Tue 21-12-21 11:21:33, Yu Kuai wrote:
>> Moving bfqq to it's parent bfqg is pointless.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> 
> Did you notice that this is happening often enough that the check is worth
> it? Where do we do this?
> 

I noticed that this will happen when the root group is offlined:

bfq_pd_offline
  bfq_put_async_queues
   __bfq_put_async_bfqq
    bfq_bfqq_move

I'm not sure if there are other situations. I think bfq_bfqq_move()
is not happening often itself, thus the checking won't affect
performance.

Thanks,
Kuai
> 								Honza
> 
>> ---
>>   block/bfq-cgroup.c | 7 ++++++-
>>   1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
>> index 24a5c5329bcd..0f62546a72d4 100644
>> --- a/block/bfq-cgroup.c
>> +++ b/block/bfq-cgroup.c
>> @@ -645,6 +645,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>>   		   struct bfq_group *bfqg)
>>   {
>>   	struct bfq_entity *entity = &bfqq->entity;
>> +	struct bfq_group *old_parent = bfq_group(bfqq);
>> +
>> +	/* No point to move bfqq to the same group */
>> +	if (old_parent == bfqg)
>> +		return;
>>   
>>   	/*
>>   	 * Get extra reference to prevent bfqq from being freed in
>> @@ -666,7 +671,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>>   		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
>>   	else if (entity->on_st_or_in_serv)
>>   		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
>> -	bfqg_and_blkg_put(bfqq_group(bfqq));
>> +	bfqg_and_blkg_put(old_parent);
>>   
>>   	if (entity->parent &&
>>   	    entity->parent->last_bfqq_created == bfqq)
>> -- 
>> 2.31.1
>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-21  3:21 ` [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move() Yu Kuai
@ 2021-12-21 11:50   ` Jan Kara
  2021-12-22  3:12     ` yukuai (C)
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2021-12-21 11:50 UTC (permalink / raw)
  To: Yu Kuai
  Cc: tj, axboe, paolo.valente, jack, fchecconi, avanzini.arianna,
	cgroups, linux-block, linux-kernel, yi.zhang

On Tue 21-12-21 11:21:35, Yu Kuai wrote:
> During code review, we found that if bfqq is not busy in
> bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
> thus bfqq->pos_root still points to the old bfqg. However, the ref
> that bfqq hold for the old bfqg will be released, so it's possible
> that the old bfqg can be freed. This is problematic because the freed
> bfqg can still be accessed by bfqq->pos_root.
> 
> Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
> as well.
> 
> Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

I'm just wondering, how can it happen that !bfq_bfqq_busy() queue is in
pos_tree? Because bfq_remove_request() takes care to remove bfqq from the
pos_tree...

								Honza

> ---
>  block/bfq-cgroup.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
> index 8e8cf6b3d946..822dd28ecf53 100644
> --- a/block/bfq-cgroup.c
> +++ b/block/bfq-cgroup.c
> @@ -677,7 +677,6 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>  		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
>  	else if (entity->on_st_or_in_serv)
>  		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
> -	bfqg_and_blkg_put(old_parent);
>  
>  	if (entity->parent &&
>  	    entity->parent->last_bfqq_created == bfqq)
> @@ -690,11 +689,16 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>  	/* pin down bfqg and its associated blkg  */
>  	bfqg_and_blkg_get(bfqg);
>  
> -	if (bfq_bfqq_busy(bfqq)) {
> -		if (unlikely(!bfqd->nonrot_with_queueing))
> -			bfq_pos_tree_add_move(bfqd, bfqq);
> +	/*
> +	 * Don't leave the pos_root to old bfqg, since the ref to old bfqg will
> +	 * be released and the bfqg might be freed.
> +	 */
> +	if (unlikely(!bfqd->nonrot_with_queueing))
> +		bfq_pos_tree_add_move(bfqd, bfqq);
> +	bfqg_and_blkg_put(old_parent);
> +
> +	if (bfq_bfqq_busy(bfqq))
>  		bfq_activate_bfqq(bfqd, bfqq);
> -	}
>  
>  	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
>  		bfq_schedule_dispatch(bfqd);
> -- 
> 2.31.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg
  2021-12-21 11:08     ` yukuai (C)
@ 2021-12-21 11:52       ` Jan Kara
  0 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2021-12-21 11:52 UTC (permalink / raw)
  To: yukuai (C)
  Cc: Jan Kara, tj, axboe, paolo.valente, fchecconi, avanzini.arianna,
	cgroups, linux-block, linux-kernel, yi.zhang

On Tue 21-12-21 19:08:44, yukuai (C) wrote:
> 在 2021/12/21 18:16, Jan Kara 写道:
> > On Tue 21-12-21 11:21:33, Yu Kuai wrote:
> > > Moving bfqq to it's parent bfqg is pointless.
> > > 
> > > Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> > 
> > Did you notice that this is happening often enough that the check is worth
> > it? Where do we do this?
> > 
> 
> I noticed that this will happend when root group is offlined:
> 
> bfq_pd_offline
>  bfq_put_async_queues
>   __bfq_put_async_bfqq
>    bfq_bfqq_move
> 
> I'm not sure if there are other situations. I think bfq_bfqq_move()
> is not happening often itself, thus the checking won't affect
> performance.

Yeah, OK, I was just wondering. I guess there's no harm in doing this
check. Maybe add a comment that this can sometimes happen when dealing with
the root cgroup. And then feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-21 11:50   ` Jan Kara
@ 2021-12-22  3:12     ` yukuai (C)
  2021-12-22 14:17       ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: yukuai (C) @ 2021-12-22  3:12 UTC (permalink / raw)
  To: Jan Kara
  Cc: tj, axboe, paolo.valente, fchecconi, avanzini.arianna, cgroups,
	linux-block, linux-kernel, yi.zhang

在 2021/12/21 19:50, Jan Kara 写道:
> On Tue 21-12-21 11:21:35, Yu Kuai wrote:
>> During code review, we found that if bfqq is not busy in
>> bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
>> thus bfqq->pos_root still points to the old bfqg. However, the ref
>> that bfqq hold for the old bfqg will be released, so it's possible
>> that the old bfqg can be freed. This is problematic because the freed
>> bfqg can still be accessed by bfqq->pos_root.
>>
>> Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
>> as well.
>>
>> Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> 
> I'm just wondering, how can it happen that !bfq_bfqq_busy() queue is in
> pos_tree? Because bfq_remove_request() takes care to remove bfqq from the
> pos_tree...

Hi,

You're right that this is not a problem in the common case. The problem seems to
relate to queue merging and task migration, because I once reproduced
it with the same reproducer as for the problem that an offlined bfqg can be
inserted into the service tree. The UAF is exactly in
bfq_remove_request->rb_erase(). However, I didn't save the stack...

I guess this is because bfq_del_bfqq_busy() is called from
bfq_release_process_ref(), and queue merging prevents such a bfqq from being
freed; thus such a bfqq is not in the service tree, and its pos_root can
point to the old bfqg after bfq_bic_update_cgroup->bfq_bfqq_move.

I haven't confirmed this; however, this patch itself only clears
bfqq->pos_root for an idle bfqq, so there should be no harm.

Thanks,
Kuai
> 
> 								Honza
> 
>> ---
>>   block/bfq-cgroup.c | 14 +++++++++-----
>>   1 file changed, 9 insertions(+), 5 deletions(-)
>>
>> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
>> index 8e8cf6b3d946..822dd28ecf53 100644
>> --- a/block/bfq-cgroup.c
>> +++ b/block/bfq-cgroup.c
>> @@ -677,7 +677,6 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>>   		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
>>   	else if (entity->on_st_or_in_serv)
>>   		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
>> -	bfqg_and_blkg_put(old_parent);
>>   
>>   	if (entity->parent &&
>>   	    entity->parent->last_bfqq_created == bfqq)
>> @@ -690,11 +689,16 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
>>   	/* pin down bfqg and its associated blkg  */
>>   	bfqg_and_blkg_get(bfqg);
>>   
>> -	if (bfq_bfqq_busy(bfqq)) {
>> -		if (unlikely(!bfqd->nonrot_with_queueing))
>> -			bfq_pos_tree_add_move(bfqd, bfqq);
>> +	/*
>> +	 * Don't leave the pos_root to old bfqg, since the ref to old bfqg will
>> +	 * be released and the bfqg might be freed.
>> +	 */
>> +	if (unlikely(!bfqd->nonrot_with_queueing))
>> +		bfq_pos_tree_add_move(bfqd, bfqq);
>> +	bfqg_and_blkg_put(old_parent);
>> +
>> +	if (bfq_bfqq_busy(bfqq))
>>   		bfq_activate_bfqq(bfqd, bfqq);
>> -	}
>>   
>>   	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
>>   		bfq_schedule_dispatch(bfqd);
>> -- 
>> 2.31.1
>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-22  3:12     ` yukuai (C)
@ 2021-12-22 14:17       ` Jan Kara
  2021-12-23  1:03         ` yukuai (C)
  2021-12-25  1:19         ` yukuai (C)
  0 siblings, 2 replies; 18+ messages in thread
From: Jan Kara @ 2021-12-22 14:17 UTC (permalink / raw)
  To: yukuai (C)
  Cc: Jan Kara, tj, axboe, paolo.valente, fchecconi, avanzini.arianna,
	cgroups, linux-block, linux-kernel, yi.zhang

On Wed 22-12-21 11:12:45, yukuai (C) wrote:
> 在 2021/12/21 19:50, Jan Kara 写道:
> > On Tue 21-12-21 11:21:35, Yu Kuai wrote:
> > > During code review, we found that if bfqq is not busy in
> > > bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
> > > thus bfqq->pos_root still points to the old bfqg. However, the ref
> > > that bfqq hold for the old bfqg will be released, so it's possible
> > > that the old bfqg can be freed. This is problematic because the freed
> > > bfqg can still be accessed by bfqq->pos_root.
> > > 
> > > Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
> > > as well.
> > > 
> > > Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
> > > Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> > 
> > I'm just wondering, how can it happen that !bfq_bfqq_busy() queue is in
> > pos_tree? Because bfq_remove_request() takes care to remove bfqq from the
> > pos_tree...
> 
> Hi,
> 
> It's right this is not a problem in common case. The problem seems to
> relate to queue merging and task migration. Because I once reporduced
> it with the same reporducer for the problem that offlined bfqg can be
> inserted into service tree. The uaf is exactly in
> bfq_remove_request->rb_rease(). However I didn't save the stack...
> 
> I guess this is because bfq_del_bfqq_busy() is called from
> bfq_release_process_ref(), and queue merging prevert sunch bfqq to be
> freed, thus such bfqq is not in service tree, and it's pos_root can
> point to the old bfqg after bfq_bic_update_cgroup->bfq_bfqq_move.
> 
> I haven't confirmed this, however, this patch itself only cleared
> bfqq->pos_root for idle bfqq, there should be no harm.

Well, I agree this patch does no harm but in my opinion it is just papering
over the real problem which is that we leave bfqq without any request in
the pos_tree which can have also other unexpected consequences. I don't
think your scenario with bfq_release_process_ref() calling
bfq_del_bfqq_busy() really answers my question because we call
bfq_del_bfqq_busy() only if RB_EMPTY_ROOT(&bfqq->sort_list) (i.e., bfqq has
no requests) and when sort_list was becoming empty, bfq_remove_request()
should have removed bfqq from the pos_tree. So I think proper fix lies
elsewhere and I would not merge this patch until we better understand what
is happening in this case.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-22 14:17       ` Jan Kara
@ 2021-12-23  1:03         ` yukuai (C)
  2021-12-25  1:19         ` yukuai (C)
  1 sibling, 0 replies; 18+ messages in thread
From: yukuai (C) @ 2021-12-23  1:03 UTC (permalink / raw)
  To: Jan Kara
  Cc: tj, axboe, paolo.valente, fchecconi, avanzini.arianna, cgroups,
	linux-block, linux-kernel, yi.zhang

在 2021/12/22 22:17, Jan Kara 写道:
> On Wed 22-12-21 11:12:45, yukuai (C) wrote:
>> 在 2021/12/21 19:50, Jan Kara 写道:
>>> On Tue 21-12-21 11:21:35, Yu Kuai wrote:
>>>> During code review, we found that if bfqq is not busy in
>>>> bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
>>>> thus bfqq->pos_root still points to the old bfqg. However, the ref
>>>> that bfqq hold for the old bfqg will be released, so it's possible
>>>> that the old bfqg can be freed. This is problematic because the freed
>>>> bfqg can still be accessed by bfqq->pos_root.
>>>>
>>>> Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
>>>> as well.
>>>>
>>>> Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>>
>>> I'm just wondering, how can it happen that !bfq_bfqq_busy() queue is in
>>> pos_tree? Because bfq_remove_request() takes care to remove bfqq from the
>>> pos_tree...
>>
>> Hi,
>>
>> It's right this is not a problem in common case. The problem seems to
>> relate to queue merging and task migration. Because I once reporduced
>> it with the same reporducer for the problem that offlined bfqg can be
>> inserted into service tree. The uaf is exactly in
>> bfq_remove_request->rb_rease(). However I didn't save the stack...
>>
>> I guess this is because bfq_del_bfqq_busy() is called from
>> bfq_release_process_ref(), and queue merging prevert sunch bfqq to be
>> freed, thus such bfqq is not in service tree, and it's pos_root can
>> point to the old bfqg after bfq_bic_update_cgroup->bfq_bfqq_move.
>>
>> I haven't confirmed this, however, this patch itself only cleared
>> bfqq->pos_root for idle bfqq, there should be no harm.
> 
> Well, I agree this patch does no harm but in my opinion it is just papering
> over the real problem which is that we leave bfqq without any request in
> the pos_tree which can have also other unexpected consequences. I don't
> think your scenario with bfq_release_process_ref() calling
> bfq_del_bfqq_busy() really answers my question because we call
> bfq_del_bfqq_busy() only if RB_EMPTY_ROOT(&bfqq->sort_list) (i.e., bfqq has
> no requests) and when sort_list was becoming empty, bfq_remove_request()
> should have removed bfqq from the pos_tree. So I think proper fix lies
> elsewhere and I would not merge this patch until we better understand what
> is happening in this case.

Hi,

I'll try to reproduce the UAF and take a look at it.

Thanks,
Kuai
> 
> 								Honza
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-22 14:17       ` Jan Kara
  2021-12-23  1:03         ` yukuai (C)
@ 2021-12-25  1:19         ` yukuai (C)
  2021-12-25  7:44           ` yukuai (C)
  1 sibling, 1 reply; 18+ messages in thread
From: yukuai (C) @ 2021-12-25  1:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: tj, axboe, paolo.valente, fchecconi, avanzini.arianna, cgroups,
	linux-block, linux-kernel, yi.zhang

在 2021/12/22 22:17, Jan Kara 写道:
> On Wed 22-12-21 11:12:45, yukuai (C) wrote:
>> 在 2021/12/21 19:50, Jan Kara 写道:
>>> On Tue 21-12-21 11:21:35, Yu Kuai wrote:
>>>> During code review, we found that if bfqq is not busy in
>>>> bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
>>>> thus bfqq->pos_root still points to the old bfqg. However, the ref
>>>> that bfqq hold for the old bfqg will be released, so it's possible
>>>> that the old bfqg can be freed. This is problematic because the freed
>>>> bfqg can still be accessed by bfqq->pos_root.
>>>>
>>>> Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
>>>> as well.
>>>>
>>>> Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>>
>>> I'm just wondering, how can it happen that !bfq_bfqq_busy() queue is in
>>> pos_tree? Because bfq_remove_request() takes care to remove bfqq from the
>>> pos_tree...
>>
>> Hi,
>>
>> It's right this is not a problem in common case. The problem seems to
>> relate to queue merging and task migration. Because I once reporduced
>> it with the same reporducer for the problem that offlined bfqg can be
>> inserted into service tree. The uaf is exactly in
>> bfq_remove_request->rb_rease(). However I didn't save the stack...
>>
>> I guess this is because bfq_del_bfqq_busy() is called from
>> bfq_release_process_ref(), and queue merging prevert sunch bfqq to be
>> freed, thus such bfqq is not in service tree, and it's pos_root can
>> point to the old bfqg after bfq_bic_update_cgroup->bfq_bfqq_move.
>>
>> I haven't confirmed this, however, this patch itself only cleared
>> bfqq->pos_root for idle bfqq, there should be no harm.
> 
> Well, I agree this patch does no harm but in my opinion it is just papering
> over the real problem which is that we leave bfqq without any request in
> the pos_tree which can have also other unexpected consequences. I don't
> think your scenario with bfq_release_process_ref() calling
> bfq_del_bfqq_busy() really answers my question because we call
> bfq_del_bfqq_busy() only if RB_EMPTY_ROOT(&bfqq->sort_list) (i.e., bfqq has
> no requests) and when sort_list was becoming empty, bfq_remove_request()
> should have removed bfqq from the pos_tree. So I think proper fix lies
> elsewhere and I would not merge this patch until we better understand what
> is happening in this case.
> 

Hi,

I reproduced this problem on v4.19; here is the stack:

[34094.992162] 
==================================================================
[34094.993121] BUG: KASAN: use-after-free in rb_erase+0x4e0/0x8c0
[34094.993121] Write of size 8 at addr ffff888126528258 by task 
kworker/3:1H/554
[34094.993121]
[34094.993121] CPU: 3 PID: 554 Comm: kworker/3:1H Not tainted 4.19.90+ #2
[34094.993121] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
BIOS ?-20190727_073836-4
[34094.993121] Workqueue: kblockd blk_mq_run_work_fn
[34094.993121] Call Trace:
[34094.993121]  dump_stack+0x76/0xa0
[34094.993121]  print_address_description+0x6c/0x237
[34094.993121]  ? rb_erase+0x4e0/0x8c0
[34094.993121]  kasan_report.cold+0x88/0x2a0
[34094.993121]  rb_erase+0x4e0/0x8c0
[34094.993121]  bfq_remove_request+0x239/0x4c0
[34094.993121]  bfq_dispatch_request+0x658/0x17b0
[34094.993121]  blk_mq_do_dispatch_sched+0x183/0x220
[34094.993121]  ? blk_mq_sched_free_hctx_data+0xe0/0xe0
[34094.993121]  ? __switch_to+0x3b2/0x6c0
[34094.993121]  blk_mq_sched_dispatch_requests+0x2ac/0x310
[34094.993121]  ? finish_task_switch+0xa4/0x370
[34094.993121]  ? dequeue_task_fair+0x216/0x360
[34094.993121]  ? blk_mq_sched_restart+0x40/0x40
[34094.993121]  ? __schedule+0x588/0xc10
[34094.993121]  __blk_mq_run_hw_queue+0x82/0x140
[34094.993121]  process_one_work+0x39d/0x770
[34094.993121]  worker_thread+0x78/0x5c0
[34094.993121]  ? process_one_work+0x770/0x770
[34094.993121]  kthread+0x1af/0x1d0
[34094.993121]  ? kthread_create_worker_on_cpu+0xd0/0xd0
[34094.993121]  ret_from_fork+0x1f/0x30
[34094.993121]
[34094.993121] Allocated by task 19184:
[34094.993121]  kasan_kmalloc+0xc2/0xe0
[34094.993121]  kmem_cache_alloc_node_trace+0xf9/0x220
[34094.993121]  bfq_pd_alloc+0x4c/0x510
[34094.993121]  blkg_alloc+0x237/0x310
[34094.993121]  blkg_create+0x499/0x5f0
[34094.993121]  blkg_lookup_create+0x140/0x1b0
[34094.993121]  generic_make_request_checks+0x5ce/0xad0
[34094.993121]  generic_make_request+0xd9/0x6b0
[34094.993121]  submit_bio+0xa6/0x240
[34094.993121]  mpage_readpages+0x29e/0x3b0
[34094.993121]  read_pages+0xdf/0x3a0
[34094.993121]  __do_page_cache_readahead+0x278/0x290
[34094.993121]  ondemand_readahead+0x275/0x460
[34094.993121]  generic_file_read_iter+0xc4a/0x1790
[34094.993121]  blkdev_read_iter+0x8c/0xc0
[34094.993121]  aio_read+0x174/0x260
[34094.993121]  io_submit_one+0x7d3/0x14b0
[34094.993121]  __x64_sys_io_submit+0xfe/0x230
[34094.993121]  do_syscall_64+0x6f/0x280
[34094.993121]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[34094.993121]
[34094.993121] Freed by task 9:
[34094.993121]  __kasan_slab_free+0x12f/0x180
[34094.993121]  kfree+0x92/0x1b0
[34094.993121]  blkg_free.part.0+0x4a/0xe0
[34094.993121]  rcu_process_callbacks+0x420/0x6c0
[34094.993121]  __do_softirq+0x109/0x36c
[34094.993121]
[34094.993121] The buggy address belongs to the object at ffff888126528000
[34094.993121]  which belongs to the cache kmalloc-2048 of size 2048
[34094.993121] The buggy address is located 600 bytes inside of
[34094.993121]  2048-byte region [ffff888126528000, ffff888126528800)
[34094.993121] The buggy address belongs to the page:
[34094.993121] page:ffffea0004994a00 count:1 mapcount:0 
mapping:ffff88810000e800 index:0xffff0
[34094.993121] flags: 0x17ffffc0008100(slab|head)
[34094.993121] raw: 0017ffffc0008100 dead000000000100 dead000000000200 
ffff88810000e800
[34094.993121] raw: ffff88812652c400 00000000800f0009 00000001ffffffff 
0000000000000000
[34094.993121] page dumped because: kasan: bad access detected
[34094.993121]
[34094.993121] Memory state around the buggy address:
[34094.993121]  ffff888126528100: fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb
[34094.993121]  ffff888126528180: fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb
[34094.993121] >ffff888126528200: fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb
[34094.993121]                                                     ^
[34094.993121]  ffff888126528280: fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb
[34094.993121]  ffff888126528300: fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb
[34094.993121] 
==================================================================

I'll try to figure out the root cause; in the meantime, feel free to
chime in if you have any thoughts.

Thanks,
Kuai
> 								Honza
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-25  1:19         ` yukuai (C)
@ 2021-12-25  7:44           ` yukuai (C)
  2022-01-03 15:37             ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: yukuai (C) @ 2021-12-25  7:44 UTC (permalink / raw)
  To: Jan Kara
  Cc: tj, axboe, paolo.valente, fchecconi, avanzini.arianna, cgroups,
	linux-block, linux-kernel, yi.zhang

在 2021/12/25 9:19, yukuai (C) 写道:
> 在 2021/12/22 22:17, Jan Kara 写道:
>> On Wed 22-12-21 11:12:45, yukuai (C) wrote:
>>> 在 2021/12/21 19:50, Jan Kara 写道:
>>>> On Tue 21-12-21 11:21:35, Yu Kuai wrote:
>>>>> During code review, we found that if bfqq is not busy in
>>>>> bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
>>>>> thus bfqq->pos_root still points to the old bfqg. However, the ref
>>>>> that bfqq hold for the old bfqg will be released, so it's possible
>>>>> that the old bfqg can be freed. This is problematic because the freed
>>>>> bfqg can still be accessed by bfqq->pos_root.
>>>>>
>>>>> Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
>>>>> as well.
>>>>>
>>>>> Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling 
>>>>> and cgroups support")
>>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>>>
>>>> I'm just wondering, how can it happen that !bfq_bfqq_busy() queue is in
>>>> pos_tree? Because bfq_remove_request() takes care to remove bfqq 
>>>> from the
>>>> pos_tree...
>>>
>>> Hi,
>>>
>>> It's right this is not a problem in common case. The problem seems to
>>> relate to queue merging and task migration. Because I once reporduced
>>> it with the same reporducer for the problem that offlined bfqg can be
>>> inserted into service tree. The uaf is exactly in
>>> bfq_remove_request->rb_rease(). However I didn't save the stack...
>>>
>>> I guess this is because bfq_del_bfqq_busy() is called from
>>> bfq_release_process_ref(), and queue merging prevert sunch bfqq to be
>>> freed, thus such bfqq is not in service tree, and it's pos_root can
>>> point to the old bfqg after bfq_bic_update_cgroup->bfq_bfqq_move.
>>>
>>> I haven't confirmed this, however, this patch itself only cleared
>>> bfqq->pos_root for idle bfqq, there should be no harm.
>>
>> Well, I agree this patch does no harm but in my opinion it is just 
>> papering
>> over the real problem which is that we leave bfqq without any request in
>> the pos_tree which can have also other unexpected consequences. I don't
>> think your scenario with bfq_release_process_ref() calling
>> bfq_del_bfqq_busy() really answers my question because we call
>> bfq_del_bfqq_busy() only if RB_EMPTY_ROOT(&bfqq->sort_list) (i.e., 
>> bfqq has
>> no requests) and when sort_list was becoming empty, bfq_remove_request()
>> should have removed bfqq from the pos_tree. So I think proper fix lies
>> elsewhere and I would not merge this patch until we better understand 
>> what
>> is happening in this case.
>>
> 
> Hi,
> 
> I reporduced this problem on v4.19, here is the stack:
> 
> [34094.992162] 
> ==================================================================
> [34094.993121] BUG: KASAN: use-after-free in rb_erase+0x4e0/0x8c0
> [34094.993121] Write of size 8 at addr ffff888126528258 by task 
> kworker/3:1H/554
> [34094.993121]
> [34094.993121] CPU: 3 PID: 554 Comm: kworker/3:1H Not tainted 4.19.90+ #2
> [34094.993121] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
> BIOS ?-20190727_073836-4
> [34094.993121] Workqueue: kblockd blk_mq_run_work_fn
> [34094.993121] Call Trace:
> [34094.993121]  dump_stack+0x76/0xa0
> [34094.993121]  print_address_description+0x6c/0x237
> [34094.993121]  ? rb_erase+0x4e0/0x8c0
> [34094.993121]  kasan_report.cold+0x88/0x2a0
> [34094.993121]  rb_erase+0x4e0/0x8c0
> [34094.993121]  bfq_remove_request+0x239/0x4c0
> [34094.993121]  bfq_dispatch_request+0x658/0x17b0
> [34094.993121]  blk_mq_do_dispatch_sched+0x183/0x220
> [34094.993121]  ? blk_mq_sched_free_hctx_data+0xe0/0xe0
> [34094.993121]  ? __switch_to+0x3b2/0x6c0
> [34094.993121]  blk_mq_sched_dispatch_requests+0x2ac/0x310
> [34094.993121]  ? finish_task_switch+0xa4/0x370
> [34094.993121]  ? dequeue_task_fair+0x216/0x360
> [34094.993121]  ? blk_mq_sched_restart+0x40/0x40
> [34094.993121]  ? __schedule+0x588/0xc10
> [34094.993121]  __blk_mq_run_hw_queue+0x82/0x140
> [34094.993121]  process_one_work+0x39d/0x770
> [34094.993121]  worker_thread+0x78/0x5c0
> [34094.993121]  ? process_one_work+0x770/0x770
> [34094.993121]  kthread+0x1af/0x1d0
> [34094.993121]  ? kthread_create_worker_on_cpu+0xd0/0xd0
> [34094.993121]  ret_from_fork+0x1f/0x30
> [34094.993121]
> [34094.993121] Allocated by task 19184:
> [34094.993121]  kasan_kmalloc+0xc2/0xe0
> [34094.993121]  kmem_cache_alloc_node_trace+0xf9/0x220
> [34094.993121]  bfq_pd_alloc+0x4c/0x510
> [34094.993121]  blkg_alloc+0x237/0x310
> [34094.993121]  blkg_create+0x499/0x5f0
> [34094.993121]  blkg_lookup_create+0x140/0x1b0
> [34094.993121]  generic_make_request_checks+0x5ce/0xad0
> [34094.993121]  generic_make_request+0xd9/0x6b0
> [34094.993121]  submit_bio+0xa6/0x240
> [34094.993121]  mpage_readpages+0x29e/0x3b0
> [34094.993121]  read_pages+0xdf/0x3a0
> [34094.993121]  __do_page_cache_readahead+0x278/0x290
> [34094.993121]  ondemand_readahead+0x275/0x460
> [34094.993121]  generic_file_read_iter+0xc4a/0x1790
> [34094.993121]  blkdev_read_iter+0x8c/0xc0
> [34094.993121]  aio_read+0x174/0x260
> [34094.993121]  io_submit_one+0x7d3/0x14b0
> [34094.993121]  __x64_sys_io_submit+0xfe/0x230
> [34094.993121]  do_syscall_64+0x6f/0x280
> [34094.993121]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [34094.993121]
> [34094.993121] Freed by task 9:
> [34094.993121]  __kasan_slab_free+0x12f/0x180
> [34094.993121]  kfree+0x92/0x1b0
> [34094.993121]  blkg_free.part.0+0x4a/0xe0
> [34094.993121]  rcu_process_callbacks+0x420/0x6c0
> [34094.993121]  __do_softirq+0x109/0x36c
> [34094.993121]
> [34094.993121] The buggy address belongs to the object at ffff888126528000
> [34094.993121]  which belongs to the cache kmalloc-2048 of size 2048
> [34094.993121] The buggy address is located 600 bytes inside of
> [34094.993121]  2048-byte region [ffff888126528000, ffff888126528800)
> [34094.993121] The buggy address belongs to the page:
> [34094.993121] page:ffffea0004994a00 count:1 mapcount:0 
> mapping:ffff88810000e800 index:0xffff0
> [34094.993121] flags: 0x17ffffc0008100(slab|head)
> [34094.993121] raw: 0017ffffc0008100 dead000000000100 dead000000000200 
> ffff88810000e800
> [34094.993121] raw: ffff88812652c400 00000000800f0009 00000001ffffffff 
> 0000000000000000
> [34094.993121] page dumped because: kasan: bad access detected
> [34094.993121]
> [34094.993121] Memory state around the buggy address:
> [34094.993121]  ffff888126528100: fb fb fb fb fb fb fb fb fb fb fb fb fb 
> fb fb fb
> [34094.993121]  ffff888126528180: fb fb fb fb fb fb fb fb fb fb fb fb fb 
> fb fb fb
> [34094.993121] >ffff888126528200: fb fb fb fb fb fb fb fb fb fb fb fb fb 
> fb fb fb
> [34094.993121]                                                     ^
> [34094.993121]  ffff888126528280: fb fb fb fb fb fb fb fb fb fb fb fb fb 
> fb fb fb
> [34094.993121]  ffff888126528300: fb fb fb fb fb fb fb fb fb fb fb fb fb 
> fb fb fb
> [34094.993121] 
> ==================================================================
> 
> I'll try to figure out the root cause, in the meantime, feel free to
> kick around if you have any througts.
> 
> Thansk,
> Kuai
>>                                 Honza
>>

Hi,

I finally figured out the root cause... This is introduced by a temporary
fix for the problem that an offlined bfqg is reinserted into the service tree.

The temporary fix is as follow:

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..ee1933cd9a43 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -935,6 +935,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)

  put_async_queues:
         bfq_put_async_queues(bfqd, bfqg);
+       pd->plid = BLKCG_MAX_POLS;

         spin_unlock_irqrestore(&bfqd->lock, flags);
         /*
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index b74cc0da118e..fa2062244805 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1692,6 +1692,15 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, 
struct bfq_queue *bfqq,
   */
  void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       /* If parent group is offlined, move the bfqq to root group */
+       if (bfqq->entity.parent) {
+               struct bfq_group *bfqg = bfq_bfqq_to_bfqg(bfqq);
+
+               if (bfqg->pd.plid >= BLKCG_MAX_POLS)
+                       bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
+       }
+#endif
         bfq_log_bfqq(bfqd, bfqq, "add to busy");

I added bfq_bfqq_move() here before bfq_mark_bfqq_busy(), which causes
the problem...

Thanks,
Kuai

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2021-12-25  7:44           ` yukuai (C)
@ 2022-01-03 15:37             ` Jan Kara
  2022-01-07  1:04               ` yukuai (C)
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2022-01-03 15:37 UTC (permalink / raw)
  To: yukuai (C)
  Cc: Jan Kara, tj, axboe, paolo.valente, fchecconi, avanzini.arianna,
	cgroups, linux-block, linux-kernel, yi.zhang

On Sat 25-12-21 15:44:54, yukuai (C) wrote:
> 在 2021/12/25 9:19, yukuai (C) 写道:
> > 在 2021/12/22 22:17, Jan Kara 写道:
> > > On Wed 22-12-21 11:12:45, yukuai (C) wrote:
> > > > 在 2021/12/21 19:50, Jan Kara 写道:
> > > > > On Tue 21-12-21 11:21:35, Yu Kuai wrote:
> > > > > > During code review, we found that if bfqq is not busy in
> > > > > > bfq_bfqq_move(), bfq_pos_tree_add_move() won't be called for the bfqq,
> > > > > > thus bfqq->pos_root still points to the old bfqg. However, the ref
> > > > > > that bfqq hold for the old bfqg will be released, so it's possible
> > > > > > that the old bfqg can be freed. This is problematic because the freed
> > > > > > bfqg can still be accessed by bfqq->pos_root.
> > > > > > 
> > > > > > Fix the problem by calling bfq_pos_tree_add_move() for idle bfqq
> > > > > > as well.
> > > > > > 
> > > > > > Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical
> > > > > > scheduling and cgroups support")
> > > > > > Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> > > > > 
> > > > > I'm just wondering, how can it happen that !bfq_bfqq_busy() queue is in
> > > > > pos_tree? Because bfq_remove_request() takes care to remove
> > > > > bfqq from the
> > > > > pos_tree...
> > > > 
> > > > Hi,
> > > > 
> > > > It's right this is not a problem in common case. The problem seems to
> > > > relate to queue merging and task migration. Because I once reproduced
> > > > it with the same reproducer for the problem that offlined bfqg can be
> > > > inserted into service tree. The uaf is exactly in
> > > > bfq_remove_request->rb_erase(). However I didn't save the stack...
> > > > 
> > > > I guess this is because bfq_del_bfqq_busy() is called from
> > > > bfq_release_process_ref(), and queue merging prevents such bfqq from being
> > > > freed, thus such bfqq is not in service tree, and its pos_root can
> > > > point to the old bfqg after bfq_bic_update_cgroup->bfq_bfqq_move.
> > > > 
> > > > I haven't confirmed this, however, this patch itself only cleared
> > > > bfqq->pos_root for idle bfqq, there should be no harm.
> > > 
> > > Well, I agree this patch does no harm but in my opinion it is just
> > > papering
> > > over the real problem which is that we leave bfqq without any request in
> > > the pos_tree which can have also other unexpected consequences. I don't
> > > think your scenario with bfq_release_process_ref() calling
> > > bfq_del_bfqq_busy() really answers my question because we call
> > > bfq_del_bfqq_busy() only if RB_EMPTY_ROOT(&bfqq->sort_list) (i.e.,
> > > bfqq has
> > > no requests) and when sort_list was becoming empty, bfq_remove_request()
> > > should have removed bfqq from the pos_tree. So I think proper fix lies
> > > elsewhere and I would not merge this patch until we better
> > > understand what
> > > is happening in this case.
> > > 
> > 
> > Hi,
> > 
> > I reproduced this problem on v4.19, here is the stack:
> > 
> > [34094.992162]
> > ==================================================================
> > [34094.993121] BUG: KASAN: use-after-free in rb_erase+0x4e0/0x8c0
> > [34094.993121] Write of size 8 at addr ffff888126528258 by task
> > kworker/3:1H/554
> > [34094.993121]
> > [34094.993121] CPU: 3 PID: 554 Comm: kworker/3:1H Not tainted 4.19.90+ #2
> > [34094.993121] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> > BIOS ?-20190727_073836-4
> > [34094.993121] Workqueue: kblockd blk_mq_run_work_fn
> > [34094.993121] Call Trace:
> > [34094.993121]  dump_stack+0x76/0xa0
> > [34094.993121]  print_address_description+0x6c/0x237
> > [34094.993121]  ? rb_erase+0x4e0/0x8c0
> > [34094.993121]  kasan_report.cold+0x88/0x2a0
> > [34094.993121]  rb_erase+0x4e0/0x8c0
> > [34094.993121]  bfq_remove_request+0x239/0x4c0
> > [34094.993121]  bfq_dispatch_request+0x658/0x17b0
> > [34094.993121]  blk_mq_do_dispatch_sched+0x183/0x220
> > [34094.993121]  ? blk_mq_sched_free_hctx_data+0xe0/0xe0
> > [34094.993121]  ? __switch_to+0x3b2/0x6c0
> > [34094.993121]  blk_mq_sched_dispatch_requests+0x2ac/0x310
> > [34094.993121]  ? finish_task_switch+0xa4/0x370
> > [34094.993121]  ? dequeue_task_fair+0x216/0x360
> > [34094.993121]  ? blk_mq_sched_restart+0x40/0x40
> > [34094.993121]  ? __schedule+0x588/0xc10
> > [34094.993121]  __blk_mq_run_hw_queue+0x82/0x140
> > [34094.993121]  process_one_work+0x39d/0x770
> > [34094.993121]  worker_thread+0x78/0x5c0
> > [34094.993121]  ? process_one_work+0x770/0x770
> > [34094.993121]  kthread+0x1af/0x1d0
> > [34094.993121]  ? kthread_create_worker_on_cpu+0xd0/0xd0
> > [34094.993121]  ret_from_fork+0x1f/0x30
> > [34094.993121]
> > [34094.993121] Allocated by task 19184:
> > [34094.993121]  kasan_kmalloc+0xc2/0xe0
> > [34094.993121]  kmem_cache_alloc_node_trace+0xf9/0x220
> > [34094.993121]  bfq_pd_alloc+0x4c/0x510
> > [34094.993121]  blkg_alloc+0x237/0x310
> > [34094.993121]  blkg_create+0x499/0x5f0
> > [34094.993121]  blkg_lookup_create+0x140/0x1b0
> > [34094.993121]  generic_make_request_checks+0x5ce/0xad0
> > [34094.993121]  generic_make_request+0xd9/0x6b0
> > [34094.993121]  submit_bio+0xa6/0x240
> > [34094.993121]  mpage_readpages+0x29e/0x3b0
> > [34094.993121]  read_pages+0xdf/0x3a0
> > [34094.993121]  __do_page_cache_readahead+0x278/0x290
> > [34094.993121]  ondemand_readahead+0x275/0x460
> > [34094.993121]  generic_file_read_iter+0xc4a/0x1790
> > [34094.993121]  blkdev_read_iter+0x8c/0xc0
> > [34094.993121]  aio_read+0x174/0x260
> > [34094.993121]  io_submit_one+0x7d3/0x14b0
> > [34094.993121]  __x64_sys_io_submit+0xfe/0x230
> > [34094.993121]  do_syscall_64+0x6f/0x280
> > [34094.993121]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > [34094.993121]
> > [34094.993121] Freed by task 9:
> > [34094.993121]  __kasan_slab_free+0x12f/0x180
> > [34094.993121]  kfree+0x92/0x1b0
> > [34094.993121]  blkg_free.part.0+0x4a/0xe0
> > [34094.993121]  rcu_process_callbacks+0x420/0x6c0
> > [34094.993121]  __do_softirq+0x109/0x36c
> > [34094.993121]
> > [34094.993121] The buggy address belongs to the object at ffff888126528000
> > [34094.993121]  which belongs to the cache kmalloc-2048 of size 2048
> > [34094.993121] The buggy address is located 600 bytes inside of
> > [34094.993121]  2048-byte region [ffff888126528000, ffff888126528800)
> > [34094.993121] The buggy address belongs to the page:
> > [34094.993121] page:ffffea0004994a00 count:1 mapcount:0
> > mapping:ffff88810000e800 index:0xffff0
> > [34094.993121] flags: 0x17ffffc0008100(slab|head)
> > [34094.993121] raw: 0017ffffc0008100 dead000000000100 dead000000000200
> > ffff88810000e800
> > [34094.993121] raw: ffff88812652c400 00000000800f0009 00000001ffffffff
> > 0000000000000000
> > [34094.993121] page dumped because: kasan: bad access detected
> > [34094.993121]
> > [34094.993121] Memory state around the buggy address:
> > [34094.993121]  ffff888126528100: fb fb fb fb fb fb fb fb fb fb fb fb fb
> > fb fb fb
> > [34094.993121]  ffff888126528180: fb fb fb fb fb fb fb fb fb fb fb fb fb
> > fb fb fb
> > [34094.993121] >ffff888126528200: fb fb fb fb fb fb fb fb fb fb fb fb fb
> > fb fb fb
> > [34094.993121]                                                     ^
> > [34094.993121]  ffff888126528280: fb fb fb fb fb fb fb fb fb fb fb fb fb
> > fb fb fb
> > [34094.993121]  ffff888126528300: fb fb fb fb fb fb fb fb fb fb fb fb fb
> > fb fb fb
> > [34094.993121]
> > ==================================================================
> > 
> > I'll try to figure out the root cause, in the meantime, feel free to
> > kick around if you have any thoughts.
> > 
> > Thanks,
> > Kuai
> > >                                 Honza
> > > 
> 
> Hi,
> 
> I finally figure out the root cause... This is introduced by a temporary
> fix of the problem that offlined bfqg is reinserted into service tree.
> 
> The temporary fix is as follow:
> 
> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
> index 24a5c5329bcd..ee1933cd9a43 100644
> --- a/block/bfq-cgroup.c
> +++ b/block/bfq-cgroup.c
> @@ -935,6 +935,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
> 
>  put_async_queues:
>         bfq_put_async_queues(bfqd, bfqg);
> +       pd->plid = BLKCG_MAX_POLS;
> 
>         spin_unlock_irqrestore(&bfqd->lock, flags);
>         /*
> diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
> index b74cc0da118e..fa2062244805 100644
> --- a/block/bfq-wf2q.c
> +++ b/block/bfq-wf2q.c
> @@ -1692,6 +1692,15 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct
> bfq_queue *bfqq,
>   */
>  void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
>  {
> +#ifdef CONFIG_BFQ_GROUP_IOSCHED
> +       /* If parent group is offlined, move the bfqq to root group */
> +       if (bfqq->entity.parent) {
> +               struct bfq_group *bfqg = bfq_bfqq_to_bfqg(bfqq);
> +
> +               if (bfqg->pd.plid >= BLKCG_MAX_POLS)
> +                       bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
> +       }
> +#endif
>         bfq_log_bfqq(bfqd, bfqq, "add to busy");
> 
> I add bfq_bfqq_move() here before bfq_mark_bfqq_busy(), which cause
> the problem...

OK, thanks for following up on this. So do I understand you correctly that
the problem with empty bfqq being in pos_tree does not exist in current
upstream kernel?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move()
  2022-01-03 15:37             ` Jan Kara
@ 2022-01-07  1:04               ` yukuai (C)
  0 siblings, 0 replies; 18+ messages in thread
From: yukuai (C) @ 2022-01-07  1:04 UTC (permalink / raw)
  To: Jan Kara
  Cc: tj, axboe, paolo.valente, fchecconi, avanzini.arianna, cgroups,
	linux-block, linux-kernel, yi.zhang

在 2022/01/03 23:37, Jan Kara 写道:
> OK, thanks for following up on this. So do I understand you correctly that
> the problem with empty bfqq being in pos_tree does not exist in current
> upstream kernel?

Yes, I had this patch removed in v2 patchset.

Thanks,
Kuai

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2022-01-07  1:04 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-21  3:21 [PATCH 0/4] block, bfq: minor cleanup and fix Yu Kuai
2021-12-21  3:21 ` [PATCH 1/4] block, bfq: cleanup bfq_bfqq_to_bfqg() Yu Kuai
2021-12-21 10:15   ` Jan Kara
2021-12-21  3:21 ` [PATCH 2/4] block, bfq: avoid moving bfqq to it's parent bfqg Yu Kuai
2021-12-21 10:16   ` Jan Kara
2021-12-21 11:08     ` yukuai (C)
2021-12-21 11:52       ` Jan Kara
2021-12-21  3:21 ` [PATCH 3/4] block, bfq: don't move oom_bfqq Yu Kuai
2021-12-21 10:18   ` Jan Kara
2021-12-21  3:21 ` [PATCH 4/4] block, bfq: update pos_root for idle bfq_queue in bfq_bfqq_move() Yu Kuai
2021-12-21 11:50   ` Jan Kara
2021-12-22  3:12     ` yukuai (C)
2021-12-22 14:17       ` Jan Kara
2021-12-23  1:03         ` yukuai (C)
2021-12-25  1:19         ` yukuai (C)
2021-12-25  7:44           ` yukuai (C)
2022-01-03 15:37             ` Jan Kara
2022-01-07  1:04               ` yukuai (C)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).