linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] bfq: Fix use-after-free with cgroups
@ 2021-12-01 13:34 Jan Kara
  2021-12-01 16:47 ` kernel test robot
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Jan Kara @ 2021-12-01 13:34 UTC (permalink / raw)
  To: Paolo Valente
  Cc: Jens Axboe, linux-block, Michal Koutný,
	fvogt, Tejun Heo, cgroups, Jan Kara, stable, Fabian Vogt

BFQ started crashing with 5.15-based kernels like:

BUG: KASAN: use-after-free in rb_erase (lib/rbtree.c:262 lib/rbtr
Read of size 8 at addr ffff888008193098 by task bash/1472

CPU: 0 PID: 1472 Comm: bash Tainted: G            E     5.15.2-0.
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1
Call Trace:
rb_erase (lib/rbtree.c:262 lib/rbtree.c:445)
bfq_idle_extract (block/bfq-wf2q.c:356)
bfq_put_idle_entity (block/bfq-wf2q.c:660)
bfq_bfqq_served (block/bfq-wf2q.c:833)
bfq_dispatch_request (block/bfq-iosched.c:4870 block/bfq-iosched.
__blk_mq_do_dispatch_sched (block/blk-mq-sched.c:150)
__blk_mq_sched_dispatch_requests (block/blk-mq-sched.c:215 block/
blk_mq_sched_dispatch_requests (block/blk-mq-sched.c:360)
blk_mq_sched_insert_requests (include/linux/percpu-refcount.h:174
blk_mq_flush_plug_list (include/linux/list.h:282 block/blk-mq.c:1
blk_flush_plug_list (block/blk-core.c:1722)
blk_finish_plug (block/blk-core.c:1745 block/blk-core.c:1739)
read_pages (include/linux/list.h:282 mm/readahead.c:152)
page_cache_ra_unbounded (mm/readahead.c:212 (discriminator 2))
filemap_fault (mm/filemap.c:2982 mm/filemap.c:3074)
__do_fault (mm/memory.c:3858)
__handle_mm_fault (mm/memory.c:4182 mm/memory.c:4310 mm/memory.c:
handle_mm_fault (mm/memory.c:4801)

After some analysis we've found out that the culprit of the problem is
that some task is reparented from cgroup G to the root cgroup and G is
offlined. But a bfq_queue in task's IO context still points to G as its
parent and thus when task submits more IO, G is inserted into service
trees. Once the task exits and bfq_queue is destroyed, the last
reference to G is dropped as well and G is freed but it is still linked
from service trees causing use-after-free issues sometime later.

Fix the problem by tracking all bfq_queues that point to a particular
cgroup as their parent and reparent them when the cgroup is going
offline.

CC: stable@vger.kernel.org
Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Tested-by: Fabian Vogt <fvogt@suse.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 block/bfq-cgroup.c  | 100 ++++++--------------------------------------
 block/bfq-iosched.c |  54 ++++++++++++------------
 block/bfq-iosched.h |   6 +++
 3 files changed, 47 insertions(+), 113 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..519e6291e98e 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -666,6 +666,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	else if (entity->on_st_or_in_serv)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	hlist_del(&bfqq->children_node);
 	bfqg_and_blkg_put(bfqq_group(bfqq));
 
 	if (entity->parent &&
@@ -678,6 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	entity->sched_data = &bfqg->sched_data;
 	/* pin down bfqg and its associated blkg  */
 	bfqg_and_blkg_get(bfqg);
+	hlist_add_head(&bfqq->children_node, &bfqg->children);
 
 	if (bfq_bfqq_busy(bfqq)) {
 		if (unlikely(!bfqd->nonrot_with_queueing))
@@ -810,68 +812,13 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	rcu_read_unlock();
 }
 
-/**
- * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
- * @st: the service tree being flushed.
- */
-static void bfq_flush_idle_tree(struct bfq_service_tree *st)
-{
-	struct bfq_entity *entity = st->first_idle;
-
-	for (; entity ; entity = st->first_idle)
-		__bfq_deactivate_entity(entity, false);
-}
-
-/**
- * bfq_reparent_leaf_entity - move leaf entity to the root_group.
- * @bfqd: the device data structure with the root group.
- * @entity: the entity to move, if entity is a leaf; or the parent entity
- *	    of an active leaf entity to move, if entity is not a leaf.
- */
-static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
-				     struct bfq_entity *entity,
-				     int ioprio_class)
+static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
 {
 	struct bfq_queue *bfqq;
-	struct bfq_entity *child_entity = entity;
-
-	while (child_entity->my_sched_data) { /* leaf not reached yet */
-		struct bfq_sched_data *child_sd = child_entity->my_sched_data;
-		struct bfq_service_tree *child_st = child_sd->service_tree +
-			ioprio_class;
-		struct rb_root *child_active = &child_st->active;
-
-		child_entity = bfq_entity_of(rb_first(child_active));
-
-		if (!child_entity)
-			child_entity = child_sd->in_service_entity;
-	}
-
-	bfqq = bfq_entity_to_bfqq(child_entity);
-	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
-}
-
-/**
- * bfq_reparent_active_queues - move to the root group all active queues.
- * @bfqd: the device data structure with the root group.
- * @bfqg: the group to move from.
- * @st: the service tree to start the search from.
- */
-static void bfq_reparent_active_queues(struct bfq_data *bfqd,
-				       struct bfq_group *bfqg,
-				       struct bfq_service_tree *st,
-				       int ioprio_class)
-{
-	struct rb_root *active = &st->active;
-	struct bfq_entity *entity;
-
-	while ((entity = bfq_entity_of(rb_first(active))))
-		bfq_reparent_leaf_entity(bfqd, entity, ioprio_class);
+	struct hlist_node *next;
 
-	if (bfqg->sched_data.in_service_entity)
-		bfq_reparent_leaf_entity(bfqd,
-					 bfqg->sched_data.in_service_entity,
-					 ioprio_class);
+	hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
+		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
 }
 
 /**
@@ -897,38 +844,17 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 		goto put_async_queues;
 
 	/*
-	 * Empty all service_trees belonging to this group before
-	 * deactivating the group itself.
+	 * Reparent all bfqqs under this bfq group. This will also empty all
+	 * service_trees belonging to this group before deactivating the group
+	 * itself.
 	 */
+	bfq_reparent_children(bfqd, bfqg);
+
 	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
 		st = bfqg->sched_data.service_tree + i;
 
-		/*
-		 * It may happen that some queues are still active
-		 * (busy) upon group destruction (if the corresponding
-		 * processes have been forced to terminate). We move
-		 * all the leaf entities corresponding to these queues
-		 * to the root_group.
-		 * Also, it may happen that the group has an entity
-		 * in service, which is disconnected from the active
-		 * tree: it must be moved, too.
-		 * There is no need to put the sync queues, as the
-		 * scheduler has taken no reference.
-		 */
-		bfq_reparent_active_queues(bfqd, bfqg, st, i);
-
-		/*
-		 * The idle tree may still contain bfq_queues
-		 * belonging to exited task because they never
-		 * migrated to a different cgroup from the one being
-		 * destroyed now. In addition, even
-		 * bfq_reparent_active_queues() may happen to add some
-		 * entities to the idle tree. It happens if, in some
-		 * of the calls to bfq_bfqq_move() performed by
-		 * bfq_reparent_active_queues(), the queue to move is
-		 * empty and gets expired.
-		 */
-		bfq_flush_idle_tree(st);
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&st->active));
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&st->idle));
 	}
 
 	__bfq_deactivate_entity(entity, false);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fec18118dc30..8f33f6b91d05 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5163,6 +5163,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
 		bfqq->bfqd->last_completed_rq_bfqq = NULL;
 
+	hlist_del(&bfqq->children_node);
 	kmem_cache_free(bfq_pool, bfqq);
 	bfqg_and_blkg_put(bfqg);
 }
@@ -5337,8 +5338,9 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 		bfq_set_next_ioprio_data(bfqq, bic);
 }
 
-static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_group *bfqg,
+			  struct bfq_queue *bfqq, struct bfq_io_cq *bic,
+			  pid_t pid, int is_sync)
 {
 	u64 now_ns = ktime_get_ns();
 
@@ -5347,6 +5349,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
 	INIT_HLIST_NODE(&bfqq->woken_list_node);
 	INIT_HLIST_HEAD(&bfqq->woken_list);
+	hlist_add_head(&bfqq->children_node, &bfqg->children);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -5600,8 +5603,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				     bfqd->queue->node);
 
 	if (bfqq) {
-		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
-			      is_sync);
+		bfq_init_bfqq(bfqd, bfqg, bfqq, bic, current->pid, is_sync);
 		bfq_init_entity(&bfqq->entity, bfqg);
 		bfq_log_bfqq(bfqd, bfqq, "allocated");
 	} else {
@@ -6908,6 +6910,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
 
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
+	hlist_del(&bfqd->oom_bfqq.children_node);
 	/* release oom-queue reference to root group */
 	bfqg_and_blkg_put(bfqd->root_group);
 
@@ -6959,28 +6962,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	q->elevator = eq;
 	spin_unlock_irq(&q->queue_lock);
 
-	/*
-	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
-	 * Grab a permanent reference to it, so that the normal code flow
-	 * will not attempt to free it.
-	 */
-	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
-	bfqd->oom_bfqq.ref++;
-	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
-	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
-	bfqd->oom_bfqq.entity.new_weight =
-		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
-
-	/* oom_bfqq does not participate to bursts */
-	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
-
-	/*
-	 * Trigger weight initialization, according to ioprio, at the
-	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
-	 * class won't be changed any more.
-	 */
-	bfqd->oom_bfqq.entity.prio_changed = 1;
-
 	bfqd->queue = q;
 
 	INIT_LIST_HEAD(&bfqd->dispatch);
@@ -7059,6 +7040,27 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		goto out_free;
 	bfq_init_root_group(bfqd->root_group, bfqd);
 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+	/*
+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+	 * Grab a permanent reference to it, so that the normal code flow
+	 * will not attempt to free it.
+	 */
+	bfq_init_bfqq(bfqd, bfqd->root_group, &bfqd->oom_bfqq, NULL, 1, 0);
+	bfqd->oom_bfqq.ref++;
+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+	bfqd->oom_bfqq.entity.new_weight =
+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+	/* oom_bfqq does not participate to bursts */
+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
+	/*
+	 * Trigger weight initialization, according to ioprio, at the
+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+	 * class won't be changed any more.
+	 */
+	bfqd->oom_bfqq.entity.prio_changed = 1;
 
 	wbt_disable_default(q);
 	return 0;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index a73488eec8a4..a1984959d6be 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -292,6 +292,9 @@ struct bfq_queue {
 
 	/* node for active/idle bfqq list inside parent bfqd */
 	struct list_head bfqq_list;
+	/* Member of parent's bfqg children list */
+	struct hlist_node children_node;
+
 
 	/* associated @bfq_ttime struct */
 	struct bfq_ttime ttime;
@@ -929,6 +932,9 @@ struct bfq_group {
 	struct bfq_entity entity;
 	struct bfq_sched_data sched_data;
 
+	/* bfq_queues under this entity */
+	struct hlist_head children;
+
 	void *bfqd;
 
 	struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] bfq: Fix use-after-free with cgroups
  2021-12-01 13:34 [PATCH] bfq: Fix use-after-free with cgroups Jan Kara
@ 2021-12-01 16:47 ` kernel test robot
  2021-12-01 21:22 ` kernel test robot
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2021-12-01 16:47 UTC (permalink / raw)
  To: Jan Kara, Paolo Valente
  Cc: kbuild-all, Jens Axboe, linux-block, Michal Koutný,
	fvogt, Tejun Heo, cgroups, Jan Kara, stable, Fabian Vogt

Hi Jan,

I love your patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on v5.16-rc3 next-20211201]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jan-Kara/bfq-Fix-use-after-free-with-cgroups/20211201-213549
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: um-i386_defconfig (https://download.01.org/0day-ci/archive/20211202/202112020023.Klrg0J1F-lkp@intel.com/config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/2154a2da8d69308aca6bb431da2d4d9e3e687daa
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Jan-Kara/bfq-Fix-use-after-free-with-cgroups/20211201-213549
        git checkout 2154a2da8d69308aca6bb431da2d4d9e3e687daa
        # save the config file to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash block/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   block/bfq-iosched.c: In function 'bfq_init_bfqq':
>> block/bfq-iosched.c:5472:44: error: 'struct bfq_group' has no member named 'children'
    5472 |  hlist_add_head(&bfqq->children_node, &bfqg->children);
         |                                            ^~


vim +5472 block/bfq-iosched.c

  5460	
  5461	static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_group *bfqg,
  5462				  struct bfq_queue *bfqq, struct bfq_io_cq *bic,
  5463				  pid_t pid, int is_sync)
  5464	{
  5465		u64 now_ns = ktime_get_ns();
  5466	
  5467		RB_CLEAR_NODE(&bfqq->entity.rb_node);
  5468		INIT_LIST_HEAD(&bfqq->fifo);
  5469		INIT_HLIST_NODE(&bfqq->burst_list_node);
  5470		INIT_HLIST_NODE(&bfqq->woken_list_node);
  5471		INIT_HLIST_HEAD(&bfqq->woken_list);
> 5472		hlist_add_head(&bfqq->children_node, &bfqg->children);
  5473	
  5474		bfqq->ref = 0;
  5475		bfqq->bfqd = bfqd;
  5476	
  5477		if (bic)
  5478			bfq_set_next_ioprio_data(bfqq, bic);
  5479	
  5480		if (is_sync) {
  5481			/*
  5482			 * No need to mark as has_short_ttime if in
  5483			 * idle_class, because no device idling is performed
  5484			 * for queues in idle class
  5485			 */
  5486			if (!bfq_class_idle(bfqq))
  5487				/* tentatively mark as has_short_ttime */
  5488				bfq_mark_bfqq_has_short_ttime(bfqq);
  5489			bfq_mark_bfqq_sync(bfqq);
  5490			bfq_mark_bfqq_just_created(bfqq);
  5491		} else
  5492			bfq_clear_bfqq_sync(bfqq);
  5493	
  5494		/* set end request to minus infinity from now */
  5495		bfqq->ttime.last_end_request = now_ns + 1;
  5496	
  5497		bfqq->creation_time = jiffies;
  5498	
  5499		bfqq->io_start_time = now_ns;
  5500	
  5501		bfq_mark_bfqq_IO_bound(bfqq);
  5502	
  5503		bfqq->pid = pid;
  5504	
  5505		/* Tentative initial value to trade off between thr and lat */
  5506		bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
  5507		bfqq->budget_timeout = bfq_smallest_from_now();
  5508	
  5509		bfqq->wr_coeff = 1;
  5510		bfqq->last_wr_start_finish = jiffies;
  5511		bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
  5512		bfqq->split_time = bfq_smallest_from_now();
  5513	
  5514		/*
  5515		 * To not forget the possibly high bandwidth consumed by a
  5516		 * process/queue in the recent past,
  5517		 * bfq_bfqq_softrt_next_start() returns a value at least equal
  5518		 * to the current value of bfqq->soft_rt_next_start (see
  5519		 * comments on bfq_bfqq_softrt_next_start).  Set
  5520		 * soft_rt_next_start to now, to mean that bfqq has consumed
  5521		 * no bandwidth so far.
  5522		 */
  5523		bfqq->soft_rt_next_start = jiffies;
  5524	
  5525		/* first request is almost certainly seeky */
  5526		bfqq->seek_history = 1;
  5527	}
  5528	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bfq: Fix use-after-free with cgroups
  2021-12-01 13:34 [PATCH] bfq: Fix use-after-free with cgroups Jan Kara
  2021-12-01 16:47 ` kernel test robot
@ 2021-12-01 21:22 ` kernel test robot
  2021-12-07 14:53 ` Holger Hoffstätte
  2021-12-07 19:08 ` Michal Koutný
  3 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2021-12-01 21:22 UTC (permalink / raw)
  To: Jan Kara, Paolo Valente
  Cc: llvm, kbuild-all, Jens Axboe, linux-block, Michal Koutný,
	fvogt, Tejun Heo, cgroups, Jan Kara, stable, Fabian Vogt

Hi Jan,

I love your patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on v5.16-rc3 next-20211201]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jan-Kara/bfq-Fix-use-after-free-with-cgroups/20211201-213549
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: hexagon-randconfig-r045-20211128 (https://download.01.org/0day-ci/archive/20211202/202112020559.l11FLFdN-lkp@intel.com/config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 4b553297ef3ee4dc2119d5429adf3072e90fac38)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/2154a2da8d69308aca6bb431da2d4d9e3e687daa
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Jan-Kara/bfq-Fix-use-after-free-with-cgroups/20211201-213549
        git checkout 2154a2da8d69308aca6bb431da2d4d9e3e687daa
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> block/bfq-iosched.c:5472:46: error: no member named 'children' in 'struct bfq_group'
           hlist_add_head(&bfqq->children_node, &bfqg->children);
                                                 ~~~~  ^
   1 error generated.


vim +5472 block/bfq-iosched.c

  5460	
  5461	static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_group *bfqg,
  5462				  struct bfq_queue *bfqq, struct bfq_io_cq *bic,
  5463				  pid_t pid, int is_sync)
  5464	{
  5465		u64 now_ns = ktime_get_ns();
  5466	
  5467		RB_CLEAR_NODE(&bfqq->entity.rb_node);
  5468		INIT_LIST_HEAD(&bfqq->fifo);
  5469		INIT_HLIST_NODE(&bfqq->burst_list_node);
  5470		INIT_HLIST_NODE(&bfqq->woken_list_node);
  5471		INIT_HLIST_HEAD(&bfqq->woken_list);
> 5472		hlist_add_head(&bfqq->children_node, &bfqg->children);
  5473	
  5474		bfqq->ref = 0;
  5475		bfqq->bfqd = bfqd;
  5476	
  5477		if (bic)
  5478			bfq_set_next_ioprio_data(bfqq, bic);
  5479	
  5480		if (is_sync) {
  5481			/*
  5482			 * No need to mark as has_short_ttime if in
  5483			 * idle_class, because no device idling is performed
  5484			 * for queues in idle class
  5485			 */
  5486			if (!bfq_class_idle(bfqq))
  5487				/* tentatively mark as has_short_ttime */
  5488				bfq_mark_bfqq_has_short_ttime(bfqq);
  5489			bfq_mark_bfqq_sync(bfqq);
  5490			bfq_mark_bfqq_just_created(bfqq);
  5491		} else
  5492			bfq_clear_bfqq_sync(bfqq);
  5493	
  5494		/* set end request to minus infinity from now */
  5495		bfqq->ttime.last_end_request = now_ns + 1;
  5496	
  5497		bfqq->creation_time = jiffies;
  5498	
  5499		bfqq->io_start_time = now_ns;
  5500	
  5501		bfq_mark_bfqq_IO_bound(bfqq);
  5502	
  5503		bfqq->pid = pid;
  5504	
  5505		/* Tentative initial value to trade off between thr and lat */
  5506		bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
  5507		bfqq->budget_timeout = bfq_smallest_from_now();
  5508	
  5509		bfqq->wr_coeff = 1;
  5510		bfqq->last_wr_start_finish = jiffies;
  5511		bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
  5512		bfqq->split_time = bfq_smallest_from_now();
  5513	
  5514		/*
  5515		 * To not forget the possibly high bandwidth consumed by a
  5516		 * process/queue in the recent past,
  5517		 * bfq_bfqq_softrt_next_start() returns a value at least equal
  5518		 * to the current value of bfqq->soft_rt_next_start (see
  5519		 * comments on bfq_bfqq_softrt_next_start).  Set
  5520		 * soft_rt_next_start to now, to mean that bfqq has consumed
  5521		 * no bandwidth so far.
  5522		 */
  5523		bfqq->soft_rt_next_start = jiffies;
  5524	
  5525		/* first request is almost certainly seeky */
  5526		bfqq->seek_history = 1;
  5527	}
  5528	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bfq: Fix use-after-free with cgroups
  2021-12-01 13:34 [PATCH] bfq: Fix use-after-free with cgroups Jan Kara
  2021-12-01 16:47 ` kernel test robot
  2021-12-01 21:22 ` kernel test robot
@ 2021-12-07 14:53 ` Holger Hoffstätte
  2021-12-13 13:46   ` Jan Kara
  2021-12-07 19:08 ` Michal Koutný
  3 siblings, 1 reply; 7+ messages in thread
From: Holger Hoffstätte @ 2021-12-07 14:53 UTC (permalink / raw)
  To: Jan Kara, Paolo Valente
  Cc: Jens Axboe, linux-block, Michal Koutný,
	fvogt, Tejun Heo, cgroups, stable, Fabian Vogt


On 2021-12-01 14:34, Jan Kara wrote:
> BFQ started crashing with 5.15-based kernels like:
> 
> BUG: KASAN: use-after-free in rb_erase (lib/rbtree.c:262 lib/rbtr
> Read of size 8 at addr ffff888008193098 by task bash/1472
[snip]

This does not compile when CONFIG_BFQ_GROUP_IOSCHED is disabled.
I know the patch is meant for the case where it is enabled, but still..

block/bfq-iosched.c: In function 'bfq_init_bfqq':
block/bfq-iosched.c:5362:51: error: 'struct bfq_group' has no member named 'children'
  5362 |         hlist_add_head(&bfqq->children_node, &bfqg->children);
       |                                                   ^~
make[1]: *** [scripts/Makefile.build:277: block/bfq-iosched.o] Error 1

Probably just needs a few more ifdefs :)

cheers
Holger

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bfq: Fix use-after-free with cgroups
  2021-12-01 13:34 [PATCH] bfq: Fix use-after-free with cgroups Jan Kara
                   ` (2 preceding siblings ...)
  2021-12-07 14:53 ` Holger Hoffstätte
@ 2021-12-07 19:08 ` Michal Koutný
  2021-12-13 14:52   ` Jan Kara
  3 siblings, 1 reply; 7+ messages in thread
From: Michal Koutný @ 2021-12-07 19:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Paolo Valente, Jens Axboe, linux-block, fvogt, Tejun Heo,
	cgroups, stable, Fabian Vogt

On Wed, Dec 01, 2021 at 02:34:39PM +0100, Jan Kara <jack@suse.cz> wrote:
> After some analysis we've found out that the culprit of the problem is
> that some task is reparented from cgroup G to the root cgroup and G is
> offlined.

Just sharing my interpretation for context -- (I saw this was a system
using the unified cgroup hierarchy, io_cgrp_subsys_on_dfl_key was
enabled) and what was observed could also have been disabling the io
controller on given level -- that would also manifest similarly -- the
task is migrated to parent and the former blkcg is offlined.


> +static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
> [...]
> -	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
> [...]
> +	hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
> +		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

Here I assume root_group is (representing) the global blkcg root and
this reparenting thus skips all ancestors between the removed leaf and
the root. IIUC the associated io_context would then be treated as if it
was running in the root blkcg.
(Admittedly, this isn't a change from this patch but it may cause some
surprises if the given process runs after the operation.)

Reparenting to the immediate ancestors should be safe as cgroup core
should ensure children are offlined before parents. Would it make sense
to you?


> @@ -897,38 +844,17 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
> [...]
> -		 * It may happen that some queues are still active
> -		 * (busy) upon group destruction (if the corresponding
> -		 * processes have been forced to terminate). We move
> -		 * all the leaf entities corresponding to these queues
> -		 * to the root_group.

This comment is removed but it seems to me it assumed that the
reparented entities are only some transitional remnants of terminated
tasks but they may be the processes migrated upwards with a long (IO
active) life ahead.



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bfq: Fix use-after-free with cgroups
  2021-12-07 14:53 ` Holger Hoffstätte
@ 2021-12-13 13:46   ` Jan Kara
  0 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2021-12-13 13:46 UTC (permalink / raw)
  To: Holger Hoffstätte
  Cc: Jan Kara, Paolo Valente, Jens Axboe, linux-block,
	Michal Koutný,
	fvogt, Tejun Heo, cgroups, stable, Fabian Vogt

On Tue 07-12-21 15:53:54, Holger Hoffstätte wrote:
> 
> On 2021-12-01 14:34, Jan Kara wrote:
> > BFQ started crashing with 5.15-based kernels like:
> > 
> > BUG: KASAN: use-after-free in rb_erase (lib/rbtree.c:262 lib/rbtr
> > Read of size 8 at addr ffff888008193098 by task bash/1472
> [snip]
> 
> This does not compile when CONFIG_BFQ_GROUP_IOSCHED is disabled.
> I know the patch is meant for the case where it is enabled, but still..
> 
> block/bfq-iosched.c: In function 'bfq_init_bfqq':
> block/bfq-iosched.c:5362:51: error: 'struct bfq_group' has no member named 'children'
>  5362 |         hlist_add_head(&bfqq->children_node, &bfqg->children);
>       |                                                   ^~
> make[1]: *** [scripts/Makefile.build:277: block/bfq-iosched.o] Error 1
> 
> Probably just needs a few more ifdefs :)

Yep, already fixed that up locally. Thanks for the notice.

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bfq: Fix use-after-free with cgroups
  2021-12-07 19:08 ` Michal Koutný
@ 2021-12-13 14:52   ` Jan Kara
  0 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2021-12-13 14:52 UTC (permalink / raw)
  To: Michal Koutný
  Cc: Jan Kara, Paolo Valente, Jens Axboe, linux-block, fvogt,
	Tejun Heo, cgroups, stable, Fabian Vogt

On Tue 07-12-21 20:08:43, Michal Koutný wrote:
> On Wed, Dec 01, 2021 at 02:34:39PM +0100, Jan Kara <jack@suse.cz> wrote:
> > After some analysis we've found out that the culprit of the problem is
> > that some task is reparented from cgroup G to the root cgroup and G is
> > offlined.
> 
> Just sharing my interpretation for context -- (I saw this was a system
> using the unified cgroup hierarchy, io_cgrp_subsys_on_dfl_key was
> enabled) and what was observed could also have been disabling the io
> controller on given level -- that would also manifest similarly -- the
> task is migrated to parent and the former blkcg is offlined.

Yes, that's another possibility.

> > +static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
> > [...]
> > -	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
> > [...]
> > +	hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
> > +		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
> 
> Here I assume root_group is (representing) the global blkcg root and
> this reparenting thus skips all ancestors between the removed leaf and
> the root. IIUC the associated io_context would then be treated as if it
> was running in the root blkcg.
> (Admittedly, this isn't a change from this patch but it may cause some
> surprises if the given process runs after the operation.)

Yes, this is what happens in bfq_reparent_children() and basically
preserves what BFQ was already doing for a subset of bfq queues.

> Reparenting to the immediate ancestors should be safe as cgroup core
> should ensure children are offlined before parents. Would it make sense
> to you?

I suppose yes, it makes more sense to reparent just to immediate parents
instead of the root of the blkcg hierarchy. Initially when developing the
patch I was not sure whether parent has to be still alive but as you write
it should be safe. I'll modify the patch to:

static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
        struct bfq_queue *bfqq;
        struct hlist_node *next;
        struct bfq_group *parent;

        parent = bfqg_parent(bfqg);
        if (!parent)
                parent = bfqd->root_group;

        hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
                bfq_bfqq_move(bfqd, bfqq, parent);
}

 
> > @@ -897,38 +844,17 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
> > [...]
> > -		 * It may happen that some queues are still active
> > -		 * (busy) upon group destruction (if the corresponding
> > -		 * processes have been forced to terminate). We move
> > -		 * all the leaf entities corresponding to these queues
> > -		 * to the root_group.
> 
> This comment is removed but it seems to me it assumed that the
> reparented entities are only some transitional remnants of terminated
> tasks but they may be the processes migrated upwards with a long (IO
> active) life ahead.

Yes, this seemed to be a misconception of the old code...

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-12-13 14:52 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-01 13:34 [PATCH] bfq: Fix use-after-free with cgroups Jan Kara
2021-12-01 16:47 ` kernel test robot
2021-12-01 21:22 ` kernel test robot
2021-12-07 14:53 ` Holger Hoffstätte
2021-12-13 13:46   ` Jan Kara
2021-12-07 19:08 ` Michal Koutný
2021-12-13 14:52   ` Jan Kara

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).