linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH -next v2 0/9] iocost bugfix
@ 2022-11-30 13:21 Li Nan
  2022-11-30 13:21 ` [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write() Li Nan
                   ` (9 more replies)
  0 siblings, 10 replies; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

Li Nan (4):
  blk-iocost: fix divide by 0 error in calc_lcoefs()
  blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in
    ioc_refresh_params()
  blk-iocost: fix UAF in ioc_pd_free
  block: fix null-pointer dereference in ioc_pd_init

Yu Kuai (5):
  blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
  blk-iocost: improve hanlder of match_u64()
  blk-iocost: don't allow to configure bio based device
  blk-iocost: read params inside lock in sysfs apis
  blk-iocost: fix walk_list corruption

 block/blk-iocost.c | 117 ++++++++++++++++++++++++++++-----------------
 block/genhd.c      |   2 +-
 2 files changed, 75 insertions(+), 44 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 15:54   ` Christoph Hellwig
  2022-11-30 20:31   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64() Li Nan
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

From: Yu Kuai <yukuai3@huawei.com>

There are no functional changes, just to make the code a litter cleaner
and follow up patches easier.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 62 +++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 37 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index f01359906c83..fd495e823db2 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3185,7 +3185,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 	if (!ioc) {
 		ret = blk_iocost_init(disk);
 		if (ret)
-			goto err;
+			goto out;
 		ioc = q_to_ioc(disk->queue);
 	}
 
@@ -3197,6 +3197,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 	enable = ioc->enabled;
 	user = ioc->user_qos_params;
 
+	ret = -EINVAL;
 	while ((p = strsep(&input, " \t\n"))) {
 		substring_t args[MAX_OPT_ARGS];
 		char buf[32];
@@ -3218,7 +3219,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 			else if (!strcmp(buf, "user"))
 				user = true;
 			else
-				goto einval;
+				goto out_unlock;
 			continue;
 		}
 
@@ -3228,39 +3229,39 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 		case QOS_WPPM:
 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
 			    sizeof(buf))
-				goto einval;
+				goto out_unlock;
 			if (cgroup_parse_float(buf, 2, &v))
-				goto einval;
+				goto out_unlock;
 			if (v < 0 || v > 10000)
-				goto einval;
+				goto out_unlock;
 			qos[tok] = v * 100;
 			break;
 		case QOS_RLAT:
 		case QOS_WLAT:
 			if (match_u64(&args[0], &v))
-				goto einval;
+				goto out_unlock;
 			qos[tok] = v;
 			break;
 		case QOS_MIN:
 		case QOS_MAX:
 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
 			    sizeof(buf))
-				goto einval;
+				goto out_unlock;
 			if (cgroup_parse_float(buf, 2, &v))
-				goto einval;
+				goto out_unlock;
 			if (v < 0)
-				goto einval;
+				goto out_unlock;
 			qos[tok] = clamp_t(s64, v * 100,
 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
 			break;
 		default:
-			goto einval;
+			goto out_unlock;
 		}
 		user = true;
 	}
 
 	if (qos[QOS_MIN] > qos[QOS_MAX])
-		goto einval;
+		goto out_unlock;
 
 	if (enable) {
 		blk_stat_enable_accounting(disk->queue);
@@ -3281,21 +3282,14 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 	}
 
 	ioc_refresh_params(ioc, true);
-	spin_unlock_irq(&ioc->lock);
+	ret = nbytes;
 
-	blk_mq_unquiesce_queue(disk->queue);
-	blk_mq_unfreeze_queue(disk->queue);
-
-	blkdev_put_no_open(bdev);
-	return nbytes;
-einval:
+out_unlock:
 	spin_unlock_irq(&ioc->lock);
-
 	blk_mq_unquiesce_queue(disk->queue);
 	blk_mq_unfreeze_queue(disk->queue);
 
-	ret = -EINVAL;
-err:
+out:
 	blkdev_put_no_open(bdev);
 	return ret;
 }
@@ -3364,7 +3358,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	if (!ioc) {
 		ret = blk_iocost_init(bdev->bd_disk);
 		if (ret)
-			goto err;
+			goto out;
 		ioc = q_to_ioc(q);
 	}
 
@@ -3375,6 +3369,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
 	user = ioc->user_cost_model;
 
+	ret = -EINVAL;
 	while ((p = strsep(&input, " \t\n"))) {
 		substring_t args[MAX_OPT_ARGS];
 		char buf[32];
@@ -3392,20 +3387,20 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 			else if (!strcmp(buf, "user"))
 				user = true;
 			else
-				goto einval;
+				goto out_unlock;
 			continue;
 		case COST_MODEL:
 			match_strlcpy(buf, &args[0], sizeof(buf));
 			if (strcmp(buf, "linear"))
-				goto einval;
+				goto out_unlock;
 			continue;
 		}
 
 		tok = match_token(p, i_lcoef_tokens, args);
 		if (tok == NR_I_LCOEFS)
-			goto einval;
+			goto out_unlock;
 		if (match_u64(&args[0], &v))
-			goto einval;
+			goto out_unlock;
 		u[tok] = v;
 		user = true;
 	}
@@ -3416,23 +3411,16 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	} else {
 		ioc->user_cost_model = false;
 	}
-	ioc_refresh_params(ioc, true);
-	spin_unlock_irq(&ioc->lock);
 
-	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q);
-
-	blkdev_put_no_open(bdev);
-	return nbytes;
+	ioc_refresh_params(ioc, true);
+	ret = nbytes;
 
-einval:
+out_unlock:
 	spin_unlock_irq(&ioc->lock);
-
 	blk_mq_unquiesce_queue(q);
 	blk_mq_unfreeze_queue(q);
 
-	ret = -EINVAL;
-err:
+out:
 	blkdev_put_no_open(bdev);
 	return ret;
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64()
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
  2022-11-30 13:21 ` [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write() Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:32   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 3/9] blk-iocost: don't allow to configure bio based device Li Nan
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

From: Yu Kuai <yukuai3@huawei.com>

1) There are one place that return value of match_u64() is not checked.
2) If match_u64() failed, return value is set to -EINVAL despite that
   there are other possible errnos.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index fd495e823db2..c532129a1456 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3202,6 +3202,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 		substring_t args[MAX_OPT_ARGS];
 		char buf[32];
 		int tok;
+		int err;
 		s64 v;
 
 		if (!*p)
@@ -3209,7 +3210,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 
 		switch (match_token(p, qos_ctrl_tokens, args)) {
 		case QOS_ENABLE:
-			match_u64(&args[0], &v);
+			err = match_u64(&args[0], &v);
+			if (err) {
+				ret = err;
+				goto out_unlock;
+			}
+
 			enable = v;
 			continue;
 		case QOS_CTRL:
@@ -3238,8 +3244,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 			break;
 		case QOS_RLAT:
 		case QOS_WLAT:
-			if (match_u64(&args[0], &v))
+			err = match_u64(&args[0], &v);
+			if (err) {
+				ret = err;
 				goto out_unlock;
+			}
+
 			qos[tok] = v;
 			break;
 		case QOS_MIN:
@@ -3374,6 +3384,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		substring_t args[MAX_OPT_ARGS];
 		char buf[32];
 		int tok;
+		int err;
 		u64 v;
 
 		if (!*p)
@@ -3399,8 +3410,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		tok = match_token(p, i_lcoef_tokens, args);
 		if (tok == NR_I_LCOEFS)
 			goto out_unlock;
-		if (match_u64(&args[0], &v))
+
+		err = match_u64(&args[0], &v);
+		if (err) {
+			ret = err;
 			goto out_unlock;
+		}
+
 		u[tok] = v;
 		user = true;
 	}
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 3/9] blk-iocost: don't allow to configure bio based device
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
  2022-11-30 13:21 ` [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write() Li Nan
  2022-11-30 13:21 ` [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64() Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:15   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 4/9] blk-iocost: read params inside lock in sysfs apis Li Nan
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

From: Yu Kuai <yukuai3@huawei.com>

iocost is based on rq_qos, which can only work for request based device,
thus it doesn't make sense to configure iocost for bio based device.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index c532129a1456..bc6522bb314d 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3181,6 +3181,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 		return PTR_ERR(bdev);
 
 	disk = bdev->bd_disk;
+	if (!queue_is_mq(disk->queue)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
 	ioc = q_to_ioc(disk->queue);
 	if (!ioc) {
 		ret = blk_iocost_init(disk);
@@ -3364,6 +3369,11 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		return PTR_ERR(bdev);
 
 	q = bdev_get_queue(bdev);
+	if (!queue_is_mq(q)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
 	ioc = q_to_ioc(q);
 	if (!ioc) {
 		ret = blk_iocost_init(bdev->bd_disk);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 4/9] blk-iocost: read params inside lock in sysfs apis
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
                   ` (2 preceding siblings ...)
  2022-11-30 13:21 ` [PATCH -next v2 3/9] blk-iocost: don't allow to configure bio based device Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:16   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 5/9] blk-iocost: fix divide by 0 error in calc_lcoefs() Li Nan
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

From: Yu Kuai <yukuai3@huawei.com>

Otherwise, user might get abnormal values if params is updated
concurrently.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index bc6522bb314d..3a96cd557c47 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3125,6 +3125,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
 	if (!dname)
 		return 0;
 
+	spin_lock_irq(&ioc->lock);
 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
 		   ioc->params.qos[QOS_RPPM] / 10000,
@@ -3137,6 +3138,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
 		   ioc->params.qos[QOS_MAX] / 10000,
 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
+	spin_unlock_irq(&ioc->lock);
 	return 0;
 }
 
@@ -3319,12 +3321,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf,
 	if (!dname)
 		return 0;
 
+	spin_lock_irq(&ioc->lock);
 	seq_printf(sf, "%s ctrl=%s model=linear "
 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
 		   dname, ioc->user_cost_model ? "user" : "auto",
 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
+	spin_unlock_irq(&ioc->lock);
 	return 0;
 }
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 5/9] blk-iocost: fix divide by 0 error in calc_lcoefs()
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
                   ` (3 preceding siblings ...)
  2022-11-30 13:21 ` [PATCH -next v2 4/9] blk-iocost: read params inside lock in sysfs apis Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:20   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 6/9] blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in ioc_refresh_params() Li Nan
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

echo max of u64 to cost.model can cause divide by 0 error.

  # echo 8:0 rbps=18446744073709551615 > /sys/fs/cgroup/io.cost.model

  divide error: 0000 [#1] PREEMPT SMP
  RIP: 0010:calc_lcoefs+0x4c/0xc0
  Call Trace:
   <TASK>
   ioc_refresh_params+0x2b3/0x4f0
   ioc_cost_model_write+0x3cb/0x4c0
   ? _copy_from_iter+0x6d/0x6c0
   ? kernfs_fop_write_iter+0xfc/0x270
   cgroup_file_write+0xa0/0x200
   kernfs_fop_write_iter+0x17d/0x270
   vfs_write+0x414/0x620
   ksys_write+0x73/0x160
   __x64_sys_write+0x1e/0x30
   do_syscall_64+0x35/0x80
   entry_SYSCALL_64_after_hwframe+0x63/0xcd

calc_lcoefs() uses the input value of cost.model in DIV_ROUND_UP_ULL,
overflow would happen if bps plus IOC_PAGE_SIZE is greater than
ULLONG_MAX, it can cause divide by 0 error.I_LCOEF_MAX is introduced to
prevent it.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 3a96cd557c47..f4a754b9d10f 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -306,6 +306,9 @@ enum {
 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
 
+	/* avoid overflow */
+	I_LCOEF_MAX		= ULLONG_MAX - IOC_PAGE_SIZE,
+
 	/* if apart further than 16M, consider randio for linear model */
 	LCOEF_RANDIO_PAGES	= 4096,
 };
@@ -3431,6 +3434,8 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 			goto out_unlock;
 		}
 
+		if (v > I_LCOEF_MAX)
+			goto out_unlock;
 		u[tok] = v;
 		user = true;
 	}
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 6/9] blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in ioc_refresh_params()
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
                   ` (4 preceding siblings ...)
  2022-11-30 13:21 ` [PATCH -next v2 5/9] blk-iocost: fix divide by 0 error in calc_lcoefs() Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:22   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 7/9] blk-iocost: fix UAF in ioc_pd_free Li Nan
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

vrate_min is calculated by DIV64_U64_ROUND_UP, but vrate_max is calculated
by div64_u64. Vrate_min may be 1 greater than vrate_max if the input
values min and max of cost.qos are equal.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index f4a754b9d10f..2316ba93e7d6 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -926,7 +926,7 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
 
 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 					    VTIME_PER_USEC, MILLION);
-	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
+	ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
 				   VTIME_PER_USEC, MILLION);
 
 	return true;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 7/9] blk-iocost: fix UAF in ioc_pd_free
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
                   ` (5 preceding siblings ...)
  2022-11-30 13:21 ` [PATCH -next v2 6/9] blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in ioc_refresh_params() Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:42   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init Li Nan
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

Our test found the following problem in kernel 5.10, and the same problem
should exist in mainline:

  BUG: KASAN: use-after-free in _raw_spin_lock_irqsave+0x71/0xe0
  Write of size 4 at addr ffff8881432000e0 by task swapper/4/0
  ...
  Call Trace:
   <IRQ>
   dump_stack+0x9c/0xd3
   print_address_description.constprop.0+0x19/0x170
   __kasan_report.cold+0x6c/0x84
   kasan_report+0x3a/0x50
   check_memory_region+0xfd/0x1f0
   _raw_spin_lock_irqsave+0x71/0xe0
   ioc_pd_free+0x9d/0x250
   blkg_free.part.0+0x80/0x100
   __blkg_release+0xf3/0x1c0
   rcu_do_batch+0x292/0x700
   rcu_core+0x270/0x2d0
   __do_softirq+0xfd/0x402
    </IRQ>
   asm_call_irq_on_stack+0x12/0x20
   do_softirq_own_stack+0x37/0x50
   irq_exit_rcu+0x134/0x1a0
   sysvec_apic_timer_interrupt+0x36/0x80
   asm_sysvec_apic_timer_interrupt+0x12/0x20

   Freed by task 57:
   kfree+0xba/0x680
   rq_qos_exit+0x5a/0x80
   blk_cleanup_queue+0xce/0x1a0
   virtblk_remove+0x77/0x130 [virtio_blk]
   virtio_dev_remove+0x56/0xe0
   __device_release_driver+0x2ba/0x450
   device_release_driver+0x29/0x40
   bus_remove_device+0x1d8/0x2c0
   device_del+0x333/0x7e0
   device_unregister+0x27/0x90
   unregister_virtio_device+0x22/0x40
   virtio_pci_remove+0x53/0xb0
   pci_device_remove+0x7a/0x130
   __device_release_driver+0x2ba/0x450
   device_release_driver+0x29/0x40
   pci_stop_bus_device+0xcf/0x100
   pci_stop_and_remove_bus_device+0x16/0x20
   disable_slot+0xa1/0x110
   acpiphp_disable_and_eject_slot+0x35/0xe0
   hotplug_event+0x1b8/0x3c0
   acpiphp_hotplug_notify+0x37/0x70
   acpi_device_hotplug+0xee/0x320
   acpi_hotplug_work_fn+0x69/0x80
   process_one_work+0x3c5/0x730
   worker_thread+0x93/0x650
   kthread+0x1ba/0x210
   ret_from_fork+0x22/0x30

It happened as follow:

	T1		     T2			T3
  //delete device
  del_gendisk
   bdi_unregister
    bdi_remove_from_list
     synchronize_rcu_expedited

		         //rmdir cgroup
		         blkcg_destroy_blkgs
		          blkg_destroy
		           percpu_ref_kill
		            blkg_release
		             call_rcu
   rq_qos_exit
    ioc_rqos_exit
     kfree(ioc)
					   __blkg_release
					    blkg_free
					     blkg_free_workfn
					      pd_free_fn
					       ioc_pd_free
						spin_lock_irqsave
						 ->ioc is freed

Fix the problem by moving the operation on ioc in ioc_pd_free() to
ioc_pd_offline(), and just free resource in ioc_pd_free() like iolatency
and throttle.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 2316ba93e7d6..710cf63a1643 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2978,7 +2978,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
 	spin_unlock_irqrestore(&ioc->lock, flags);
 }
 
-static void ioc_pd_free(struct blkg_policy_data *pd)
+static void ioc_pd_offline(struct blkg_policy_data *pd)
 {
 	struct ioc_gq *iocg = pd_to_iocg(pd);
 	struct ioc *ioc = iocg->ioc;
@@ -3002,6 +3002,12 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
 
 		hrtimer_cancel(&iocg->waitq_timer);
 	}
+}
+
+static void ioc_pd_free(struct blkg_policy_data *pd)
+{
+	struct ioc_gq *iocg = pd_to_iocg(pd);
+
 	free_percpu(iocg->pcpu_stat);
 	kfree(iocg);
 }
@@ -3488,6 +3494,7 @@ static struct blkcg_policy blkcg_policy_iocost = {
 	.cpd_free_fn	= ioc_cpd_free,
 	.pd_alloc_fn	= ioc_pd_alloc,
 	.pd_init_fn	= ioc_pd_init,
+	.pd_offline_fn	= ioc_pd_offline,
 	.pd_free_fn	= ioc_pd_free,
 	.pd_stat_fn	= ioc_pd_stat,
 };
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
                   ` (6 preceding siblings ...)
  2022-11-30 13:21 ` [PATCH -next v2 7/9] blk-iocost: fix UAF in ioc_pd_free Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:50   ` Tejun Heo
  2022-11-30 13:21 ` [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption Li Nan
  2022-11-30 17:26 ` [PATCH -next v2 0/9] iocost bugfix Jens Axboe
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

Remove block device when iocost is initializing may cause
null-pointer dereference:

	CPU1				   CPU2
  ioc_qos_write
   blkcg_conf_open_bdev
    blkdev_get_no_open
     kobject_get_unless_zero
    blk_iocost_init
     rq_qos_add
  					del_gendisk
  					 rq_qos_exit
  					  q->rq_qos = rqos->next
  					   //iocost is removed from q->roqs
      blkcg_activate_policy
       pd_init_fn
        ioc_pd_init
  	 ioc = q_to_ioc(blkg->q)
 	  //cant find iocost and return null

Fix problem by moving rq_qos_exit() to disk_release(). ioc_qos_write() get
bd_device.kobj in blkcg_conf_open_bdev(), so disk_release will not be
actived until iocost initialization is complited.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/genhd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index dcf200bcbd3e..0db440bbfefb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -656,7 +656,6 @@ void del_gendisk(struct gendisk *disk)
 		elevator_exit(q);
 		mutex_unlock(&q->sysfs_lock);
 	}
-	rq_qos_exit(q);
 	blk_mq_unquiesce_queue(q);
 
 	/*
@@ -1168,6 +1167,7 @@ static void disk_release(struct device *dev)
 	    !test_bit(GD_ADDED, &disk->state))
 		blk_mq_exit_queue(disk->queue);
 
+	rq_qos_exit(disk->queue);
 	blkcg_exit_disk(disk);
 
 	bioset_exit(&disk->bio_split);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
                   ` (7 preceding siblings ...)
  2022-11-30 13:21 ` [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init Li Nan
@ 2022-11-30 13:21 ` Li Nan
  2022-11-30 20:59   ` Tejun Heo
  2022-11-30 17:26 ` [PATCH -next v2 0/9] iocost bugfix Jens Axboe
  9 siblings, 1 reply; 38+ messages in thread
From: Li Nan @ 2022-11-30 13:21 UTC (permalink / raw)
  To: tj, josef, axboe
  Cc: cgroups, linux-block, linux-kernel, linan122, yukuai3, yi.zhang

From: Yu Kuai <yukuai3@huawei.com>

Our test report a problem:

------------[ cut here ]------------
list_del corruption. next->prev should be ffff888127e0c4b0, but was ffff888127e090b0
WARNING: CPU: 2 PID: 3117789 at lib/list_debug.c:62 __list_del_entry_valid+0x119/0x130
RIP: 0010:__list_del_entry_valid+0x119/0x130
RIP: 0010:__list_del_entry_valid+0x119/0x130
Call Trace:
 <IRQ>
 iocg_flush_stat.isra.0+0x11e/0x230
 ? ioc_rqos_done+0x230/0x230
 ? ioc_now+0x14f/0x180
 ioc_timer_fn+0x569/0x1640

We haven't reporduced it yet, but we think this is due to parent iocg is
freed before child iocg, and then in ioc_timer_fn, walk_list is
corrupted.

1) Remove child cgroup can concurrent with remove parent cgroup, and
ioc_pd_free for parent iocg can be called before child iocg. This can be
fixed by moving the handle of walk_list to ioc_pd_offline, since that
offline from child is ensured to be called before parent.

2) ioc_pd_free can be triggered from both removing device and removing
cgroup, this patch fix the problem by deleting timer before deactivating
policy, so that free parent iocg first in this case won't matter.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/blk-iocost.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 710cf63a1643..d2b873908f88 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2813,13 +2813,14 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
 {
 	struct ioc *ioc = rqos_to_ioc(rqos);
 
+	del_timer_sync(&ioc->timer);
+
 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
 
 	spin_lock_irq(&ioc->lock);
 	ioc->running = IOC_STOP;
 	spin_unlock_irq(&ioc->lock);
 
-	del_timer_sync(&ioc->timer);
 	free_percpu(ioc->pcpu_stat);
 	kfree(ioc);
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
  2022-11-30 13:21 ` [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write() Li Nan
@ 2022-11-30 15:54   ` Christoph Hellwig
  2022-11-30 15:55     ` Christoph Hellwig
  2022-11-30 20:31   ` Tejun Heo
  1 sibling, 1 reply; 38+ messages in thread
From: Christoph Hellwig @ 2022-11-30 15:54 UTC (permalink / raw)
  To: Li Nan
  Cc: tj, josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

> +	ret = nbytes;

ret is an int which bytes is a size_t.  So we at least need a ssize_t
instead for ret, and even that assumes nbytes never overflows a ssize_t.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
  2022-11-30 15:54   ` Christoph Hellwig
@ 2022-11-30 15:55     ` Christoph Hellwig
  0 siblings, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2022-11-30 15:55 UTC (permalink / raw)
  To: Li Nan
  Cc: tj, josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 07:54:59AM -0800, Christoph Hellwig wrote:
> > +	ret = nbytes;
> 
> ret is an int which bytes is a size_t.  So we at least need a ssize_t
> instead for ret, and even that assumes nbytes never overflows a ssize_t.

A better way might be to initialize ret to 0 at declaration time and
then do

	if (ret)
		return ret;
	return nbytes;

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 0/9] iocost bugfix
  2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
                   ` (8 preceding siblings ...)
  2022-11-30 13:21 ` [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption Li Nan
@ 2022-11-30 17:26 ` Jens Axboe
  9 siblings, 0 replies; 38+ messages in thread
From: Jens Axboe @ 2022-11-30 17:26 UTC (permalink / raw)
  To: Li Nan, tj, josef; +Cc: cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On 11/30/22 6:21 AM, Li Nan wrote:
> Li Nan (4):
>   blk-iocost: fix divide by 0 error in calc_lcoefs()
>   blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in
>     ioc_refresh_params()
>   blk-iocost: fix UAF in ioc_pd_free
>   block: fix null-pointer dereference in ioc_pd_init
> 
> Yu Kuai (5):
>   blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
>   blk-iocost: improve hanlder of match_u64()
>   blk-iocost: don't allow to configure bio based device
>   blk-iocost: read params inside lock in sysfs apis
>   blk-iocost: fix walk_list corruption
> 
>  block/blk-iocost.c | 117 ++++++++++++++++++++++++++++-----------------
>  block/genhd.c      |   2 +-
>  2 files changed, 75 insertions(+), 44 deletions(-)

Please include a changelog when posting later versions of a patchset.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 3/9] blk-iocost: don't allow to configure bio based device
  2022-11-30 13:21 ` [PATCH -next v2 3/9] blk-iocost: don't allow to configure bio based device Li Nan
@ 2022-11-30 20:15   ` Tejun Heo
  0 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:15 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:50PM +0800, Li Nan wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> iocost is based on rq_qos, which can only work for request based device,
> thus it doesn't make sense to configure iocost for bio based device.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Li Nan <linan122@huawei.com>

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 4/9] blk-iocost: read params inside lock in sysfs apis
  2022-11-30 13:21 ` [PATCH -next v2 4/9] blk-iocost: read params inside lock in sysfs apis Li Nan
@ 2022-11-30 20:16   ` Tejun Heo
  0 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:16 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:51PM +0800, Li Nan wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> Otherwise, user might get abnormal values if params is updated
> concurrently.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> Signed-off-by: Li Nan <linan122@huawei.com>

Acked-by: Tejun Heo <tj@kernel.org>

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 5/9] blk-iocost: fix divide by 0 error in calc_lcoefs()
  2022-11-30 13:21 ` [PATCH -next v2 5/9] blk-iocost: fix divide by 0 error in calc_lcoefs() Li Nan
@ 2022-11-30 20:20   ` Tejun Heo
  0 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:20 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:52PM +0800, Li Nan wrote:
> echo max of u64 to cost.model can cause divide by 0 error.
> 
>   # echo 8:0 rbps=18446744073709551615 > /sys/fs/cgroup/io.cost.model
> 
>   divide error: 0000 [#1] PREEMPT SMP
>   RIP: 0010:calc_lcoefs+0x4c/0xc0
>   Call Trace:
>    <TASK>
>    ioc_refresh_params+0x2b3/0x4f0
>    ioc_cost_model_write+0x3cb/0x4c0
>    ? _copy_from_iter+0x6d/0x6c0
>    ? kernfs_fop_write_iter+0xfc/0x270
>    cgroup_file_write+0xa0/0x200
>    kernfs_fop_write_iter+0x17d/0x270
>    vfs_write+0x414/0x620
>    ksys_write+0x73/0x160
>    __x64_sys_write+0x1e/0x30
>    do_syscall_64+0x35/0x80
>    entry_SYSCALL_64_after_hwframe+0x63/0xcd
> 
> calc_lcoefs() uses the input value of cost.model in DIV_ROUND_UP_ULL,
> overflow would happen if bps plus IOC_PAGE_SIZE is greater than
> ULLONG_MAX, it can cause divide by 0 error.I_LCOEF_MAX is introduced to
> prevent it.
> 
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>  block/blk-iocost.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/block/blk-iocost.c b/block/blk-iocost.c
> index 3a96cd557c47..f4a754b9d10f 100644
> --- a/block/blk-iocost.c
> +++ b/block/blk-iocost.c
> @@ -306,6 +306,9 @@ enum {
>  	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
>  	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
>  
> +	/* avoid overflow */
> +	I_LCOEF_MAX		= ULLONG_MAX - IOC_PAGE_SIZE,
> +
>  	/* if apart further than 16M, consider randio for linear model */
>  	LCOEF_RANDIO_PAGES	= 4096,
>  };
> @@ -3431,6 +3434,8 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
>  			goto out_unlock;
>  		}
>  
> +		if (v > I_LCOEF_MAX)
> +			goto out_unlock;

This is kinda round-about. Can you just make calc_lcoefs to not divide by
zero (e.g. just make it 1 if zero)?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 6/9] blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in ioc_refresh_params()
  2022-11-30 13:21 ` [PATCH -next v2 6/9] blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in ioc_refresh_params() Li Nan
@ 2022-11-30 20:22   ` Tejun Heo
  0 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:22 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:53PM +0800, Li Nan wrote:
> vrate_min is calculated by DIV64_U64_ROUND_UP, but vrate_max is calculated
> by div64_u64. Vrate_min may be 1 greater than vrate_max if the input
> values min and max of cost.qos are equal.
> 
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>  block/blk-iocost.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/blk-iocost.c b/block/blk-iocost.c
> index f4a754b9d10f..2316ba93e7d6 100644
> --- a/block/blk-iocost.c
> +++ b/block/blk-iocost.c
> @@ -926,7 +926,7 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
>  
>  	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
>  					    VTIME_PER_USEC, MILLION);
> -	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
> +	ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
>  				   VTIME_PER_USEC, MILLION);

Can you please align the parameters line? Other than that

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
  2022-11-30 13:21 ` [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write() Li Nan
  2022-11-30 15:54   ` Christoph Hellwig
@ 2022-11-30 20:31   ` Tejun Heo
  1 sibling, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:31 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:48PM +0800, Li Nan wrote:
> @@ -3197,6 +3197,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
>  	enable = ioc->enabled;
>  	user = ioc->user_qos_params;
>  
> +	ret = -EINVAL;
>  	while ((p = strsep(&input, " \t\n"))) {
>  		substring_t args[MAX_OPT_ARGS];
>  		char buf[32];
> @@ -3218,7 +3219,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
>  			else if (!strcmp(buf, "user"))
>  				user = true;
>  			else
> -				goto einval;
> +				goto out_unlock;

So, I kinda dislike it. That's a lot of code to cover with one "ret =
-EINVAL" assignment which makes it pretty easy to make a mistake. People use
variables like i, ret, err without thinking much and it doesn't help that
you now can't tell whether a given exit condition is error or not by just
looking at it.

I don't know what great extra insight the return value from match_u64()
would carry (like, what else is it gonna say? and even if it does why does
that matter when we say -EINVAL to pretty much everything else) so I'm not
sure this matters but if you really want it just add a separate error return
label?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64()
  2022-11-30 13:21 ` [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64() Li Nan
@ 2022-11-30 20:32   ` Tejun Heo
  2022-12-01  2:15     ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:32 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> 1) There are one place that return value of match_u64() is not checked.
> 2) If match_u64() failed, return value is set to -EINVAL despite that
>    there are other possible errnos.

Ditto. Does this matter?

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 7/9] blk-iocost: fix UAF in ioc_pd_free
  2022-11-30 13:21 ` [PATCH -next v2 7/9] blk-iocost: fix UAF in ioc_pd_free Li Nan
@ 2022-11-30 20:42   ` Tejun Heo
  2022-12-06  7:53     ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:42 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:54PM +0800, Li Nan wrote:
> 	T1		     T2			T3
>   //delete device
>   del_gendisk
>    bdi_unregister
>     bdi_remove_from_list
>      synchronize_rcu_expedited
> 
> 		         //rmdir cgroup
> 		         blkcg_destroy_blkgs
> 		          blkg_destroy
> 		           percpu_ref_kill
> 		            blkg_release
> 		             call_rcu
>    rq_qos_exit
>     ioc_rqos_exit
>      kfree(ioc)
> 					   __blkg_release
> 					    blkg_free
> 					     blkg_free_workfn
> 					      pd_free_fn
> 					       ioc_pd_free
> 						spin_lock_irqsave
> 						 ->ioc is freed
> 
> Fix the problem by moving the operation on ioc in ioc_pd_free() to
> ioc_pd_offline(), and just free resource in ioc_pd_free() like iolatency
> and throttle.
> 
> Signed-off-by: Li Nan <linan122@huawei.com>

I wonder what we really wanna do is pinning ioc while blkgs are still around
but I think this should work too.

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-11-30 13:21 ` [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init Li Nan
@ 2022-11-30 20:50   ` Tejun Heo
  2022-12-01  2:12     ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:50 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:55PM +0800, Li Nan wrote:
> Remove block device when iocost is initializing may cause
> null-pointer dereference:
> 
> 	CPU1				   CPU2
>   ioc_qos_write
>    blkcg_conf_open_bdev
>     blkdev_get_no_open
>      kobject_get_unless_zero
>     blk_iocost_init
>      rq_qos_add
>   					del_gendisk
>   					 rq_qos_exit
>   					  q->rq_qos = rqos->next
>   					   //iocost is removed from q->roqs
>       blkcg_activate_policy
>        pd_init_fn
>         ioc_pd_init
>   	 ioc = q_to_ioc(blkg->q)
>  	  //cant find iocost and return null
> 
> Fix problem by moving rq_qos_exit() to disk_release(). ioc_qos_write() get
> bd_device.kobj in blkcg_conf_open_bdev(), so disk_release will not be
> actived until iocost initialization is complited.

I think it'd be better to make del_gendisk wait for these in-flight
operations because this might fix the above particular issue but now all the
policies are exposed to request_queue in a state it never expected before.

del_gendisk() is quiescing the queue around rq_qos_exit(), so maybe we can
piggyback on that and update blkcg_conf_open_bdev() to provide such
exclusion?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-11-30 13:21 ` [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption Li Nan
@ 2022-11-30 20:59   ` Tejun Heo
  2022-12-01  1:19     ` Yu Kuai
  2022-12-05  9:35     ` Yu Kuai
  0 siblings, 2 replies; 38+ messages in thread
From: Tejun Heo @ 2022-11-30 20:59 UTC (permalink / raw)
  To: Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yukuai3, yi.zhang

On Wed, Nov 30, 2022 at 09:21:56PM +0800, Li Nan wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> Our test report a problem:
> 
> ------------[ cut here ]------------
> list_del corruption. next->prev should be ffff888127e0c4b0, but was ffff888127e090b0
> WARNING: CPU: 2 PID: 3117789 at lib/list_debug.c:62 __list_del_entry_valid+0x119/0x130
> RIP: 0010:__list_del_entry_valid+0x119/0x130
> RIP: 0010:__list_del_entry_valid+0x119/0x130
> Call Trace:
>  <IRQ>
>  iocg_flush_stat.isra.0+0x11e/0x230
>  ? ioc_rqos_done+0x230/0x230
>  ? ioc_now+0x14f/0x180
>  ioc_timer_fn+0x569/0x1640
> 
> We haven't reporduced it yet, but we think this is due to parent iocg is
> freed before child iocg, and then in ioc_timer_fn, walk_list is
> corrupted.
> 
> 1) Remove child cgroup can concurrent with remove parent cgroup, and
> ioc_pd_free for parent iocg can be called before child iocg. This can be
> fixed by moving the handle of walk_list to ioc_pd_offline, since that
> offline from child is ensured to be called before parent.

Which you already did in a previous patch, right?

> 2) ioc_pd_free can be triggered from both removing device and removing
> cgroup, this patch fix the problem by deleting timer before deactivating
> policy, so that free parent iocg first in this case won't matter.

Okay, so, yeah, css's pin parents but blkg's don't. I think the right thing
to do here is making sure that a child blkg pins its parent (and eventually
ioc).

> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>  block/blk-iocost.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/block/blk-iocost.c b/block/blk-iocost.c
> index 710cf63a1643..d2b873908f88 100644
> --- a/block/blk-iocost.c
> +++ b/block/blk-iocost.c
> @@ -2813,13 +2813,14 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
>  {
>  	struct ioc *ioc = rqos_to_ioc(rqos);
>  
> +	del_timer_sync(&ioc->timer);
> +
>  	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
>  
>  	spin_lock_irq(&ioc->lock);
>  	ioc->running = IOC_STOP;
>  	spin_unlock_irq(&ioc->lock);
>  
> -	del_timer_sync(&ioc->timer);

I don't about this workaround. Let's fix properly?

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-11-30 20:59   ` Tejun Heo
@ 2022-12-01  1:19     ` Yu Kuai
  2022-12-01 10:00       ` Tejun Heo
  2022-12-05  9:35     ` Yu Kuai
  1 sibling, 1 reply; 38+ messages in thread
From: Yu Kuai @ 2022-12-01  1:19 UTC (permalink / raw)
  To: Tejun Heo, Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yi.zhang, yukuai (C)

Hi,

在 2022/12/01 4:59, Tejun Heo 写道:
> On Wed, Nov 30, 2022 at 09:21:56PM +0800, Li Nan wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Our test report a problem:
>>
>> ------------[ cut here ]------------
>> list_del corruption. next->prev should be ffff888127e0c4b0, but was ffff888127e090b0
>> WARNING: CPU: 2 PID: 3117789 at lib/list_debug.c:62 __list_del_entry_valid+0x119/0x130
>> RIP: 0010:__list_del_entry_valid+0x119/0x130
>> RIP: 0010:__list_del_entry_valid+0x119/0x130
>> Call Trace:
>>   <IRQ>
>>   iocg_flush_stat.isra.0+0x11e/0x230
>>   ? ioc_rqos_done+0x230/0x230
>>   ? ioc_now+0x14f/0x180
>>   ioc_timer_fn+0x569/0x1640
>>
>> We haven't reporduced it yet, but we think this is due to parent iocg is
>> freed before child iocg, and then in ioc_timer_fn, walk_list is
>> corrupted.
>>
>> 1) Remove child cgroup can concurrent with remove parent cgroup, and
>> ioc_pd_free for parent iocg can be called before child iocg. This can be
>> fixed by moving the handle of walk_list to ioc_pd_offline, since that
>> offline from child is ensured to be called before parent.
> 
> Which you already did in a previous patch, right?

yes, this is already did in patch 7.

> 
>> 2) ioc_pd_free can be triggered from both removing device and removing
>> cgroup, this patch fix the problem by deleting timer before deactivating
>> policy, so that free parent iocg first in this case won't matter.
> 
> Okay, so, yeah, css's pin parents but blkg's don't. I think the right thing
> to do here is making sure that a child blkg pins its parent (and eventually
> ioc).

Ok, I can try to do that.

> 
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> Signed-off-by: Li Nan <linan122@huawei.com>
>> ---
>>   block/blk-iocost.c | 3 ++-
>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/blk-iocost.c b/block/blk-iocost.c
>> index 710cf63a1643..d2b873908f88 100644
>> --- a/block/blk-iocost.c
>> +++ b/block/blk-iocost.c
>> @@ -2813,13 +2813,14 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
>>   {
>>   	struct ioc *ioc = rqos_to_ioc(rqos);
>>   
>> +	del_timer_sync(&ioc->timer);
>> +
>>   	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
>>   
>>   	spin_lock_irq(&ioc->lock);
>>   	ioc->running = IOC_STOP;
>>   	spin_unlock_irq(&ioc->lock);
>>   
>> -	del_timer_sync(&ioc->timer);
> 
> I don't about this workaround. Let's fix properly?

Ok, and by the way, is there any reason to delete timer after
deactivate policy? This seems a litter wreid to me.

Thanks,
Kuai
> 


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-11-30 20:50   ` Tejun Heo
@ 2022-12-01  2:12     ` Yu Kuai
  2022-12-01 10:11       ` Tejun Heo
  0 siblings, 1 reply; 38+ messages in thread
From: Yu Kuai @ 2022-12-01  2:12 UTC (permalink / raw)
  To: Tejun Heo, Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yi.zhang, yukuai (C)

Hi,

在 2022/12/01 4:50, Tejun Heo 写道:
> On Wed, Nov 30, 2022 at 09:21:55PM +0800, Li Nan wrote:
>> Remove block device when iocost is initializing may cause
>> null-pointer dereference:
>>
>> 	CPU1				   CPU2
>>    ioc_qos_write
>>     blkcg_conf_open_bdev
>>      blkdev_get_no_open
>>       kobject_get_unless_zero
>>      blk_iocost_init
>>       rq_qos_add
>>    					del_gendisk
>>    					 rq_qos_exit
>>    					  q->rq_qos = rqos->next
>>    					   //iocost is removed from q->roqs
>>        blkcg_activate_policy
>>         pd_init_fn
>>          ioc_pd_init
>>    	 ioc = q_to_ioc(blkg->q)
>>   	  //cant find iocost and return null
>>
>> Fix problem by moving rq_qos_exit() to disk_release(). ioc_qos_write() get
>> bd_device.kobj in blkcg_conf_open_bdev(), so disk_release will not be
>> actived until iocost initialization is complited.
> 
> I think it'd be better to make del_gendisk wait for these in-flight
> operations because this might fix the above particular issue but now all the
> policies are exposed to request_queue in a state it never expected before.
> 
> del_gendisk() is quiescing the queue around rq_qos_exit(), so maybe we can
> piggyback on that and update blkcg_conf_open_bdev() to provide such
> exclusion?

Let del_gendisk waiting for that sounds good, but I'm litter confused
how to do that. Following are what I think about:

1) By mentioning that "del_gendisk() is quiescing the queue", do you
suggest to add rcu_read_lock()? This seems wrong because blk_iocost_init
requires memory allocation.

2) Hold gendisk open mutex

3) Use q_usage_counter, and in the meantime, rq_qos_add() and
blkcg_activate_policy() will need refactoring to factor out freeze
queue.

4) Define a new metux

Thanks,
Kuai
> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64()
  2022-11-30 20:32   ` Tejun Heo
@ 2022-12-01  2:15     ` Yu Kuai
  2022-12-01 10:08       ` Tejun Heo
  0 siblings, 1 reply; 38+ messages in thread
From: Yu Kuai @ 2022-12-01  2:15 UTC (permalink / raw)
  To: Tejun Heo, Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yi.zhang, yukuai (C)

Hi,

在 2022/12/01 4:32, Tejun Heo 写道:
> On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> 1) There are one place that return value of match_u64() is not checked.
>> 2) If match_u64() failed, return value is set to -EINVAL despite that
>>     there are other possible errnos.
> 
> Ditto. Does this matter?
> 

It's not a big deal, but I think at least return value of match_u64()
should be checked, we don't want to continue with invalid input, right?

By the way, match_u64() can return -ERANGE, which can provide more
specific error messge to user.

Thanks,
Kuai


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-12-01  1:19     ` Yu Kuai
@ 2022-12-01 10:00       ` Tejun Heo
  2022-12-01 10:14         ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-12-01 10:00 UTC (permalink / raw)
  To: Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

On Thu, Dec 01, 2022 at 09:19:54AM +0800, Yu Kuai wrote:
> > > diff --git a/block/blk-iocost.c b/block/blk-iocost.c
> > > index 710cf63a1643..d2b873908f88 100644
> > > --- a/block/blk-iocost.c
> > > +++ b/block/blk-iocost.c
> > > @@ -2813,13 +2813,14 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
> > >   {
> > >   	struct ioc *ioc = rqos_to_ioc(rqos);
> > > +	del_timer_sync(&ioc->timer);
> > > +
> > >   	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
> > >   	spin_lock_irq(&ioc->lock);
> > >   	ioc->running = IOC_STOP;
> > >   	spin_unlock_irq(&ioc->lock);
> > > -	del_timer_sync(&ioc->timer);
> > 
> > I don't about this workaround. Let's fix properly?
> 
> Ok, and by the way, is there any reason to delete timer after
> deactivate policy? This seems a litter wreid to me.

ioc->running is what controls whether the timer gets rescheduled or not. If
we don't shut that down, the timer may as well get rescheduled after being
deleted. Here, the only extra activation point is IO issue which shouldn't
trigger during rq_qos_exit, so the ordering shouldn't matter but this is the
right order for anything which can get restarted.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64()
  2022-12-01  2:15     ` Yu Kuai
@ 2022-12-01 10:08       ` Tejun Heo
  2022-12-01 13:47         ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-12-01 10:08 UTC (permalink / raw)
  To: Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

On Thu, Dec 01, 2022 at 10:15:53AM +0800, Yu Kuai wrote:
> Hi,
> 
> 在 2022/12/01 4:32, Tejun Heo 写道:
> > On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
> > > From: Yu Kuai <yukuai3@huawei.com>
> > > 
> > > 1) There are one place that return value of match_u64() is not checked.
> > > 2) If match_u64() failed, return value is set to -EINVAL despite that
> > >     there are other possible errnos.
> > 
> > Ditto. Does this matter?
> > 
> 
> It's not a big deal, but I think at least return value of match_u64()
> should be checked, we don't want to continue with invalid input, right?

Yeah, sure.

> By the way, match_u64() can return -ERANGE, which can provide more
> specific error messge to user.

I'm really not convinced going over 64bit range would be all that difficult
to spot whether the error code is -EINVAL or -ERANGE. There isn't anything
wrong with returning -ERANGE but the fact that that particular function
returns an error code doesn't necessarily mean that it *must* be forwarded.

Imagine that we used sscanf(buf, "%llu", &value) to parse the number
instead. We'd only know whether the parsing would have succeeded or not and
would probably return -EINVAL on failure and the behavior would be just
fine. This does not matter *at all*.

So, idk, I'm not necessarily against it but changing -EINVAL to -ERANGE is
pure churn. Nothing material is being improved by that change.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-12-01  2:12     ` Yu Kuai
@ 2022-12-01 10:11       ` Tejun Heo
  2022-12-01 10:23         ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-12-01 10:11 UTC (permalink / raw)
  To: Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

On Thu, Dec 01, 2022 at 10:12:13AM +0800, Yu Kuai wrote:
> 1) By mentioning that "del_gendisk() is quiescing the queue", do you
> suggest to add rcu_read_lock()? This seems wrong because blk_iocost_init
> requires memory allocation.

Quiescing uses SRCU so that should be fine but I'm not sure whether this is
the right one to piggyback on. Jens should have a better idea.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-12-01 10:00       ` Tejun Heo
@ 2022-12-01 10:14         ` Yu Kuai
  2022-12-01 10:29           ` Tejun Heo
  0 siblings, 1 reply; 38+ messages in thread
From: Yu Kuai @ 2022-12-01 10:14 UTC (permalink / raw)
  To: Tejun Heo, Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

Hi,

在 2022/12/01 18:00, Tejun Heo 写道:
> On Thu, Dec 01, 2022 at 09:19:54AM +0800, Yu Kuai wrote:
>>>> diff --git a/block/blk-iocost.c b/block/blk-iocost.c
>>>> index 710cf63a1643..d2b873908f88 100644
>>>> --- a/block/blk-iocost.c
>>>> +++ b/block/blk-iocost.c
>>>> @@ -2813,13 +2813,14 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
>>>>    {
>>>>    	struct ioc *ioc = rqos_to_ioc(rqos);
>>>> +	del_timer_sync(&ioc->timer);
>>>> +
>>>>    	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
>>>>    	spin_lock_irq(&ioc->lock);
>>>>    	ioc->running = IOC_STOP;
>>>>    	spin_unlock_irq(&ioc->lock);
>>>> -	del_timer_sync(&ioc->timer);
>>>
>>> I don't about this workaround. Let's fix properly?
>>
>> Ok, and by the way, is there any reason to delete timer after
>> deactivate policy? This seems a litter wreid to me.
> 
> ioc->running is what controls whether the timer gets rescheduled or not. If
> we don't shut that down, the timer may as well get rescheduled after being
> deleted. Here, the only extra activation point is IO issue which shouldn't
> trigger during rq_qos_exit, so the ordering shouldn't matter but this is the
> right order for anything which can get restarted.

Thanks for the explanation.

I'm trying to figure out how to make sure child blkg pins it's parent,
btw, do you think following cleanup make sense?

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index a645184aba4a..6ad8791af9d7 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2810,13 +2810,13 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
  {
         struct ioc *ioc = rqos_to_ioc(rqos);

-       blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
-
         spin_lock_irq(&ioc->lock);
         ioc->running = IOC_STOP;
         spin_unlock_irq(&ioc->lock);

         del_timer_sync(&ioc->timer);
+       blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
+
         free_percpu(ioc->pcpu_stat);
         kfree(ioc);
  }

Thanks,
Kuai


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-12-01 10:11       ` Tejun Heo
@ 2022-12-01 10:23         ` Yu Kuai
  2022-12-01 10:31           ` Tejun Heo
  0 siblings, 1 reply; 38+ messages in thread
From: Yu Kuai @ 2022-12-01 10:23 UTC (permalink / raw)
  To: Tejun Heo, Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

Hi,

在 2022/12/01 18:11, Tejun Heo 写道:
> On Thu, Dec 01, 2022 at 10:12:13AM +0800, Yu Kuai wrote:
>> 1) By mentioning that "del_gendisk() is quiescing the queue", do you
>> suggest to add rcu_read_lock()? This seems wrong because blk_iocost_init
>> requires memory allocation.
> 
> Quiescing uses SRCU so that should be fine but I'm not sure whether this is
> the right one to piggyback on. Jens should have a better idea.
> 
> Thanks.
> 

Currently SRCU is used if BLK_MQ_F_BLOCKING set, otherwise RCU is used.

dispatch:
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do {                                                            \
         if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) {          \
                 int srcu_idx;                                   \
                                                                 \
                 might_sleep_if(check_sleep);                    \
                 srcu_idx = srcu_read_lock((q)->tag_set->srcu);  \
                 (dispatch_ops);                                 \
                 srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \
         } else {                                                \
                 rcu_read_lock();                                \
                 (dispatch_ops);                                 \
                 rcu_read_unlock();                              \
         }                                                       \
} while (0)

quiesce:
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
         if (set->flags & BLK_MQ_F_BLOCKING)
                 synchronize_srcu(set->srcu);
         else
                 synchronize_rcu();
}

Thanks,
Kuai


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-12-01 10:14         ` Yu Kuai
@ 2022-12-01 10:29           ` Tejun Heo
  2022-12-01 13:43             ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-12-01 10:29 UTC (permalink / raw)
  To: Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

On Thu, Dec 01, 2022 at 06:14:32PM +0800, Yu Kuai wrote:
> Hi,
> 
> 在 2022/12/01 18:00, Tejun Heo 写道:
> > On Thu, Dec 01, 2022 at 09:19:54AM +0800, Yu Kuai wrote:
> > > > > diff --git a/block/blk-iocost.c b/block/blk-iocost.c
> > > > > index 710cf63a1643..d2b873908f88 100644
> > > > > --- a/block/blk-iocost.c
> > > > > +++ b/block/blk-iocost.c
> > > > > @@ -2813,13 +2813,14 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
> > > > >    {
> > > > >    	struct ioc *ioc = rqos_to_ioc(rqos);
> > > > > +	del_timer_sync(&ioc->timer);
> > > > > +
> > > > >    	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
> > > > >    	spin_lock_irq(&ioc->lock);
> > > > >    	ioc->running = IOC_STOP;
> > > > >    	spin_unlock_irq(&ioc->lock);
> > > > > -	del_timer_sync(&ioc->timer);
> > > > 
> > > > I don't about this workaround. Let's fix properly?
> > > 
> > > Ok, and by the way, is there any reason to delete timer after
> > > deactivate policy? This seems a litter wreid to me.
> > 
> > ioc->running is what controls whether the timer gets rescheduled or not. If
> > we don't shut that down, the timer may as well get rescheduled after being
> > deleted. Here, the only extra activation point is IO issue which shouldn't
> > trigger during rq_qos_exit, so the ordering shouldn't matter but this is the
> > right order for anything which can get restarted.
> 
> Thanks for the explanation.
> 
> I'm trying to figure out how to make sure child blkg pins it's parent,
> btw, do you think following cleanup make sense?

It's on you to explain why any change that you're suggesting is better and
safe. I know it's not intentional but you're repeatedly suggesting operation
reorderings in code paths which are really sensitive to ordering at least
seemingly without putting much effort into thinking through the side
effects. This costs disproportionate amount of review bandwidth, and
increases the chance of new subtle bugs. Can you please slow down a bit and
be more deliberate?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-12-01 10:23         ` Yu Kuai
@ 2022-12-01 10:31           ` Tejun Heo
  2022-12-05  9:32             ` Yu Kuai
  0 siblings, 1 reply; 38+ messages in thread
From: Tejun Heo @ 2022-12-01 10:31 UTC (permalink / raw)
  To: Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

On Thu, Dec 01, 2022 at 06:23:16PM +0800, Yu Kuai wrote:
> Hi,
> 
> 在 2022/12/01 18:11, Tejun Heo 写道:
> > On Thu, Dec 01, 2022 at 10:12:13AM +0800, Yu Kuai wrote:
> > > 1) By mentioning that "del_gendisk() is quiescing the queue", do you
> > > suggest to add rcu_read_lock()? This seems wrong because blk_iocost_init
> > > requires memory allocation.
> > 
> > Quiescing uses SRCU so that should be fine but I'm not sure whether this is
> > the right one to piggyback on. Jens should have a better idea.
> > 
> > Thanks.
> > 
> 
> Currently SRCU is used if BLK_MQ_F_BLOCKING set, otherwise RCU is used.
> 
> dispatch:
> #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
> do {                                                            \
>         if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) {          \
>                 int srcu_idx;                                   \
>                                                                 \
>                 might_sleep_if(check_sleep);                    \
>                 srcu_idx = srcu_read_lock((q)->tag_set->srcu);  \
>                 (dispatch_ops);                                 \
>                 srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \
>         } else {                                                \
>                 rcu_read_lock();                                \
>                 (dispatch_ops);                                 \
>                 rcu_read_unlock();                              \
>         }                                                       \
> } while (0)
> 
> quiesce:
> void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
> {
>         if (set->flags & BLK_MQ_F_BLOCKING)
>                 synchronize_srcu(set->srcu);
>         else
>                 synchronize_rcu();
> }

Oh I see. Unfortunately, I don't know what to do off the top of my head.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-12-01 10:29           ` Tejun Heo
@ 2022-12-01 13:43             ` Yu Kuai
  0 siblings, 0 replies; 38+ messages in thread
From: Yu Kuai @ 2022-12-01 13:43 UTC (permalink / raw)
  To: Tejun Heo, Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)



在 2022/12/01 18:29, Tejun Heo 写道:
> On Thu, Dec 01, 2022 at 06:14:32PM +0800, Yu Kuai wrote:
>> Hi,
>>
>> 在 2022/12/01 18:00, Tejun Heo 写道:
>>> On Thu, Dec 01, 2022 at 09:19:54AM +0800, Yu Kuai wrote:
>>>>>> diff --git a/block/blk-iocost.c b/block/blk-iocost.c
>>>>>> index 710cf63a1643..d2b873908f88 100644
>>>>>> --- a/block/blk-iocost.c
>>>>>> +++ b/block/blk-iocost.c
>>>>>> @@ -2813,13 +2813,14 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
>>>>>>     {
>>>>>>     	struct ioc *ioc = rqos_to_ioc(rqos);
>>>>>> +	del_timer_sync(&ioc->timer);
>>>>>> +
>>>>>>     	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
>>>>>>     	spin_lock_irq(&ioc->lock);
>>>>>>     	ioc->running = IOC_STOP;
>>>>>>     	spin_unlock_irq(&ioc->lock);
>>>>>> -	del_timer_sync(&ioc->timer);
>>>>>
>>>>> I don't about this workaround. Let's fix properly?
>>>>
>>>> Ok, and by the way, is there any reason to delete timer after
>>>> deactivate policy? This seems a litter wreid to me.
>>>
>>> ioc->running is what controls whether the timer gets rescheduled or not. If
>>> we don't shut that down, the timer may as well get rescheduled after being
>>> deleted. Here, the only extra activation point is IO issue which shouldn't
>>> trigger during rq_qos_exit, so the ordering shouldn't matter but this is the
>>> right order for anything which can get restarted.
>>
>> Thanks for the explanation.
>>
>> I'm trying to figure out how to make sure child blkg pins it's parent,
>> btw, do you think following cleanup make sense?
> 
> It's on you to explain why any change that you're suggesting is better and
> safe. I know it's not intentional but you're repeatedly suggesting operation
> reorderings in code paths which are really sensitive to ordering at least
> seemingly without putting much effort into thinking through the side
> effects. This costs disproportionate amount of review bandwidth, and
> increases the chance of new subtle bugs. Can you please slow down a bit and
> be more deliberate?

Thanks for the suggestion, I'll pay close attention to explain this "why
the change is better and safe". And sorry for the review pressure. 😔

> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64()
  2022-12-01 10:08       ` Tejun Heo
@ 2022-12-01 13:47         ` Yu Kuai
  0 siblings, 0 replies; 38+ messages in thread
From: Yu Kuai @ 2022-12-01 13:47 UTC (permalink / raw)
  To: Tejun Heo, Yu Kuai
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

Hi,

在 2022/12/01 18:08, Tejun Heo 写道:
> On Thu, Dec 01, 2022 at 10:15:53AM +0800, Yu Kuai wrote:
>> Hi,
>>
>> 在 2022/12/01 4:32, Tejun Heo 写道:
>>> On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
>>>> From: Yu Kuai <yukuai3@huawei.com>
>>>>
>>>> 1) There are one place that return value of match_u64() is not checked.
>>>> 2) If match_u64() failed, return value is set to -EINVAL despite that
>>>>      there are other possible errnos.
>>>
>>> Ditto. Does this matter?
>>>
>>
>> It's not a big deal, but I think at least return value of match_u64()
>> should be checked, we don't want to continue with invalid input, right?
> 
> Yeah, sure.
> 
>> By the way, match_u64() can return -ERANGE, which can provide more
>> specific error messge to user.
> 
> I'm really not convinced going over 64bit range would be all that difficult
> to spot whether the error code is -EINVAL or -ERANGE. There isn't anything
> wrong with returning -ERANGE but the fact that that particular function
> returns an error code doesn't necessarily mean that it *must* be forwarded.
> 
> Imagine that we used sscanf(buf, "%llu", &value) to parse the number
> instead. We'd only know whether the parsing would have succeeded or not and
> would probably return -EINVAL on failure and the behavior would be just
> fine. This does not matter *at all*.
> 
> So, idk, I'm not necessarily against it but changing -EINVAL to -ERANGE is
> pure churn. Nothing material is being improved by that change.

Thanks for the review and explanation, I'll just keep the addition of
return value  checking of the former 2 patches.

Thanks,
Kuai


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-12-01 10:31           ` Tejun Heo
@ 2022-12-05  9:32             ` Yu Kuai
  2022-12-12 23:10               ` Tejun Heo
  0 siblings, 1 reply; 38+ messages in thread
From: Yu Kuai @ 2022-12-05  9:32 UTC (permalink / raw)
  To: Tejun Heo, Yu Kuai, Christoph Hellwig
  Cc: Li Nan, josef, axboe, cgroups, linux-block, linux-kernel,
	yi.zhang, yukuai (C)

Hi, Tejun!

While reviewing rq_qos code, I found that there are some other possible
defects:

1) queue_lock is held to protect rq_qos_add() and rq_qos_del(), whlie
it's not held to protect rq_qos_exit(), which is absolutely not safe
because they can be called concurrently by configuring iocost and
removing device.
I'm thinking about holding the lock to fetch the list and reset
q->rq_qos first:

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 88f0fe7dcf54..271ad65eebd9 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -288,9 +288,15 @@ void rq_qos_wait(struct rq_wait *rqw, void 
*private_data,

  void rq_qos_exit(struct request_queue *q)
  {
-       while (q->rq_qos) {
-               struct rq_qos *rqos = q->rq_qos;
-               q->rq_qos = rqos->next;
+       struct rq_qos *rqos;
+
+       spin_lock_irq(&q->queue_lock);
+       rqos = q->rq_qos;
+       q->rq_qos = NULL;
+       spin_unlock_irq(&q->queue_lock);
+
+       while (rqos) {
                 rqos->ops->exit(rqos);
+               rqos = rqos->next;
         }
  }

2) rq_qos_add() can still succeed after rq_qos_exit() is done, which
will cause memory leak. Hence a checking is required beforing adding
to q->rq_qos. I'm thinking about flag QUEUE_FLAG_DYING first, but the
flag will not set if disk state is not marked GD_OWNS_QUEUE. Since
blk_unregister_queue() is called before rq_qos_exit(), use the queue
flag QUEUE_FLAG_REGISTERED should be OK.

For the current problem that device can be removed while initializing
, I'm thinking about some possible solutions:

Since bfq is initialized in elevator initialization, and others are
in queue initialization, such problem is only possible in iocost, hence
it make sense to fix it in iocost:

1) use open mutex to prevet concurrency, however, this will cause
that configuring iocost will block some other operations that is relied
on open_mutex.

@@ -2889,7 +2889,15 @@ static int blk_iocost_init(struct gendisk *disk)
         if (ret)
                 goto err_free_ioc;

+       mutex_lock(&disk->open_mutex);
+       if (!disk_live(disk)) {
+               mutex_unlock(&disk->open_mutex);
+               ret = -ENODEV;
+               goto err_del_qos;
+       }
+
         ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
+       mutex_unlock(&disk->open_mutex);
         if (ret)
                 goto err_del_qos;

2) like 1), the difference is that define a new mutex just in iocst.

3) Or is it better to fix it in the higher level? For example:
add a new restriction that blkcg_deactivate_policy() should be called
with blkcg_activate_policy() in pairs, and blkcg_deactivate_policy()
will wait for blkcg_activate_policy() to finish. Something like:

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ef4fef1af909..6266f702157f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1410,7 +1410,7 @@ int blkcg_activate_policy(struct request_queue *q,
         struct blkcg_gq *blkg, *pinned_blkg = NULL;
         int ret;

-       if (blkcg_policy_enabled(q, pol))
+       if (WARN_ON_ONCE(blkcg_policy_enabled(q, pol)))
                 return 0;

         if (queue_is_mq(q))
@@ -1477,6 +1477,8 @@ int blkcg_activate_policy(struct request_queue *q,
                 blkg_put(pinned_blkg);
         if (pd_prealloc)
                 pol->pd_free_fn(pd_prealloc);
+       if (!ret)
+               wake_up(q->policy_waitq);
         return ret;

  enomem:
@@ -1512,7 +1514,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
         struct blkcg_gq *blkg;

         if (!blkcg_policy_enabled(q, pol))
-               return;
+               wait_event(q->policy_waitq, blkcg_policy_enabled(q, pol));
    wait_event(q->xxx, blkcg_policy_enabled(q, pol));


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption
  2022-11-30 20:59   ` Tejun Heo
  2022-12-01  1:19     ` Yu Kuai
@ 2022-12-05  9:35     ` Yu Kuai
  1 sibling, 0 replies; 38+ messages in thread
From: Yu Kuai @ 2022-12-05  9:35 UTC (permalink / raw)
  To: Tejun Heo, Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yi.zhang, yukuai (C)

Hi, Tejun

在 2022/12/01 4:59, Tejun Heo 写道:
> On Wed, Nov 30, 2022 at 09:21:56PM +0800, Li Nan wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Our test report a problem:
>>
>> ------------[ cut here ]------------
>> list_del corruption. next->prev should be ffff888127e0c4b0, but was ffff888127e090b0
>> WARNING: CPU: 2 PID: 3117789 at lib/list_debug.c:62 __list_del_entry_valid+0x119/0x130
>> RIP: 0010:__list_del_entry_valid+0x119/0x130
>> RIP: 0010:__list_del_entry_valid+0x119/0x130
>> Call Trace:
>>   <IRQ>
>>   iocg_flush_stat.isra.0+0x11e/0x230
>>   ? ioc_rqos_done+0x230/0x230
>>   ? ioc_now+0x14f/0x180
>>   ioc_timer_fn+0x569/0x1640
>>
>> We haven't reporduced it yet, but we think this is due to parent iocg is
>> freed before child iocg, and then in ioc_timer_fn, walk_list is
>> corrupted.
>>
>> 1) Remove child cgroup can concurrent with remove parent cgroup, and
>> ioc_pd_free for parent iocg can be called before child iocg. This can be
>> fixed by moving the handle of walk_list to ioc_pd_offline, since that
>> offline from child is ensured to be called before parent.
> 
> Which you already did in a previous patch, right?
> 
>> 2) ioc_pd_free can be triggered from both removing device and removing
>> cgroup, this patch fix the problem by deleting timer before deactivating
>> policy, so that free parent iocg first in this case won't matter.
> 
> Okay, so, yeah, css's pin parents but blkg's don't. I think the right thing
> to do here is making sure that a child blkg pins its parent (and eventually
> ioc).

Sorry about this, actually it's can be ensured that pd_offline
from child will be called before parent. Hence just moving he handle of
walk_list to ioc_pd_offline can fix this problem thoroughly.

Thanks,
Kuai


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 7/9] blk-iocost: fix UAF in ioc_pd_free
  2022-11-30 20:42   ` Tejun Heo
@ 2022-12-06  7:53     ` Yu Kuai
  0 siblings, 0 replies; 38+ messages in thread
From: Yu Kuai @ 2022-12-06  7:53 UTC (permalink / raw)
  To: Tejun Heo, Li Nan
  Cc: josef, axboe, cgroups, linux-block, linux-kernel, yi.zhang, yukuai (C)

Hi, Tejun!

在 2022/12/01 4:42, Tejun Heo 写道:
> On Wed, Nov 30, 2022 at 09:21:54PM +0800, Li Nan wrote:
>> 	T1		     T2			T3
>>    //delete device
>>    del_gendisk
>>     bdi_unregister
>>      bdi_remove_from_list
>>       synchronize_rcu_expedited
>>
>> 		         //rmdir cgroup
>> 		         blkcg_destroy_blkgs
>> 		          blkg_destroy
>> 		           percpu_ref_kill
>> 		            blkg_release
>> 		             call_rcu
>>     rq_qos_exit
>>      ioc_rqos_exit
>>       kfree(ioc)
>> 					   __blkg_release
>> 					    blkg_free
>> 					     blkg_free_workfn
>> 					      pd_free_fn
>> 					       ioc_pd_free
>> 						spin_lock_irqsave
>> 						 ->ioc is freed
>>
>> Fix the problem by moving the operation on ioc in ioc_pd_free() to
>> ioc_pd_offline(), and just free resource in ioc_pd_free() like iolatency
>> and throttle.
>>
>> Signed-off-by: Li Nan <linan122@huawei.com>
> 
> I wonder what we really wanna do is pinning ioc while blkgs are still around
> but I think this should work too.
> 

I just found that this is not enough, other problems still exist:

t1:		
bio_init
  bio_associate_blkg
   //get blkg->refcnt
......
submit_bio
  blk_throtl_bio
  // bio is throttlled, user thread can exit
  			t2:
			// blkcg online_pin is zero
			blkcg_destroy_blkgs
			 blkg_destroy
			  ioc_pd_offline
			   list_del_init(&iocg->active_list)
t3:
ioc_rqos_throttle
  blkg_to_iocg
  // got the iocg that is offlined
   iocg_activate
   // acitvate the iocg again

For consequence, kernel can crash due to access unexpected
address. Fortunately, bfq already handle similar problem by checking
blkg->online in bfq_bio_bfqg(), this problem can be fixed by checking
it in iocg_activate().

BTW, I'm still working on checking if other policies have the same
problem.

Thanks,
Kuai


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init
  2022-12-05  9:32             ` Yu Kuai
@ 2022-12-12 23:10               ` Tejun Heo
  0 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2022-12-12 23:10 UTC (permalink / raw)
  To: Yu Kuai
  Cc: Christoph Hellwig, Li Nan, josef, axboe, cgroups, linux-block,
	linux-kernel, yi.zhang, yukuai (C)

Hello,

On Mon, Dec 05, 2022 at 05:32:17PM +0800, Yu Kuai wrote:
> 1) queue_lock is held to protect rq_qos_add() and rq_qos_del(), whlie
> it's not held to protect rq_qos_exit(), which is absolutely not safe
> because they can be called concurrently by configuring iocost and
> removing device.
> I'm thinking about holding the lock to fetch the list and reset
> q->rq_qos first:
> 
> diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
> index 88f0fe7dcf54..271ad65eebd9 100644
> --- a/block/blk-rq-qos.c
> +++ b/block/blk-rq-qos.c
> @@ -288,9 +288,15 @@ void rq_qos_wait(struct rq_wait *rqw, void
> *private_data,
> 
>  void rq_qos_exit(struct request_queue *q)
>  {
> -       while (q->rq_qos) {
> -               struct rq_qos *rqos = q->rq_qos;
> -               q->rq_qos = rqos->next;
> +       struct rq_qos *rqos;
> +
> +       spin_lock_irq(&q->queue_lock);
> +       rqos = q->rq_qos;
> +       q->rq_qos = NULL;
> +       spin_unlock_irq(&q->queue_lock);
> +
> +       while (rqos) {
>                 rqos->ops->exit(rqos);
> +               rqos = rqos->next;
>         }
>  }
> 
> 2) rq_qos_add() can still succeed after rq_qos_exit() is done, which
> will cause memory leak. Hence a checking is required beforing adding
> to q->rq_qos. I'm thinking about flag QUEUE_FLAG_DYING first, but the
> flag will not set if disk state is not marked GD_OWNS_QUEUE. Since
> blk_unregister_queue() is called before rq_qos_exit(), use the queue
> flag QUEUE_FLAG_REGISTERED should be OK.
> 
> For the current problem that device can be removed while initializing
> , I'm thinking about some possible solutions:
> 
> Since bfq is initialized in elevator initialization, and others are
> in queue initialization, such problem is only possible in iocost, hence
> it make sense to fix it in iocost:

So, iolatency is likely to switch to similar lazy init scheme, so we better
fix it in the rq_qos / core block layer.

...
> 3) Or is it better to fix it in the higher level? For example:
> add a new restriction that blkcg_deactivate_policy() should be called
> with blkcg_activate_policy() in pairs, and blkcg_deactivate_policy()
> will wait for blkcg_activate_policy() to finish. Something like:
> 
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index ef4fef1af909..6266f702157f 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1410,7 +1410,7 @@ int blkcg_activate_policy(struct request_queue *q,
>         struct blkcg_gq *blkg, *pinned_blkg = NULL;
>         int ret;
> 
> -       if (blkcg_policy_enabled(q, pol))
> +       if (WARN_ON_ONCE(blkcg_policy_enabled(q, pol)))
>                 return 0;
> 
>         if (queue_is_mq(q))
> @@ -1477,6 +1477,8 @@ int blkcg_activate_policy(struct request_queue *q,
>                 blkg_put(pinned_blkg);
>         if (pd_prealloc)
>                 pol->pd_free_fn(pd_prealloc);
> +       if (!ret)
> +               wake_up(q->policy_waitq);
>         return ret;
> 
>  enomem:
> @@ -1512,7 +1514,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
>         struct blkcg_gq *blkg;
> 
>         if (!blkcg_policy_enabled(q, pol))
> -               return;
> +               wait_event(q->policy_waitq, blkcg_policy_enabled(q, pol));
>    wait_event(q->xxx, blkcg_policy_enabled(q, pol));

Yeah, along this line but hopefully something simpler like a mutex.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2022-12-12 23:10 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-30 13:21 [PATCH -next v2 0/9] iocost bugfix Li Nan
2022-11-30 13:21 ` [PATCH -next v2 1/9] blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write() Li Nan
2022-11-30 15:54   ` Christoph Hellwig
2022-11-30 15:55     ` Christoph Hellwig
2022-11-30 20:31   ` Tejun Heo
2022-11-30 13:21 ` [PATCH -next v2 2/9] blk-iocost: improve hanlder of match_u64() Li Nan
2022-11-30 20:32   ` Tejun Heo
2022-12-01  2:15     ` Yu Kuai
2022-12-01 10:08       ` Tejun Heo
2022-12-01 13:47         ` Yu Kuai
2022-11-30 13:21 ` [PATCH -next v2 3/9] blk-iocost: don't allow to configure bio based device Li Nan
2022-11-30 20:15   ` Tejun Heo
2022-11-30 13:21 ` [PATCH -next v2 4/9] blk-iocost: read params inside lock in sysfs apis Li Nan
2022-11-30 20:16   ` Tejun Heo
2022-11-30 13:21 ` [PATCH -next v2 5/9] blk-iocost: fix divide by 0 error in calc_lcoefs() Li Nan
2022-11-30 20:20   ` Tejun Heo
2022-11-30 13:21 ` [PATCH -next v2 6/9] blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in ioc_refresh_params() Li Nan
2022-11-30 20:22   ` Tejun Heo
2022-11-30 13:21 ` [PATCH -next v2 7/9] blk-iocost: fix UAF in ioc_pd_free Li Nan
2022-11-30 20:42   ` Tejun Heo
2022-12-06  7:53     ` Yu Kuai
2022-11-30 13:21 ` [PATCH -next v2 8/9] block: fix null-pointer dereference in ioc_pd_init Li Nan
2022-11-30 20:50   ` Tejun Heo
2022-12-01  2:12     ` Yu Kuai
2022-12-01 10:11       ` Tejun Heo
2022-12-01 10:23         ` Yu Kuai
2022-12-01 10:31           ` Tejun Heo
2022-12-05  9:32             ` Yu Kuai
2022-12-12 23:10               ` Tejun Heo
2022-11-30 13:21 ` [PATCH -next v2 9/9] blk-iocost: fix walk_list corruption Li Nan
2022-11-30 20:59   ` Tejun Heo
2022-12-01  1:19     ` Yu Kuai
2022-12-01 10:00       ` Tejun Heo
2022-12-01 10:14         ` Yu Kuai
2022-12-01 10:29           ` Tejun Heo
2022-12-01 13:43             ` Yu Kuai
2022-12-05  9:35     ` Yu Kuai
2022-11-30 17:26 ` [PATCH -next v2 0/9] iocost bugfix Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).