* [PATCH 1/4] block: split wbt_init() into two parts
2021-05-25 8:04 [PATCH 0/4] block: fix race between adding wbt and normal IO Ming Lei
@ 2021-05-25 8:04 ` Ming Lei
2021-06-04 0:32 ` Bart Van Assche
2021-05-25 8:04 ` [PATCH 2/4] block: move wbt allocation into blk_alloc_queue Ming Lei
` (3 subsequent siblings)
4 siblings, 1 reply; 9+ messages in thread
From: Ming Lei @ 2021-05-25 8:04 UTC (permalink / raw)
To: Jens Axboe, Christoph Hellwig; +Cc: linux-block, Ming Lei, Yi Zhang
Split wbt_init() into wbt_alloc() and wbt_init(), and prepare for
moving wbt allocation into blk_alloc_queue().
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-wbt.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 42aed0160f86..efff1232446f 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -809,7 +809,7 @@ static struct rq_qos_ops wbt_rqos_ops = {
#endif
};
-int wbt_init(struct request_queue *q)
+static int wbt_alloc(struct request_queue *q)
{
struct rq_wb *rwb;
int i;
@@ -832,7 +832,6 @@ int wbt_init(struct request_queue *q)
rwb->rqos.q = q;
rwb->last_comp = rwb->last_issue = jiffies;
rwb->win_nsec = RWB_WINDOW_NSEC;
- rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->wc = 1;
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
@@ -842,6 +841,19 @@ int wbt_init(struct request_queue *q)
rq_qos_add(q, &rwb->rqos);
blk_stat_add_callback(q, rwb->cb);
+ return 0;
+}
+
+int wbt_init(struct request_queue *q)
+{
+ int ret = wbt_alloc(q);
+ struct rq_wb *rwb;
+
+ if (ret)
+ return ret;
+
+ rwb = RQWB(wbt_rq_qos(q));
+ rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_queue_depth_changed(&rwb->rqos);
--
2.29.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/4] block: move wbt allocation into blk_alloc_queue
2021-05-25 8:04 [PATCH 0/4] block: fix race between adding wbt and normal IO Ming Lei
2021-05-25 8:04 ` [PATCH 1/4] block: split wbt_init() into two parts Ming Lei
@ 2021-05-25 8:04 ` Ming Lei
2021-06-04 0:44 ` Bart Van Assche
2021-05-25 8:04 ` [PATCH 3/4] block: reuse wbt_set_min_lat for setting wbt->min_lat_nsec Ming Lei
` (2 subsequent siblings)
4 siblings, 1 reply; 9+ messages in thread
From: Ming Lei @ 2021-05-25 8:04 UTC (permalink / raw)
To: Jens Axboe, Christoph Hellwig; +Cc: linux-block, Ming Lei, Yi Zhang
wbt_init() calls wbt_alloc() which adds allocated wbt instance into
q->rq_qos. This way is very dangerous because q->rq_qos is accessed in
IO fast path.
So far wbt_init() is called in the two code paths:
1) blk_register_queue(), when the block device has been exposed to
userspace, so IO may come when adding wbt into q->rq_qos
2) sysfs attribute store, in which normal IO is definitely allowed
Move wbt allocation into blk_alloc_queue() to avoid adding the wbt
instance dynamically to q->rq_qos. And before calling wbt_init(), the
wbt is disabled, so functionally it works as expected.
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-core.c | 6 ++++++
block/blk-mq-debugfs.c | 3 +++
block/blk-sysfs.c | 8 --------
block/blk-wbt.c | 21 ++++++---------------
block/blk-wbt.h | 10 +++++++---
5 files changed, 22 insertions(+), 26 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 689aac2625d2..e7d9eac5d4c1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,6 +51,7 @@
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-rq-qos.h"
+#include "blk-wbt.h"
struct dentry *blk_debugfs_root;
@@ -579,12 +580,17 @@ struct request_queue *blk_alloc_queue(int node_id)
if (blkcg_init_queue(q))
goto fail_ref;
+ if (wbt_alloc(q))
+ goto fail_blkcg;
+
blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_MAX_RQ;
return q;
+fail_blkcg:
+ blkcg_exit_queue(q);
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
fail_bdi:
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 2a75bc7401df..c1790f313479 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -948,6 +948,9 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
struct request_queue *q = rqos->q;
const char *dir_name = rq_qos_id_to_name(rqos->id);
+ if (!q->debugfs_dir)
+ return;
+
if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
return;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f89e2fc3963b..7fd4ffadcdfa 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -481,7 +481,6 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
size_t count)
{
- struct rq_qos *rqos;
ssize_t ret;
s64 val;
@@ -491,13 +490,6 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
if (val < -1)
return -EINVAL;
- rqos = wbt_rq_qos(q);
- if (!rqos) {
- ret = wbt_init(q);
- if (ret)
- return ret;
- }
-
if (val == -1)
val = wbt_default_latency_nsec(q);
else if (val >= 0)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index efff1232446f..5da48294e3e9 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -421,16 +421,14 @@ static void wbt_update_limits(struct rq_wb *rwb)
u64 wbt_get_min_lat(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
- if (!rqos)
- return 0;
+
return RQWB(rqos)->min_lat_nsec;
}
void wbt_set_min_lat(struct request_queue *q, u64 val)
{
struct rq_qos *rqos = wbt_rq_qos(q);
- if (!rqos)
- return;
+
RQWB(rqos)->min_lat_nsec = val;
RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
wbt_update_limits(RQWB(rqos));
@@ -637,7 +635,7 @@ void wbt_enable_default(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
/* Throttling already enabled? */
- if (rqos)
+ if (rwb_enabled(RQWB(rqos)))
return;
/* Queue not registered? Maybe shutting down... */
@@ -809,7 +807,7 @@ static struct rq_qos_ops wbt_rqos_ops = {
#endif
};
-static int wbt_alloc(struct request_queue *q)
+int wbt_alloc(struct request_queue *q)
{
struct rq_wb *rwb;
int i;
@@ -844,20 +842,13 @@ static int wbt_alloc(struct request_queue *q)
return 0;
}
-int wbt_init(struct request_queue *q)
+void wbt_init(struct request_queue *q)
{
- int ret = wbt_alloc(q);
- struct rq_wb *rwb;
-
- if (ret)
- return ret;
+ struct rq_wb *rwb = RQWB(wbt_rq_qos(q));
- rwb = RQWB(wbt_rq_qos(q));
rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
-
- return 0;
}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 16bdc85b8df9..de07963f3544 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -87,7 +87,8 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
#ifdef CONFIG_BLK_WBT
-int wbt_init(struct request_queue *);
+int wbt_alloc(struct request_queue *q);
+void wbt_init(struct request_queue *);
void wbt_disable_default(struct request_queue *);
void wbt_enable_default(struct request_queue *);
@@ -103,9 +104,12 @@ u64 wbt_default_latency_nsec(struct request_queue *);
static inline void wbt_track(struct request *rq, enum wbt_flags flags)
{
}
-static inline int wbt_init(struct request_queue *q)
+static inline int wbt_alloc(struct request_queue *q)
+{
+ return 0;
+}
+static inline void wbt_init(struct request_queue *q)
{
- return -EINVAL;
}
static inline void wbt_disable_default(struct request_queue *q)
{
--
2.29.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/4] block: move wbt allocation into blk_alloc_queue
2021-05-25 8:04 ` [PATCH 2/4] block: move wbt allocation into blk_alloc_queue Ming Lei
@ 2021-06-04 0:44 ` Bart Van Assche
2021-06-04 1:22 ` Ming Lei
0 siblings, 1 reply; 9+ messages in thread
From: Bart Van Assche @ 2021-06-04 0:44 UTC (permalink / raw)
To: Ming Lei, Jens Axboe, Christoph Hellwig; +Cc: linux-block, Yi Zhang
On 5/25/21 1:04 AM, Ming Lei wrote:
> wbt_init() calls wbt_alloc() which adds allocated wbt instance into
> q->rq_qos. This way is very dangerous because q->rq_qos is accessed in
> IO fast path.
>
> So far wbt_init() is called in the two code paths:
>
> 1) blk_register_queue(), when the block device has been exposed to
> usespace, so IO may come when adding wbt into q->rq_qos
>
> 2) sysfs attribute store, in which normal IO is definitely allowed
>
> Move wbt allocation into blk_alloc_queue() for avoiding to add wbt
> instance dynamically to q->rq_qos. And before calling wbt_init(), the
> wbt is disabled, so functionally it works as expected.
I don't like this change since it is not generic - it only helps the WBT
implementation.
All rq-qos policies call rq_qos_add() and all these policies take effect
before rq_qos_add() returns. Does the q->rq_qos list perhaps have to be
protected with RCU? Would that be sufficient to fix the crashes reported
in the cover letter?
Thanks,
Bart.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/4] block: move wbt allocation into blk_alloc_queue
2021-06-04 0:44 ` Bart Van Assche
@ 2021-06-04 1:22 ` Ming Lei
0 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-06-04 1:22 UTC (permalink / raw)
To: Bart Van Assche; +Cc: Jens Axboe, Christoph Hellwig, linux-block, Yi Zhang
On Thu, Jun 03, 2021 at 05:44:00PM -0700, Bart Van Assche wrote:
> On 5/25/21 1:04 AM, Ming Lei wrote:
> > wbt_init() calls wbt_alloc() which adds allocated wbt instance into
> > q->rq_qos. This way is very dangerous because q->rq_qos is accessed in
> > IO fast path.
> >
> > So far wbt_init() is called in the two code paths:
> >
> > 1) blk_register_queue(), when the block device has been exposed to
> > usespace, so IO may come when adding wbt into q->rq_qos
> >
> > 2) sysfs attribute store, in which normal IO is definitely allowed
> >
> > Move wbt allocation into blk_alloc_queue() for avoiding to add wbt
> > instance dynamically to q->rq_qos. And before calling wbt_init(), the
> > wbt is disabled, so functionally it works as expected.
>
> I don't like this change since it is not generic - it only helps the WBT
> implementation.
OK, actually except for wbt, the only one left is iocost which adds
rq_qos via cgroup attribute.
>
> All rq-qos policies call rq_qos_add() and all these policies take effect
> before rq_qos_add() returns. Does the q->rq_qos list perhaps have to be
> protected with RCU? Would that be sufficient to fix the crashes reported
> in the cover letter?
Freezing the queue should be an easier way to provide that protection, and I will
try that approach.
Thanks,
Ming
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 3/4] block: reuse wbt_set_min_lat for setting wbt->min_lat_nsec
2021-05-25 8:04 [PATCH 0/4] block: fix race between adding wbt and normal IO Ming Lei
2021-05-25 8:04 ` [PATCH 1/4] block: split wbt_init() into two parts Ming Lei
2021-05-25 8:04 ` [PATCH 2/4] block: move wbt allocation into blk_alloc_queue Ming Lei
@ 2021-05-25 8:04 ` Ming Lei
2021-05-25 8:04 ` [PATCH 4/4] block: mark queue init done at the end of blk_register_queue Ming Lei
2021-06-04 0:03 ` [PATCH 0/4] block: fix race between adding wbt and normal IO Ming Lei
4 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-05-25 8:04 UTC (permalink / raw)
To: Jens Axboe, Christoph Hellwig; +Cc: linux-block, Ming Lei, Yi Zhang
We need to freeze queue when setting wbt->min_lat_nsec because
wbt enabled state can be changed, so reuse wbt_set_min_lat() for doing
that for users of wbt_init().
Meantime move wbt_enable_default() out of q->sysfs_lock, since we
don't need that lock for enabling wbt.
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-sysfs.c | 16 +++-------------
block/blk-wbt.c | 29 ++++++++++++++++++++++-------
block/blk-wbt.h | 4 ++--
3 files changed, 27 insertions(+), 22 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7fd4ffadcdfa..925043f926c5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -498,18 +498,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
if (wbt_get_min_lat(q) == val)
return count;
- /*
- * Ensure that the queue is idled, in case the latency update
- * ends up either enabling or disabling wbt completely. We can't
- * have IO inflight if that happens.
- */
- blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
-
- wbt_set_min_lat(q, val);
-
- blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ wbt_set_min_lat(q, val, true);
return count;
}
@@ -918,7 +907,6 @@ int blk_register_queue(struct gendisk *disk)
}
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
- wbt_enable_default(q);
blk_throtl_register_queue(q);
/* Now everything is ready and send out KOBJ_ADD uevent */
@@ -927,6 +915,8 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
+ wbt_enable_default(q);
+
ret = 0;
unlock:
mutex_unlock(&q->sysfs_dir_lock);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5da48294e3e9..5d2db197cce6 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
+#include <linux/blk-mq.h>
#include "blk-wbt.h"
#include "blk-rq-qos.h"
@@ -425,13 +426,27 @@ u64 wbt_get_min_lat(struct request_queue *q)
return RQWB(rqos)->min_lat_nsec;
}
-void wbt_set_min_lat(struct request_queue *q, u64 val)
+void wbt_set_min_lat(struct request_queue *q, u64 val, bool manual)
{
struct rq_qos *rqos = wbt_rq_qos(q);
+ /*
+ * Ensure that the queue is idled, in case the latency update
+ * ends up either enabling or disabling wbt completely. We can't
+ * have IO inflight if that happens.
+ */
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
RQWB(rqos)->min_lat_nsec = val;
- RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ if (manual)
+ RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ else
+ RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
wbt_update_limits(RQWB(rqos));
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
}
@@ -844,11 +859,11 @@ int wbt_alloc(struct request_queue *q)
void wbt_init(struct request_queue *q)
{
- struct rq_wb *rwb = RQWB(wbt_rq_qos(q));
-
- rwb->enable_state = WBT_STATE_ON_DEFAULT;
- rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_wb *rwb = RQWB(rqos);
- wbt_queue_depth_changed(&rwb->rqos);
+ rwb->rq_depth.queue_depth = blk_queue_depth(rqos->q);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+
+ wbt_set_min_lat(q, wbt_default_latency_nsec(q), false);
}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index de07963f3544..d9b643a55407 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -93,7 +93,7 @@ void wbt_disable_default(struct request_queue *);
void wbt_enable_default(struct request_queue *);
u64 wbt_get_min_lat(struct request_queue *q);
-void wbt_set_min_lat(struct request_queue *q, u64 val);
+void wbt_set_min_lat(struct request_queue *q, u64 val, bool manual);
void wbt_set_write_cache(struct request_queue *, bool);
@@ -124,7 +124,7 @@ static inline u64 wbt_get_min_lat(struct request_queue *q)
{
return 0;
}
-static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
+static inline void wbt_set_min_lat(struct request_queue *q, u64 val, bool manual)
{
}
static inline u64 wbt_default_latency_nsec(struct request_queue *q)
--
2.29.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 4/4] block: mark queue init done at the end of blk_register_queue
2021-05-25 8:04 [PATCH 0/4] block: fix race between adding wbt and normal IO Ming Lei
` (2 preceding siblings ...)
2021-05-25 8:04 ` [PATCH 3/4] block: reuse wbt_set_min_lat for setting wbt->min_lat_nsec Ming Lei
@ 2021-05-25 8:04 ` Ming Lei
2021-06-04 0:03 ` [PATCH 0/4] block: fix race between adding wbt and normal IO Ming Lei
4 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-05-25 8:04 UTC (permalink / raw)
To: Jens Axboe, Christoph Hellwig; +Cc: linux-block, Ming Lei, Yi Zhang
Mark queue init done when everything is done well in blk_register_queue(),
so that wbt_enable_default() can be run quickly without any RCU period
involved.
Also there is no side effect from delaying marking queue init done.
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-sysfs.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 925043f926c5..aef7bfd8e600 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -847,20 +847,6 @@ int blk_register_queue(struct gendisk *disk)
"%s is registering an already registered queue\n",
kobject_name(&dev->kobj));
- /*
- * SCSI probing may synchronously create and destroy a lot of
- * request_queues for non-existent devices. Shutting down a fully
- * functional queue takes measureable wallclock time as RCU grace
- * periods are involved. To avoid excessive latency in these
- * cases, a request_queue starts out in a degraded mode which is
- * faster to shut down and is made fully functional here as
- * request_queues for non-existent devices never get registered.
- */
- if (!blk_queue_init_done(q)) {
- blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
- percpu_ref_switch_to_percpu(&q->q_usage_counter);
- }
-
blk_queue_update_readahead(q);
ret = blk_trace_init_sysfs(dev);
@@ -920,6 +906,21 @@ int blk_register_queue(struct gendisk *disk)
ret = 0;
unlock:
mutex_unlock(&q->sysfs_dir_lock);
+
+ /*
+ * SCSI probing may synchronously create and destroy a lot of
+ * request_queues for non-existent devices. Shutting down a fully
+ * functional queue takes measureable wallclock time as RCU grace
+ * periods are involved. To avoid excessive latency in these
+ * cases, a request_queue starts out in a degraded mode which is
+ * faster to shut down and is made fully functional here as
+ * request_queues for non-existent devices never get registered.
+ */
+ if (!blk_queue_init_done(q)) {
+ blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
+ percpu_ref_switch_to_percpu(&q->q_usage_counter);
+ }
+
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
--
2.29.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 0/4] block: fix race between adding wbt and normal IO
2021-05-25 8:04 [PATCH 0/4] block: fix race between adding wbt and normal IO Ming Lei
` (3 preceding siblings ...)
2021-05-25 8:04 ` [PATCH 4/4] block: mark queue init done at the end of blk_register_queue Ming Lei
@ 2021-06-04 0:03 ` Ming Lei
4 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-06-04 0:03 UTC (permalink / raw)
To: Jens Axboe, Christoph Hellwig; +Cc: linux-block, Yi Zhang
On Tue, May 25, 2021 at 04:04:38PM +0800, Ming Lei wrote:
> Hello,
>
> Yi reported several kernel panics on:
>
> [16687.001777] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008
> ...
> [16687.163549] pc : __rq_qos_track+0x38/0x60
>
> or
>
> [ 997.690455] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
> ...
> [ 997.850347] pc : __rq_qos_done+0x2c/0x50
>
> Turns out it is caused by race between adding wbt and normal IO.
>
> Fix the issue by moving wbt allocation/addition into blk_alloc_queue().
Hello Guys,
Ping...
Thanks,
Ming
^ permalink raw reply [flat|nested] 9+ messages in thread