* [PATCH] bdi: fix use-after-free for bdi device
@ 2020-02-11 14:00 Yufen Yu
2020-02-11 13:57 ` Yufen Yu
0 siblings, 1 reply; 14+ messages in thread
From: Yufen Yu @ 2020-02-11 14:00 UTC (permalink / raw)
To: axboe, linux-block; +Cc: jack, bvanassche, tj
We reported a kernel crash:
[201962.639350] Call trace:
[201962.644403] string+0x28/0xa0
[201962.650501] vsnprintf+0x5f0/0x748
[201962.657472] seq_vprintf+0x70/0x98
[201962.664442] seq_printf+0x7c/0xa0
[201962.671238] __blkg_prfill_rwstat+0x84/0x128
[201962.679949] blkg_prfill_rwstat_field+0x94/0xc0
[201962.689182] blkcg_print_blkgs+0xcc/0x140
[201962.697370] blkg_print_stat_bytes+0x4c/0x60
[201962.706083] cgroup_seqfile_show+0x58/0xc0
[201962.714446] kernfs_seq_show+0x44/0x50
[201962.722112] seq_read+0xd4/0x4a8
[201962.728732] kernfs_fop_read+0x16c/0x218
[201962.736748] __vfs_read+0x60/0x188
[201962.743717] vfs_read+0x94/0x150
[201962.750338] ksys_read+0x6c/0xd8
[201962.756958] __arm64_sys_read+0x24/0x30
[201962.764800] el0_svc_common+0x78/0x130
[201962.772466] el0_svc_handler+0x38/0x78
[201962.780131] el0_svc+0x8/0xc
__blkg_prfill_rwstat() tries to get the device name by
'bdi->dev', while the 'dev' has been freed by bdi_unregister().
Then, blkg_dev_name() will return an invalid name pointer,
resulting in crash on string(). The race as following:
CPU1 CPU2
blkg_print_stat_bytes
__scsi_remove_device
del_gendisk
bdi_unregister
put_device(bdi->dev)
kfree(bdi->dev)
__blkg_prfill_rwstat
blkg_dev_name
//use the freed bdi->dev
dev_name(blkg->q->backing_dev_info->dev)
bdi->dev = NULL
To fix the bug, we add a new spinlock for bdi to protect
the device. Then, blk_dev_name() returns valid name or
'NULL', both of them will be okay.
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
---
block/blk-cgroup-rwstat.c | 3 ++-
block/blk-cgroup.c | 26 +++++++++++++++++++-------
block/blk-iocost.c | 9 ++++++---
block/blk-iolatency.c | 3 ++-
block/blk-throttle.c | 3 ++-
include/linux/backing-dev-defs.h | 1 +
include/linux/blk-cgroup.h | 2 +-
mm/backing-dev.c | 8 ++++++--
8 files changed, 39 insertions(+), 16 deletions(-)
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
index 85d5790ac49b..20ef51cb3ea4 100644
--- a/block/blk-cgroup-rwstat.c
+++ b/block/blk-cgroup-rwstat.c
@@ -49,7 +49,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
[BLKG_RWSTAT_ASYNC] = "Async",
[BLKG_RWSTAT_DISCARD] = "Discard",
};
- const char *dname = blkg_dev_name(pd->blkg);
+ char dev_name[32];
+ char *dname = blkg_dev_name(pd->blkg, dev_name);
u64 v;
int i;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a229b94d5390..41bf7513c249 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -492,12 +492,22 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
return 0;
}
-const char *blkg_dev_name(struct blkcg_gq *blkg)
+char *blkg_dev_name(struct blkcg_gq *blkg, char *dname)
{
/* some drivers (floppy) instantiate a queue w/o disk registered */
- if (blkg->q->backing_dev_info->dev)
- return dev_name(blkg->q->backing_dev_info->dev);
- return NULL;
+ struct backing_dev_info *bdi = blkg->q->backing_dev_info;
+
+ /*
+ * We use spinlock to protect bdi->dev, avoiding
+ * the device been freed by bdi_unregister().
+ */
+ spin_lock_irq(&bdi->lock);
+ if (bdi->dev)
+ strlcpy(dname, dev_name(bdi->dev), 32);
+ else
+ dname = NULL;
+ spin_unlock_irq(&bdi->lock);
+ return dname;
}
/**
@@ -551,7 +561,8 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
*/
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
- const char *dname = blkg_dev_name(pd->blkg);
+ char dev_name[32];
+ char *dname = blkg_dev_name(pd->blkg, dev_name);
if (!dname)
return 0;
@@ -749,7 +760,8 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
struct blkg_iostat_set *bis = &blkg->iostat;
- const char *dname;
+ char dev_name[32];
+ char *dname;
char *buf;
u64 rbytes, wbytes, rios, wios, dbytes, dios;
size_t size = seq_get_buf(sf, &buf), off = 0;
@@ -762,7 +774,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
if (!blkg->online)
goto skip;
- dname = blkg_dev_name(blkg);
+ dname = blkg_dev_name(blkg, dev_name);
if (!dname)
goto skip;
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 27ca68621137..d38d2f81343e 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2047,7 +2047,8 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
- const char *dname = blkg_dev_name(pd->blkg);
+ char dev_name[32];
+ char *dname = blkg_dev_name(pd->blkg, dev_name);
struct ioc_gq *iocg = pd_to_iocg(pd);
if (dname && iocg->cfg_weight)
@@ -2133,7 +2134,8 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
- const char *dname = blkg_dev_name(pd->blkg);
+ char dev_name[32];
+ const char *dname = blkg_dev_name(pd->blkg, dev_name);
struct ioc *ioc = pd_to_iocg(pd)->ioc;
if (!dname)
@@ -2304,7 +2306,8 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
static u64 ioc_cost_model_prfill(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- const char *dname = blkg_dev_name(pd->blkg);
+ char dev_name[32];
+ char *dname = blkg_dev_name(pd->blkg, dev_name);
struct ioc *ioc = pd_to_iocg(pd)->ioc;
u64 *u = ioc->params.i_lcoefs;
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index c128d50cb410..71bcbd757889 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -868,7 +868,8 @@ static u64 iolatency_prfill_limit(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
- const char *dname = blkg_dev_name(pd->blkg);
+ char dev_name[32];
+ char *dname = blkg_dev_name(pd->blkg, dev_name);
if (!dname || !iolat->min_lat_nsec)
return 0;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 98233c9c65a8..83b0b43fe4a3 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1560,7 +1560,8 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
struct throtl_grp *tg = pd_to_tg(pd);
- const char *dname = blkg_dev_name(pd->blkg);
+ char dev_name[32];
+ char *dname = blkg_dev_name(pd->blkg, dev_name);
char bufs[4][21] = { "max", "max", "max", "max" };
u64 bps_dft;
unsigned int iops_dft;
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 4fc87dee005a..c98dac4a7953 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -227,6 +227,7 @@ struct backing_dev_info {
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
#endif
+ spinlock_t lock; /* protects dev */
};
enum {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e4a6949fd171..6034ef9cf07a 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -198,7 +198,7 @@ int blkcg_activate_policy(struct request_queue *q,
void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
-const char *blkg_dev_name(struct blkcg_gq *blkg);
+char *blkg_dev_name(struct blkcg_gq *blkg, char *dev_name);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
u64 (*prfill)(struct seq_file *,
struct blkg_policy_data *, int),
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 62f05f605fb5..a5aa00ec2d8a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -859,6 +859,7 @@ static int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ spin_lock_init(&bdi->lock);
ret = cgwb_bdi_init(bdi);
@@ -1007,15 +1008,18 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
+ struct device *dev = bdi->dev;
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
cgwb_bdi_unregister(bdi);
- if (bdi->dev) {
+ if (dev) {
bdi_debug_unregister(bdi);
- device_unregister(bdi->dev);
+ spin_lock(&bdi->lock);
bdi->dev = NULL;
+ spin_unlock(&bdi->lock);
+ device_unregister(dev);
}
if (bdi->owner) {
--
2.16.2.dirty
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-11 14:00 [PATCH] bdi: fix use-after-free for bdi device Yufen Yu
@ 2020-02-11 13:57 ` Yufen Yu
2020-02-12 21:33 ` Tejun Heo
0 siblings, 1 reply; 14+ messages in thread
From: Yufen Yu @ 2020-02-11 13:57 UTC (permalink / raw)
To: axboe, linux-block; +Cc: jack, bvanassche, tj
> __blkg_prfill_rwstat() tries to get the device name by
> 'bdi->dev', while the 'dev' has been freed by bdi_unregister().
> Then, blkg_dev_name() will return an invalid name pointer,
> resulting in crash on string(). The race as following:
>
> CPU1 CPU2
> blkg_print_stat_bytes
> __scsi_remove_device
> del_gendisk
> bdi_unregister
>
> put_device(bdi->dev)
> kfree(bdi->dev)
> __blkg_prfill_rwstat
> blkg_dev_name
> //use the freed bdi->dev
> dev_name(blkg->q->backing_dev_info->dev)
> bdi->dev = NULL
>
There is another simple way to fix the problem.
Since blkg_dev_name() have been coverd by rcu_read_lock/unlock(),
we can wait all rcu reader to finish before free 'bdi->dev' to avoid use-after-free.
But I am not sure if this solution will introduce new problems.
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 62f05f605fb5..6f322473ca4d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1007,15 +1007,19 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
+ struct device *dev = bdi->dev;
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
cgwb_bdi_unregister(bdi);
- if (bdi->dev) {
+ if (dev) {
bdi_debug_unregister(bdi);
- device_unregister(bdi->dev);
bdi->dev = NULL;
+ device_del(dev);
+ /* wait all rcu reader of bdi->dev before free dev */
+ synchronize_rcu();
+ put_device(dev);
}
if (bdi->owner) {
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-11 13:57 ` Yufen Yu
@ 2020-02-12 21:33 ` Tejun Heo
2020-02-13 2:46 ` Yufen Yu
0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2020-02-12 21:33 UTC (permalink / raw)
To: Yufen Yu; +Cc: axboe, linux-block, jack, bvanassche
Hello, Yufen.
On Tue, Feb 11, 2020 at 09:57:44PM +0800, Yufen Yu wrote:
> There is another simple way to fix the problem.
>
> Since blkg_dev_name() have been coverd by rcu_read_lock/unlock(),
> we can wait all rcu reader to finish before free 'bdi->dev' to avoid use-after-free.
>
> But I am not sure if this solution will introduce new problems.
So, I don't see why bdi->dev should be freed before bdi itself does.
Would something like the following work?
bdi_unregister()
{
...
if (bdi->dev) {
...
device_get(bdi->dev); // to be put on release
device_unregister(bdi->dev);
}
...
}
release_bdi()
{
...
if (bdi->dev) {
// warn if dev is still registered
device_put(bdi->dev);
}
...
}
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-12 21:33 ` Tejun Heo
@ 2020-02-13 2:46 ` Yufen Yu
2020-02-13 3:48 ` Tejun Heo
0 siblings, 1 reply; 14+ messages in thread
From: Yufen Yu @ 2020-02-13 2:46 UTC (permalink / raw)
To: Tejun Heo; +Cc: axboe, linux-block, jack, bvanassche
Hi,
On 2020/2/13 5:33, Tejun Heo wrote:
> So, I don't see why bdi->dev should be freed before bdi itself does.
> Would something like the following work?
>
> bdi_unregister()
> {
> ...
> if (bdi->dev) {
> ...
> device_get(bdi->dev); // to be put on release
> device_unregister(bdi->dev);
> }
> ...
> }
>
> release_bdi()
> {
> ...
> if (bdi->dev) {
> // warn if dev is still registered
> device_put(bdi->dev);
> }
> ...
> }
For each time of register, bdi_register() will try to create a new 'dev'.
bdi_register
bdi_register_va
if (bdi->dev) // if bdi->dev is not NULL, return directly
return 0;
dev = device_create_vargs()...
So, I think freeing bdi->dev until bdi itself does may be a problem
for drivers that supported re-registration bdi, such as:
commit b6f8fec4448aa52a8c36a392aa1ca2ea99acd460
Author: Jan Kara <jack@suse.cz>
Date: Wed Mar 8 17:48:31 2017 +0100
block: Allow bdi re-registration
SCSI can call device_add_disk() several times for one request queue when
a device in unbound and bound, creating new gendisk each time. This will
lead to bdi being repeatedly registered and unregistered.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-13 2:46 ` Yufen Yu
@ 2020-02-13 3:48 ` Tejun Heo
2020-02-13 7:51 ` Yufen Yu
0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2020-02-13 3:48 UTC (permalink / raw)
To: Yufen Yu; +Cc: axboe, linux-block, jack, bvanassche
Hello,
On Thu, Feb 13, 2020 at 10:46:34AM +0800, Yufen Yu wrote:
> For each time of register, bdi_register() will try to create a new 'dev'.
>
> bdi_register
> bdi_register_va
> if (bdi->dev) // if bdi->dev is not NULL, return directly
> return 0;
> dev = device_create_vargs()...
>
> So, I think freeing bdi->dev until bdi itself does may be a problem
> for drivers that supported re-registration bdi, such as:
Ugh, thanks for noticing that. I guess the right thing to do is then
going full RCU. What do you think about expanding your previous patch
so that ->dev has __rcu annotation, users use the RCU accessors and
the device is destroyed asynchronously through call_rcu()?
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-13 3:48 ` Tejun Heo
@ 2020-02-13 7:51 ` Yufen Yu
2020-02-13 13:58 ` Tejun Heo
0 siblings, 1 reply; 14+ messages in thread
From: Yufen Yu @ 2020-02-13 7:51 UTC (permalink / raw)
To: Tejun Heo; +Cc: axboe, linux-block, jack, bvanassche
Hi,
On 2020/2/13 11:48, Tejun Heo wrote:
> Hello,
>
> On Thu, Feb 13, 2020 at 10:46:34AM +0800, Yufen Yu wrote:
>> For each time of register, bdi_register() will try to create a new 'dev'.
>>
>> bdi_register
>> bdi_register_va
>> if (bdi->dev) // if bdi->dev is not NULL, return directly
>> return 0;
>> dev = device_create_vargs()...
>>
>> So, I think freeing bdi->dev until bdi itself does may be a problem
>> for drivers that supported re-registration bdi, such as:
>
> Ugh, thanks for noticing that. I guess the right thing to do is then
> going full RCU. What do you think about expanding your previous patch
> so that ->dev has __rcu annotation, users use the RCU accessors and
> the device is destroyed asynchronously through call_rcu()?
>
If we destroy the device asynchronously by call_rcu(), we may need to
add a new member 'rcu_head' into struct backing_dev_info. Right?
The code may be like:
bdi_unregister()
{
...
if (bdi->dev) {
...
device_get(bdi->dev);
device_unregister(bdi->dev);
bdi->dev = NULL; //XXX
bdi_get(bdi); //avoiding bdi to be freed before calling bdi_release_device
call_rcu(&bdi->rcu_head, bdi_release_device);
}
...
}
bdi_release_device()
{
...
put_device(bdi->dev);//XXX
bdi_put(bdi);
}
But, the problem is how do we get 'bdi->dev' in bdi_release_device().
If we do not set bdi->dev as 'NULL', re-registration bdi may cannot work well.
If I get it wrong, please point it out.
Thanks
Yufen
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-13 7:51 ` Yufen Yu
@ 2020-02-13 13:58 ` Tejun Heo
2020-02-14 2:50 ` Yufen Yu
0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2020-02-13 13:58 UTC (permalink / raw)
To: Yufen Yu; +Cc: axboe, linux-block, jack, bvanassche
Hello,
On Thu, Feb 13, 2020 at 03:51:40PM +0800, Yufen Yu wrote:
> If we destroy the device asynchronously by call_rcu(), we may need to
> add a new member 'rcu_head' into struct backing_dev_info. Right?
Yes.
> The code may be like:
>
> bdi_unregister()
> {
> ...
> if (bdi->dev) {
> ...
> device_get(bdi->dev);
> device_unregister(bdi->dev);
> bdi->dev = NULL; //XXX
> bdi_get(bdi); //avoiding bdi to be freed before calling bdi_release_device
> call_rcu(&bdi->rcu_head, bdi_release_device);
> }
> ...
> }
>
> bdi_release_device()
> {
> ...
> put_device(bdi->dev);//XXX
> bdi_put(bdi);
> }
>
> But, the problem is how do we get 'bdi->dev' in bdi_release_device().
> If we do not set bdi->dev as 'NULL', re-registration bdi may cannot work well.
So, unregistering can leave ->dev along and re-registering can test
whether it's NULL and if not put the existing one and put a new one
there. Wouldn't that work?
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-13 13:58 ` Tejun Heo
@ 2020-02-14 2:50 ` Yufen Yu
2020-02-14 14:05 ` Tejun Heo
0 siblings, 1 reply; 14+ messages in thread
From: Yufen Yu @ 2020-02-14 2:50 UTC (permalink / raw)
To: Tejun Heo; +Cc: axboe, linux-block, jack, bvanassche
On 2020/2/13 21:58, Tejun Heo wrote:
> Hello,
>
> On Thu, Feb 13, 2020 at 03:51:40PM +0800, Yufen Yu wrote:
>> If we destroy the device asynchronously by call_rcu(), we may need to
>> add a new member 'rcu_head' into struct backing_dev_info. Right?
>
> Yes.
>
>> The code may be like:
>>
>> bdi_unregister()
>> {
>> ...
>> if (bdi->dev) {
>> ...
>> device_get(bdi->dev);
>> device_unregister(bdi->dev);
>> bdi->dev = NULL; //XXX
>> bdi_get(bdi); //avoiding bdi to be freed before calling bdi_release_device
>> call_rcu(&bdi->rcu_head, bdi_release_device);
>> }
>> ...
>> }
>>
>> bdi_release_device()
>> {
>> ...
>> put_device(bdi->dev);//XXX
>> bdi_put(bdi);
>> }
>>
>> But, the problem is how do we get 'bdi->dev' in bdi_release_device().
>> If we do not set bdi->dev as 'NULL', re-registration bdi may cannot work well.
>
> So, unregistering can leave ->dev along and re-registering can test
> whether it's NULL and if not put the existing one and put a new one
> there. Wouldn't that work?
Do you mean set bdi->dev as 'NULL' in call_rcu() callback function
(i.e. bdi_release_device()) and test 'bdi->dev' in bdi_register_va()?
I think that may do not work.
We cannot make sure the order of rcu callback function and re-registering.
Then bdi_release_device() may put the new allocated device by re-registering.
Thanks,
Yufen
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-14 2:50 ` Yufen Yu
@ 2020-02-14 14:05 ` Tejun Heo
2020-02-15 13:54 ` Yufen Yu
0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2020-02-14 14:05 UTC (permalink / raw)
To: Yufen Yu; +Cc: axboe, linux-block, jack, bvanassche
Hello,
On Fri, Feb 14, 2020 at 10:50:01AM +0800, Yufen Yu wrote:
> > So, unregistering can leave ->dev along and re-registering can test
> > whether it's NULL and if not put the existing one and put a new one
> > there. Wouldn't that work?
>
> Do you mean set bdi->dev as 'NULL' in call_rcu() callback function
> (i.e. bdi_release_device()) and test 'bdi->dev' in bdi_register_va()?
>
> I think that may do not work.
> We cannot make sure the order of rcu callback function and re-registering.
> Then bdi_release_device() may put the new allocated device by re-registering.
No, I meant not freeing bdi->dev on deregistration and only doing so
when it actually needs to - on re-registration or release. So, sth
like the following.
* Unregister: Unregister bdi->dev but don't free it. Leave the pointer
alone.
* Re-register: If bdi->dev is not null, initiate RCU-free and update
bdi->dev to the new dev.
* Release: If bdi->dev is not NULL, initiate RCU-free of it.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-14 14:05 ` Tejun Heo
@ 2020-02-15 13:54 ` Yufen Yu
2020-02-19 12:55 ` Jan Kara
0 siblings, 1 reply; 14+ messages in thread
From: Yufen Yu @ 2020-02-15 13:54 UTC (permalink / raw)
To: Tejun Heo; +Cc: axboe, linux-block, jack, bvanassche
Hi, tejun
On 2020/2/14 22:05, Tejun Heo wrote:
> Hello,
>
> On Fri, Feb 14, 2020 at 10:50:01AM +0800, Yufen Yu wrote:
>>> So, unregistering can leave ->dev along and re-registering can test
>>> whether it's NULL and if not put the existing one and put a new one
>>> there. Wouldn't that work?
>>
>> Do you mean set bdi->dev as 'NULL' in call_rcu() callback function
>> (i.e. bdi_release_device()) and test 'bdi->dev' in bdi_register_va()?
>>
>> I think that may do not work.
>> We cannot make sure the order of rcu callback function and re-registering.
>> Then bdi_release_device() may put the new allocated device by re-registering.
>
> No, I meant not freeing bdi->dev on deregistration and only doing so
> when it actually needs to - on re-registration or release. So, sth
> like the following.
>
> * Unregister: Unregister bdi->dev but don't free it. Leave the pointer
> alone.
>
> * Re-register: If bdi->dev is not null, initiate RCU-free and update
> bdi->dev to the new dev.
>
> * Release: If bdi->dev is not NULL, initiate RCU-free of it.
Okay, I think we can do that.
When do re-register, we need to update bdi->dev as 'NULL' or the new dev
before invoking call_rcu() to free '->dev'. So readers started after call_rcu()
cannot read the old dev, like:
bdi_register_va()
{
...
if (bdi->dev) {
//rcu_assgin_pointer(bdi->dev, new_dev);
rcu_assgin_pointer(bdi->dev, NULL);
call_rcu();//rcu callback function will free the old dev
}
...
}
After assigning new value for bdi->dev, rcu callback function cannot get
the old device. So I think we may need to replace the '->dev' with a new
struct pointer, in which includes '->dev' and 'rcu_head'. We pass the
struct.rcu_head pointer to call_rcu() and then it can get old dev address.
IMO, maybe we can maintain the original code logic, fix the problem like:
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 4fc87dee005a..e2de4a4e5392 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -185,6 +185,11 @@ struct bdi_writeback {
#endif
};
+struct bdi_rcu_device {
+ struct device *dev;
+ struct rcu_head rcu_head;
+};
+
struct backing_dev_info {
u64 id;
struct rb_node rb_node; /* keyed by ->id */
@@ -219,7 +224,7 @@ struct backing_dev_info {
#endif
wait_queue_head_t wb_waitq;
- struct device *dev;
+ struct bdi_rcu_device *rcu_dev;
struct device *owner;
struct timer_list laptop_mode_wb_timer;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 62f05f605fb5..05f07ce19091 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -850,7 +850,7 @@ static int bdi_init(struct backing_dev_info *bdi)
{
int ret;
- bdi->dev = NULL;
+ bdi->rcu_dev = NULL;
kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
@@ -932,20 +932,28 @@ struct backing_dev_info *bdi_get_by_id(u64 id)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
- struct device *dev;
struct rb_node *parent, **p;
+ struct bdi_rcu_device *rcu_dev;
- if (bdi->dev) /* The driver needs to use separate queues per device */
+ /* The driver needs to use separate queues per device */
+ if (bdi->rcu_dev)
return 0;
- dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
- if (IS_ERR(dev))
- return PTR_ERR(dev);
+ rcu_dev = kzalloc(sizeof(struct bdi_rcu_device), GFP_KERNEL);
+ if (!rcu_dev)
+ return -ENOMEM;
+
+ rcu_dev->dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0),
+ bdi, fmt, args);
+ if (IS_ERR(rcu_dev->dev)) {
+ kfree(rcu_dev);
+ return PTR_ERR(rcu_dev->dev);
+ }
cgwb_bdi_register(bdi);
- bdi->dev = dev;
+ bdi->rcu_dev = rcu_dev;
- bdi_debug_register(bdi, dev_name(dev));
+ bdi_debug_register(bdi, dev_name(rcu_dev->dev));
set_bit(WB_registered, &bdi->wb.state);
spin_lock_bh(&bdi_lock);
@@ -1005,17 +1013,28 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
synchronize_rcu_expedited();
}
+static void bdi_put_device_rcu(struct rcu_head *rcu)
+{
+ struct bdi_rcu_device *rcu_dev = container_of(rcu,
+ struct bdi_rcu_device, rcu_head);
+ put_device(rcu_dev->dev);
+ kfree(rcu_dev);
+}
+
void bdi_unregister(struct backing_dev_info *bdi)
{
+ struct bdi_rcu_device *rcu_dev = bdi->rcu_dev;
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
cgwb_bdi_unregister(bdi);
- if (bdi->dev) {
+ if (rcu_dev) {
bdi_debug_unregister(bdi);
- device_unregister(bdi->dev);
- bdi->dev = NULL;
+ get_device(rcu_dev->dev);
+ device_unregister(rcu_dev->dev);
+ rcu_assign_pointer(bdi->rcu_dev, NULL);
+ call_rcu(&rcu_dev->rcu_head, bdi_put_device_rcu);
}
if (bdi->owner) {
@@ -1031,7 +1050,7 @@ static void release_bdi(struct kref *ref)
if (test_bit(WB_registered, &bdi->wb.state))
bdi_unregister(bdi);
- WARN_ON_ONCE(bdi->dev);
+ WARN_ON_ONCE(bdi->rcu_dev);
wb_exit(&bdi->wb);
cgwb_bdi_exit(bdi);
kfree(bdi);
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-15 13:54 ` Yufen Yu
@ 2020-02-19 12:55 ` Jan Kara
2020-02-19 15:12 ` Tejun Heo
2020-02-20 11:07 ` Yufen Yu
0 siblings, 2 replies; 14+ messages in thread
From: Jan Kara @ 2020-02-19 12:55 UTC (permalink / raw)
To: Yufen Yu; +Cc: Tejun Heo, axboe, linux-block, jack, bvanassche
Hi!
On Sat 15-02-20 21:54:08, Yufen Yu wrote:
> On 2020/2/14 22:05, Tejun Heo wrote:
> > Hello,
> >
> > On Fri, Feb 14, 2020 at 10:50:01AM +0800, Yufen Yu wrote:
> > > > So, unregistering can leave ->dev along and re-registering can test
> > > > whether it's NULL and if not put the existing one and put a new one
> > > > there. Wouldn't that work?
> > >
> > > Do you mean set bdi->dev as 'NULL' in call_rcu() callback function
> > > (i.e. bdi_release_device()) and test 'bdi->dev' in bdi_register_va()?
> > >
> > > I think that may do not work.
> > > We cannot make sure the order of rcu callback function and re-registering.
> > > Then bdi_release_device() may put the new allocated device by re-registering.
> >
> > No, I meant not freeing bdi->dev on deregistration and only doing so
> > when it actually needs to - on re-registration or release. So, sth
> > like the following.
> >
> > * Unregister: Unregister bdi->dev but don't free it. Leave the pointer
> > alone.
> >
> > * Re-register: If bdi->dev is not null, initiate RCU-free and update
> > bdi->dev to the new dev.
> >
> > * Release: If bdi->dev is not NULL, initiate RCU-free of it.
>
> Okay, I think we can do that.
>
> When do re-register, we need to update bdi->dev as 'NULL' or the new dev
> before invoking call_rcu() to free '->dev'. So readers started after call_rcu()
> cannot read the old dev, like:
>
> bdi_register_va()
> {
> ...
> if (bdi->dev) {
> //rcu_assgin_pointer(bdi->dev, new_dev);
> rcu_assgin_pointer(bdi->dev, NULL);
> call_rcu();//rcu callback function will free the old dev
> }
> ...
> }
>
> After assigning new value for bdi->dev, rcu callback function cannot get
> the old device. So I think we may need to replace the '->dev' with a new
> struct pointer, in which includes '->dev' and 'rcu_head'. We pass the
> struct.rcu_head pointer to call_rcu() and then it can get old dev address.
>
> IMO, maybe we can maintain the original code logic, fix the problem like:
I've now noticed there's commit 68f23b8906 "memcg: fix a crash in wb_workfn
when a device disappears" from end of January which tries to address the
issue you're looking into. Now AFAIU the code is till somewhat racy after
that commit so I wanted to mention this mostly so that you fixup also the
new bdi_dev_name() while you're fixing blkg_dev_name().
Also I was wondering about one thing: If we really care about bdi->dev only
for the name, won't we be much better off with just copying the name to
bdi->name on registration? Sure it would consume a bit of memory for the
name copy but I don't think we really care and things would be IMO *much*
simpler that way... Yufen, Tejun, what do you think?
Honza
>
> diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
> index 4fc87dee005a..e2de4a4e5392 100644
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -185,6 +185,11 @@ struct bdi_writeback {
> #endif
> };
>
> +struct bdi_rcu_device {
> + struct device *dev;
> + struct rcu_head rcu_head;
> +};
> +
> struct backing_dev_info {
> u64 id;
> struct rb_node rb_node; /* keyed by ->id */
> @@ -219,7 +224,7 @@ struct backing_dev_info {
> #endif
> wait_queue_head_t wb_waitq;
>
> - struct device *dev;
> + struct bdi_rcu_device *rcu_dev;
> struct device *owner;
>
> struct timer_list laptop_mode_wb_timer;
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 62f05f605fb5..05f07ce19091 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -850,7 +850,7 @@ static int bdi_init(struct backing_dev_info *bdi)
> {
> int ret;
>
> - bdi->dev = NULL;
> + bdi->rcu_dev = NULL;
>
> kref_init(&bdi->refcnt);
> bdi->min_ratio = 0;
> @@ -932,20 +932,28 @@ struct backing_dev_info *bdi_get_by_id(u64 id)
>
> int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
> {
> - struct device *dev;
> struct rb_node *parent, **p;
> + struct bdi_rcu_device *rcu_dev;
>
> - if (bdi->dev) /* The driver needs to use separate queues per device */
> + /* The driver needs to use separate queues per device */
> + if (bdi->rcu_dev)
> return 0;
>
> - dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
> - if (IS_ERR(dev))
> - return PTR_ERR(dev);
> + rcu_dev = kzalloc(sizeof(struct bdi_rcu_device), GFP_KERNEL);
> + if (!rcu_dev)
> + return -ENOMEM;
> +
> + rcu_dev->dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0),
> + bdi, fmt, args);
> + if (IS_ERR(rcu_dev->dev)) {
> + kfree(rcu_dev);
> + return PTR_ERR(rcu_dev->dev);
> + }
>
> cgwb_bdi_register(bdi);
> - bdi->dev = dev;
> + bdi->rcu_dev = rcu_dev;
>
> - bdi_debug_register(bdi, dev_name(dev));
> + bdi_debug_register(bdi, dev_name(rcu_dev->dev));
> set_bit(WB_registered, &bdi->wb.state);
>
> spin_lock_bh(&bdi_lock);
> @@ -1005,17 +1013,28 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
> synchronize_rcu_expedited();
> }
>
> +static void bdi_put_device_rcu(struct rcu_head *rcu)
> +{
> + struct bdi_rcu_device *rcu_dev = container_of(rcu,
> + struct bdi_rcu_device, rcu_head);
> + put_device(rcu_dev->dev);
> + kfree(rcu_dev);
> +}
> +
> void bdi_unregister(struct backing_dev_info *bdi)
> {
> + struct bdi_rcu_device *rcu_dev = bdi->rcu_dev;
> /* make sure nobody finds us on the bdi_list anymore */
> bdi_remove_from_list(bdi);
> wb_shutdown(&bdi->wb);
> cgwb_bdi_unregister(bdi);
>
> - if (bdi->dev) {
> + if (rcu_dev) {
> bdi_debug_unregister(bdi);
> - device_unregister(bdi->dev);
> - bdi->dev = NULL;
> + get_device(rcu_dev->dev);
> + device_unregister(rcu_dev->dev);
> + rcu_assign_pointer(bdi->rcu_dev, NULL);
> + call_rcu(&rcu_dev->rcu_head, bdi_put_device_rcu);
> }
>
> if (bdi->owner) {
> @@ -1031,7 +1050,7 @@ static void release_bdi(struct kref *ref)
>
> if (test_bit(WB_registered, &bdi->wb.state))
> bdi_unregister(bdi);
> - WARN_ON_ONCE(bdi->dev);
> + WARN_ON_ONCE(bdi->rcu_dev);
> wb_exit(&bdi->wb);
> cgwb_bdi_exit(bdi);
> kfree(bdi);
>
>
>
>
>
>
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-19 12:55 ` Jan Kara
@ 2020-02-19 15:12 ` Tejun Heo
2020-02-20 11:07 ` Yufen Yu
1 sibling, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2020-02-19 15:12 UTC (permalink / raw)
To: Jan Kara; +Cc: Yufen Yu, axboe, linux-block, bvanassche
Hello, Jan.
On Wed, Feb 19, 2020 at 01:55:05PM +0100, Jan Kara wrote:
> Also I was wondering about one thing: If we really care about bdi->dev only
> for the name, won't we be much better off with just copying the name to
> bdi->name on registration? Sure it would consume a bit of memory for the
> name copy but I don't think we really care and things would be IMO *much*
> simpler that way... Yufen, Tejun, what do you think?
Yeah, could be. So, object lifetimes in block layer have been kinda
janky mostly for historical reasons and in a lot of cases ppl apply
bandaids to work around immediate problems and at other times things
get restructured and properly fixed. Given how the objects are used
here, it'd be a typical case for RCU protecting bdi->dev and I wonder,
in the longer term, that'd be a better way to go than special-casing
name. That said, it's not a strong opinion.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-19 12:55 ` Jan Kara
2020-02-19 15:12 ` Tejun Heo
@ 2020-02-20 11:07 ` Yufen Yu
2020-02-20 12:07 ` Jan Kara
1 sibling, 1 reply; 14+ messages in thread
From: Yufen Yu @ 2020-02-20 11:07 UTC (permalink / raw)
To: Jan Kara; +Cc: Tejun Heo, axboe, linux-block, bvanassche
Hi,
On 2020/2/19 20:55, Jan Kara wrote:
> Hi!
>
> On Sat 15-02-20 21:54:08, Yufen Yu wrote:
>
> I've now noticed there's commit 68f23b8906 "memcg: fix a crash in wb_workfn
> when a device disappears" from end of January which tries to address the
> issue you're looking into. Now AFAIU the code is till somewhat racy after
> that commit so I wanted to mention this mostly so that you fixup also the
> new bdi_dev_name() while you're fixing blkg_dev_name().
>
> Also I was wondering about one thing: If we really care about bdi->dev only
> for the name, won't we be much better off with just copying the name to
> bdi->name on registration? Sure it would consume a bit of memory for the
> name copy but I don't think we really care and things would be IMO *much*
> simpler that way... Yufen, Tejun, what do you think?
>
I think copying the name to bdi->name is also need protected by lock.
Otherwise, the reader of bdi->name may read incorrect value when
re-registion have not copy the name completely. Right? So, I also think
using RCU to protect object lifetimes may be a better way.
Thanks,
Yufen
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] bdi: fix use-after-free for bdi device
2020-02-20 11:07 ` Yufen Yu
@ 2020-02-20 12:07 ` Jan Kara
0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2020-02-20 12:07 UTC (permalink / raw)
To: Yufen Yu; +Cc: Jan Kara, Tejun Heo, axboe, linux-block, bvanassche
On Thu 20-02-20 19:07:01, Yufen Yu wrote:
> Hi,
>
> On 2020/2/19 20:55, Jan Kara wrote:
> > Hi!
> >
> > On Sat 15-02-20 21:54:08, Yufen Yu wrote:
>
> >
> > I've now noticed there's commit 68f23b8906 "memcg: fix a crash in wb_workfn
> > when a device disappears" from end of January which tries to address the
> > issue you're looking into. Now AFAIU the code is till somewhat racy after
> > that commit so I wanted to mention this mostly so that you fixup also the
> > new bdi_dev_name() while you're fixing blkg_dev_name().
> >
> > Also I was wondering about one thing: If we really care about bdi->dev only
> > for the name, won't we be much better off with just copying the name to
> > bdi->name on registration? Sure it would consume a bit of memory for the
> > name copy but I don't think we really care and things would be IMO *much*
> > simpler that way... Yufen, Tejun, what do you think?
> >
>
> I think copying the name to bdi->name is also need protected by lock.
> Otherwise, the reader of bdi->name may read incorrect value when
> re-registion have not copy the name completely. Right? So, I also think
> using RCU to protect object lifetimes may be a better way.
OK, fair enough. :)
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2020-02-20 12:08 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-11 14:00 [PATCH] bdi: fix use-after-free for bdi device Yufen Yu
2020-02-11 13:57 ` Yufen Yu
2020-02-12 21:33 ` Tejun Heo
2020-02-13 2:46 ` Yufen Yu
2020-02-13 3:48 ` Tejun Heo
2020-02-13 7:51 ` Yufen Yu
2020-02-13 13:58 ` Tejun Heo
2020-02-14 2:50 ` Yufen Yu
2020-02-14 14:05 ` Tejun Heo
2020-02-15 13:54 ` Yufen Yu
2020-02-19 12:55 ` Jan Kara
2020-02-19 15:12 ` Tejun Heo
2020-02-20 11:07 ` Yufen Yu
2020-02-20 12:07 ` Jan Kara
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).