From: Jens Axboe <axboe@kernel.dk>
To: Bart Van Assche <bvanassche@acm.org>,
"jianchao.wang" <jianchao.w.wang@oracle.com>,
"linux-block@vger.kernel.org" <linux-block@vger.kernel.org>
Subject: Re: v4.20-rc6: Sporadic use-after-free in bt_iter()
Date: Thu, 20 Dec 2018 15:50:13 -0700 [thread overview]
Message-ID: <71fb9eff-43eb-24aa-fb67-be56a3a97983@kernel.dk> (raw)
In-Reply-To: <5869f2ed-dc65-135f-f12f-c14a1184125e@kernel.dk>
On 12/20/18 3:47 PM, Jens Axboe wrote:
> On 12/20/18 3:33 PM, Jens Axboe wrote:
>> On 12/20/18 3:23 PM, Jens Axboe wrote:
>>> On 12/20/18 3:19 PM, Bart Van Assche wrote:
>>>> On Thu, 2018-12-20 at 14:48 -0700, Jens Axboe wrote:
>>>>> -void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>>>>> - unsigned int hctx_idx)
>>>>> +static void blk_mq_rcu_free_pages(struct work_struct *work)
>>>>> {
>>>>> + struct blk_mq_tags *tags = container_of(to_rcu_work(work),
>>>>> + struct blk_mq_tags, rcu_work);
>>>>> struct page *page;
>>>>>
>>>>> + while (!list_empty(&tags->page_list)) {
>>>>> + page = list_first_entry(&tags->page_list, struct page, lru);
>>>>> + list_del_init(&page->lru);
>>>>> + /*
>>>>> + * Remove kmemleak object previously allocated in
>>>>> + * blk_mq_init_rq_map().
>>>>> + */
>>>>> + kmemleak_free(page_address(page));
>>>>> + __free_pages(page, page->private);
>>>>> + }
>>>>> +}
>>>>> +
>>>>> +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>>>>> + unsigned int hctx_idx)
>>>>> +{
>>>>> if (tags->rqs && set->ops->exit_request) {
>>>>> int i;
>>>>>
>>>>> @@ -2038,16 +2061,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>>>>> }
>>>>> }
>>>>>
>>>>> - while (!list_empty(&tags->page_list)) {
>>>>> - page = list_first_entry(&tags->page_list, struct page, lru);
>>>>> - list_del_init(&page->lru);
>>>>> - /*
>>>>> - * Remove kmemleak object previously allocated in
>>>>> - * blk_mq_init_rq_map().
>>>>> - */
>>>>> - kmemleak_free(page_address(page));
>>>>> - __free_pages(page, page->private);
>>>>> - }
>>>>> + /* Punt to RCU free, so we don't race with tag iteration */
>>>>> + INIT_RCU_WORK(&tags->rcu_work, blk_mq_rcu_free_pages);
>>>>> + queue_rcu_work(system_wq, &tags->rcu_work);
>>>>> }
>>>>
>>>> This can only work correctly if blk_mq_rcu_free_pages() is called before
>>>> INIT_RCU_WORK() is called a second time for the same blk_mq_tags structure
>>>> and if blk_mq_rcu_free_pages() is called before struct blk_mq_tags is freed.
>>>> What provides these guarantees? Did I perhaps miss something?
>>>
>>> We don't call it twice. Protecting against that is outside the scope
>>> of this function. But we call and clear from the regular shutdown path,
>>> and the rest is error handling on setup.
>>>
>>> But yes, we do need to ensure that 'tags' doesn't go away. Probably best
>>> to rework it to shove it somewhere else for freeing for that case, and
>>> leave the rest of the shutdown alone. I'll update the patch.
>>
>> Here's a version that doesn't rely on tags, we just dynamically allocate
>> the structure. For the very odd case where we fail, we just punt to
>> an on-stack structure and use the big synchronize_rcu() hammer first.
>
> Forgot to init the lists... This one I actually booted, works for me.
> Outside of that, not tested, in terms of verifying the use-after-free is
> gone for tag iteration.
Oh, and we probably shouldn't free it unless we actually allocated it...
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a3fc7191c694..827e3d2180d8 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -491,12 +491,22 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
return NULL;
}
+static void blk_fq_rcu_free(struct work_struct *work)
+{
+ struct blk_flush_queue *fq = container_of(to_rcu_work(work),
+ struct blk_flush_queue,
+ rcu_work);
+
+ kfree(fq->flush_rq);
+ kfree(fq);
+}
+
void blk_free_flush_queue(struct blk_flush_queue *fq)
{
/* bio based request queue hasn't flush queue */
if (!fq)
return;
- kfree(fq->flush_rq);
- kfree(fq);
+ INIT_RCU_WORK(&fq->rcu_work, blk_fq_rcu_free);
+ queue_rcu_work(system_wq, &fq->rcu_work);
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..c39b58391ae8 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (!reserved)
bitnr += tags->nr_reserved_tags;
- rq = tags->rqs[bitnr];
+ if (tags->rqs[bitnr].queue != hctx->queue)
+ return true;
/*
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
- if (rq && rq->q == hctx->queue)
+ rq = tags->rqs[bitnr].rq;
+ if (rq)
return iter_data->fn(hctx, rq, iter_data->data, reserved);
return true;
}
@@ -263,7 +265,9 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
.reserved = reserved,
};
+ rcu_read_lock();
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
+ rcu_read_unlock();
}
struct bt_tags_iter_data {
@@ -287,7 +291,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assining ->rqs[].
*/
- rq = tags->rqs[bitnr];
+ rq = tags->rqs[bitnr].rq;
if (rq && blk_mq_request_started(rq))
return iter_data->fn(rq, iter_data->data, reserved);
@@ -317,8 +321,11 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
.reserved = reserved,
};
- if (tags->rqs)
+ if (tags->rqs) {
+ rcu_read_lock();
sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
+ rcu_read_unlock();
+ }
}
/**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..bb84d1f099c7 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@
#include "blk-mq.h"
+struct rq_tag_entry {
+ struct request_queue *queue;
+ struct request *rq;
+};
+
/*
* Tag address space map.
*/
@@ -16,7 +21,7 @@ struct blk_mq_tags {
struct sbitmap_queue bitmap_tags;
struct sbitmap_queue breserved_tags;
- struct request **rqs;
+ struct rq_tag_entry *rqs;
struct request **static_rqs;
struct list_head page_list;
};
@@ -78,7 +83,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
unsigned int tag, struct request *rq)
{
- hctx->tags->rqs[tag] = rq;
+ hctx->tags->rqs[tag].queue = hctx->queue;
+ hctx->tags->rqs[tag].rq = rq;
}
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2de972857496..82341a78a0c0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->tag = -1;
rq->internal_tag = tag;
} else {
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+ struct blk_mq_hw_ctx *hctx = data->hctx;
+
+ if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
rq_flags = RQF_MQ_INFLIGHT;
- atomic_inc(&data->hctx->nr_active);
+ atomic_inc(&hctx->nr_active);
}
rq->tag = tag;
rq->internal_tag = -1;
- data->hctx->tags->rqs[rq->tag] = rq;
+ hctx->tags->rqs[rq->tag].queue = hctx->queue;
+ hctx->tags->rqs[rq->tag].rq = rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
@@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
if (tag < tags->nr_tags) {
- prefetch(tags->rqs[tag]);
- return tags->rqs[tag];
+ prefetch(tags->rqs[tag].rq);
+ return tags->rqs[tag].rq;
}
return NULL;
@@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
void *priv, bool reserved)
{
/*
- * If we find a request that is inflight and the queue matches,
- * we know the queue is busy. Return false to stop the iteration.
+ * We're only called here if the queue matches. If the rq state is
+ * inflight, we know the queue is busy. Return false to stop the
+ * iteration.
*/
- if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+ if (rq->state == MQ_RQ_IN_FLIGHT) {
bool *busy = priv;
*busy = true;
@@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
shared = blk_mq_tag_busy(data.hctx);
rq->tag = blk_mq_get_tag(&data);
if (rq->tag >= 0) {
+ struct blk_mq_hw_ctx *hctx = data.hctx;
+
if (shared) {
rq->rq_flags |= RQF_MQ_INFLIGHT;
atomic_inc(&data.hctx->nr_active);
}
- data.hctx->tags->rqs[rq->tag] = rq;
+ hctx->tags->rqs[rq->tag].queue = hctx->queue;
+ hctx->tags->rqs[rq->tag].rq = rq;
}
done:
@@ -2020,10 +2027,36 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
return cookie;
}
+struct rq_page_list {
+ struct rcu_work rcu_work;
+ struct list_head list;
+ bool on_stack;
+};
+
+static void blk_mq_rcu_free_pages(struct work_struct *work)
+{
+ struct rq_page_list *rpl = container_of(to_rcu_work(work),
+ struct rq_page_list, rcu_work);
+ struct page *page;
+
+ while (!list_empty(&rpl->list)) {
+ page = list_first_entry(&rpl->list, struct page, lru);
+ list_del_init(&page->lru);
+ /*
+ * Remove kmemleak object previously allocated in
+ * blk_mq_init_rq_map().
+ */
+ kmemleak_free(page_address(page));
+ __free_pages(page, page->private);
+ }
+ if (!rpl->on_stack)
+ kfree(rpl);
+}
+
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
- struct page *page;
+ struct rq_page_list *rpl;
if (tags->rqs && set->ops->exit_request) {
int i;
@@ -2038,15 +2071,30 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
- while (!list_empty(&tags->page_list)) {
- page = list_first_entry(&tags->page_list, struct page, lru);
- list_del_init(&page->lru);
+ if (list_empty(&tags->page_list))
+ return;
+
+ rpl = kmalloc(sizeof(*rpl), GFP_NOIO);
+ if (rpl) {
+ INIT_LIST_HEAD(&rpl->list);
+ list_splice_init(&tags->page_list, &rpl->list);
+
+ /* Punt to RCU free, so we don't race with tag iteration */
+ INIT_RCU_WORK(&rpl->rcu_work, blk_mq_rcu_free_pages);
+ rpl->on_stack = false;
+ queue_rcu_work(system_wq, &rpl->rcu_work);
+ } else {
+ struct rq_page_list stack_rpl;
+
/*
- * Remove kmemleak object previously allocated in
- * blk_mq_init_rq_map().
+ * Fail alloc, punt to on-stack, we just have to synchronize
+ * RCU first to ensure readers are done.
*/
- kmemleak_free(page_address(page));
- __free_pages(page, page->private);
+ INIT_LIST_HEAD(&stack_rpl.list);
+ list_splice_init(&tags->page_list, &stack_rpl.list);
+ stack_rpl.on_stack = true;
+ synchronize_rcu();
+ blk_mq_rcu_free_pages(&stack_rpl.rcu_work.work);
}
}
@@ -2077,7 +2125,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
if (!tags)
return NULL;
- tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+ tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->rqs) {
diff --git a/block/blk.h b/block/blk.h
index 848278c52030..785207fc8a30 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,6 +29,8 @@ struct blk_flush_queue {
*/
struct request *orig_rq;
spinlock_t mq_flush_lock;
+
+ struct rcu_work rcu_work;
};
extern struct kmem_cache *blk_requestq_cachep;
--
Jens Axboe
next prev parent reply other threads:[~2018-12-20 22:50 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-12-19 23:24 v4.20-rc6: Sporadic use-after-free in bt_iter() Bart Van Assche
2018-12-19 23:27 ` Jens Axboe
2018-12-20 0:16 ` Bart Van Assche
2018-12-20 3:17 ` Jens Axboe
2018-12-20 3:24 ` jianchao.wang
2018-12-20 4:19 ` Jens Axboe
2018-12-20 4:32 ` jianchao.wang
2018-12-20 4:48 ` Jens Axboe
2018-12-20 5:03 ` jianchao.wang
2018-12-20 13:02 ` Jens Axboe
2018-12-20 13:07 ` Jens Axboe
2018-12-20 18:01 ` Bart Van Assche
2018-12-20 18:21 ` Jens Axboe
2018-12-20 18:33 ` Jens Axboe
2018-12-20 20:56 ` Bart Van Assche
2018-12-20 21:00 ` Jens Axboe
2018-12-20 21:23 ` Bart Van Assche
2018-12-20 21:26 ` Jens Axboe
2018-12-20 21:31 ` Bart Van Assche
2018-12-20 21:34 ` Jens Axboe
2018-12-20 21:40 ` Bart Van Assche
2018-12-20 21:44 ` Jens Axboe
2018-12-20 21:48 ` Jens Axboe
2018-12-20 22:19 ` Bart Van Assche
2018-12-20 22:23 ` Jens Axboe
2018-12-20 22:33 ` Jens Axboe
2018-12-20 22:47 ` Jens Axboe
2018-12-20 22:50 ` Jens Axboe [this message]
2019-02-14 23:36 ` Bart Van Assche
2019-02-15 18:29 ` Evan Green
2019-02-19 16:48 ` Bart Van Assche
2019-02-21 20:54 ` Evan Green
2019-02-15 2:57 ` jianchao.wang
2018-12-20 4:06 ` Ming Lei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=71fb9eff-43eb-24aa-fb67-be56a3a97983@kernel.dk \
--to=axboe@kernel.dk \
--cc=bvanassche@acm.org \
--cc=jianchao.w.wang@oracle.com \
--cc=linux-block@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).