All of lore.kernel.org
 help / color / mirror / Atom feed
From: Hannes Reinecke <hare@suse.de>
To: Ming Lei <ming.lei@redhat.com>, Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org,
	Dongli Zhang <dongli.zhang@oracle.com>,
	James Smart <james.smart@broadcom.com>,
	Bart Van Assche <bart.vanassche@wdc.com>,
	linux-scsi@vger.kernel.org,
	"Martin K . Petersen" <martin.petersen@oracle.com>,
	Christoph Hellwig <hch@lst.de>,
	"James E . J . Bottomley" <jejb@linux.vnet.ibm.com>,
	jianchao wang <jianchao.w.wang@oracle.com>
Subject: Re: [PATCH V5 6/9] blk-mq: always free hctx after request queue is freed
Date: Fri, 12 Apr 2019 13:06:07 +0200	[thread overview]
Message-ID: <3e4c2137-2a3e-cf00-45ec-3d54dbc4fc6a@suse.de> (raw)
In-Reply-To: <20190412033032.10418-7-ming.lei@redhat.com>

On 4/12/19 5:30 AM, Ming Lei wrote:
> In normal queue cleanup path, hctx is released after request queue
> is freed, see blk_mq_release().
> 
> However, in __blk_mq_update_nr_hw_queues(), hctx may be freed because
> of hw queues shrinking. This way is easy to cause use-after-free,
> because: one implicit rule is that it is safe to call almost all block
> layer APIs if the request queue is alive; and one hctx may be retrieved
> by one API, then the hctx can be freed by blk_mq_update_nr_hw_queues();
> finally use-after-free is triggered.
> 
> Fixes this issue by always freeing hctx after releasing request queue.
> If some hctxs are removed in blk_mq_update_nr_hw_queues(), introduce
> a per-queue list to hold them, then try to resuse these hctxs if numa
> node is matched.
> 
> Cc: Dongli Zhang <dongli.zhang@oracle.com>
> Cc: James Smart <james.smart@broadcom.com>
> Cc: Bart Van Assche <bart.vanassche@wdc.com>
> Cc: linux-scsi@vger.kernel.org,
> Cc: Martin K . Petersen <martin.petersen@oracle.com>,
> Cc: Christoph Hellwig <hch@lst.de>,
> Cc: James E . J . Bottomley <jejb@linux.vnet.ibm.com>,
> Cc: jianchao wang <jianchao.w.wang@oracle.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   block/blk-mq.c         | 40 +++++++++++++++++++++++++++-------------
>   include/linux/blk-mq.h |  2 ++
>   include/linux/blkdev.h |  7 +++++++
>   3 files changed, 36 insertions(+), 13 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 71996fe494eb..886fbb678617 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2260,6 +2260,10 @@ static void blk_mq_exit_hctx(struct request_queue *q,
>   		set->ops->exit_hctx(hctx, hctx_idx);
>   
>   	blk_mq_remove_cpuhp(hctx);
> +
> +	spin_lock(&q->dead_hctx_lock);
> +	list_add(&hctx->hctx_list, &q->dead_hctx_list);
> +	spin_unlock(&q->dead_hctx_lock);
>   }
>   
>   static void blk_mq_exit_hw_queues(struct request_queue *q,
> @@ -2660,15 +2664,13 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
>    */
>   void blk_mq_release(struct request_queue *q)
>   {
> -	struct blk_mq_hw_ctx *hctx;
> -	unsigned int i;
> +	struct blk_mq_hw_ctx *hctx, *next;
>   
>   	cancel_delayed_work_sync(&q->requeue_work);
>   
> -	/* hctx kobj stays in hctx */
> -	queue_for_each_hw_ctx(q, hctx, i) {
> -		if (!hctx)
> -			continue;
> +	/* all hctx are in .dead_hctx_list now */
> +	list_for_each_entry_safe(hctx, next, &q->dead_hctx_list, hctx_list) {
> +		list_del_init(&hctx->hctx_list);
>   		kobject_put(&hctx->kobj);
>   	}
>   
> @@ -2735,9 +2737,22 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
>   		struct blk_mq_tag_set *set, struct request_queue *q,
>   		int hctx_idx, int node)
>   {
> -	struct blk_mq_hw_ctx *hctx;
> +	struct blk_mq_hw_ctx *hctx = NULL, *tmp;
> +
> +	/* reuse dead hctx first */
> +	spin_lock(&q->dead_hctx_lock);
> +	list_for_each_entry(tmp, &q->dead_hctx_list, hctx_list) {
> +		if (tmp->numa_node == node) {
> +			hctx = tmp;
> +			break;
> +		}
> +	}
> +	if (hctx)
> +		list_del_init(&hctx->hctx_list);
> +	spin_unlock(&q->dead_hctx_lock);
>   
> -	hctx = blk_mq_alloc_hctx(q, set, hctx_idx, node);
> +	if (!hctx)
> +		hctx = blk_mq_alloc_hctx(q, set, hctx_idx, node);
>   	if (!hctx)
>   		goto fail;
>   
> @@ -2775,10 +2790,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
>   
>   		hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
>   		if (hctx) {
> -			if (hctxs[i]) {
> +			if (hctxs[i])
>   				blk_mq_exit_hctx(q, set, hctxs[i], i);
> -				kobject_put(&hctxs[i]->kobj);
> -			}
>   			hctxs[i] = hctx;
>   		} else {
>   			if (hctxs[i])
> @@ -2809,9 +2822,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
>   			if (hctx->tags)
>   				blk_mq_free_map_and_requests(set, j);
>   			blk_mq_exit_hctx(q, set, hctx, j);
> -			kobject_put(&hctx->kobj);
>   			hctxs[j] = NULL;
> -
>   		}
>   	}
>   	mutex_unlock(&q->sysfs_lock);
> @@ -2854,6 +2865,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   	if (!q->queue_hw_ctx)
>   		goto err_sys_init;
>   
> +	INIT_LIST_HEAD(&q->dead_hctx_list);
> +	spin_lock_init(&q->dead_hctx_lock);
> +
>   	blk_mq_realloc_hw_ctxs(set, q);
>   	if (!q->nr_hw_queues)
>   		goto err_hctxs;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index cb2aa7ecafff..a44c3f95dcc1 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -70,6 +70,8 @@ struct blk_mq_hw_ctx {
>   	struct dentry		*sched_debugfs_dir;
>   #endif
>   
> +	struct list_head	hctx_list;
> +
>   	/* Must be the last member - see also blk_mq_hw_ctx_size(). */
>   	struct srcu_struct	srcu[0];
>   };
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 4b85dc066264..1325f941f0be 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -535,6 +535,13 @@ struct request_queue {
>   
>   	struct mutex		sysfs_lock;
>   
> +	/*
> +	 * for reusing dead hctx instance in case of updating
> +	 * nr_hw_queues
> +	 */
> +	struct list_head	dead_hctx_list;
> +	spinlock_t		dead_hctx_lock;
> +
>   	atomic_t		mq_freeze_depth;
>   
>   #if defined(CONFIG_BLK_DEV_BSG)
> 
I actually had been looking into this, too, but couldn't convince myself 
that the code really is a problem.
Did you see this happening in real life?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah
HRB 21284 (AG Nürnberg)

  reply	other threads:[~2019-04-12 11:06 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-12  3:30 [PATCH V5 0/9] blk-mq: fix races related with freeing queue Ming Lei
2019-04-12  3:30 ` [PATCH V5 1/9] blk-mq: grab .q_usage_counter when queuing request from plug code path Ming Lei
2019-04-12  8:20   ` Johannes Thumshirn
2019-04-12 10:55   ` Hannes Reinecke
2019-04-12  3:30 ` [PATCH V5 2/9] blk-mq: move cancel of requeue_work into blk_mq_release Ming Lei
2019-04-12  8:23   ` Johannes Thumshirn
2019-04-12  3:30 ` [PATCH V5 3/9] blk-mq: free hw queue's resource in hctx's release handler Ming Lei
2019-04-12 11:03   ` Hannes Reinecke
2019-04-13  7:18     ` Ming Lei
2019-04-12  3:30 ` [PATCH V5 4/9] blk-mq: move all hctx alloction & initialization into __blk_mq_alloc_and_init_hctx Ming Lei
2019-04-12 11:04   ` Hannes Reinecke
2019-04-12  3:30 ` [PATCH V5 5/9] blk-mq: split blk_mq_alloc_and_init_hctx into two parts Ming Lei
2019-04-12 11:04   ` Hannes Reinecke
2019-04-12  3:30 ` [PATCH V5 6/9] blk-mq: always free hctx after request queue is freed Ming Lei
2019-04-12 11:06   ` Hannes Reinecke [this message]
2019-04-13  7:27     ` Ming Lei
2019-04-12  3:30 ` [PATCH V5 7/9] blk-mq: move cancel of hctx->run_work into blk_mq_hw_sysfs_release Ming Lei
2019-04-12 11:08   ` Hannes Reinecke
2019-04-12  3:30 ` [PATCH V5 8/9] block: don't drain in-progress dispatch in blk_cleanup_queue() Ming Lei
2019-04-12 11:09   ` Hannes Reinecke
2019-04-12  3:30 ` [PATCH V5 9/9] SCSI: don't hold device refcount in IO path Ming Lei
2019-04-12 11:09   ` Hannes Reinecke
2019-04-13  0:04   ` Martin K. Petersen
2019-04-13  6:56     ` Ming Lei
2019-04-13  9:23       ` Ming Lei
2019-04-16  2:12       ` Martin K. Petersen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3e4c2137-2a3e-cf00-45ec-3d54dbc4fc6a@suse.de \
    --to=hare@suse.de \
    --cc=axboe@kernel.dk \
    --cc=bart.vanassche@wdc.com \
    --cc=dongli.zhang@oracle.com \
    --cc=hch@lst.de \
    --cc=james.smart@broadcom.com \
    --cc=jejb@linux.vnet.ibm.com \
    --cc=jianchao.w.wang@oracle.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=ming.lei@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.