* [PATCH V2 0/2] blk-mq: fix kobject lifetime issue
From: Ming Lei @ 2018-11-16 11:23 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, jianchao.wang, Guenter Roeck, Greg Kroah-Hartman

Hi Jens,

The 1st patch fixes the kobject lifetime issue which is triggered when
DEBUG_KOBJECT_RELEASE is enabled.

The 2nd patch can be thought of as a follow-up cleanup.

V2:
	- allocate 'blk_mq_ctx' inside blk_mq_init_allocated_queue()
	- allocate q->mq_kobj directly 

Ming Lei (2):
  blk-mq: not embed .mq_kobj and ctx->kobj into queue instance
  blk-mq: alloc q->queue_ctx as normal array

 block/blk-mq-sysfs.c   | 51 ++++++++++++++++++++++++-----------------
 block/blk-mq.c         | 61 +++++++++++++++++++++++++++++++++++++++++---------
 block/blk-mq.h         |  4 ++--
 include/linux/blkdev.h |  4 ++--
 4 files changed, 84 insertions(+), 36 deletions(-)

Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

-- 
2.9.5



* [PATCH V2 for-4.21 1/2] blk-mq: not embed .mq_kobj and ctx->kobj into queue instance
From: Ming Lei @ 2018-11-16 11:23 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, jianchao.wang, Guenter Roeck,
	Greg Kroah-Hartman, stable

From the block layer's view, .mq_kobj, ctx->kobj and q->kobj appear to
share the same lifetime, but in reality they don't: userspace may grab
a reference to any of these kobjects at any time via sysfs. Each
kobject's lifetime therefore has to be independent, which means the
objects hosting their own kobject (mq_kobj, ctx) have to be allocated
dynamically.

This patch fixes a kernel panic during boot when DEBUG_KOBJECT_RELEASE
is enabled.
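
To illustrate the failure mode (a minimal, hypothetical sketch; 'foo'
and foo_teardown() are made-up names, not the actual blk-mq call
chain): with an embedded kobject, the backing memory is freed by the
container's own lifetime rules while sysfs may still hold a reference,
and DEBUG_KOBJECT_RELEASE makes this visible by deferring the final
release:

	#include <linux/kobject.h>
	#include <linux/slab.h>

	struct foo {
		struct kobject kobj;	/* embedded: no allocation of its own */
	};

	static void foo_teardown(struct foo *f)
	{
		/*
		 * With DEBUG_KOBJECT_RELEASE the final ->release()
		 * runs from delayed work, well after this put returns;
		 * a sysfs user may also still hold a reference.
		 */
		kobject_put(&f->kobj);

		/*
		 * Freeing the container also frees the embedded kobject
		 * while it may still be referenced: use-after-free.
		 */
		kfree(f);
	}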

Reported-by: Guenter Roeck <linux@roeck-us.net>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: stable@vger.kernel.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sysfs.c   | 41 +++++++++++++++++++++++++------------
 block/blk-mq.c         | 55 +++++++++++++++++++++++++++++++++++++++++---------
 block/blk-mq.h         |  4 ++--
 include/linux/blkdev.h |  4 ++--
 4 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 3d25b9c419e9..cc2fef909afc 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -15,6 +15,14 @@
 
 static void blk_mq_sysfs_release(struct kobject *kobj)
 {
+	kfree(kobj);
+}
+
+static void blk_mq_ctx_sysfs_release(struct kobject *kobj)
+{
+	struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
+
+	kfree(ctx);
 }
 
 static void blk_mq_hw_sysfs_release(struct kobject *kobj)
@@ -213,7 +221,7 @@ static struct kobj_type blk_mq_ktype = {
 static struct kobj_type blk_mq_ctx_ktype = {
 	.sysfs_ops	= &blk_mq_sysfs_ops,
 	.default_attrs	= default_ctx_attrs,
-	.release	= blk_mq_sysfs_release,
+	.release	= blk_mq_ctx_sysfs_release,
 };
 
 static struct kobj_type blk_mq_hw_ktype = {
@@ -245,7 +253,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	if (!hctx->nr_ctx)
 		return 0;
 
-	ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
+	ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num);
 	if (ret)
 		return ret;
 
@@ -268,8 +276,8 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
 
-	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
-	kobject_del(&q->mq_kobj);
+	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+	kobject_del(q->mq_kobj);
 	kobject_put(&dev->kobj);
 
 	q->mq_sysfs_init_done = false;
@@ -286,23 +294,30 @@ void blk_mq_sysfs_deinit(struct request_queue *q)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		ctx = per_cpu_ptr(q->queue_ctx, cpu);
+		ctx = *per_cpu_ptr(q->queue_ctx, cpu);
 		kobject_put(&ctx->kobj);
 	}
-	kobject_put(&q->mq_kobj);
+	kobject_put(q->mq_kobj);
 }
 
-void blk_mq_sysfs_init(struct request_queue *q)
+int blk_mq_sysfs_init(struct request_queue *q)
 {
 	struct blk_mq_ctx *ctx;
 	int cpu;
+	struct kobject *mq_kobj;
+
+	mq_kobj = kzalloc(sizeof(*mq_kobj), GFP_KERNEL);
+	if (!mq_kobj)
+		return -ENOMEM;
 
-	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+	kobject_init(mq_kobj, &blk_mq_ktype);
 
 	for_each_possible_cpu(cpu) {
-		ctx = per_cpu_ptr(q->queue_ctx, cpu);
+		ctx = *per_cpu_ptr(q->queue_ctx, cpu);
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 	}
+	q->mq_kobj = mq_kobj;
+	return 0;
 }
 
 int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
@@ -313,11 +328,11 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
 	WARN_ON_ONCE(!q->kobj.parent);
 	lockdep_assert_held(&q->sysfs_lock);
 
-	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+	ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
 		goto out;
 
-	kobject_uevent(&q->mq_kobj, KOBJ_ADD);
+	kobject_uevent(q->mq_kobj, KOBJ_ADD);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
@@ -334,8 +349,8 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
 	while (--i >= 0)
 		blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
 
-	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
-	kobject_del(&q->mq_kobj);
+	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+	kobject_del(q->mq_kobj);
 	kobject_put(&dev->kobj);
 	return ret;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b823891b3ef..376c04778d33 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2299,7 +2299,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	unsigned int i, j;
 
 	for_each_possible_cpu(i) {
-		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
+		struct blk_mq_ctx *__ctx = *per_cpu_ptr(q->queue_ctx, i);
 		struct blk_mq_hw_ctx *hctx;
 
 		__ctx->cpu = i;
@@ -2385,7 +2385,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			set->map[0].mq_map[i] = 0;
 		}
 
-		ctx = per_cpu_ptr(q->queue_ctx, i);
+		ctx = *per_cpu_ptr(q->queue_ctx, i);
 		for (j = 0; j < set->nr_maps; j++) {
 			hctx = blk_mq_map_queue_type(q, j, i);
 
@@ -2515,6 +2515,38 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 	mutex_unlock(&set->tag_list_lock);
 }
 
+static void blk_mq_dealloc_queue_ctx(struct request_queue *q, bool free_ctxs)
+{
+	if (free_ctxs) {
+		int cpu;
+		for_each_possible_cpu(cpu)
+			kfree(*per_cpu_ptr(q->queue_ctx, cpu));
+	}
+	free_percpu(q->queue_ctx);
+}
+
+static int blk_mq_alloc_queue_ctx(struct request_queue *q)
+{
+	struct blk_mq_ctx *ctx;
+	int cpu;
+
+	q->queue_ctx = alloc_percpu(struct blk_mq_ctx *);
+	if (!q->queue_ctx)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		ctx = kzalloc_node(sizeof(*ctx), GFP_KERNEL, cpu_to_node(cpu));
+		if (!ctx)
+			goto fail;
+		*per_cpu_ptr(q->queue_ctx, cpu) = ctx;
+	}
+
+	return 0;
+ fail:
+	blk_mq_dealloc_queue_ctx(q, true);
+	return -ENOMEM;
+}
+
 /*
  * It is the actual release handler for mq, but we do it from
  * request queue's release handler for avoiding use-after-free
@@ -2541,7 +2573,7 @@ void blk_mq_release(struct request_queue *q)
 	 */
 	blk_mq_sysfs_deinit(q);
 
-	free_percpu(q->queue_ctx);
+	blk_mq_dealloc_queue_ctx(q, false);
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
@@ -2722,6 +2754,8 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 						  struct request_queue *q)
 {
+	bool sysfs_init_done = false;
+
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
@@ -2731,18 +2765,19 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!q->poll_cb)
 		goto err_exit;
 
-	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
-	if (!q->queue_ctx)
+	if (blk_mq_alloc_queue_ctx(q))
 		goto err_exit;
 
 	/* init q->mq_kobj and sw queues' kobjects */
-	blk_mq_sysfs_init(q);
+	if (blk_mq_sysfs_init(q))
+		goto err_queue_ctx;
 
+	sysfs_init_done = true;
 	q->nr_queues = nr_hw_queues(set);
 	q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
 						GFP_KERNEL, set->numa_node);
 	if (!q->queue_hw_ctx)
-		goto err_percpu;
+		goto err_sys_init;
 
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
@@ -2794,8 +2829,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 err_hctxs:
 	kfree(q->queue_hw_ctx);
-err_percpu:
-	free_percpu(q->queue_ctx);
+err_sys_init:
+	blk_mq_sysfs_deinit(q);
+err_queue_ctx:
+	blk_mq_dealloc_queue_ctx(q, !sysfs_init_done);
 err_exit:
 	q->mq_ops = NULL;
 	return ERR_PTR(-ENOMEM);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index facb6e9ddce4..84898793c230 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -108,7 +108,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 /*
  * sysfs helpers
  */
-extern void blk_mq_sysfs_init(struct request_queue *q);
+extern int blk_mq_sysfs_init(struct request_queue *q);
 extern void blk_mq_sysfs_deinit(struct request_queue *q);
 extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
 extern int blk_mq_sysfs_register(struct request_queue *q);
@@ -129,7 +129,7 @@ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
 					   unsigned int cpu)
 {
-	return per_cpu_ptr(q->queue_ctx, cpu);
+	return *per_cpu_ptr(q->queue_ctx, cpu);
 }
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1d185f1fc333..9e3892bd67fd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -407,7 +407,7 @@ struct request_queue {
 	const struct blk_mq_ops	*mq_ops;
 
 	/* sw queues */
-	struct blk_mq_ctx __percpu	*queue_ctx;
+	struct blk_mq_ctx __percpu	**queue_ctx;
 	unsigned int		nr_queues;
 
 	unsigned int		queue_depth;
@@ -456,7 +456,7 @@ struct request_queue {
 	/*
 	 * mq queue kobject
 	 */
-	struct kobject mq_kobj;
+	struct kobject *mq_kobj;
 
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity integrity;
-- 
2.9.5



* [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array
From: Ming Lei @ 2018-11-16 11:23 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, jianchao.wang, Guenter Roeck, Greg Kroah-Hartman

Now q->queue_ctx is just a read-mostly table for looking up the
'blk_mq_ctx' instance for a given CPU index, so it isn't necessary
to allocate it as a percpu variable. A plain array may be more
efficient.

Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sysfs.c   | 14 ++++----------
 block/blk-mq.c         | 18 ++++++++++--------
 block/blk-mq.h         |  2 +-
 include/linux/blkdev.h |  2 +-
 4 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index cc2fef909afc..ae2cafd6f8a8 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -290,19 +290,15 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 
 void blk_mq_sysfs_deinit(struct request_queue *q)
 {
-	struct blk_mq_ctx *ctx;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		ctx = *per_cpu_ptr(q->queue_ctx, cpu);
-		kobject_put(&ctx->kobj);
-	}
+	for_each_possible_cpu(cpu)
+		kobject_put(&q->queue_ctx[cpu]->kobj);
 	kobject_put(q->mq_kobj);
 }
 
 int blk_mq_sysfs_init(struct request_queue *q)
 {
-	struct blk_mq_ctx *ctx;
 	int cpu;
 	struct kobject *mq_kobj;
 
@@ -312,10 +308,8 @@ int blk_mq_sysfs_init(struct request_queue *q)
 
 	kobject_init(mq_kobj, &blk_mq_ktype);
 
-	for_each_possible_cpu(cpu) {
-		ctx = *per_cpu_ptr(q->queue_ctx, cpu);
-		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
-	}
+	for_each_possible_cpu(cpu)
+		kobject_init(&q->queue_ctx[cpu]->kobj, &blk_mq_ctx_ktype);
 	q->mq_kobj = mq_kobj;
 	return 0;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 376c04778d33..85d5dba56272 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2299,7 +2299,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	unsigned int i, j;
 
 	for_each_possible_cpu(i) {
-		struct blk_mq_ctx *__ctx = *per_cpu_ptr(q->queue_ctx, i);
+		struct blk_mq_ctx *__ctx = q->queue_ctx[i];
 		struct blk_mq_hw_ctx *hctx;
 
 		__ctx->cpu = i;
@@ -2385,7 +2385,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			set->map[0].mq_map[i] = 0;
 		}
 
-		ctx = *per_cpu_ptr(q->queue_ctx, i);
+		ctx = q->queue_ctx[i];
 		for (j = 0; j < set->nr_maps; j++) {
 			hctx = blk_mq_map_queue_type(q, j, i);
 
@@ -2520,9 +2520,9 @@ static void blk_mq_dealloc_queue_ctx(struct request_queue *q, bool free_ctxs)
 	if (free_ctxs) {
 		int cpu;
 		for_each_possible_cpu(cpu)
-			kfree(*per_cpu_ptr(q->queue_ctx, cpu));
+			kfree(q->queue_ctx[cpu]);
 	}
-	free_percpu(q->queue_ctx);
+	kfree(q->queue_ctx);
 }
 
 static int blk_mq_alloc_queue_ctx(struct request_queue *q)
@@ -2530,7 +2530,9 @@ static int blk_mq_alloc_queue_ctx(struct request_queue *q)
 	struct blk_mq_ctx *ctx;
 	int cpu;
 
-	q->queue_ctx = alloc_percpu(struct blk_mq_ctx *);
+	q->queue_ctx = kmalloc_array_node(nr_cpu_ids,
+					  sizeof(struct blk_mq_ctx *),
+					  GFP_KERNEL, q->tag_set->numa_node);
 	if (!q->queue_ctx)
 		return -ENOMEM;
 
@@ -2538,7 +2540,7 @@ static int blk_mq_alloc_queue_ctx(struct request_queue *q)
 		ctx = kzalloc_node(sizeof(*ctx), GFP_KERNEL, cpu_to_node(cpu));
 		if (!ctx)
 			goto fail;
-		*per_cpu_ptr(q->queue_ctx, cpu) = ctx;
+		q->queue_ctx[cpu] = ctx;
 	}
 
 	return 0;
@@ -2759,6 +2761,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
+	q->tag_set = set;
+
 	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
 					     blk_mq_poll_stats_bkt,
 					     BLK_MQ_POLL_STATS_BKTS, q);
@@ -2786,8 +2790,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
-	q->tag_set = set;
-
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 84898793c230..97829388e1db 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -129,7 +129,7 @@ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
 					   unsigned int cpu)
 {
-	return *per_cpu_ptr(q->queue_ctx, cpu);
+	return q->queue_ctx[cpu];
 }
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9e3892bd67fd..9b6ddc5c7a40 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -407,7 +407,7 @@ struct request_queue {
 	const struct blk_mq_ops	*mq_ops;
 
 	/* sw queues */
-	struct blk_mq_ctx __percpu	**queue_ctx;
+	struct blk_mq_ctx	**queue_ctx;
 	unsigned int		nr_queues;
 
 	unsigned int		queue_depth;
-- 
2.9.5



* Re: [PATCH V2 for-4.21 1/2] blk-mq: not embed .mq_kobj and ctx->kobj into queue instance
From: Greg Kroah-Hartman @ 2018-11-16 14:05 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck, stable

On Fri, Nov 16, 2018 at 07:23:10PM +0800, Ming Lei wrote:
> @@ -456,7 +456,7 @@ struct request_queue {
>  	/*
>  	 * mq queue kobject
>  	 */
> -	struct kobject mq_kobj;
> +	struct kobject *mq_kobj;

What is this kobject even used for?  It wasn't obvious at all from this
patch.  Why is it needed if you are not using it to reference count the
larger structure here?

thanks,

greg k-h


* Re: [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array
From: Greg Kroah-Hartman @ 2018-11-16 14:06 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck

On Fri, Nov 16, 2018 at 07:23:11PM +0800, Ming Lei wrote:
> Now q->queue_ctx is just one read-mostly table for query the
> 'blk_mq_ctx' instance from one cpu index, it isn't necessary
> to allocate it as percpu variable. One simple array may be
> more efficient.

"may be", have you run benchmarks to be sure?  If so, can you add the
results of them to this changelog?  If there is no measurable
difference, then why make this change at all?

thanks,

greg k-h


* Re: [PATCH V2 for-4.21 1/2] blk-mq: not embed .mq_kobj and ctx->kobj into queue instance
From: Ming Lei @ 2018-11-17  2:26 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck, stable

On Fri, Nov 16, 2018 at 06:05:21AM -0800, Greg Kroah-Hartman wrote:
> On Fri, Nov 16, 2018 at 07:23:10PM +0800, Ming Lei wrote:
> > @@ -456,7 +456,7 @@ struct request_queue {
> >  	/*
> >  	 * mq queue kobject
> >  	 */
> > -	struct kobject mq_kobj;
> > +	struct kobject *mq_kobj;
> 
> What is this kobject even used for?  It wasn't obvious at all from this
> patch, why is it needed if you are not using it to reference count the
> larger structure here?

All attributes and kobjects under /sys/block/$DEV/mq are actually
covered by this kobject, and they all expose blk-mq specific
information. Now there is only blk-mq, as the legacy IO path has been
removed.
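
Roughly, that part of the hierarchy looks like this (illustrative
sketch):

	/sys/block/$DEV/mq              <- q->mq_kobj
	/sys/block/$DEV/mq/<n>          <- hctx->kobj, one per hw queue
	/sys/block/$DEV/mq/<n>/cpu<c>   <- ctx->kobj, one per CPU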

That is why I mentioned last time that we may remove this kobject and
move everything under /sys/block/$DEV/queue; however, you thought that
might break some userspace.

If we want to backport them to stable, this patch may be a bit easier to take.

Thanks,
Ming


* Re: [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array
From: Ming Lei @ 2018-11-17  2:34 UTC (permalink / raw)
  To: Greg Kroah-Hartman; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck

On Fri, Nov 16, 2018 at 06:06:23AM -0800, Greg Kroah-Hartman wrote:
> On Fri, Nov 16, 2018 at 07:23:11PM +0800, Ming Lei wrote:
> > Now q->queue_ctx is just one read-mostly table for query the
> > 'blk_mq_ctx' instance from one cpu index, it isn't necessary
> > to allocate it as percpu variable. One simple array may be
> > more efficient.
> 
> "may be", have you run benchmarks to be sure?  If so, can you add the
> results of them to this changelog?  If there is no measurable
> difference, then why make this change at all?

__blk_mq_get_ctx() is used in the fast path; which one do you think is
more efficient?

- *per_cpu_ptr(q->queue_ctx, cpu);

- q->queue_ctx[cpu]

At least the latter isn't worse than the former.

Especially since q->queue_ctx is just a read-only look-up table, it
doesn't make sense to make it percpu any more.

Not to mention that q->queue_ctx[cpu] is cleaner and more readable.
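
Roughly, and just for illustration (the exact code generation is of
course arch and compiler dependent):

	/*
	 * percpu pointer table (after patch 1): the per-CPU offset is
	 * typically read from __per_cpu_offset[cpu] before the ctx
	 * pointer itself can be loaded.
	 */
	ctx = *per_cpu_ptr(q->queue_ctx, cpu);

	/* plain array (this patch): one indexed load from the table */
	ctx = q->queue_ctx[cpu];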

Thanks,
Ming


* Re: [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array
From: Greg Kroah-Hartman @ 2018-11-17 10:03 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck

On Sat, Nov 17, 2018 at 10:34:18AM +0800, Ming Lei wrote:
> On Fri, Nov 16, 2018 at 06:06:23AM -0800, Greg Kroah-Hartman wrote:
> > On Fri, Nov 16, 2018 at 07:23:11PM +0800, Ming Lei wrote:
> > > Now q->queue_ctx is just one read-mostly table for query the
> > > 'blk_mq_ctx' instance from one cpu index, it isn't necessary
> > > to allocate it as percpu variable. One simple array may be
> > > more efficient.
> > 
> > "may be", have you run benchmarks to be sure?  If so, can you add the
> > results of them to this changelog?  If there is no measurable
> > difference, then why make this change at all?
> 
> __blk_mq_get_ctx() is used in fast path, what do you think about which
> one is more efficient?
> 
> - *per_cpu_ptr(q->queue_ctx, cpu);
> 
> - q->queue_ctx[cpu]

You need to actually test to see which one is faster; you might be
surprised :)

In other words, do not just guess.

> At least the latter isn't worse than the former.

How do you know?

> Especially q->queue_ctx is just a read-only look-up table, it doesn't
> make sense to make it percpu any more.
> 
> Not mention q->queue_ctx[cpu] is more clean/readable.

Again, please test to verify this.

thanks,

greg k-h


* Re: [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array
From: Ming Lei @ 2018-11-19  2:04 UTC (permalink / raw)
  To: Greg Kroah-Hartman; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck

On Sat, Nov 17, 2018 at 11:03:42AM +0100, Greg Kroah-Hartman wrote:
> On Sat, Nov 17, 2018 at 10:34:18AM +0800, Ming Lei wrote:
> > On Fri, Nov 16, 2018 at 06:06:23AM -0800, Greg Kroah-Hartman wrote:
> > > On Fri, Nov 16, 2018 at 07:23:11PM +0800, Ming Lei wrote:
> > > > Now q->queue_ctx is just one read-mostly table for query the
> > > > 'blk_mq_ctx' instance from one cpu index, it isn't necessary
> > > > to allocate it as percpu variable. One simple array may be
> > > > more efficient.
> > > 
> > > "may be", have you run benchmarks to be sure?  If so, can you add the
> > > results of them to this changelog?  If there is no measurable
> > > difference, then why make this change at all?
> > 
> > __blk_mq_get_ctx() is used in fast path, what do you think about which
> > one is more efficient?
> > 
> > - *per_cpu_ptr(q->queue_ctx, cpu);
> > 
> > - q->queue_ctx[cpu]
> 
> You need to actually test to see which one is faster, you might be
> surprised :)
> 
> In other words, do not just guess.

No performance difference is observed with this patchset when I run
the following fio test on null_blk (modprobe null_blk) in my VM:

fio --direct=1 --size=128G --bsrange=4k-4k --runtime=40 --numjobs=32 \
  --ioengine=libaio --iodepth=64 --group_reporting=1 --filename=/dev/nullb0 \
  --name=null_blk-ttest-randread --rw=randread

Running tests is important, but IMO it is more important to understand
that the idea behind a change is correct, or that the approach can be
proven correct.

Given that the number of test cases grows exponentially as the related
factors and settings are covered, we obviously can't run all of them.

> 
> > At least the latter isn't worse than the former.
> 
> How do you know?

As I explained, q->queue_ctx is basically a read-only look-up table
after the queue is initialized, and there isn't any benefit to using
the percpu allocator here.

> 
> > Especially q->queue_ctx is just a read-only look-up table, it doesn't
> > make sense to make it percpu any more.
> > 
> > Not mention q->queue_ctx[cpu] is more clean/readable.
> 
> Again, please test to verify this.

IMO, testing alone can't verify that an approach or a patch is
correct, given that we can't cover all cases by testing; it should
serve as a supplement to the proof or analysis of a patch, as is
usually done in the commit log or during review.


Thanks,
Ming


* Re: [PATCH V2 for-4.21 1/2] blk-mq: not embed .mq_kobj and ctx->kobj into queue instance
From: Greg Kroah-Hartman @ 2018-11-19 10:06 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck, stable

On Sat, Nov 17, 2018 at 10:26:38AM +0800, Ming Lei wrote:
> On Fri, Nov 16, 2018 at 06:05:21AM -0800, Greg Kroah-Hartman wrote:
> > On Fri, Nov 16, 2018 at 07:23:10PM +0800, Ming Lei wrote:
> > > @@ -456,7 +456,7 @@ struct request_queue {
> > >  	/*
> > >  	 * mq queue kobject
> > >  	 */
> > > -	struct kobject mq_kobj;
> > > +	struct kobject *mq_kobj;
> > 
> > What is this kobject even used for?  It wasn't obvious at all from this
> > patch, why is it needed if you are not using it to reference count the
> > larger structure here?
> 
> All attributes and kobjects under /sys/block/$DEV/mq are covered by this kobject
> actually, and all are for exposing blk-mq specific information, but now there is
> only blk-mq, and legacy io path is removed.

I am sorry, but I really cannot parse this sentence at all.

Which Documentation/ABI/ entries are covered by this kobject?  That
would help me out more.  And what do you mean by "legacy io"?

> That is why I mentioned we may remove this kobject last time and move all under
> /sys/block/$DEV/queue, however you thought that may break some userspace.

Who relies on these sysfs files today?

> If we want to backport them to stable, this patch may be a bit easier to go.

Why do you want to backport any of this to stable?

thanks,

greg k-h


* Re: [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array
From: Greg Kroah-Hartman @ 2018-11-19 10:17 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck

On Mon, Nov 19, 2018 at 10:04:27AM +0800, Ming Lei wrote:
> On Sat, Nov 17, 2018 at 11:03:42AM +0100, Greg Kroah-Hartman wrote:
> > On Sat, Nov 17, 2018 at 10:34:18AM +0800, Ming Lei wrote:
> > > On Fri, Nov 16, 2018 at 06:06:23AM -0800, Greg Kroah-Hartman wrote:
> > > > On Fri, Nov 16, 2018 at 07:23:11PM +0800, Ming Lei wrote:
> > > > > Now q->queue_ctx is just one read-mostly table for query the
> > > > > 'blk_mq_ctx' instance from one cpu index, it isn't necessary
> > > > > to allocate it as percpu variable. One simple array may be
> > > > > more efficient.
> > > > 
> > > > "may be", have you run benchmarks to be sure?  If so, can you add the
> > > > results of them to this changelog?  If there is no measurable
> > > > difference, then why make this change at all?
> > > 
> > > __blk_mq_get_ctx() is used in fast path, what do you think about which
> > > one is more efficient?
> > > 
> > > - *per_cpu_ptr(q->queue_ctx, cpu);
> > > 
> > > - q->queue_ctx[cpu]
> > 
> > You need to actually test to see which one is faster, you might be
> > surprised :)
> > 
> > In other words, do not just guess.
> 
> No performance difference is observed wrt. this patchset when I
> run the following fio test on null_blk(modprobe null_blk) in my VM:
> 
> fio --direct=1 --size=128G --bsrange=4k-4k --runtime=40 --numjobs=32 \
>   --ioengine=libaio --iodepth=64 --group_reporting=1 --filename=/dev/nullb0 \
>   --name=null_blk-ttest-randread --rw=randread
> 
> Running test is important, but IMO it is more important to understand
> the idea behind is correct, or the approach can be proved as correct.
> 
> Given the count of test cases can be increased exponentially when the related
> factors or settings are covered, obviously we can't run all the test cases.

And what happens when you start to scale the number of queues and cpus
in the system?  Do both options work the same?  Why did the original
code have per-cpu variables?

Anyway, this isn't my subsystem, and it has nothing to do with kobjects,
so I really do not care.  My point is, do not make core changes like
this without really knowing the reason behind the original choice and at
least testing that your change does not break that reasoning.

good luck!

greg k-h


* Re: [PATCH V2 for-4.21 1/2] blk-mq: not embed .mq_kobj and ctx->kobj into queue instance
From: Ming Lei @ 2018-11-19 10:20 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck, stable

On Mon, Nov 19, 2018 at 11:06:06AM +0100, Greg Kroah-Hartman wrote:
> On Sat, Nov 17, 2018 at 10:26:38AM +0800, Ming Lei wrote:
> > On Fri, Nov 16, 2018 at 06:05:21AM -0800, Greg Kroah-Hartman wrote:
> > > On Fri, Nov 16, 2018 at 07:23:10PM +0800, Ming Lei wrote:
> > > > @@ -456,7 +456,7 @@ struct request_queue {
> > > >  	/*
> > > >  	 * mq queue kobject
> > > >  	 */
> > > > -	struct kobject mq_kobj;
> > > > +	struct kobject *mq_kobj;
> > > 
> > > What is this kobject even used for?  It wasn't obvious at all from this
> > > patch, why is it needed if you are not using it to reference count the
> > > larger structure here?
> > 
> > All attributes and kobjects under /sys/block/$DEV/mq are covered by this kobject
> > actually, and all are for exposing blk-mq specific information, but now there is
> > only blk-mq, and legacy io path is removed.
> 
> I am sorry, but I really can not parse this sentance at all.
> 
> What Documentation/ABI/ entries are covered by this kobject, that should
> help me out more.  And what do you mean by "legacy io"?

Since blk-mq was introduced, there have been two main IO request paths:

1) blk-mq, IO is queued via blk_mq_make_request()

2) legacy IO path, IO is queued via blk_queue_bio()

The legacy IO path will be removed in 4.21, and patches have been queued
in for-4.21/block.

> 
> > That is why I mentioned we may remove this kobject last time and move all under
> > /sys/block/$DEV/queue, however you thought that may break some userspace.
> 
> Who relies on these sysfs files today?

I don't know; actually, the question originally came from you :-(

https://marc.info/?l=linux-kernel&m=154205455332755&w=2

Even if we remove q->mq_kobj, the same kobject lifetime issue is still
there in the kobjects embedded in 'struct blk_mq_ctx'.

> 
> > If we want to backport them to stable, this patch may be a bit easier to go.
> 
> Why do you want to backport any of this to stable?

For this report from Guenter, backporting the 1st patch should be
enough, and it shouldn't be very hard to backport both.

I can help to backport them if the patches can't be applied cleanly.

Thanks,
Ming


* Re: [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array
From: Ming Lei @ 2018-11-19 10:51 UTC (permalink / raw)
  To: Greg Kroah-Hartman; +Cc: Jens Axboe, linux-block, jianchao.wang, Guenter Roeck

On Mon, Nov 19, 2018 at 11:17:49AM +0100, Greg Kroah-Hartman wrote:
> On Mon, Nov 19, 2018 at 10:04:27AM +0800, Ming Lei wrote:
> > On Sat, Nov 17, 2018 at 11:03:42AM +0100, Greg Kroah-Hartman wrote:
> > > On Sat, Nov 17, 2018 at 10:34:18AM +0800, Ming Lei wrote:
> > > > On Fri, Nov 16, 2018 at 06:06:23AM -0800, Greg Kroah-Hartman wrote:
> > > > > On Fri, Nov 16, 2018 at 07:23:11PM +0800, Ming Lei wrote:
> > > > > > Now q->queue_ctx is just one read-mostly table for query the
> > > > > > 'blk_mq_ctx' instance from one cpu index, it isn't necessary
> > > > > > to allocate it as percpu variable. One simple array may be
> > > > > > more efficient.
> > > > > 
> > > > > "may be", have you run benchmarks to be sure?  If so, can you add the
> > > > > results of them to this changelog?  If there is no measurable
> > > > > difference, then why make this change at all?
> > > > 
> > > > __blk_mq_get_ctx() is used in fast path, what do you think about which
> > > > one is more efficient?
> > > > 
> > > > - *per_cpu_ptr(q->queue_ctx, cpu);
> > > > 
> > > > - q->queue_ctx[cpu]
> > > 
> > > You need to actually test to see which one is faster, you might be
> > > surprised :)
> > > 
> > > In other words, do not just guess.
> > 
> > No performance difference is observed wrt. this patchset when I
> > run the following fio test on null_blk(modprobe null_blk) in my VM:
> > 
> > fio --direct=1 --size=128G --bsrange=4k-4k --runtime=40 --numjobs=32 \
> >   --ioengine=libaio --iodepth=64 --group_reporting=1 --filename=/dev/nullb0 \
> >   --name=null_blk-ttest-randread --rw=randread
> > 
> > Running test is important, but IMO it is more important to understand
> > the idea behind is correct, or the approach can be proved as correct.
> > 
> > Given the count of test cases can be increased exponentially when the related
> > factors or settings are covered, obviously we can't run all the test cases.
> 
> And what happens when you start to scale the number of queues and cpus
> in the system?

It is related to the number of CPUs.

> Does both options work the same?

This patch may introduce one extra memory read for looking up a
'blk_mq_ctx' instance.

> Why did the original code have per-cpu variables?

Each instance of 'struct blk_mq_ctx' is per-CPU by nature, so it is
natural to allocate them as one percpu variable.

However, there is a kobject lifetime issue if we allocate all
'blk_mq_ctx' instances as one single percpu variable, because we can't
release just one part (for one CPU) of that single percpu allocation.

So this patch converts the percpu variable into one read-only look-up
table plus one 'blk_mq_ctx' instance per CPU, with each instance
allocated from the local NUMA node of its CPU.

Another approach is to keep the percpu allocation and introduce a
reference counter that counts how many active 'ctx' instances there
are, freeing the percpu variable only when the refcount drops to zero;
that would save the extra memory read in __blk_mq_get_ctx().
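
As a hypothetical sketch of that alternative (untested; the names are
illustrative, and each 'blk_mq_ctx' is assumed to gain a back-pointer
to the shared container):

	/* one refcounted container owning the whole percpu area */
	struct blk_mq_ctxs {
		struct kobject kobj;
		struct blk_mq_ctx __percpu *queue_ctx;
	};

	static void blk_mq_ctxs_release(struct kobject *kobj)
	{
		struct blk_mq_ctxs *ctxs =
			container_of(kobj, struct blk_mq_ctxs, kobj);

		/* the last ctx reference is gone: freeing is safe now */
		free_percpu(ctxs->queue_ctx);
		kfree(ctxs);
	}

	static void blk_mq_ctx_release(struct kobject *kobj)
	{
		struct blk_mq_ctx *ctx =
			container_of(kobj, struct blk_mq_ctx, kobj);

		/* each ctx->kobj pins the shared container */
		kobject_put(&ctx->ctxs->kobj);
	}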


Thanks,
Ming

