* [PATCH v3 1/5] block: add weighted round robin for blkcgroup
From: Weiping Zhang @ 2019-06-24 14:28 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev
  Cc: linux-block, cgroups, linux-nvme

Each block cgroup can select a weighted round robin level so that its IO
requests are issued to the corresponding hardware queue. Three round
robin levels are supported, high, medium and low, following the NVMe
specification.
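
A minimal userspace sketch of the new interface (illustration only; the
cgroup-v1 path and the 259:0 device numbers are example values matching
the test commands quoted later in this series), writing the
"MAJOR:MINOR <level>" format that blkcg_wrr_write() parses:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/blkio/high/blkio.wrr";
	const char cmd[] = "259:0 high\n";	/* level: none/low/medium/high/urgent */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open blkio.wrr");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write blkio.wrr");
	close(fd);
	return 0;
}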

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 block/blk-cgroup.c         | 89 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.c     |  4 +++
 block/blk-mq-sched.c       |  6 +++-
 block/blk-mq-tag.c         |  4 +--
 block/blk-mq-tag.h         |  2 +-
 block/blk-mq.c             | 12 +++++--
 block/blk-mq.h             | 20 +++++++++--
 block/blk.h                |  2 +-
 include/linux/blk-cgroup.h |  2 ++
 include/linux/blk-mq.h     | 14 ++++++++
 10 files changed, 145 insertions(+), 10 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 440797293235..723a27417754 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1014,6 +1014,90 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 	return 0;
 }
 
+static const char *blk_wrr_name[BLK_WRR_COUNT] = {
+	[BLK_WRR_NONE]		= "none",
+	[BLK_WRR_LOW]		= "low",
+	[BLK_WRR_MEDIUM]	= "medium",
+	[BLK_WRR_HIGH]		= "high",
+	[BLK_WRR_URGENT]	= "urgent",
+};
+
+static inline const char *blk_wrr_to_name(int wrr)
+{
+	if (wrr < BLK_WRR_NONE || wrr >= BLK_WRR_COUNT)
+		return "wrong";
+
+	return blk_wrr_name[wrr];
+}
+
+static ssize_t blkcg_wrr_write(struct kernfs_open_file *of,
+			 char *buf, size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct gendisk *disk;
+	struct request_queue *q;
+	struct blkcg_gq *blkg;
+	unsigned int major, minor;
+	int wrr, key_len, part, ret;
+	char *body;
+
+	if (sscanf(buf, "%u:%u%n", &major, &minor, &key_len) != 2)
+		return -EINVAL;
+
+	body = buf + key_len;
+	if (!isspace(*body))
+		return -EINVAL;
+	body = skip_spaces(body);
+	wrr = sysfs_match_string(blk_wrr_name, body);
+	if (wrr == BLK_WRR_COUNT)
+		return -EINVAL;
+
+	disk = get_gendisk(MKDEV(major, minor), &part);
+	if (!disk)
+		return -ENODEV;
+	if (part) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	q = disk->queue;
+
+	blkg = blkg_lookup_create(blkcg, q);
+
+	atomic_set(&blkg->wrr, wrr);
+	put_disk_and_module(disk);
+
+	return nbytes;
+fail:
+	put_disk_and_module(disk);
+	return ret;
+}
+
+static int blkcg_wrr_show(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+		const char *dname;
+		char *buf;
+		size_t size = seq_get_buf(sf, &buf), off = 0;
+
+		dname = blkg_dev_name(blkg);
+		if (!dname)
+			continue;
+
+		off += scnprintf(buf+off, size-off, "%s %s\n", dname,
+			blk_wrr_to_name(atomic_read(&blkg->wrr)));
+		seq_commit(sf, off);
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+
 static struct cftype blkcg_files[] = {
 	{
 		.name = "stat",
@@ -1028,6 +1112,11 @@ static struct cftype blkcg_legacy_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkcg_reset_stats,
 	},
+	{
+		.name = "wrr",
+		.write = blkcg_wrr_write,
+		.seq_show = blkcg_wrr_show,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5d940ff124a5..53dcfca015cd 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -436,6 +436,10 @@ static int hctx_busy_show(void *data, struct seq_file *m)
 static const char *const hctx_types[] = {
 	[HCTX_TYPE_DEFAULT]	= "default",
 	[HCTX_TYPE_READ]	= "read",
+	[HCTX_TYPE_WRR_LOW]	= "wrr_low",
+	[HCTX_TYPE_WRR_MEDIUM]	= "wrr_medium",
+	[HCTX_TYPE_WRR_HIGH]	= "wrr_high",
+	[HCTX_TYPE_WRR_URGENT]	= "wrr_urgent",
 	[HCTX_TYPE_POLL]	= "poll",
 };
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2766066a15db..8a6c564877d0 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 
@@ -322,10 +323,13 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+	struct blk_mq_hw_ctx *hctx;
+	struct blkcg_gq *blkg = bio->bi_blkg;
+	int wrr = blkg ? atomic_read(&blkg->wrr) : BLK_WRR_NONE;
 	bool ret = false;
 	enum hctx_type type;
 
+	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx, wrr);
 	if (e && e->type->ops.bio_merge) {
 		blk_mq_put_ctx(ctx);
 		return e->type->ops.bio_merge(hctx, bio);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7513c8eaabee..6fd5261c4fbc 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -106,7 +106,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 		return __sbitmap_queue_get(bt);
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data, int wrr)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct sbitmap_queue *bt;
@@ -171,7 +171,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
-						data->ctx);
+						data->ctx, wrr);
 		tags = blk_mq_tags_from_data(data);
 		if (data->flags & BLK_MQ_REQ_RESERVED)
 			bt = &tags->breserved_tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..ef43f819ead8 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -25,7 +25,7 @@ struct blk_mq_tags {
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
-extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data, int wrr);
 extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
 			   struct blk_mq_ctx *ctx, unsigned int tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ce0f5f4ede70..8e8e743c4c0d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -356,6 +356,12 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	struct request *rq;
 	unsigned int tag;
 	bool put_ctx_on_error = false;
+	int wrr;
+
+	if (bio && bio->bi_blkg)
+		wrr = atomic_read(&bio->bi_blkg->wrr);
+	else
+		wrr = BLK_WRR_NONE;
 
 	blk_queue_enter_live(q);
 	data->q = q;
@@ -365,7 +371,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	}
 	if (likely(!data->hctx))
 		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-						data->ctx);
+						data->ctx, wrr);
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
@@ -385,7 +391,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		blk_mq_tag_busy(data->hctx);
 	}
 
-	tag = blk_mq_get_tag(data);
+	tag = blk_mq_get_tag(data, wrr);
 	if (tag == BLK_MQ_TAG_FAIL) {
 		if (put_ctx_on_error) {
 			blk_mq_put_ctx(data->ctx);
@@ -1060,7 +1066,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
 		data.flags |= BLK_MQ_REQ_RESERVED;
 
 	shared = blk_mq_tag_busy(data.hctx);
-	rq->tag = blk_mq_get_tag(&data);
+	rq->tag = blk_mq_get_tag(&data, BLK_WRR_NONE);
 	if (rq->tag >= 0) {
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 633a5a77ee8b..493a63e9f5ae 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -101,7 +101,8 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
  */
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 						     unsigned int flags,
-						     struct blk_mq_ctx *ctx)
+						     struct blk_mq_ctx *ctx,
+						     int wrr)
 {
 	enum hctx_type type = HCTX_TYPE_DEFAULT;
 
@@ -110,7 +111,22 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 	 */
 	if (flags & REQ_HIPRI)
 		type = HCTX_TYPE_POLL;
-	else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+	else if (wrr > BLK_WRR_NONE && wrr < BLK_WRR_COUNT) {
+		switch (wrr) {
+		case BLK_WRR_LOW:
+			type = HCTX_TYPE_WRR_LOW;
+			break;
+		case BLK_WRR_MEDIUM:
+			type = HCTX_TYPE_WRR_MEDIUM;
+			break;
+		case BLK_WRR_HIGH:
+			type = HCTX_TYPE_WRR_HIGH;
+			break;
+		default:
+			type = HCTX_TYPE_WRR_URGENT;
+			break;
+		}
+	} else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
 		type = HCTX_TYPE_READ;
 	
 	return ctx->hctxs[type];
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..7779157c63ee 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -39,7 +39,7 @@ extern struct ida blk_queue_ida;
 static inline struct blk_flush_queue *
 blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
 {
-	return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
+	return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx, BLK_WRR_NONE)->fq;
 }
 
 static inline void __blk_get_queue(struct request_queue *q)
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 76c61318fda5..0a6a9bcea590 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -141,6 +141,8 @@ struct blkcg_gq {
 	atomic64_t			delay_start;
 	u64				last_delay;
 	int				last_use;
+	/* weighted round robin */
+	atomic_t			wrr;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 15d1aa53d96c..b5445a8f58d8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -85,11 +85,25 @@ struct blk_mq_queue_map {
 enum hctx_type {
 	HCTX_TYPE_DEFAULT,	/* all I/O not otherwise accounted for */
 	HCTX_TYPE_READ,		/* just for READ I/O */
+	HCTX_TYPE_WRR_LOW,	/* blkcg: weighted round robin - low */
+	HCTX_TYPE_WRR_MEDIUM,	/* blkcg: weighted round robin - medium */
+	HCTX_TYPE_WRR_HIGH,	/* blkcg: weighted round robin - high */
+	HCTX_TYPE_WRR_URGENT,	/* blkcg: weighted round robin - urgent */
 	HCTX_TYPE_POLL,		/* polled I/O of any kind */
 
 	HCTX_MAX_TYPES,
 };
 
+enum blk_wrr {
+	BLK_WRR_NONE,
+	BLK_WRR_LOW,
+	BLK_WRR_MEDIUM,
+	BLK_WRR_HIGH,
+	BLK_WRR_URGENT,
+
+	BLK_WRR_COUNT,
+};
+
 struct blk_mq_tag_set {
 	/*
 	 * map[] holds ctx -> hctx mappings, one map exists for each type
-- 
2.14.1


* [PATCH v3 2/5] nvme: add get_ams for nvme_ctrl_ops
From: Weiping Zhang @ 2019-06-24 14:29 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev
  Cc: linux-block, cgroups, linux-nvme

get_ams() returns the AMS (Arbitration Mechanism Selected) value chosen
by the transport driver.
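
As a sketch only (the helper name below is made up; the PCIe
implementation added in this patch reports plain round robin, and a
WRR-capable version appears later in this series), a transport could
report WRRU when the controller advertises it in CAP.AMS bit 17:

static void nvme_example_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
{
	/* CAP.AMS bit 17: weighted round robin with urgent priority */
	if (ctrl->cap & (1ULL << 17))
		*ams = NVME_CC_AMS_WRRU;
	else
		*ams = NVME_CC_AMS_RR;
}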

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 drivers/nvme/host/core.c | 9 ++++++++-
 drivers/nvme/host/nvme.h | 1 +
 drivers/nvme/host/pci.c  | 6 ++++++
 include/linux/nvme.h     | 1 +
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b2dd4e391f5c..4cb781e73123 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1943,6 +1943,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 	 */
 	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
 	int ret;
+	u32 ams = NVME_CC_AMS_RR;
 
 	if (page_shift < dev_page_min) {
 		dev_err(ctrl->device,
@@ -1951,11 +1952,17 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 		return -ENODEV;
 	}
 
+	/* get Arbitration Mechanism Selected */
+	if (ctrl->ops->get_ams) {
+		ctrl->ops->get_ams(ctrl, &ams);
+		ams &= NVME_CC_AMS_MASK;
+	}
+
 	ctrl->page_size = 1 << page_shift;
 
 	ctrl->ctrl_config = NVME_CC_CSS_NVM;
 	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
-	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
+	ctrl->ctrl_config |= ams | NVME_CC_SHN_NONE;
 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
 	ctrl->ctrl_config |= NVME_CC_ENABLE;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ea45d7d393ad..9c7e9217f78b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -369,6 +369,7 @@ struct nvme_ctrl_ops {
 	void (*submit_async_event)(struct nvme_ctrl *ctrl);
 	void (*delete_ctrl)(struct nvme_ctrl *ctrl);
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+	void (*get_ams)(struct nvme_ctrl *ctrl, u32 *ams);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 189352081994..5d84241f0214 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2627,6 +2627,11 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
 	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
 }
 
+static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
+{
+	*ams = NVME_CC_AMS_RR;
+}
+
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.name			= "pcie",
 	.module			= THIS_MODULE,
@@ -2638,6 +2643,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
 	.get_address		= nvme_pci_get_address,
+	.get_ams		= nvme_pci_get_ams,
 };
 
 static int nvme_dev_map(struct nvme_dev *dev)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index da5f696aec00..8f71451fc2fa 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -156,6 +156,7 @@ enum {
 	NVME_CC_AMS_RR		= 0 << NVME_CC_AMS_SHIFT,
 	NVME_CC_AMS_WRRU	= 1 << NVME_CC_AMS_SHIFT,
 	NVME_CC_AMS_VS		= 7 << NVME_CC_AMS_SHIFT,
+	NVME_CC_AMS_MASK	= 7 << NVME_CC_AMS_SHIFT,
 	NVME_CC_SHN_NONE	= 0 << NVME_CC_SHN_SHIFT,
 	NVME_CC_SHN_NORMAL	= 1 << NVME_CC_SHN_SHIFT,
 	NVME_CC_SHN_ABRUPT	= 2 << NVME_CC_SHN_SHIFT,
-- 
2.14.1


* [PATCH v3 3/5] nvme-pci: rename module parameter write_queues to read_queues
From: Weiping Zhang @ 2019-06-24 14:29 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev
  Cc: linux-block, cgroups, linux-nvme

nvme currently supports three types of hardware queues: read, poll and
default. This patch renames write_queues to read_queues so that the
number of read queues is set more explicitly. It also prepares for NVMe
WRR (weighted round robin) support, where the number of each queue type
must be easy to obtain.
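
To make the new split concrete, a standalone sketch (plain userspace C,
not the kernel function; sample values are arbitrary) of how read_queues
now divides the granted interrupt vectors between default and read
queues:

#include <stdio.h>

/* mirrors the nr_read_queues calculation in nvme_calc_irq_sets() */
static void split(unsigned int nrirqs, unsigned int read_queues)
{
	unsigned int nr_read;

	if (nrirqs <= 1) {
		nrirqs = 1;
		nr_read = 0;		/* a single vector is all default */
	} else if (read_queues >= nrirqs) {
		nr_read = nrirqs - 1;	/* always keep one default queue */
	} else {
		nr_read = read_queues;
	}

	printf("nrirqs=%u read_queues=%u -> default=%u read=%u\n",
	       nrirqs, read_queues, nrirqs - nr_read, nr_read);
}

int main(void)
{
	split(8, 0);	/* default=8 read=0 */
	split(8, 3);	/* default=5 read=3 */
	split(8, 12);	/* default=1 read=7 */
	return 0;
}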

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 drivers/nvme/host/pci.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5d84241f0214..a3c9bb72d90e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -68,10 +68,10 @@ static int io_queue_depth = 1024;
 module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
 MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
 
-static int write_queues;
-module_param(write_queues, int, 0644);
-MODULE_PARM_DESC(write_queues,
-	"Number of queues to use for writes. If not set, reads and writes "
+static int read_queues;
+module_param(read_queues, int, 0644);
+MODULE_PARM_DESC(read_queues,
+	"Number of queues to use for reads. If not set, reads and writes "
 	"will share a queue set.");
 
 static int poll_queues;
@@ -211,7 +211,7 @@ struct nvme_iod {
 
 static unsigned int max_io_queues(void)
 {
-	return num_possible_cpus() + write_queues + poll_queues;
+	return num_possible_cpus() + read_queues + poll_queues;
 }
 
 static unsigned int max_queue_count(void)
@@ -2021,18 +2021,16 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 	 * If only one interrupt is available or 'write_queue' == 0, combine
 	 * write and read queues.
 	 *
-	 * If 'write_queues' > 0, ensure it leaves room for at least one read
+	 * If 'read_queues' > 0, ensure it leaves room for at least one write
 	 * queue.
 	 */
-	if (!nrirqs) {
+	if (!nrirqs || nrirqs == 1) {
 		nrirqs = 1;
 		nr_read_queues = 0;
-	} else if (nrirqs == 1 || !write_queues) {
-		nr_read_queues = 0;
-	} else if (write_queues >= nrirqs) {
-		nr_read_queues = 1;
+	} else if (read_queues >= nrirqs) {
+		nr_read_queues = nrirqs - 1;
 	} else {
-		nr_read_queues = nrirqs - write_queues;
+		nr_read_queues = read_queues;
 	}
 
 	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-- 
2.14.1


* [PATCH v3 4/5] genirq/affinity: allow driver's discontiguous affinity set
From: Weiping Zhang @ 2019-06-24 14:29 UTC (permalink / raw)
  To: tglx, axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev
  Cc: linux-block, cgroups, linux-nvme, linux-kernel

A driver may describe multiple affinity sets, and some of them may be
empty; in that case simply skip them.
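
For illustration (the set sizes below are hypothetical, in the layout
the WRR-enabled nvme driver later in this series can produce when some
priority levels are left at 0 queues), the zero entries are exactly the
empty sets this check skips:

#include <stdio.h>

int main(void)
{
	/* default, read, wrr_low, wrr_medium, wrr_high, wrr_urgent */
	unsigned int set_size[] = { 5, 2, 1, 0, 1, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(set_size) / sizeof(set_size[0]); i++) {
		if (set_size[i] == 0)
			continue;	/* nothing to spread for this set */
		printf("set %u: build affinity masks for %u vectors\n",
		       i, set_size[i]);
	}
	return 0;
}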

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 kernel/irq/affinity.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f18cd5aa33e8..6d964fe0fbd8 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -295,6 +295,10 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 		unsigned int this_vecs = affd->set_size[i];
 		int ret;
 
+		/* skip empty affinity set */
+		if (this_vecs == 0)
+			continue;
+
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
 					       curvec, masks);
 		if (ret) {
-- 
2.14.1


* [PATCH v3 5/5] nvme: add support weighted round robin queue
From: Weiping Zhang @ 2019-06-24 14:29 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev
  Cc: linux-block, cgroups, linux-nvme

This patch enables Weighted Round Robin if the NVMe device supports it.
Four module parameters, wrr_urgent_queues, wrr_high_queues,
wrr_medium_queues and wrr_low_queues, control the number of queues for
each priority. If the device does not support WRR, all four parameters
are forced back to 0. If the device supports WRR but all four parameters
are 0, the nvme driver does not enable WRR when setting CC.EN to 1.

Now nvme supports the following hardware queue types:
poll:		if io was marked for poll
wrr_low:	weighted round robin low
wrr_medium:	weighted round robin medium
wrr_high:	weighted round robin high
wrr_urgent:	weighted round robin urgent
read:		io read, if blkcg's wrr is none and the io is not poll
default:	for write/flush, if blkcg's wrr is none and the io is not poll

The read, default and poll submission queues use medium priority when
nvme's WRR is enabled.
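
As a sketch of how read_queues and the four wrr_* module parameters
carve up the granted interrupt vectors in the reworked
nvme_calc_irq_sets() (standalone userspace code, not the kernel
function; all numbers are example values):

#include <stdio.h>

static unsigned int take(unsigned int want, unsigned int *left)
{
	unsigned int got = want > *left ? *left : want;

	*left -= got;
	return got;
}

int main(void)
{
	unsigned int nrirqs = 16;		/* vectors granted by the PCI core */
	unsigned int left = nrirqs - 1;		/* one vector reserved for default */
	unsigned int rd  = take(4, &left);	/* read_queues=4 */
	unsigned int lo  = take(2, &left);	/* wrr_low_queues=2 */
	unsigned int med = take(2, &left);	/* wrr_medium_queues=2 */
	unsigned int hi  = take(2, &left);	/* wrr_high_queues=2 */
	unsigned int urg = take(1, &left);	/* wrr_urgent_queues=1 */
	unsigned int def = 1 + left;		/* leftover goes back to default */

	/* prints: default=5 read=4 low=2 medium=2 high=2 urgent=1 */
	printf("default=%u read=%u low=%u medium=%u high=%u urgent=%u\n",
	       def, rd, lo, med, hi, urg);
	return 0;
}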

Test result:

CPU:    Intel(R) Xeon(R) Platinum 8160 CPU @ 2.10GHz
NVME:   Intel SSDPE2KX020T8 P4510 2TB

[root@tmp-201812-d1802-818396173 low]# nvme show-regs /dev/nvme0n1
cap     : 2078030fff
version : 10200
intms   : 0
intmc   : 0
cc      : 460801
csts    : 1
nssr    : 0
aqa     : 1f001f
asq     : 5f7cc08000
acq     : 5f5ac23000
cmbloc  : 0
cmbsz   : 0
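
Decoding the cc value above as a sanity check (arithmetic added for
illustration): CC.AMS occupies bits 13:11, so cc = 0x460801 shows the
controller was enabled with weighted round robin:

#include <stdio.h>

int main(void)
{
	unsigned int cc = 0x460801;		/* "cc" from nvme show-regs above */
	unsigned int ams = (cc >> 11) & 0x7;	/* CC.AMS field, bits 13:11 */

	/* 0 = round robin, 1 = weighted round robin with urgent priority */
	printf("CC.AMS = %u (%s)\n", ams, ams == 1 ? "WRRU" : "RR");
	return 0;
}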

Run fio-1, fio-2 and fio-3 in parallel.

With RR (round robin) the three fio jobs get nearly the same iops/bps;
when blkio.wrr is set to different priorities, the WRR "high" job gets
more iops/bps than "medium" and "low".

RR:
fio-1: echo "259:0 none" > /sys/fs/cgroup/blkio/high/blkio.wrr
fio-2: echo "259:0 none" > /sys/fs/cgroup/blkio/medium/blkio.wrr
fio-3: echo "259:0 none" > /sys/fs/cgroup/blkio/low/blkio.wrr

WRR:
fio-1: echo "259:0 high" > /sys/fs/cgroup/blkio/high/blkio.wrr
fio-2: echo "259:0 medium" > /sys/fs/cgroup/blkio/medium/blkio.wrr
fio-3: echo "259:0 low" > /sys/fs/cgroup/blkio/low/blkio.wrr

rwtest=randread
fio --bs=4k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

Randread 4K     RR              WRR
-------------------------------------------------------
fio-1:          220 k           395 k
fio-2:          220 k           197 k
fio-3:          220 k           66  k

rwtest=randwrite
fio --bs=4k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

Randwrite 4K    RR              WRR
-------------------------------------------------------
fio-1:          150 k           295 k
fio-2:          150 k           148 k
fio-3:          150 k           51  k

rwtest=read
fio --bs=512k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

read 512K       RR              WRR
-------------------------------------------------------
fio-1:          963 MiB/s       1704 MiB/s
fio-2:          950 MiB/s       850  MiB/s
fio-3:          961 MiB/s       284  MiB/s

rwtest=write
fio --bs=512k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

write 512K      RR              WRR
-------------------------------------------------------
fio-1:          890 MiB/s       1150 MiB/s
fio-2:          871 MiB/s       595  MiB/s
fio-3:          895 MiB/s       188  MiB/s

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 drivers/nvme/host/nvme.h  |   1 +
 drivers/nvme/host/pci.c   | 228 ++++++++++++++++++++++++++++++++++++++--------
 include/linux/interrupt.h |   2 +-
 include/linux/nvme.h      |   2 +
 4 files changed, 192 insertions(+), 41 deletions(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9c7e9217f78b..2960d3bfa9bf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -212,6 +212,7 @@ struct nvme_ctrl {
 	unsigned int shutdown_timeout;
 	unsigned int kato;
 	bool subsystem;
+	bool wrr_enabled;
 	unsigned long quirks;
 	struct nvme_id_power_state psd[32];
 	struct nvme_effects_log *effects;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a3c9bb72d90e..d4aaa4e87312 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -78,6 +78,22 @@ static int poll_queues;
 module_param(poll_queues, int, 0644);
 MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
 
+static int wrr_low_queues;
+module_param(wrr_low_queues, int, 0644);
+MODULE_PARM_DESC(wrr_low_queues, "Number of WRR low queues.");
+
+static int wrr_medium_queues;
+module_param(wrr_medium_queues, int, 0644);
+MODULE_PARM_DESC(wrr_medium_queues, "Number of WRR medium queues.");
+
+static int wrr_high_queues;
+module_param(wrr_high_queues, int, 0644);
+MODULE_PARM_DESC(wrr_high_queues, "Number of WRR high queues.");
+
+static int wrr_urgent_queues;
+module_param(wrr_urgent_queues, int, 0644);
+MODULE_PARM_DESC(wrr_urgent_queues, "Number of WRR urgent queues.");
+
 struct nvme_dev;
 struct nvme_queue;
 
@@ -209,6 +225,14 @@ struct nvme_iod {
 	struct scatterlist *sg;
 };
 
+static inline bool nvme_is_wrr_allocated(struct nvme_dev *dev)
+{
+	return dev->io_queues[HCTX_TYPE_WRR_LOW] +
+		dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
+		dev->io_queues[HCTX_TYPE_WRR_HIGH] +
+		dev->io_queues[HCTX_TYPE_WRR_URGENT] > 0;
+}
+
 static unsigned int max_io_queues(void)
 {
 	return num_possible_cpus() + read_queues + poll_queues;
@@ -1139,19 +1163,23 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
-						struct nvme_queue *nvmeq)
+					struct nvme_queue *nvmeq, int wrr_flag)
 {
 	struct nvme_ctrl *ctrl = &dev->ctrl;
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG;
 
-	/*
-	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
-	 * set. Since URGENT priority is zeroes, it makes all queues
-	 * URGENT.
-	 */
-	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
-		flags |= NVME_SQ_PRIO_MEDIUM;
+	if (!dev->ctrl.wrr_enabled && !nvme_is_wrr_allocated(dev)) {
+		/*
+		 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
+		 * set. Since URGENT priority is zeroes, it makes all queues
+		 * URGENT.
+		 */
+		if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
+			flags |= NVME_SQ_PRIO_MEDIUM;
+	} else {
+		flags |= wrr_flag;
+	}
 
 	/*
 	 * Note: we (ab)use the fact that the prp fields survive if no data
@@ -1517,11 +1545,51 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	wmb(); /* ensure the first interrupt sees the initialization */
 }
 
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
-	int result;
+	int start, end, result, wrr_flag;
+	bool polled = false;
 	u16 vector = 0;
+	enum hctx_type type;
+
+	/* qid 0 is the admin queue; io queue indexes start at 1 */
+	start = 1;
+	/* get hardware context type base on qid */
+	for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
+		end = start + dev->io_queues[type] - 1;
+		if (qid >= start && qid <= end)
+			break;
+		start = end + 1;
+	}
+
+	if (type == HCTX_TYPE_POLL)
+		polled = true;
+
+	if (dev->ctrl.wrr_enabled && nvme_is_wrr_allocated(dev)) {
+		/* set read,poll,default to medium by default */
+		switch (type) {
+		case HCTX_TYPE_WRR_LOW:
+			wrr_flag = NVME_SQ_PRIO_LOW;
+			break;
+		case HCTX_TYPE_WRR_MEDIUM:
+		case HCTX_TYPE_POLL:
+		case HCTX_TYPE_DEFAULT:
+		case HCTX_TYPE_READ:
+			wrr_flag = NVME_SQ_PRIO_MEDIUM;
+			break;
+		case HCTX_TYPE_WRR_HIGH:
+			wrr_flag = NVME_SQ_PRIO_HIGH;
+			break;
+		case HCTX_TYPE_WRR_URGENT:
+			wrr_flag = NVME_SQ_PRIO_URGENT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		wrr_flag = NVME_SQ_PRIO_IGNORE;
+	}
 
 	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
@@ -1538,7 +1606,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	if (result)
 		return result;
 
-	result = adapter_alloc_sq(dev, qid, nvmeq);
+	result = adapter_alloc_sq(dev, qid, nvmeq, wrr_flag);
 	if (result < 0)
 		return result;
 	else if (result)
@@ -1709,7 +1777,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
-	unsigned i, max, rw_queues;
+	unsigned i, max;
 	int ret = 0;
 
 	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1720,17 +1788,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	}
 
 	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
-	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
-		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
-				dev->io_queues[HCTX_TYPE_READ];
-	} else {
-		rw_queues = max;
-	}
 
 	for (i = dev->online_queues; i <= max; i++) {
-		bool polled = i > rw_queues;
-
-		ret = nvme_create_queue(&dev->queues[i], i, polled);
+		ret = nvme_create_queue(&dev->queues[i], i);
 		if (ret)
 			break;
 	}
@@ -2011,7 +2071,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 {
 	struct nvme_dev *dev = affd->priv;
-	unsigned int nr_read_queues;
+	unsigned int nr_total, nr, nr_read, nr_default;
+	unsigned int nr_wrr_urgent, nr_wrr_high, nr_wrr_medium, nr_wrr_low;
+	unsigned int nr_sets;
+
 
 	/*
 	 * If there is no interupt available for queues, ensure that
@@ -2024,20 +2087,75 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 	 * If 'read_queues' > 0, ensure it leaves room for at least one write
 	 * queue.
 	 */
-	if (!nrirqs || nrirqs == 1) {
+	if (!nrirqs)
 		nrirqs = 1;
-		nr_read_queues = 0;
-	} else if (read_queues >= nrirqs) {
-		nr_read_queues = nrirqs - 1;
-	} else {
-		nr_read_queues = read_queues;
-	}
 
-	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
-	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
-	affd->nr_sets = nr_read_queues ? 2 : 1;
+	nr_total = nrirqs;
+
+	nr_read = nr_wrr_urgent = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
+
+	/* set default to 1, add all the rest queue to default at last */
+	nr = nr_default = 1;
+	nr_sets = 1;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* read queues */
+	nr_sets++;
+	nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr low queues */
+	nr_sets++;
+	nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr medium queues */
+	nr_sets++;
+	nr_wrr_medium = nr =
+		wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr high queues */
+	nr_sets++;
+	nr_wrr_high = nr =
+		wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr urgent queues */
+	nr_sets++;
+	nr_wrr_urgent = nr =
+		wrr_urgent_queues > nr_total ? nr_total : wrr_urgent_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* add all the rest queue to default */
+	nr_default += nr_total;
+
+done:
+	dev->io_queues[HCTX_TYPE_DEFAULT] = nr_default;
+	affd->set_size[HCTX_TYPE_DEFAULT] = nr_default;
+	dev->io_queues[HCTX_TYPE_READ] = nr_read;
+	affd->set_size[HCTX_TYPE_READ] = nr_read;
+	dev->io_queues[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+	affd->set_size[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+	dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+	affd->set_size[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+	dev->io_queues[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+	affd->set_size[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+	dev->io_queues[HCTX_TYPE_WRR_URGENT] = nr_wrr_urgent;
+	affd->set_size[HCTX_TYPE_WRR_URGENT] = nr_wrr_urgent;
+	affd->nr_sets = nr_sets;
 }
 
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
@@ -2070,6 +2188,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	/* Initialize for the single interrupt case */
 	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 	dev->io_queues[HCTX_TYPE_READ] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_LOW] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_HIGH] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_URGENT] = 0;
 
 	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
@@ -2156,10 +2278,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		nvme_suspend_io_queues(dev);
 		goto retry;
 	}
-	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+	dev_info(dev->ctrl.device, "%d/%d/%d/%d/%d/%d/%d "
+			"default/read/poll/low/medium/high/urgent queues\n",
 					dev->io_queues[HCTX_TYPE_DEFAULT],
 					dev->io_queues[HCTX_TYPE_READ],
-					dev->io_queues[HCTX_TYPE_POLL]);
+					dev->io_queues[HCTX_TYPE_POLL],
+					dev->io_queues[HCTX_TYPE_WRR_LOW],
+					dev->io_queues[HCTX_TYPE_WRR_MEDIUM],
+					dev->io_queues[HCTX_TYPE_WRR_HIGH],
+					dev->io_queues[HCTX_TYPE_WRR_URGENT]);
 	return 0;
 }
 
@@ -2248,9 +2375,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	if (!dev->ctrl.tagset) {
 		dev->tagset.ops = &nvme_mq_ops;
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
-		dev->tagset.nr_maps = 2; /* default + read */
-		if (dev->io_queues[HCTX_TYPE_POLL])
-			dev->tagset.nr_maps++;
+		dev->tagset.nr_maps = HCTX_MAX_TYPES;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;
 		dev->tagset.numa_node = dev_to_node(dev->dev);
 		dev->tagset.queue_depth =
@@ -2627,7 +2752,30 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
 
 static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
 {
-	*ams = NVME_CC_AMS_RR;
+	/* if the device doesn't support WRR, force all wrr queue counts to 0 */
+	if (!NVME_CAP_AMS_WRRU(ctrl->cap)) {
+		wrr_low_queues = 0;
+		wrr_medium_queues = 0;
+		wrr_high_queues = 0;
+		wrr_urgent_queues = 0;
+
+		*ams = NVME_CC_AMS_RR;
+		ctrl->wrr_enabled = false;
+		return;
+	}
+
+	/*
+	 * if device support WRR, check wrr queue count, all wrr queues are
+	 * 0, don't enable device's WRR.
+	 */
+	if ((wrr_low_queues + wrr_medium_queues + wrr_high_queues +
+				wrr_urgent_queues) > 0) {
+		*ams = NVME_CC_AMS_WRRU;
+		ctrl->wrr_enabled = true;
+	} else {
+		*ams = NVME_CC_AMS_RR;
+		ctrl->wrr_enabled = false;
+	}
 }
 
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..ea726c2f95cc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,7 +259,7 @@ struct irq_affinity_notify {
 	void (*release)(struct kref *ref);
 };
 
-#define	IRQ_AFFINITY_MAX_SETS  4
+#define	IRQ_AFFINITY_MAX_SETS  7
 
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 8f71451fc2fa..59b91a38d323 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -113,6 +113,7 @@ enum {
 };
 
 #define NVME_CAP_MQES(cap)	((cap) & 0xffff)
+#define NVME_CAP_AMS_WRRU(cap)	((cap) & (1 << 17))
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
 #define NVME_CAP_NSSRC(cap)	(((cap) >> 36) & 0x1)
@@ -844,6 +845,7 @@ enum {
 	NVME_SQ_PRIO_HIGH	= (1 << 1),
 	NVME_SQ_PRIO_MEDIUM	= (2 << 1),
 	NVME_SQ_PRIO_LOW	= (3 << 1),
+	NVME_SQ_PRIO_IGNORE	= NVME_SQ_PRIO_URGENT,
 	NVME_FEAT_ARBITRATION	= 0x01,
 	NVME_FEAT_POWER_MGMT	= 0x02,
 	NVME_FEAT_LBA_RANGE	= 0x03,
-- 
2.14.1


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH v3 5/5] nvme: add support weighted round robin queue
@ 2019-06-24 14:29   ` Weiping Zhang
  0 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-06-24 14:29 UTC (permalink / raw)


This patch enalbe Weithed Round Robin if nvme device support it.
We add four module parameters wrr_urgent_queues, wrr_high_queeus,
wrr_medium_queues, wrr_low_queues to control the number of queues for
specified priority. If device doesn't support WRR, all these four
parameters will be forced reset to 0. If device support WRR, but
all of these four parameters are 0, nvme dirver will not enable WRR
when set CC.EN to 1.

Now nvme support five types hardware queue:
poll:		if io was marked for poll
wrr_low:	weighted round robin low
wrr_medium:	weighted round robin medium
wrr_high:	weighted round robin high
wrr_urgent:	weighted round robin urgent
read:		io read, if blkcg's wrr is none and the io is not poll
defaut:		for write/flush, if blkcg's wrr is none and is not poll

The read, default and poll submission queue's priority is medium, when
nvme's wrr was enabled.

Test result:

CPU:    Intel(R) Xeon(R) Platinum 8160 CPU @ 2.10GHz
NVME:   Intel SSDPE2KX020T8 P4510 2TB

[root at tmp-201812-d1802-818396173 low]# nvme show-regs /dev/nvme0n1
cap     : 2078030fff
version : 10200
intms   : 0
intmc   : 0
cc      : 460801
csts    : 1
nssr    : 0
aqa     : 1f001f
asq     : 5f7cc08000
acq     : 5f5ac23000
cmbloc  : 0
cmbsz   : 0

Run fio-1, fio-2, fio-3 in parallel,

For RR(round robin) these three fio nearly get same iops or bps,
if we set blkio.wrr for different priority, the WRR "high" will
get more iops/bps than "medium" and "low".

RR:
fio-1: echo "259:0 none" > /sys/fs/cgroup/blkio/high/blkio.wrr
fio-2: echo "259:0 none" > /sys/fs/cgroup/blkio/medium/blkio.wrr
fio-3: echo "259:0 none" > /sys/fs/cgroup/blkio/low/blkio.wrr

WRR:
fio-1: echo "259:0 high" > /sys/fs/cgroup/blkio/high/blkio.wrr
fio-2: echo "259:0 medium" > /sys/fs/cgroup/blkio/medium/blkio.wrr
fio-3: echo "259:0 low" > /sys/fs/cgroup/blkio/low/blkio.wrr

rwtest=randread
fio --bs=4k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

Randread 4K     RR              WRR
-------------------------------------------------------
fio-1:          220 k           395 k
fio-2:          220 k           197 k
fio-3:          220 k           66  k

rwtest=randwrite
fio --bs=4k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

Randwrite 4K    RR              WRR
-------------------------------------------------------
fio-1:          150 k           295 k
fio-2:          150 k           148 k
fio-3:          150 k           51  k

rwtest=read
fio --bs=512k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

read 512K       RR              WRR
-------------------------------------------------------
fio-1:          963 MiB/s       1704 MiB/s
fio-2:          950 MiB/s       850  MiB/s
fio-3:          961 MiB/s       284  MiB/s

rwtest=read
fio --bs=512k --ioengine=libaio --iodepth=32 --filename=/dev/nvme0n1 \
--direct=1 --runtime=60 --numjobs=8 --rw=$rwtest --name=test \
--group_reporting

write 512K      RR              WRR
-------------------------------------------------------
fio-1:          890 MiB/s       1150 MiB/s
fio-2:          871 MiB/s       595  MiB/s
fio-3:          895 MiB/s       188  MiB/s

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 drivers/nvme/host/nvme.h  |   1 +
 drivers/nvme/host/pci.c   | 228 ++++++++++++++++++++++++++++++++++++++--------
 include/linux/interrupt.h |   2 +-
 include/linux/nvme.h      |   2 +
 4 files changed, 192 insertions(+), 41 deletions(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9c7e9217f78b..2960d3bfa9bf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -212,6 +212,7 @@ struct nvme_ctrl {
 	unsigned int shutdown_timeout;
 	unsigned int kato;
 	bool subsystem;
+	bool wrr_enabled;
 	unsigned long quirks;
 	struct nvme_id_power_state psd[32];
 	struct nvme_effects_log *effects;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a3c9bb72d90e..d4aaa4e87312 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -78,6 +78,22 @@ static int poll_queues;
 module_param(poll_queues, int, 0644);
 MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
 
+static int wrr_low_queues;
+module_param(wrr_low_queues, int, 0644);
+MODULE_PARM_DESC(wrr_low_queues, "Number of WRR low queues.");
+
+static int wrr_medium_queues;
+module_param(wrr_medium_queues, int, 0644);
+MODULE_PARM_DESC(wrr_medium_queues, "Number of WRR medium queues.");
+
+static int wrr_high_queues;
+module_param(wrr_high_queues, int, 0644);
+MODULE_PARM_DESC(wrr_high_queues, "Number of WRR high queues.");
+
+static int wrr_urgent_queues;
+module_param(wrr_urgent_queues, int, 0644);
+MODULE_PARM_DESC(wrr_urgent_queues, "Number of WRR urgent queues.");
+
 struct nvme_dev;
 struct nvme_queue;
 
@@ -209,6 +225,14 @@ struct nvme_iod {
 	struct scatterlist *sg;
 };
 
+static inline bool nvme_is_wrr_allocated(struct nvme_dev *dev)
+{
+	return dev->io_queues[HCTX_TYPE_WRR_LOW] +
+		dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
+		dev->io_queues[HCTX_TYPE_WRR_HIGH] +
+		dev->io_queues[HCTX_TYPE_WRR_URGENT] > 0;
+}
+
 static unsigned int max_io_queues(void)
 {
 	return num_possible_cpus() + read_queues + poll_queues;
@@ -1139,19 +1163,23 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
-						struct nvme_queue *nvmeq)
+					struct nvme_queue *nvmeq, int wrr_flag)
 {
 	struct nvme_ctrl *ctrl = &dev->ctrl;
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG;
 
-	/*
-	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
-	 * set. Since URGENT priority is zeroes, it makes all queues
-	 * URGENT.
-	 */
-	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
-		flags |= NVME_SQ_PRIO_MEDIUM;
+	if (!dev->ctrl.wrr_enabled && !nvme_is_wrr_allocated(dev)) {
+		/*
+		 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
+		 * set. Since URGENT priority is zeroes, it makes all queues
+		 * URGENT.
+		 */
+		if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
+			flags |= NVME_SQ_PRIO_MEDIUM;
+	} else {
+		flags |= wrr_flag;
+	}
 
 	/*
 	 * Note: we (ab)use the fact that the prp fields survive if no data
@@ -1517,11 +1545,51 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	wmb(); /* ensure the first interrupt sees the initialization */
 }
 
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
-	int result;
+	int start, end, result, wrr_flag;
+	bool polled = false;
 	u16 vector = 0;
+	enum hctx_type type;
+
+	/* 0 is for the admin queue, io queue index >= 1 */
+	start = 1;
+	/* get hardware context type based on qid */
+	for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
+		end = start + dev->io_queues[type] - 1;
+		if (qid >= start && qid <= end)
+			break;
+		start = end + 1;
+	}
+
+	if (type == HCTX_TYPE_POLL)
+		polled = true;
+
+	if (dev->ctrl.wrr_enabled && nvme_is_wrr_allocated(dev)) {
+		/* set read,poll,default to medium by default */
+		switch (type) {
+		case HCTX_TYPE_WRR_LOW:
+			wrr_flag = NVME_SQ_PRIO_LOW;
+			break;
+		case HCTX_TYPE_WRR_MEDIUM:
+		case HCTX_TYPE_POLL:
+		case HCTX_TYPE_DEFAULT:
+		case HCTX_TYPE_READ:
+			wrr_flag = NVME_SQ_PRIO_MEDIUM;
+			break;
+		case HCTX_TYPE_WRR_HIGH:
+			wrr_flag = NVME_SQ_PRIO_HIGH;
+			break;
+		case HCTX_TYPE_WRR_URGENT:
+			wrr_flag = NVME_SQ_PRIO_URGENT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		wrr_flag = NVME_SQ_PRIO_IGNORE;
+	}
 
 	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
@@ -1538,7 +1606,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	if (result)
 		return result;
 
-	result = adapter_alloc_sq(dev, qid, nvmeq);
+	result = adapter_alloc_sq(dev, qid, nvmeq, wrr_flag);
 	if (result < 0)
 		return result;
 	else if (result)
@@ -1709,7 +1777,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
-	unsigned i, max, rw_queues;
+	unsigned i, max;
 	int ret = 0;
 
 	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1720,17 +1788,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	}
 
 	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
-	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
-		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
-				dev->io_queues[HCTX_TYPE_READ];
-	} else {
-		rw_queues = max;
-	}
 
 	for (i = dev->online_queues; i <= max; i++) {
-		bool polled = i > rw_queues;
-
-		ret = nvme_create_queue(&dev->queues[i], i, polled);
+		ret = nvme_create_queue(&dev->queues[i], i);
 		if (ret)
 			break;
 	}
@@ -2011,7 +2071,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 {
 	struct nvme_dev *dev = affd->priv;
-	unsigned int nr_read_queues;
+	unsigned int nr_total, nr, nr_read, nr_default;
+	unsigned int nr_wrr_urgent, nr_wrr_high, nr_wrr_medium, nr_wrr_low;
+	unsigned int nr_sets;
+
 
 	/*
 	 * If there is no interupt available for queues, ensure that
@@ -2024,20 +2087,75 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 	 * If 'read_queues' > 0, ensure it leaves room for at least one write
 	 * queue.
 	 */
-	if (!nrirqs || nrirqs == 1) {
+	if (!nrirqs)
 		nrirqs = 1;
-		nr_read_queues = 0;
-	} else if (read_queues >= nrirqs) {
-		nr_read_queues = nrirqs - 1;
-	} else {
-		nr_read_queues = read_queues;
-	}
 
-	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
-	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
-	affd->nr_sets = nr_read_queues ? 2 : 1;
+	nr_total = nrirqs;
+
+	nr_read = nr_wrr_urgent = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
+
+	/* set default to 1; add all remaining queues to default at the end */
+	nr = nr_default = 1;
+	nr_sets = 1;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* read queues */
+	nr_sets++;
+	nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr low queues */
+	nr_sets++;
+	nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr medium queues */
+	nr_sets++;
+	nr_wrr_medium = nr =
+		wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr high queues */
+	nr_sets++;
+	nr_wrr_high = nr =
+		wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr urgent queues */
+	nr_sets++;
+	nr_wrr_urgent = nr =
+		wrr_urgent_queues > nr_total ? nr_total : wrr_urgent_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* add all remaining queues to default */
+	nr_default += nr_total;
+
+done:
+	dev->io_queues[HCTX_TYPE_DEFAULT] = nr_default;
+	affd->set_size[HCTX_TYPE_DEFAULT] = nr_default;
+	dev->io_queues[HCTX_TYPE_READ] = nr_read;
+	affd->set_size[HCTX_TYPE_READ] = nr_read;
+	dev->io_queues[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+	affd->set_size[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+	dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+	affd->set_size[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+	dev->io_queues[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+	affd->set_size[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+	dev->io_queues[HCTX_TYPE_WRR_URGENT] = nr_wrr_urgent;
+	affd->set_size[HCTX_TYPE_WRR_URGENT] = nr_wrr_urgent;
+	affd->nr_sets = nr_sets;
 }
 
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
@@ -2070,6 +2188,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	/* Initialize for the single interrupt case */
 	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 	dev->io_queues[HCTX_TYPE_READ] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_LOW] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_HIGH] = 0;
+	dev->io_queues[HCTX_TYPE_WRR_URGENT] = 0;
 
 	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
@@ -2156,10 +2278,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		nvme_suspend_io_queues(dev);
 		goto retry;
 	}
-	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+	dev_info(dev->ctrl.device, "%d/%d/%d/%d/%d/%d/%d "
+			"default/read/poll/low/medium/high/urgent queues\n",
 					dev->io_queues[HCTX_TYPE_DEFAULT],
 					dev->io_queues[HCTX_TYPE_READ],
-					dev->io_queues[HCTX_TYPE_POLL]);
+					dev->io_queues[HCTX_TYPE_POLL],
+					dev->io_queues[HCTX_TYPE_WRR_LOW],
+					dev->io_queues[HCTX_TYPE_WRR_MEDIUM],
+					dev->io_queues[HCTX_TYPE_WRR_HIGH],
+					dev->io_queues[HCTX_TYPE_WRR_URGENT]);
 	return 0;
 }
 
@@ -2248,9 +2375,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	if (!dev->ctrl.tagset) {
 		dev->tagset.ops = &nvme_mq_ops;
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
-		dev->tagset.nr_maps = 2; /* default + read */
-		if (dev->io_queues[HCTX_TYPE_POLL])
-			dev->tagset.nr_maps++;
+		dev->tagset.nr_maps = HCTX_MAX_TYPES;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;
 		dev->tagset.numa_node = dev_to_node(dev->dev);
 		dev->tagset.queue_depth =
@@ -2627,7 +2752,30 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
 
 static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
 {
-	*ams = NVME_CC_AMS_RR;
+	/* if the device doesn't support WRR, force the wrr queue counts to 0 */
+	if (!NVME_CAP_AMS_WRRU(ctrl->cap)) {
+		wrr_low_queues = 0;
+		wrr_medium_queues = 0;
+		wrr_high_queues = 0;
+		wrr_urgent_queues = 0;
+
+		*ams = NVME_CC_AMS_RR;
+		ctrl->wrr_enabled = false;
+		return;
+	}
+
+	/*
+	 * If the device supports WRR, check the wrr queue counts; if they are
+	 * all 0, don't enable the device's WRR.
+	 */
+	if ((wrr_low_queues + wrr_medium_queues + wrr_high_queues +
+				wrr_urgent_queues) > 0) {
+		*ams = NVME_CC_AMS_WRRU;
+		ctrl->wrr_enabled = true;
+	} else {
+		*ams = NVME_CC_AMS_RR;
+		ctrl->wrr_enabled = false;
+	}
 }
 
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..ea726c2f95cc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,7 +259,7 @@ struct irq_affinity_notify {
 	void (*release)(struct kref *ref);
 };
 
-#define	IRQ_AFFINITY_MAX_SETS  4
+#define	IRQ_AFFINITY_MAX_SETS  7
 
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 8f71451fc2fa..59b91a38d323 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -113,6 +113,7 @@ enum {
 };
 
 #define NVME_CAP_MQES(cap)	((cap) & 0xffff)
+#define NVME_CAP_AMS_WRRU(cap)	((cap) & (1 << 17))
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
 #define NVME_CAP_NSSRC(cap)	(((cap) >> 36) & 0x1)
@@ -844,6 +845,7 @@ enum {
 	NVME_SQ_PRIO_HIGH	= (1 << 1),
 	NVME_SQ_PRIO_MEDIUM	= (2 << 1),
 	NVME_SQ_PRIO_LOW	= (3 << 1),
+	NVME_SQ_PRIO_IGNORE	= NVME_SQ_PRIO_URGENT,
 	NVME_FEAT_ARBITRATION	= 0x01,
 	NVME_FEAT_POWER_MGMT	= 0x02,
 	NVME_FEAT_LBA_RANGE	= 0x03,
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 4/5] genirq/affinity: allow driver's discontigous affinity set
  2019-06-24 14:29   ` Weiping Zhang
@ 2019-06-24 15:42     ` Thomas Gleixner
  -1 siblings, 0 replies; 48+ messages in thread
From: Thomas Gleixner @ 2019-06-24 15:42 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: axboe, Tejun Heo, Christoph Hellwig, bvanassche, keith.busch,
	minwoo.im.dev, linux-block, cgroups, linux-nvme, LKML, Ming Lei

On Mon, 24 Jun 2019, Weiping Zhang wrote:

> The driver may implement multiple affinity set, and some of
> are empty, for this case we just skip them.

Why? What's the point of creating empty sets? Just because is not a real
good justification.

Leaving the patch for Ming.

Thanks,

	tglx

> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
> ---
>  kernel/irq/affinity.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index f18cd5aa33e8..6d964fe0fbd8 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -295,6 +295,10 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
>  		unsigned int this_vecs = affd->set_size[i];
>  		int ret;
>  
> +		/* skip empty affinity set */
> +		if (this_vecs == 0)
> +			continue;
> +
>  		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
>  					       curvec, masks);
>  		if (ret) {
> -- 
> 2.14.1
> 
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 3/5] nvme-pci: rename module parameter write_queues to read_queues
  2019-06-24 14:29   ` Weiping Zhang
@ 2019-06-24 20:04     ` Minwoo Im
  -1 siblings, 0 replies; 48+ messages in thread
From: Minwoo Im @ 2019-06-24 20:04 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev,
	linux-block, cgroups, linux-nvme

On 19-06-24 22:29:19, Weiping Zhang wrote:
> Now nvme support three type hardware queues, read, poll and default,
> this patch rename write_queues to read_queues to set the number of
> read queues more explicitly. This patch alos is prepared for nvme
> support WRR(weighted round robin) that we can get the number of
> each queue type easily.
> 
> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>

Hello, Weiping.

Thanks for making this patch a separate one.  Actually I'd like to
hear whether the original purpose of this param can be changed or not.

I can see a log from Jens from when it got added here:
  Commit 3b6592f70ad7("nvme: utilize two queue maps, one for reads and
                       one for writes")
  It says:
  """
  NVMe does round-robin between queues by default, which means that
  sharing a queue map for both reads and writes can be problematic
  in terms of read servicing. It's much easier to flood the queue
  with writes and reduce the read servicing.
  """

So, I'd like to hear what other people think about this patch :)

Thanks,

> ---
>  drivers/nvme/host/pci.c | 22 ++++++++++------------
>  1 file changed, 10 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 5d84241f0214..a3c9bb72d90e 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -68,10 +68,10 @@ static int io_queue_depth = 1024;
>  module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
>  MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
>  
> -static int write_queues;
> -module_param(write_queues, int, 0644);
> -MODULE_PARM_DESC(write_queues,
> -	"Number of queues to use for writes. If not set, reads and writes "
> +static int read_queues;
> +module_param(read_queues, int, 0644);
> +MODULE_PARM_DESC(read_queues,
> +	"Number of queues to use for reads. If not set, reads and writes "
>  	"will share a queue set.");
>  
>  static int poll_queues;
> @@ -211,7 +211,7 @@ struct nvme_iod {
>  
>  static unsigned int max_io_queues(void)
>  {
> -	return num_possible_cpus() + write_queues + poll_queues;
> +	return num_possible_cpus() + read_queues + poll_queues;
>  }
>  
>  static unsigned int max_queue_count(void)
> @@ -2021,18 +2021,16 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
>  	 * If only one interrupt is available or 'write_queue' == 0, combine
>  	 * write and read queues.
>  	 *
> -	 * If 'write_queues' > 0, ensure it leaves room for at least one read
> +	 * If 'read_queues' > 0, ensure it leaves room for at least one write
>  	 * queue.
>  	 */
> -	if (!nrirqs) {
> +	if (!nrirqs || nrirqs == 1) {
>  		nrirqs = 1;
>  		nr_read_queues = 0;
> -	} else if (nrirqs == 1 || !write_queues) {
> -		nr_read_queues = 0;
> -	} else if (write_queues >= nrirqs) {
> -		nr_read_queues = 1;
> +	} else if (read_queues >= nrirqs) {
> +		nr_read_queues = nrirqs - 1;
>  	} else {
> -		nr_read_queues = nrirqs - write_queues;
> +		nr_read_queues = read_queues;
>  	}
>  
>  	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -- 
> 2.14.1
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 2/5] nvme: add get_ams for nvme_ctrl_ops
  2019-06-24 14:29   ` Weiping Zhang
@ 2019-06-24 20:12     ` Minwoo Im
  -1 siblings, 0 replies; 48+ messages in thread
From: Minwoo Im @ 2019-06-24 20:12 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev,
	linux-block, cgroups, linux-nvme

On 19-06-24 22:29:05, Weiping Zhang wrote:
> The get_ams() will return the AMS(Arbitration Mechanism Selected)
> from the driver.
> 
> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>

Hello, Weiping.

Sorry, but I don't really get what your point is here.  Could you please
elaborate this patch a little bit more?  The commit description just
says a function would just return the AMS from nowhere..

> ---
>  drivers/nvme/host/core.c | 9 ++++++++-
>  drivers/nvme/host/nvme.h | 1 +
>  drivers/nvme/host/pci.c  | 6 ++++++
>  include/linux/nvme.h     | 1 +
>  4 files changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index b2dd4e391f5c..4cb781e73123 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1943,6 +1943,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
>  	 */
>  	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
>  	int ret;
> +	u32 ams = NVME_CC_AMS_RR;
>  
>  	if (page_shift < dev_page_min) {
>  		dev_err(ctrl->device,
> @@ -1951,11 +1952,17 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
>  		return -ENODEV;
>  	}
>  
> +	/* get Arbitration Mechanism Selected */
> +	if (ctrl->ops->get_ams) {

I just wonder if this check is really needed, because this patch just
registers the function nvme_pci_get_ams() unconditionally.

> +		ctrl->ops->get_ams(ctrl, &ams);
> +		ams &= NVME_CC_AMS_MASK;
> +	}
> +
>  	ctrl->page_size = 1 << page_shift;
>  
>  	ctrl->ctrl_config = NVME_CC_CSS_NVM;
>  	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
> -	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
> +	ctrl->ctrl_config |= ams | NVME_CC_SHN_NONE;
>  	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
>  	ctrl->ctrl_config |= NVME_CC_ENABLE;
>  
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index ea45d7d393ad..9c7e9217f78b 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -369,6 +369,7 @@ struct nvme_ctrl_ops {
>  	void (*submit_async_event)(struct nvme_ctrl *ctrl);
>  	void (*delete_ctrl)(struct nvme_ctrl *ctrl);
>  	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
> +	void (*get_ams)(struct nvme_ctrl *ctrl, u32 *ams);

Can we just have a return value for the AMS value?

>  };
>  
>  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 189352081994..5d84241f0214 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2627,6 +2627,11 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
>  	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
>  }
>  
> +static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
> +{
> +	*ams = NVME_CC_AMS_RR;
> +}
> +
>  static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
>  	.name			= "pcie",
>  	.module			= THIS_MODULE,
> @@ -2638,6 +2643,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
>  	.free_ctrl		= nvme_pci_free_ctrl,
>  	.submit_async_event	= nvme_pci_submit_async_event,
>  	.get_address		= nvme_pci_get_address,
> +	.get_ams		= nvme_pci_get_ams,

Question: Do we really need this being added to nvme_ctrl_ops?

Also, if the 5th patch will make this function do much more than this,
it would be great if you describe that kind of situation in the description :)

>  };
>  
>  static int nvme_dev_map(struct nvme_dev *dev)
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index da5f696aec00..8f71451fc2fa 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -156,6 +156,7 @@ enum {
>  	NVME_CC_AMS_RR		= 0 << NVME_CC_AMS_SHIFT,
>  	NVME_CC_AMS_WRRU	= 1 << NVME_CC_AMS_SHIFT,
>  	NVME_CC_AMS_VS		= 7 << NVME_CC_AMS_SHIFT,
> +	NVME_CC_AMS_MASK	= 7 << NVME_CC_AMS_SHIFT,
>  	NVME_CC_SHN_NONE	= 0 << NVME_CC_SHN_SHIFT,
>  	NVME_CC_SHN_NORMAL	= 1 << NVME_CC_SHN_SHIFT,
>  	NVME_CC_SHN_ABRUPT	= 2 << NVME_CC_SHN_SHIFT,
> -- 
> 2.14.1
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/5] nvme: add support weighted round robin queue
  2019-06-24 14:29   ` Weiping Zhang
@ 2019-06-24 20:21     ` Minwoo Im
  -1 siblings, 0 replies; 48+ messages in thread
From: Minwoo Im @ 2019-06-24 20:21 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: axboe, tj, hch, bvanassche, keith.busch, minwoo.im.dev,
	linux-block, cgroups, linux-nvme

> @@ -2627,7 +2752,30 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
>  
>  static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
>  {
> -	*ams = NVME_CC_AMS_RR;
> +	/* if deivce doesn't support WRR, force reset wrr queues to 0 */
> +	if (!NVME_CAP_AMS_WRRU(ctrl->cap)) {
> +		wrr_low_queues = 0;
> +		wrr_medium_queues = 0;
> +		wrr_high_queues = 0;
> +		wrr_urgent_queues = 0;

Could we avoid resetting variables like this in a get_XXX() function?  I
guess it would be great if it just tried to get the value it is mainly
meant to return.
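
For instance (just a sketch of the idea, helper name made up, not a
tested patch), the clamping could live in its own step that runs before
CC is programmed, keeping get_ams() free of side effects:

	static void nvme_pci_clamp_wrr_params(struct nvme_ctrl *ctrl)
	{
		if (!NVME_CAP_AMS_WRRU(ctrl->cap))
			wrr_low_queues = wrr_medium_queues =
				wrr_high_queues = wrr_urgent_queues = 0;
	}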

> +
> +		*ams = NVME_CC_AMS_RR;
> +		ctrl->wrr_enabled = false;
> +		return;
> +	}
> +
> +	/*
> +	 * if device support WRR, check wrr queue count, all wrr queues are
> +	 * 0, don't enable device's WRR.
> +	 */
> +	if ((wrr_low_queues + wrr_medium_queues + wrr_high_queues +
> +				wrr_urgent_queues) > 0) {
> +		*ams = NVME_CC_AMS_WRRU;
> +		ctrl->wrr_enabled = true;
> +	} else {
> +		*ams = NVME_CC_AMS_RR;
> +		ctrl->wrr_enabled = false;

These two lines can be merged into the condition above:

	if (!NVME_CAP_AMS_WRRU(ctrl->cap) ||
		wrr_low_queues + wrr_medium_queues + wrr_high_queues +
			wrr_urgent_queues <= 0) {
		*ams = NVME_CC_AMS_RR;
		ctrl->wrr_enabled = false;
	}

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 4/5] genirq/affinity: allow driver's discontigous affinity set
  2019-06-24 15:42     ` Thomas Gleixner
@ 2019-06-25  2:14       ` Ming Lei
  -1 siblings, 0 replies; 48+ messages in thread
From: Ming Lei @ 2019-06-25  2:14 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Weiping Zhang, axboe, Tejun Heo, Christoph Hellwig, bvanassche,
	keith.busch, minwoo.im.dev, linux-block, cgroups, linux-nvme,
	LKML

Hi Thomas,

On Mon, Jun 24, 2019 at 05:42:39PM +0200, Thomas Gleixner wrote:
> On Mon, 24 Jun 2019, Weiping Zhang wrote:
> 
> > The driver may implement multiple affinity set, and some of
> > are empty, for this case we just skip them.
> 
> Why? What's the point of creating empty sets? Just because is not a real
> good justification.

Patch 5 will add 4 new sets to support NVMe's weighted round robin
arbitration. It can be a headache to manage so many irq sets (the total
can now become 6) dynamically, since the size of any of the new 4 sets
can be zero, so each particular set is assigned a static index to avoid
that management trouble; an empty set will then be seen by
irq_create_affinity_masks().

So it looks like skipping the empty sets makes sense, because the API
becomes easier to use than before.
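
To make that concrete, here is an illustration only (the exact set
layout is defined by the nvme patch, not here): with static indices the
driver always fills the whole set array, so some entries may
legitimately be zero-sized:

	struct irq_affinity affd = {
		.nr_sets  = 6,
		/* e.g. default/read/low/medium/high/urgent */
		.set_size = { 4, 2, 0, 1, 0, 0 },
	};

irq_create_affinity_masks() then simply skips the zero-sized entries
instead of failing.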

Thanks,
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 4/5] genirq/affinity: allow driver's discontigous affinity set
  2019-06-25  2:14       ` Ming Lei
@ 2019-06-25  6:13         ` Thomas Gleixner
  -1 siblings, 0 replies; 48+ messages in thread
From: Thomas Gleixner @ 2019-06-25  6:13 UTC (permalink / raw)
  To: Ming Lei
  Cc: Weiping Zhang, axboe, Tejun Heo, Christoph Hellwig, bvanassche,
	keith.busch, minwoo.im.dev, linux-block, cgroups, linux-nvme,
	LKML

MIng,

On Tue, 25 Jun 2019, Ming Lei wrote:
> On Mon, Jun 24, 2019 at 05:42:39PM +0200, Thomas Gleixner wrote:
> > On Mon, 24 Jun 2019, Weiping Zhang wrote:
> > 
> > > The driver may implement multiple affinity set, and some of
> > > are empty, for this case we just skip them.
> > 
> > Why? What's the point of creating empty sets? Just because is not a real
> > good justification.
> 
> Patch 5 will add 4 new sets for supporting NVMe's weighted round robin
> arbitration. It can be a headache to manage so many irq sets(now the total
> sets can become 6) dynamically since size of anyone in the new 4 sets can
> be zero, so each particular set is assigned one static index for avoiding
> the management trouble, then empty set will be seen by
> irq_create_affinity_masks().
> 
> So looks skipping the empty set makes sense because the API will become
> easier to use than before.

That makes sense, but at least some of that information wants to be in the
change log and not some uninformative description of what the patch does.

I was not Cc'ed on the rest of the patches so I had exactly zero context.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 2/5] nvme: add get_ams for nvme_ctrl_ops
  2019-06-24 20:12     ` Minwoo Im
@ 2019-06-25 14:46       ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-06-25 14:46 UTC (permalink / raw)
  To: Minwoo Im
  Cc: Weiping Zhang, Jens Axboe, Tejun Heo, Christoph Hellwig,
	Bart Van Assche, keith.busch, linux-block, cgroups, linux-nvme

Minwoo Im <minwoo.im.dev@gmail.com> wrote on Tue, Jun 25, 2019 at 6:01 AM:
>
> On 19-06-24 22:29:05, Weiping Zhang wrote:
> > The get_ams() will return the AMS(Arbitration Mechanism Selected)
> > from the driver.
> >
> > Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
>
> Hello, Weiping.
>
> Sorry, but I don't really get what your point is here.  Could you please
> elaborate this patch a little bit more?  The commit description just
> says a function would just return the AMS from nowhere..
>
I will add a more detailed description for this operation in the commit
message. When we enable the nvme controller, we need to know the correct
AMS setting.

There are two cases for NVME_CC_AMS_RR:
1. The nvme controller doesn't support WRR, so we set ams to NVME_CC_AMS_RR.
2. The nvme controller supports WRR, but the user did not want to enable
it, for example by setting all wrr_xxx_queues to 0.

We only set ams to NVME_CC_AMS_WRRU if the nvme controller supports WRR
and the user wants to enable it explicitly.

nvme_enable_ctrl() is a common function for nvme-pci, nvme-rdma and
nvme-tcp; it needs to know the correct AMS setting from each nvme driver
through this operation.

> > ---
> >  drivers/nvme/host/core.c | 9 ++++++++-
> >  drivers/nvme/host/nvme.h | 1 +
> >  drivers/nvme/host/pci.c  | 6 ++++++
> >  include/linux/nvme.h     | 1 +
> >  4 files changed, 16 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index b2dd4e391f5c..4cb781e73123 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -1943,6 +1943,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
> >        */
> >       unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
> >       int ret;
> > +     u32 ams = NVME_CC_AMS_RR;
> >
> >       if (page_shift < dev_page_min) {
> >               dev_err(ctrl->device,
> > @@ -1951,11 +1952,17 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
> >               return -ENODEV;
> >       }
> >
> > +     /* get Arbitration Mechanism Selected */
> > +     if (ctrl->ops->get_ams) {
>
> I just wonder if this check will be valid because this patch just
> register the function nvme_pci_get_ams() without any conditions.
The nvme-pci, nvme-rdma, etc. drivers should make sure that the ams is
valid, for example by checking CAP.AMS and other conditions.
>
> > +             ctrl->ops->get_ams(ctrl, &ams);
> > +             ams &= NVME_CC_AMS_MASK;
> > +     }
> > +
> >       ctrl->page_size = 1 << page_shift;
> >
> >       ctrl->ctrl_config = NVME_CC_CSS_NVM;
> >       ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
> > -     ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
> > +     ctrl->ctrl_config |= ams | NVME_CC_SHN_NONE;
> >       ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
> >       ctrl->ctrl_config |= NVME_CC_ENABLE;
> >
> > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > index ea45d7d393ad..9c7e9217f78b 100644
> > --- a/drivers/nvme/host/nvme.h
> > +++ b/drivers/nvme/host/nvme.h
> > @@ -369,6 +369,7 @@ struct nvme_ctrl_ops {
> >       void (*submit_async_event)(struct nvme_ctrl *ctrl);
> >       void (*delete_ctrl)(struct nvme_ctrl *ctrl);
> >       int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
> > +     void (*get_ams)(struct nvme_ctrl *ctrl, u32 *ams);
>
> Can we just have a return value for the AMS value?
Both of these methods are acceptable to me.
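
A rough sketch of the return-value variant (assuming wrr_enabled has
already been decided before CC is programmed) could look like:

	static u32 nvme_pci_get_ams(struct nvme_ctrl *ctrl)
	{
		return ctrl->wrr_enabled ? NVME_CC_AMS_WRRU : NVME_CC_AMS_RR;
	}
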
>
> >  };
> >
> >  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index 189352081994..5d84241f0214 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2627,6 +2627,11 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
> >       return snprintf(buf, size, "%s", dev_name(&pdev->dev));
> >  }
> >
> > +static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
> > +{
> > +     *ams = NVME_CC_AMS_RR;
> > +}
> > +
> >  static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
> >       .name                   = "pcie",
> >       .module                 = THIS_MODULE,
> > @@ -2638,6 +2643,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
> >       .free_ctrl              = nvme_pci_free_ctrl,
> >       .submit_async_event     = nvme_pci_submit_async_event,
> >       .get_address            = nvme_pci_get_address,
> > +     .get_ams                = nvme_pci_get_ams,
>
> Question: Do we really need this being added to nvme_ctrl_ops?
>
> Also If 5th patch will make this function much more than this, then it
> would be great if you describe this kind of situation in description :)
>
> >  };
> >
> >  static int nvme_dev_map(struct nvme_dev *dev)
> > diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> > index da5f696aec00..8f71451fc2fa 100644
> > --- a/include/linux/nvme.h
> > +++ b/include/linux/nvme.h
> > @@ -156,6 +156,7 @@ enum {
> >       NVME_CC_AMS_RR          = 0 << NVME_CC_AMS_SHIFT,
> >       NVME_CC_AMS_WRRU        = 1 << NVME_CC_AMS_SHIFT,
> >       NVME_CC_AMS_VS          = 7 << NVME_CC_AMS_SHIFT,
> > +     NVME_CC_AMS_MASK        = 7 << NVME_CC_AMS_SHIFT,
> >       NVME_CC_SHN_NONE        = 0 << NVME_CC_SHN_SHIFT,
> >       NVME_CC_SHN_NORMAL      = 1 << NVME_CC_SHN_SHIFT,
> >       NVME_CC_SHN_ABRUPT      = 2 << NVME_CC_SHN_SHIFT,
> > --
> > 2.14.1
> >

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 3/5] nvme-pci: rename module parameter write_queues to read_queues
  2019-06-24 20:04     ` Minwoo Im
@ 2019-06-25 14:48       ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-06-25 14:48 UTC (permalink / raw)
  To: Minwoo Im
  Cc: Weiping Zhang, Jens Axboe, Tejun Heo, Christoph Hellwig,
	Bart Van Assche, keith.busch, linux-block, cgroups, linux-nvme

Minwoo Im <minwoo.im.dev@gmail.com> wrote on Tue, Jun 25, 2019 at 6:00 AM:
>
> On 19-06-24 22:29:19, Weiping Zhang wrote:
> > Now nvme support three type hardware queues, read, poll and default,
> > this patch rename write_queues to read_queues to set the number of
> > read queues more explicitly. This patch alos is prepared for nvme
> > support WRR(weighted round robin) that we can get the number of
> > each queue type easily.
> >
> > Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
>
> Hello, Weiping.
>
> Thanks for making this patch as a separated one.  Actually I'd like to
> hear about if the origin purpose of this param can be changed or not.
>
> I can see a log from Jens when it gets added her:
>   Commit 3b6592f70ad7("nvme: utilize two queue maps, one for reads and
>                        one for writes")
>   It says:
>   """
>   NVMe does round-robin between queues by default, which means that
>   sharing a queue map for both reads and writes can be problematic
>   in terms of read servicing. It's much easier to flood the queue
>   with writes and reduce the read servicing.
>   """
>
> So, I'd like to hear what other people think about this patch :)
>

This patch does not change the original behavior: if we set read_queues
greater than 0, read and write requests will use different tagset maps,
so they will use different hardware queues. For example, with 8 io
interrupts and read_queues=2, reads get 2 queues and writes get the
remaining 6; with read_queues=0, reads and writes share the default map.

> Thanks,
>
> > ---
> >  drivers/nvme/host/pci.c | 22 ++++++++++------------
> >  1 file changed, 10 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index 5d84241f0214..a3c9bb72d90e 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -68,10 +68,10 @@ static int io_queue_depth = 1024;
> >  module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
> >  MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
> >
> > -static int write_queues;
> > -module_param(write_queues, int, 0644);
> > -MODULE_PARM_DESC(write_queues,
> > -     "Number of queues to use for writes. If not set, reads and writes "
> > +static int read_queues;
> > +module_param(read_queues, int, 0644);
> > +MODULE_PARM_DESC(read_queues,
> > +     "Number of queues to use for reads. If not set, reads and writes "
> >       "will share a queue set.");
> >
> >  static int poll_queues;
> > @@ -211,7 +211,7 @@ struct nvme_iod {
> >
> >  static unsigned int max_io_queues(void)
> >  {
> > -     return num_possible_cpus() + write_queues + poll_queues;
> > +     return num_possible_cpus() + read_queues + poll_queues;
> >  }
> >
> >  static unsigned int max_queue_count(void)
> > @@ -2021,18 +2021,16 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
> >        * If only one interrupt is available or 'write_queue' == 0, combine
> >        * write and read queues.
> >        *
> > -      * If 'write_queues' > 0, ensure it leaves room for at least one read
> > +      * If 'read_queues' > 0, ensure it leaves room for at least one write
> >        * queue.
> >        */
> > -     if (!nrirqs) {
> > +     if (!nrirqs || nrirqs == 1) {
> >               nrirqs = 1;
> >               nr_read_queues = 0;
> > -     } else if (nrirqs == 1 || !write_queues) {
> > -             nr_read_queues = 0;
> > -     } else if (write_queues >= nrirqs) {
> > -             nr_read_queues = 1;
> > +     } else if (read_queues >= nrirqs) {
> > +             nr_read_queues = nrirqs - 1;
> >       } else {
> > -             nr_read_queues = nrirqs - write_queues;
> > +             nr_read_queues = read_queues;
> >       }
> >
> >       dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> > --
> > 2.14.1
> >

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 4/5] genirq/affinity: allow driver's discontigous affinity set
  2019-06-25  6:13         ` Thomas Gleixner
@ 2019-06-25 14:55           ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-06-25 14:55 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Ming Lei, Weiping Zhang, Jens Axboe, Tejun Heo,
	Christoph Hellwig, Bart Van Assche, keith.busch, Minwoo Im,
	linux-block, cgroups, linux-nvme, LKML

Thomas Gleixner <tglx@linutronix.de> wrote on Tue, Jun 25, 2019 at 3:36 PM:
>
> Ming,
>
> On Tue, 25 Jun 2019, Ming Lei wrote:
> > On Mon, Jun 24, 2019 at 05:42:39PM +0200, Thomas Gleixner wrote:
> > > On Mon, 24 Jun 2019, Weiping Zhang wrote:
> > >
> > > > The driver may implement multiple affinity sets, and some of them
> > > > may be empty; in that case we just skip them.
> > >
> > > Why? What's the point of creating empty sets? Just because is not a real
> > > good justification.
> >
> > Patch 5 will add 4 new sets for supporting NVMe's weighted round robin
> > arbitration. It can be a headache to manage so many irq sets (the total
> > can now become 6) dynamically, since the size of any of the 4 new sets can
> > be zero, so each particular set is assigned one static index to avoid
> > that management trouble; an empty set will then be seen by
> > irq_create_affinity_masks().
> >
> > So it looks like skipping the empty sets makes sense, because the API
> > becomes easier to use than before.
>
Hello Ming,
Thanks for your detailed explanation.
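
To make the empty-set case concrete, here is a rough sketch (not the actual
patch) of how the six sets might be filled in once the four WRR queue types
exist. It assumes the series raises the maximum number of interrupt affinity
sets from four to six; the function name and all sizes below are made-up,
illustrative values:

	static void example_nvme_calc_irq_sets(struct irq_affinity *affd,
					       unsigned int nrirqs)
	{
		/*
		 * The real code would split nrirqs across the sets; fixed
		 * numbers are used here only to show the layout.
		 */
		affd->nr_sets = 6;
		affd->set_size[0] = 2;	/* default (write) queues */
		affd->set_size[1] = 2;	/* read queues */
		affd->set_size[2] = 1;	/* WRR low */
		affd->set_size[3] = 0;	/* WRR medium: empty, index kept */
		affd->set_size[4] = 1;	/* WRR high */
		affd->set_size[5] = 0;	/* WRR urgent: empty, index kept */
	}

With patch 4/5, the zero-sized entries are simply skipped by
irq_create_affinity_masks() instead of being treated as an error, so the
driver can keep one fixed index per queue type.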

> That makes sense, but at least some of that information wants to be in the
> change log and not some uninformative description of what the patch does.
>
> I was not Cc'ed on the rest of the patches so I had exactly zero context.
>
Hello Thomas,

I am sorry I didn't Cc you on the full patchset. I will add a more detailed
description to the commit message in V4.

> Thanks,
>
>         tglx

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/5] nvme: add support weighted round robin queue
  2019-06-24 20:21     ` Minwoo Im
@ 2019-06-25 15:06       ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-06-25 15:06 UTC (permalink / raw)
  To: Minwoo Im
  Cc: Weiping Zhang, Jens Axboe, Tejun Heo, Christoph Hellwig,
	Bart Van Assche, keith.busch, linux-block, cgroups, linux-nvme

Minwoo Im <minwoo.im.dev@gmail.com> wrote on Tue, Jun 25, 2019 at 6:01 AM:
>
> > @@ -2627,7 +2752,30 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
> >
> >  static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
> >  {
> > -     *ams = NVME_CC_AMS_RR;
> > +     /* if deivce doesn't support WRR, force reset wrr queues to 0 */
> > +     if (!NVME_CAP_AMS_WRRU(ctrl->cap)) {
> > +             wrr_low_queues = 0;
> > +             wrr_medium_queues = 0;
> > +             wrr_high_queues = 0;
> > +             wrr_urgent_queues = 0;
>
> Could we avoid resetting variables like this in a get_XXX() function?  I
> guess it would be great if it just tried to get the value it is mainly
> focused on.

I think it's OK: when we use these variables in nvme_setup_irqs(), we can
check ctrl->wrr_enabled and, if it is false, skip all wrr_xxx_queues.
In other words, if ctrl->wrr_enabled is true, we have at least one WRR queue.
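
A minimal sketch of that check, assuming the ctrl->wrr_enabled flag and the
wrr_*_queues module parameters from this series (the helper name is made up
for illustration only):

	/* hypothetical helper, not part of the posted patches */
	static unsigned int nvme_wrr_queue_count(struct nvme_ctrl *ctrl)
	{
		/* device has no WRR support, or WRR was left disabled */
		if (!ctrl->wrr_enabled)
			return 0;

		return wrr_low_queues + wrr_medium_queues +
		       wrr_high_queues + wrr_urgent_queues;
	}

That way nvme_setup_irqs() never has to look at the raw module parameters
when WRR is off.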

>
> > +
> > +             *ams = NVME_CC_AMS_RR;
> > +             ctrl->wrr_enabled = false;
> > +             return;
> > +     }
> > +
> > +     /*
> > +      * if device support WRR, check wrr queue count, all wrr queues are
> > +      * 0, don't enable device's WRR.
> > +      */
> > +     if ((wrr_low_queues + wrr_medium_queues + wrr_high_queues +
> > +                             wrr_urgent_queues) > 0) {
> > +             *ams = NVME_CC_AMS_WRRU;
> > +             ctrl->wrr_enabled = true;
> > +     } else {
> > +             *ams = NVME_CC_AMS_RR;
> > +             ctrl->wrr_enabled = false;
>
> These two lines can be merged into the condition above:
That's OK; I will also merge the comments for the NVME_CC_AMS_RR case.
>
>         if (!NVME_CAP_AMS_WRRU(ctrl->cap) ||
>                 wrr_low_queues + wrr_medium_queues + wrr_high_queues +
>                         wrr_urgent_queues <= 0) {
>                 *ams = NVME_CC_AMS_RR;
>                 ctrl->wrr_enabled = false;
>         }
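
Folding that in, the whole callback could end up looking roughly like this
(a sketch based on the code quoted above, not the actual v4 patch):

	static void nvme_pci_get_ams(struct nvme_ctrl *ctrl, u32 *ams)
	{
		/* fall back to plain round-robin when WRR is unsupported or unused */
		if (!NVME_CAP_AMS_WRRU(ctrl->cap) ||
		    wrr_low_queues + wrr_medium_queues + wrr_high_queues +
		    wrr_urgent_queues <= 0) {
			*ams = NVME_CC_AMS_RR;
			ctrl->wrr_enabled = false;
			return;
		}

		*ams = NVME_CC_AMS_WRRU;
		ctrl->wrr_enabled = true;
	}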

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 3/5] nvme-pci: rename module parameter write_queues to read_queues
  2019-06-25 14:48       ` Weiping Zhang
@ 2019-06-26 20:27         ` Minwoo Im
  -1 siblings, 0 replies; 48+ messages in thread
From: Minwoo Im @ 2019-06-26 20:27 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: Weiping Zhang, Jens Axboe, Tejun Heo, Christoph Hellwig,
	Bart Van Assche, keith.busch, linux-block, cgroups, linux-nvme,
	Minwoo Im

On 19-06-25 22:48:57, Weiping Zhang wrote:
Minwoo Im <minwoo.im.dev@gmail.com> wrote on Tue, Jun 25, 2019 at 6:00 AM:
> >
> > On 19-06-24 22:29:19, Weiping Zhang wrote:
> > > Now nvme supports three types of hardware queues: read, poll and default.
> > > This patch renames write_queues to read_queues to set the number of
> > > read queues more explicitly. This patch also prepares for nvme
> > > WRR (weighted round robin) support, so that we can get the number of
> > > each queue type easily.
> > >
> > > Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
> >
> > Hello, Weiping.
> >
> > Thanks for making this patch a separate one.  Actually I'd like to
> > hear whether the original purpose of this param can be changed or not.
> >
> > I can see a log from Jens from when it got added here:
> >   Commit 3b6592f70ad7("nvme: utilize two queue maps, one for reads and
> >                        one for writes")
> >   It says:
> >   """
> >   NVMe does round-robin between queues by default, which means that
> >   sharing a queue map for both reads and writes can be problematic
> >   in terms of read servicing. It's much easier to flood the queue
> >   with writes and reduce the read servicing.
> >   """
> >
> > So, I'd like to hear what other people think about this patch :)
> >
> 
> This patch does not change the original behavior: if we set read_queues
> greater than 0, read and write requests will use different tagset maps,
> so they will use different hardware queues.
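
As a worked example with made-up numbers: with 8 usable interrupt vectors
(nrirqs = 8) and read_queues = 3, nvme_calc_irq_sets() ends up with
nr_read_queues = 3, so HCTX_TYPE_DEFAULT gets 5 vectors and HCTX_TYPE_READ
gets 3; with read_queues = 0 the final else branch yields nr_read_queues = 0,
so reads and writes keep sharing the default set, matching the old
write_queues == 0 behaviour.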

Yes, that's why I want to hear some comments on this change from other
people.  I'm not against this change, though.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/5] nvme: add support weighted round robin queue
  2019-06-24 14:29   ` Weiping Zhang
@ 2019-06-27 10:37     ` Minwoo Im
  -1 siblings, 0 replies; 48+ messages in thread
From: Minwoo Im @ 2019-06-27 10:37 UTC (permalink / raw)
  To: hch, keith.busch, sagi; +Cc: linux-block, linux-nvme

Hi, Maintainers

Would you please share some thoughts on this patch?  I like this WRR
feature addition to the driver, so I really want to hear something
from you.

Thanks,

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/5] nvme: add support weighted round robin queue
  2019-06-27 10:37     ` Minwoo Im
@ 2019-06-27 11:03       ` Christoph Hellwig
  -1 siblings, 0 replies; 48+ messages in thread
From: Christoph Hellwig @ 2019-06-27 11:03 UTC (permalink / raw)
  To: Minwoo Im; +Cc: hch, keith.busch, sagi, linux-block, linux-nvme

On Thu, Jun 27, 2019 at 07:37:19PM +0900, Minwoo Im wrote:
> Hi, Maintainers
> 
> Would you guys please give some thoughts about this patch?  I like this
> feature WRR addition to the driver so I really want to hear something
> from you guys.

We are at the end of the merge window with tons of things to sort out.
A giant feature series with a lot of impact is not at the top of the
priority list right now.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/5] nvme: add support weighted round robin queue
  2019-06-27 11:03       ` Christoph Hellwig
@ 2019-06-28 15:57         ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-06-28 15:57 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Minwoo Im, keith.busch, sagi, linux-block, linux-nvme

Christoph Hellwig <hch@lst.de> wrote on Thu, Jun 27, 2019 at 7:06 PM:
>
> On Thu, Jun 27, 2019 at 07:37:19PM +0900, Minwoo Im wrote:
> > Hi, Maintainers
> >
> > Would you guys please give some thoughts about this patch?  I like this
> > feature WRR addition to the driver so I really want to hear something
> > from you guys.
>
> We are at the end of the merge window with tons of things to sort out.
> A giant feature series with a lot of impact is not at the top of the
> priority list right now.

Hi Christoph,

There is some feedback on V3 already; I really want to get some more feedback
from you and other people, and then I will post V4.

So please give some comments on V3 at your convenience after this merge window.

Thanks a ton
Weiping

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/5] nvme: add support weighted round robin queue
  2019-06-28 15:57         ` Weiping Zhang
@ 2019-07-10 14:20           ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-07-10 14:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Minwoo Im, keith.busch, sagi, linux-block, linux-nvme

Weiping Zhang <zwp10758@gmail.com> wrote on Fri, Jun 28, 2019 at 11:57 PM:
>
> Christoph Hellwig <hch@lst.de> wrote on Thu, Jun 27, 2019 at 7:06 PM:
> >
> > On Thu, Jun 27, 2019 at 07:37:19PM +0900, Minwoo Im wrote:
> > > Hi, Maintainers
> > >
> > > Would you guys please give some thoughts about this patch?  I like this
> > > feature WRR addition to the driver so I really want to hear something
> > > from you guys.
> >
> > We are at the end of the merge window with tons of things to sort out.
> > A giant feature series with a lot of impact is not at the top of the
> > priority list right now.
>
> Hi Christoph,
>
> There are some feedback in V3, I really want to get some more feedback from you
> and other people, at that time I post V4.
>
> So please give some comments for V3 at your convenience after this merge window.
>
Hi Christoph,

Ping

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 3/5] nvme-pci: rename module parameter write_queues to read_queues
  2019-06-24 14:29   ` Weiping Zhang
  (?)
  (?)
@ 2019-07-14 11:36   ` Minwoo Im
  -1 siblings, 0 replies; 48+ messages in thread
From: Minwoo Im @ 2019-07-14 11:36 UTC (permalink / raw)



Hi All,

Could you please give some advice on this patch?  I have been
confused about why we need to use write_queues even though we have
HCTX_TYPE_READ instead of HCTX_TYPE_WRITE in the block layer.

Thanks,

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 1/5] block: add weighted round robin for blkcgroup
  2019-06-24 14:28   ` Weiping Zhang
@ 2019-07-18 13:59     ` Tejun Heo
  -1 siblings, 0 replies; 48+ messages in thread
From: Tejun Heo @ 2019-07-18 13:59 UTC (permalink / raw)
  To: axboe, hch, bvanassche, keith.busch, minwoo.im.dev, linux-block,
	cgroups, linux-nvme

Hello, Weiping.

On Mon, Jun 24, 2019 at 10:28:51PM +0800, Weiping Zhang wrote:
> +static const char *blk_wrr_name[BLK_WRR_COUNT] = {
> +	[BLK_WRR_NONE]		= "none",
> +	[BLK_WRR_LOW]		= "low",
> +	[BLK_WRR_MEDIUM]	= "medium",
> +	[BLK_WRR_HIGH]		= "high",
> +	[BLK_WRR_URGENT]	= "urgent",
> +};

cgroup controllers must be fully hierarchical which the proposed
implementation isn't.  While it can be made hierarchical, there's only
so much one can do if there are only five priority levels.

Can you please take a look at the following?

  http://lkml.kernel.org/r/20190710205128.1316483-1-tj@kernel.org

In comparison, I'm having a bit of a hard time seeing the benefits of
this approach.  In addition to the finite level limitation, the actual
WRR behavior would be device dependent and what each level means is
likely to fluctuate depending on the workload and device model.

I wonder whether WRR is something more valuable to help internal queue
management rather than being exposed to userspace directly.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 1/5] block: add weighted round robin for blkcgroup
  2019-07-18 13:59     ` Tejun Heo
@ 2019-07-23 14:29       ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-07-23 14:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jens Axboe, Christoph Hellwig, Bart Van Assche, keith.busch,
	Minwoo Im, linux-block, cgroups, linux-nvme

Tejun Heo <tj@kernel.org> wrote on Thu, Jul 18, 2019 at 10:00 PM:
>
> Hello, Weiping.
>
> On Mon, Jun 24, 2019 at 10:28:51PM +0800, Weiping Zhang wrote:
> > +static const char *blk_wrr_name[BLK_WRR_COUNT] = {
> > +     [BLK_WRR_NONE]          = "none",
> > +     [BLK_WRR_LOW]           = "low",
> > +     [BLK_WRR_MEDIUM]        = "medium",
> > +     [BLK_WRR_HIGH]          = "high",
> > +     [BLK_WRR_URGENT]        = "urgent",
> > +};
>
Hello Tejun,

> cgroup controllers must be fully hierarchical which the proposed
> implementation isn't.  While it can be made hierarchical, there's only
> so much one can do if there are only five priority levels.
>

These priorities are fully mapped to the NVMe specification, except WRR_NONE.
Weighted Round Robin is only supported by some NVMe devices, not all of them.
If you think the name blkio.wrr is too generic for the block layer, I'd like
to rename it to blkio.nvme.wrr. This patchset implements a simple interface
for the user; if users want to use this feature, they should know whether the
QoS of the WRR provided by the NVMe device is acceptable for their
applications. The NVMe WRR is a simple and useful feature, and I want to give
users one more option when they select a proper IO isolation policy.
It's not a general IO isolation method like blkio.throttle or iocost; it just
implements a simple mapping between applications and NVMe hardware submission
queues, without adding any extra IO statistics in the block layer. The weights
of high, medium and low, and the arbitration burst, can be changed by an nvme
set-feature command, but this patchset does not support that yet; it will be
added in the future.
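
For reference, in the NVMe specification this knob is the Arbitration feature
(Feature Identifier 01h), whose command dword packs the arbitration burst
together with the low-, medium- and high-priority weights, so the weights
could later be retuned at runtime with a set-features command without
changing the cgroup-to-queue mapping itself.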

> Can you please take a look at the following?
>
>   http://lkml.kernel.org/r/20190710205128.1316483-1-tj@kernel.org
>
> In comparison, I'm having a bit of hard time seeing the benefits of
> this approach.  In addition to the finite level limitation, the actual
> WRR behavior would be device dependent and what each level means is
> likely to fluctuate depending on the workload and device model.
>
From the test results (sequential and random), it seems the high priority
class can get more bps/iops than the lower priorities. If the device cannot
guarantee IO latency when a mixture of IOs is issued to it, then I think,
for WRR, the software should tune the weights of high, medium and low and
the arbitration burst, which may provide a more stable latency, similar to
what iocost does (tuning the overall IO issue rate).

> I wonder whether WRR is something more valuable to help internal queue
> management rather than being exposed to userspace directly.
>
> Thanks.
>
> --
> tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/5] nvme: add support weighted round robin queue
  2019-07-10 14:20           ` Weiping Zhang
@ 2019-07-29 10:22             ` Weiping Zhang
  -1 siblings, 0 replies; 48+ messages in thread
From: Weiping Zhang @ 2019-07-29 10:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Minwoo Im, keith.busch, sagi, linux-block, linux-nvme

Weiping Zhang <zwp10758@gmail.com> wrote on Wed, Jul 10, 2019 at 10:20 PM:
>
> Weiping Zhang <zwp10758@gmail.com> wrote on Fri, Jun 28, 2019 at 11:57 PM:
> >
> > Christoph Hellwig <hch@lst.de> wrote on Thu, Jun 27, 2019 at 7:06 PM:
> > >
> > > On Thu, Jun 27, 2019 at 07:37:19PM +0900, Minwoo Im wrote:
> > > > Hi, Maintainers
> > > >
> > > > Would you guys please give some thoughts about this patch?  I like this
> > > > feature WRR addition to the driver so I really want to hear something
> > > > from you guys.
> > >
> > > We are at the end of the merge window with tons of things to sort out.
> > > A giant feature series with a lot of impact is not at the top of the
> > > priority list right now.
> >
> > Hi Christoph,
> >
> > There are some feedback in V3, I really want to get some more feedback from you
> > and other people, at that time I post V4.
> >
> > So please give some comments for V3 at your convenience after this merge window.
> >
> Hi Christoph,
>
> Ping

Hi Christoph,

Ping

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2019-07-29 10:23 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1561385989.git.zhangweiping@didiglobal.com>
2019-06-24 14:28 ` [PATCH v3 1/5] block: add weighted round robin for blkcgroup Weiping Zhang
2019-06-24 14:28   ` Weiping Zhang
2019-07-18 13:59   ` Tejun Heo
2019-07-18 13:59     ` Tejun Heo
2019-07-23 14:29     ` Weiping Zhang
2019-07-23 14:29       ` Weiping Zhang
2019-06-24 14:29 ` [PATCH v3 2/5] nvme: add get_ams for nvme_ctrl_ops Weiping Zhang
2019-06-24 14:29   ` Weiping Zhang
2019-06-24 20:12   ` Minwoo Im
2019-06-24 20:12     ` Minwoo Im
2019-06-25 14:46     ` Weiping Zhang
2019-06-25 14:46       ` Weiping Zhang
2019-06-24 14:29 ` [PATCH v3 3/5] nvme-pci: rename module parameter write_queues to read_queues Weiping Zhang
2019-06-24 14:29   ` Weiping Zhang
2019-06-24 20:04   ` Minwoo Im
2019-06-24 20:04     ` Minwoo Im
2019-06-25 14:48     ` Weiping Zhang
2019-06-25 14:48       ` Weiping Zhang
2019-06-26 20:27       ` Minwoo Im
2019-06-26 20:27         ` Minwoo Im
2019-07-14 11:36   ` Minwoo Im
2019-06-24 14:29 ` [PATCH v3 4/5] genirq/affinity: allow driver's discontigous affinity set Weiping Zhang
2019-06-24 14:29   ` Weiping Zhang
2019-06-24 14:29   ` Weiping Zhang
2019-06-24 15:42   ` Thomas Gleixner
2019-06-24 15:42     ` Thomas Gleixner
2019-06-25  2:14     ` Ming Lei
2019-06-25  2:14       ` Ming Lei
2019-06-25  6:13       ` Thomas Gleixner
2019-06-25  6:13         ` Thomas Gleixner
2019-06-25 14:55         ` Weiping Zhang
2019-06-25 14:55           ` Weiping Zhang
2019-06-24 14:29 ` [PATCH v3 5/5] nvme: add support weighted round robin queue Weiping Zhang
2019-06-24 14:29   ` Weiping Zhang
2019-06-24 20:21   ` Minwoo Im
2019-06-24 20:21     ` Minwoo Im
2019-06-25 15:06     ` Weiping Zhang
2019-06-25 15:06       ` Weiping Zhang
2019-06-27 10:37   ` Minwoo Im
2019-06-27 10:37     ` Minwoo Im
2019-06-27 11:03     ` Christoph Hellwig
2019-06-27 11:03       ` Christoph Hellwig
2019-06-28 15:57       ` Weiping Zhang
2019-06-28 15:57         ` Weiping Zhang
2019-07-10 14:20         ` Weiping Zhang
2019-07-10 14:20           ` Weiping Zhang
2019-07-29 10:22           ` Weiping Zhang
2019-07-29 10:22             ` Weiping Zhang
