linux-block.vger.kernel.org archive mirror
* [RFC PATCH v2 0/4] blkcg: add support weighted round robin tagset map
@ 2019-06-16 10:14 Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 1/4] block: add weighted round robin for blkcgroup Weiping Zhang
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Weiping Zhang @ 2019-06-16 10:14 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche; +Cc: linux-block, cgroups

This series tries to add Weighted Round Robin (WRR) support for blkcg, and
adds some module parameters to enable WRR in the nvme driver.

The first patch adds a WRR infrastructure for the block cgroup.
The second patch adds a demo WRR implementation for null_blk to simulate the NVMe spec.
The last two patches enable WRR for the nvme driver.
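
As a rough illustration of the null_blk demo (patch 2), a possible
invocation might look like the following; the parameter names come from
that patch, the queue counts are only an example:

  # irqmode=3 selects the new NULL_IRQ_WRR completion mode
  modprobe null_blk irqmode=3 submit_queues=8 \
      submit_queues_read=1 submit_queues_wrr_low=2 \
      submit_queues_wrr_medium=2 submit_queues_wrr_high=2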

For the nvme part, rename write_queues to read_queues so that we can add
wrr_low/medium/high_queues easily to support WRR. To stay compatible with
the READ, POLL and DEFAULT tagset mappings, this patch sets these three
types of hardware submission queues to medium priority.

Changes since V1:
 * reorder HCTX_TYPE_POLL to be the last type so the nvme driver can adopt it easily.
 * add WRR (Weighted Round Robin) support for the nvme driver

Weiping Zhang (4):
  block: add weighted round robin for blkcgroup
  null_blk: add support weighted round robin submition queue
  genirq/affinity: allow driver's discontigous affinity set
  nvme: add support weighted round robin queue

 block/blk-cgroup.c            |  88 +++++++++++++
 block/blk-mq-debugfs.c        |   3 +
 block/blk-mq-sched.c          |   6 +-
 block/blk-mq-tag.c            |   4 +-
 block/blk-mq-tag.h            |   2 +-
 block/blk-mq.c                |  12 +-
 block/blk-mq.h                |  17 ++-
 block/blk.h                   |   2 +-
 drivers/block/null_blk.h      |   7 +
 drivers/block/null_blk_main.c | 294 ++++++++++++++++++++++++++++++++++++++++--
 drivers/nvme/host/pci.c       | 195 ++++++++++++++++++++--------
 include/linux/blk-cgroup.h    |   2 +
 include/linux/blk-mq.h        |  12 ++
 include/linux/interrupt.h     |   2 +-
 kernel/irq/affinity.c         |   4 +
 15 files changed, 574 insertions(+), 76 deletions(-)

-- 
2.14.1



* [PATCH v2 1/4] block: add weighted round robin for blkcgroup
  2019-06-16 10:14 [RFC PATCH v2 0/4] blkcg: add support weighted round robin tagset map Weiping Zhang
@ 2019-06-16 10:15 ` Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 2/4] null_blk: add support weighted round robin submition queue Weiping Zhang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Weiping Zhang @ 2019-06-16 10:15 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche; +Cc: linux-block, cgroups

Each block cgroup can select a weighted round robin type to make
its IO requests go to the specified hardware queue. Now we support
three round robin types, high, medium and low, like what the NVMe
specification does.
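
For illustration, the new per-cgroup knob could be used roughly as below.
The cgroup v1 mount point and the 259:0 device number are only assumptions
made for the example; since the file is added to the legacy blkcg files it
should show up as blkio.wrr:

  # route this cgroup's IO for device 259:0 to the high priority queues
  echo "259:0 high" > /sys/fs/cgroup/blkio/test/blkio.wrr
  # read back the per-device setting, printed as "<major:minor> <level>"
  cat /sys/fs/cgroup/blkio/test/blkio.wrr
  259:0 high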

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 block/blk-cgroup.c         | 88 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.c     |  3 ++
 block/blk-mq-sched.c       |  6 +++-
 block/blk-mq-tag.c         |  4 +--
 block/blk-mq-tag.h         |  2 +-
 block/blk-mq.c             | 12 +++++--
 block/blk-mq.h             | 17 +++++++--
 block/blk.h                |  2 +-
 include/linux/blk-cgroup.h |  2 ++
 include/linux/blk-mq.h     | 12 +++++++
 10 files changed, 138 insertions(+), 10 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b97b479e4f64..0b0234dd2976 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1015,6 +1015,89 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 	return 0;
 }
 
+static const char *blk_wrr_name[BLK_WRR_COUNT] = {
+	[BLK_WRR_NONE]		= "none",
+	[BLK_WRR_LOW]		= "low",
+	[BLK_WRR_MEDIUM]	= "medium",
+	[BLK_WRR_HIGH]		= "high",
+};
+
+static inline const char *blk_wrr_to_name(int wrr)
+{
+	if (wrr < BLK_WRR_NONE || wrr >= BLK_WRR_COUNT)
+		return "wrong";
+
+	return blk_wrr_name[wrr];
+}
+
+static ssize_t blkcg_wrr_write(struct kernfs_open_file *of,
+			 char *buf, size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct gendisk *disk;
+	struct request_queue *q;
+	struct blkcg_gq *blkg;
+	unsigned int major, minor;
+	int wrr, key_len, part, ret;
+	char *body;
+
+	if (sscanf(buf, "%u:%u%n", &major, &minor, &key_len) != 2)
+		return -EINVAL;
+
+	body = buf + key_len;
+	if (!isspace(*body))
+		return -EINVAL;
+	body = skip_spaces(body);
+	wrr = sysfs_match_string(blk_wrr_name, body);
+	if (wrr == BLK_WRR_COUNT)
+		return -EINVAL;
+
+	disk = get_gendisk(MKDEV(major, minor), &part);
+	if (!disk)
+		return -ENODEV;
+	if (part) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	q = disk->queue;
+
+	blkg = blkg_lookup_create(blkcg, q);
+
+	atomic_set(&blkg->wrr, wrr);
+	put_disk_and_module(disk);
+
+	return nbytes;
+fail:
+	put_disk_and_module(disk);
+	return ret;
+}
+
+static int blkcg_wrr_show(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+		const char *dname;
+		char *buf;
+		size_t size = seq_get_buf(sf, &buf), off = 0;
+
+		dname = blkg_dev_name(blkg);
+		if (!dname)
+			continue;
+
+		off += scnprintf(buf+off, size-off, "%s %s\n", dname,
+			blk_wrr_to_name(atomic_read(&blkg->wrr)));
+		seq_commit(sf, off);
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+
 static struct cftype blkcg_files[] = {
 	{
 		.name = "stat",
@@ -1029,6 +1112,11 @@ static struct cftype blkcg_legacy_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkcg_reset_stats,
 	},
+	{
+		.name = "wrr",
+		.write = blkcg_wrr_write,
+		.seq_show = blkcg_wrr_show,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 6aea0ebc3a73..a65ec9dbcb27 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -436,6 +436,9 @@ static int hctx_busy_show(void *data, struct seq_file *m)
 static const char *const hctx_types[] = {
 	[HCTX_TYPE_DEFAULT]	= "default",
 	[HCTX_TYPE_READ]	= "read",
+	[HCTX_TYPE_WRR_LOW]	= "wrr_low",
+	[HCTX_TYPE_WRR_MEDIUM]	= "wrr_medium",
+	[HCTX_TYPE_WRR_HIGH]	= "wrr_high",
 	[HCTX_TYPE_POLL]	= "poll",
 };
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 74c6bb871f7e..8500c878f0c4 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 
@@ -322,10 +323,13 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+	struct blk_mq_hw_ctx *hctx;
+	struct blkcg_gq *blkg = bio->bi_blkg;
+	int wrr = blkg ? atomic_read(&blkg->wrr) : BLK_WRR_NONE;
 	bool ret = false;
 	enum hctx_type type;
 
+	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx, wrr);
 	if (e && e->type->ops.bio_merge) {
 		blk_mq_put_ctx(ctx);
 		return e->type->ops.bio_merge(hctx, bio);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7513c8eaabee..6fd5261c4fbc 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -106,7 +106,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 		return __sbitmap_queue_get(bt);
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data, int wrr)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct sbitmap_queue *bt;
@@ -171,7 +171,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
-						data->ctx);
+						data->ctx, wrr);
 		tags = blk_mq_tags_from_data(data);
 		if (data->flags & BLK_MQ_REQ_RESERVED)
 			bt = &tags->breserved_tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..ef43f819ead8 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -25,7 +25,7 @@ struct blk_mq_tags {
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
-extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data, int wrr);
 extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
 			   struct blk_mq_ctx *ctx, unsigned int tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ce0f5f4ede70..8e8e743c4c0d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -356,6 +356,12 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	struct request *rq;
 	unsigned int tag;
 	bool put_ctx_on_error = false;
+	int wrr;
+
+	if (bio && bio->bi_blkg)
+		wrr = atomic_read(&bio->bi_blkg->wrr);
+	else
+		wrr = BLK_WRR_NONE;
 
 	blk_queue_enter_live(q);
 	data->q = q;
@@ -365,7 +371,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	}
 	if (likely(!data->hctx))
 		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-						data->ctx);
+						data->ctx, wrr);
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
@@ -385,7 +391,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		blk_mq_tag_busy(data->hctx);
 	}
 
-	tag = blk_mq_get_tag(data);
+	tag = blk_mq_get_tag(data, wrr);
 	if (tag == BLK_MQ_TAG_FAIL) {
 		if (put_ctx_on_error) {
 			blk_mq_put_ctx(data->ctx);
@@ -1060,7 +1066,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
 		data.flags |= BLK_MQ_REQ_RESERVED;
 
 	shared = blk_mq_tag_busy(data.hctx);
-	rq->tag = blk_mq_get_tag(&data);
+	rq->tag = blk_mq_get_tag(&data, BLK_WRR_NONE);
 	if (rq->tag >= 0) {
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 633a5a77ee8b..51d59462c825 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -101,7 +101,8 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
  */
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 						     unsigned int flags,
-						     struct blk_mq_ctx *ctx)
+						     struct blk_mq_ctx *ctx,
+						     int wrr)
 {
 	enum hctx_type type = HCTX_TYPE_DEFAULT;
 
@@ -110,7 +111,19 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 	 */
 	if (flags & REQ_HIPRI)
 		type = HCTX_TYPE_POLL;
-	else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+	else if (wrr > BLK_WRR_NONE && wrr < BLK_WRR_COUNT) {
+		switch (wrr) {
+		case BLK_WRR_LOW:
+			type = HCTX_TYPE_WRR_LOW;
+			break;
+		case BLK_WRR_MEDIUM:
+			type = HCTX_TYPE_WRR_MEDIUM;
+			break;
+		default:
+			type = HCTX_TYPE_WRR_HIGH;
+			break;
+		}
+	} else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
 		type = HCTX_TYPE_READ;
 	
 	return ctx->hctxs[type];
diff --git a/block/blk.h b/block/blk.h
index 91b3581b7c7a..6ea188c499f4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -38,7 +38,7 @@ extern struct ida blk_queue_ida;
 static inline struct blk_flush_queue *
 blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
 {
-	return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
+	return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx, BLK_WRR_NONE)->fq;
 }
 
 static inline void __blk_get_queue(struct request_queue *q)
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 76c61318fda5..0a6a9bcea590 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -141,6 +141,8 @@ struct blkcg_gq {
 	atomic64_t			delay_start;
 	u64				last_delay;
 	int				last_use;
+	/* weighted round robin */
+	atomic_t			wrr;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 15d1aa53d96c..b07e7b866cec 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -85,11 +85,23 @@ struct blk_mq_queue_map {
 enum hctx_type {
 	HCTX_TYPE_DEFAULT,	/* all I/O not otherwise accounted for */
 	HCTX_TYPE_READ,		/* just for READ I/O */
+	HCTX_TYPE_WRR_LOW,	/* blkcg: weighted round robin - low */
+	HCTX_TYPE_WRR_MEDIUM,	/* blkcg: weighted round robin - medium */
+	HCTX_TYPE_WRR_HIGH,	/* blkcg: weighted round robin - high */
 	HCTX_TYPE_POLL,		/* polled I/O of any kind */
 
 	HCTX_MAX_TYPES,
 };
 
+enum blk_wrr {
+	BLK_WRR_NONE,
+	BLK_WRR_LOW,
+	BLK_WRR_MEDIUM,
+	BLK_WRR_HIGH,
+
+	BLK_WRR_COUNT,
+};
+
 struct blk_mq_tag_set {
 	/*
 	 * map[] holds ctx -> hctx mappings, one map exists for each type
-- 
2.14.1



* [PATCH v2 2/4] null_blk: add support weighted round robin submition queue
  2019-06-16 10:14 [RFC PATCH v2 0/4] blkcg: add support weighted round robin tagset map Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 1/4] block: add weighted round robin for blkcgroup Weiping Zhang
@ 2019-06-16 10:15 ` Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 3/4] genirq/affinity: allow driver's discontigous affinity set Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 4/4] nvme: add support weighted round robin queue Weiping Zhang
  3 siblings, 0 replies; 9+ messages in thread
From: Weiping Zhang @ 2019-06-16 10:15 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche; +Cc: linux-block, cgroups

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 drivers/block/null_blk.h      |   7 +
 drivers/block/null_blk_main.c | 294 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 288 insertions(+), 13 deletions(-)

diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 34b22d6523ba..aa53c4b6de49 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -12,6 +12,7 @@
 
 struct nullb_cmd {
 	struct list_head list;
+	struct list_head wrr_node;
 	struct llist_node ll_list;
 	struct __call_single_data csd;
 	struct request *rq;
@@ -23,6 +24,8 @@ struct nullb_cmd {
 };
 
 struct nullb_queue {
+	spinlock_t wrr_lock;
+	struct list_head wrr_head;
 	unsigned long *tag_map;
 	wait_queue_head_t wait;
 	unsigned int queue_depth;
@@ -83,6 +86,10 @@ struct nullb {
 	struct nullb_queue *queues;
 	unsigned int nr_queues;
 	char disk_name[DISK_NAME_LEN];
+
+	struct task_struct *wrr_thread;
+	atomic_long_t wrrd_inflight;
+	wait_queue_head_t wrrd_wait;
 };
 
 #ifdef CONFIG_BLK_DEV_ZONED
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 447d635c79a2..100fc0e13036 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -4,6 +4,8 @@
  * Shaohua Li <shli@fb.com>
  */
 #include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/blk-cgroup.h>
 
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
@@ -75,6 +77,7 @@ enum {
 	NULL_IRQ_NONE		= 0,
 	NULL_IRQ_SOFTIRQ	= 1,
 	NULL_IRQ_TIMER		= 2,
+	NULL_IRQ_WRR		= 3,
 };
 
 enum {
@@ -87,10 +90,23 @@ static int g_no_sched;
 module_param_named(no_sched, g_no_sched, int, 0444);
 MODULE_PARM_DESC(no_sched, "No io scheduler");
 
+static int g_tagset_nr_maps = 1;
 static int g_submit_queues = 1;
 module_param_named(submit_queues, g_submit_queues, int, 0444);
 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
 
+#define NULLB_SUBMIT_QUEUE(attr, count) \
+static int g_submit_queues_##attr = count; \
+module_param_named(submit_queues_##attr, g_submit_queues_##attr, int, 0444); \
+MODULE_PARM_DESC(submit_queues_##attr, "Number of " #attr " submission queues");
+
+NULLB_SUBMIT_QUEUE(default, 1)
+NULLB_SUBMIT_QUEUE(read, 0)
+NULLB_SUBMIT_QUEUE(poll, 0)
+NULLB_SUBMIT_QUEUE(wrr_low, 0)
+NULLB_SUBMIT_QUEUE(wrr_medium, 0)
+NULLB_SUBMIT_QUEUE(wrr_high, 0)
+
 static int g_home_node = NUMA_NO_NODE;
 module_param_named(home_node, g_home_node, int, 0444);
 MODULE_PARM_DESC(home_node, "Home node for the device");
@@ -158,7 +174,7 @@ static int g_irqmode = NULL_IRQ_SOFTIRQ;
 static int null_set_irqmode(const char *str, const struct kernel_param *kp)
 {
 	return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
-					NULL_IRQ_TIMER);
+					NULL_IRQ_WRR);
 }
 
 static const struct kernel_param_ops null_irqmode_param_ops = {
@@ -643,6 +659,22 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
 	hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
 }
 
+static void null_cmd_wrr_insert(struct nullb_cmd *cmd)
+{
+	struct nullb_queue *nq = cmd->nq;
+	struct nullb *nullb = nq->dev->nullb;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&cmd->wrr_node);
+	spin_lock_irqsave(&nq->wrr_lock, flags);
+	list_add_tail(&cmd->wrr_node, &nq->wrr_head);
+	spin_unlock_irqrestore(&nq->wrr_lock, flags);
+
+	/* wake up wrr_thread if needed */
+	if (atomic_long_inc_return(&nullb->wrrd_inflight) == 1)
+		wake_up_interruptible(&nullb->wrrd_wait);
+}
+
 static void null_complete_rq(struct request *rq)
 {
 	end_cmd(blk_mq_rq_to_pdu(rq));
@@ -1236,6 +1268,9 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
 	case NULL_IRQ_TIMER:
 		null_cmd_end_timer(cmd);
 		break;
+	case NULL_IRQ_WRR:
+		null_cmd_wrr_insert(cmd);
+		break;
 	}
 	return BLK_STS_OK;
 }
@@ -1351,10 +1386,64 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return null_handle_cmd(cmd);
 }
 
+static inline int null_hctx_nr_queue(int type)
+{
+	int ret;
+
+	switch (type) {
+	case HCTX_TYPE_DEFAULT:
+		ret = g_submit_queues_default;
+		break;
+	case HCTX_TYPE_READ:
+		ret = g_submit_queues_read;
+		break;
+	case HCTX_TYPE_POLL:
+		ret = g_submit_queues_poll;
+		break;
+	case HCTX_TYPE_WRR_LOW:
+		ret = g_submit_queues_wrr_low;
+		break;
+	case HCTX_TYPE_WRR_MEDIUM:
+		ret = g_submit_queues_wrr_medium;
+		break;
+	case HCTX_TYPE_WRR_HIGH:
+		ret = g_submit_queues_wrr_high;
+		break;
+	default:
+		ret = 0;
+		break;
+	}
+
+	return ret;
+}
+
+static int null_map_queues(struct blk_mq_tag_set *set)
+{
+	int i, offset;
+
+	for (i = 0, offset = 0; i < set->nr_maps; i++) {
+		struct blk_mq_queue_map *map = &set->map[i];
+
+		map->nr_queues = null_hctx_nr_queue(i);
+
+		if (!map->nr_queues) {
+			BUG_ON(i == HCTX_TYPE_DEFAULT);
+			continue;
+		}
+
+		map->queue_offset = offset;
+		blk_mq_map_queues(map);
+		offset += map->nr_queues;
+	}
+
+	return 0;
+}
+
 static const struct blk_mq_ops null_mq_ops = {
 	.queue_rq       = null_queue_rq,
 	.complete	= null_complete_rq,
 	.timeout	= null_timeout_rq,
+	.map_queues	= null_map_queues,
 };
 
 static void cleanup_queue(struct nullb_queue *nq)
@@ -1397,6 +1486,9 @@ static void null_del_dev(struct nullb *nullb)
 	cleanup_queues(nullb);
 	if (null_cache_active(nullb))
 		null_free_device_storage(nullb->dev, true);
+
+	if (dev->irqmode == NULL_IRQ_WRR)
+		kthread_stop(nullb->wrr_thread);
 	kfree(nullb);
 	dev->nullb = NULL;
 }
@@ -1435,6 +1527,8 @@ static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
 	init_waitqueue_head(&nq->wait);
 	nq->queue_depth = nullb->queue_depth;
 	nq->dev = nullb->dev;
+	INIT_LIST_HEAD(&nq->wrr_head);
+	spin_lock_init(&nq->wrr_lock);
 }
 
 static void null_init_queues(struct nullb *nullb)
@@ -1549,6 +1643,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
 						g_submit_queues;
 	set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
 						g_hw_queue_depth;
+	set->nr_maps = g_tagset_nr_maps;
 	set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
 	set->cmd_size	= sizeof(struct nullb_cmd);
 	set->flags = BLK_MQ_F_SHOULD_MERGE;
@@ -1576,7 +1671,7 @@ static void null_validate_conf(struct nullb_device *dev)
 		dev->submit_queues = 1;
 
 	dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
-	dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
+	dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_WRR);
 
 	/* Do memory allocation, so set blocking */
 	if (dev->memory_backed)
@@ -1616,6 +1711,72 @@ static bool null_setup_fault(void)
 	return true;
 }
 
+static inline void null_wrr_handle_map(struct nullb *nullb,
+				struct blk_mq_queue_map *map, int batch)
+{
+	int i, nr;
+	struct nullb_queue *nq;
+	struct nullb_cmd *cmd,*tmp;
+	unsigned long flags;
+
+	for (i = 0; i < map->nr_queues; i++) {
+		nq = &nullb->queues[i + map->queue_offset];
+		nr = batch;
+		spin_lock_irqsave(&nq->wrr_lock, flags);
+		list_for_each_entry_safe(cmd, tmp, &nq->wrr_head, wrr_node) {
+			list_del(&cmd->wrr_node);
+			blk_mq_end_request(cmd->rq, cmd->error);
+			atomic_long_dec(&nullb->wrrd_inflight);
+			if (nr-- == 0)
+				break;
+		}
+		spin_unlock_irqrestore(&nq->wrr_lock, flags);
+	}
+}
+
+static int null_wrr_thread(void *data)
+{
+	struct nullb *nullb = (struct nullb *)data;
+	struct blk_mq_tag_set *set = nullb->tag_set;
+	struct blk_mq_queue_map	*map;
+	DEFINE_WAIT(wait);
+
+	while (1) {
+		if (kthread_should_stop())
+			goto out;
+
+		cond_resched();
+
+		/* handle each hardware queue in weighted round robin way */
+
+		map = &set->map[HCTX_TYPE_WRR_HIGH];
+		null_wrr_handle_map(nullb, map, 8);
+
+		map = &set->map[HCTX_TYPE_WRR_MEDIUM];
+		null_wrr_handle_map(nullb, map, 4);
+
+		map = &set->map[HCTX_TYPE_WRR_LOW];
+		null_wrr_handle_map(nullb, map, 1);
+
+		map = &set->map[HCTX_TYPE_POLL];
+		null_wrr_handle_map(nullb, map, 1);
+
+		map = &set->map[HCTX_TYPE_READ];
+		null_wrr_handle_map(nullb, map, 1);
+
+		map = &set->map[HCTX_TYPE_DEFAULT];
+		null_wrr_handle_map(nullb, map, 1);
+
+		prepare_to_wait(&nullb->wrrd_wait, &wait, TASK_INTERRUPTIBLE);
+		if (0 == atomic_long_read(&nullb->wrrd_inflight))
+			schedule();
+		finish_wait(&nullb->wrrd_wait, &wait);
+	}
+
+out:
+	return 0;
+}
+
 static int null_add_dev(struct nullb_device *dev)
 {
 	struct nullb *nullb;
@@ -1706,15 +1867,27 @@ static int null_add_dev(struct nullb_device *dev)
 
 	sprintf(nullb->disk_name, "nullb%d", nullb->index);
 
+	if (dev->irqmode == NULL_IRQ_WRR) {
+		init_waitqueue_head(&nullb->wrrd_wait);
+		atomic_long_set(&nullb->wrrd_inflight, 0);
+		nullb->wrr_thread = kthread_run(null_wrr_thread, (void *)nullb,
+			"wrrd_%s", nullb->disk_name);
+		if (!nullb->wrr_thread)
+			goto out_cleanup_zone;
+	}
+
 	rv = null_gendisk_register(nullb);
 	if (rv)
-		goto out_cleanup_zone;
+		goto out_cleanup_wrrd;
 
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
 	mutex_unlock(&lock);
 
 	return 0;
+out_cleanup_wrrd:
+	if (dev->irqmode == NULL_IRQ_WRR)
+		kthread_stop(nullb->wrr_thread);
 out_cleanup_zone:
 	if (dev->zoned)
 		null_zone_exit(dev);
@@ -1731,6 +1904,106 @@ static int null_add_dev(struct nullb_device *dev)
 	return rv;
 }
 
+static int null_verify_queues(void)
+{
+	int queues, nr;
+
+	if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
+		if (g_submit_queues != nr_online_nodes) {
+			pr_warn("null_blk: submit_queues param is set to %u.\n",
+							nr_online_nodes);
+			g_submit_queues = nr_online_nodes;
+		}
+	} else if (g_submit_queues > nr_cpu_ids)
+		g_submit_queues = nr_cpu_ids;
+	else if (g_submit_queues <= 0)
+		g_submit_queues = 1;
+
+	/* at least leave one queue for default */
+	g_submit_queues_default = 1;
+	queues = g_submit_queues - 1;
+	if (queues == 0)
+		goto def;
+
+	/* read queues */
+	nr = g_submit_queues_read;
+	if (nr < 0 || nr > queues) {
+		pr_warn("null_blk: invalid read queue count\n");
+		return -EINVAL;
+	}
+	g_tagset_nr_maps++;
+	queues -= nr;
+	if (queues == 0)
+		goto read;
+
+	/* poll queues */
+	nr = g_submit_queues_poll;
+	if (nr < 0 || nr > queues) {
+		pr_warn("null_blk: invalid poll queue count\n");
+		return -EINVAL;
+	}
+	g_tagset_nr_maps++;
+	queues -= nr;
+	if (queues == 0)
+		goto poll;
+
+	/* wrr_low queues */
+	nr = g_submit_queues_wrr_low;
+	if (nr < 0 || nr > queues) {
+		pr_warn("null_blk: invalid wrr_low queue count\n");
+		return -EINVAL;
+	}
+	g_tagset_nr_maps++;
+	queues -= nr;
+	if (queues == 0)
+		goto wrr_low;
+
+	/* wrr_medium queues */
+	nr = g_submit_queues_wrr_medium;
+	if (nr < 0 || nr > queues) {
+		pr_warn("null_blk: invalid wrr_medium queue count\n");
+		return -EINVAL;
+	}
+	g_tagset_nr_maps++;
+	queues -= nr;
+	if (queues == 0)
+		goto wrr_medium;
+
+	/* wrr_high queues */
+	nr = g_submit_queues_wrr_high;
+	if (nr < 0 || nr > queues) {
+		pr_warn("null_blk: invalid wrr_high queue count\n");
+		return -EINVAL;
+	}
+	g_tagset_nr_maps++;
+	queues -= nr;
+
+	/* add all others queue to default group */
+	g_submit_queues_default += queues;
+
+	goto out;
+
+def:
+	g_submit_queues_read = 0;
+read:
+	g_submit_queues_poll = 0;
+poll:
+	g_submit_queues_wrr_low = 0;
+wrr_low:
+	g_submit_queues_wrr_medium = 0;
+wrr_medium:
+	g_submit_queues_wrr_high = 0;
+out:
+	pr_info("null_blk: total submit queues:%d, nr_map:%d, default:%d, "
+		"read:%d, poll:%d, wrr_low:%d, wrr_medium:%d wrr_high:%d\n",
+		g_submit_queues, g_tagset_nr_maps, g_submit_queues_default,
+		g_submit_queues_read, g_submit_queues_poll,
+		g_submit_queues_wrr_low, g_submit_queues_wrr_medium,
+		g_submit_queues_wrr_high);
+
+	return 0;
+}
+
 static int __init null_init(void)
 {
 	int ret = 0;
@@ -1758,16 +2031,11 @@ static int __init null_init(void)
 		pr_err("null_blk: legacy IO path no longer available\n");
 		return -EINVAL;
 	}
-	if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
-		if (g_submit_queues != nr_online_nodes) {
-			pr_warn("null_blk: submit_queues param is set to %u.\n",
-							nr_online_nodes);
-			g_submit_queues = nr_online_nodes;
-		}
-	} else if (g_submit_queues > nr_cpu_ids)
-		g_submit_queues = nr_cpu_ids;
-	else if (g_submit_queues <= 0)
-		g_submit_queues = 1;
+
+	if (null_verify_queues()){
+		pr_err("null_blk: invalid submit queue parameter\n");
+		return -EINVAL;
+	}
 
 	if (g_queue_mode == NULL_Q_MQ && shared_tags) {
 		ret = null_init_tag_set(NULL, &tag_set);
-- 
2.14.1



* [PATCH v2 3/4] genirq/affinity: allow driver's discontigous affinity set
  2019-06-16 10:14 [RFC PATCH v2 0/4] blkcg: add support weighted round robin tagset map Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 1/4] block: add weighted round robin for blkcgroup Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 2/4] null_blk: add support weighted round robin submition queue Weiping Zhang
@ 2019-06-16 10:15 ` Weiping Zhang
  2019-06-16 10:15 ` [PATCH v2 4/4] nvme: add support weighted round robin queue Weiping Zhang
  3 siblings, 0 replies; 9+ messages in thread
From: Weiping Zhang @ 2019-06-16 10:15 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche; +Cc: linux-block, cgroups

The driver may implement multiple affinity sets, and some of them
may be empty; in this case we just skip them.

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 kernel/irq/affinity.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f18cd5aa33e8..6d964fe0fbd8 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -295,6 +295,10 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 		unsigned int this_vecs = affd->set_size[i];
 		int ret;
 
+		/* skip empty affinity set */
+		if (this_vecs == 0)
+			continue;
+
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
 					       curvec, masks);
 		if (ret) {
-- 
2.14.1



* [PATCH v2 4/4] nvme: add support weighted round robin queue
  2019-06-16 10:14 [RFC PATCH v2 0/4] blkcg: add support weighted round robin tagset map Weiping Zhang
                   ` (2 preceding siblings ...)
  2019-06-16 10:15 ` [PATCH v2 3/4] genirq/affinity: allow driver's discontigous affinity set Weiping Zhang
@ 2019-06-16 10:15 ` Weiping Zhang
  2019-06-18 13:18   ` Minwoo Im
                     ` (2 more replies)
  3 siblings, 3 replies; 9+ messages in thread
From: Weiping Zhang @ 2019-06-16 10:15 UTC (permalink / raw)
  To: axboe, tj, hch, bvanassche; +Cc: linux-block, cgroups

Now nvme supports six types of hardware queues:
poll:		if IO was marked for poll
wrr_low:	weighted round robin low
wrr_medium:	weighted round robin medium
wrr_high:	weighted round robin high
read:		for reads, if the blkcg's wrr is none and the IO is not polled
default:	for writes/flushes, if the blkcg's wrr is none and the IO is not polled

The read, default and poll submission queues' priority is medium by default.
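
As a hedged example of how the new module parameters might be used (the
queue counts are purely illustrative and the final split also depends on
how many interrupt vectors the controller provides):

  modprobe nvme read_queues=2 poll_queues=2 \
      wrr_low_queues=2 wrr_medium_queues=2 wrr_high_queues=4
  # the probe message added below should then report something like:
  # nvme nvme0: 3/2/2/2/2/4 default/read/poll/wrr_low/wrr_medium/wrr_high queues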

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 drivers/nvme/host/pci.c   | 195 +++++++++++++++++++++++++++++++++-------------
 include/linux/interrupt.h |   2 +-
 2 files changed, 144 insertions(+), 53 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f562154551ce..ee9f3239f3e7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -73,16 +73,28 @@ static const struct kernel_param_ops queue_count_ops = {
 	.get = param_get_int,
 };
 
-static int write_queues;
-module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
-MODULE_PARM_DESC(write_queues,
-	"Number of queues to use for writes. If not set, reads and writes "
+static int read_queues;
+module_param_cb(read_queues, &queue_count_ops, &read_queues, 0644);
+MODULE_PARM_DESC(read_queues,
+	"Number of queues to use for reads. If not set, reads and writes "
 	"will share a queue set.");
 
 static int poll_queues = 0;
 module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
 MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
 
+static int wrr_high_queues = 0;
+module_param_cb(wrr_high_queues, &queue_count_ops, &wrr_high_queues, 0644);
+MODULE_PARM_DESC(wrr_high_queues, "Number of queues to use for WRR high.");
+
+static int wrr_medium_queues = 0;
+module_param_cb(wrr_medium_queues, &queue_count_ops, &wrr_medium_queues, 0644);
+MODULE_PARM_DESC(wrr_medium_queues, "Number of queues to use for WRR medium.");
+
+static int wrr_low_queues = 0;
+module_param_cb(wrr_low_queues, &queue_count_ops, &wrr_low_queues, 0644);
+MODULE_PARM_DESC(wrr_low_queues, "Number of queues to use for WRR low.");
+
 struct nvme_dev;
 struct nvme_queue;
 
@@ -226,9 +238,17 @@ struct nvme_iod {
 	struct scatterlist *sg;
 };
 
+static inline bool nvme_is_enable_wrr(struct nvme_dev *dev)
+{
+	return dev->io_queues[HCTX_TYPE_WRR_LOW] +
+		dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
+		dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
+}
+
 static unsigned int max_io_queues(void)
 {
-	return num_possible_cpus() + write_queues + poll_queues;
+	return num_possible_cpus() + read_queues + poll_queues +
+		wrr_high_queues + wrr_medium_queues + wrr_low_queues;
 }
 
 static unsigned int max_queue_count(void)
@@ -1156,19 +1176,23 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
-						struct nvme_queue *nvmeq)
+					struct nvme_queue *nvmeq, int wrr)
 {
 	struct nvme_ctrl *ctrl = &dev->ctrl;
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG;
 
-	/*
-	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
-	 * set. Since URGENT priority is zeroes, it makes all queues
-	 * URGENT.
-	 */
-	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
-		flags |= NVME_SQ_PRIO_MEDIUM;
+	if (!nvme_is_enable_wrr(dev)) {
+		/*
+		 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
+		 * set. Since URGENT priority is zeroes, it makes all queues
+		 * URGENT.
+		 */
+		if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
+			flags |= NVME_SQ_PRIO_MEDIUM;
+	} else {
+		flags |= wrr;
+	}
 
 	/*
 	 * Note: we (ab)use the fact that the prp fields survive if no data
@@ -1534,11 +1558,46 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	wmb(); /* ensure the first interrupt sees the initialization */
 }
 
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
-	int result;
+	int start, end, result, wrr;
+	bool polled = false;
 	u16 vector = 0;
+	enum hctx_type type;
+
+	/* 0 for admin queue, io queue index >= 1 */
+	start = 1;
+	/* get hardware context type based on qid */
+	for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
+		end = start + dev->io_queues[type] - 1;
+		if (qid >= start && qid <= end)
+			break;
+		start = end + 1;
+	}
+
+	if (nvme_is_enable_wrr(dev)) {
+		/* set read,poll,default to medium by default */
+		switch (type) {
+		case HCTX_TYPE_POLL:
+			polled = true;
+		case HCTX_TYPE_DEFAULT:
+		case HCTX_TYPE_READ:
+		case HCTX_TYPE_WRR_MEDIUM:
+			wrr = NVME_SQ_PRIO_MEDIUM;
+			break;
+		case HCTX_TYPE_WRR_LOW:
+			wrr = NVME_SQ_PRIO_LOW;
+			break;
+		case HCTX_TYPE_WRR_HIGH:
+			wrr = NVME_SQ_PRIO_HIGH;
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		wrr = 0;
+	}
 
 	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
@@ -1555,7 +1614,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	if (result)
 		return result;
 
-	result = adapter_alloc_sq(dev, qid, nvmeq);
+	result = adapter_alloc_sq(dev, qid, nvmeq, wrr);
 	if (result < 0)
 		return result;
 	else if (result)
@@ -1726,7 +1785,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
-	unsigned i, max, rw_queues;
+	unsigned i, max;
 	int ret = 0;
 
 	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1737,17 +1796,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	}
 
 	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
-	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
-		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
-				dev->io_queues[HCTX_TYPE_READ];
-	} else {
-		rw_queues = max;
-	}
 
 	for (i = dev->online_queues; i <= max; i++) {
-		bool polled = i > rw_queues;
-
-		ret = nvme_create_queue(&dev->queues[i], i, polled);
+		ret = nvme_create_queue(&dev->queues[i], i);
 		if (ret)
 			break;
 	}
@@ -2028,35 +2079,73 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 {
 	struct nvme_dev *dev = affd->priv;
-	unsigned int nr_read_queues;
+	unsigned int nr_total, nr, nr_read, nr_default;
+	unsigned int nr_wrr_high, nr_wrr_medium, nr_wrr_low;
+	unsigned int nr_sets;
 
 	/*
 	 * If there is no interupt available for queues, ensure that
 	 * the default queue is set to 1. The affinity set size is
 	 * also set to one, but the irq core ignores it for this case.
-	 *
-	 * If only one interrupt is available or 'write_queue' == 0, combine
-	 * write and read queues.
-	 *
-	 * If 'write_queues' > 0, ensure it leaves room for at least one read
-	 * queue.
 	 */
-	if (!nrirqs) {
+	if (!nrirqs)
 		nrirqs = 1;
-		nr_read_queues = 0;
-	} else if (nrirqs == 1 || !write_queues) {
-		nr_read_queues = 0;
-	} else if (write_queues >= nrirqs) {
-		nr_read_queues = 1;
-	} else {
-		nr_read_queues = nrirqs - write_queues;
-	}
 
-	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
-	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
-	affd->nr_sets = nr_read_queues ? 2 : 1;
+	nr_total = nrirqs;
+
+	nr_read = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
+
+	/* set default to 1, add all the rest queue to default at last */
+	nr = nr_default = 1;
+	nr_sets = 1;
+
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* read queues */
+	nr_sets++;
+	nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr low queues */
+	nr_sets++;
+	nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr medium queues */
+	nr_sets++;
+	nr_wrr_medium = nr = wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* wrr high queues */
+	nr_sets++;
+	nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
+	nr_total -= nr;
+	if (!nr_total)
+		goto done;
+
+	/* set all the rest queue to default */
+	nr_default += nr_total;
+
+done:
+	dev->io_queues[HCTX_TYPE_DEFAULT] = nr_default;
+	affd->set_size[HCTX_TYPE_DEFAULT] = nr_default;
+	dev->io_queues[HCTX_TYPE_READ] = nr_read;
+	affd->set_size[HCTX_TYPE_READ] = nr_read;
+	dev->io_queues[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+	affd->set_size[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+	dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+	affd->set_size[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+	dev->io_queues[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+	affd->set_size[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+	affd->nr_sets = nr_sets;
 }
 
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
@@ -2171,10 +2260,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		nvme_suspend_io_queues(dev);
 		goto retry;
 	}
-	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+	dev_info(dev->ctrl.device, "%d/%d/%d/%d/%d/%d "
+			"default/read/poll/wrr_low/wrr_medium/wrr_high queues\n",
 					dev->io_queues[HCTX_TYPE_DEFAULT],
 					dev->io_queues[HCTX_TYPE_READ],
-					dev->io_queues[HCTX_TYPE_POLL]);
+					dev->io_queues[HCTX_TYPE_POLL],
+					dev->io_queues[HCTX_TYPE_WRR_LOW],
+					dev->io_queues[HCTX_TYPE_WRR_MEDIUM],
+					dev->io_queues[HCTX_TYPE_WRR_HIGH]);
 	return 0;
 }
 
@@ -2263,9 +2356,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	if (!dev->ctrl.tagset) {
 		dev->tagset.ops = &nvme_mq_ops;
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
-		dev->tagset.nr_maps = 2; /* default + read */
-		if (dev->io_queues[HCTX_TYPE_POLL])
-			dev->tagset.nr_maps++;
+		dev->tagset.nr_maps = HCTX_MAX_TYPES;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;
 		dev->tagset.numa_node = dev_to_node(dev->dev);
 		dev->tagset.queue_depth =
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..ea726c2f95cc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,7 +259,7 @@ struct irq_affinity_notify {
 	void (*release)(struct kref *ref);
 };
 
-#define	IRQ_AFFINITY_MAX_SETS  4
+#define	IRQ_AFFINITY_MAX_SETS  7
 
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
-- 
2.14.1



* Re: [PATCH v2 4/4] nvme: add support weighted round robin queue
  2019-06-16 10:15 ` [PATCH v2 4/4] nvme: add support weighted round robin queue Weiping Zhang
@ 2019-06-18 13:18   ` Minwoo Im
  2019-06-19 18:37   ` Chaitanya Kulkarni
  2019-06-20 14:16   ` Minwoo Im
  2 siblings, 0 replies; 9+ messages in thread
From: Minwoo Im @ 2019-06-18 13:18 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: axboe, tj, hch, bvanassche, linux-block, cgroups, linux-nvme

On 19-06-16 18:15:56, Weiping Zhang wrote:
> Now nvme supports six types of hardware queues:
> poll:		if IO was marked for poll
> wrr_low:	weighted round robin low
> wrr_medium:	weighted round robin medium
> wrr_high:	weighted round robin high
> read:		for reads, if the blkcg's wrr is none and the IO is not polled
> default:	for writes/flushes, if the blkcg's wrr is none and the IO is not polled
> 
> The read, default and poll submission queues' priority is medium by default.
> 
> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>

Hello Weiping,

Please add the linux-nvme mailing list so that this patch can be reviewed
by the nvme people.


* Re: [PATCH v2 4/4] nvme: add support weighted round robin queue
  2019-06-16 10:15 ` [PATCH v2 4/4] nvme: add support weighted round robin queue Weiping Zhang
  2019-06-18 13:18   ` Minwoo Im
@ 2019-06-19 18:37   ` Chaitanya Kulkarni
  2019-06-20 14:16   ` Minwoo Im
  2 siblings, 0 replies; 9+ messages in thread
From: Chaitanya Kulkarni @ 2019-06-19 18:37 UTC (permalink / raw)
  To: Weiping Zhang, hch, bvanassche; +Cc: axboe, tj, linux-block, cgroups

Please add linux-nvme@lists.infradead.org to this list
and Cc the maintainers of the NVMe subsystem.

Also, do you have performance numbers for WRR?

On 06/16/2019 03:16 AM, Weiping Zhang wrote:
> Now nvme supports six types of hardware queues:
> poll:		if IO was marked for poll
> wrr_low:	weighted round robin low
> wrr_medium:	weighted round robin medium
> wrr_high:	weighted round robin high
> read:		for reads, if the blkcg's wrr is none and the IO is not polled
> default:	for writes/flushes, if the blkcg's wrr is none and the IO is not polled
>
> The read, default and poll submission queues' priority is medium by default.
>
> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
> ---
>   drivers/nvme/host/pci.c   | 195 +++++++++++++++++++++++++++++++++-------------
>   include/linux/interrupt.h |   2 +-
>   2 files changed, 144 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index f562154551ce..ee9f3239f3e7 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -73,16 +73,28 @@ static const struct kernel_param_ops queue_count_ops = {
>   	.get = param_get_int,
>   };
>
> -static int write_queues;
> -module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
> -MODULE_PARM_DESC(write_queues,
> -	"Number of queues to use for writes. If not set, reads and writes "
> +static int read_queues;
> +module_param_cb(read_queues, &queue_count_ops, &read_queues, 0644);
> +MODULE_PARM_DESC(read_queues,
> +	"Number of queues to use for reads. If not set, reads and writes "
>   	"will share a queue set.");
>
>   static int poll_queues = 0;
>   module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
>   MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
>
> +static int wrr_high_queues = 0;
> +module_param_cb(wrr_high_queues, &queue_count_ops, &wrr_high_queues, 0644);
> +MODULE_PARM_DESC(wrr_high_queues, "Number of queues to use for WRR high.");
> +
> +static int wrr_medium_queues = 0;
> +module_param_cb(wrr_medium_queues, &queue_count_ops, &wrr_medium_queues, 0644);
> +MODULE_PARM_DESC(wrr_medium_queues, "Number of queues to use for WRR medium.");
> +
> +static int wrr_low_queues = 0;
> +module_param_cb(wrr_low_queues, &queue_count_ops, &wrr_low_queues, 0644);
> +MODULE_PARM_DESC(wrr_low_queues, "Number of queues to use for WRR low.");
> +
>   struct nvme_dev;
>   struct nvme_queue;
>
> @@ -226,9 +238,17 @@ struct nvme_iod {
>   	struct scatterlist *sg;
>   };
>
> +static inline bool nvme_is_enable_wrr(struct nvme_dev *dev)
> +{
> +	return dev->io_queues[HCTX_TYPE_WRR_LOW] +
> +		dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
> +		dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
> +}
> +
>   static unsigned int max_io_queues(void)
>   {
> -	return num_possible_cpus() + write_queues + poll_queues;
> +	return num_possible_cpus() + read_queues + poll_queues +
> +		wrr_high_queues + wrr_medium_queues + wrr_low_queues;
>   }
>
>   static unsigned int max_queue_count(void)
> @@ -1156,19 +1176,23 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
>   }
>
>   static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
> -						struct nvme_queue *nvmeq)
> +					struct nvme_queue *nvmeq, int wrr)
>   {
>   	struct nvme_ctrl *ctrl = &dev->ctrl;
>   	struct nvme_command c;
>   	int flags = NVME_QUEUE_PHYS_CONTIG;
>
> -	/*
> -	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
> -	 * set. Since URGENT priority is zeroes, it makes all queues
> -	 * URGENT.
> -	 */
> -	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
> -		flags |= NVME_SQ_PRIO_MEDIUM;
> +	if (!nvme_is_enable_wrr(dev)) {
> +		/*
> +		 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
> +		 * set. Since URGENT priority is zeroes, it makes all queues
> +		 * URGENT.
> +		 */
> +		if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
> +			flags |= NVME_SQ_PRIO_MEDIUM;
> +	} else {
> +		flags |= wrr;
> +	}
>
>   	/*
>   	 * Note: we (ab)use the fact that the prp fields survive if no data
> @@ -1534,11 +1558,46 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
>   	wmb(); /* ensure the first interrupt sees the initialization */
>   }
>
> -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
> +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
>   {
>   	struct nvme_dev *dev = nvmeq->dev;
> -	int result;
> +	int start, end, result, wrr;
> +	bool polled = false;
>   	u16 vector = 0;
> +	enum hctx_type type;
> +
> +	/* 0 for admin queue, io queue index >= 1 */
> +	start = 1;
> +	/* get hardware context type based on qid */
> +	for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
> +		end = start + dev->io_queues[type] - 1;
> +		if (qid >= start && qid <= end)
> +			break;
> +		start = end + 1;
> +	}
> +
> +	if (nvme_is_enable_wrr(dev)) {
> +		/* set read,poll,default to medium by default */
> +		switch (type) {
> +		case HCTX_TYPE_POLL:
> +			polled = true;
> +		case HCTX_TYPE_DEFAULT:
> +		case HCTX_TYPE_READ:
> +		case HCTX_TYPE_WRR_MEDIUM:
> +			wrr = NVME_SQ_PRIO_MEDIUM;
> +			break;
> +		case HCTX_TYPE_WRR_LOW:
> +			wrr = NVME_SQ_PRIO_LOW;
> +			break;
> +		case HCTX_TYPE_WRR_HIGH:
> +			wrr = NVME_SQ_PRIO_HIGH;
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	} else {
> +		wrr = 0;
> +	}
>
>   	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
>
> @@ -1555,7 +1614,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
>   	if (result)
>   		return result;
>
> -	result = adapter_alloc_sq(dev, qid, nvmeq);
> +	result = adapter_alloc_sq(dev, qid, nvmeq, wrr);
>   	if (result < 0)
>   		return result;
>   	else if (result)
> @@ -1726,7 +1785,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
>
>   static int nvme_create_io_queues(struct nvme_dev *dev)
>   {
> -	unsigned i, max, rw_queues;
> +	unsigned i, max;
>   	int ret = 0;
>
>   	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
> @@ -1737,17 +1796,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
>   	}
>
>   	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
> -	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
> -		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
> -				dev->io_queues[HCTX_TYPE_READ];
> -	} else {
> -		rw_queues = max;
> -	}
>
>   	for (i = dev->online_queues; i <= max; i++) {
> -		bool polled = i > rw_queues;
> -
> -		ret = nvme_create_queue(&dev->queues[i], i, polled);
> +		ret = nvme_create_queue(&dev->queues[i], i);
>   		if (ret)
>   			break;
>   	}
> @@ -2028,35 +2079,73 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
>   static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
>   {
>   	struct nvme_dev *dev = affd->priv;
> -	unsigned int nr_read_queues;
> +	unsigned int nr_total, nr, nr_read, nr_default;
> +	unsigned int nr_wrr_high, nr_wrr_medium, nr_wrr_low;
> +	unsigned int nr_sets;
>
>   	/*
>   	 * If there is no interupt available for queues, ensure that
>   	 * the default queue is set to 1. The affinity set size is
>   	 * also set to one, but the irq core ignores it for this case.
> -	 *
> -	 * If only one interrupt is available or 'write_queue' == 0, combine
> -	 * write and read queues.
> -	 *
> -	 * If 'write_queues' > 0, ensure it leaves room for at least one read
> -	 * queue.
>   	 */
> -	if (!nrirqs) {
> +	if (!nrirqs)
>   		nrirqs = 1;
> -		nr_read_queues = 0;
> -	} else if (nrirqs == 1 || !write_queues) {
> -		nr_read_queues = 0;
> -	} else if (write_queues >= nrirqs) {
> -		nr_read_queues = 1;
> -	} else {
> -		nr_read_queues = nrirqs - write_queues;
> -	}
>
> -	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
> -	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
> -	affd->nr_sets = nr_read_queues ? 2 : 1;
> +	nr_total = nrirqs;
> +
> +	nr_read = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
> +
> +	/* set default to 1, add all the rest queue to default at last */
> +	nr = nr_default = 1;
> +	nr_sets = 1;
> +
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* read queues */
> +	nr_sets++;
> +	nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr low queues */
> +	nr_sets++;
> +	nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr medium queues */
> +	nr_sets++;
> +	nr_wrr_medium = nr = wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr high queues */
> +	nr_sets++;
> +	nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* set all the rest queue to default */
> +	nr_default += nr_total;
> +
> +done:
> +	dev->io_queues[HCTX_TYPE_DEFAULT] = nr_default;
> +	affd->set_size[HCTX_TYPE_DEFAULT] = nr_default;
> +	dev->io_queues[HCTX_TYPE_READ] = nr_read;
> +	affd->set_size[HCTX_TYPE_READ] = nr_read;
> +	dev->io_queues[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
> +	affd->set_size[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
> +	dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
> +	affd->set_size[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
> +	dev->io_queues[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
> +	affd->set_size[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
> +	affd->nr_sets = nr_sets;
>   }
>
>   static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
> @@ -2171,10 +2260,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>   		nvme_suspend_io_queues(dev);
>   		goto retry;
>   	}
> -	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
> +	dev_info(dev->ctrl.device, "%d/%d/%d/%d/%d/%d "
> +			"default/read/poll/wrr_low/wrr_medium/wrr_high queues\n",
>   					dev->io_queues[HCTX_TYPE_DEFAULT],
>   					dev->io_queues[HCTX_TYPE_READ],
> -					dev->io_queues[HCTX_TYPE_POLL]);
> +					dev->io_queues[HCTX_TYPE_POLL],
> +					dev->io_queues[HCTX_TYPE_WRR_LOW],
> +					dev->io_queues[HCTX_TYPE_WRR_MEDIUM],
> +					dev->io_queues[HCTX_TYPE_WRR_HIGH]);
>   	return 0;
>   }
>
> @@ -2263,9 +2356,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
>   	if (!dev->ctrl.tagset) {
>   		dev->tagset.ops = &nvme_mq_ops;
>   		dev->tagset.nr_hw_queues = dev->online_queues - 1;
> -		dev->tagset.nr_maps = 2; /* default + read */
> -		if (dev->io_queues[HCTX_TYPE_POLL])
> -			dev->tagset.nr_maps++;
> +		dev->tagset.nr_maps = HCTX_MAX_TYPES;
>   		dev->tagset.timeout = NVME_IO_TIMEOUT;
>   		dev->tagset.numa_node = dev_to_node(dev->dev);
>   		dev->tagset.queue_depth =
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index c7eef32e7739..ea726c2f95cc 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -259,7 +259,7 @@ struct irq_affinity_notify {
>   	void (*release)(struct kref *ref);
>   };
>
> -#define	IRQ_AFFINITY_MAX_SETS  4
> +#define	IRQ_AFFINITY_MAX_SETS  7
>
>   /**
>    * struct irq_affinity - Description for automatic irq affinity assignements
>



* Re: [PATCH v2 4/4] nvme: add support weighted round robin queue
  2019-06-16 10:15 ` [PATCH v2 4/4] nvme: add support weighted round robin queue Weiping Zhang
  2019-06-18 13:18   ` Minwoo Im
  2019-06-19 18:37   ` Chaitanya Kulkarni
@ 2019-06-20 14:16   ` Minwoo Im
  2019-06-21 14:38     ` Weiping Zhang
  2 siblings, 1 reply; 9+ messages in thread
From: Minwoo Im @ 2019-06-20 14:16 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: axboe, tj, hch, bvanassche, linux-block, cgroups, linux-nvme,
	Javier González

> -static int write_queues;
> -module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
> -MODULE_PARM_DESC(write_queues,
> -	"Number of queues to use for writes. If not set, reads and writes "
> +static int read_queues;
> +module_param_cb(read_queues, &queue_count_ops, &read_queues, 0644);
> +MODULE_PARM_DESC(read_queues,
> +	"Number of queues to use for reads. If not set, reads and writes "
>  	"will share a queue set.");

Before starting my review of this, I'd like to talk about this part
first.  It would be better if you could split this change out of this
commit into a new one, because it just replaces write_queues with
read_queues, which is directly mapped to HCTX_TYPE_READ.  This change
might not be critical for the WRR implementation.

>  
>  static int poll_queues = 0;
>  module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
>  MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
>  
> +static int wrr_high_queues = 0;

Nitpick here: maybe we don't need to 0-initialize static variables
explicitly.

> +module_param_cb(wrr_high_queues, &queue_count_ops, &wrr_high_queues, 0644);
> +MODULE_PARM_DESC(wrr_high_queues, "Number of queues to use for WRR high.");
> +
> +static int wrr_medium_queues = 0;
> +module_param_cb(wrr_medium_queues, &queue_count_ops, &wrr_medium_queues, 0644);
> +MODULE_PARM_DESC(wrr_medium_queues, "Number of queues to use for WRR medium.");
> +
> +static int wrr_low_queues = 0;
> +module_param_cb(wrr_low_queues, &queue_count_ops, &wrr_low_queues, 0644);
> +MODULE_PARM_DESC(wrr_low_queues, "Number of queues to use for WRR low.");
> +
>  struct nvme_dev;
>  struct nvme_queue;
>  
> @@ -226,9 +238,17 @@ struct nvme_iod {
>  	struct scatterlist *sg;
>  };
>  
> +static inline bool nvme_is_enable_wrr(struct nvme_dev *dev)
> +{
> +	return dev->io_queues[HCTX_TYPE_WRR_LOW] +
> +		dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
> +		dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
> +}

It looks like it might be confused with AMS (Arbitration Mechanism
Selected) in CC or CAP.  If it means how many irqs were allocated for the
sets, then can we give this function another name like:
	nvme_is_wrr_allocated or something else indicating the irq sets

> +
>  static unsigned int max_io_queues(void)
>  {
> -	return num_possible_cpus() + write_queues + poll_queues;
> +	return num_possible_cpus() + read_queues + poll_queues +
> +		wrr_high_queues + wrr_medium_queues + wrr_low_queues;
>  }
>  
>  static unsigned int max_queue_count(void)
> @@ -1534,11 +1558,46 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
>  	wmb(); /* ensure the first interrupt sees the initialization */
>  }
>  
> -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
> +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
>  {
>  	struct nvme_dev *dev = nvmeq->dev;
> -	int result;
> +	int start, end, result, wrr;
> +	bool polled = false;
>  	u16 vector = 0;
> +	enum hctx_type type;
> +
> +	/* 0 for admain queue, io queue index >= 1 */
> +	start = 1;
> +	/* get hardware context type base on qid */
> +	for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
> +		end = start + dev->io_queues[type] - 1;
> +		if (qid >= start && qid <= end)
> +			break;
> +		start = end + 1;
> +	}
> +
> +	if (nvme_is_enable_wrr(dev)) {

I think we need to check not only the irq set allocations, but also
whether the device really supports WRR or not.

> +		/* set read,poll,default to medium by default */
> +		switch (type) {
> +		case HCTX_TYPE_POLL:
> +			polled = true;

Question: Is the poll queue not available to be used in the !WRR case?

> +		case HCTX_TYPE_DEFAULT:
> +		case HCTX_TYPE_READ:
> +		case HCTX_TYPE_WRR_MEDIUM:
> +			wrr = NVME_SQ_PRIO_MEDIUM;

Also it seems like it could be named 'flags' because it will show the
SQ priority.  What do you think?

> +			break;
> +		case HCTX_TYPE_WRR_LOW:
> +			wrr = NVME_SQ_PRIO_LOW;
> +			break;
> +		case HCTX_TYPE_WRR_HIGH:
> +			wrr = NVME_SQ_PRIO_HIGH;
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	} else {
> +		wrr = 0;

Would it be different from the following value?
	NVME_SQ_PRIO_URGENT     = (0 << 1)
If it means no WRR, then can we avoid reusing a value which is already
defined?

> +	}
>  
>  	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
>  

> @@ -2028,35 +2079,73 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
>  static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
>  {
>  	struct nvme_dev *dev = affd->priv;
> -	unsigned int nr_read_queues;
> +	unsigned int nr_total, nr, nr_read, nr_default;
> +	unsigned int nr_wrr_high, nr_wrr_medium, nr_wrr_low;
> +	unsigned int nr_sets;
>  
>  	/*
>  	 * If there is no interupt available for queues, ensure that
>  	 * the default queue is set to 1. The affinity set size is
>  	 * also set to one, but the irq core ignores it for this case.
> -	 *
> -	 * If only one interrupt is available or 'write_queue' == 0, combine
> -	 * write and read queues.
> -	 *
> -	 * If 'write_queues' > 0, ensure it leaves room for at least one read
> -	 * queue.
>  	 */
> -	if (!nrirqs) {
> +	if (!nrirqs)
>  		nrirqs = 1;
> -		nr_read_queues = 0;
> -	} else if (nrirqs == 1 || !write_queues) {
> -		nr_read_queues = 0;
> -	} else if (write_queues >= nrirqs) {
> -		nr_read_queues = 1;
> -	} else {
> -		nr_read_queues = nrirqs - write_queues;
> -	}
>  
> -	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
> -	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
> -	affd->nr_sets = nr_read_queues ? 2 : 1;
> +	nr_total = nrirqs;
> +
> +	nr_read = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
> +
> +	/* set default to 1, add all the rest queue to default at last */
> +	nr = nr_default = 1;
> +	nr_sets = 1;
> +
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* read queues */
> +	nr_sets++;
> +	nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr low queues */
> +	nr_sets++;
> +	nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr medium queues */
> +	nr_sets++;
> +	nr_wrr_medium = nr = wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;

It looks like this exceeds 80 characters here.

> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr high queues */
> +	nr_sets++;
> +	nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
> +	nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;

Here also.

If I misunderstood something here, please feel free to let me know.

Thanks,

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 4/4] nvme: add support weighted round robin queue
  2019-06-20 14:16   ` Minwoo Im
@ 2019-06-21 14:38     ` Weiping Zhang
  0 siblings, 0 replies; 9+ messages in thread
From: Weiping Zhang @ 2019-06-21 14:38 UTC (permalink / raw)
  To: Minwoo Im
  Cc: Weiping Zhang, Jens Axboe, Tejun Heo, Christoph Hellwig,
	Bart Van Assche, linux-block, cgroups, linux-nvme,
	Javier González

Hi Minwoo,

Thanks for your feedback.

Minwoo Im <minwoo.im.dev@gmail.com> wrote on Thursday, June 20, 2019 at 10:17 PM:
>
> > -static int write_queues;
> > -module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
> > -MODULE_PARM_DESC(write_queues,
> > -     "Number of queues to use for writes. If not set, reads and writes "
> > +static int read_queues;
> > +module_param_cb(read_queues, &queue_count_ops, &read_queues, 0644);
> > +MODULE_PARM_DESC(read_queues,
> > +     "Number of queues to use for reads. If not set, reads and writes "
> >       "will share a queue set.");
>
> Before starting my review for this, I'd like to talk about this part
> first.  It would be better if you can split this change from this commit
> into a new one because it just replaced the write_queues with
> read_queues which is directly mapped to HCTX_TYPE_READ.  This change
> might not be critical for the WRR implementation.
>
Yes, I'll split it into a separate patch. The reason I renamed it to
read_queues is that it simplifies the calculation of the WRR-related
queue counts in nvme_calc_irq_sets() (quoted further below).
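For example (assuming, as the comment in nvme_calc_irq_sets() below suggests,
that any leftover vectors are folded back into the default set at the done
label): with nrirqs = 8, read_queues = 2, wrr_low_queues = 1,
wrr_medium_queues = 2 and wrr_high_queues = 4, the code reserves 1 default
vector, then takes 2 for read, 1 for WRR low, 2 for WRR medium, and only the
remaining 2 (not 4) for WRR high, ending up with
io_queues = {default 1, read 2, low 1, medium 2, high 2}.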
> >
> >  static int poll_queues = 0;
> >  module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
> >  MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
> >
> > +static int wrr_high_queues = 0;
>
> Nitpick here: maybe we don't need to 0-initialize static variables
> explicitly.
OK, I will rebase this patch set onto the nvme-5.3 branch.

>
> > +module_param_cb(wrr_high_queues, &queue_count_ops, &wrr_high_queues, 0644);
> > +MODULE_PARM_DESC(wrr_high_queues, "Number of queues to use for WRR high.");
> > +
> > +static int wrr_medium_queues = 0;
> > +module_param_cb(wrr_medium_queues, &queue_count_ops, &wrr_medium_queues, 0644);
> > +MODULE_PARM_DESC(wrr_medium_queues, "Number of queues to use for WRR medium.");
> > +
> > +static int wrr_low_queues = 0;
> > +module_param_cb(wrr_low_queues, &queue_count_ops, &wrr_low_queues, 0644);
> > +MODULE_PARM_DESC(wrr_low_queues, "Number of queues to use for WRR low.");
> > +
> >  struct nvme_dev;
> >  struct nvme_queue;
> >
> > @@ -226,9 +238,17 @@ struct nvme_iod {
> >       struct scatterlist *sg;
> >  };
> >
> > +static inline bool nvme_is_enable_wrr(struct nvme_dev *dev)
> > +{
> > +     return dev->io_queues[HCTX_TYPE_WRR_LOW] +
> > +             dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
> > +             dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
> > +}
>
> It looks like it might be confused with AMS (Arbitration Mechanism
> Selected) in CC or CAP.  If it means how many irqs were allocated for
> the sets, can we give this function another name like:
>         nvme_is_wrr_allocated or something else indicating the irq sets
>
Yes, we should detect AMS in CAP and CC; if WRR is not enabled, we should
ignore all of wrr_high/medium/low/urgent_queues.
From my point of view, this function is used to check whether NVMe WRR is
enabled, so it should check AMS in both CAP and CC.
We also need to define nvme_is_wrr_allocated, which will be used when we
create the I/O queues.
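Roughly like the sketch below (untested; the CAP.AMS bit position, the CC.AMS
field mask, and reading both values from dev->ctrl this way are my
assumptions, so the final form may differ):

static inline bool nvme_is_wrr_enabled(struct nvme_dev *dev)
{
	/* CAP.AMS bit 17: controller supports WRR with urgent priority class */
	if (!((dev->ctrl.cap >> 17) & 0x1))
		return false;

	/* CC.AMS (bits 13:11) must also select WRR with urgent priority */
	return (dev->ctrl.ctrl_config & (7 << NVME_CC_AMS_SHIFT)) ==
		NVME_CC_AMS_WRRU;
}

/* separate question: were any WRR irq sets actually allocated? */
static inline bool nvme_is_wrr_allocated(struct nvme_dev *dev)
{
	return dev->io_queues[HCTX_TYPE_WRR_LOW] +
		dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
		dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
}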
> > +
> >  static unsigned int max_io_queues(void)
> >  {
> > -     return num_possible_cpus() + write_queues + poll_queues;
> > +     return num_possible_cpus() + read_queues + poll_queues +
> > +             wrr_high_queues + wrr_medium_queues + wrr_low_queues;
> >  }
> >
> >  static unsigned int max_queue_count(void)
> > @@ -1534,11 +1558,46 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
> >       wmb(); /* ensure the first interrupt sees the initialization */
> >  }
> >
> > -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
> > +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
> >  {
> >       struct nvme_dev *dev = nvmeq->dev;
> > -     int result;
> > +     int start, end, result, wrr;
> > +     bool polled = false;
> >       u16 vector = 0;
> > +     enum hctx_type type;
> > +
> > +     /* 0 for admin queue, io queue index >= 1 */
> > +     start = 1;
> > +     /* get hardware context type base on qid */
> > +     for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
> > +             end = start + dev->io_queues[type] - 1;
> > +             if (qid >= start && qid <= end)
> > +                     break;
> > +             start = end + 1;
> > +     }
> > +
> > +     if (nvme_is_enable_wrr(dev)) {
>
> I think we need to check not only the irq set allocations, but also
> whether the device really supports WRR or not.
OK.
>
> > +             /* set read,poll,default to medium by default */
> > +             switch (type) {
> > +             case HCTX_TYPE_POLL:
> > +                     polled = true;
>
> Question: Is the poll queue not available to be used in the !WRR case?
>
Yes, I will fix it.
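Maybe something along these lines (just a sketch; nvme_is_wrr_enabled() and
NVME_SQ_PRIO_IGNORE are the helper and define discussed elsewhere in this
mail, not existing code):

	/* decide polling from the hctx type first, so poll queues keep
	 * working even when WRR arbitration is disabled */
	polled = (type == HCTX_TYPE_POLL);

	if (nvme_is_wrr_enabled(dev)) {
		switch (type) {
		case HCTX_TYPE_DEFAULT:
		case HCTX_TYPE_READ:
		case HCTX_TYPE_POLL:
		case HCTX_TYPE_WRR_MEDIUM:
			/* default, read and poll queues get medium priority */
			wrr = NVME_SQ_PRIO_MEDIUM;
			break;
		case HCTX_TYPE_WRR_LOW:
			wrr = NVME_SQ_PRIO_LOW;
			break;
		case HCTX_TYPE_WRR_HIGH:
			wrr = NVME_SQ_PRIO_HIGH;
			break;
		default:
			return -EINVAL;
		}
	} else {
		wrr = NVME_SQ_PRIO_IGNORE;
	}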
> > +             case HCTX_TYPE_DEFAULT:
> > +             case HCTX_TYPE_READ:
> > +             case HCTX_TYPE_WRR_MEDIUM:
> > +                     wrr = NVME_SQ_PRIO_MEDIUM;
>
> Also, it seems like it could be named as a flag, because it will carry
> the SQ priority.  What do you think?
>
It's OK, I will rename wrr to wrr_flag.
> > +                     break;
> > +             case HCTX_TYPE_WRR_LOW:
> > +                     wrr = NVME_SQ_PRIO_LOW;
> > +                     break;
> > +             case HCTX_TYPE_WRR_HIGH:
> > +                     wrr = NVME_SQ_PRIO_HIGH;
> > +                     break;
> > +             default:
> > +                     return -EINVAL;
> > +             }
> > +     } else {
> > +             wrr = 0;
>
> Would it be different from the following value?
>         NVME_SQ_PRIO_URGENT     = (0 << 1)
> If it means no WRR, then can we avoid reusing a value that is already
> defined?
It means no WRR, so I want to
#define NVME_SQ_PRIO_IGNORE NVME_SQ_PRIO_URGENT
because if NVMe WRR is not enabled, the controller should ignore this field.
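As a sketch of what I have in mind (the extra wrr_flag parameter for
adapter_alloc_sq() is my assumption about how the priority would be plumbed
through; the rest mirrors the existing function):

/* reuse the URGENT encoding (00b): the controller ignores the SQ priority
 * field completely when WRR arbitration is not enabled */
#define NVME_SQ_PRIO_IGNORE	NVME_SQ_PRIO_URGENT

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
			    struct nvme_queue *nvmeq, int wrr_flag)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | wrr_flag;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}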
>
> > +     }
> >
> >       clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
> >
>
> > @@ -2028,35 +2079,73 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
> >  static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
> >  {
> >       struct nvme_dev *dev = affd->priv;
> > -     unsigned int nr_read_queues;
> > +     unsigned int nr_total, nr, nr_read, nr_default;
> > +     unsigned int nr_wrr_high, nr_wrr_medium, nr_wrr_low;
> > +     unsigned int nr_sets;
> >
> >       /*
> >        * If there is no interupt available for queues, ensure that
> >        * the default queue is set to 1. The affinity set size is
> >        * also set to one, but the irq core ignores it for this case.
> > -      *
> > -      * If only one interrupt is available or 'write_queue' == 0, combine
> > -      * write and read queues.
> > -      *
> > -      * If 'write_queues' > 0, ensure it leaves room for at least one read
> > -      * queue.
> >        */
> > -     if (!nrirqs) {
> > +     if (!nrirqs)
> >               nrirqs = 1;
> > -             nr_read_queues = 0;
> > -     } else if (nrirqs == 1 || !write_queues) {
> > -             nr_read_queues = 0;
> > -     } else if (write_queues >= nrirqs) {
> > -             nr_read_queues = 1;
> > -     } else {
> > -             nr_read_queues = nrirqs - write_queues;
> > -     }
> >
> > -     dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> > -     affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> > -     dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
> > -     affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
> > -     affd->nr_sets = nr_read_queues ? 2 : 1;
> > +     nr_total = nrirqs;
> > +
> > +     nr_read = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
> > +
> > +     /* set default to 1, add all the rest queue to default at last */
> > +     nr = nr_default = 1;
> > +     nr_sets = 1;
> > +
> > +     nr_total -= nr;
> > +     if (!nr_total)
> > +             goto done;
> > +
> > +     /* read queues */
> > +     nr_sets++;
> > +     nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
> > +     nr_total -= nr;
> > +     if (!nr_total)
> > +             goto done;
> > +
> > +     /* wrr low queues */
> > +     nr_sets++;
> > +     nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
> > +     nr_total -= nr;
> > +     if (!nr_total)
> > +             goto done;
> > +
> > +     /* wrr medium queues */
> > +     nr_sets++;
> > +     nr_wrr_medium = nr = wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
>
> It looks like this exceeds 80 characters here.
I will fix it.
>
> > +     nr_total -= nr;
> > +     if (!nr_total)
> > +             goto done;
> > +
> > +     /* wrr high queues */
> > +     nr_sets++;
> > +     nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
> > +     nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
>
> Here also.
>
> If I misunderstood something here, please feel free to let me know.
>
Thanks very much for your feedback.
> Thanks,

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-06-21 14:38 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-16 10:14 [RFC PATCH v2 0/4] blkcg: add support weighted round robin tagset map Weiping Zhang
2019-06-16 10:15 ` [PATCH v2 1/4] block: add weighted round robin for blkcgroup Weiping Zhang
2019-06-16 10:15 ` [PATCH v2 2/4] null_blk: add support weighted round robin submition queue Weiping Zhang
2019-06-16 10:15 ` [PATCH v2 3/4] genirq/affinity: allow driver's discontigous affinity set Weiping Zhang
2019-06-16 10:15 ` [PATCH v2 4/4] nvme: add support weighted round robin queue Weiping Zhang
2019-06-18 13:18   ` Minwoo Im
2019-06-19 18:37   ` Chaitanya Kulkarni
2019-06-20 14:16   ` Minwoo Im
2019-06-21 14:38     ` Weiping Zhang
