From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
To: sagi@grimberg.me, hch@lst.de
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>, linux-nvme@lists.infradead.org
Subject: [RFC PATCH 1/2] nvmet: add bdev-ns polling support
Date: Mon, 9 Dec 2019 22:25:56 -0800
Message-ID: <20191210062557.5171-2-chaitanya.kulkarni@wdc.com>
In-Reply-To: <20191210062557.5171-1-chaitanya.kulkarni@wdc.com>

This patch adds support for bdev-ns polling. The polling code lives in
a new file (io-poll.c) to keep it separate. A newly added configfs
attribute, use_poll, allows the user to enable or disable polling per
namespace. We only enable polling for READ/WRITE operations, and only
when the bdev supports polling and use_poll is set. For each enabled
namespace we create a set of polling threads. For the general approach,
please have a look at the cover letter of this patch series.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
---
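A usage sketch, not part of the commit itself: the subsystem name
"testnqn", namespace 1, and the nvme0n1 backing device below are
illustrative assumptions, not values taken from this patch. use_poll
can only be changed while the namespace is disabled, and polling only
takes effect if the backing queue advertises poll support:

    # check that the backing device supports polled I/O (QUEUE_FLAG_POLL)
    cat /sys/block/nvme0n1/queue/io_poll

    # toggle use_poll with the namespace disabled, then re-enable it
    echo 0 > /sys/kernel/config/nvmet/subsystems/testnqn/namespaces/1/enable
    echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/namespaces/1/use_poll
    echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/namespaces/1/enable

With polling active, READ/WRITE bios are submitted with REQ_HIPRI and
reaped by the per-namespace poll threads (two per online CPU) instead
of the interrupt-driven bio completion path.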
 drivers/nvme/target/Makefile      |   3 +-
 drivers/nvme/target/configfs.c    |  29 ++++++
 drivers/nvme/target/core.c        |  13 +++
 drivers/nvme/target/io-cmd-bdev.c |  61 +++++++++--
 drivers/nvme/target/io-poll.c     | 165 ++++++++++++++++++++++++++++++
 drivers/nvme/target/nvmet.h       |  31 +++++-
 6 files changed, 291 insertions(+), 11 deletions(-)
 create mode 100644 drivers/nvme/target/io-poll.c

diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 2b33836f3d3e..8877ba16305c 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -10,7 +10,8 @@ obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
 obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
-			discovery.o io-cmd-file.o io-cmd-bdev.o
+			discovery.o io-cmd-file.o io-cmd-bdev.o \
+			io-poll.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 98613a45bd3b..0e6e8b0dbf79 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -545,6 +545,34 @@ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
 
 CONFIGFS_ATTR(nvmet_ns_, buffered_io);
 
+static ssize_t nvmet_ns_use_poll_show(struct config_item *item, char *page)
+{
+	return sprintf(page, "%d\n", to_nvmet_ns(item)->use_poll);
+}
+
+static ssize_t nvmet_ns_use_poll_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_ns *ns = to_nvmet_ns(item);
+	bool val;
+
+	if (strtobool(page, &val))
+		return -EINVAL;
+
+	mutex_lock(&ns->subsys->lock);
+	if (ns->enabled) {
+		pr_err("disable ns before setting use_poll value.\n");
+		mutex_unlock(&ns->subsys->lock);
+		return -EINVAL;
+	}
+
+	ns->use_poll = val;
+	mutex_unlock(&ns->subsys->lock);
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, use_poll);
+
 static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_device_path,
 	&nvmet_ns_attr_device_nguid,
@@ -552,6 +580,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_ana_grpid,
 	&nvmet_ns_attr_enable,
 	&nvmet_ns_attr_buffered_io,
+	&nvmet_ns_attr_use_poll,
 #ifdef CONFIG_PCI_P2PDMA
 	&nvmet_ns_attr_p2pmem,
 #endif
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 28438b833c1b..d8f9130d1cd1 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -510,6 +510,18 @@ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 		ns->nsid);
 }
 
+inline void nvmet_req_done(struct nvmet_req *req)
+{
+	if (req->ns->bdev)
+		nvmet_bdev_req_complete(req);
+}
+
+inline void nvmet_req_poll_complete(struct nvmet_req *req)
+{
+	if (req->ns->bdev)
+		nvmet_bdev_poll_complete(req);
+}
+
 int nvmet_ns_enable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
@@ -653,6 +665,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 
 	uuid_gen(&ns->uuid);
 	ns->buffered_io = false;
+	ns->use_poll = false;
 
 	return ns;
 }
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index b6fca0e421ef..13507e0cbc1d 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -49,10 +49,11 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
 
 int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 {
+	int fmode = FMODE_READ | FMODE_WRITE;
+	struct request_queue *q;
 	int ret;
 
-	ns->bdev = blkdev_get_by_path(ns->device_path,
-			FMODE_READ | FMODE_WRITE, NULL);
+	ns->bdev = blkdev_get_by_path(ns->device_path, fmode, NULL);
 	if (IS_ERR(ns->bdev)) {
 		ret = PTR_ERR(ns->bdev);
 		if (ret != -ENOTBLK) {
@@ -60,16 +61,21 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 			ns->device_path, PTR_ERR(ns->bdev));
 		}
 		ns->bdev = NULL;
-		return ret;
+		goto out;
 	}
 	ns->size = i_size_read(ns->bdev->bd_inode);
 	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
-	return 0;
+	q = bdev_get_queue(ns->bdev);
+	ns->poll = ns->use_poll && test_bit(QUEUE_FLAG_POLL, &q->queue_flags);
+	ret = ns->poll ? nvmet_ns_start_poll(ns) : 0;
+out:
+	return ret;
 }
 
 void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
 {
 	if (ns->bdev) {
+		if (ns->poll) nvmet_ns_stop_poll(ns);
 		blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
 		ns->bdev = NULL;
 	}
@@ -133,15 +139,39 @@ static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
 	return status;
 }
 
-static void nvmet_bio_done(struct bio *bio)
+void nvmet_bdev_req_complete(struct nvmet_req *req)
 {
-	struct nvmet_req *req = bio->bi_private;
+	struct bio *bio = req->b.last_bio;
 
 	nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
 	if (bio != &req->b.inline_bio)
 		bio_put(bio);
 }
 
+static void nvmet_bio_done(struct bio *bio)
+{
+	struct nvmet_req *req = bio->bi_private;
+
+	req->b.last_bio = bio;
+
+	req->poll ? complete(&req->wait) : nvmet_bdev_req_complete(req);
+}
+
+void nvmet_bdev_poll_complete(struct nvmet_req *req)
+{
+	struct request_queue *q = bdev_get_queue(req->ns->bdev);
+
+	while (!completion_done(&req->wait)) {
+		int ret = blk_poll(q, req->b.cookie, true);
+
+		if (ret < 0) {
+			pr_err("tid %d poll error %d\n", req->t->id, ret);
+			break;
+		}
+	}
+	nvmet_bdev_req_complete(req);
+}
+
 static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 {
 	int sg_cnt = req->sg_cnt;
@@ -185,7 +215,8 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 	bio->bi_end_io = nvmet_bio_done;
 	bio->bi_opf = op;
 
-	blk_start_plug(&plug);
+	if (!req->poll)
+		blk_start_plug(&plug);
 	for_each_sg(req->sg, sg, req->sg_cnt, i) {
 		while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
 				!= sg->length) {
@@ -204,8 +235,16 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 		sg_cnt--;
 	}
 
-	submit_bio(bio);
-	blk_finish_plug(&plug);
+	req->b.last_bio = bio;
+	if (req->poll)
+		req->b.last_bio->bi_opf |= REQ_HIPRI;
+
+	req->b.cookie = submit_bio(bio);
+
+	nvmet_req_prep_poll(req);
+	nvmet_req_issue_poll(req);
+	if (!req->poll)
+		blk_finish_plug(&plug);
 }
 
 static void nvmet_bdev_execute_flush(struct nvmet_req *req)
@@ -330,15 +369,19 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
 	switch (cmd->common.opcode) {
 	case nvme_cmd_read:
 	case nvme_cmd_write:
+		req->poll = req->ns->poll;
 		req->execute = nvmet_bdev_execute_rw;
 		return 0;
 	case nvme_cmd_flush:
+		req->poll = false;
 		req->execute = nvmet_bdev_execute_flush;
 		return 0;
 	case nvme_cmd_dsm:
+		req->poll = false;
 		req->execute = nvmet_bdev_execute_dsm;
 		return 0;
 	case nvme_cmd_write_zeroes:
+		req->poll = false;
 		req->execute = nvmet_bdev_execute_write_zeroes;
 		return 0;
 	default:
diff --git a/drivers/nvme/target/io-poll.c b/drivers/nvme/target/io-poll.c
new file mode 100644
index 000000000000..175c939c22ff
--- /dev/null
+++ b/drivers/nvme/target/io-poll.c
@@ -0,0 +1,165 @@
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/sched/signal.h>
+#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
+
+#include "nvmet.h"
+#define THREAD_PER_CPU	(num_online_cpus() * 2)
+
+static int nvmet_poll_thread(void *data);
+
+int nvmet_ns_start_poll(struct nvmet_ns *ns)
+{
+	struct nvmet_poll_data *t;
+	int ret = 0;
+	int i;
+
+	t = kzalloc(sizeof(*t) * THREAD_PER_CPU, GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < THREAD_PER_CPU; i++) {
+		init_completion(&t[i].thread_complete);
+		init_waitqueue_head(&t[i].poll_waitq);
+		INIT_LIST_HEAD(&t[i].list);
+		INIT_LIST_HEAD(&t[i].done);
+		mutex_init(&t[i].list_lock);
+		t[i].id = i;
+
+		t[i].thread = kthread_create(nvmet_poll_thread, &t[i],
+					     "nvmet_poll_thread%s/%d",
+					     ns->device_path, i);
+
+		if (IS_ERR(t[i].thread)) {
+			ret = PTR_ERR(t[i].thread);
+			goto err;
+		}
+
+		kthread_bind(t[i].thread, i % num_online_cpus());
+		wake_up_process(t[i].thread);
+		wait_for_completion(&t[i].thread_complete);
+	}
+
+	ns->t = t;
+out:
+	return ret;
+err:
+	nvmet_ns_stop_poll(ns);
+	goto out;
+}
+
+void nvmet_ns_stop_poll(struct nvmet_ns *ns)
+{
+	struct nvmet_poll_data *t = ns->t;
+	int i;
+
+	for (i = 0; i < THREAD_PER_CPU; i++) {
+		if (!t[i].thread)
+			continue;
+
+		if (wq_has_sleeper(&t[i].poll_waitq))
+			wake_up(&t[i].poll_waitq);
+		kthread_park(t[i].thread);
+		kthread_stop(t[i].thread);
+	}
+}
+
+static void nvmet_poll(struct nvmet_poll_data *t)
+{
+	struct nvmet_req *req, *tmp;
+
+	lockdep_assert_held(&t->list_lock);
+
+	list_for_each_entry_safe(req, tmp, &t->list, poll_entry) {
+
+		if (completion_done(&req->wait)) {
+			list_move_tail(&req->poll_entry, &t->done);
+			continue;
+		}
+
+		if (!list_empty(&t->done))
+			break;
+
+		list_del(&req->poll_entry);
+		mutex_unlock(&t->list_lock);
+		nvmet_req_poll_complete(req);
+		mutex_lock(&t->list_lock);
+	}
+	mutex_unlock(&t->list_lock);
+	/*
+	 * In the future we could also add a batch timeout or request count.
+	 */
+	while (!list_empty(&t->done)) {
+		/*
+		 * We lock and unlock t->list_lock, which guarantees progress
+		 * of nvmet_xxx_rw_execute() when under pressure while we
+		 * complete the request.
+		 */
+		req = list_first_entry(&t->done, struct nvmet_req, poll_entry);
+		list_del(&req->poll_entry);
+		nvmet_req_done(req);
+	}
+
+	mutex_lock(&t->list_lock);
+}
+
+static int nvmet_poll_thread(void *data)
+{
+	struct nvmet_poll_data *t = (struct nvmet_poll_data *) data;
+	DEFINE_WAIT(wait);
+
+	complete(&t->thread_complete);
+
+	while (!kthread_should_park()) {
+
+		mutex_lock(&t->list_lock);
+		while (!list_empty(&t->list) && !need_resched())
+			nvmet_poll(t);
+		mutex_unlock(&t->list_lock);
+
+		prepare_to_wait(&t->poll_waitq, &wait, TASK_INTERRUPTIBLE);
+		if (signal_pending(current))
+			flush_signals(current);
+		smp_mb();
+		schedule();
+
+		finish_wait(&t->poll_waitq, &wait);
+	}
+
+	kthread_parkme();
+	return 0;
+}
+
+inline void nvmet_req_prep_poll(struct nvmet_req *req)
+{
+	if (!req->poll)
+		return;
+
+	init_completion(&req->wait);
+	req->t = &req->ns->t[smp_processor_id()];
+}
+
+inline void nvmet_req_issue_poll(struct nvmet_req *req)
+{
+	if (!req->poll)
+		return;
+
+	while (!mutex_trylock(&req->t->list_lock)) {
+		if (req->t->id < num_online_cpus())
+			req->t = &req->ns->t[req->t->id + num_online_cpus()];
+		else
+			req->t = &req->ns->t[req->t->id - num_online_cpus()];
+	}
+
+	if (completion_done(&req->wait))
+		list_add(&req->poll_entry, &req->t->list);
+	else
+		list_add_tail(&req->poll_entry, &req->t->list);
+	mutex_unlock(&req->t->list_lock);
+
+	if (wq_has_sleeper(&req->t->poll_waitq))
+		wake_up(&req->t->poll_waitq);
+}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 46df45e837c9..ef2919e23e0b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -49,11 +49,22 @@
 #define IPO_IATTR_CONNECT_SQE(x)	\
 	(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
 
+struct nvmet_poll_data {
+	struct completion	thread_complete;
+	wait_queue_head_t	poll_waitq;
+	struct mutex		list_lock;
+	struct task_struct	*thread;
+	struct list_head	list;
+	struct list_head	done;
+	unsigned int		id;
+};
+
 struct nvmet_ns {
 	struct list_head	dev_link;
 	struct percpu_ref	ref;
 	struct block_device	*bdev;
 	struct file		*file;
+	struct nvmet_poll_data	*t;
 	bool			readonly;
 	u32			nsid;
 	u32			blksize_shift;
@@ -63,6 +74,8 @@ struct nvmet_ns {
 	u32			anagrpid;
 
 	bool			buffered_io;
+	bool			use_poll;
+	bool			poll;
 	bool			enabled;
 	struct nvmet_subsys	*subsys;
 	const char		*device_path;
@@ -292,9 +305,15 @@ struct nvmet_req {
 	struct nvmet_ns		*ns;
 	struct scatterlist	*sg;
 	struct bio_vec		inline_bvec[NVMET_MAX_INLINE_BIOVEC];
+	struct completion	wait;
+	bool			poll;
+	struct nvmet_poll_data	*t;
+	struct list_head	poll_entry;
 	union {
 		struct {
-			struct bio      inline_bio;
+			struct bio	inline_bio;
+			blk_qc_t	cookie;
+			struct bio	*last_bio;
 		} b;
 		struct {
 			bool			mpool_alloc;
@@ -442,6 +461,16 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
 void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 		u8 event_info, u8 log_page);
 
+int nvmet_ns_start_poll(struct nvmet_ns *ns);
+void nvmet_ns_stop_poll(struct nvmet_ns *ns);
+void nvmet_req_prep_poll(struct nvmet_req *req);
+void nvmet_req_issue_poll(struct nvmet_req *req);
+
+void nvmet_req_poll_complete(struct nvmet_req *req);
+void nvmet_bdev_poll_complete(struct nvmet_req *req);
+void nvmet_bdev_req_complete(struct nvmet_req *req);
+void nvmet_req_done(struct nvmet_req *req);
+
 #define NVMET_QUEUE_SIZE	1024
 #define NVMET_NR_QUEUES		128
 #define NVMET_MAX_CMD		NVMET_QUEUE_SIZE
-- 
2.22.1