Target-devel archive on lore.kernel.org
 help / color / Atom feed
From: Mike Christie <michael.christie@oracle.com>
To: martin.petersen@oracle.com, linux-scsi@vger.kernel.org,
	target-devel@vger.kernel.org, mst@redhat.com,
	jasowang@redhat.com, pbonzini@redhat.com, stefanha@redhat.com,
	virtualization@lists.linux-foundation.org
Subject: [PATCH 13/17] vhost: support multiple worker threads
Date: Thu, 22 Oct 2020 00:34:59 +0000
Message-ID: <1603326903-27052-14-git-send-email-michael.christie@oracle.com> (raw)
In-Reply-To: <1603326903-27052-1-git-send-email-michael.christie@oracle.com>

This is a prep patch to support multiple vhost worker threads per vhost
dev. This patch converts the code that had assumed a single worker
thread by:

1. Moving worker related fields to a new struct vhost_worker.
2. Converting vhost.c code to use the new struct and assume we will
have an array of workers.
3. It also exports a helper function that will be used in the last
patch when vhost-scsi is converted to use this new functionality.

Why do we need multiple worker threads?

For vhost-scsi, we do the initial submission and completion from the
vhost worker thread and after adding 2 vqs this single thread becomes a
bottleneck.

With the null_blk driver we max out at 360K IOPs when doing a random
workload like:

fio --direct=1 --rw=randrw --bs=4k --ioengine=libaio \
--iodepth=VQ_QUEUE_DEPTH --numjobs=NUM_VQS --filename  /dev/sdXYZ

where NUM_VQS gets up to 8 (number of cores per numa node on my system)
and VQ_QUEUE_DEPTH can be anywhere from 32 to 128.

With the patches in this set, we are able to get IOPs from a single
LUN up to 640K. And, With some other changes I am working on to the
LIO locking and binding worker threads to specific CPUs we can get this
up to 880K

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/vhost.c | 232 +++++++++++++++++++++++++++++++++++++++-----------
 drivers/vhost/vhost.h |  12 ++-
 2 files changed, 190 insertions(+), 54 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ca2e71..75866a2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -231,18 +231,48 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
-void vhost_work_dev_flush(struct vhost_dev *dev)
+static void vhost_work_queue_on(struct vhost_dev *dev, struct vhost_work *work,
+				int worker_id)
+{
+	if (!dev->workers)
+		return;
+
+	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+		/* We can only add the work to the list after we're
+		 * sure it was not in the list.
+		 * test_and_set_bit() implies a memory barrier.
+		 */
+		llist_add(&work->node, &dev->workers[worker_id]->work_list);
+		wake_up_process(dev->workers[worker_id]->task);
+	}
+}
+
+void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+{
+	vhost_work_queue_on(dev, work, 0);
+}
+EXPORT_SYMBOL_GPL(vhost_work_queue);
+
+static void vhost_work_flush_on(struct vhost_dev *dev, int worker_id)
 {
 	struct vhost_flush_struct flush;
 
-	if (dev->worker) {
+	if (dev->workers) {
 		init_completion(&flush.wait_event);
 		vhost_work_init(&flush.work, vhost_flush_work);
 
-		vhost_work_queue(dev, &flush.work);
+		vhost_work_queue_on(dev, &flush.work, worker_id);
 		wait_for_completion(&flush.wait_event);
 	}
 }
+
+void vhost_work_dev_flush(struct vhost_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < dev->num_workers; i++)
+		vhost_work_flush_on(dev, i);
+}
 EXPORT_SYMBOL_GPL(vhost_work_dev_flush);
 
 /* Flush any work that has been scheduled. When calling this, don't hold any
@@ -253,26 +283,20 @@ void vhost_poll_flush(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_flush);
 
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+/* A lockless hint for busy polling code to exit the loop */
+bool vhost_has_work(struct vhost_dev *dev)
 {
-	if (!dev->worker)
-		return;
+	int i;
 
-	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
-		/* We can only add the work to the list after we're
-		 * sure it was not in the list.
-		 * test_and_set_bit() implies a memory barrier.
-		 */
-		llist_add(&work->node, &dev->work_list);
-		wake_up_process(dev->worker);
+	if (!dev->workers)
+		return false;
+
+	for (i = 0; i < dev->num_workers; i++) {
+		if (!llist_empty(&dev->workers[i]->work_list))
+			return true;
 	}
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
 
-/* A lockless hint for busy polling code to exit the loop */
-bool vhost_has_work(struct vhost_dev *dev)
-{
-	return !llist_empty(&dev->work_list);
+	return false;
 }
 EXPORT_SYMBOL_GPL(vhost_has_work);
 
@@ -338,7 +362,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
+	struct vhost_worker *worker = data;
+	struct vhost_dev *dev = worker->dev;
 	struct vhost_work *work, *work_next;
 	struct llist_node *node;
 
@@ -352,8 +377,7 @@ static int vhost_worker(void *data)
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
-
-		node = llist_del_all(&dev->work_list);
+		node = llist_del_all(&worker->work_list);
 		if (!node)
 			schedule();
 
@@ -506,13 +530,13 @@ int vhost_dev_init(struct vhost_dev *dev,
 	dev->umem = NULL;
 	dev->iotlb = NULL;
 	dev->mm = NULL;
-	dev->worker = NULL;
+	dev->workers = NULL;
+	dev->num_workers = 0;
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
-	dev->byte_weight = byte_weight;
 	dev->use_worker = use_worker;
+	dev->byte_weight = byte_weight;
 	dev->msg_handler = msg_handler;
-	init_llist_head(&dev->work_list);
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
@@ -558,16 +582,28 @@ static void vhost_attach_cgroups_work(struct vhost_work *work)
 	s->ret = cgroup_attach_task_all(s->owner, current);
 }
 
-static int vhost_attach_cgroups(struct vhost_dev *dev)
+static int vhost_attach_cgroups_on(struct vhost_dev *dev, int worker_id)
 {
 	struct vhost_attach_cgroups_struct attach;
 
 	attach.owner = current;
 	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_work_dev_flush(dev);
+	vhost_work_queue_on(dev, &attach.work, worker_id);
+	vhost_work_flush_on(dev, worker_id);
 	return attach.ret;
 }
+static int vhost_attach_cgroups(struct vhost_dev *dev, int first_worker)
+{
+	int i, ret = 0;
+
+	for (i = first_worker; i < dev->num_workers; i++) {
+		ret = vhost_attach_cgroups_on(dev, i);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
 
 /* Caller should have device mutex */
 bool vhost_dev_has_owner(struct vhost_dev *dev)
@@ -606,10 +642,117 @@ static void vhost_detach_mm(struct vhost_dev *dev)
 	dev->mm = NULL;
 }
 
+static void vhost_workers_free(struct vhost_dev *dev)
+{
+	struct vhost_worker *worker;
+	int i;
+
+	if (!dev->workers)
+		return;
+
+	for (i = 0; i < dev->num_workers; i++) {
+		worker = dev->workers[i];
+
+		WARN_ON(!llist_empty(&worker->work_list));
+		kthread_stop(worker->task);
+		kfree(worker);
+	}
+
+	kfree(dev->workers);
+	dev->workers = NULL;
+	dev->num_workers = 0;
+}
+
+static int vhost_worker_create(struct vhost_dev *dev, int worker_id)
+{
+	struct vhost_worker *worker;
+	struct task_struct *task;
+	int ret;
+
+	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+	if (!worker)
+		return -ENOMEM;
+
+	init_llist_head(&worker->work_list);
+	worker->dev = dev;
+
+	task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto free_worker;
+	}
+
+	dev->workers[worker_id] = worker;
+	worker->task = task;
+	wake_up_process(task); /* avoid contributing to loadavg */
+	return 0;
+
+free_worker:
+	kfree(worker);
+	return ret;
+}
+
+/**
+ * vhost_workers_create - create vhost workers and attach to cgroup
+ * @dev: vhost device
+ * @new_num_workers: the total number of workers we want after this returns
+ *
+ * Caller must have the device mutex and have stopped operations that
+ * can access the workers array.
+ */
+int vhost_workers_create(struct vhost_dev *dev, int new_num_workers)
+{
+	struct vhost_worker **new_workers;
+	struct mm_struct *mm;
+	bool owner_match = true;
+	int i, err, start;
+
+	if (new_num_workers = dev->num_workers)
+		return 0;
+
+	if (new_num_workers < dev->num_workers)
+		return -EINVAL;
+
+	if (vhost_dev_has_owner(dev)) {
+		mm = get_task_mm(current);
+		if (mm != dev->mm)
+			owner_match = false;
+		mmput(mm);
+		if (!owner_match)
+			return -EBUSY;
+	}
+
+	new_workers = krealloc(dev->workers, new_num_workers * sizeof(*new_workers),
+			       GFP_KERNEL);
+	if (!new_workers) {
+		err = -ENOMEM;
+		goto free_workers;
+	}
+	dev->workers = new_workers;
+
+	start = dev->num_workers;
+	for (i = start; i < new_num_workers; i++) {
+		err = vhost_worker_create(dev, i);
+		if (err)
+			goto free_workers;
+		dev->num_workers++;
+	}
+
+	err = vhost_attach_cgroups(dev, start);
+	if (err)
+		goto free_workers;
+
+	return 0;
+
+free_workers:
+	vhost_workers_free(dev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(vhost_workers_create);
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-	struct task_struct *worker;
 	int err;
 
 	/* Is there an owner already? */
@@ -622,27 +765,16 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	dev->kcov_handle = kcov_common_handle();
 	if (dev->use_worker) {
-		worker = kthread_create(vhost_worker, dev,
-					"vhost-%d", current->pid);
-		if (IS_ERR(worker)) {
-			err = PTR_ERR(worker);
-			goto err_worker;
-		}
-
-		dev->worker = worker;
-		wake_up_process(worker); /* avoid contributing to loadavg */
-
-		err = vhost_attach_cgroups(dev);
+		/*
+		 * All drivers that set use_worker=true, use at least one
+		 * worker. Drivers like vhost-scsi may override this later.
+		 */
+		err = vhost_workers_create(dev, 1);
 		if (err)
-			goto err_cgroup;
+			goto err_worker;
 	}
 
 	return 0;
-err_cgroup:
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-	}
 err_worker:
 	vhost_detach_mm(dev);
 	dev->kcov_handle = 0;
@@ -735,12 +867,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	dev->iotlb = NULL;
 	vhost_clear_msg(dev);
 	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
-	WARN_ON(!llist_empty(&dev->work_list));
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-		dev->kcov_handle = 0;
-	}
+	vhost_workers_free(dev);
+	dev->kcov_handle = 0;
 	vhost_detach_mm(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 08c5aef..b0973e6 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -25,6 +25,12 @@ struct vhost_work {
 	unsigned long		  flags;
 };
 
+struct vhost_worker {
+	struct task_struct *task;
+	struct llist_head work_list;
+	struct vhost_dev *dev;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
@@ -39,6 +45,7 @@ struct vhost_poll {
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
+int vhost_workers_create(struct vhost_dev *dev, int new_num_workers);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 		     __poll_t mask, struct vhost_dev *dev);
@@ -152,8 +159,8 @@ struct vhost_dev {
 	int nvqs;
 	int max_nvqs;
 	struct eventfd_ctx *log_ctx;
-	struct llist_head work_list;
-	struct task_struct *worker;
+	struct vhost_worker **workers;
+	int num_workers;
 	struct vhost_iotlb *umem;
 	struct vhost_iotlb *iotlb;
 	spinlock_t iotlb_lock;
@@ -175,6 +182,7 @@ int vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
 		   int byte_weight, bool use_worker,
 		   int (*msg_handler)(struct vhost_dev *dev,
 				      struct vhost_iotlb_msg *msg));
+int vhost_vq_set_worker(struct vhost_virtqueue *vq, int worker_id);
 long vhost_dev_set_owner(struct vhost_dev *dev);
 bool vhost_dev_has_owner(struct vhost_dev *dev);
 long vhost_dev_check_owner(struct vhost_dev *);
-- 
1.8.3.1

  parent reply index

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-22  0:34 [PATCH 00/17 V3] vhost: fix scsi cmd handling and cgroup support Mike Christie
2020-10-22  0:34 ` [PATCH 01/17] vhost scsi: add lun parser helper Mike Christie
2020-10-26  3:33   ` Jason Wang
2020-10-22  0:34 ` [PATCH 02/17] vhost: remove work arg from vhost_work_flush Mike Christie
2020-10-22  0:51   ` Chaitanya Kulkarni
2020-10-22  0:34 ` [PATCH 03/17] vhost net: use goto error handling in open Mike Christie
2020-10-22  0:45   ` Chaitanya Kulkarni
2020-10-26  3:34   ` Jason Wang
2020-10-22  0:34 ` [PATCH 04/17] vhost: prep vhost_dev_init users to handle failures Mike Christie
2020-10-22  5:22   ` kernel test robot
2020-10-23 16:15   ` Mike Christie
2020-11-02  5:57   ` Jason Wang
2020-11-03 10:04   ` Dan Carpenter
2020-10-22  0:34 ` [PATCH 05/17] vhost: move vq iovec allocation to dev init time Mike Christie
2020-10-22  0:34 ` [PATCH 06/17] vhost: support delayed vq creation Mike Christie
2020-10-22  0:34 ` [PATCH 07/17] vhost scsi: support delayed IO " Mike Christie
2020-10-26  3:51   ` Jason Wang
2020-10-27  5:47     ` Mike Christie
2020-10-28  1:55       ` Jason Wang
2020-10-30  8:47       ` Michael S. Tsirkin
2020-10-30 16:30         ` Mike Christie
2020-10-30 17:26           ` Mike Christie
2020-11-01 22:06         ` Mike Christie
2020-11-02  6:36         ` Jason Wang
2020-11-02  6:49           ` Jason Wang
2020-11-02 16:19             ` Mike Christie
2020-10-22  0:34 ` [PATCH 08/17] vhost scsi: alloc cmds per vq instead of session Mike Christie
2020-10-22  0:34 ` [PATCH 09/17] vhost scsi: fix cmd completion race Mike Christie
2020-10-27 13:07   ` Maurizio Lombardi
2020-10-30  8:51   ` Michael S. Tsirkin
2020-10-30 16:04     ` Paolo Bonzini
2020-10-22  0:34 ` [PATCH 10/17] vhost scsi: Add support for LUN resets Mike Christie
2020-10-22  0:34 ` [PATCH 11/17] vhost scsi: remove extra flushes Mike Christie
2020-10-22  0:34 ` [PATCH 12/17] vhost poll: fix coding style Mike Christie
2020-10-22  0:39   ` Chaitanya Kulkarni
2020-10-22  0:34 ` Mike Christie [this message]
2020-10-22  0:35 ` [PATCH 14/17] vhost: poll support support multiple workers Mike Christie
2020-10-22  0:35 ` [PATCH 15/17] host: support delayed vq creation Mike Christie
2020-10-22  0:50   ` Mike Christie
2020-10-22  0:35 ` [PATCH 16/17] vhost scsi: multiple worker support Mike Christie
2020-10-22  0:35 ` [PATCH 17/17] vhost scsi: drop submission workqueue Mike Christie
2020-10-29 21:47 ` [PATCH 00/17 V3] vhost: fix scsi cmd handling and cgroup support Michael S. Tsirkin
2020-10-29 22:19   ` Mike Christie

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1603326903-27052-14-git-send-email-michael.christie@oracle.com \
    --to=michael.christie@oracle.com \
    --cc=jasowang@redhat.com \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=mst@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=target-devel@vger.kernel.org \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Target-devel archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/target-devel/0 target-devel/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 target-devel target-devel/ https://lore.kernel.org/target-devel \
		target-devel@vger.kernel.org
	public-inbox-index target-devel

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.target-devel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git