On Tue, May 25, 2021 at 01:05:58PM -0500, Mike Christie wrote: > This patch allows userspace to create workers and bind them to vqs, so you > can have N workers per dev and also share N workers with M vqs. The next > patch will allow sharing across devices. > > Signed-off-by: Mike Christie > --- > drivers/vhost/vhost.c | 94 +++++++++++++++++++++++++++++++- > drivers/vhost/vhost.h | 3 + > include/uapi/linux/vhost.h | 6 ++ > include/uapi/linux/vhost_types.h | 12 ++++ > 4 files changed, 113 insertions(+), 2 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 345ade0af133..981e9bac7a31 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -30,6 +30,7 @@ > #include > #include > #include > +#include > > #include "vhost.h" > > @@ -42,6 +43,9 @@ module_param(max_iotlb_entries, int, 0444); > MODULE_PARM_DESC(max_iotlb_entries, > "Maximum number of iotlb entries. (default: 2048)"); > > +static DEFINE_HASHTABLE(vhost_workers, 5); > +static DEFINE_SPINLOCK(vhost_workers_lock); > + > enum { > VHOST_MEMORY_F_LOG = 0x1, > }; > @@ -617,8 +621,17 @@ static void vhost_detach_mm(struct vhost_dev *dev) > dev->mm = NULL; > } > > -static void vhost_worker_free(struct vhost_worker *worker) > +static void vhost_worker_put(struct vhost_worker *worker) > { > + spin_lock(&vhost_workers_lock); > + if (!refcount_dec_and_test(&worker->refcount)) { > + spin_unlock(&vhost_workers_lock); > + return; > + } > + > + hash_del(&worker->h_node); > + spin_unlock(&vhost_workers_lock); > + > WARN_ON(!llist_empty(&worker->work_list)); > kthread_stop(worker->task); > kfree(worker); > @@ -632,7 +645,7 @@ static void vhost_workers_free(struct vhost_dev *dev) > return; > > for (i = 0; i < dev->num_workers; i++) > - vhost_worker_free(dev->workers[i]); > + vhost_worker_put(dev->workers[i]); > > kfree(dev->workers); > dev->num_workers = 0; > @@ -652,6 +665,8 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) > worker->id = dev->num_workers; > worker->dev = dev; > init_llist_head(&worker->work_list); > + INIT_HLIST_NODE(&worker->h_node); > + refcount_set(&worker->refcount, 1); > > task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid); > if (IS_ERR(task)) > @@ -664,6 +679,9 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) > if (ret) > goto stop_worker; > > + spin_lock(&vhost_workers_lock); > + hash_add(vhost_workers, &worker->h_node, worker->task->pid); > + spin_unlock(&vhost_workers_lock); > return worker; > > stop_worker: > @@ -673,6 +691,67 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) > return NULL; > } > > +static struct vhost_worker *vhost_worker_find(struct vhost_dev *dev, pid_t pid) > +{ > + struct vhost_worker *worker, *found_worker = NULL; > + > + spin_lock(&vhost_workers_lock); > + hash_for_each_possible(vhost_workers, worker, h_node, pid) { > + if (worker->task->pid == pid) { > + /* tmp - next patch allows sharing across devs */ > + if (worker->dev != dev) > + break; > + > + found_worker = worker; > + refcount_inc(&worker->refcount); > + break; > + } > + } > + spin_unlock(&vhost_workers_lock); > + return found_worker; > +} > + > +/* Caller must have device mutex */ > +static int vhost_vq_set_worker(struct vhost_virtqueue *vq, > + struct vhost_vring_worker *info) > +{ > + struct vhost_dev *dev = vq->dev; > + struct vhost_worker *worker; > + > + if (vq->worker) { > + /* TODO - support changing while works are running */ > + return -EBUSY; > + } > + > + if (info->pid == VHOST_VRING_NEW_WORKER) { > + worker = vhost_worker_create(dev); The maximum number of kthreads created is limited by vhost_dev_init(nvqs)? For example VHOST_SCSI_MAX_VQ 128. IIUC kthread_create is not limited by or accounted against the current task, so I'm a little worried that a process can create a lot of kthreads. I haven't investigated other kthread_create() users reachable from userspace applications to see how they bound the number of threads effectively. Any thoughts? > + if (!worker) > + return -ENOMEM; > + > + info->pid = worker->task->pid; > + } else { > + worker = vhost_worker_find(dev, info->pid); > + if (!worker) > + return -ENODEV; > + } > + > + if (!dev->workers) { > + dev->workers = kcalloc(vq->dev->nvqs, > + sizeof(struct vhost_worker *), > + GFP_KERNEL); Another candidate for GFP_KERNEL_ACCOUNT. > + if (!dev->workers) { > + vhost_worker_put(worker); > + return -ENOMEM; > + } > + } > + > + vq->worker = worker; > + > + dev->workers[dev->num_workers] = worker; > + dev->num_workers++; Hmm...should we really append to workers[] in the vhost_worker_find() case? > + return 0; > +} > + > /* Caller must have device mutex */ > static int vhost_worker_try_create_def(struct vhost_dev *dev) > { > @@ -1680,6 +1759,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg > struct eventfd_ctx *ctx = NULL; > u32 __user *idxp = argp; > struct vhost_virtqueue *vq; > + struct vhost_vring_worker w; > struct vhost_vring_state s; > struct vhost_vring_file f; > u32 idx; > @@ -1794,6 +1874,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg > if (copy_to_user(argp, &s, sizeof(s))) > r = -EFAULT; > break; > + case VHOST_SET_VRING_WORKER: > + if (copy_from_user(&w, argp, sizeof(w))) { > + r = -EFAULT; > + break; > + } > + r = vhost_vq_set_worker(vq, &w); > + if (!r && copy_to_user(argp, &w, sizeof(w))) > + r = -EFAULT; > + break; > default: > r = -ENOIOCTLCMD; > } > @@ -2726,6 +2815,7 @@ EXPORT_SYMBOL_GPL(vhost_set_backend_features); > > static int __init vhost_init(void) > { > + hash_init(vhost_workers); > return 0; > } > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 0a252dd45101..75b884ad1f17 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -14,6 +14,7 @@ > #include > #include > #include > +#include > > struct vhost_work; > typedef void (*vhost_work_fn_t)(struct vhost_work *work); > @@ -28,6 +29,8 @@ struct vhost_work { > struct vhost_worker { > struct task_struct *task; > struct llist_head work_list; > + struct hlist_node h_node; h_node is a generic name. If you're willing to use a longer name then vhost_workers_node would make it clear which hlist this is associated with. > + refcount_t refcount; > struct vhost_dev *dev; > int id; > }; > diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h > index c998860d7bbc..ce32119cb139 100644 > --- a/include/uapi/linux/vhost.h > +++ b/include/uapi/linux/vhost.h > @@ -70,6 +70,12 @@ > #define VHOST_VRING_BIG_ENDIAN 1 > #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) > #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > +/* Create/bind a vhost worker to a virtqueue. If pid > 0 and matches an existing > + * vhost_worker thread it will be bound to the vq. If pid is > + * VHOST_VRING_NEW_WORKER, then a new worker will be created and bound to the > + * vq. > + */ Please document when this ioctl must be called (before kick is set up). > +#define VHOST_SET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x15, struct vhost_vring_worker) > > /* The following ioctls use eventfd file descriptors to signal and poll > * for events. */ > diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h > index f7f6a3a28977..5113baa8bc3e 100644 > --- a/include/uapi/linux/vhost_types.h > +++ b/include/uapi/linux/vhost_types.h > @@ -47,6 +47,18 @@ struct vhost_vring_addr { > __u64 log_guest_addr; > }; > > +#define VHOST_VRING_NEW_WORKER -1 > + > +struct vhost_vring_worker { > + unsigned int index; > + /* > + * The pid of the vhost worker that the vq will be bound to. If > + * pid is VHOST_VRING_NEW_WORKER a new worker will be created and it's s/it's/its/ > + * pid will be returned in pid. > + */ > + __kernel_pid_t pid; > +}; > + > /* no alignment requirement */ > struct vhost_iotlb_msg { > __u64 iova; > -- > 2.25.1 >