From: Luiz Capitulino <lcapitulino-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
To: Pankaj Gupta <pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: kwolf-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
jack-AlSwsSmVLrQ@public.gmane.org,
xiaoguangrong.eric-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
kvm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
riel-ebMLmSuQjDVBDgjK7y7TUQ@public.gmane.org,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w@public.gmane.org,
david-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
ross.zwisler-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
qemu-devel-qX2TKyscuCcdnm+yROfE0A@public.gmane.org,
hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org,
imammedo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
stefanha-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
niteshnarayanlal-PkbjNfxxIARBDgjK7y7TUQ@public.gmane.org,
pbonzini-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
nilal-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
eblake-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org
Subject: Re: [PATCH 3/3] virtio-pmem: Add virtio pmem driver
Date: Wed, 12 Sep 2018 12:54:23 -0400 [thread overview]
Message-ID: <20180912125423.3ad0e034@doriath> (raw)
In-Reply-To: <20180831133019.27579-4-pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
On Fri, 31 Aug 2018 19:00:18 +0530
Pankaj Gupta <pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> This patch adds virtio-pmem driver for KVM guest.
>
> Guest reads the persistent memory range information from
> Qemu over VIRTIO and registers it on nvdimm_bus. It also
> creates a nd_region object with the persistent memory
> range information so that existing 'nvdimm/pmem' driver
> can reserve this into system memory map. This way
> 'virtio-pmem' driver uses existing functionality of pmem
> driver to register persistent memory compatible for DAX
> capable filesystems.
>
> This also provides function to perform guest flush over
> VIRTIO from 'pmem' driver when userspace performs flush
> on DAX memory range.
>
> Signed-off-by: Pankaj Gupta <pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> ---
> drivers/virtio/Kconfig | 9 ++
> drivers/virtio/Makefile | 1 +
> drivers/virtio/virtio_pmem.c | 255 +++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/virtio_ids.h | 1 +
> include/uapi/linux/virtio_pmem.h | 40 ++++++
> 5 files changed, 306 insertions(+)
> create mode 100644 drivers/virtio/virtio_pmem.c
> create mode 100644 include/uapi/linux/virtio_pmem.h
>
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index 3589764..a331e23 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -42,6 +42,15 @@ config VIRTIO_PCI_LEGACY
>
> If unsure, say Y.
>
> +config VIRTIO_PMEM
> + tristate "Support for virtio pmem driver"
> + depends on VIRTIO
> + help
> + This driver provides support for virtio based flushing interface
> + for persistent memory range.
> +
> + If unsure, say M.
> +
> config VIRTIO_BALLOON
> tristate "Virtio balloon driver"
> depends on VIRTIO
> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> index 3a2b5c5..cbe91c6 100644
> --- a/drivers/virtio/Makefile
> +++ b/drivers/virtio/Makefile
> @@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
> virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
> obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
> obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
> +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o
> diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c
> new file mode 100644
> index 0000000..c22cc87
> --- /dev/null
> +++ b/drivers/virtio/virtio_pmem.c
> @@ -0,0 +1,255 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * virtio_pmem.c: Virtio pmem Driver
> + *
> + * Discovers persistent memory range information
> + * from host and provides a virtio based flushing
> + * interface.
> + */
> +#include <linux/virtio.h>
> +#include <linux/module.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_config.h>
> +#include <uapi/linux/virtio_pmem.h>
> +#include <linux/spinlock.h>
> +#include <linux/libnvdimm.h>
> +#include <linux/nd.h>
> +
> +struct virtio_pmem_request {
> + /* Host return status corresponding to flush request */
> + int ret;
> +
> + /* command name*/
> + char name[16];
> +
> + /* Wait queue to process deferred work after ack from host */
> + wait_queue_head_t host_acked;
> + bool done;
> +
> + /* Wait queue to process deferred work after virt queue buffer avail */
> + wait_queue_head_t wq_buf;
> + bool wq_buf_avail;
> + struct list_head list;
> +};
> +
> +struct virtio_pmem {
> + struct virtio_device *vdev;
> +
> + /* Virtio pmem request queue */
> + struct virtqueue *req_vq;
> +
> + /* nvdimm bus registers virtio pmem device */
> + struct nvdimm_bus *nvdimm_bus;
> + struct nvdimm_bus_descriptor nd_desc;
> +
> + /* List to store deferred work if virtqueue is full */
> + struct list_head req_list;
> +
> + /* Synchronize virtqueue data */
> + spinlock_t pmem_lock;
> +
> + /* Memory region information */
> + uint64_t start;
> + uint64_t size;
> +};
> +
> +static struct virtio_device_id id_table[] = {
> + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID },
> + { 0 },
> +};
> +
> + /* The interrupt handler */
> +static void host_ack(struct virtqueue *vq)
> +{
> + unsigned int len;
> + unsigned long flags;
> + struct virtio_pmem_request *req, *req_buf;
> + struct virtio_pmem *vpmem = vq->vdev->priv;
> +
> + spin_lock_irqsave(&vpmem->pmem_lock, flags);
> + while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
> + req->done = true;
> + wake_up(&req->host_acked);
> +
> + if (!list_empty(&vpmem->req_list)) {
> + req_buf = list_first_entry(&vpmem->req_list,
> + struct virtio_pmem_request, list);
> + list_del(&vpmem->req_list);
> + req_buf->wq_buf_avail = true;
> + wake_up(&req_buf->wq_buf);
> + }
> + }
> + spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> +}
> + /* Initialize virt queue */
> +static int init_vq(struct virtio_pmem *vpmem)
> +{
> + struct virtqueue *vq;
> +
> + /* single vq */
> + vpmem->req_vq = vq = virtio_find_single_vq(vpmem->vdev,
> + host_ack, "flush_queue");
> + if (IS_ERR(vq))
> + return PTR_ERR(vq);
> +
> + spin_lock_init(&vpmem->pmem_lock);
> + INIT_LIST_HEAD(&vpmem->req_list);
> +
> + return 0;
> +};
> +
> + /* The request submission function */
> +static int virtio_pmem_flush(struct nd_region *nd_region)
> +{
> + int err;
> + unsigned long flags;
> + struct scatterlist *sgs[2], sg, ret;
> + struct virtio_device *vdev =
> + dev_to_virtio(nd_region->dev.parent->parent);
> + struct virtio_pmem *vpmem = vdev->priv;
This function seems to be missing a might_sleep() call: it allocates
with GFP_KERNEL and blocks in wait_event(), so it may sleep.
> + struct virtio_pmem_request *req = kmalloc(sizeof(*req), GFP_KERNEL);
> +
> + if (!req)
> + return -ENOMEM;
> +
> + req->done = req->wq_buf_avail = false;
> + strcpy(req->name, "FLUSH");
> + init_waitqueue_head(&req->host_acked);
> + init_waitqueue_head(&req->wq_buf);
> +
> + spin_lock_irqsave(&vpmem->pmem_lock, flags);
> + sg_init_one(&sg, req->name, strlen(req->name));
> + sgs[0] = &sg;
> + sg_init_one(&ret, &req->ret, sizeof(req->ret));
> + sgs[1] = &ret;
It seems that sg_init_one() only sets fields; if so, you can move the
spin_lock_irqsave() call down to this point.
> + err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req, GFP_ATOMIC);
> + if (err) {
> + dev_err(&vdev->dev, "failed to send command to virtio pmem device\n");
> +
> + list_add_tail(&vpmem->req_list, &req->list);
> + spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> +
> + /* When host has read buffer, this completes via host_ack */
> + wait_event(req->wq_buf, req->wq_buf_avail);
> + spin_lock_irqsave(&vpmem->pmem_lock, flags);
Is this error handling code assuming that at some point
virtqueue_add_sgs() will succeed for a different thread? If so,
what happens if that assumption is false? That is, what happens if
virtqueue_add_sgs() never succeeds again?
Why not just return an error?
> + }
> + virtqueue_kick(vpmem->req_vq);
> + spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> +
> + /* When host has read buffer, this completes via host_ack */
> + wait_event(req->host_acked, req->done);
> + err = req->ret;
If I'm understanding the QEMU code correctly, you're returning EIO
from QEMU if fsync() fails. I think this is wrong, since we don't know
if EIO in QEMU will be the same EIO in the guest. One way to solve this
would be to return 0 for success and 1 for failure from QEMU, and let the
guest implementation pick its error code (for your implementation it
could be EIO).
> + kfree(req);
> +
> + return err;
> +};
> +EXPORT_SYMBOL_GPL(virtio_pmem_flush);
> +
> +static int virtio_pmem_probe(struct virtio_device *vdev)
> +{
> + int err = 0;
> + struct resource res;
> + struct virtio_pmem *vpmem;
> + struct nvdimm_bus *nvdimm_bus;
> + struct nd_region_desc ndr_desc;
> + int nid = dev_to_node(&vdev->dev);
> + struct nd_region *nd_region;
> +
> + if (!vdev->config->get) {
> + dev_err(&vdev->dev, "%s failure: config disabled\n",
> + __func__);
> + return -EINVAL;
> + }
> +
> + vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem),
> + GFP_KERNEL);
> + if (!vpmem) {
> + err = -ENOMEM;
> + goto out_err;
> + }
> +
> + vpmem->vdev = vdev;
> + err = init_vq(vpmem);
> + if (err)
> + goto out_err;
> +
> + virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> + start, &vpmem->start);
> + virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> + size, &vpmem->size);
> +
> + res.start = vpmem->start;
> + res.end = vpmem->start + vpmem->size-1;
> + vpmem->nd_desc.provider_name = "virtio-pmem";
> + vpmem->nd_desc.module = THIS_MODULE;
> +
> + vpmem->nvdimm_bus = nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> + &vpmem->nd_desc);
> + if (!nvdimm_bus)
> + goto out_vq;
> +
> + dev_set_drvdata(&vdev->dev, nvdimm_bus);
> + memset(&ndr_desc, 0, sizeof(ndr_desc));
> +
> + ndr_desc.res = &res;
> + ndr_desc.numa_node = nid;
> + ndr_desc.flush = virtio_pmem_flush;
> + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> + nd_region = nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc);
> +
> + if (!nd_region)
> + goto out_nd;
> +
> + //virtio_device_ready(vdev);
> + return 0;
> +out_nd:
> + err = -ENXIO;
> + nvdimm_bus_unregister(nvdimm_bus);
> +out_vq:
> + vdev->config->del_vqs(vdev);
> +out_err:
> + dev_err(&vdev->dev, "failed to register virtio pmem memory\n");
> + return err;
> +}
> +
> +static void virtio_pmem_remove(struct virtio_device *vdev)
> +{
> + struct virtio_pmem *vpmem = vdev->priv;
> + struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> +
> + nvdimm_bus_unregister(nvdimm_bus);
> + vdev->config->del_vqs(vdev);
> + kfree(vpmem);
> +}
> +
> +#ifdef CONFIG_PM_SLEEP
> +static int virtio_pmem_freeze(struct virtio_device *vdev)
> +{
> + /* todo: handle freeze function */
> + return -EPERM;
> +}
> +
> +static int virtio_pmem_restore(struct virtio_device *vdev)
> +{
> + /* todo: handle restore function */
> + return -EPERM;
> +}
> +#endif
> +
> +
> +static struct virtio_driver virtio_pmem_driver = {
> + .driver.name = KBUILD_MODNAME,
> + .driver.owner = THIS_MODULE,
> + .id_table = id_table,
> + .probe = virtio_pmem_probe,
> + .remove = virtio_pmem_remove,
> +#ifdef CONFIG_PM_SLEEP
> + .freeze = virtio_pmem_freeze,
> + .restore = virtio_pmem_restore,
> +#endif
> +};
> +
> +module_virtio_driver(virtio_pmem_driver);
> +MODULE_DEVICE_TABLE(virtio, id_table);
> +MODULE_DESCRIPTION("Virtio pmem driver");
> +MODULE_LICENSE("GPL");
> diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
> index 6d5c3b2..3463895 100644
> --- a/include/uapi/linux/virtio_ids.h
> +++ b/include/uapi/linux/virtio_ids.h
> @@ -43,5 +43,6 @@
> #define VIRTIO_ID_INPUT 18 /* virtio input */
> #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */
> #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */
> +#define VIRTIO_ID_PMEM 25 /* virtio pmem */
>
> #endif /* _LINUX_VIRTIO_IDS_H */
> diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h
> new file mode 100644
> index 0000000..c7c22a5
> --- /dev/null
> +++ b/include/uapi/linux/virtio_pmem.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
> + * anyone can use the definitions to implement compatible drivers/servers:
> + *
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of IBM nor the names of its contributors
> + * may be used to endorse or promote products derived from this software
> + * without specific prior written permission.
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Copyright (C) Red Hat, Inc., 2018-2019
> + * Copyright (C) Pankaj Gupta <pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>, 2018
> + */
> +#ifndef _UAPI_LINUX_VIRTIO_PMEM_H
> +#define _UAPI_LINUX_VIRTIO_PMEM_H
> +
> +struct virtio_pmem_config {
> + __le64 start;
> + __le64 size;
> +};
> +#endif
next prev parent reply other threads:[~2018-09-12 16:54 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-08-31 13:30 [PATCH 0/3] kvm "fake DAX" device Pankaj Gupta
[not found] ` <20180831133019.27579-1-pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2018-08-31 13:30 ` [PATCH 1/3] nd: move nd_region to common header Pankaj Gupta
[not found] ` <20180831133019.27579-2-pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2018-09-22 0:47 ` Dan Williams
[not found] ` <CAPcyv4jFimkVnVuzza5TCG=KvY88KZnXzH4GNEgUBbTouprzJA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-09-24 11:40 ` Pankaj Gupta
2018-08-31 13:30 ` [PATCH 2/3] libnvdimm: nd_region flush callback support Pankaj Gupta
2018-09-04 15:29 ` kbuild test robot
[not found] ` <20180904152917.GE17047-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2018-09-05 8:40 ` Pankaj Gupta
2018-09-22 0:43 ` Dan Williams
2018-09-24 11:07 ` Pankaj Gupta
2018-08-31 13:30 ` [PATCH 3/3] virtio-pmem: Add virtio pmem driver Pankaj Gupta
[not found] ` <20180831133019.27579-4-pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2018-09-04 15:17 ` kbuild test robot
2018-09-05 8:34 ` Pankaj Gupta
2018-09-05 12:02 ` kbuild test robot
2018-09-12 16:54 ` Luiz Capitulino [this message]
2018-09-13 6:58 ` [Qemu-devel] " Pankaj Gupta
[not found] ` <831225077.12817716.1536821901550.JavaMail.zimbra-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2018-09-13 12:19 ` Luiz Capitulino
2018-09-14 12:13 ` Pankaj Gupta
2018-09-22 1:08 ` Dan Williams
2018-09-24 9:41 ` Pankaj Gupta
2018-09-27 13:06 ` Pankaj Gupta
2018-09-27 15:55 ` Dan Williams
2018-08-31 13:30 ` [PATCH] qemu: Add virtio pmem device Pankaj Gupta
[not found] ` <20180831133019.27579-5-pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2018-09-12 16:57 ` Luiz Capitulino
2018-09-13 7:06 ` Pankaj Gupta
[not found] ` <563893075.12819183.1536822387535.JavaMail.zimbra-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2018-09-13 12:22 ` Luiz Capitulino
2018-09-20 11:21 ` David Hildenbrand
[not found] ` <2721c3ee-88d1-a8e9-1f1e-ffc3eef1d1ca-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2018-09-20 12:03 ` [Qemu-devel] " Pankaj Gupta
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180912125423.3ad0e034@doriath \
--to=lcapitulino-h+wxahxf7alqt0dzr+alfa@public.gmane.org \
--cc=david-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=eblake-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org \
--cc=imammedo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=jack-AlSwsSmVLrQ@public.gmane.org \
--cc=kvm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=kwolf-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=linux-nvdimm-y27Ovi1pjclAfugRpC6u6w@public.gmane.org \
--cc=mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=nilal-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=niteshnarayanlal-PkbjNfxxIARBDgjK7y7TUQ@public.gmane.org \
--cc=pagupta-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=pbonzini-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=qemu-devel-qX2TKyscuCcdnm+yROfE0A@public.gmane.org \
--cc=riel-ebMLmSuQjDVBDgjK7y7TUQ@public.gmane.org \
--cc=ross.zwisler-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
--cc=stefanha-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=xiaoguangrong.eric-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).