From: Jason Wang <jasowang@redhat.com>
To: Yongji Xie <xieyongji@bytedance.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>,
	"Stefan Hajnoczi" <stefanha@redhat.com>,
	"Stefano Garzarella" <sgarzare@redhat.com>,
	"Parav Pandit" <parav@nvidia.com>, "Bob Liu" <bob.liu@oracle.com>,
	"Christoph Hellwig" <hch@infradead.org>,
	"Randy Dunlap" <rdunlap@infradead.org>,
	"Matthew Wilcox" <willy@infradead.org>,
	viro@zeniv.linux.org.uk, "Jens Axboe" <axboe@kernel.dk>,
	bcrl@kvack.org, "Jonathan Corbet" <corbet@lwn.net>,
	"Mika Penttilä" <mika.penttila@nextfour.com>,
	"Dan Carpenter" <dan.carpenter@oracle.com>,
	virtualization@lists.linux-foundation.org,
	netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH v5 08/11] vduse: Implement an MMU-based IOMMU driver
Date: Fri, 26 Mar 2021 12:26:59 +0800	[thread overview]
Message-ID: <2db71996-037e-494d-6ef0-de3ff164d3c3@redhat.com> (raw)
In-Reply-To: <CACycT3uS870yy04rw7KBk==sioi+VNunxVz6BQH-Lmxk6m-VSg@mail.gmail.com>


On 2021/3/25 3:38 PM, Yongji Xie wrote:
> On Thu, Mar 25, 2021 at 12:53 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/3/24 3:39 PM, Yongji Xie wrote:
>>> On Wed, Mar 24, 2021 at 11:54 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2021/3/15 1:37 PM, Xie Yongji wrote:
>>>>> This implements an MMU-based IOMMU driver to support mapping
>>>>> kernel dma buffer into userspace. The basic idea behind it is
>>>>> treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
>>>>> up MMU mapping instead of IOMMU mapping for the DMA transfer so
>>>>> that the userspace process is able to use its virtual address to
>>>>> access the dma buffer in kernel.
>>>>>
>>>>> And to avoid security issue, a bounce-buffering mechanism is
>>>>> introduced to prevent userspace accessing the original buffer
>>>>> directly.
>>>>>
>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
>>>>> ---
>>>>>     drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
>>>>>     drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
>>>>>     2 files changed, 610 insertions(+)
>>>>>     create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
>>>>>     create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
>>>>>
>>>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
>>>>> new file mode 100644
>>>>> index 000000000000..83de216b0e51
>>>>> --- /dev/null
>>>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
>>>>> @@ -0,0 +1,535 @@
>>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>>> +/*
>>>>> + * MMU-based IOMMU implementation
>>>>> + *
>>>>> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
>>>> 2021 as well.
>>>>
>>> Sure.
>>>
>>>>> + *
>>>>> + * Author: Xie Yongji <xieyongji@bytedance.com>
>>>>> + *
>>>>> + */
>>>>> +
>>>>> +#include <linux/slab.h>
>>>>> +#include <linux/file.h>
>>>>> +#include <linux/anon_inodes.h>
>>>>> +#include <linux/highmem.h>
>>>>> +#include <linux/vmalloc.h>
>>>>> +#include <linux/vdpa.h>
>>>>> +
>>>>> +#include "iova_domain.h"
>>>>> +
>>>>> +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
>>>>> +                              u64 start, u64 last,
>>>>> +                              u64 addr, unsigned int perm,
>>>>> +                              struct file *file, u64 offset)
>>>>> +{
>>>>> +     struct vdpa_map_file *map_file;
>>>>> +     int ret;
>>>>> +
>>>>> +     map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
>>>>> +     if (!map_file)
>>>>> +             return -ENOMEM;
>>>>> +
>>>>> +     map_file->file = get_file(file);
>>>>> +     map_file->offset = offset;
>>>>> +
>>>>> +     ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
>>>>> +                                     addr, perm, map_file);
>>>>> +     if (ret) {
>>>>> +             fput(map_file->file);
>>>>> +             kfree(map_file);
>>>>> +             return ret;
>>>>> +     }
>>>>> +     return 0;
>>>>> +}
>>>>> +
>>>>> +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
>>>>> +                               u64 start, u64 last)
>>>>> +{
>>>>> +     struct vdpa_map_file *map_file;
>>>>> +     struct vhost_iotlb_map *map;
>>>>> +
>>>>> +     while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
>>>>> +             map_file = (struct vdpa_map_file *)map->opaque;
>>>>> +             fput(map_file->file);
>>>>> +             kfree(map_file);
>>>>> +             vhost_iotlb_map_free(domain->iotlb, map);
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
>>>>> +                      struct vhost_iotlb *iotlb)
>>>>> +{
>>>>> +     struct vdpa_map_file *map_file;
>>>>> +     struct vhost_iotlb_map *map;
>>>>> +     u64 start = 0ULL, last = ULLONG_MAX;
>>>>> +     int ret;
>>>>> +
>>>>> +     spin_lock(&domain->iotlb_lock);
>>>>> +     vduse_iotlb_del_range(domain, start, last);
>>>>> +
>>>>> +     for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
>>>>> +          map = vhost_iotlb_itree_next(map, start, last)) {
>>>>> +             map_file = (struct vdpa_map_file *)map->opaque;
>>>>> +             ret = vduse_iotlb_add_range(domain, map->start, map->last,
>>>>> +                                         map->addr, map->perm,
>>>>> +                                         map_file->file,
>>>>> +                                         map_file->offset);
>>>>> +             if (ret)
>>>>> +                     goto err;
>>>>> +     }
>>>>> +     spin_unlock(&domain->iotlb_lock);
>>>>> +
>>>>> +     return 0;
>>>>> +err:
>>>>> +     vduse_iotlb_del_range(domain, start, last);
>>>>> +     spin_unlock(&domain->iotlb_lock);
>>>>> +     return ret;
>>>>> +}
>>>>> +
>>>>> +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
>>>>> +                                      u64 iova, u64 size, u64 paddr)
>>>>> +{
>>>>> +     struct vduse_bounce_map *map;
>>>>> +     unsigned int index;
>>>>> +     u64 last = iova + size - 1;
>>>>> +
>>>>> +     while (iova < last) {
>>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
>>>>> +             map->orig_phys[index] = paddr;
>>>>> +             paddr += IOVA_ALLOC_SIZE;
>>>>> +             iova += IOVA_ALLOC_SIZE;
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
>>>>> +                                        u64 iova, u64 size)
>>>>> +{
>>>>> +     struct vduse_bounce_map *map;
>>>>> +     unsigned int index;
>>>>> +     u64 last = iova + size - 1;
>>>>> +
>>>>> +     while (iova < last) {
>>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +             index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
>>>>> +             map->orig_phys[index] = INVALID_PHYS_ADDR;
>>>>> +             iova += IOVA_ALLOC_SIZE;
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
>>>>> +                   enum dma_data_direction dir)
>>>>> +{
>>>>> +     unsigned long pfn = PFN_DOWN(orig);
>>>>> +
>>>>> +     if (PageHighMem(pfn_to_page(pfn))) {
>>>>> +             unsigned int offset = offset_in_page(orig);
>>>>> +             char *buffer;
>>>>> +             unsigned int sz = 0;
>>>>> +
>>>>> +             while (size) {
>>>>> +                     sz = min_t(size_t, PAGE_SIZE - offset, size);
>>>>> +
>>>>> +                     buffer = kmap_atomic(pfn_to_page(pfn));
>>>> So kmap_atomic() automatically takes the fast path if the page does
>>>> not belong to highmem.
>>>>
>>>> I think we can remove the condition and just use kmap_atomic() for all
>>>> the cases here.
>>>>
>>> Looks good to me.
>>>
>>>>> +                     if (dir == DMA_TO_DEVICE)
>>>>> +                             memcpy(addr, buffer + offset, sz);
>>>>> +                     else
>>>>> +                             memcpy(buffer + offset, addr, sz);
>>>>> +                     kunmap_atomic(buffer);
>>>>> +
>>>>> +                     size -= sz;
>>>>> +                     pfn++;
>>>>> +                     addr += sz;
>>>>> +                     offset = 0;
>>>>> +             }
>>>>> +     } else if (dir == DMA_TO_DEVICE) {
>>>>> +             memcpy(addr, phys_to_virt(orig), size);
>>>>> +     } else {
>>>>> +             memcpy(phys_to_virt(orig), addr, size);
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
>>>>> +                             dma_addr_t iova, size_t size,
>>>>> +                             enum dma_data_direction dir)
>>>>> +{
>>>>> +     struct vduse_bounce_map *map;
>>>>> +     unsigned int index, offset;
>>>>> +     void *addr;
>>>>> +     size_t sz;
>>>>> +
>>>>> +     while (size) {
>>>>> +             map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +             offset = offset_in_page(iova);
>>>>> +             sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
>>>>> +
>>>>> +             index = offset >> IOVA_ALLOC_ORDER;
>>>>> +             if (map->bounce_page &&
>>>>> +                 map->orig_phys[index] != INVALID_PHYS_ADDR) {
>>>>> +                     addr = page_address(map->bounce_page) + offset;
>>>>> +                     do_bounce(map->orig_phys[index], addr, sz, dir);
>>>>> +             }
>>>>> +             size -= sz;
>>>>> +             iova += sz;
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +static struct page *
>>>>> +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
>>>>> +{
>>>>> +     u64 start = iova & PAGE_MASK;
>>>>> +     u64 last = start + PAGE_SIZE - 1;
>>>>> +     struct vhost_iotlb_map *map;
>>>>> +     struct page *page = NULL;
>>>>> +
>>>>> +     spin_lock(&domain->iotlb_lock);
>>>>> +     map = vhost_iotlb_itree_first(domain->iotlb, start, last);
>>>>> +     if (!map)
>>>>> +             goto out;
>>>>> +
>>>>> +     page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
>>>>> +     get_page(page);
>>>>> +out:
>>>>> +     spin_unlock(&domain->iotlb_lock);
>>>>> +
>>>>> +     return page;
>>>>> +}
>>>>> +
>>>>> +static struct page *
>>>>> +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
>>>>> +{
>>>>> +     u64 start = iova & PAGE_MASK;
>>>>> +     struct page *page = alloc_page(GFP_KERNEL);
>>>>> +     struct vduse_bounce_map *map;
>>>>> +
>>>>> +     if (!page)
>>>>> +             return NULL;
>>>>> +
>>>>> +     spin_lock(&domain->iotlb_lock);
>>>>> +     map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>> +     if (map->bounce_page) {
>>>>> +             __free_page(page);
>>>>> +             goto out;
>>>>> +     }
>>>>> +     map->bounce_page = page;
>>>>> +
>>>>> +     /* paired with vduse_domain_map_page() */
>>>>> +     smp_mb();
>>>> So this is suspicious. It's better to explain it as: we need to make
>>>> sure A is done after B.
>>> OK. I see. It's used to protect this pattern:
>>>
>>>      vduse_domain_alloc_bounce_page:    vduse_domain_map_page:
>>>      write map->bounce_page             write map->orig_phys
>>>      mb()                               mb()
>>>      read map->orig_phys                read map->bounce_page
>>>
>>> Make sure there will always be a path to do bouncing.
>>
>> Ok.
>>
>>
>>>> And it looks to me the iotlb_lock is sufficient to do the synchronization
>>>> here. E.g. is there any reason that you don't take it in
>>>> vduse_domain_map_bounce_page()?
>>>>
>>> Yes, we can. But the performance in multi-queue cases will go down if
>>> we use iotlb_lock on this critical path.
>>>
>>>> And what's more, is there any way to avoid holding the spinlock during
>>>> bouncing?
>>>>
>>> Looks like we can't. In the case that multiple page faults happen on
>>> the same page, we should make sure the bouncing is done before any
>>> page fault handler returns.
>>
>> So it looks to me all this extra complexity comes from the fact that
>> bounce_page and orig_phys are set in different places, so we need to
>> do the bouncing in two places.
>>
>> I wonder how much we can gain from the "lazy" bouncing in the page
>> fault handler. The buffer mapped via dma_ops from the virtio driver is
>> expected to be accessed by userspace soon. It looks to me we can do all
>> of this during dma_map(), and then things would be greatly simplified.
>>
> If so, we need to allocate lots of pages from the pool reserved for
> atomic memory allocation requests.


This should be fine; a lot of drivers allocate pages in atomic
context. The point is to simplify the code and make it easy to
reason about correctness, so we can add optimizations on top simply by
benchmarking the difference.
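
For example, the dma_map() path could do the bouncing eagerly, roughly
like this (only an untested sketch to show the idea: the helper name
vduse_domain_map_eager is made up, it reuses the structures from this
patch, and cleanup of already-allocated pages on failure is elided):

static int vduse_domain_map_eager(struct vduse_iova_domain *domain,
                                  u64 iova, u64 paddr, size_t size,
                                  enum dma_data_direction dir)
{
        struct vduse_bounce_map *map;
        u64 pg, last_pg = (iova + size - 1) >> PAGE_SHIFT;

        /*
         * Allocate every bounce page covering the range up front.
         * GFP_ATOMIC because dma_map may be called in atomic context.
         */
        for (pg = iova >> PAGE_SHIFT; pg <= last_pg; pg++) {
                map = &domain->bounce_maps[pg];
                if (!map->bounce_page)
                        map->bounce_page = alloc_page(GFP_ATOMIC);
                if (!map->bounce_page)
                        return -ENOMEM;
        }
        vduse_domain_map_bounce_page(domain, iova, size, paddr);

        /* bounce right away instead of lazily in the fault handler */
        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
                vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);

        return 0;
}

Then the page fault handler only needs to install map->bounce_page into
the VMA, with no bouncing or allocation at fault time.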

E.g. we have several places that access orig_phys:

1) map_page(), write
2) unmap_page(), write
3) page fault handler, read

It's not clear to me how they are synchronized. If they are
synchronized implicitly (via the iova allocator?), we'd better document
it. Or simply use a spinlock (which is the way I'd prefer to go). We
probably don't need to worry too much about the cost of the spinlock
since the iova allocator already uses one heavily.
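
That is, something like the following (again only an untested sketch,
assuming the existing iotlb_lock can also cover the bounce maps):

static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
                                         u64 iova, u64 size, u64 paddr)
{
        struct vduse_bounce_map *map;
        unsigned int index;
        u64 last = iova + size - 1;

        /* serialize writers against the page fault handler */
        spin_lock(&domain->iotlb_lock);
        while (iova < last) {
                map = &domain->bounce_maps[iova >> PAGE_SHIFT];
                index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
                map->orig_phys[index] = paddr;
                paddr += IOVA_ALLOC_SIZE;
                iova += IOVA_ALLOC_SIZE;
        }
        spin_unlock(&domain->iotlb_lock);
}

unmap_page() and the fault handler would take the same lock around
their accesses to map->orig_phys and map->bounce_page, which would also
let us drop the smp_mb() pairs.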

Thanks


>
> Thanks,
> Yongji
>

