From: Logan Gunthorpe <logang@deltatee.com>
To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org,
linux-block@vger.kernel.org, linux-pci@vger.kernel.org,
linux-mm@kvack.org, "Christoph Hellwig" <hch@lst.de>,
"Dan Williams" <dan.j.williams@intel.com>,
"Jason Gunthorpe" <jgg@ziepe.ca>,
"Christian König" <christian.koenig@amd.com>,
"John Hubbard" <jhubbard@nvidia.com>,
"Don Dutile" <ddutile@redhat.com>,
"Matthew Wilcox" <willy@infradead.org>,
"Daniel Vetter" <daniel.vetter@ffwll.ch>,
"Minturn Dave B" <dave.b.minturn@intel.com>,
"Jason Ekstrand" <jason@jlekstrand.net>,
"Dave Hansen" <dave.hansen@linux.intel.com>,
"Xiong Jianxin" <jianxin.xiong@intel.com>,
"Bjorn Helgaas" <helgaas@kernel.org>,
"Ira Weiny" <ira.weiny@intel.com>,
"Robin Murphy" <robin.murphy@arm.com>,
"Martin Oliveira" <martin.oliveira@eideticom.com>,
"Chaitanya Kulkarni" <ckulkarnilinux@gmail.com>,
"Ralph Campbell" <rcampbell@nvidia.com>,
"Stephen Bates" <sbates@raithlin.com>
Subject: Re: [PATCH v9 7/8] PCI/P2PDMA: Allow userspace VMA allocations through sysfs
Date: Thu, 1 Sep 2022 10:32:55 -0600 [thread overview]
Message-ID: <4a4bca1e-bebf-768f-92d4-92eb8ae714e1@deltatee.com> (raw)
In-Reply-To: <YxDb2MyRx6o/wDAz@kroah.com>
On 2022-09-01 10:20, Greg Kroah-Hartman wrote:
> On Thu, Aug 25, 2022 at 09:24:24AM -0600, Logan Gunthorpe wrote:
>> Create a sysfs bin attribute called "allocate" under the existing
>> "p2pmem" group. The only allowable operation on this file is the mmap()
>> call.
>>
>> When mmap() is called on this attribute, the kernel allocates a chunk of
>> memory from the genalloc and inserts the pages into the VMA. The
>> dev_pagemap .page_free callback will indicate when these pages are no
>> longer used and they will be put back into the genalloc.
>>
>> On device unbind, remove the sysfs file before the memremap_pages are
>> cleaned up. This ensures unmap_mapping_range() is called on the files
>> inode and no new mappings can be created.
>>
>> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
>> ---
>> drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 124 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 4496a7c5c478..a6ed6bbca214 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
>> }
>> static DEVICE_ATTR_RO(published);
>>
>> +static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
>> + struct bin_attribute *attr, struct vm_area_struct *vma)
>> +{
>> + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
>> + size_t len = vma->vm_end - vma->vm_start;
>> + struct pci_p2pdma *p2pdma;
>> + struct percpu_ref *ref;
>> + unsigned long vaddr;
>> + void *kaddr;
>> + int ret;
>> +
>> + /* prevent private mappings from being established */
>> + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
>> + pci_info_ratelimited(pdev,
>> + "%s: fail, attempted private mapping\n",
>> + current->comm);
>> + return -EINVAL;
>> + }
>> +
>> + if (vma->vm_pgoff) {
>> + pci_info_ratelimited(pdev,
>> + "%s: fail, attempted mapping with non-zero offset\n",
>> + current->comm);
>> + return -EINVAL;
>> + }
>> +
>> + rcu_read_lock();
>> + p2pdma = rcu_dereference(pdev->p2pdma);
>> + if (!p2pdma) {
>> + ret = -ENODEV;
>> + goto out;
>> + }
>> +
>> + kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
>> + if (!kaddr) {
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> +
>> + /*
>> + * vm_insert_page() can sleep, so a reference is taken to mapping
>> + * such that rcu_read_unlock() can be done before inserting the
>> + * pages
>> + */
>> + if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
>> + ret = -ENODEV;
>> + goto out_free_mem;
>> + }
>> + rcu_read_unlock();
>> +
>> + for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
>> + ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
>> + if (ret) {
>> + gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>> + return ret;
>> + }
>> + percpu_ref_get(ref);
>> + put_page(virt_to_page(kaddr));
>> + kaddr += PAGE_SIZE;
>> + len -= PAGE_SIZE;
>> + }
>> +
>> + percpu_ref_put(ref);
>> +
>> + return 0;
>> +out_free_mem:
>> + gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>> +out:
>> + rcu_read_unlock();
>> + return ret;
>> +}
>> +
>> +static struct bin_attribute p2pmem_alloc_attr = {
>> + .attr = { .name = "allocate", .mode = 0660 },
>> + .mmap = p2pmem_alloc_mmap,
>> + /*
>> + * Some places where we want to call mmap (ie. python) will check
>> + * that the file size is greater than the mmap size before allowing
>> + * the mmap to continue. To work around this, just set the size
>> + * to be very large.
>> + */
>> + .size = SZ_1T,
>> +};
>> +
>> static struct attribute *p2pmem_attrs[] = {
>> &dev_attr_size.attr,
>> &dev_attr_available.attr,
>> @@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
>> NULL,
>> };
>>
>> +static struct bin_attribute *p2pmem_bin_attrs[] = {
>> + &p2pmem_alloc_attr,
>> + NULL,
>> +};
>> +
>> static const struct attribute_group p2pmem_group = {
>> .attrs = p2pmem_attrs,
>> + .bin_attrs = p2pmem_bin_attrs,
>> .name = "p2pmem",
>> };
>>
>> +static void p2pdma_page_free(struct page *page)
>> +{
>> + struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
>> + struct percpu_ref *ref;
>> +
>> + gen_pool_free_owner(pgmap->provider->p2pdma->pool,
>> + (uintptr_t)page_to_virt(page), PAGE_SIZE,
>> + (void **)&ref);
>> + percpu_ref_put(ref);
>> +}
>> +
>> +static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
>> + .page_free = p2pdma_page_free,
>> +};
>> +
>> static void pci_p2pdma_release(void *data)
>> {
>> struct pci_dev *pdev = data;
>> @@ -152,6 +257,19 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
>> return error;
>> }
>>
>> +static void pci_p2pdma_unmap_mappings(void *data)
>> +{
>> + struct pci_dev *pdev = data;
>> +
>> + /*
>> + * Removing the alloc attribute from sysfs will call
>> + * unmap_mapping_range() on the inode, teardown any existing userspace
>> + * mappings and prevent new ones from being created.
>> + */
>> + sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
>> + p2pmem_group.name);
>
> Wait, why are you manually removing the sysfs file here? It's part of
> the group, if you do this then it is gone for forever, right? Why
> manually do this the sysfs core should handle this for you if the device
> is removed.
We have to make sure the mappings are all removed before the cleanup of
devm_memremap_pages(), which will wait for all the pages to be freed. If
we don't do this, any remaining userspace mapping will hang the cleanup
until its user unmaps it.
> And worst case, just pass in the device, not the pci device.
Ok, I'll make that change for v10.
Logan
next prev parent reply other threads:[~2022-09-01 16:33 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-25 15:24 [PATCH v9 0/8] Userspace P2PDMA with O_DIRECT NVMe devices Logan Gunthorpe
2022-08-25 15:24 ` [PATCH v9 1/8] mm: introduce FOLL_PCI_P2PDMA to gate getting PCI P2PDMA pages Logan Gunthorpe
2022-09-05 22:27 ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 2/8] iov_iter: introduce iov_iter_get_pages_[alloc_]flags() Logan Gunthorpe
2022-09-05 14:33 ` Christoph Hellwig
2022-09-05 23:21 ` John Hubbard
2022-09-06 16:52 ` Logan Gunthorpe
2022-08-25 15:24 ` [PATCH v9 3/8] block: add check when merging zone device pages Logan Gunthorpe
2022-09-05 14:34 ` Christoph Hellwig
2022-09-05 23:58 ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 4/8] lib/scatterlist: " Logan Gunthorpe
2022-09-05 14:34 ` Christoph Hellwig
2022-09-06 0:21 ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 5/8] block: set FOLL_PCI_P2PDMA in __bio_iov_iter_get_pages() Logan Gunthorpe
2022-09-05 14:36 ` Christoph Hellwig
2022-09-06 0:48 ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 6/8] block: set FOLL_PCI_P2PDMA in bio_map_user_iov() Logan Gunthorpe
2022-09-05 14:36 ` Christoph Hellwig
2022-09-06 0:54 ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 7/8] PCI/P2PDMA: Allow userspace VMA allocations through sysfs Logan Gunthorpe
2022-09-01 16:20 ` Greg Kroah-Hartman
2022-09-01 16:32 ` Logan Gunthorpe [this message]
2022-09-01 16:42 ` Greg Kroah-Hartman
2022-09-01 18:14 ` Logan Gunthorpe
2022-09-01 18:36 ` Greg Kroah-Hartman
2022-09-01 19:16 ` Logan Gunthorpe
2022-09-02 5:53 ` Greg Kroah-Hartman
2022-09-02 18:46 ` Logan Gunthorpe
2022-09-20 6:46 ` Christoph Hellwig
2022-09-22 8:38 ` Greg Kroah-Hartman
2022-09-22 14:58 ` Logan Gunthorpe
2022-08-25 15:24 ` [PATCH v9 8/8] ABI: sysfs-bus-pci: add documentation for p2pmem allocate Logan Gunthorpe
2022-09-01 16:18 ` Greg Kroah-Hartman
2022-09-01 16:33 ` Logan Gunthorpe
2022-09-06 1:03 ` John Hubbard
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4a4bca1e-bebf-768f-92d4-92eb8ae714e1@deltatee.com \
--to=logang@deltatee.com \
--cc=christian.koenig@amd.com \
--cc=ckulkarnilinux@gmail.com \
--cc=dan.j.williams@intel.com \
--cc=daniel.vetter@ffwll.ch \
--cc=dave.b.minturn@intel.com \
--cc=dave.hansen@linux.intel.com \
--cc=ddutile@redhat.com \
--cc=gregkh@linuxfoundation.org \
--cc=hch@lst.de \
--cc=helgaas@kernel.org \
--cc=ira.weiny@intel.com \
--cc=jason@jlekstrand.net \
--cc=jgg@ziepe.ca \
--cc=jhubbard@nvidia.com \
--cc=jianxin.xiong@intel.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-nvme@lists.infradead.org \
--cc=linux-pci@vger.kernel.org \
--cc=martin.oliveira@eideticom.com \
--cc=rcampbell@nvidia.com \
--cc=robin.murphy@arm.com \
--cc=sbates@raithlin.com \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).