From: Mike Kravetz <mike.kravetz@oracle.com>
To: Vivek Kasireddy <vivek.kasireddy@intel.com>
Cc: dri-devel@lists.freedesktop.org, linux-mm@kvack.org,
David Hildenbrand <david@redhat.com>,
Hugh Dickins <hughd@google.com>, Peter Xu <peterx@redhat.com>,
Jason Gunthorpe <jgg@nvidia.com>,
Gerd Hoffmann <kraxel@redhat.com>,
Dongwon Kim <dongwon.kim@intel.com>,
Junxiao Chang <junxiao.chang@intel.com>
Subject: Re: [PATCH v2 2/2] udmabuf: Add back support for mapping hugetlb pages (v2)
Date: Tue, 18 Jul 2023 16:44:36 -0700 [thread overview]
Message-ID: <20230718234436.GB10413@monkey> (raw)
In-Reply-To: <20230718082605.1570740-3-vivek.kasireddy@intel.com>
On 07/18/23 01:26, Vivek Kasireddy wrote:
> A user or admin can configure a VMM (Qemu) Guest's memory to be
> backed by hugetlb pages for various reasons. However, a Guest OS
> would still allocate (and pin) buffers that are backed by regular
> 4k sized pages. In order to map these buffers and create dma-bufs
> for them on the Host, we first need to find the hugetlb pages where
> the buffer allocations are located and then determine the offsets
> of individual chunks (within those pages) and use this information
> to eventually populate a scatterlist.
>
> Testcase: default_hugepagesz=2M hugepagesz=2M hugepages=2500 options
> were passed to the Host kernel and Qemu was launched with these
> relevant options: qemu-system-x86_64 -m 4096m....
> -device virtio-gpu-pci,max_outputs=1,blob=true,xres=1920,yres=1080
> -display gtk,gl=on
> -object memory-backend-memfd,hugetlb=on,id=mem1,size=4096M
> -machine memory-backend=mem1
>
> Replacing -display gtk,gl=on with -display gtk,gl=off above would
> exercise the mmap handler.
>
> v2: Updated get_sg_table() to manually populate the scatterlist for
> both huge page and non-huge-page cases.
>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Jason Gunthorpe <jgg@nvidia.com>
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Junxiao Chang <junxiao.chang@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
> drivers/dma-buf/udmabuf.c | 84 +++++++++++++++++++++++++++++++++------
> 1 file changed, 71 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> index 820c993c8659..10c47bf77fb5 100644
> --- a/drivers/dma-buf/udmabuf.c
> +++ b/drivers/dma-buf/udmabuf.c
> @@ -10,6 +10,7 @@
> #include <linux/miscdevice.h>
> #include <linux/module.h>
> #include <linux/shmem_fs.h>
> +#include <linux/hugetlb.h>
> #include <linux/slab.h>
> #include <linux/udmabuf.h>
> #include <linux/vmalloc.h>
> @@ -28,6 +29,7 @@ struct udmabuf {
> struct page **pages;
> struct sg_table *sg;
> struct miscdevice *device;
> + pgoff_t *offsets;
> };
>
> static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> @@ -41,6 +43,10 @@ static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> return VM_FAULT_SIGBUS;
>
> pfn = page_to_pfn(ubuf->pages[pgoff]);
> + if (ubuf->offsets) {
> + pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
> + }
> +
> return vmf_insert_pfn(vma, vmf->address, pfn);
> }
>
> @@ -90,23 +96,31 @@ static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf,
> {
> struct udmabuf *ubuf = buf->priv;
> struct sg_table *sg;
> + struct scatterlist *sgl;
> + pgoff_t offset;
> + unsigned long i = 0;
> int ret;
>
> sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> if (!sg)
> return ERR_PTR(-ENOMEM);
> - ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
> - 0, ubuf->pagecount << PAGE_SHIFT,
> - GFP_KERNEL);
> +
> + ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
> if (ret < 0)
> - goto err;
> + goto err_alloc;
> +
> + for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) {
> + offset = ubuf->offsets ? ubuf->offsets[i] : 0;
> + sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, offset);
> + }
> ret = dma_map_sgtable(dev, sg, direction, 0);
> if (ret < 0)
> - goto err;
> + goto err_map;
> return sg;
>
> -err:
> +err_map:
> sg_free_table(sg);
> +err_alloc:
> kfree(sg);
> return ERR_PTR(ret);
> }
> @@ -143,6 +157,7 @@ static void release_udmabuf(struct dma_buf *buf)
>
> for (pg = 0; pg < ubuf->pagecount; pg++)
> put_page(ubuf->pages[pg]);
> + kfree(ubuf->offsets);
> kfree(ubuf->pages);
> kfree(ubuf);
> }
> @@ -206,7 +221,9 @@ static long udmabuf_create(struct miscdevice *device,
> struct udmabuf *ubuf;
> struct dma_buf *buf;
> pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
> - struct page *page;
> + struct page *page, *hpage = NULL;
> + pgoff_t hpoff, chunkoff, maxchunks;
> + struct hstate *hpstate;
> int seals, ret = -EINVAL;
> u32 i, flags;
>
> @@ -242,7 +259,7 @@ static long udmabuf_create(struct miscdevice *device,
> if (!memfd)
> goto err;
> mapping = memfd->f_mapping;
> - if (!shmem_mapping(mapping))
> + if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
> goto err;
> seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
> if (seals == -EINVAL)
> @@ -253,16 +270,56 @@ static long udmabuf_create(struct miscdevice *device,
> goto err;
> pgoff = list[i].offset >> PAGE_SHIFT;
> pgcnt = list[i].size >> PAGE_SHIFT;
> + if (is_file_hugepages(memfd)) {
> + if (!ubuf->offsets) {
> + ubuf->offsets = kmalloc_array(ubuf->pagecount,
> + sizeof(*ubuf->offsets),
> + GFP_KERNEL);
> + if (!ubuf->offsets) {
> + ret = -ENOMEM;
> + goto err;
> + }
> + }
> + hpstate = hstate_file(memfd);
> + hpoff = list[i].offset >> huge_page_shift(hpstate);
> + chunkoff = (list[i].offset &
> + ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
> + maxchunks = huge_page_size(hpstate) >> PAGE_SHIFT;
> + }
> for (pgidx = 0; pgidx < pgcnt; pgidx++) {
> - page = shmem_read_mapping_page(mapping, pgoff + pgidx);
> - if (IS_ERR(page)) {
> - ret = PTR_ERR(page);
> - goto err;
> + if (is_file_hugepages(memfd)) {
> + if (!hpage) {
> + hpage = find_get_page_flags(mapping, hpoff,
> + FGP_ACCESSED);
> + if (!hpage) {
> + ret = -EINVAL;
> + goto err;
> + }
> + }
> + get_page(hpage);
Is the intention to increase the ref count of the hugetlb page once for
each 'sub-page' added? Or, am I reading that incorrectly?
> + ubuf->pages[pgbuf] = hpage;
Ah!, answering my own question. Since the 'head page' is added to the
array the ref count of the head page will decremented in
release_udmabuf.
> + ubuf->offsets[pgbuf++] = chunkoff << PAGE_SHIFT;
> + if (++chunkoff == maxchunks) {
> + put_page(hpage);
> + hpage = NULL;
> + chunkoff = 0;
> + hpoff++;
> + }
> + } else {
> + page = shmem_read_mapping_page(mapping, pgoff + pgidx);
It may not matter to your users, but the semantics for hugetlb and shmem
pages is different. hugetlb requires the pages exist in the page cache
while shmem will create/add pages to the cache if necessary.
> + if (IS_ERR(page)) {
> + ret = PTR_ERR(page);
> + goto err;
> + }
> + ubuf->pages[pgbuf++] = page;
> }
> - ubuf->pages[pgbuf++] = page;
> }
> fput(memfd);
> memfd = NULL;
> + if (hpage) {
> + put_page(hpage);
> + hpage = NULL;
> + }
> }
>
> exp_info.ops = &udmabuf_ops;
> @@ -287,6 +344,7 @@ static long udmabuf_create(struct miscdevice *device,
> put_page(ubuf->pages[--pgbuf]);
> if (memfd)
> fput(memfd);
> + kfree(ubuf->offsets);
> kfree(ubuf->pages);
> kfree(ubuf);
> return ret;
> --
> 2.39.2
>
Nothing else stands out,
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
I see there is a RFC for the coherency issue with hole punch.
--
Mike Kravetz
next prev parent reply other threads:[~2023-07-18 23:44 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-07-18 8:26 [PATCH v2 0/2] udmabuf: Add back support for mapping hugetlb pages (v2) Vivek Kasireddy
2023-07-18 8:26 ` [PATCH v2 1/2] udmabuf: Use vmf_insert_pfn and VM_PFNMAP for handling mmap Vivek Kasireddy
2023-07-18 18:03 ` David Hildenbrand
2023-07-18 8:26 ` [PATCH v2 2/2] udmabuf: Add back support for mapping hugetlb pages (v2) Vivek Kasireddy
2023-07-18 18:05 ` David Hildenbrand
2023-07-18 23:44 ` Mike Kravetz [this message]
2023-07-19 5:42 ` Kasireddy, Vivek
2023-07-27 20:54 ` Peter Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230718234436.GB10413@monkey \
--to=mike.kravetz@oracle.com \
--cc=david@redhat.com \
--cc=dongwon.kim@intel.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=hughd@google.com \
--cc=jgg@nvidia.com \
--cc=junxiao.chang@intel.com \
--cc=kraxel@redhat.com \
--cc=linux-mm@kvack.org \
--cc=peterx@redhat.com \
--cc=vivek.kasireddy@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).