From: Ackerley Tng <ackerleytng@google.com>
To: akpm@linux-foundation.org, mike.kravetz@oracle.com,
muchun.song@linux.dev, pbonzini@redhat.com, seanjc@google.com,
shuah@kernel.org, willy@infradead.org
Cc: brauner@kernel.org, chao.p.peng@linux.intel.com,
coltonlewis@google.com, david@redhat.com, dhildenb@redhat.com,
dmatlack@google.com, erdemaktas@google.com, hughd@google.com,
isaku.yamahata@gmail.com, jarkko@kernel.org, jmattson@google.com,
joro@8bytes.org, jthoughton@google.com, jun.nakajima@intel.com,
kirill.shutemov@linux.intel.com, liam.merwick@oracle.com,
mail@maciej.szmigiero.name, mhocko@suse.com,
michael.roth@amd.com, qperret@google.com, rientjes@google.com,
rppt@kernel.org, steven.price@arm.com, tabba@google.com,
vannapurve@google.com, vbabka@suse.cz, vipinsh@google.com,
vkuznets@redhat.com, wei.w.wang@intel.com,
yu.c.zhang@linux.intel.com, kvm@vger.kernel.org,
linux-api@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org,
linux-mm@kvack.org, qemu-devel@nongnu.org, x86@kernel.org,
Ackerley Tng <ackerleytng@google.com>
Subject: [RFC PATCH 16/19] KVM: guest_mem: hugetlb: allocate and truncate from hugetlb
Date: Tue, 6 Jun 2023 19:04:01 +0000 [thread overview]
Message-ID: <18e518695854cc7243866d7b1be2fbbb3aa87c71.1686077275.git.ackerleytng@google.com> (raw)
In-Reply-To: <cover.1686077275.git.ackerleytng@google.com>
Introduce kvm_gmem_hugetlb_get_folio() and kvm_gmem_hugetlb_truncate_range(),
then update kvm_gmem_get_folio(), kvm_gmem_allocate() and
kvm_gmem_punch_hole() to use the hugetlb functions.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
virt/kvm/guest_mem.c | 215 +++++++++++++++++++++++++++++++++++++------
1 file changed, 188 insertions(+), 27 deletions(-)
diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index b533143e2878..6271621f6b73 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -43,6 +43,95 @@ static loff_t kvm_gmem_get_size(struct file *file)
return i_size_read(file_inode(file));
}
+static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(
+ struct file *file, pgoff_t hindex)
+{
+ int err;
+ struct folio *folio;
+ struct kvm_gmem *gmem;
+ struct hstate *h;
+ struct resv_map *resv_map;
+ unsigned long offset;
+ struct vm_area_struct pseudo_vma;
+ /*
+ * Allocate a hugetlb folio for @file at @hindex, zero it, mark it
+ * uptodate and add it to @file's page cache.
+ *
+ * @hindex is an index in units of huge_page_size(h), not PAGE_SIZE; the
+ * hstate, subpool and reservation map all come from the struct kvm_gmem
+ * stored in file->private_data.
+ *
+ * Returns the new folio on success. If the allocation fails, the
+ * ERR_PTR() from the allocator is returned unchanged. If adding the folio
+ * to the page cache fails, restore_reserve_on_error() is called, the
+ * folio reference is dropped and ERR_PTR(err) is returned.
+ */
+ gmem = file->private_data;
+ h = gmem->hugetlb.h;
+ resv_map = gmem->hugetlb.resv_map;
+ offset = hindex << huge_page_shift(h);
+
+ vma_init(&pseudo_vma, NULL);
+ vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ /*
+ * pseudo_vma lives on this function's stack and only exists so that it
+ * can be passed to alloc_hugetlb_folio_from_subpool() below.
+ * vma infrastructure is dependent on vm_file being set.
+ */
+ pseudo_vma.vm_file = file;
+
+ /* TODO setup NUMA policy. Meanwhile, fallback to get_task_policy(). */
+ pseudo_vma.vm_policy = NULL;
+ folio = alloc_hugetlb_folio_from_subpool(
+ gmem->hugetlb.spool, h, resv_map, &pseudo_vma, offset, 0);
+ /*
+ * Remember to take and drop refcount from vm_policy (presumably this
+ * only matters once the TODO above installs a real policy instead of
+ * NULL).
+ */
+ if (IS_ERR(folio))
+ return folio;
+
+ /*
+ * FIXME: Skip clearing pages when trusted firmware will do it when
+ * assigning memory to the guest.
+ *
+ * Until then, the whole huge page is zeroed here, before the folio is
+ * marked uptodate and before it is added to the page cache.
+ * NOTE(review): @offset is a byte offset into the file; confirm that is
+ * what clear_huge_page() expects for its second argument.
+ */
+ clear_huge_page(&folio->page, offset, pages_per_huge_page(h));
+ __folio_mark_uptodate(folio);
+ err = hugetlb_filemap_add_folio(file->f_mapping, h, folio, hindex);
+ if (unlikely(err)) {
+ restore_reserve_on_error(resv_map, hindex, true, folio);
+ folio_put(folio);
+ folio = ERR_PTR(err);
+ }
+
+ return folio;
+}
+
+/**
+ * kvm_gmem_hugetlb_get_folio() - Get the hugetlb folio backing @index in @file.
+ * @file: File whose private_data is the struct kvm_gmem describing the
+ *        hugetlb backing (hstate) to use.
+ * @index: Index within the file, in terms of PAGE_SIZE. It is converted to
+ *         an index in terms of huge_page_size(h) before any lookup.
+ *
+ * Looks the folio up in @file's page cache and, if it is not there, allocates
+ * and inserts a new one with kvm_gmem_hugetlb_alloc_and_cache_folio(). Both
+ * steps run with the hugetlb fault mutex selected by
+ * hugetlb_fault_mutex_hash(mapping, hindex) held, so callers that hash to the
+ * same mutex are serialized.
+ *
+ * The returned folio will be in @file's page cache, and locked.
+ * NOTE(review): on the allocation path this relies on
+ * hugetlb_filemap_add_folio() leaving the folio locked - confirm.
+ *
+ * Return: the folio, or NULL if the folio was not in the page cache and
+ * allocating a new one failed (the specific error code is discarded).
+ */
+static struct folio *kvm_gmem_hugetlb_get_folio(struct file *file, pgoff_t index)
+{
+ struct folio *folio;
+ u32 hash;
+ /* hindex is in terms of huge_page_size(h) and not PAGE_SIZE */
+ pgoff_t hindex;
+ struct kvm_gmem *gmem;
+ struct hstate *h;
+ struct address_space *mapping;
+
+ gmem = file->private_data;
+ h = gmem->hugetlb.h;
+ hindex = index >> huge_page_order(h);
+
+ mapping = file->f_mapping;
+ hash = hugetlb_fault_mutex_hash(mapping, hindex);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ /*
+ * Fast path: the folio is already in the page cache.
+ *
+ * NOTE(review): filemap_lock_folio() takes the folio lock; confirm that
+ * it cannot sleep here, since it is called inside an RCU read-side
+ * critical section.
+ * NOTE(review): the check below assumes filemap_lock_folio() returns
+ * NULL, not an ERR_PTR(), when nothing is found - confirm for the tree
+ * this is applied to.
+ */
+ rcu_read_lock();
+ folio = filemap_lock_folio(mapping, hindex);
+ rcu_read_unlock();
+ if (folio)
+ goto folio_valid;
+
+ folio = kvm_gmem_hugetlb_alloc_and_cache_folio(file, hindex);
+ /*
+ * TODO Perhaps the interface of kvm_gmem_get_folio should change to better
+ * report errors
+ *
+ * For now every allocation error is collapsed into a NULL return.
+ */
+ if (IS_ERR(folio))
+ folio = NULL;
+
+folio_valid:
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ return folio;
+}
+
static struct folio *kvm_gmem_get_huge_folio(struct file *file, pgoff_t index)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -74,36 +163,56 @@ static struct folio *kvm_gmem_get_huge_folio(struct file *file, pgoff_t index)
#endif
}
+/**
+ * Gets a folio, from @file, at @index (in terms of PAGE_SIZE) within the file.
+ *
+ * The returned folio will be in @file's page cache and locked.
+ */
static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
{
struct folio *folio;
+ struct kvm_gmem *gmem = file->private_data;
- folio = kvm_gmem_get_huge_folio(file, index);
- if (!folio) {
- folio = filemap_grab_folio(file->f_mapping, index);
+ if (gmem->flags & KVM_GUEST_MEMFD_HUGETLB) {
+ folio = kvm_gmem_hugetlb_get_folio(file, index);
+
+ /* hugetlb gmem does not fall back to non-hugetlb pages */
if (!folio)
return NULL;
- }
- /*
- * TODO: Confirm this won't zero in-use pages, and skip clearing pages
- * when trusted firmware will do it when assigning memory to the guest.
- */
- if (!folio_test_uptodate(folio)) {
- unsigned long nr_pages = folio_nr_pages(folio);
- unsigned long i;
+ /*
+ * Don't need to clear pages because
+ * kvm_gmem_hugetlb_alloc_and_cache_folio() already clears pages
+ * when allocating
+ */
+ } else {
+ folio = kvm_gmem_get_huge_folio(file, index);
+ if (!folio) {
+ folio = filemap_grab_folio(file->f_mapping, index);
+ if (!folio)
+ return NULL;
+ }
- for (i = 0; i < nr_pages; i++)
- clear_highpage(folio_page(folio, i));
- }
+ /*
+ * TODO: Confirm this won't zero in-use pages, and skip clearing pages
+ * when trusted firmware will do it when assigning memory to the guest.
+ */
+ if (!folio_test_uptodate(folio)) {
+ unsigned long nr_pages = folio_nr_pages(folio);
+ unsigned long i;
- /*
- * filemap_grab_folio() uses FGP_ACCESSED, which already called
- * folio_mark_accessed(), so we clear it.
- * TODO: Should we instead be clearing this when truncating?
- * TODO: maybe don't use FGP_ACCESSED at all and call __filemap_get_folio directly.
- */
- folio_clear_referenced(folio);
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
+ }
+
+ /*
+ * filemap_grab_folio() uses FGP_ACCESSED, which already called
+ * folio_mark_accessed(), so we clear it.
+ * TODO: Should we instead be clearing this when truncating?
+ * TODO: maybe don't use FGP_ACCESSED at all and call __filemap_get_folio directly.
+ */
+ folio_clear_referenced(folio);
+ }
/*
* Indicate that this folio matches the backing store (in this case, has
@@ -156,6 +265,44 @@ static void kvm_gmem_invalidate_end(struct kvm *kvm, struct kvm_gmem *gmem,
KVM_MMU_UNLOCK(kvm);
}
+static void kvm_gmem_hugetlb_truncate_range(struct inode *inode,
+ loff_t offset, loff_t len)
+{
+ loff_t hsize;
+ loff_t full_hpage_start;
+ loff_t full_hpage_end;
+ struct kvm_gmem *gmem;
+ struct hstate *h;
+ struct address_space *mapping;
+ /*
+ * Truncate the byte range [offset, offset + len) of a hugetlb-backed
+ * inode in up to three steps: zero the partial huge page at the start
+ * of the range, remove the whole huge pages in between, then zero the
+ * partial huge page at the end of the range.
+ *
+ * full_hpage_start and full_hpage_end are @offset rounded up and
+ * @offset + @len rounded down to the huge page size, i.e. the bounds of
+ * the huge pages that are entirely covered by the range.
+ *
+ * NOTE(review): this reads the struct kvm_gmem from
+ * mapping->private_data rather than from a file's private_data -
+ * confirm where that pointer is set.
+ */
+ mapping = inode->i_mapping;
+ gmem = mapping->private_data;
+ h = gmem->hugetlb.h;
+ hsize = huge_page_size(h);
+ full_hpage_start = round_up(offset, hsize);
+ full_hpage_end = round_down(offset + len, hsize);
+
+ /*
+ * If range starts before first full page, zero partial page. The min()
+ * stops at the end of the range when the whole range lies inside that
+ * one huge page.
+ */
+ if (offset < full_hpage_start) {
+ hugetlb_zero_partial_page(
+ h, mapping, offset, min(offset + len, full_hpage_start));
+ }
+
+ /* Remove full pages from the file. */
+ if (full_hpage_end > full_hpage_start) {
+ remove_mapping_hugepages(mapping, h, gmem->hugetlb.spool,
+ gmem->hugetlb.resv_map, inode,
+ full_hpage_start, full_hpage_end);
+ }
+
+
+ /*
+ * If range extends beyond last full page, zero partial page. The second
+ * comparison skips this step when the range ended at or before
+ * full_hpage_start, in which case the first step above already zeroed
+ * everything up to offset + len.
+ */
+ if ((offset + len) > full_hpage_end && (offset + len) > full_hpage_start) {
+ hugetlb_zero_partial_page(
+ h, mapping, full_hpage_end, offset + len);
+ }
+}
+
static long kvm_gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
{
struct kvm_gmem *gmem = file->private_data;
@@ -171,7 +318,10 @@ static long kvm_gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
kvm_gmem_invalidate_begin(kvm, gmem, start, end);
- truncate_inode_pages_range(file->f_mapping, offset, offset + len - 1);
+ if (gmem->flags & KVM_GUEST_MEMFD_HUGETLB)
+ kvm_gmem_hugetlb_truncate_range(file_inode(file), offset, len);
+ else
+ truncate_inode_pages_range(file->f_mapping, offset, offset + len - 1);
kvm_gmem_invalidate_end(kvm, gmem, start, end);
@@ -183,6 +333,7 @@ static long kvm_gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
static long kvm_gmem_allocate(struct file *file, loff_t offset, loff_t len)
{
struct address_space *mapping = file->f_mapping;
+ struct kvm_gmem *gmem = file->private_data;
pgoff_t start, index, end;
int r;
@@ -192,9 +343,14 @@ static long kvm_gmem_allocate(struct file *file, loff_t offset, loff_t len)
filemap_invalidate_lock_shared(mapping);
- start = offset >> PAGE_SHIFT;
- /* Align so that at least 1 page is allocated */
- end = ALIGN(offset + len, PAGE_SIZE) >> PAGE_SHIFT;
+ if (gmem->flags & KVM_GUEST_MEMFD_HUGETLB) {
+ start = offset >> huge_page_shift(gmem->hugetlb.h);
+ end = ALIGN(offset + len, huge_page_size(gmem->hugetlb.h)) >> PAGE_SHIFT;
+ } else {
+ start = offset >> PAGE_SHIFT;
+ /* Align so that at least 1 page is allocated */
+ end = ALIGN(offset + len, PAGE_SIZE) >> PAGE_SHIFT;
+ }
r = 0;
for (index = start; index < end; ) {
@@ -211,7 +367,7 @@ static long kvm_gmem_allocate(struct file *file, loff_t offset, loff_t len)
break;
}
- index = folio_next_index(folio);
+ index += folio_nr_pages(folio);
folio_unlock(folio);
folio_put(folio);
@@ -625,7 +781,12 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
return -ENOMEM;
}
- page = folio_file_page(folio, index);
+ /*
+ * folio_file_page() always returns the head page for hugetlb
+ * folios. Reimplement to get the page within this folio, even for
+ * hugetlb pages.
+ */
+ page = folio_page(folio, index & (folio_nr_pages(folio) - 1));
*pfn = page_to_pfn(page);
*order = thp_order(compound_head(page));
--
2.41.0.rc0.172.g3f132b7071-goog
next prev parent reply other threads:[~2023-06-06 19:07 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-06-06 19:03 [RFC PATCH 00/19] hugetlb support for KVM guest_mem Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 01/19] mm: hugetlb: Expose get_hstate_idx() Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 02/19] mm: hugetlb: Move and expose hugetlbfs_zero_partial_page Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 03/19] mm: hugetlb: Expose remove_inode_hugepages Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 04/19] mm: hugetlb: Decouple hstate, subpool from inode Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 05/19] mm: hugetlb: Allow alloc_hugetlb_folio() to be parametrized by subpool and hstate Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 06/19] mm: hugetlb: Provide hugetlb_filemap_add_folio() Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 07/19] mm: hugetlb: Refactor vma_*_reservation functions Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 08/19] mm: hugetlb: Refactor restore_reserve_on_error Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 09/19] mm: hugetlb: Use restore_reserve_on_error directly in filesystems Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 10/19] mm: hugetlb: Parametrize alloc_hugetlb_folio_from_subpool() by resv_map Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 11/19] mm: hugetlb: Parametrize hugetlb functions " Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 12/19] mm: truncate: Expose preparation steps for truncate_inode_pages_final Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 13/19] KVM: guest_mem: Refactor kvm_gmem fd creation to be in layers Ackerley Tng
2023-06-06 19:03 ` [RFC PATCH 14/19] KVM: guest_mem: Refactor cleanup to separate inode and file cleanup Ackerley Tng
2023-06-06 19:04 ` [RFC PATCH 15/19] KVM: guest_mem: hugetlb: initialization and cleanup Ackerley Tng
2023-06-06 19:04 ` Ackerley Tng [this message]
2023-06-06 19:04 ` [RFC PATCH 17/19] KVM: selftests: Add basic selftests for hugetlbfs-backed guest_mem Ackerley Tng
2023-06-06 19:04 ` [RFC PATCH 18/19] KVM: selftests: Support various types of backing sources for private memory Ackerley Tng
2023-06-06 19:04 ` [RFC PATCH 19/19] KVM: selftests: Update test for various private memory backing source types Ackerley Tng
2023-06-08 4:38 ` [RFC PATCH 00/19] hugetlb support for KVM guest_mem Isaku Yamahata
2023-06-16 18:28 ` Mike Kravetz
2023-06-21 9:01 ` Vishal Annapurve
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=18e518695854cc7243866d7b1be2fbbb3aa87c71.1686077275.git.ackerleytng@google.com \
--to=ackerleytng@google.com \
--cc=akpm@linux-foundation.org \
--cc=brauner@kernel.org \
--cc=chao.p.peng@linux.intel.com \
--cc=coltonlewis@google.com \
--cc=david@redhat.com \
--cc=dhildenb@redhat.com \
--cc=dmatlack@google.com \
--cc=erdemaktas@google.com \
--cc=hughd@google.com \
--cc=isaku.yamahata@gmail.com \
--cc=jarkko@kernel.org \
--cc=jmattson@google.com \
--cc=joro@8bytes.org \
--cc=jthoughton@google.com \
--cc=jun.nakajima@intel.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=kvm@vger.kernel.org \
--cc=liam.merwick@oracle.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mail@maciej.szmigiero.name \
--cc=mhocko@suse.com \
--cc=michael.roth@amd.com \
--cc=mike.kravetz@oracle.com \
--cc=muchun.song@linux.dev \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=qperret@google.com \
--cc=rientjes@google.com \
--cc=rppt@kernel.org \
--cc=seanjc@google.com \
--cc=shuah@kernel.org \
--cc=steven.price@arm.com \
--cc=tabba@google.com \
--cc=vannapurve@google.com \
--cc=vbabka@suse.cz \
--cc=vipinsh@google.com \
--cc=vkuznets@redhat.com \
--cc=wei.w.wang@intel.com \
--cc=willy@infradead.org \
--cc=x86@kernel.org \
--cc=yu.c.zhang@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).