* [RFC 1/1] memfd: Support mapping to zero page on reading
2021-12-22 12:33 [RFC 0/1] memfd: Support mapping to zero page on reading Peng Liang
@ 2021-12-22 12:34 ` Peng Liang
2022-01-04 14:44 ` [RFC 0/1] " David Hildenbrand
2022-01-12 2:30 ` Hugh Dickins
2 siblings, 0 replies; 7+ messages in thread
From: Peng Liang @ 2021-12-22 12:34 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: akpm, hughd, xiexiangyou, zhengchuan, wanghao232, liangpeng10
Mapping to zero page on reading memfd and COWing it when a write occurs.
Signed-off-by: Peng Liang <liangpeng10@huawei.com>
---
include/linux/fs.h | 2 ++
include/uapi/linux/memfd.h | 1 +
mm/memfd.c | 8 ++++++--
mm/memory.c | 37 ++++++++++++++++++++++++++++++++++---
mm/shmem.c | 10 ++++++++--
5 files changed, 51 insertions(+), 7 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..404c0c26ba98 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2249,6 +2249,7 @@ struct super_operations {
#define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
+#define S_ZEROPAGE (1 << 17)
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2291,6 +2292,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode) ((inode)->i_flags & S_VERITY)
+#define IS_ZEROPAGE(inode) ((inode)->i_flags & S_ZEROPAGE)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 7a8a26751c23..2bfac06f53fb 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -8,6 +8,7 @@
#define MFD_CLOEXEC 0x0001U
#define MFD_ALLOW_SEALING 0x0002U
#define MFD_HUGETLB 0x0004U
+#define MFD_ZEROPAGE 0x0008U
/*
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/mm/memfd.c b/mm/memfd.c
index 9f80f162791a..5c167b2de9ae 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -245,7 +245,7 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_ZEROPAGE)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
@@ -301,8 +301,12 @@ SYSCALL_DEFINE2(memfd_create,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
- } else
+ } else {
file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (flags & MFD_ZEROPAGE) {
+ /*
+ * NOTE(review): file may be an ERR_PTR here -- the
+ * IS_ERR() check only happens below, so on a failing
+ * shmem_file_setup() this file_inode() call dereferences
+ * an error pointer. The flag update should be moved
+ * after the IS_ERR() check.
+ */
+ file_inode(file)->i_flags |= S_ZEROPAGE;
+ }
+ }
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_fd;
diff --git a/mm/memory.c b/mm/memory.c
index 8f1de811a1dc..360606964a7d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3208,6 +3208,26 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
return ret;
}
+static vm_fault_t do_shared_fault(struct vm_fault *vmf);
+
+/*
+ * wp_zero_shared - handle a write fault on a pte that maps the shared
+ * zero page in a MAP_SHARED memfd (S_ZEROPAGE) mapping.
+ *
+ * Clears the read-only zero-page pte under mmu-notifier protection and
+ * then re-enters the shared fault path so that a real page is installed
+ * in place of the zero page.
+ *
+ * NOTE(review): re-calling do_shared_fault() after pte_unmap_unlock()
+ * here assumes the fault path copes with being entered from the
+ * write-protect path -- confirm this is safe on all configurations.
+ */
+static vm_fault_t wp_zero_shared(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+ vm_fault_t ret;
+
+ /* Tell secondary MMUs the single-page zero mapping is going away. */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ /* Remove the zero-page pte and flush/notify for this address. */
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ /* Re-fault: allocate and map a real writable page via shmem. */
+ ret = do_shared_fault(vmf);
+ mmu_notifier_invalidate_range_only_end(&range);
+ return ret;
+}
+
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -3254,8 +3274,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
- return wp_pfn_shared(vmf);
+ (VM_WRITE|VM_SHARED)) {
+ if (unlikely(vma->vm_file &&
+ IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+ is_zero_pfn(pte_pfn(*vmf->pte)))) {
+ return wp_zero_shared(vmf);
+ } else {
+ return wp_pfn_shared(vmf);
+ }
+ }
pte_unmap_unlock(vmf->pte, vmf->ptl);
return wp_page_copy(vmf);
@@ -3970,12 +3997,16 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (unlikely(vma->vm_file && IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+ is_zero_pfn(page_to_pfn(page))))
+ entry = pte_mkspecial(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
lru_cache_add_inactive_or_unevictable(page, vma);
- } else {
+ } else if (likely(!vma->vm_file || !IS_ZEROPAGE(file_inode(vma->vm_file)) ||
+ !is_zero_pfn(page_to_pfn(page)))) {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..f4b23124826d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1899,8 +1899,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
alloc_nohuge:
- page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
+ if (IS_ZEROPAGE(inode) && vmf &&
+ !(vmf->flags & FAULT_FLAG_WRITE)) {
+ page = ZERO_PAGE(0);
+ goto out;
+ } else {
+ page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false);
+ }
}
if (IS_ERR(page)) {
int retry = 5;
--
2.33.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [RFC 0/1] memfd: Support mapping to zero page on reading
2021-12-22 12:33 [RFC 0/1] memfd: Support mapping to zero page on reading Peng Liang
2021-12-22 12:34 ` [RFC 1/1] " Peng Liang
@ 2022-01-04 14:44 ` David Hildenbrand
2022-01-12 2:30 ` Hugh Dickins
2 siblings, 0 replies; 7+ messages in thread
From: David Hildenbrand @ 2022-01-04 14:44 UTC (permalink / raw)
To: Peng Liang, linux-mm, linux-kernel
Cc: akpm, hughd, xiexiangyou, zhengchuan, wanghao232, dgilbert
On 22.12.21 13:33, Peng Liang wrote:
> Hi all,
>
> Recently we are working on implementing CRIU [1] for QEMU based on
> Steven's work [2]. It will use memfd to allocate guest memory in order
> to restore (inherit) it in the new QEMU process. However, memfd will
> allocate a new page for reading while anonymous memory will map to zero
> page for reading. For QEMU, memfd may cause all memory to be
> allocated during the migration because QEMU will read all pages in
> migration. It may lead to OOM if over-committed memory is enabled,
> which is usually enabled in public cloud.
Hi,
it's the exact same problem as if just migrating a VM after inflating
the balloon, or after reporting free memory to the hypervisor via
virtio-balloon free page reporting.
Even populating the shared zero page still wastes CPU time and more
importantly memory for page tables. Further, you'll end up reading the
whole page to discover that you just populated the shared zeropage, far
from optimal. Instead of doing that dance, just check if there is
something worth reading at all.
You could simply sense if a page is actually populated before going
ahead and reading it for migration. I actually discussed that recently
with Dave Gilbert.
For anonymous memory it's pretty straight forward via
/proc/self/pagemap. For files you can use lseek.
https://lkml.kernel.org/r/20210923064618.157046-2-tiberiu.georgescu@nutanix.com
Contains some details. There was a discussion to eventually have a
better bulk interface for it if it's necessary for performance.
--
Thanks,
David / dhildenb
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC 0/1] memfd: Support mapping to zero page on reading
2021-12-22 12:33 [RFC 0/1] memfd: Support mapping to zero page on reading Peng Liang
2021-12-22 12:34 ` [RFC 1/1] " Peng Liang
2022-01-04 14:44 ` [RFC 0/1] " David Hildenbrand
@ 2022-01-12 2:30 ` Hugh Dickins
2022-01-12 3:33 ` Yang Shi
2022-01-12 4:32 ` Matthew Wilcox
2 siblings, 2 replies; 7+ messages in thread
From: Hugh Dickins @ 2022-01-12 2:30 UTC (permalink / raw)
To: Peng Liang
Cc: David Hildenbrand, linux-mm, linux-kernel, akpm, hughd,
xiexiangyou, zhengchuan, wanghao232
On Wed, 22 Dec 2021, Peng Liang wrote:
> Hi all,
>
> Recently we are working on implementing CRIU [1] for QEMU based on
> Steven's work [2]. It will use memfd to allocate guest memory in order
> to restore (inherit) it in the new QEMU process. However, memfd will
> allocate a new page for reading while anonymous memory will map to zero
> page for reading. For QEMU, memfd may cause all memory to be
> allocated during the migration because QEMU will read all pages in
> migration. It may lead to OOM if over-committed memory is enabled,
> which is usually enabled in public cloud.
>
> In this patch I try to add support mapping to zero pages on reading
> memfd. On reading, memfd will map to zero page instead of allocating a
> new page. Then COW it when a write occurs.
>
> For now it's just a demo for discussion. There are lots of work to do,
> e.g.:
> 1. don't support THP;
> 2. don't support shared reading and writing, only for inheritance. For
> example:
> task1 | task2
> 1) read from addr |
> | 2) write to addr
> 3) read from addr again |
> then 3) will read 0 instead of the data task2 wrote in 2).
>
> Would something similar be welcome in the Linux?
David has made good suggestions on better avoiding the need for
such a change, for the use case you have in mind.
And I don't care for the particular RFC patch that you posted.
But I have to say that use of ZERO_PAGE for shmem/memfd/tmpfs read-fault
might (potentially) be very welcome. Not as some MFD_ZEROPAGE special
case, but as how it would always work. Deleting the shmem_recalc_inode()
cruft, which is there to correct accounting for the unmodified read-only
pages, after page reclaim has got around to freeing them later.
It does require more work than you gave it in 1/1: mainly, as you call
out above, there's a need to note in the mapping's XArray when ZERO_PAGE
has been used at an offset, and do an rmap walk to unmap those ptes when
a writable page is substituted - see __xip_unmap() in Linux 3.19's
mm/filemap_xip.c for such an rmap walk.
Though when this came up before (in the "no-fault mmap" MAP_NOSIGBUS
thread last year - which then got forgotten), Linus was wary of that
unmapping, and it was dropped for a simple MAP_PRIVATE implementation.
And I've never scoped out what is needed to protect the page from
writing in all circumstances: in principle, it ought to be easy by
giving shmem_vm_ops a page_mkwrite; but that's likely to come with
a performance penalty, which may not be justified for this case.
I didn't check what you did for write protection: maybe what you
did was enough, but one has to be very careful about that.
Making this change to ZERO_PAGE has never quite justified the effort
so far: temporarily allocated pages have worked well enough in most
circumstances.
Hugh
>
> Thanks,
> Peng
>
> [1] https://criu.org/Checkpoint/Restore
> [2] https://patchwork.kernel.org/project/qemu-devel/cover/1628286241-217457-1-git-send-email-steven.sistare@oracle.com/
>
> Peng Liang (1):
> memfd: Support mapping to zero page on reading memfd
>
> include/linux/fs.h | 2 ++
> include/uapi/linux/memfd.h | 1 +
> mm/memfd.c | 8 ++++++--
> mm/memory.c | 37 ++++++++++++++++++++++++++++++++++---
> mm/shmem.c | 10 ++++++++--
> 5 files changed, 51 insertions(+), 7 deletions(-)
>
> --
> 2.33.1
^ permalink raw reply [flat|nested] 7+ messages in thread