From: yulei.kernel@gmail.com
To: akpm@linux-foundation.org, naoya.horiguchi@nec.com,
	viro@zeniv.linux.org.uk, pbonzini@redhat.com
Cc: linux-fsdevel@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, xiaoguangrong.eric@gmail.com,
	kernellwp@gmail.com, lihaiwei.kernel@gmail.com,
	Yulei Zhang <yuleixzhang@tencent.com>,
	Chen Zhuo <sagazchen@tencent.com>
Subject: [PATCH 24/35] dmemfs: support hugepage for dmemfs
Date: Thu,  8 Oct 2020 15:54:14 +0800
Message-ID: <4d6038207c6472a0dd3084cbc77e70554fb9de91.1602093760.git.yuleixzhang@tencent.com>
In-Reply-To: <cover.1602093760.git.yuleixzhang@tencent.com>

From: Yulei Zhang <yuleixzhang@tencent.com>

Add hugepage support to dmemfs. We use PFN_DMEM to notify
vmf_insert_pfn_pmd(), and a dmem huge pmd is marked with
_PAGE_SPECIAL and _PAGE_DMEM, so that GUP-fast can distinguish
dmemfs pages from other page types and handle them correctly.

Signed-off-by: Chen Zhuo <sagazchen@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
---
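
A minimal userspace sketch of how the new PMD fault path gets exercised
(illustrative only, not part of the patch): the mount point /mnt/dmem, the
file name, and the 2MB dmem pagesize are assumptions here, with the pagesize
taken to be configured at mount time as set up earlier in this series.

/* map a dmemfs file and touch it so the new huge_fault() path runs */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4UL << 20;	/* multiple of the assumed 2MB dmem pagesize */
	int fd = open("/mnt/dmem/guest-mem", O_RDWR | O_CREAT, 0600);

	if (fd < 0 || ftruncate(fd, len)) {
		perror("setup");
		return 1;
	}

	/* MAP_SHARED is required; dmemfs_file_mmap() rejects private mappings */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* the first write faults in a PMD mapping via dmemfs_huge_fault() */
	p[0] = 1;

	munmap(p, len);
	close(fd);
	return 0;
}
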
 fs/dmemfs/inode.c | 113 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 2 deletions(-)

diff --git a/fs/dmemfs/inode.c b/fs/dmemfs/inode.c
index b3e394f33b42..53a9bf214e0d 100644
--- a/fs/dmemfs/inode.c
+++ b/fs/dmemfs/inode.c
@@ -460,7 +460,7 @@ static int dmemfs_split(struct vm_area_struct *vma, unsigned long addr)
 	return 0;
 }
 
-static vm_fault_t dmemfs_fault(struct vm_fault *vmf)
+static vm_fault_t __dmemfs_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct inode *inode = file_inode(vma->vm_file);
@@ -488,6 +488,63 @@ static vm_fault_t dmemfs_fault(struct vm_fault *vmf)
 	return ret;
 }
 
+static vm_fault_t __dmemfs_pmd_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	unsigned long page_addr;
+	struct inode *inode = file_inode(vma->vm_file);
+	void *entry;
+	phys_addr_t phys;
+	pfn_t pfn;
+	int ret;
+
+	if (dmem_page_size(inode) < PMD_SIZE)
+		return VM_FAULT_FALLBACK;
+
+	WARN_ON(pmd_addr < vma->vm_start ||
+		vma->vm_end < pmd_addr + PMD_SIZE);
+
+	page_addr = vmf->address & ~(dmem_page_size(inode) - 1);
+	entry = radix_get_create_entry(vma, page_addr, inode,
+				       linear_page_index(vma, page_addr));
+	if (IS_ERR(entry))
+		return (PTR_ERR(entry) == -ENOMEM) ?
+			VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
+	phys = dmem_addr_to_pfn(inode, dmem_entry_to_addr(inode, entry),
+				linear_page_index(vma, pmd_addr), PMD_SHIFT);
+	phys <<= PAGE_SHIFT;
+	pfn = phys_to_pfn_t(phys, PFN_DMEM);
+	ret = vmf_insert_pfn_pmd(vmf, pfn, !!(vma->vm_flags & VM_WRITE));
+
+	radix_put_entry();
+	return ret;
+}
+
+static vm_fault_t dmemfs_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
+{
+	int ret;
+
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		ret = __dmemfs_fault(vmf);
+		break;
+	case PE_SIZE_PMD:
+		ret = __dmemfs_pmd_fault(vmf);
+		break;
+	default:
+		ret = VM_FAULT_SIGBUS;
+	}
+
+	return ret;
+}
+
+static vm_fault_t dmemfs_fault(struct vm_fault *vmf)
+{
+	return dmemfs_huge_fault(vmf, PE_SIZE_PTE);
+}
+
 static unsigned long dmemfs_pagesize(struct vm_area_struct *vma)
 {
 	return dmem_page_size(file_inode(vma->vm_file));
@@ -498,6 +555,7 @@ static const struct vm_operations_struct dmemfs_vm_ops = {
 	.fault = dmemfs_fault,
 	.pagesize = dmemfs_pagesize,
 	.access = dmemfs_access_dmem,
+	.huge_fault = dmemfs_huge_fault,
 };
 
 int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -510,15 +568,66 @@ int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	vma->vm_flags |= VM_PFNMAP | VM_DMEM | VM_IO;
+	vma->vm_flags |= VM_PFNMAP | VM_DONTCOPY | VM_DMEM | VM_IO;
+
+	if (dmem_page_size(inode) != PAGE_SIZE)
+		vma->vm_flags |= VM_HUGEPAGE;
 
 	file_accessed(file);
 	vma->vm_ops = &dmemfs_vm_ops;
 	return 0;
 }
 
+/*
+ * If the size of the area returned by mm->get_unmapped_area() is one
+ * dmem pagesize larger than 'len', the address returned by
+ * mm->get_unmapped_area() can be rounded up to the dmem pagesize to
+ * satisfy the alignment requirement.
+ */
+static unsigned long
+dmemfs_get_unmapped_area(struct file *file, unsigned long addr,
+			 unsigned long len, unsigned long pgoff,
+			 unsigned long flags)
+{
+	unsigned long len_pad;
+	unsigned long off = pgoff << PAGE_SHIFT;
+	unsigned long align;
+
+	align = dmem_page_size(file_inode(file));
+
+	/* Fault fallback is not supported for pud or pmd pagesizes. */
+	if (len & (align - 1))
+		return -EINVAL;
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED) {
+		if (addr & (align - 1))
+			return -EINVAL;
+		return addr;
+	}
+
+	/*
+	 * Pad 'len' with an extra 'align' so that we can find an unmapped
+	 * area large enough to contain a dmemfs-pagesize-aligned mapping,
+	 * if the dmem pagesize is larger than 4K.
+	 */
+	len_pad = (align == PAGE_SIZE) ? len : len + align;
+
+	/* 'len' or 'off' is too large for pad. */
+	if (len_pad < len || (off + len_pad) < off)
+		return -EINVAL;
+
+	addr = current->mm->get_unmapped_area(file, addr, len_pad,
+					      pgoff, flags);
+
+	/* Round 'addr' up to the next dmem pagesize boundary. */
+	return IS_ERR_VALUE(addr) ? addr : round_up(addr, align);
+}
+
 static const struct file_operations dmemfs_file_operations = {
 	.mmap = dmemfs_file_mmap,
+	.get_unmapped_area = dmemfs_get_unmapped_area,
 };
 
 static int dmemfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
-- 
2.28.0
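
The padding logic in dmemfs_get_unmapped_area() above is easiest to see with
concrete numbers; the values below are purely illustrative:

	align = 2MB (dmem pagesize), len = 4MB  ->  len_pad = len + align = 6MB
	mm->get_unmapped_area() finds a 6MB gap and returns addr = 0x7f1234100000
	round_up(addr, 2MB) = 0x7f1234200000
	[0x7f1234200000, 0x7f1234200000 + 4MB) still lies inside the 6MB gap,
	so the final mapping is PMD-aligned without any further search.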



Thread overview: 61+ messages
2020-10-08  7:53 [PATCH 00/35] Enhance memory utilization with DMEMFS yulei.kernel
2020-10-08  7:53 ` [PATCH 01/35] fs: introduce dmemfs module yulei.kernel
2020-11-10 20:04   ` Al Viro
2020-11-11  8:53     ` yulei zhang
2020-11-11 23:09       ` Al Viro
2020-11-12 10:03         ` yulei zhang
2020-10-08  7:53 ` [PATCH 02/35] mm: support direct memory reservation yulei.kernel
2020-10-08 20:27   ` Randy Dunlap
2020-10-08 20:34   ` Randy Dunlap
2020-10-08  7:53 ` [PATCH 03/35] dmem: implement dmem memory management yulei.kernel
2020-10-08  7:53 ` [PATCH 04/35] dmem: let pat recognize dmem yulei.kernel
2020-10-13  7:27   ` Paolo Bonzini
2020-10-13  9:53     ` yulei zhang
2020-10-08  7:53 ` [PATCH 05/35] dmemfs: support mmap yulei.kernel
2020-10-08  7:53 ` [PATCH 06/35] dmemfs: support truncating inode down yulei.kernel
2020-10-08  7:53 ` [PATCH 07/35] dmem: trace core functions yulei.kernel
2020-10-08  7:53 ` [PATCH 08/35] dmem: show some statistic in debugfs yulei.kernel
2020-10-08 20:23   ` Randy Dunlap
2020-10-09 11:49     ` yulei zhang
2020-10-08  7:53 ` [PATCH 09/35] dmemfs: support remote access yulei.kernel
2020-10-08  7:54 ` [PATCH 10/35] dmemfs: introduce max_alloc_try_dpages parameter yulei.kernel
2020-10-08  7:54 ` [PATCH 11/35] mm: export mempolicy interfaces to serve dmem allocator yulei.kernel
2020-10-08  7:54 ` [PATCH 12/35] dmem: introduce mempolicy support yulei.kernel
2020-10-08  7:54 ` [PATCH 13/35] mm, dmem: introduce PFN_DMEM and pfn_t_dmem yulei.kernel
2020-10-08  7:54 ` [PATCH 14/35] mm, dmem: dmem-pmd vs thp-pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 15/35] mm: add pmd_special() check for pmd_trans_huge_lock() yulei.kernel
2020-10-08  7:54 ` [PATCH 16/35] dmemfs: introduce ->split() to dmemfs_vm_ops yulei.kernel
2020-10-08  7:54 ` [PATCH 17/35] mm, dmemfs: support unmap_page_range() for dmemfs pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 18/35] mm: follow_pmd_mask() for dmem huge pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 19/35] mm: gup_huge_pmd() " yulei.kernel
2020-10-08  7:54 ` [PATCH 20/35] mm: support dmem huge pmd for vmf_insert_pfn_pmd() yulei.kernel
2020-10-08  7:54 ` [PATCH 21/35] mm: support dmem huge pmd for follow_pfn() yulei.kernel
2020-10-08  7:54 ` [PATCH 22/35] kvm, x86: Distinguish dmemfs page from mmio page yulei.kernel
2020-10-09  0:58   ` Sean Christopherson
2020-10-09 10:28     ` Joao Martins
2020-10-09 11:42       ` yulei zhang
2020-10-08  7:54 ` [PATCH 23/35] kvm, x86: introduce VM_DMEM yulei.kernel
2020-10-08  7:54 ` yulei.kernel [this message]
2020-10-08  7:54 ` [PATCH 25/35] mm, x86, dmem: fix estimation of reserved page for vaddr_get_pfn() yulei.kernel
2020-10-08  7:54 ` [PATCH 26/35] mm, dmem: introduce pud_special() yulei.kernel
2020-10-08  7:54 ` [PATCH 27/35] mm: add pud_special() to support dmem huge pud yulei.kernel
2020-10-08  7:54 ` [PATCH 28/35] mm, dmemfs: support huge_fault() for dmemfs yulei.kernel
2020-10-08  7:54 ` [PATCH 29/35] mm: add follow_pte_pud() yulei.kernel
2020-10-08  7:54 ` [PATCH 30/35] dmem: introduce dmem_bitmap_alloc() and dmem_bitmap_free() yulei.kernel
2020-10-08  7:54 ` [PATCH 31/35] dmem: introduce mce handler yulei.kernel
2020-10-08  7:54 ` [PATCH 32/35] mm, dmemfs: register and handle the dmem mce yulei.kernel
2020-10-08  7:54 ` [PATCH 33/35] kvm, x86: temporary disable record_steal_time for dmem yulei.kernel
2020-10-08  7:54 ` [PATCH 34/35] dmem: add dmem unit tests yulei.kernel
2020-10-08  7:54 ` [PATCH 35/35] Add documentation for dmemfs yulei.kernel
2020-10-09  1:26   ` Randy Dunlap
2020-10-08 19:01 ` [PATCH 00/35] Enhance memory utilization with DMEMFS Joao Martins
2020-10-09 11:39   ` yulei zhang
2020-10-09 11:53     ` Joao Martins
2020-10-10  8:15       ` yulei zhang
2020-10-12 10:59         ` Joao Martins
2020-10-14 22:25           ` Dan Williams
2020-10-19 13:37             ` Paolo Bonzini
2020-10-19 19:03               ` Joao Martins
2020-10-20 15:22                 ` yulei zhang
2020-10-12 11:57 ` Zengtao (B)
2020-10-13  2:45   ` yulei zhang
