From: yulei.kernel@gmail.com
To: akpm@linux-foundation.org, naoya.horiguchi@nec.com,
	viro@zeniv.linux.org.uk, pbonzini@redhat.com
Cc: linux-fsdevel@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, xiaoguangrong.eric@gmail.com,
	kernellwp@gmail.com, lihaiwei.kernel@gmail.com,
	Yulei Zhang <yuleixzhang@tencent.com>,
	Xiao Guangrong <gloryxiao@tencent.com>
Subject: [PATCH 05/35] dmemfs: support mmap
Date: Thu,  8 Oct 2020 15:53:55 +0800
Message-ID: <21b236c361e48a8e1118c681570dbe79ac7336db.1602093760.git.yuleixzhang@tencent.com>
In-Reply-To: <cover.1602093760.git.yuleixzhang@tencent.com>

From: Yulei Zhang <yuleixzhang@tencent.com>

This patch adds mmap support. Note that the file will be extended
if mmap's offset goes beyond its current size, which drops the
requirement for a write() operation; however, truncating the file
down is not supported yet.
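
For illustration, a minimal userspace sketch of how such a mapping
might be used. The mount point, file name and page size below are
hypothetical and not part of this patch; MAP_SHARED is required
since private mappings are rejected, the offset must be aligned to
the dmem page size chosen at mount time, and sizing the file with
ftruncate() first is an assumption of this sketch:

    /* sketch: assumes dmemfs mounted at /mnt/dmemfs, pagesize=2M */
    #include <fcntl.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 2UL << 20; /* one assumed 2MB dmem page */
            int fd = open("/mnt/dmemfs/mem", O_RDWR | O_CREAT, 0600);
            void *p;

            if (fd < 0)
                    return 1;
            if (ftruncate(fd, len) < 0)  /* size the file up front */
                    return 1;
            /* MAP_SHARED is mandatory; offset 0 is dmem-page aligned */
            p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            memset(p, 0, len);  /* first touch triggers dmemfs_fault() */
            munmap(p, len);
            close(fd);
            return 0;
    }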

Signed-off-by: Xiao Guangrong <gloryxiao@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
---
 fs/dmemfs/inode.c    | 337 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/dmem.h |  10 ++
 2 files changed, 345 insertions(+), 2 deletions(-)

diff --git a/fs/dmemfs/inode.c b/fs/dmemfs/inode.c
index 6a8a2d9f94e9..21d2f951b4ea 100644
--- a/fs/dmemfs/inode.c
+++ b/fs/dmemfs/inode.c
@@ -26,6 +26,7 @@
 #include <linux/pagevec.h>
 #include <linux/fs_parser.h>
 #include <linux/seq_file.h>
+#include <linux/dmem.h>
 
 MODULE_AUTHOR("Tencent Corporation");
 MODULE_LICENSE("GPL v2");
@@ -105,8 +106,250 @@ static const struct inode_operations dmemfs_file_inode_operations = {
 	.getattr = simple_getattr,
 };
 
+static unsigned long dmem_pgoff_to_index(struct inode *inode, pgoff_t pgoff)
+{
+	struct super_block *sb = inode->i_sb;
+
+	return pgoff >> (sb->s_blocksize_bits - PAGE_SHIFT);
+}
+
+static void *dmem_addr_to_entry(struct inode *inode, phys_addr_t addr)
+{
+	struct super_block *sb = inode->i_sb;
+
+	addr >>= sb->s_blocksize_bits;
+	return xa_mk_value(addr);
+}
+
+static phys_addr_t dmem_entry_to_addr(struct inode *inode, void *entry)
+{
+	struct super_block *sb = inode->i_sb;
+
+	WARN_ON(!xa_is_value(entry));
+	return xa_to_value(entry) << sb->s_blocksize_bits;
+}
+
+static unsigned long
+dmem_addr_to_pfn(struct inode *inode, phys_addr_t addr, pgoff_t pgoff,
+		 unsigned int fault_shift)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long pfn = addr >> PAGE_SHIFT;
+	unsigned long mask;
+
+	mask = (1UL << ((unsigned int)sb->s_blocksize_bits - fault_shift)) - 1;
+	mask <<= fault_shift - PAGE_SHIFT;
+
+	return pfn + (pgoff & mask);
+}
+
+static inline unsigned long dmem_page_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static int check_inode_size(struct inode *inode, loff_t offset)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (offset >= i_size_read(inode))
+		return -EINVAL;
+
+	return 0;
+}
+
+static unsigned
+dmemfs_find_get_entries(struct address_space *mapping, unsigned long start,
+			unsigned int nr_entries, void **entries,
+			unsigned long *indices)
+{
+	XA_STATE(xas, &mapping->i_pages, start);
+
+	void *entry;
+	unsigned int ret = 0;
+
+	if (!nr_entries)
+		return 0;
+
+	rcu_read_lock();
+
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		if (xas_retry(&xas, entry))
+			continue;
+
+		if (xa_is_value(entry))
+			goto export;
+
+		if (unlikely(entry != xas_reload(&xas)))
+			goto retry;
+
+export:
+		indices[ret] = xas.xa_index;
+		entries[ret] = entry;
+		if (++ret == nr_entries)
+			break;
+		continue;
+retry:
+		xas_reset(&xas);
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static void *find_radix_entry_or_next(struct address_space *mapping,
+				      unsigned long start,
+				      unsigned long *eindex)
+{
+	void *entry = NULL;
+
+	dmemfs_find_get_entries(mapping, start, 1, &entry, eindex);
+	return entry;
+}
+
+/*
+ * Find the entry in the radix tree based on @index; create it
+ * if it does not exist.
+ *
+ * On success the entry is returned with the RCU read lock held,
+ * otherwise an ERR_PTR() is returned.
+ */
+static void *
+radix_get_create_entry(struct vm_area_struct *vma, unsigned long fault_addr,
+		       struct inode *inode, pgoff_t pgoff)
+{
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long eindex, index;
+	loff_t offset;
+	phys_addr_t addr;
+	gfp_t gfp_masks = mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM;
+	void *entry;
+	unsigned int try_dpages, dpages;
+	int ret;
+
+retry:
+	offset = ((loff_t)pgoff << PAGE_SHIFT);
+	index = dmem_pgoff_to_index(inode, pgoff);
+	rcu_read_lock();
+	ret = check_inode_size(inode, offset);
+	if (ret) {
+		rcu_read_unlock();
+		return ERR_PTR(ret);
+	}
+
+	try_dpages = dmem_pgoff_to_index(inode, (i_size_read(inode) - offset)
+				     >> PAGE_SHIFT);
+	entry = find_radix_entry_or_next(mapping, index, &eindex);
+	if (entry) {
+		WARN_ON(!xa_is_value(entry));
+		if (eindex == index)
+			return entry;
+
+		WARN_ON(eindex <= index);
+		try_dpages = eindex - index;
+	}
+	rcu_read_unlock();
+
+	/* entry does not exist, create it */
+	addr = dmem_alloc_pages_vma(vma, fault_addr, try_dpages, &dpages);
+	if (!addr) {
+		/*
+		 * do not return -ENOMEM as that would trigger the OOM
+		 * killer, which cannot reclaim dmem pages
+		 */
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	try_dpages = dpages;
+	while (dpages) {
+		rcu_read_lock();
+		ret = check_inode_size(inode, offset);
+		if (ret)
+			goto unlock_rcu;
+
+		entry = dmem_addr_to_entry(inode, addr);
+		entry = xa_store(&mapping->i_pages, index, entry, gfp_masks);
+		if (!xa_is_err(entry)) {
+			addr += inode->i_sb->s_blocksize;
+			offset += inode->i_sb->s_blocksize;
+			dpages--;
+			mapping->nrexceptional++;
+			index++;
+		}
+
+unlock_rcu:
+		rcu_read_unlock();
+		if (ret)
+			break;
+	}
+
+	if (dpages)
+		dmem_free_pages(addr, dpages);
+
+	/* we have created some entries, let's retry it */
+	if (ret == -EEXIST || try_dpages != dpages)
+		goto retry;
+exit:
+	return ERR_PTR(ret);
+}
+
+static void radix_put_entry(void)
+{
+	rcu_read_unlock();
+}
+
+static vm_fault_t dmemfs_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct inode *inode = file_inode(vma->vm_file);
+	phys_addr_t addr;
+	void *entry;
+	vm_fault_t ret;
+
+	if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
+		return VM_FAULT_SIGBUS;
+
+	entry = radix_get_create_entry(vma, (unsigned long)vmf->address,
+				       inode, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		ret = VM_FAULT_SIGBUS;	/* beyond i_size or no dmem page */
+		goto exit;
+	}
+
+	addr = dmem_entry_to_addr(inode, entry);
+	ret = vmf_insert_pfn(vma, (unsigned long)vmf->address,
+			    dmem_addr_to_pfn(inode, addr, vmf->pgoff,
+					     PAGE_SHIFT));
+	radix_put_entry();
+
+exit:
+	return ret;
+}
+
+static unsigned long dmemfs_pagesize(struct vm_area_struct *vma)
+{
+	return dmem_page_size(file_inode(vma->vm_file));
+}
+
+static const struct vm_operations_struct dmemfs_vm_ops = {
+	.fault = dmemfs_fault,
+	.pagesize = dmemfs_pagesize,
+};
+
 int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct inode *inode = file_inode(file);
+
+	if (vma->vm_pgoff & ((dmem_page_size(inode) - 1) >> PAGE_SHIFT))
+		return -EINVAL;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	vma->vm_flags |= VM_PFNMAP;
+
+	file_accessed(file);
+	vma->vm_ops = &dmemfs_vm_ops;
 	return 0;
 }
 
@@ -189,9 +432,86 @@ static int dmemfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+/*
+ * the caller should make sure the dmem pages in the dropped
+ * region are not being mapped by any process
+ */
+static void inode_drop_dpages(struct inode *inode, loff_t start, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	unsigned long istart, iend, indices[PAGEVEC_SIZE];
+	int i;
+
+	/* we never use normal pages */
+	WARN_ON(mapping->nrpages);
+
+	/* if no dpage is allocated for the inode */
+	if (!mapping->nrexceptional)
+		return;
+
+	istart = dmem_pgoff_to_index(inode, start >> PAGE_SHIFT);
+	iend = dmem_pgoff_to_index(inode, end >> PAGE_SHIFT);
+	pagevec_init(&pvec);
+	while (istart < iend) {
+		pvec.nr = dmemfs_find_get_entries(mapping, istart,
+				min(iend - istart,
+				(unsigned long)PAGEVEC_SIZE),
+				(void **)pvec.pages,
+				indices);
+		if (!pvec.nr)
+			break;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			phys_addr_t addr;
+
+			istart = indices[i];
+			if (istart >= iend)
+				break;
+
+			xa_erase(&mapping->i_pages, istart);
+			mapping->nrexceptional--;
+
+			addr = dmem_entry_to_addr(inode, pvec.pages[i]);
+			dmem_free_page(addr);
+		}
+
+		/*
+		 * the pagevec holds only exceptional entries, so it
+		 * is safe to reinit it
+		 */
+		pagevec_reinit(&pvec);
+		cond_resched();
+		istart++;
+	}
+}
+
+static void dmemfs_evict_inode(struct inode *inode)
+{
+	/* no VMA works on it */
+	WARN_ON(!RB_EMPTY_ROOT(&inode->i_data.i_mmap.rb_root));
+
+	inode_drop_dpages(inode, 0, LLONG_MAX);
+	clear_inode(inode);
+}
+
+/*
+ * Display the mount options in /proc/mounts.
+ */
+static int dmemfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct dmemfs_fs_info *fsi = root->d_sb->s_fs_info;
+
+	if (check_dpage_size(fsi->mount_opts.dpage_size))
+		seq_printf(m, ",pagesize=%lx", fsi->mount_opts.dpage_size);
+	return 0;
+}
+
 static const struct super_operations dmemfs_ops = {
 	.statfs	= dmemfs_statfs,
+	.evict_inode = dmemfs_evict_inode,
 	.drop_inode = generic_delete_inode,
+	.show_options = dmemfs_show_options,
 };
 
 static int
@@ -199,6 +519,7 @@ dmemfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct inode *inode;
 	struct dmemfs_fs_info *fsi = sb->s_fs_info;
+	int ret;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = fsi->mount_opts.dpage_size;
@@ -207,11 +528,17 @@ dmemfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_op = &dmemfs_ops;
 	sb->s_time_gran = 1;
 
+	ret = dmem_alloc_init(sb->s_blocksize_bits);
+	if (ret)
+		return ret;
+
 	inode = dmemfs_get_inode(sb, NULL, S_IFDIR, 0);
 	sb->s_root = d_make_root(inode);
-	if (!sb->s_root)
-		return -ENOMEM;
 
+	if (!sb->s_root) {
+		dmem_alloc_uinit();
+		return -ENOMEM;
+	}
 	return 0;
 }
 
@@ -247,7 +574,13 @@ int dmemfs_init_fs_context(struct fs_context *fc)
 
 static void dmemfs_kill_sb(struct super_block *sb)
 {
+	bool has_inode = !!sb->s_root;
+
 	kill_litter_super(sb);
+
+	/* do not uninit dmem allocator if mount failed */
+	if (has_inode)
+		dmem_alloc_uinit();
 }
 
 static struct file_system_type dmemfs_fs_type = {
diff --git a/include/linux/dmem.h b/include/linux/dmem.h
index 476a82e8f252..8682d63ed43a 100644
--- a/include/linux/dmem.h
+++ b/include/linux/dmem.h
@@ -10,6 +10,16 @@ int dmem_region_register(int node, phys_addr_t start, phys_addr_t end);
 int dmem_alloc_init(unsigned long dpage_shift);
 void dmem_alloc_uinit(void);
 
+phys_addr_t
+dmem_alloc_pages_nodemask(int nid, nodemask_t *nodemask, unsigned int try_max,
+			  unsigned int *result_nr);
+
+phys_addr_t
+dmem_alloc_pages_vma(struct vm_area_struct *vma, unsigned long addr,
+		     unsigned int try_max, unsigned int *result_nr);
+
+void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr);
+#define dmem_free_page(addr)	dmem_free_pages(addr, 1)
 #else
 static inline int dmem_reserve_init(void)
 {
-- 
2.28.0


Thread overview: 61+ messages
2020-10-08  7:53 [PATCH 00/35] Enhance memory utilization with DMEMFS yulei.kernel
2020-10-08  7:53 ` [PATCH 01/35] fs: introduce dmemfs module yulei.kernel
2020-11-10 20:04   ` Al Viro
2020-11-11  8:53     ` yulei zhang
2020-11-11 23:09       ` Al Viro
2020-11-12 10:03         ` yulei zhang
2020-10-08  7:53 ` [PATCH 02/35] mm: support direct memory reservation yulei.kernel
2020-10-08 20:27   ` Randy Dunlap
2020-10-08 20:34   ` Randy Dunlap
2020-10-08  7:53 ` [PATCH 03/35] dmem: implement dmem memory management yulei.kernel
2020-10-08  7:53 ` [PATCH 04/35] dmem: let pat recognize dmem yulei.kernel
2020-10-13  7:27   ` Paolo Bonzini
2020-10-13  9:53     ` yulei zhang
2020-10-08  7:53 ` yulei.kernel [this message]
2020-10-08  7:53 ` [PATCH 06/35] dmemfs: support truncating inode down yulei.kernel
2020-10-08  7:53 ` [PATCH 07/35] dmem: trace core functions yulei.kernel
2020-10-08  7:53 ` [PATCH 08/35] dmem: show some statistic in debugfs yulei.kernel
2020-10-08 20:23   ` Randy Dunlap
2020-10-09 11:49     ` yulei zhang
2020-10-08  7:53 ` [PATCH 09/35] dmemfs: support remote access yulei.kernel
2020-10-08  7:54 ` [PATCH 10/35] dmemfs: introduce max_alloc_try_dpages parameter yulei.kernel
2020-10-08  7:54 ` [PATCH 11/35] mm: export mempolicy interfaces to serve dmem allocator yulei.kernel
2020-10-08  7:54 ` [PATCH 12/35] dmem: introduce mempolicy support yulei.kernel
2020-10-08  7:54 ` [PATCH 13/35] mm, dmem: introduce PFN_DMEM and pfn_t_dmem yulei.kernel
2020-10-08  7:54 ` [PATCH 14/35] mm, dmem: dmem-pmd vs thp-pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 15/35] mm: add pmd_special() check for pmd_trans_huge_lock() yulei.kernel
2020-10-08  7:54 ` [PATCH 16/35] dmemfs: introduce ->split() to dmemfs_vm_ops yulei.kernel
2020-10-08  7:54 ` [PATCH 17/35] mm, dmemfs: support unmap_page_range() for dmemfs pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 18/35] mm: follow_pmd_mask() for dmem huge pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 19/35] mm: gup_huge_pmd() " yulei.kernel
2020-10-08  7:54 ` [PATCH 20/35] mm: support dmem huge pmd for vmf_insert_pfn_pmd() yulei.kernel
2020-10-08  7:54 ` [PATCH 21/35] mm: support dmem huge pmd for follow_pfn() yulei.kernel
2020-10-08  7:54 ` [PATCH 22/35] kvm, x86: Distinguish dmemfs page from mmio page yulei.kernel
2020-10-09  0:58   ` Sean Christopherson
2020-10-09 10:28     ` Joao Martins
2020-10-09 11:42       ` yulei zhang
2020-10-08  7:54 ` [PATCH 23/35] kvm, x86: introduce VM_DMEM yulei.kernel
2020-10-08  7:54 ` [PATCH 24/35] dmemfs: support hugepage for dmemfs yulei.kernel
2020-10-08  7:54 ` [PATCH 25/35] mm, x86, dmem: fix estimation of reserved page for vaddr_get_pfn() yulei.kernel
2020-10-08  7:54 ` [PATCH 26/35] mm, dmem: introduce pud_special() yulei.kernel
2020-10-08  7:54 ` [PATCH 27/35] mm: add pud_special() to support dmem huge pud yulei.kernel
2020-10-08  7:54 ` [PATCH 28/35] mm, dmemfs: support huge_fault() for dmemfs yulei.kernel
2020-10-08  7:54 ` [PATCH 29/35] mm: add follow_pte_pud() yulei.kernel
2020-10-08  7:54 ` [PATCH 30/35] dmem: introduce dmem_bitmap_alloc() and dmem_bitmap_free() yulei.kernel
2020-10-08  7:54 ` [PATCH 31/35] dmem: introduce mce handler yulei.kernel
2020-10-08  7:54 ` [PATCH 32/35] mm, dmemfs: register and handle the dmem mce yulei.kernel
2020-10-08  7:54 ` [PATCH 33/35] kvm, x86: temporary disable record_steal_time for dmem yulei.kernel
2020-10-08  7:54 ` [PATCH 34/35] dmem: add dmem unit tests yulei.kernel
2020-10-08  7:54 ` [PATCH 35/35] Add documentation for dmemfs yulei.kernel
2020-10-09  1:26   ` Randy Dunlap
2020-10-08 19:01 ` [PATCH 00/35] Enhance memory utilization with DMEMFS Joao Martins
2020-10-09 11:39   ` yulei zhang
2020-10-09 11:53     ` Joao Martins
2020-10-10  8:15       ` yulei zhang
2020-10-12 10:59         ` Joao Martins
2020-10-14 22:25           ` Dan Williams
2020-10-19 13:37             ` Paolo Bonzini
2020-10-19 19:03               ` Joao Martins
2020-10-20 15:22                 ` yulei zhang
2020-10-12 11:57 ` Zengtao (B)
2020-10-13  2:45   ` yulei zhang
