linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: yulei.kernel@gmail.com
To: linux-mm@kvack.org, akpm@linux-foundation.org,
	linux-fsdevel@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, naoya.horiguchi@nec.com,
	viro@zeniv.linux.org.uk, pbonzini@redhat.com
Cc: joao.m.martins@oracle.com, rdunlap@infradead.org,
	sean.j.christopherson@intel.com, xiaoguangrong.eric@gmail.com,
	kernellwp@gmail.com, lihaiwei.kernel@gmail.com,
	Yulei Zhang <yuleixzhang@tencent.com>,
	Xiao Guangrong <gloryxiao@tencent.com>
Subject: [RFC V2 05/37] dmemfs: support mmap for dmemfs
Date: Mon,  7 Dec 2020 19:30:58 +0800	[thread overview]
Message-ID: <556903717e3d0b0fc0b9583b709f4b34be2154cb.1607332046.git.yuleixzhang@tencent.com> (raw)
In-Reply-To: <cover.1607332046.git.yuleixzhang@tencent.com>

From: Yulei Zhang <yuleixzhang@tencent.com>

This patch adds mmap support. Note that the file will be extended if
the mmap offset lies beyond its end, which removes the need for a
write() operation; however, truncating the file down is not supported yet.

Signed-off-by: Xiao Guangrong <gloryxiao@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
---
 fs/dmemfs/inode.c    | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/dmem.h |  10 ++
 2 files changed, 351 insertions(+), 2 deletions(-)

diff --git a/fs/dmemfs/inode.c b/fs/dmemfs/inode.c
index 0aa3d3b..7b6e51d 100644
--- a/fs/dmemfs/inode.c
+++ b/fs/dmemfs/inode.c
@@ -26,6 +26,7 @@
 #include <linux/pagevec.h>
 #include <linux/fs_parser.h>
 #include <linux/seq_file.h>
+#include <linux/dmem.h>
 
 MODULE_AUTHOR("Tencent Corporation");
 MODULE_LICENSE("GPL v2");
@@ -102,7 +103,255 @@ static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
 	.getattr = simple_getattr,
 };
 
+/* Convert a PAGE_SIZE-granular page offset into a dmem-page index. */
+static unsigned long dmem_pgoff_to_index(struct inode *inode, pgoff_t pgoff)
+{
+	/* one dmem page spans 1 << (s_blocksize_bits - PAGE_SHIFT) pages */
+	return pgoff >> (inode->i_sb->s_blocksize_bits - PAGE_SHIFT);
+}
+
+/* Encode a dmem page's physical address as an xarray value entry. */
+static void *dmem_addr_to_entry(struct inode *inode, phys_addr_t addr)
+{
+	phys_addr_t dpage_nr = addr >> inode->i_sb->s_blocksize_bits;
+
+	return xa_mk_value(dpage_nr);
+}
+
+/* Decode an xarray value entry back into a dmem physical address. */
+static phys_addr_t dmem_entry_to_addr(struct inode *inode, void *entry)
+{
+	WARN_ON(!xa_is_value(entry));
+	/* widen first: phys_addr_t may be larger than unsigned long */
+	return (phys_addr_t)xa_to_value(entry) << inode->i_sb->s_blocksize_bits;
+}
+
+/* pfn backing @pgoff within the dmem page at @addr, in fault-size units */
+static unsigned long
+dmem_addr_to_pfn(struct inode *inode, phys_addr_t addr, pgoff_t pgoff,
+		 unsigned int fault_shift)
+{
+	unsigned int dpage_shift = inode->i_sb->s_blocksize_bits;
+	unsigned long units = 1UL << (dpage_shift - fault_shift);
+	/* keep the pgoff bits below the dmem page, above the fault size */
+	unsigned long mask = (units - 1) << (fault_shift - PAGE_SHIFT);
+	unsigned long pfn = addr >> PAGE_SHIFT;
+
+	return pfn + (pgoff & mask);
+}
+
+static inline unsigned long dmem_page_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;	/* == dmem page size */
+}
+
+/*
+ * Return 0 when @offset is inside the file, -EINVAL when it is at or
+ * beyond i_size. Warns once if called outside an RCU read-side section.
+ */
+static int check_inode_size(struct inode *inode, loff_t offset)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return offset < i_size_read(inode) ? 0 : -EINVAL;
+}
+
+/*
+ * Collect up to @nr_entries present entries of @mapping->i_pages starting
+ * at index @start; fills @entries / @indices and returns how many were found.
+ */
+static unsigned
+dmemfs_find_get_entries(struct address_space *mapping, unsigned long start,
+			unsigned int nr_entries, void **entries,
+			unsigned long *indices)
+{
+	XA_STATE(xas, &mapping->i_pages, start);
+	unsigned int found = 0;
+	void *entry;
+
+	if (!nr_entries)
+		return 0;
+
+	rcu_read_lock();
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		if (xas_retry(&xas, entry))
+			continue;
+
+		/* value entries are exported as-is, others are re-checked */
+		if (!xa_is_value(entry) &&
+		    unlikely(entry != xas_reload(&xas))) {
+			xas_reset(&xas);
+			continue;
+		}
+
+		indices[found] = xas.xa_index;
+		entries[found] = entry;
+		if (++found == nr_entries)
+			break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+static void *find_radix_entry_or_next(struct address_space *mapping,
+				      unsigned long start,
+				      unsigned long *eindex)
+{
+	void *found = NULL;	/* result when no entry is at or after @start */
+
+	dmemfs_find_get_entries(mapping, start, 1, &found, eindex);
+	return found;
+}
+
+/*
+ * Look up the xarray entry backing @pgoff, allocating dmem pages
+ * and inserting entries for them if it does not exist yet.
+ *
+ * On success the entry is returned with rcu_read_lock() still held
+ * (see radix_put_entry()); on failure ERR_PTR() is returned unlocked.
+ */
+static void *
+radix_get_create_entry(struct vm_area_struct *vma, unsigned long fault_addr,
+		       struct inode *inode, pgoff_t pgoff)
+{
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long eindex, index;
+	loff_t offset;
+	phys_addr_t addr;
+	gfp_t gfp_masks = mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM;
+	void *entry;
+	unsigned int try_dpages, dpages;
+	int ret;
+
+retry:
+	offset = ((loff_t)pgoff << PAGE_SHIFT);
+	index = dmem_pgoff_to_index(inode, pgoff);
+	rcu_read_lock();
+	ret = check_inode_size(inode, offset);
+	if (ret) {
+		rcu_read_unlock();
+		return ERR_PTR(ret);
+	}
+
+	try_dpages = dmem_pgoff_to_index(inode, (i_size_read(inode) - offset)
+				     >> PAGE_SHIFT);
+	entry = find_radix_entry_or_next(mapping, index, &eindex);
+	if (entry) {
+		WARN_ON(!xa_is_value(entry));
+		if (eindex == index)
+			return entry;
+
+		WARN_ON(eindex <= index);
+		try_dpages = eindex - index;
+	}
+	rcu_read_unlock();
+
+	/* no entry at @index: allocate up to try_dpages dmem pages for it */
+	addr = dmem_alloc_pages_vma(vma, fault_addr, try_dpages, &dpages);
+	if (!addr) {
+		/*
+		 * Do not return -ENOMEM: that would trigger the OOM killer,
+		 * which cannot help since it does not reclaim dmem pages.
+		 */
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	try_dpages = dpages;
+	while (dpages) {
+		rcu_read_lock();
+		ret = check_inode_size(inode, offset);
+		if (ret)
+			goto unlock_rcu;
+
+		entry = dmem_addr_to_entry(inode, addr);
+		entry = xa_store(&mapping->i_pages, index, entry, gfp_masks);
+		if (!xa_is_err(entry)) {
+			addr += inode->i_sb->s_blocksize;
+			offset += inode->i_sb->s_blocksize;
+			dpages--;
+			mapping->nrexceptional++;
+			index++;
+		}
+		/* NOTE(review): xa_store() errors leave ret 0 and just loop */
+unlock_rcu:
+		rcu_read_unlock();
+		if (ret)
+			break;
+	}
+
+	if (dpages)
+		dmem_free_pages(addr, dpages);
+
+	/* entries were inserted (ret is never -EEXIST here): look up again */
+	if (ret == -EEXIST || try_dpages != dpages)
+		goto retry;
+exit:
+	return ERR_PTR(ret);
+}
+
+static void radix_put_entry(void)
+{
+	rcu_read_unlock();	/* taken by radix_get_create_entry() */
+}
+
+/* ->fault: insert the pfn of the dmem page backing vmf->pgoff into the vma */
+static vm_fault_t dmemfs_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct inode *inode = file_inode(vma->vm_file);
+	phys_addr_t addr;
+	vm_fault_t ret;
+	void *entry;
+
+	if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
+		return VM_FAULT_SIGBUS;
+
+	/* a valid entry comes back with the RCU read lock held */
+	entry = radix_get_create_entry(vma, (unsigned long)vmf->address,
+				       inode, vmf->pgoff);
+	/* never leak an errno (or NULL) as vm_fault_t; fail with SIGBUS */
+	if (IS_ERR_OR_NULL(entry))
+		return VM_FAULT_SIGBUS;
+
+	addr = dmem_entry_to_addr(inode, entry);
+	ret = vmf_insert_pfn(vma, (unsigned long)vmf->address,
+			    dmem_addr_to_pfn(inode, addr, vmf->pgoff,
+					     PAGE_SHIFT));
+	radix_put_entry();
+
+	return ret;
+}
+
+static unsigned long dmemfs_pagesize(struct vm_area_struct *vma)
+{
+	return dmem_page_size(file_inode(vma->vm_file)); /* dmem page size */
+}
+
+static const struct vm_operations_struct dmemfs_vm_ops = {
+	.pagesize = dmemfs_pagesize,	/* one dmem page */
+	.fault = dmemfs_fault,		/* maps the backing dmem page */
+};
+
+/* ->mmap: accept only shared mappings aligned to the dmem page size */
+int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long dpage_size = dmem_page_size(file_inode(file));
+	bool misaligned = vma->vm_pgoff & ((dpage_size - 1) >> PAGE_SHIFT);
+
+	if (misaligned || !(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	/* pages are inserted as raw pfns by the fault handler */
+	vma->vm_flags |= VM_PFNMAP;
+
+	file_accessed(file);
+	vma->vm_ops = &dmemfs_vm_ops;
+	return 0;
+}
+
 static const struct file_operations dmemfs_file_operations = {
+	.mmap = dmemfs_file_mmap,
 };
 
 static int dmemfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
@@ -180,9 +429,86 @@ static int dmemfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+/*
+ * The caller must make sure that no dmem page in the dropped region
+ * is still mapped by any process.
+ */
+static void inode_drop_dpages(struct inode *inode, loff_t start, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	unsigned long istart, iend, indices[PAGEVEC_SIZE];
+	int i;
+
+	/* we never use normal pages */
+	WARN_ON(mapping->nrpages);
+
+	/* nothing to do if no dpage is allocated for the inode */
+	if (!mapping->nrexceptional)
+		return;
+
+	istart = dmem_pgoff_to_index(inode, start >> PAGE_SHIFT);
+	iend = dmem_pgoff_to_index(inode, end >> PAGE_SHIFT);
+	pagevec_init(&pvec);
+	while (istart < iend) {
+		pvec.nr = dmemfs_find_get_entries(mapping, istart,
+				min(iend - istart,
+				(unsigned long)PAGEVEC_SIZE),
+				(void **)pvec.pages,
+				indices);
+		if (!pvec.nr)
+			break;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			phys_addr_t addr;
+
+			istart = indices[i];
+			if (istart >= iend)
+				break;
+
+			xa_erase(&mapping->i_pages, istart);
+			mapping->nrexceptional--;
+
+			addr = dmem_entry_to_addr(inode, pvec.pages[i]);
+			dmem_free_page(addr);
+		}
+
+		/*
+		 * the pagevec only holds value entries, no page
+		 * references, so it is safe to simply reinit it
+		 */
+		pagevec_reinit(&pvec);
+		cond_resched();
+		istart++;
+	}
+}
+
+static void dmemfs_evict_inode(struct inode *inode)
+{
+	/* the inode must no longer be mapped by any VMA */
+	WARN_ON(!RB_EMPTY_ROOT(&inode->i_data.i_mmap.rb_root));
+
+	inode_drop_dpages(inode, 0, LLONG_MAX);
+	clear_inode(inode);
+}
+
+/*
+ * Display the mount options in /proc/mounts (pagesize is shown in hex).
+ */
+static int dmemfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct dmemfs_fs_info *fsi = root->d_sb->s_fs_info;
+
+	if (check_dpage_size(fsi->mount_opts.dpage_size))
+		seq_printf(m, ",pagesize=%lx", fsi->mount_opts.dpage_size);
+	return 0;
+}
+
 static const struct super_operations dmemfs_ops = {
 	.statfs	= dmemfs_statfs,
+	.evict_inode = dmemfs_evict_inode,
 	.drop_inode = generic_delete_inode,
+	.show_options = dmemfs_show_options,
 };
 
 static int
@@ -190,6 +516,7 @@ static int dmemfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct inode *inode;
 	struct dmemfs_fs_info *fsi = sb->s_fs_info;
+	int ret;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = fsi->mount_opts.dpage_size;
@@ -198,11 +525,17 @@ static int dmemfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	sb->s_op = &dmemfs_ops;
 	sb->s_time_gran = 1;
 
+	ret = dmem_alloc_init(sb->s_blocksize_bits);
+	if (ret)
+		return ret;
+
 	inode = dmemfs_get_inode(sb, NULL, S_IFDIR);
 	sb->s_root = d_make_root(inode);
-	if (!sb->s_root)
-		return -ENOMEM;
 
+	if (!sb->s_root) {
+		dmem_alloc_uinit();
+		return -ENOMEM;
+	}
 	return 0;
 }
 
@@ -238,7 +571,13 @@ int dmemfs_init_fs_context(struct fs_context *fc)
 
 static void dmemfs_kill_sb(struct super_block *sb)
 {
+	bool has_inode = !!sb->s_root;
+
 	kill_litter_super(sb);
+
+	/* no root dentry means the mount failed: nothing to uninit then */
+	if (has_inode)
+		dmem_alloc_uinit();
 }
 
 static struct file_system_type dmemfs_fs_type = {
diff --git a/include/linux/dmem.h b/include/linux/dmem.h
index 476a82e..8682d63 100644
--- a/include/linux/dmem.h
+++ b/include/linux/dmem.h
@@ -10,6 +10,16 @@
 int dmem_alloc_init(unsigned long dpage_shift);
 void dmem_alloc_uinit(void);
 
+phys_addr_t
+dmem_alloc_pages_nodemask(int nid, nodemask_t *nodemask, unsigned int try_max,
+			  unsigned int *result_nr);
+
+phys_addr_t
+dmem_alloc_pages_vma(struct vm_area_struct *vma, unsigned long addr,
+		     unsigned int try_max, unsigned int *result_nr);
+
+void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr);
+#define dmem_free_page(addr)	dmem_free_pages(addr, 1)
 #else
 static inline int dmem_reserve_init(void)
 {
-- 
1.8.3.1



  parent reply	other threads:[~2020-12-07 11:33 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-07 11:30 [RFC V2 00/37] Enhance memory utilization with DMEMFS yulei.kernel
2020-12-07 11:30 ` [RFC V2 01/37] fs: introduce dmemfs module yulei.kernel
2020-12-07 11:30 ` [RFC V2 02/37] mm: support direct memory reservation yulei.kernel
2020-12-07 11:30 ` [RFC V2 03/37] dmem: implement dmem memory management yulei.kernel
2020-12-07 11:30 ` [RFC V2 04/37] dmem: let pat recognize dmem yulei.kernel
2020-12-07 11:30 ` yulei.kernel [this message]
2020-12-07 11:30 ` [RFC V2 06/37] dmemfs: support truncating inode down yulei.kernel
2020-12-07 11:31 ` [RFC V2 07/37] dmem: trace core functions yulei.kernel
2020-12-07 11:31 ` [RFC V2 08/37] dmem: show some statistic in debugfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 09/37] dmemfs: support remote access yulei.kernel
2020-12-07 11:31 ` [RFC V2 10/37] dmemfs: introduce max_alloc_try_dpages parameter yulei.kernel
2020-12-07 11:31 ` [RFC V2 11/37] mm: export mempolicy interfaces to serve dmem allocator yulei.kernel
2020-12-07 11:31 ` [RFC V2 12/37] dmem: introduce mempolicy support yulei.kernel
2020-12-07 11:31 ` [RFC V2 13/37] mm, dmem: introduce PFN_DMEM and pfn_t_dmem yulei.kernel
2020-12-07 11:31 ` [RFC V2 14/37] mm, dmem: differentiate dmem-pmd and thp-pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 15/37] mm: add pmd_special() check for pmd_trans_huge_lock() yulei.kernel
2020-12-07 11:31 ` [RFC V2 16/37] dmemfs: introduce ->split() to dmemfs_vm_ops yulei.kernel
2020-12-07 11:31 ` [RFC V2 17/37] mm, dmemfs: support unmap_page_range() for dmemfs pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 18/37] mm: follow_pmd_mask() for dmem huge pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 19/37] mm: gup_huge_pmd() " yulei.kernel
2020-12-07 11:31 ` [RFC V2 20/37] mm: support dmem huge pmd for vmf_insert_pfn_pmd() yulei.kernel
2020-12-07 11:31 ` [RFC V2 21/37] mm: support dmem huge pmd for follow_pfn() yulei.kernel
2020-12-07 11:31 ` [RFC V2 22/37] kvm, x86: Distinguish dmemfs page from mmio page yulei.kernel
2020-12-07 11:31 ` [RFC V2 23/37] kvm, x86: introduce VM_DMEM for syscall support usage yulei.kernel
2020-12-07 11:31 ` [RFC V2 24/37] dmemfs: support hugepage for dmemfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 25/37] mm, x86, dmem: fix estimation of reserved page for vaddr_get_pfn() yulei.kernel
2020-12-07 11:31 ` [RFC V2 26/37] mm, dmem: introduce pud_special() for dmem huge pud support yulei.kernel
2020-12-07 11:31 ` [RFC V2 27/37] mm: add pud_special() check to support dmem huge pud yulei.kernel
2020-12-07 11:31 ` [RFC V2 28/37] mm, dmemfs: support huge_fault() for dmemfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 29/37] mm: add follow_pte_pud() to support huge pud look up yulei.kernel
2020-12-07 11:31 ` [RFC V2 30/37] dmem: introduce dmem_bitmap_alloc() and dmem_bitmap_free() yulei.kernel
2020-12-07 11:31 ` [RFC V2 31/37] dmem: introduce mce handler yulei.kernel
2020-12-07 11:31 ` [RFC V2 32/37] mm, dmemfs: register and handle the dmem mce yulei.kernel
2020-12-07 11:31 ` [RFC V2 33/37] kvm, x86: enable record_steal_time for dmem yulei.kernel
2020-12-07 11:31 ` [RFC V2 34/37] dmem: add dmem unit tests yulei.kernel
2020-12-07 11:31 ` [RFC V2 35/37] mm, dmem: introduce dregion->memmap for dmem yulei.kernel
2020-12-07 11:31 ` [RFC V2 36/37] vfio: support dmempage refcount for vfio yulei.kernel
2020-12-07 11:31 ` [RFC V2 37/37] Add documentation for dmemfs yulei.kernel
2020-12-24 18:27   ` Randy Dunlap
2020-12-07 12:02 ` [RFC V2 00/37] Enhance memory utilization with DMEMFS David Hildenbrand
2020-12-07 19:32   ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=556903717e3d0b0fc0b9583b709f4b34be2154cb.1607332046.git.yuleixzhang@tencent.com \
    --to=yulei.kernel@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=gloryxiao@tencent.com \
    --cc=joao.m.martins@oracle.com \
    --cc=kernellwp@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=lihaiwei.kernel@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=naoya.horiguchi@nec.com \
    --cc=pbonzini@redhat.com \
    --cc=rdunlap@infradead.org \
    --cc=sean.j.christopherson@intel.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=xiaoguangrong.eric@gmail.com \
    --cc=yuleixzhang@tencent.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).