linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Vivek Goyal <vgoyal@redhat.com>
To: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-nvdimm@lists.01.org
Cc: virtio-fs@redhat.com, vgoyal@redhat.com, miklos@szeredi.hu,
	stefanha@redhat.com, dgilbert@redhat.com,
	kbuild test robot <lkp@intel.com>,
	Liu Bo <bo.liu@linux.alibaba.com>
Subject: [PATCH 17/19] fuse: Add logic to free up a memory range
Date: Wed, 21 Aug 2019 13:57:18 -0400	[thread overview]
Message-ID: <20190821175720.25901-18-vgoyal@redhat.com> (raw)
In-Reply-To: <20190821175720.25901-1-vgoyal@redhat.com>

Add logic to free up a busy memory range. Freed memory range will be
returned to free pool. Add a worker which can be started to select
and free some busy memory ranges.

In certain cases (write path), process can steal one of its busy
dax ranges (inline reclaim) if free range is not available.

If free range is not available and nothing can't be stolen from same
inode, caller waits on a waitq for free range to become available.

For reclaiming a range, as of now we need to hold following locks in
specified order.

	down_write(&fi->i_mmap_sem);
	down_write(&fi->i_dmap_sem);


Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/fuse/file.c      | 488 +++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h    |  25 +++
 fs/fuse/inode.c     |   5 +
 fs/fuse/virtio_fs.c |  10 +
 4 files changed, 519 insertions(+), 9 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8c1777fb61f7..2ff7624d58c0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -25,6 +25,8 @@
 INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
                      START, LAST, static inline, fuse_dax_interval_tree);
 
+static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
+				struct inode *inode);
 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			  int opcode, struct fuse_open_out *outargp)
 {
@@ -177,6 +179,28 @@ static void fuse_link_write_file(struct file *file)
 	spin_unlock(&fi->lock);
 }
 
+static void
+__kick_dmap_free_worker(struct fuse_conn *fc, unsigned long delay_ms)
+{
+	unsigned long free_threshold;
+
+	/* If number of free ranges are below threshold, start reclaim */
+	free_threshold = max((fc->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD)/100,
+				(unsigned long)1);
+	if (fc->nr_free_ranges < free_threshold) {
+		pr_debug("fuse: Kicking dax memory reclaim worker. nr_free_ranges=0x%ld nr_total_ranges=%ld\n", fc->nr_free_ranges, fc->nr_ranges);
+		queue_delayed_work(system_long_wq, &fc->dax_free_work,
+				   msecs_to_jiffies(delay_ms));
+	}
+}
+
+static void kick_dmap_free_worker(struct fuse_conn *fc, unsigned long delay_ms)
+{
+	spin_lock(&fc->lock);
+	__kick_dmap_free_worker(fc, delay_ms);
+	spin_unlock(&fc->lock);
+}
+
 static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
 {
 	struct fuse_dax_mapping *dmap = NULL;
@@ -186,7 +210,7 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
 	/* TODO: Add logic to try to free up memory if wait is allowed */
 	if (fc->nr_free_ranges <= 0) {
 		spin_unlock(&fc->lock);
-		return NULL;
+		goto out_kick;
 	}
 
 	WARN_ON(list_empty(&fc->free_ranges));
@@ -197,6 +221,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
 	list_del_init(&dmap->list);
 	fc->nr_free_ranges--;
 	spin_unlock(&fc->lock);
+
+out_kick:
+	kick_dmap_free_worker(fc, 0);
 	return dmap;
 }
 
@@ -223,6 +250,8 @@ static void __dmap_add_to_free_pool(struct fuse_conn *fc,
 {
 	list_add_tail(&dmap->list, &fc->free_ranges);
 	fc->nr_free_ranges++;
+	/* TODO: Wake up only when needed */
+	wake_up(&fc->dax_range_waitq);
 }
 
 static void dmap_add_to_free_pool(struct fuse_conn *fc,
@@ -274,9 +303,15 @@ static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
 
 	dmap->writable = writable;
 	if (!upgrade) {
-		/* TODO: What locking is required here. For now,
-		 * using fc->lock
+		/*
+		 * We don't take a refernce on inode. inode is valid right now
+		 * and when inode is going away, cleanup logic should first
+		 * cleanup dmap entries.
+		 *
+		 * TODO: Do we need to ensure that we are holding inode lock
+		 * as well.
 		 */
+		dmap->inode = inode;
 		dmap->start = offset;
 		dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
 		/* Protected by fi->i_dmap_sem */
@@ -356,6 +391,7 @@ static void dmap_reinit_add_to_free_pool(struct fuse_conn *fc,
 		 "window_offset=0x%llx length=0x%llx\n", dmap->start,
 		 dmap->end, dmap->window_offset, dmap->length);
 	__dmap_remove_busy_list(fc, dmap);
+	dmap->inode = NULL;
 	dmap->start = dmap->end = 0;
 	__dmap_add_to_free_pool(fc, dmap);
 }
@@ -424,6 +460,21 @@ static void inode_reclaim_dmap_range(struct fuse_conn *fc, struct inode *inode,
 	spin_unlock(&fc->lock);
 }
 
+static int dmap_removemapping_one(struct inode *inode,
+				  struct fuse_dax_mapping *dmap)
+{
+	struct fuse_removemapping_one forget_one;
+	struct fuse_removemapping_in inarg;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.count = 1;
+	memset(&forget_one, 0, sizeof(forget_one));
+	forget_one.moffset = dmap->window_offset;
+	forget_one.len = dmap->length;
+
+	return fuse_send_removemapping(inode, &inarg, &forget_one);
+}
+
 /*
  * It is called from evict_inode() and by that time inode is going away. So
  * this function does not take any locks like fi->i_dmap_sem for traversing
@@ -1816,6 +1867,18 @@ static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
 		if (flags & IOMAP_FAULT)
 			iomap->length = ALIGN(len, PAGE_SIZE);
 		iomap->type = IOMAP_MAPPED;
+
+		/*
+		 * increace refcnt so that reclaim code knows this dmap is in
+		 * use. This assumes i_dmap_sem mutex is held either
+		 * shared/exclusive.
+		 */
+		refcount_inc(&dmap->refcnt);
+
+		/* iomap->private should be NULL */
+		WARN_ON_ONCE(iomap->private);
+		iomap->private = dmap;
+
 		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
 				" length 0x%llx\n", __func__, iomap->addr,
 				iomap->offset, iomap->length);
@@ -1838,8 +1901,23 @@ static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
 	int ret;
 	bool writable = flags & IOMAP_WRITE;
 
-	alloc_dmap = alloc_dax_mapping(fc);
-	if (!alloc_dmap)
+	/* Can't do reclaim in fault path yet due to lock ordering.
+	 * Read path takes shared inode lock and that's not sufficient
+	 * for inline range reclaim. Caller needs to drop lock, wait
+	 * and retry.
+	 */
+	if (flags & IOMAP_FAULT || !(flags & IOMAP_WRITE)) {
+		alloc_dmap = alloc_dax_mapping(fc);
+		if (!alloc_dmap)
+			return -ENOSPC;
+	} else {
+		alloc_dmap = alloc_dax_mapping_reclaim(fc, inode);
+		if (IS_ERR(alloc_dmap))
+			return PTR_ERR(alloc_dmap);
+	}
+
+	/* If we are here, we should have memory allocated */
+	if (WARN_ON(!alloc_dmap))
 		return -EBUSY;
 
 	/*
@@ -1892,14 +1970,25 @@ static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
 	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
 
 	/* We are holding either inode lock or i_mmap_sem, and that should
-	 * ensure that dmap can't reclaimed or truncated and it should still
-	 * be there in tree despite the fact we dropped and re-acquired the
-	 * lock.
+	 * ensure that dmap can't be truncated. We are holding a reference
+	 * on dmap and that should make sure it can't be reclaimed. So dmap
+	 * should still be there in tree despite the fact we dropped and
+	 * re-acquired the i_dmap_sem lock.
 	 */
 	ret = -EIO;
 	if (WARN_ON(!dmap))
 		goto out_err;
 
+	/* We took an extra reference on dmap to make sure its not reclaimd.
+	 * Now we hold i_dmap_sem lock and that reference is not needed
+	 * anymore. Drop it.
+	 */
+	if (refcount_dec_and_test(&dmap->refcnt)) {
+		/* refcount should not hit 0. This object only goes
+		 * away when fuse connection goes away */
+		WARN_ON_ONCE(1);
+	}
+
 	/* Maybe another thread already upgraded mapping while we were not
 	 * holding lock.
 	 */
@@ -1968,7 +2057,11 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 			 * two threads to be trying to this simultaneously
 			 * for same dmap. So drop shared lock and acquire
 			 * exclusive lock.
+			 *
+			 * Before dropping i_dmap_sem lock, take reference
+			 * on dmap so that its not freed by range reclaim.
 			 */
+			refcount_inc(&dmap->refcnt);
 			up_read(&fi->i_dmap_sem);
 			pr_debug("%s: Upgrading mapping at offset 0x%llx"
 				 " length 0x%llx\n", __func__, pos, length);
@@ -2004,6 +2097,16 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 			  ssize_t written, unsigned flags,
 			  struct iomap *iomap)
 {
+	struct fuse_dax_mapping *dmap = iomap->private;
+
+	if (dmap) {
+		if (refcount_dec_and_test(&dmap->refcnt)) {
+			/* refcount should not hit 0. This object only goes
+			 * away when fuse connection goes away */
+			WARN_ON_ONCE(1);
+		}
+	}
+
 	/* DAX writes beyond end-of-file aren't handled using iomap, so the
 	 * file size is unchanged and there is nothing to do here.
 	 */
@@ -2018,7 +2121,18 @@ static const struct iomap_ops fuse_iomap_ops = {
 static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_conn *fc = get_fuse_conn(inode);
 	ssize_t ret;
+	bool retry = false;
+
+retry:
+	if (retry && !(fc->nr_free_ranges > 0)) {
+		ret = -EINTR;
+		if (wait_event_killable_exclusive(fc->dax_range_waitq,
+						  (fc->nr_free_ranges > 0))) {
+			goto out;
+		}
+	}
 
 	if (iocb->ki_flags & IOCB_NOWAIT) {
 		if (!inode_trylock_shared(inode))
@@ -2030,8 +2144,19 @@ static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
 	inode_unlock_shared(inode);
 
+	/* If a dax range could not be allocated and it can't be reclaimed
+	 * inline, then drop inode lock and retry. Range reclaim logic
+	 * requires exclusive access to inode lock.
+	 *
+	 * TODO: What if -ENOSPC needs to be returned to user space. Fix it.
+	 */
+	if (ret == -ENOSPC) {
+		retry = true;
+		goto retry;
+	}
 	/* TODO file_accessed(iocb->f_filp) */
 
+out:
 	return ret;
 }
 
@@ -2810,10 +2935,21 @@ static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	pfn_t pfn;
+	int error = 0;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	bool retry = false;
 
 	if (write)
 		sb_start_pagefault(sb);
 
+retry:
+	if (retry && !(fc->nr_free_ranges > 0)) {
+		ret = -EINTR;
+		if (wait_event_killable_exclusive(fc->dax_range_waitq,
+					(fc->nr_free_ranges > 0)))
+			goto out;
+	}
+
 	/*
 	 * We need to serialize against not only truncate but also against
 	 * fuse dax memory range reclaim. While a range is being reclaimed,
@@ -2821,13 +2957,20 @@ static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 	 * to populate page cache or access memory we are trying to free.
 	 */
 	down_read(&get_fuse_inode(inode)->i_mmap_sem);
-	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &fuse_iomap_ops);
+	ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+	if ((ret & VM_FAULT_ERROR) && error == -ENOSPC) {
+		error = 0;
+		retry = true;
+		up_read(&get_fuse_inode(inode)->i_mmap_sem);
+		goto retry;
+	}
 
 	if (ret & VM_FAULT_NEEDDSYNC)
 		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 
 	up_read(&get_fuse_inode(inode)->i_mmap_sem);
 
+out:
 	if (write)
 		sb_end_pagefault(sb);
 
@@ -3979,3 +4122,330 @@ void fuse_init_file_inode(struct inode *inode)
 		inode->i_data.a_ops = &fuse_dax_file_aops;
 	}
 }
+
+static int dmap_writeback_invalidate(struct inode *inode,
+				     struct fuse_dax_mapping *dmap)
+{
+	int ret;
+
+	ret = filemap_fdatawrite_range(inode->i_mapping, dmap->start,
+				       dmap->end);
+	if (ret) {
+		printk("filemap_fdatawrite_range() failed. err=%d start=0x%llx,"
+			" end=0x%llx\n", ret, dmap->start, dmap->end);
+		return ret;
+	}
+
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    dmap->start >> PAGE_SHIFT,
+					    dmap->end >> PAGE_SHIFT);
+	if (ret)
+		printk("invalidate_inode_pages2_range() failed err=%d\n", ret);
+
+	return ret;
+}
+
+static int reclaim_one_dmap_locked(struct fuse_conn *fc, struct inode *inode,
+				   struct fuse_dax_mapping *dmap)
+{
+	int ret;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	/*
+	 * igrab() was done to make sure inode won't go under us, and this
+	 * further avoids the race with evict().
+	 */
+	ret = dmap_writeback_invalidate(inode, dmap);
+
+	/* TODO: What to do if above fails? For now,
+	 * leave the range in place.
+	 */
+	if (ret)
+		return ret;
+
+	/* Remove dax mapping from inode interval tree now */
+	fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
+	fi->nr_dmaps--;
+
+	ret = dmap_removemapping_one(inode, dmap);
+	if (ret) {
+		pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx\n",
+			dmap->window_offset, dmap->length);
+	}
+
+	return 0;
+}
+
+static void fuse_wait_dax_page(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+        up_write(&fi->i_mmap_sem);
+        schedule();
+        down_write(&fi->i_mmap_sem);
+}
+
+/* Should be called with fi->i_mmap_sem lock held exclusively */
+static int __fuse_break_dax_layouts(struct inode *inode, bool *retry,
+				    loff_t start, loff_t end)
+{
+	struct page *page;
+
+	page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+	if (!page)
+		return 0;
+
+	*retry = true;
+	return ___wait_var_event(&page->_refcount,
+			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+			0, 0, fuse_wait_dax_page(inode));
+}
+
+/* dmap_end == 0 leads to unmapping of whole file */
+static int fuse_break_dax_layouts(struct inode *inode, u64 dmap_start,
+				  u64 dmap_end)
+{
+	bool	retry;
+	int	ret;
+
+	do {
+		retry = false;
+		ret = __fuse_break_dax_layouts(inode, &retry, dmap_start,
+					       dmap_end);
+        } while (ret == 0 && retry);
+
+        return ret;
+}
+
+/* First first mapping in the tree and free it. */
+static struct fuse_dax_mapping *
+inode_reclaim_first_dmap_locked(struct fuse_conn *fc, struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+	int ret;
+
+	/* Find fuse dax mapping at file offset inode. */
+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, 0, -1);
+	if (!dmap)
+		return NULL;
+
+	ret = reclaim_one_dmap_locked(fc, inode, dmap);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	/* Clean up dmap. Do not add back to free list */
+	dmap_remove_busy_list(fc, dmap);
+	dmap->inode = NULL;
+	dmap->start = dmap->end = 0;
+
+	pr_debug("fuse: reclaimed memory range window_offset=0x%llx,"
+				" length=0x%llx\n", dmap->window_offset,
+				dmap->length);
+	return dmap;
+}
+
+/*
+ * First first mapping in the tree and free it and return it. Do not add
+ * it back to free pool.
+ *
+ * This is called with inode lock held.
+ */
+static struct fuse_dax_mapping *inode_reclaim_first_dmap(struct fuse_conn *fc,
+							 struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+	int ret;
+
+	down_write(&fi->i_mmap_sem);
+
+	/* Make sure there are references to inode pages using
+	 * get_user_pages()
+	 *
+	 * TODO: Only check for page range inside dmap (and not whole inode)
+	 */
+	ret = fuse_break_dax_layouts(inode, 0, 0);
+	if (ret) {
+		printk("virtio_fs: fuse_break_dax_layouts() failed. err=%d\n",
+		       ret);
+		dmap = ERR_PTR(ret);
+		goto out_mmap_sem;
+	}
+	down_write(&fi->i_dmap_sem);
+	dmap = inode_reclaim_first_dmap_locked(fc, inode);
+	up_write(&fi->i_dmap_sem);
+out_mmap_sem:
+	up_write(&fi->i_mmap_sem);
+	return dmap;
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
+					struct inode *inode)
+{
+	struct fuse_dax_mapping *dmap;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	while(1) {
+		dmap = alloc_dax_mapping(fc);
+		if (dmap)
+			return dmap;
+
+		if (fi->nr_dmaps) {
+			dmap = inode_reclaim_first_dmap(fc, inode);
+			if (dmap)
+				return dmap;
+		}
+		/*
+		 * There are no mappings which can be reclaimed.
+		 * Wait for one.
+		 */
+		if (!(fc->nr_free_ranges > 0)) {
+			if (wait_event_killable_exclusive(fc->dax_range_waitq,
+					(fc->nr_free_ranges > 0)))
+				return ERR_PTR(-EINTR);
+		}
+	}
+}
+
+static int lookup_and_reclaim_dmap_locked(struct fuse_conn *fc,
+					  struct inode *inode, u64 dmap_start)
+{
+	int ret;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+
+	/* Find fuse dax mapping at file offset inode. */
+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, dmap_start,
+						 dmap_start);
+
+	/* Range already got cleaned up by somebody else */
+	if (!dmap)
+		return 0;
+
+	/* still in use. */
+	if (refcount_read(&dmap->refcnt) > 1)
+		return 0;
+
+	ret = reclaim_one_dmap_locked(fc, inode, dmap);
+	if (ret < 0)
+		return ret;
+
+	/* Cleanup dmap entry and add back to free list */
+	spin_lock(&fc->lock);
+	dmap_reinit_add_to_free_pool(fc, dmap);
+	spin_unlock(&fc->lock);
+	return ret;
+}
+
+/*
+ * Free a range of memory.
+ * Locking.
+ * 1. Take fuse_inode->i_mmap_sem to block dax faults.
+ * 2. Take fuse_inode->i_dmap_sem to protect interval tree and also to make
+ *    sure read/write can not reuse a dmap which we might be freeing.
+ */
+static int lookup_and_reclaim_dmap(struct fuse_conn *fc, struct inode *inode,
+				   u64 dmap_start, u64 dmap_end)
+{
+	int ret;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	down_write(&fi->i_mmap_sem);
+	ret = fuse_break_dax_layouts(inode, dmap_start, dmap_end);
+	if (ret) {
+		printk("virtio_fs: fuse_break_dax_layouts() failed. err=%d\n",
+		       ret);
+		goto out_mmap_sem;
+	}
+
+	down_write(&fi->i_dmap_sem);
+	ret = lookup_and_reclaim_dmap_locked(fc, inode, dmap_start);
+	up_write(&fi->i_dmap_sem);
+out_mmap_sem:
+	up_write(&fi->i_mmap_sem);
+	return ret;
+}
+
+static int try_to_free_dmap_chunks(struct fuse_conn *fc,
+				   unsigned long nr_to_free)
+{
+	struct fuse_dax_mapping *dmap, *pos, *temp;
+	int ret, nr_freed = 0;
+	u64 dmap_start = 0, window_offset = 0, dmap_end = 0;
+	struct inode *inode = NULL;
+
+	/* Pick first busy range and free it for now*/
+	while(1) {
+		if (nr_freed >= nr_to_free)
+			break;
+
+		dmap = NULL;
+		spin_lock(&fc->lock);
+
+		if (!fc->nr_busy_ranges) {
+			spin_unlock(&fc->lock);
+			return 0;
+		}
+
+		list_for_each_entry_safe(pos, temp, &fc->busy_ranges,
+						busy_list) {
+			/* skip this range if it's in use. */
+			if (refcount_read(&pos->refcnt) > 1)
+				continue;
+
+			inode = igrab(pos->inode);
+			/*
+			 * This inode is going away. That will free
+			 * up all the ranges anyway, continue to
+			 * next range.
+			 */
+			if (!inode)
+				continue;
+			/*
+			 * Take this element off list and add it tail. If
+			 * inode lock can't be obtained, this will help with
+			 * selecting new element
+			 */
+			dmap = pos;
+			list_move_tail(&dmap->busy_list, &fc->busy_ranges);
+			dmap_start = dmap->start;
+			dmap_end = dmap->end;
+			window_offset = dmap->window_offset;
+			break;
+		}
+		spin_unlock(&fc->lock);
+		if (!dmap)
+			return 0;
+
+		ret = lookup_and_reclaim_dmap(fc, inode, dmap_start, dmap_end);
+		iput(inode);
+		if (ret) {
+			printk("%s(window_offset=0x%llx) failed. err=%d\n",
+				__func__, window_offset, ret);
+			return ret;
+		}
+		nr_freed++;
+	}
+	return 0;
+}
+
+/* TODO: This probably should go in inode.c */
+void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+	int ret;
+	struct fuse_conn *fc = container_of(work, struct fuse_conn,
+						dax_free_work.work);
+	pr_debug("fuse: Worker to free memory called. nr_free_ranges=%lu"
+		 " nr_busy_ranges=%lu\n", fc->nr_free_ranges,
+		 fc->nr_busy_ranges);
+
+	ret = try_to_free_dmap_chunks(fc, FUSE_DAX_RECLAIM_CHUNK);
+	if (ret) {
+		pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+			 ret);
+	}
+
+	/* If number of free ranges are still below threhold, requeue */
+	kick_dmap_free_worker(fc, 1);
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 070a5c2b6498..5f2f348536aa 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -57,6 +57,16 @@
 #define FUSE_DAX_MEM_RANGE_SZ	(2*1024*1024)
 #define FUSE_DAX_MEM_RANGE_PAGES	(FUSE_DAX_MEM_RANGE_SZ/PAGE_SIZE)
 
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK		(10)
+
+/*
+ * Dax memory reclaim threshold in percetage of total ranges. When free
+ * number of free ranges drops below this threshold, reclaim can trigger
+ * Default is 20%
+ * */
+#define FUSE_DAX_RECLAIM_THRESHOLD	(20)
+
 /** List of active connections */
 extern struct list_head fuse_conn_list;
 
@@ -109,6 +119,9 @@ struct fuse_forget_link {
 
 /** Translation information for file offsets to DAX window offsets */
 struct fuse_dax_mapping {
+	/* Pointer to inode where this memory range is mapped */
+	struct inode *inode;
+
 	/* Will connect in fc->free_ranges to keep track of free memory */
 	struct list_head list;
 
@@ -131,6 +144,9 @@ struct fuse_dax_mapping {
 
 	/* Is this mapping read-only or read-write */
 	bool writable;
+
+	/* reference count when the mapping is used by dax iomap. */
+	refcount_t refcnt;
 };
 
 /** FUSE inode */
@@ -895,12 +911,20 @@ struct fuse_conn {
 	unsigned long nr_busy_ranges;
 	struct list_head busy_ranges;
 
+	/* Worker to free up memory ranges */
+	struct delayed_work dax_free_work;
+
+	/* Wait queue for a dax range to become free */
+	wait_queue_head_t dax_range_waitq;
+
 	/*
 	 * DAX Window Free Ranges. TODO: This might not be best place to store
 	 * this free list
 	 */
 	long nr_free_ranges;
 	struct list_head free_ranges;
+
+	unsigned long nr_ranges;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -1279,6 +1303,7 @@ unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args);
  */
 u64 fuse_get_unique(struct fuse_iqueue *fiq);
 void fuse_free_conn(struct fuse_conn *fc);
+void fuse_dax_free_mem_worker(struct work_struct *work);
 void fuse_cleanup_inode_mappings(struct inode *inode);
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b80e76a307f3..4871933f4557 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -661,11 +661,13 @@ static int fuse_dax_mem_range_init(struct fuse_conn *fc,
 		range->window_offset = i * FUSE_DAX_MEM_RANGE_SZ;
 		range->length = FUSE_DAX_MEM_RANGE_SZ;
 		INIT_LIST_HEAD(&range->busy_list);
+		refcount_set(&range->refcnt, 1);
 		list_add_tail(&range->list, &mem_ranges);
 	}
 
 	list_replace_init(&mem_ranges, &fc->free_ranges);
 	fc->nr_free_ranges = nr_ranges;
+	fc->nr_ranges = nr_ranges;
 	return 0;
 out_err:
 	/* Free All allocated elements */
@@ -692,6 +694,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
 	atomic_set(&fc->dev_count, 1);
 	init_waitqueue_head(&fc->blocked_waitq);
 	init_waitqueue_head(&fc->reserved_req_waitq);
+	init_waitqueue_head(&fc->dax_range_waitq);
 	fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
@@ -712,6 +715,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
 	fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
 	INIT_LIST_HEAD(&fc->free_ranges);
 	INIT_LIST_HEAD(&fc->busy_ranges);
+	INIT_DELAYED_WORK(&fc->dax_free_work, fuse_dax_free_mem_worker);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
 
@@ -720,6 +724,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 	if (refcount_dec_and_test(&fc->count)) {
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
+		flush_delayed_work(&fc->dax_free_work);
 		if (fc->dax_dev)
 			fuse_free_dax_mem_ranges(&fc->free_ranges);
 		put_pid_ns(fc->pid_ns);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 9198c2b84677..72b97bcd8e44 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -491,6 +491,15 @@ static void virtio_fs_cleanup_dax(void *data)
 	put_dax(fs->dax_dev);
 }
 
+static void virtio_fs_pagemap_page_free(struct page *page)
+{
+	wake_up_var(&page->_refcount);
+}
+
+static const struct dev_pagemap_ops virtio_fs_pagemap_ops = {
+	.page_free	= virtio_fs_pagemap_page_free,
+};
+
 static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 {
 	struct virtio_shm_region cache_reg;
@@ -517,6 +526,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 		return -ENOMEM;
 
 	pgmap->type = MEMORY_DEVICE_FS_DAX;
+	pgmap->ops = &virtio_fs_pagemap_ops;
 
 	/* Ideally we would directly use the PCI BAR resource but
 	 * devm_memremap_pages() wants its own copy in pgmap.  So
-- 
2.20.1


  parent reply	other threads:[~2019-08-21 17:58 UTC|newest]

Thread overview: 77+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-21 17:57 [PATCH v3 00/19][RFC] virtio-fs: Enable DAX support Vivek Goyal
2019-08-21 17:57 ` [PATCH 01/19] dax: remove block device dependencies Vivek Goyal
2019-08-26 11:51   ` Christoph Hellwig
2019-08-27 16:38     ` Vivek Goyal
2019-08-28  6:58       ` Christoph Hellwig
2019-08-28 17:58         ` Vivek Goyal
2019-08-28 22:53           ` Dave Chinner
2019-08-29  0:04             ` Dan Williams
2019-08-29  9:32               ` Christoph Hellwig
2019-12-16 18:10               ` Vivek Goyal
2020-01-07 12:51                 ` Christoph Hellwig
2020-01-07 14:22                   ` Dan Williams
2020-01-07 17:07                     ` Darrick J. Wong
2020-01-07 17:29                       ` Dan Williams
2020-01-07 18:01                         ` Vivek Goyal
2020-01-07 18:07                           ` Dan Williams
2020-01-07 18:33                             ` Vivek Goyal
2020-01-07 18:49                               ` Dan Williams
2020-01-07 19:02                                 ` Darrick J. Wong
2020-01-07 19:46                                   ` Dan Williams
2020-01-07 23:38                                     ` Dan Williams
2020-01-09 11:24                                 ` Jan Kara
2020-01-09 20:03                                   ` Dan Williams
2020-01-10 12:36                                     ` Christoph Hellwig
2020-01-14 20:31                                     ` Vivek Goyal
2020-01-14 20:39                                       ` Dan Williams
2020-01-14 21:28                                         ` Vivek Goyal
2020-01-14 22:23                                           ` Dan Williams
2020-01-15 19:56                                             ` Vivek Goyal
2020-01-15 20:17                                               ` Dan Williams
2020-01-15 21:08                                                 ` Jeff Moyer
2020-01-16 18:09                                                   ` Dan Williams
2020-01-16 18:39                                                     ` Vivek Goyal
2020-01-16 19:09                                                       ` Dan Williams
2020-01-16 19:23                                                         ` Vivek Goyal
2020-02-11 17:33                                                     ` Vivek Goyal
2020-01-15  9:03                                           ` Jan Kara
2019-08-21 17:57 ` [PATCH 02/19] dax: Pass dax_dev to dax_writeback_mapping_range() Vivek Goyal
2019-08-26 11:53   ` Christoph Hellwig
2019-08-26 20:33     ` Vivek Goyal
2019-08-26 20:58       ` Vivek Goyal
2019-08-26 21:33         ` Dan Williams
2019-08-28  6:58         ` Christoph Hellwig
2020-01-03 14:12         ` Vivek Goyal
2020-01-03 18:12           ` Dan Williams
2020-01-03 18:18             ` Dan Williams
2020-01-03 18:33               ` Vivek Goyal
2020-01-03 19:30                 ` Dan Williams
2020-01-03 18:43               ` Vivek Goyal
2019-08-27 13:45       ` Jan Kara
2019-08-21 17:57 ` [PATCH 03/19] virtio: Add get_shm_region method Vivek Goyal
2019-08-21 17:57 ` [PATCH 04/19] virtio: Implement get_shm_region for PCI transport Vivek Goyal
2019-08-26  1:43   ` [Virtio-fs] " piaojun
2019-08-26 13:06     ` Vivek Goyal
2019-08-27  9:41       ` piaojun
2019-08-27  8:34   ` Cornelia Huck
2019-08-27  8:46     ` Cornelia Huck
2019-08-27 11:53     ` Vivek Goyal
2019-08-21 17:57 ` [PATCH 05/19] virtio: Implement get_shm_region for MMIO transport Vivek Goyal
2019-08-27  8:39   ` Cornelia Huck
2019-08-27 11:54     ` Vivek Goyal
2019-08-21 17:57 ` [PATCH 06/19] fuse, dax: add fuse_conn->dax_dev field Vivek Goyal
2019-08-21 17:57 ` [PATCH 07/19] virtio_fs, dax: Set up virtio_fs dax_device Vivek Goyal
2019-08-21 17:57 ` [PATCH 08/19] fuse: Keep a list of free dax memory ranges Vivek Goyal
2019-08-21 17:57 ` [PATCH 09/19] fuse: implement FUSE_INIT map_alignment field Vivek Goyal
2019-08-21 17:57 ` [PATCH 10/19] fuse: Introduce setupmapping/removemapping commands Vivek Goyal
2019-08-21 17:57 ` [PATCH 11/19] fuse, dax: Implement dax read/write operations Vivek Goyal
2019-08-21 19:49   ` Liu Bo
2019-08-22 12:59     ` Vivek Goyal
2019-08-21 17:57 ` [PATCH 12/19] fuse, dax: add DAX mmap support Vivek Goyal
2019-08-21 17:57 ` [PATCH 13/19] fuse: Define dax address space operations Vivek Goyal
2019-08-21 17:57 ` [PATCH 14/19] fuse, dax: Take ->i_mmap_sem lock during dax page fault Vivek Goyal
2019-08-21 17:57 ` [PATCH 15/19] fuse: Maintain a list of busy elements Vivek Goyal
2019-08-21 17:57 ` [PATCH 16/19] dax: Create a range version of dax_layout_busy_page() Vivek Goyal
2019-08-21 17:57 ` Vivek Goyal [this message]
2019-08-21 17:57 ` [PATCH 18/19] fuse: Release file in process context Vivek Goyal
2019-08-21 17:57 ` [PATCH 19/19] fuse: Take inode lock for dax inode truncation Vivek Goyal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190821175720.25901-18-vgoyal@redhat.com \
    --to=vgoyal@redhat.com \
    --cc=bo.liu@linux.alibaba.com \
    --cc=dgilbert@redhat.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=lkp@intel.com \
    --cc=miklos@szeredi.hu \
    --cc=stefanha@redhat.com \
    --cc=virtio-fs@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).