From: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
To: <linux-kernel@vger.kernel.org>, <linux-xfs@vger.kernel.org>,
<linux-nvdimm@lists.01.org>
Cc: <linux-mm@kvack.org>, <linux-fsdevel@vger.kernel.org>,
<darrick.wong@oracle.com>, <dan.j.williams@intel.com>,
<david@fromorbit.com>, <hch@lst.de>, <rgoldwyn@suse.de>,
<qi.fuli@fujitsu.com>, <y-goto@fujitsu.com>
Subject: [RFC PATCH 1/8] fs/dax: Introduce dax-rmap btree for reflink
Date: Mon, 27 Apr 2020 16:47:43 +0800 [thread overview]
Message-ID: <20200427084750.136031-2-ruansy.fnst@cn.fujitsu.com> (raw)
In-Reply-To: <20200427084750.136031-1-ruansy.fnst@cn.fujitsu.com>
Normally, when a page fault is taken on a mmapped file, the file's
(->mapping, ->index) is associated with the dax entry (which represents
one page or a group of pages) to facilitate reverse mapping lookups.
With reflink, however, a dax entry may be shared by multiple files or
offsets. To establish the reverse mapping relationship in this case,
introduce an rb-tree that tracks the multiple files and offsets.
The root of the rb-tree is stored in page->private, since I have not found
it being used in fsdax. The rb-tree is created, and the
(->mapping, ->index) tuple inserted into it, the second time a dax entry
is associated, which means the dax entry is shared. The tuple is deleted
from the rb-tree when the entry is disassociated.
Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
fs/dax.c | 153 ++++++++++++++++++++++++++++++++++++++++----
include/linux/dax.h | 6 ++
2 files changed, 147 insertions(+), 12 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 11b16729b86f..2f996c566103 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,6 +25,7 @@
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
+#include <linux/rbtree.h>
#include <asm/pgalloc.h>
#define CREATE_TRACE_POINTS
@@ -310,6 +311,120 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
+/* Slab cache for the struct shared_file nodes linked into a dax-rmap tree. */
+static struct kmem_cache *dax_rmap_node_cachep;
+/* Slab cache for the struct rb_root_cached roots kept in page->private. */
+static struct kmem_cache *dax_rmap_root_cachep;
+
+/*
+ * Set up the two slab caches used by the dax-rmap tree: one for tree roots
+ * (struct rb_root_cached) and one for tree nodes (struct shared_file).
+ * Both are created with SLAB_PANIC, so no error path is needed here.
+ *
+ * Registered through fs_initcall(); always returns 0.
+ */
+static int __init init_dax_rmap_cache(void)
+{
+	dax_rmap_root_cachep = KMEM_CACHE(rb_root_cached,
+					  SLAB_PANIC | SLAB_ACCOUNT);
+	dax_rmap_node_cachep = KMEM_CACHE(shared_file,
+					  SLAB_PANIC | SLAB_ACCOUNT);
+
+	return 0;
+}
+fs_initcall(init_dax_rmap_cache);
+
+/*
+ * Allocate an empty dax-rmap tree root from dax_rmap_root_cachep.
+ *
+ * Returns the zero-initialised root, or NULL if the allocation fails.
+ * The caller owns the root and must check for NULL before using it.
+ *
+ * NOTE(review): the allocation uses GFP_KERNEL and may therefore sleep;
+ * confirm that every caller runs in a context where sleeping is allowed.
+ */
+struct rb_root_cached *dax_create_rbroot(void)
+{
+	/*
+	 * kmem_cache_zalloc() zeroes the object only when the allocation
+	 * succeeded, so a failed allocation no longer ends in memset(NULL).
+	 */
+	return kmem_cache_zalloc(dax_rmap_root_cachep, GFP_KERNEL);
+}
+
+/*
+ * Link a new (mapping, index) node into @root.
+ *
+ * Returns 0 on success, -EEXIST if the tuple is already in the tree, or
+ * -ENOMEM if the node could not be allocated.
+ */
+static int dax_rmap_link_node(struct rb_root_cached *root,
+		struct address_space *mapping, pgoff_t index)
+{
+	struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
+	struct shared_file *p;
+	bool leftmost = true;
+
+	/* Figure out where to insert new node */
+	while (*new) {
+		struct shared_file *this = container_of(*new, struct shared_file, node);
+		/*
+		 * Keys are ordered by mapping pointer first, then by index.
+		 * This must stay identical to the walk in dax_rmap_search().
+		 * NOTE(review): the subtraction can yield the wrong sign if
+		 * the two values are further apart than LONG_MAX.
+		 */
+		long result = (long)mapping - (long)this->mapping;
+
+		if (result == 0)
+			result = (long)index - (long)this->index;
+		parent = *new;
+		if (result < 0) {
+			new = &((*new)->rb_left);
+		} else if (result > 0) {
+			new = &((*new)->rb_right);
+			/* Went right at least once: cannot be the leftmost node. */
+			leftmost = false;
+		} else {
+			return -EEXIST;
+		}
+	}
+
+	/*
+	 * NOTE(review): GFP_KERNEL may sleep; confirm that every caller runs
+	 * in a context where sleeping is allowed.
+	 */
+	p = kmem_cache_alloc(dax_rmap_node_cachep, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	p->mapping = mapping;
+	p->index = index;
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&p->node, parent, new);
+	rb_insert_color_cached(&p->node, root, leftmost);
+
+	return 0;
+}
+
+/*
+ * Record that @page is also mapped at (@mapping, @index).
+ *
+ * If @page has no dax-rmap tree yet, one is created and seeded with the
+ * tuple currently held in page->mapping / page->index before the new tuple
+ * is added.  The root is stored in page->private only after it has been
+ * seeded, so a half-built tree is never visible through the page.
+ *
+ * Returns true if the tuple was inserted, false if it was already present
+ * or if a memory allocation failed.
+ */
+static bool dax_rmap_insert(struct page *page, struct address_space *mapping,
+		pgoff_t index)
+{
+	struct rb_root_cached *root = (struct rb_root_cached *)page_private(page);
+
+	if (!root) {
+		root = dax_create_rbroot();
+		if (!root)
+			return false;
+
+		/* The tree is empty here, so a failure can only be -ENOMEM. */
+		if (dax_rmap_link_node(root, page->mapping, page->index)) {
+			kmem_cache_free(dax_rmap_root_cachep, root);
+			return false;
+		}
+		set_page_private(page, (unsigned long)root);
+	}
+
+	return dax_rmap_link_node(root, mapping, index) == 0;
+}
+
+/*
+ * Look up the (@mapping, @index) tuple in the dax-rmap tree attached to
+ * @page through page->private.
+ *
+ * Returns the matching node, or NULL if the page has no tree or the tuple
+ * is not in it.
+ */
+static struct shared_file *dax_rmap_search(struct page *page,
+		struct address_space *mapping, pgoff_t index)
+{
+	struct rb_root_cached *root = (struct rb_root_cached *)page_private(page);
+	struct rb_node *node;
+
+	/* page->private is 0 when no tree is attached: nothing to search. */
+	if (!root)
+		return NULL;
+
+	node = root->rb_root.rb_node;
+	while (node) {
+		struct shared_file *this = container_of(node, struct shared_file, node);
+		/*
+		 * Same key order as dax_rmap_insert(): mapping pointer first,
+		 * then index.
+		 */
+		long result = (long)mapping - (long)this->mapping;
+
+		if (result == 0)
+			result = (long)index - (long)this->index;
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return this;
+	}
+
+	return NULL;
+}
+
+/*
+ * Drop the (@mapping, @index) association from @page.
+ *
+ * Without a dax-rmap tree the page holds a single association, so
+ * page->mapping and page->index are simply cleared.  Otherwise the tuple is
+ * removed from the tree; when the tree becomes empty it is freed and the
+ * page is reset, and when the removed tuple was the one cached in
+ * page->mapping / page->index the page is re-pointed at the first remaining
+ * tuple.
+ */
+static void dax_rmap_delete(struct page *page, struct address_space *mapping,
+		pgoff_t index)
+{
+	struct rb_root_cached *root = (struct rb_root_cached *)page_private(page);
+	struct shared_file *this;
+
+	if (!root) {
+		page->mapping = NULL;
+		page->index = 0;
+		return;
+	}
+
+	this = dax_rmap_search(page, mapping, index);
+	/*
+	 * The tuple may be absent from the tree; erasing or freeing a
+	 * NULL-derived node would corrupt memory, so warn and leave the tree
+	 * untouched instead.
+	 */
+	if (WARN_ON_ONCE(!this))
+		return;
+
+	rb_erase_cached(&this->node, root);
+	kmem_cache_free(dax_rmap_node_cachep, this);
+
+	if (RB_EMPTY_ROOT(&root->rb_root)) {
+		/* Last association gone: tear the tree down and reset the page. */
+		kmem_cache_free(dax_rmap_root_cachep, root);
+		set_page_private(page, 0);
+		page->mapping = NULL;
+		page->index = 0;
+		return;
+	}
+
+	if (page->mapping == mapping && page->index == index) {
+		/* The tree is not empty here, so rb_first_cached() is non-NULL. */
+		this = container_of(rb_first_cached(root), struct shared_file, node);
+		page->mapping = this->mapping;
+		page->index = this->index;
+	}
+}
+
static unsigned long dax_end_pfn(void *entry)
{
return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
@@ -341,16 +456,20 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
+ if (!page->mapping) {
+ page->mapping = mapping;
+ page->index = index + i++;
+ } else {
+ dax_rmap_insert(page, mapping, index + i++);
+ }
}
}
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
- bool trunc)
+ pgoff_t index, bool trunc)
{
unsigned long pfn;
+ int i = 0;
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;
@@ -359,9 +478,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
- page->mapping = NULL;
- page->index = 0;
+ WARN_ON_ONCE(!page->mapping);
+ dax_rmap_delete(page, mapping, index + i++);
+ }
+}
+
+/*
+ * Reduce mapping->nrexceptional for @entry, which is being removed.
+ *
+ * Empty, zero and PMD entries count as one; any other entry counts as
+ * dax_entry_size(entry) expressed in pages.
+ */
+static void __dax_decrease_nrexceptional(void *entry,
+		struct address_space *mapping)
+{
+	unsigned long nr = 1;
+
+	if (!dax_is_empty_entry(entry) && !dax_is_zero_entry(entry) &&
+	    !dax_is_pmd_entry(entry))
+		nr = PHYS_PFN(dax_entry_size(entry));
+
+	mapping->nrexceptional -= nr;
+}
@@ -522,10 +651,10 @@ static void *grab_mapping_entry(struct xa_state *xas,
xas_lock_irq(xas);
}
- dax_disassociate_entry(entry, mapping, false);
+ dax_disassociate_entry(entry, mapping, index, false);
xas_store(xas, NULL); /* undo the PMD join */
dax_wake_entry(xas, entry, true);
- mapping->nrexceptional--;
+ __dax_decrease_nrexceptional(entry, mapping);
entry = NULL;
xas_set(xas, index);
}
@@ -642,9 +771,9 @@ static int __dax_invalidate_entry(struct address_space *mapping,
(xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
goto out;
- dax_disassociate_entry(entry, mapping, trunc);
+ dax_disassociate_entry(entry, mapping, index, trunc);
xas_store(&xas, NULL);
- mapping->nrexceptional--;
+ __dax_decrease_nrexceptional(entry, mapping);
ret = 1;
out:
put_unlocked_entry(&xas, entry);
@@ -737,7 +866,7 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
- dax_disassociate_entry(entry, mapping, false);
+ dax_disassociate_entry(entry, mapping, xas->xa_index, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
/*
* Only swap our new entry into the page cache if the current
diff --git a/include/linux/dax.h b/include/linux/dax.h
index d7af5d243f24..1e2e81c701b6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -39,6 +39,12 @@ struct dax_operations {
int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
};
+/**
+ * struct shared_file - one (mapping, index) association of a shared dax page
+ * @mapping: address space that maps the page; first part of the tree key
+ * @index: page offset within @mapping; second part of the tree key
+ * @node: link into the per-page rb-tree whose root is kept in page->private
+ */
+struct shared_file {
+	struct address_space *mapping;
+	pgoff_t index;
+	struct rb_node node;
+};
+
extern struct attribute_group dax_attribute_group;
#if IS_ENABLED(CONFIG_DAX)
--
2.26.2
next prev parent reply other threads:[~2020-04-27 8:48 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-04-27 8:47 [RFC PATCH 0/8] dax: Add a dax-rmap tree to support reflink Shiyang Ruan
2020-04-27 8:47 ` Shiyang Ruan [this message]
2020-04-27 8:47 ` [RFC PATCH 2/8] mm: add dax-rmap for memory-failure and rmap Shiyang Ruan
2020-04-27 8:47 ` [RFC PATCH 3/8] fs/dax: Introduce dax_copy_edges() for COW Shiyang Ruan
2020-04-27 8:47 ` [RFC PATCH 4/8] fs/dax: copy data before write Shiyang Ruan
2020-04-27 8:47 ` [RFC PATCH 5/8] fs/dax: replace mmap entry in case of CoW Shiyang Ruan
2020-04-27 8:47 ` [RFC PATCH 6/8] fs/dax: dedup file range to use a compare function Shiyang Ruan
2020-04-27 8:47 ` [RFC PATCH 7/8] fs/xfs: handle CoW for fsdax write() path Shiyang Ruan
2020-04-27 8:47 ` [RFC PATCH 8/8] fs/xfs: support dedupe for fsdax Shiyang Ruan
2020-04-27 12:28 ` [RFC PATCH 0/8] dax: Add a dax-rmap tree to support reflink Matthew Wilcox
2020-04-28 6:09 ` 回复: " Ruan, Shiyang
2020-04-28 6:43 ` Dave Chinner
2020-04-28 9:32 ` Ruan Shiyang
2020-04-28 11:16 ` Matthew Wilcox
2020-04-28 11:24 ` Dave Chinner
2020-04-28 15:37 ` Darrick J. Wong
2020-04-28 22:02 ` Dave Chinner
2020-06-04 7:37 ` Ruan Shiyang
2020-06-04 14:51 ` Darrick J. Wong
2020-06-05 1:30 ` Dave Chinner
2020-06-05 2:30 ` Ruan Shiyang
2020-06-05 2:11 ` Ruan Shiyang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200427084750.136031-2-ruansy.fnst@cn.fujitsu.com \
--to=ruansy.fnst@cn.fujitsu.com \
--cc=dan.j.williams@intel.com \
--cc=darrick.wong@oracle.com \
--cc=david@fromorbit.com \
--cc=hch@lst.de \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-nvdimm@lists.01.org \
--cc=linux-xfs@vger.kernel.org \
--cc=qi.fuli@fujitsu.com \
--cc=rgoldwyn@suse.de \
--cc=y-goto@fujitsu.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).