From: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com> To: <linux-kernel@vger.kernel.org>, <linux-xfs@vger.kernel.org>, <linux-nvdimm@lists.01.org> Cc: linux-mm@kvack.org, linux-fsdevel@vger.kernel.org, darrick.wong@oracle.com, david@fromorbit.com, hch@lst.de, rgoldwyn@suse.de, qi.fuli@fujitsu.com, y-goto@fujitsu.com Subject: [RFC PATCH 1/8] fs/dax: Introduce dax-rmap btree for reflink Date: Mon, 27 Apr 2020 16:47:43 +0800 [thread overview] Message-ID: <20200427084750.136031-2-ruansy.fnst@cn.fujitsu.com> (raw) In-Reply-To: <20200427084750.136031-1-ruansy.fnst@cn.fujitsu.com> Normally, when accessing a mmapped file and entering the page fault, the file's (->mapping, ->index) will be associated with a dax entry (representing one page or a couple of pages) to facilitate the reverse mapping search. But in the case of reflink, a dax entry may be shared by multiple files or offsets. In order to establish a reverse mapping relationship in this case, I introduce an rb-tree to track multiple files and offsets. The root of the rb-tree is stored in page->private, since I haven't found it being used in fsdax. We create the rb-tree and insert the (->mapping, ->index) tuple the second time a dax entry is associated, which means this dax entry is shared. And we delete this tuple from the rb-tree when disassociating.
Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com> --- fs/dax.c | 153 ++++++++++++++++++++++++++++++++++++++++---- include/linux/dax.h | 6 ++ 2 files changed, 147 insertions(+), 12 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 11b16729b86f..2f996c566103 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -25,6 +25,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <linux/rbtree.h> #include <asm/pgalloc.h> #define CREATE_TRACE_POINTS @@ -310,6 +311,120 @@ static unsigned long dax_entry_size(void *entry) return PAGE_SIZE; } +static struct kmem_cache *dax_rmap_node_cachep; +static struct kmem_cache *dax_rmap_root_cachep; + +static int __init init_dax_rmap_cache(void) +{ + dax_rmap_root_cachep = KMEM_CACHE(rb_root_cached, SLAB_PANIC|SLAB_ACCOUNT); + dax_rmap_node_cachep = KMEM_CACHE(shared_file, SLAB_PANIC|SLAB_ACCOUNT); + return 0; +} +fs_initcall(init_dax_rmap_cache); + +struct rb_root_cached *dax_create_rbroot(void) +{ + struct rb_root_cached *root = kmem_cache_alloc(dax_rmap_root_cachep, + GFP_KERNEL); + + memset(root, 0, sizeof(struct rb_root_cached)); + return root; +} + +static bool dax_rmap_insert(struct page *page, struct address_space *mapping, + pgoff_t index) +{ + struct rb_root_cached *root = (struct rb_root_cached *)page_private(page); + struct rb_node **new, *parent = NULL; + struct shared_file *p; + bool leftmost = true; + + if (!root) { + root = dax_create_rbroot(); + set_page_private(page, (unsigned long)root); + dax_rmap_insert(page, page->mapping, page->index); + } + new = &root->rb_root.rb_node; + /* Figure out where to insert new node */ + while (*new) { + struct shared_file *this = container_of(*new, struct shared_file, node); + long result = (long)mapping - (long)this->mapping; + + if (result == 0) + result = (long)index - (long)this->index; + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) { + new = &((*new)->rb_right); + leftmost = false; + } else + return false; 
+ } + p = kmem_cache_alloc(dax_rmap_node_cachep, GFP_KERNEL); + p->mapping = mapping; + p->index = index; + + /* Add new node and rebalance tree. */ + rb_link_node(&p->node, parent, new); + rb_insert_color_cached(&p->node, root, leftmost); + + return true; +} + +static struct shared_file *dax_rmap_search(struct page *page, + struct address_space *mapping, + pgoff_t index) +{ + struct rb_root_cached *root = (struct rb_root_cached *)page_private(page); + struct rb_node *node = root->rb_root.rb_node; + + while (node) { + struct shared_file *this = container_of(node, struct shared_file, node); + long result = (long)mapping - (long)this->mapping; + + if (result == 0) + result = (long)index - (long)this->index; + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return this; + } + return NULL; +} + +static void dax_rmap_delete(struct page *page, struct address_space *mapping, + pgoff_t index) +{ + struct rb_root_cached *root = (struct rb_root_cached *)page_private(page); + struct shared_file *this; + + if (!root) { + page->mapping = NULL; + page->index = 0; + return; + } + + this = dax_rmap_search(page, mapping, index); + rb_erase_cached(&this->node, root); + kmem_cache_free(dax_rmap_node_cachep, this); + + if (!RB_EMPTY_ROOT(&root->rb_root)) { + if (page->mapping == mapping && page->index == index) { + this = container_of(rb_first_cached(root), struct shared_file, node); + page->mapping = this->mapping; + page->index = this->index; + } + } else { + kmem_cache_free(dax_rmap_root_cachep, root); + set_page_private(page, 0); + page->mapping = NULL; + page->index = 0; + } +} + static unsigned long dax_end_pfn(void *entry) { return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; @@ -341,16 +456,20 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - WARN_ON_ONCE(page->mapping); - page->mapping = mapping; - page->index = 
index + i++; + if (!page->mapping) { + page->mapping = mapping; + page->index = index + i++; + } else { + dax_rmap_insert(page, mapping, index + i++); + } } } static void dax_disassociate_entry(void *entry, struct address_space *mapping, - bool trunc) + pgoff_t index, bool trunc) { unsigned long pfn; + int i = 0; if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; @@ -359,9 +478,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - WARN_ON_ONCE(page->mapping && page->mapping != mapping); - page->mapping = NULL; - page->index = 0; + WARN_ON_ONCE(!page->mapping); + dax_rmap_delete(page, mapping, index + i++); + } +} + +static void __dax_decrease_nrexceptional(void *entry, + struct address_space *mapping) +{ + if (dax_is_empty_entry(entry) || dax_is_zero_entry(entry) || + dax_is_pmd_entry(entry)) { + mapping->nrexceptional--; + } else { + mapping->nrexceptional -= PHYS_PFN(dax_entry_size(entry)); } } @@ -522,10 +651,10 @@ static void *grab_mapping_entry(struct xa_state *xas, xas_lock_irq(xas); } - dax_disassociate_entry(entry, mapping, false); + dax_disassociate_entry(entry, mapping, index, false); xas_store(xas, NULL); /* undo the PMD join */ dax_wake_entry(xas, entry, true); - mapping->nrexceptional--; + __dax_decrease_nrexceptional(entry, mapping); entry = NULL; xas_set(xas, index); } @@ -642,9 +771,9 @@ static int __dax_invalidate_entry(struct address_space *mapping, (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; - dax_disassociate_entry(entry, mapping, trunc); + dax_disassociate_entry(entry, mapping, index, trunc); xas_store(&xas, NULL); - mapping->nrexceptional--; + __dax_decrease_nrexceptional(entry, mapping); ret = 1; out: put_unlocked_entry(&xas, entry); @@ -737,7 +866,7 @@ static void *dax_insert_entry(struct xa_state *xas, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; 
- dax_disassociate_entry(entry, mapping, false); + dax_disassociate_entry(entry, mapping, xas->xa_index, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); /* * Only swap our new entry into the page cache if the current diff --git a/include/linux/dax.h b/include/linux/dax.h index d7af5d243f24..1e2e81c701b6 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -39,6 +39,12 @@ struct dax_operations { int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); }; +struct shared_file { + struct address_space *mapping; + pgoff_t index; + struct rb_node node; +}; + extern struct attribute_group dax_attribute_group; #if IS_ENABLED(CONFIG_DAX) -- 2.26.2 _______________________________________________ Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org To unsubscribe send an email to linux-nvdimm-leave@lists.01.org
WARNING: multiple messages have this Message-ID (diff)
From: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com> To: <linux-kernel@vger.kernel.org>, <linux-xfs@vger.kernel.org>, <linux-nvdimm@lists.01.org> Cc: <linux-mm@kvack.org>, <linux-fsdevel@vger.kernel.org>, <darrick.wong@oracle.com>, <dan.j.williams@intel.com>, <david@fromorbit.com>, <hch@lst.de>, <rgoldwyn@suse.de>, <qi.fuli@fujitsu.com>, <y-goto@fujitsu.com> Subject: [RFC PATCH 1/8] fs/dax: Introduce dax-rmap btree for reflink Date: Mon, 27 Apr 2020 16:47:43 +0800 [thread overview] Message-ID: <20200427084750.136031-2-ruansy.fnst@cn.fujitsu.com> (raw) In-Reply-To: <20200427084750.136031-1-ruansy.fnst@cn.fujitsu.com> Normally, when accessing a mmapped file and entering the page fault, the file's (->mapping, ->index) will be associated with a dax entry (representing one page or a couple of pages) to facilitate the reverse mapping search. But in the case of reflink, a dax entry may be shared by multiple files or offsets. In order to establish a reverse mapping relationship in this case, I introduce an rb-tree to track multiple files and offsets. The root of the rb-tree is stored in page->private, since I haven't found it being used in fsdax. We create the rb-tree and insert the (->mapping, ->index) tuple the second time a dax entry is associated, which means this dax entry is shared. And we delete this tuple from the rb-tree when disassociating.
Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com> --- fs/dax.c | 153 ++++++++++++++++++++++++++++++++++++++++---- include/linux/dax.h | 6 ++ 2 files changed, 147 insertions(+), 12 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 11b16729b86f..2f996c566103 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -25,6 +25,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <linux/rbtree.h> #include <asm/pgalloc.h> #define CREATE_TRACE_POINTS @@ -310,6 +311,120 @@ static unsigned long dax_entry_size(void *entry) return PAGE_SIZE; } +static struct kmem_cache *dax_rmap_node_cachep; +static struct kmem_cache *dax_rmap_root_cachep; + +static int __init init_dax_rmap_cache(void) +{ + dax_rmap_root_cachep = KMEM_CACHE(rb_root_cached, SLAB_PANIC|SLAB_ACCOUNT); + dax_rmap_node_cachep = KMEM_CACHE(shared_file, SLAB_PANIC|SLAB_ACCOUNT); + return 0; +} +fs_initcall(init_dax_rmap_cache); + +struct rb_root_cached *dax_create_rbroot(void) +{ + struct rb_root_cached *root = kmem_cache_alloc(dax_rmap_root_cachep, + GFP_KERNEL); + + memset(root, 0, sizeof(struct rb_root_cached)); + return root; +} + +static bool dax_rmap_insert(struct page *page, struct address_space *mapping, + pgoff_t index) +{ + struct rb_root_cached *root = (struct rb_root_cached *)page_private(page); + struct rb_node **new, *parent = NULL; + struct shared_file *p; + bool leftmost = true; + + if (!root) { + root = dax_create_rbroot(); + set_page_private(page, (unsigned long)root); + dax_rmap_insert(page, page->mapping, page->index); + } + new = &root->rb_root.rb_node; + /* Figure out where to insert new node */ + while (*new) { + struct shared_file *this = container_of(*new, struct shared_file, node); + long result = (long)mapping - (long)this->mapping; + + if (result == 0) + result = (long)index - (long)this->index; + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) { + new = &((*new)->rb_right); + leftmost = false; + } else + return false; 
+ } + p = kmem_cache_alloc(dax_rmap_node_cachep, GFP_KERNEL); + p->mapping = mapping; + p->index = index; + + /* Add new node and rebalance tree. */ + rb_link_node(&p->node, parent, new); + rb_insert_color_cached(&p->node, root, leftmost); + + return true; +} + +static struct shared_file *dax_rmap_search(struct page *page, + struct address_space *mapping, + pgoff_t index) +{ + struct rb_root_cached *root = (struct rb_root_cached *)page_private(page); + struct rb_node *node = root->rb_root.rb_node; + + while (node) { + struct shared_file *this = container_of(node, struct shared_file, node); + long result = (long)mapping - (long)this->mapping; + + if (result == 0) + result = (long)index - (long)this->index; + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return this; + } + return NULL; +} + +static void dax_rmap_delete(struct page *page, struct address_space *mapping, + pgoff_t index) +{ + struct rb_root_cached *root = (struct rb_root_cached *)page_private(page); + struct shared_file *this; + + if (!root) { + page->mapping = NULL; + page->index = 0; + return; + } + + this = dax_rmap_search(page, mapping, index); + rb_erase_cached(&this->node, root); + kmem_cache_free(dax_rmap_node_cachep, this); + + if (!RB_EMPTY_ROOT(&root->rb_root)) { + if (page->mapping == mapping && page->index == index) { + this = container_of(rb_first_cached(root), struct shared_file, node); + page->mapping = this->mapping; + page->index = this->index; + } + } else { + kmem_cache_free(dax_rmap_root_cachep, root); + set_page_private(page, 0); + page->mapping = NULL; + page->index = 0; + } +} + static unsigned long dax_end_pfn(void *entry) { return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; @@ -341,16 +456,20 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - WARN_ON_ONCE(page->mapping); - page->mapping = mapping; - page->index = 
index + i++; + if (!page->mapping) { + page->mapping = mapping; + page->index = index + i++; + } else { + dax_rmap_insert(page, mapping, index + i++); + } } } static void dax_disassociate_entry(void *entry, struct address_space *mapping, - bool trunc) + pgoff_t index, bool trunc) { unsigned long pfn; + int i = 0; if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; @@ -359,9 +478,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - WARN_ON_ONCE(page->mapping && page->mapping != mapping); - page->mapping = NULL; - page->index = 0; + WARN_ON_ONCE(!page->mapping); + dax_rmap_delete(page, mapping, index + i++); + } +} + +static void __dax_decrease_nrexceptional(void *entry, + struct address_space *mapping) +{ + if (dax_is_empty_entry(entry) || dax_is_zero_entry(entry) || + dax_is_pmd_entry(entry)) { + mapping->nrexceptional--; + } else { + mapping->nrexceptional -= PHYS_PFN(dax_entry_size(entry)); } } @@ -522,10 +651,10 @@ static void *grab_mapping_entry(struct xa_state *xas, xas_lock_irq(xas); } - dax_disassociate_entry(entry, mapping, false); + dax_disassociate_entry(entry, mapping, index, false); xas_store(xas, NULL); /* undo the PMD join */ dax_wake_entry(xas, entry, true); - mapping->nrexceptional--; + __dax_decrease_nrexceptional(entry, mapping); entry = NULL; xas_set(xas, index); } @@ -642,9 +771,9 @@ static int __dax_invalidate_entry(struct address_space *mapping, (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; - dax_disassociate_entry(entry, mapping, trunc); + dax_disassociate_entry(entry, mapping, index, trunc); xas_store(&xas, NULL); - mapping->nrexceptional--; + __dax_decrease_nrexceptional(entry, mapping); ret = 1; out: put_unlocked_entry(&xas, entry); @@ -737,7 +866,7 @@ static void *dax_insert_entry(struct xa_state *xas, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; 
- dax_disassociate_entry(entry, mapping, false); + dax_disassociate_entry(entry, mapping, xas->xa_index, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); /* * Only swap our new entry into the page cache if the current diff --git a/include/linux/dax.h b/include/linux/dax.h index d7af5d243f24..1e2e81c701b6 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -39,6 +39,12 @@ struct dax_operations { int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); }; +struct shared_file { + struct address_space *mapping; + pgoff_t index; + struct rb_node node; +}; + extern struct attribute_group dax_attribute_group; #if IS_ENABLED(CONFIG_DAX) -- 2.26.2
next prev parent reply other threads:[~2020-04-27 8:48 UTC|newest] Thread overview: 46+ messages / expand[flat|nested] mbox.gz Atom feed top 2020-04-27 8:47 [RFC PATCH 0/8] dax: Add a dax-rmap tree to support reflink Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan [this message] 2020-04-27 8:47 ` [RFC PATCH 1/8] fs/dax: Introduce dax-rmap btree for reflink Shiyang Ruan 2020-04-27 8:47 ` [RFC PATCH 2/8] mm: add dax-rmap for memory-failure and rmap Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-27 8:47 ` [RFC PATCH 3/8] fs/dax: Introduce dax_copy_edges() for COW Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-27 8:47 ` [RFC PATCH 4/8] fs/dax: copy data before write Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-27 8:47 ` [RFC PATCH 5/8] fs/dax: replace mmap entry in case of CoW Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-27 8:47 ` [RFC PATCH 6/8] fs/dax: dedup file range to use a compare function Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-28 1:13 ` kbuild test robot 2020-04-27 8:47 ` [RFC PATCH 7/8] fs/xfs: handle CoW for fsdax write() path Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-27 8:47 ` [RFC PATCH 8/8] fs/xfs: support dedupe for fsdax Shiyang Ruan 2020-04-27 8:47 ` Shiyang Ruan 2020-04-28 4:36 ` kbuild test robot 2020-04-27 12:28 ` [RFC PATCH 0/8] dax: Add a dax-rmap tree to support reflink Matthew Wilcox 2020-04-27 12:28 ` Matthew Wilcox 2020-04-28 6:09 ` 回复: " Ruan, Shiyang 2020-04-28 6:09 ` Ruan, Shiyang 2020-04-28 6:43 ` Dave Chinner 2020-04-28 6:43 ` Dave Chinner 2020-04-28 9:32 ` Ruan Shiyang 2020-04-28 9:32 ` Ruan Shiyang 2020-04-28 11:16 ` Matthew Wilcox 2020-04-28 11:16 ` Matthew Wilcox 2020-04-28 11:24 ` Dave Chinner 2020-04-28 11:24 ` Dave Chinner 2020-04-28 15:37 ` Darrick J. Wong 2020-04-28 15:37 ` Darrick J. Wong 2020-04-28 22:02 ` Dave Chinner 2020-04-28 22:02 ` Dave Chinner 2020-06-04 7:37 ` Ruan Shiyang 2020-06-04 7:37 ` Ruan Shiyang 2020-06-04 14:51 ` Darrick J. 
Wong 2020-06-04 14:51 ` Darrick J. Wong 2020-06-05 1:30 ` Dave Chinner 2020-06-05 1:30 ` Dave Chinner 2020-06-05 2:30 ` Ruan Shiyang 2020-06-05 2:30 ` Ruan Shiyang 2020-06-05 2:11 ` Ruan Shiyang 2020-06-05 2:11 ` Ruan Shiyang
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20200427084750.136031-2-ruansy.fnst@cn.fujitsu.com \ --to=ruansy.fnst@cn.fujitsu.com \ --cc=darrick.wong@oracle.com \ --cc=david@fromorbit.com \ --cc=hch@lst.de \ --cc=linux-fsdevel@vger.kernel.org \ --cc=linux-kernel@vger.kernel.org \ --cc=linux-mm@kvack.org \ --cc=linux-nvdimm@lists.01.org \ --cc=linux-xfs@vger.kernel.org \ --cc=qi.fuli@fujitsu.com \ --cc=rgoldwyn@suse.de \ --cc=y-goto@fujitsu.com \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.