From: yulei.kernel@gmail.com
To: linux-mm@kvack.org, akpm@linux-foundation.org,
linux-fsdevel@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, naoya.horiguchi@nec.com,
viro@zeniv.linux.org.uk, pbonzini@redhat.com
Cc: joao.m.martins@oracle.com, rdunlap@infradead.org,
sean.j.christopherson@intel.com, xiaoguangrong.eric@gmail.com,
kernellwp@gmail.com, lihaiwei.kernel@gmail.com,
Yulei Zhang <yuleixzhang@tencent.com>,
Chen Zhuo <sagazchen@tencent.com>
Subject: [RFC V2 35/37] mm, dmem: introduce dregion->memmap for dmem
Date: Mon, 7 Dec 2020 19:31:28 +0800 [thread overview]
Message-ID: <db966b831955200a63643dcbbb71ffc5ae65a642.1607332046.git.yuleixzhang@tencent.com> (raw)
In-Reply-To: <cover.1607332046.git.yuleixzhang@tencent.com>
From: Yulei Zhang <yuleixzhang@tencent.com>
Append a 'memmap' field to struct dmem_region, mapping each dmem page to
a struct dmempage.
Currently struct dmempage has just one member, '_refcount', which tracks
the number of modules holding a reference to the dmem page.
A module which allocates a dmem page from the dmempool takes the first
reference, setting _refcount to 1.
A module which returns a dmem page to the dmempool decrements _refcount
and frees the page if _refcount drops to zero.
Whenever module A passes a dmem page to module B, module B should call
get_dmem_pfn() to increase _refcount on that page before using it, so it
cannot reference a dmem page that is concurrently freed by another
module. Conversely, when module B is done with the page, it must call
put_dmem_pfn() to decrease the _refcount.
Signed-off-by: Chen Zhuo <sagazchen@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
---
include/linux/dmem.h | 5 ++
mm/dmem.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 139 insertions(+), 13 deletions(-)
diff --git a/include/linux/dmem.h b/include/linux/dmem.h
index fe0b270..8aaa80b 100644
--- a/include/linux/dmem.h
+++ b/include/linux/dmem.h
@@ -22,6 +22,9 @@
bool is_dmem_pfn(unsigned long pfn);
#define dmem_free_page(addr) dmem_free_pages(addr, 1)
+void get_dmem_pfn(unsigned long pfn);
+#define put_dmem_pfn(pfn) dmem_free_page(PFN_PHYS(pfn))
+
bool dmem_memory_failure(unsigned long pfn, int flags);
struct dmem_mce_notifier_info {
@@ -45,5 +48,7 @@ static inline bool dmem_memory_failure(unsigned long pfn, int flags)
{
return false;
}
+void get_dmem_pfn(unsigned long pfn) {}
+void put_dmem_pfn(unsigned long pfn) {}
#endif
#endif /* _LINUX_DMEM_H */
diff --git a/mm/dmem.c b/mm/dmem.c
index dd81b24..776dbf2 100644
--- a/mm/dmem.c
+++ b/mm/dmem.c
@@ -47,6 +47,7 @@ struct dmem_region {
unsigned long static_error_bitmap;
unsigned long *error_bitmap;
+ void *memmap;
};
/*
@@ -91,6 +92,10 @@ struct dmem_pool {
struct dmem_node nodes[MAX_NUMNODES];
};
+struct dmempage {
+ atomic_t _refcount;
+};
+
static struct dmem_pool dmem_pool = {
.lock = __MUTEX_INITIALIZER(dmem_pool.lock),
.mce_notifier_chain = RAW_NOTIFIER_INIT(dmem_pool.mce_notifier_chain),
@@ -123,6 +128,40 @@ struct dmem_pool {
#define for_each_dmem_region(_dnode, _dregion) \
list_for_each_entry(_dregion, &(_dnode)->regions, node)
+#define pfn_to_dmempage(_pfn, _dregion) \
+ ((struct dmempage *)(_dregion)->memmap + \
+ pfn_to_dpage(_pfn) - (_dregion)->dpage_start_pfn)
+
+#define dmempage_to_dpage(_dmempage, _dregion) \
+ ((_dmempage) - (struct dmempage *)(_dregion)->memmap + \
+ (_dregion)->dpage_start_pfn)
+
+static inline int dmempage_count(struct dmempage *dmempage)
+{
+ return atomic_read(&dmempage->_refcount);
+}
+
+static inline void set_dmempage_count(struct dmempage *dmempage, int v)
+{
+ atomic_set(&dmempage->_refcount, v);
+}
+
+static inline void dmempage_ref_inc(struct dmempage *dmempage)
+{
+ atomic_inc(&dmempage->_refcount);
+}
+
+static inline int dmempage_ref_dec_and_test(struct dmempage *dmempage)
+{
+ return atomic_dec_and_test(&dmempage->_refcount);
+}
+
+static inline int put_dmempage_testzero(struct dmempage *dmempage)
+{
+ VM_BUG_ON(dmempage_count(dmempage) == 0);
+ return dmempage_ref_dec_and_test(dmempage);
+}
+
int dmem_register_mce_notifier(struct notifier_block *nb)
{
int ret;
@@ -559,10 +598,25 @@ static int __init dmem_late_init(void)
}
late_initcall(dmem_late_init);
+static void *dmem_memmap_alloc(unsigned long dpages)
+{
+ unsigned long size;
+
+ size = dpages * sizeof(struct dmempage);
+ return vzalloc(size);
+}
+
+static void dmem_memmap_free(void *memmap)
+{
+ if (memmap)
+ vfree(memmap);
+}
+
static int dmem_alloc_region_init(struct dmem_region *dregion,
unsigned long *dpages)
{
unsigned long start, end, *bitmap;
+ void *memmap;
start = DMEM_PAGE_UP(dregion->reserved_start_addr);
end = DMEM_PAGE_DOWN(dregion->reserved_end_addr);
@@ -575,7 +629,14 @@ static int dmem_alloc_region_init(struct dmem_region *dregion,
if (!bitmap)
return -ENOMEM;
+ memmap = dmem_memmap_alloc(*dpages);
+ if (!memmap) {
+ dmem_bitmap_free(*dpages, bitmap, &dregion->static_bitmap);
+ return -ENOMEM;
+ }
+
dregion->bitmap = bitmap;
+ dregion->memmap = memmap;
dregion->next_free_pos = 0;
dregion->dpage_start_pfn = start;
dregion->dpage_end_pfn = end;
@@ -650,7 +711,9 @@ static void dmem_alloc_region_uinit(struct dmem_region *dregion)
dmem_uinit_check_alloc_bitmap(dregion);
dmem_bitmap_free(dpages, bitmap, &dregion->static_bitmap);
+ dmem_memmap_free(dregion->memmap);
dregion->bitmap = NULL;
+ dregion->memmap = NULL;
}
static void __dmem_alloc_uinit(void)
@@ -793,6 +856,16 @@ int dmem_alloc_init(unsigned long dpage_shift)
return dpage_to_phys(dregion->dpage_start_pfn + pos);
}
+static void prep_new_dmempage(unsigned long phys, unsigned int nr,
+ struct dmem_region *dregion)
+{
+ struct dmempage *dmempage = pfn_to_dmempage(PHYS_PFN(phys), dregion);
+ unsigned int i;
+
+ for (i = 0; i < nr; i++, dmempage++)
+ set_dmempage_count(dmempage, 1);
+}
+
/*
* allocate dmem pages from the nodelist
*
@@ -839,6 +912,7 @@ int dmem_alloc_init(unsigned long dpage_shift)
if (addr) {
dnode_count_free_dpages(dnode,
-(long)(*result_nr));
+ prep_new_dmempage(addr, *result_nr, dregion);
break;
}
}
@@ -993,6 +1067,41 @@ static struct dmem_region *find_dmem_region(phys_addr_t phys_addr,
return NULL;
}
+static unsigned int free_dmempages_prepare(struct dmempage *dmempage,
+ unsigned int dpages_nr)
+{
+ unsigned int i, ret = 0;
+
+ for (i = 0; i < dpages_nr; i++, dmempage++)
+ if (put_dmempage_testzero(dmempage))
+ ret++;
+
+ return ret;
+}
+
+void __dmem_free_pages(struct dmempage *dmempage,
+ unsigned int dpages_nr,
+ struct dmem_region *dregion,
+ struct dmem_node *pdnode)
+{
+ phys_addr_t dpage = dmempage_to_dpage(dmempage, dregion);
+ u64 pos;
+ unsigned long err_dpages;
+
+ trace_dmem_free_pages(dpage_to_phys(dpage), dpages_nr);
+ WARN_ON(!dmem_pool.dpage_shift);
+
+ pos = dpage - dregion->dpage_start_pfn;
+ dregion->next_free_pos = min(dregion->next_free_pos, pos);
+
+ /* it is not possible to span multiple regions */
+ WARN_ON(dpage + dpages_nr - 1 >= dregion->dpage_end_pfn);
+
+ err_dpages = dmem_alloc_bitmap_clear(dregion, dpage, dpages_nr);
+
+ dnode_count_free_dpages(pdnode, dpages_nr - err_dpages);
+}
+
/*
* free dmem page to the dmem pool
* @addr: the physical address will be freed
@@ -1002,27 +1111,26 @@ void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr)
{
struct dmem_region *dregion;
struct dmem_node *pdnode = NULL;
- phys_addr_t dpage = phys_to_dpage(addr);
- u64 pos;
- unsigned long err_dpages;
+ struct dmempage *dmempage;
+ unsigned int nr;
mutex_lock(&dmem_pool.lock);
- trace_dmem_free_pages(addr, dpages_nr);
- WARN_ON(!dmem_pool.dpage_shift);
-
dregion = find_dmem_region(addr, &pdnode);
WARN_ON(!dregion || !dregion->bitmap || !pdnode);
- pos = dpage - dregion->dpage_start_pfn;
- dregion->next_free_pos = min(dregion->next_free_pos, pos);
-
- /* it is not possible to span multiple regions */
- WARN_ON(dpage + dpages_nr - 1 >= dregion->dpage_end_pfn);
+ dmempage = pfn_to_dmempage(PHYS_PFN(addr), dregion);
- err_dpages = dmem_alloc_bitmap_clear(dregion, dpage, dpages_nr);
+ nr = free_dmempages_prepare(dmempage, dpages_nr);
+ if (nr == dpages_nr)
+ __dmem_free_pages(dmempage, dpages_nr, dregion, pdnode);
+ else if (nr)
+ while (dpages_nr--, dmempage++) {
+ if (dmempage_count(dmempage))
+ continue;
+ __dmem_free_pages(dmempage, 1, dregion, pdnode);
+ }
- dnode_count_free_dpages(pdnode, dpages_nr - err_dpages);
mutex_unlock(&dmem_pool.lock);
}
EXPORT_SYMBOL(dmem_free_pages);
@@ -1073,3 +1181,16 @@ bool is_dmem_pfn(unsigned long pfn)
return !!find_dmem_region(__pfn_to_phys(pfn), &dnode);
}
EXPORT_SYMBOL(is_dmem_pfn);
+
+void get_dmem_pfn(unsigned long pfn)
+{
+ struct dmem_region *dregion = find_dmem_region(PFN_PHYS(pfn), NULL);
+ struct dmempage *dmempage;
+
+ VM_BUG_ON(!dregion || !dregion->memmap);
+
+ dmempage = pfn_to_dmempage(pfn, dregion);
+ VM_BUG_ON(dmempage_count(dmempage) + 127u <= 127u);
+ dmempage_ref_inc(dmempage);
+}
+EXPORT_SYMBOL(get_dmem_pfn);
--
1.8.3.1
next prev parent reply other threads:[~2020-12-07 11:35 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-12-07 11:30 [RFC V2 00/37] Enhance memory utilization with DMEMFS yulei.kernel
2020-12-07 11:30 ` [RFC V2 01/37] fs: introduce dmemfs module yulei.kernel
2020-12-07 11:30 ` [RFC V2 02/37] mm: support direct memory reservation yulei.kernel
2020-12-07 11:30 ` [RFC V2 03/37] dmem: implement dmem memory management yulei.kernel
2020-12-07 11:30 ` [RFC V2 04/37] dmem: let pat recognize dmem yulei.kernel
2020-12-07 11:30 ` [RFC V2 05/37] dmemfs: support mmap for dmemfs yulei.kernel
2020-12-07 11:30 ` [RFC V2 06/37] dmemfs: support truncating inode down yulei.kernel
2020-12-07 11:31 ` [RFC V2 07/37] dmem: trace core functions yulei.kernel
2020-12-07 11:31 ` [RFC V2 08/37] dmem: show some statistic in debugfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 09/37] dmemfs: support remote access yulei.kernel
2020-12-07 11:31 ` [RFC V2 10/37] dmemfs: introduce max_alloc_try_dpages parameter yulei.kernel
2020-12-07 11:31 ` [RFC V2 11/37] mm: export mempolicy interfaces to serve dmem allocator yulei.kernel
2020-12-07 11:31 ` [RFC V2 12/37] dmem: introduce mempolicy support yulei.kernel
2020-12-07 11:31 ` [RFC V2 13/37] mm, dmem: introduce PFN_DMEM and pfn_t_dmem yulei.kernel
2020-12-07 11:31 ` [RFC V2 14/37] mm, dmem: differentiate dmem-pmd and thp-pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 15/37] mm: add pmd_special() check for pmd_trans_huge_lock() yulei.kernel
2020-12-07 11:31 ` [RFC V2 16/37] dmemfs: introduce ->split() to dmemfs_vm_ops yulei.kernel
2020-12-07 11:31 ` [RFC V2 17/37] mm, dmemfs: support unmap_page_range() for dmemfs pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 18/37] mm: follow_pmd_mask() for dmem huge pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 19/37] mm: gup_huge_pmd() " yulei.kernel
2020-12-07 11:31 ` [RFC V2 20/37] mm: support dmem huge pmd for vmf_insert_pfn_pmd() yulei.kernel
2020-12-07 11:31 ` [RFC V2 21/37] mm: support dmem huge pmd for follow_pfn() yulei.kernel
2020-12-07 11:31 ` [RFC V2 22/37] kvm, x86: Distinguish dmemfs page from mmio page yulei.kernel
2020-12-07 11:31 ` [RFC V2 23/37] kvm, x86: introduce VM_DMEM for syscall support usage yulei.kernel
2020-12-07 11:31 ` [RFC V2 24/37] dmemfs: support hugepage for dmemfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 25/37] mm, x86, dmem: fix estimation of reserved page for vaddr_get_pfn() yulei.kernel
2020-12-07 11:31 ` [RFC V2 26/37] mm, dmem: introduce pud_special() for dmem huge pud support yulei.kernel
2020-12-07 11:31 ` [RFC V2 27/37] mm: add pud_special() check to support dmem huge pud yulei.kernel
2020-12-07 11:31 ` [RFC V2 28/37] mm, dmemfs: support huge_fault() for dmemfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 29/37] mm: add follow_pte_pud() to support huge pud look up yulei.kernel
2020-12-07 11:31 ` [RFC V2 30/37] dmem: introduce dmem_bitmap_alloc() and dmem_bitmap_free() yulei.kernel
2020-12-07 11:31 ` [RFC V2 31/37] dmem: introduce mce handler yulei.kernel
2020-12-07 11:31 ` [RFC V2 32/37] mm, dmemfs: register and handle the dmem mce yulei.kernel
2020-12-07 11:31 ` [RFC V2 33/37] kvm, x86: enable record_steal_time for dmem yulei.kernel
2020-12-07 11:31 ` [RFC V2 34/37] dmem: add dmem unit tests yulei.kernel
2020-12-07 11:31 ` yulei.kernel [this message]
2020-12-07 11:31 ` [RFC V2 36/37] vfio: support dmempage refcount for vfio yulei.kernel
2020-12-07 11:31 ` [RFC V2 37/37] Add documentation for dmemfs yulei.kernel
2020-12-24 18:27 ` Randy Dunlap
2020-12-07 12:02 ` [RFC V2 00/37] Enhance memory utilization with DMEMFS David Hildenbrand
2020-12-07 19:32 ` Dan Williams
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=db966b831955200a63643dcbbb71ffc5ae65a642.1607332046.git.yuleixzhang@tencent.com \
--to=yulei.kernel@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=joao.m.martins@oracle.com \
--cc=kernellwp@gmail.com \
--cc=kvm@vger.kernel.org \
--cc=lihaiwei.kernel@gmail.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=naoya.horiguchi@nec.com \
--cc=pbonzini@redhat.com \
--cc=rdunlap@infradead.org \
--cc=sagazchen@tencent.com \
--cc=sean.j.christopherson@intel.com \
--cc=viro@zeniv.linux.org.uk \
--cc=xiaoguangrong.eric@gmail.com \
--cc=yuleixzhang@tencent.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).