From: yulei.kernel@gmail.com
To: linux-mm@kvack.org, akpm@linux-foundation.org,
	linux-fsdevel@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, naoya.horiguchi@nec.com,
	viro@zeniv.linux.org.uk, pbonzini@redhat.com
Cc: joao.m.martins@oracle.com, rdunlap@infradead.org,
	sean.j.christopherson@intel.com, xiaoguangrong.eric@gmail.com,
	kernellwp@gmail.com, lihaiwei.kernel@gmail.com,
	Yulei Zhang <yuleixzhang@tencent.com>,
	Chen Zhuo <sagazchen@tencent.com>
Subject: [RFC V2 35/37] mm, dmem: introduce dregion->memmap for dmem
Date: Mon,  7 Dec 2020 19:31:28 +0800
Message-ID: <db966b831955200a63643dcbbb71ffc5ae65a642.1607332046.git.yuleixzhang@tencent.com>
In-Reply-To: <cover.1607332046.git.yuleixzhang@tencent.com>

From: Yulei Zhang <yuleixzhang@tencent.com>

Add a 'memmap' field to struct dmem_region, mapping each dmem page to
a struct dmempage.
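
The lookup between a dmem page and its descriptor is plain array
indexing. A standalone sketch of what the new pfn_to_dmempage() /
dmempage_to_dpage() helpers compute (the structs below are reduced to
the fields the arithmetic needs, and the helper name is illustrative):

	struct dmempage { atomic_t _refcount; };

	struct dmem_region {
		unsigned long dpage_start_pfn;	/* first dpage in region */
		void *memmap;			/* struct dmempage array */
	};

	/* Descriptor of the dpage numbered 'dpage' in this region. */
	static struct dmempage *dpage_to_dmempage(struct dmem_region *dregion,
						  unsigned long dpage)
	{
		struct dmempage *memmap = dregion->memmap;

		/* memmap[0] describes dpage_start_pfn, and so on. */
		return memmap + (dpage - dregion->dpage_start_pfn);
	}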

Currently struct dmempage has a single member, '_refcount', which
counts how many modules hold a reference to the dmem page.

A module that allocates a dmem page from the dmem pool takes the first
reference, setting _refcount to 1.

A module that frees a dmem page back to the dmem pool drops one
reference from _refcount; the page is actually released to the pool
only once _refcount reaches zero.

Each time module A passes a dmem page to module B, module B must call
get_dmem_pfn() to raise the page's _refcount before using it, so that
it cannot reference a dmem page freed in parallel by another module.
Conversely, once module B has finished with the page, it must call
put_dmem_pfn() to drop the _refcount.
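
For illustration, the intended hand-off looks like the sketch below
(module B and its entry point are hypothetical; get_dmem_pfn() and
put_dmem_pfn() are the helpers introduced by this patch):

	/* Module B: borrow the dmem page at 'pfn' from module A. */
	void module_b_use_dmem_page(unsigned long pfn)
	{
		/*
		 * Take a reference before touching the page so that a
		 * parallel put from another module cannot free it.
		 */
		get_dmem_pfn(pfn);

		/* ... make use of the dmem page ... */

		/* Drop the reference; the last put frees the page. */
		put_dmem_pfn(pfn);
	}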

Signed-off-by: Chen Zhuo <sagazchen@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
---
 include/linux/dmem.h |   5 ++
 mm/dmem.c            | 147 ++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 139 insertions(+), 13 deletions(-)

diff --git a/include/linux/dmem.h b/include/linux/dmem.h
index fe0b270..8aaa80b 100644
--- a/include/linux/dmem.h
+++ b/include/linux/dmem.h
@@ -22,6 +22,9 @@
 bool is_dmem_pfn(unsigned long pfn);
 #define dmem_free_page(addr)	dmem_free_pages(addr, 1)
 
+void get_dmem_pfn(unsigned long pfn);
+#define put_dmem_pfn(pfn)	dmem_free_page(PFN_PHYS(pfn))
+
 bool dmem_memory_failure(unsigned long pfn, int flags);
 
 struct dmem_mce_notifier_info {
@@ -45,5 +48,7 @@ static inline bool dmem_memory_failure(unsigned long pfn, int flags)
 {
 	return false;
 }
+static inline void get_dmem_pfn(unsigned long pfn) {}
+static inline void put_dmem_pfn(unsigned long pfn) {}
 #endif
 #endif	/* _LINUX_DMEM_H */
diff --git a/mm/dmem.c b/mm/dmem.c
index dd81b24..776dbf2 100644
--- a/mm/dmem.c
+++ b/mm/dmem.c
@@ -47,6 +47,7 @@ struct dmem_region {
 
 	unsigned long static_error_bitmap;
 	unsigned long *error_bitmap;
+	void *memmap;
 };
 
 /*
@@ -91,6 +92,10 @@ struct dmem_pool {
 	struct dmem_node nodes[MAX_NUMNODES];
 };
 
+struct dmempage {
+	atomic_t _refcount;
+};
+
 static struct dmem_pool dmem_pool = {
 	.lock = __MUTEX_INITIALIZER(dmem_pool.lock),
 	.mce_notifier_chain = RAW_NOTIFIER_INIT(dmem_pool.mce_notifier_chain),
@@ -123,6 +128,40 @@ struct dmem_pool {
 #define for_each_dmem_region(_dnode, _dregion)				\
 	list_for_each_entry(_dregion, &(_dnode)->regions, node)
 
+#define pfn_to_dmempage(_pfn, _dregion)					\
+	((struct dmempage *)(_dregion)->memmap +			\
+	pfn_to_dpage(_pfn) - (_dregion)->dpage_start_pfn)
+
+#define dmempage_to_dpage(_dmempage, _dregion)				\
+	((_dmempage) - (struct dmempage *)(_dregion)->memmap +		\
+	(_dregion)->dpage_start_pfn)
+
+static inline int dmempage_count(struct dmempage *dmempage)
+{
+	return atomic_read(&dmempage->_refcount);
+}
+
+static inline void set_dmempage_count(struct dmempage *dmempage, int v)
+{
+	atomic_set(&dmempage->_refcount, v);
+}
+
+static inline void dmempage_ref_inc(struct dmempage *dmempage)
+{
+	atomic_inc(&dmempage->_refcount);
+}
+
+static inline int dmempage_ref_dec_and_test(struct dmempage *dmempage)
+{
+	return atomic_dec_and_test(&dmempage->_refcount);
+}
+
+static inline int put_dmempage_testzero(struct dmempage *dmempage)
+{
+	VM_BUG_ON(dmempage_count(dmempage) == 0);
+	return dmempage_ref_dec_and_test(dmempage);
+}
+
 int dmem_register_mce_notifier(struct notifier_block *nb)
 {
 	int ret;
@@ -559,10 +598,25 @@ static int __init dmem_late_init(void)
 }
 late_initcall(dmem_late_init);
 
+static void *dmem_memmap_alloc(unsigned long dpages)
+{
+	unsigned long size;
+
+	size = dpages * sizeof(struct dmempage);
+	return vzalloc(size);
+}
+
+static void dmem_memmap_free(void *memmap)
+{
+	if (memmap)
+		vfree(memmap);
+}
+
 static int dmem_alloc_region_init(struct dmem_region *dregion,
 				  unsigned long *dpages)
 {
 	unsigned long start, end, *bitmap;
+	void *memmap;
 
 	start = DMEM_PAGE_UP(dregion->reserved_start_addr);
 	end = DMEM_PAGE_DOWN(dregion->reserved_end_addr);
@@ -575,7 +629,14 @@ static int dmem_alloc_region_init(struct dmem_region *dregion,
 	if (!bitmap)
 		return -ENOMEM;
 
+	memmap = dmem_memmap_alloc(*dpages);
+	if (!memmap) {
+		dmem_bitmap_free(*dpages, bitmap, &dregion->static_bitmap);
+		return -ENOMEM;
+	}
+
 	dregion->bitmap = bitmap;
+	dregion->memmap = memmap;
 	dregion->next_free_pos = 0;
 	dregion->dpage_start_pfn = start;
 	dregion->dpage_end_pfn = end;
@@ -650,7 +711,9 @@ static void dmem_alloc_region_uinit(struct dmem_region *dregion)
 	dmem_uinit_check_alloc_bitmap(dregion);
 
 	dmem_bitmap_free(dpages, bitmap, &dregion->static_bitmap);
+	dmem_memmap_free(dregion->memmap);
 	dregion->bitmap = NULL;
+	dregion->memmap = NULL;
 }
 
 static void __dmem_alloc_uinit(void)
@@ -793,6 +856,16 @@ int dmem_alloc_init(unsigned long dpage_shift)
 	return dpage_to_phys(dregion->dpage_start_pfn + pos);
 }
 
+static void prep_new_dmempage(phys_addr_t phys, unsigned int nr,
+			      struct dmem_region *dregion)
+{
+	struct dmempage *dmempage = pfn_to_dmempage(PHYS_PFN(phys), dregion);
+	unsigned int i;
+
+	for (i = 0; i < nr; i++, dmempage++)
+		set_dmempage_count(dmempage, 1);
+}
+
 /*
  * allocate dmem pages from the nodelist
  *
@@ -839,6 +912,7 @@ int dmem_alloc_init(unsigned long dpage_shift)
 			if (addr) {
 				dnode_count_free_dpages(dnode,
 							-(long)(*result_nr));
+				prep_new_dmempage(addr, *result_nr, dregion);
 				break;
 			}
 		}
@@ -993,6 +1067,41 @@ static struct dmem_region *find_dmem_region(phys_addr_t phys_addr,
 	return NULL;
 }
 
+static unsigned int free_dmempages_prepare(struct dmempage *dmempage,
+					   unsigned int dpages_nr)
+{
+	unsigned int i, ret = 0;
+
+	for (i = 0; i < dpages_nr; i++, dmempage++)
+		if (put_dmempage_testzero(dmempage))
+			ret++;
+
+	return ret;
+}
+
+static void __dmem_free_pages(struct dmempage *dmempage,
+			      unsigned int dpages_nr,
+			      struct dmem_region *dregion,
+			      struct dmem_node *pdnode)
+{
+	phys_addr_t dpage = dmempage_to_dpage(dmempage, dregion);
+	u64 pos;
+	unsigned long err_dpages;
+
+	trace_dmem_free_pages(dpage_to_phys(dpage), dpages_nr);
+	WARN_ON(!dmem_pool.dpage_shift);
+
+	pos = dpage - dregion->dpage_start_pfn;
+	dregion->next_free_pos = min(dregion->next_free_pos, pos);
+
+	/* it is not possible to span multiple regions */
+	WARN_ON(dpage + dpages_nr - 1 >= dregion->dpage_end_pfn);
+
+	err_dpages = dmem_alloc_bitmap_clear(dregion, dpage, dpages_nr);
+
+	dnode_count_free_dpages(pdnode, dpages_nr - err_dpages);
+}
+
 /*
  * free dmem page to the dmem pool
 *   @addr: the physical address to be freed
@@ -1002,27 +1111,26 @@ void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr)
 {
 	struct dmem_region *dregion;
 	struct dmem_node *pdnode = NULL;
-	phys_addr_t dpage = phys_to_dpage(addr);
-	u64 pos;
-	unsigned long err_dpages;
+	struct dmempage *dmempage;
+	unsigned int nr;
 
 	mutex_lock(&dmem_pool.lock);
 
-	trace_dmem_free_pages(addr, dpages_nr);
-	WARN_ON(!dmem_pool.dpage_shift);
-
 	dregion = find_dmem_region(addr, &pdnode);
 	WARN_ON(!dregion || !dregion->bitmap || !pdnode);
 
-	pos = dpage - dregion->dpage_start_pfn;
-	dregion->next_free_pos = min(dregion->next_free_pos, pos);
-
-	/* it is not possible to span multiple regions */
-	WARN_ON(dpage + dpages_nr - 1 >= dregion->dpage_end_pfn);
+	dmempage = pfn_to_dmempage(PHYS_PFN(addr), dregion);
 
-	err_dpages = dmem_alloc_bitmap_clear(dregion, dpage, dpages_nr);
+	nr = free_dmempages_prepare(dmempage, dpages_nr);
+	if (nr == dpages_nr)
+		__dmem_free_pages(dmempage, dpages_nr, dregion, pdnode);
+	else if (nr)
+		for (; dpages_nr; dpages_nr--, dmempage++) {
+			if (dmempage_count(dmempage))
+				continue;
+			__dmem_free_pages(dmempage, 1, dregion, pdnode);
+		}
 
-	dnode_count_free_dpages(pdnode, dpages_nr - err_dpages);
 	mutex_unlock(&dmem_pool.lock);
 }
 EXPORT_SYMBOL(dmem_free_pages);
@@ -1073,3 +1181,16 @@ bool is_dmem_pfn(unsigned long pfn)
 	return !!find_dmem_region(__pfn_to_phys(pfn), &dnode);
 }
 EXPORT_SYMBOL(is_dmem_pfn);
+
+void get_dmem_pfn(unsigned long pfn)
+{
+	struct dmem_region *dregion = find_dmem_region(PFN_PHYS(pfn), NULL);
+	struct dmempage *dmempage;
+
+	VM_BUG_ON(!dregion || !dregion->memmap);
+
+	dmempage = pfn_to_dmempage(pfn, dregion);
+	VM_BUG_ON(dmempage_count(dmempage) + 127u <= 127u);
+	dmempage_ref_inc(dmempage);
+}
+EXPORT_SYMBOL(get_dmem_pfn);
-- 
1.8.3.1



