All of lore.kernel.org
 help / color / mirror / Atom feed
From: yulei.kernel@gmail.com
To: linux-mm@kvack.org, akpm@linux-foundation.org,
	linux-fsdevel@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, naoya.horiguchi@nec.com,
	viro@zeniv.linux.org.uk, pbonzini@redhat.com
Cc: joao.m.martins@oracle.com, rdunlap@infradead.org,
	sean.j.christopherson@intel.com, xiaoguangrong.eric@gmail.com,
	kernellwp@gmail.com, lihaiwei.kernel@gmail.com,
	Yulei Zhang <yuleixzhang@tencent.com>,
	Chen Zhuo <sagazchen@tencent.com>
Subject: [RFC V2 27/37] mm: add pud_special() check to support dmem huge pud
Date: Mon,  7 Dec 2020 19:31:20 +0800	[thread overview]
Message-ID: <daeb7315990415d272803d5ddbf46418dac4a8f8.1607332046.git.yuleixzhang@tencent.com> (raw)
In-Reply-To: <cover.1607332046.git.yuleixzhang@tencent.com>

From: Yulei Zhang <yuleixzhang@tencent.com>

Add pud_special() and follow_special_pud() to support dmem
huge pud as we do for dmem huge pmd.

Signed-off-by: Chen Zhuo <sagazchen@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
---
 arch/x86/include/asm/pgtable.h |  2 +-
 include/linux/huge_mm.h        |  2 +-
 mm/gup.c                       | 46 ++++++++++++++++++++++++++++++++++++++++++
 mm/huge_memory.c               | 11 ++++++----
 mm/memory.c                    |  4 ++--
 mm/mprotect.c                  |  2 ++
 mm/pagewalk.c                  |  2 +-
 7 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9e36d42..2284387 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -265,7 +265,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static inline int pud_trans_huge(pud_t pud)
 {
-	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP|_PAGE_DMEM)) == _PAGE_PSE;
 }
 #endif
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2514b90..b69c940 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -251,7 +251,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 		struct vm_area_struct *vma)
 {
-	if (pud_trans_huge(*pud) || pud_devmap(*pud))
+	if (pud_trans_huge(*pud) || pud_devmap(*pud) || pud_special(*pud))
 		return __pud_trans_huge_lock(pud, vma);
 	else
 		return NULL;
diff --git a/mm/gup.c b/mm/gup.c
index 0ea9071..8eb85ba 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -423,6 +423,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 	return ERR_PTR(-EEXIST);
 }
 
+static struct page *
+follow_special_pud(struct vm_area_struct *vma, unsigned long address,
+		   pud_t *pud, unsigned int flags)
+{
+	spinlock_t *ptl;
+
+	if ((flags & FOLL_DUMP) && is_huge_zero_pud(*pud))
+		/* Avoid special (like zero) pages in core dumps */
+		return ERR_PTR(-EFAULT);
+
+	/* No page to get reference */
+	if (flags & FOLL_GET)
+		return ERR_PTR(-EFAULT);
+
+	if (flags & FOLL_TOUCH) {
+		pud_t _pud;
+
+		ptl = pud_lock(vma->vm_mm, pud);
+		if (!pud_special(*pud)) {
+			spin_unlock(ptl);
+			return NULL;
+		}
+		_pud = pud_mkyoung(*pud);
+		if (flags & FOLL_WRITE)
+			_pud = pud_mkdirty(_pud);
+		if (pudp_set_access_flags(vma, address & HPAGE_PMD_MASK,
+					  pud, _pud,
+					  flags & FOLL_WRITE))
+			update_mmu_cache_pud(vma, address, pud);
+		spin_unlock(ptl);
+	}
+
+	/* Proper page table entry exists, but no corresponding struct page */
+	return ERR_PTR(-EEXIST);
+}
+
 /*
  * FOLL_FORCE can write to even unwritable pte's, but only
  * after we've gone through a COW cycle and they are dirty.
@@ -726,6 +762,12 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 			return page;
 		return no_page_table(vma, flags);
 	}
+	if (pud_special(*pud)) {
+		page = follow_special_pud(vma, address, pud, flags);
+		if (page)
+			return page;
+		return no_page_table(vma, flags);
+	}
 	if (is_hugepd(__hugepd(pud_val(*pud)))) {
 		page = follow_huge_pd(vma, address,
 				      __hugepd(pud_val(*pud)), flags,
@@ -2511,6 +2553,10 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
 		return 0;
 
+	/* Bypass dmem pud. It will be handled in outside routine. */
+	if (pud_special(orig))
+		return 0;
+
 	if (pud_devmap(orig)) {
 		if (unlikely(flags & FOLL_LONGTERM))
 			return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6e52d57..7c5385a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -883,6 +883,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pud_mkdevmap(entry);
+	if (pfn_t_dmem(pfn))
+		entry = pud_mkdmem(entry);
 	if (write) {
 		entry = pud_mkyoung(pud_mkdirty(entry));
 		entry = maybe_pud_mkwrite(entry, vma);
@@ -919,7 +921,7 @@ vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
 	 * can't support a 'special' bit.
 	 */
 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
-			!pfn_t_devmap(pfn));
+			!pfn_t_devmap(pfn) && !pfn_t_dmem(pfn));
 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 						(VM_PFNMAP|VM_MIXEDMAP));
 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
@@ -1911,7 +1913,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
 	spinlock_t *ptl;
 
 	ptl = pud_lock(vma->vm_mm, pud);
-	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud) || pud_special(*pud)))
 		return ptl;
 	spin_unlock(ptl);
 	return NULL;
@@ -1922,6 +1924,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pud_t *pud, unsigned long addr)
 {
 	spinlock_t *ptl;
+	pud_t orig_pud;
 
 	ptl = __pud_trans_huge_lock(pud, vma);
 	if (!ptl)
@@ -1932,9 +1935,9 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	 * pgtable_trans_huge_withdraw after finishing pudp related
 	 * operations.
 	 */
-	pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
+	orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
 	tlb_remove_pud_tlb_entry(tlb, pud, addr);
-	if (vma_is_special_huge(vma)) {
+	if (vma_is_special_huge(vma) || pud_special(orig_pud)) {
 		spin_unlock(ptl);
 		/* No zero page support yet */
 	} else {
diff --git a/mm/memory.c b/mm/memory.c
index abb9148..01f3b05 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1078,7 +1078,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 	src_pud = pud_offset(src_p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud) || pud_special(*src_pud)) {
 			int err;
 
 			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
@@ -1375,7 +1375,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+		if (pud_trans_huge(*pud) || pud_devmap(*pud) || pud_special(*pud)) {
 			if (next - addr != HPAGE_PUD_SIZE) {
 				mmap_assert_locked(tlb->mm);
 				split_huge_pud(vma, pud, addr);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b1650b5..05fa453 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -292,6 +292,8 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
 	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_special(*pud))
+			continue;
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		pages += change_pmd_range(vma, pud, addr, next, newprot,
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e7c4575..afd8bca 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -129,7 +129,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 	do {
  again:
 		next = pud_addr_end(addr, end);
-		if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
+		if (pud_none(*pud) || (!walk->vma && !walk->no_vma) || pud_special(*pud)) {
 			if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, depth, walk);
 			if (err)
-- 
1.8.3.1


  parent reply	other threads:[~2020-12-07 11:37 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-07 11:30 [RFC V2 00/37] Enhance memory utilization with DMEMFS yulei.kernel
2020-12-07 11:30 ` [RFC V2 01/37] fs: introduce dmemfs module yulei.kernel
2020-12-07 14:29   ` kernel test robot
2020-12-07 11:30 ` [RFC V2 02/37] mm: support direct memory reservation yulei.kernel
2020-12-07 11:30 ` [RFC V2 03/37] dmem: implement dmem memory management yulei.kernel
2020-12-07 11:30 ` [RFC V2 04/37] dmem: let pat recognize dmem yulei.kernel
2020-12-07 11:30 ` [RFC V2 05/37] dmemfs: support mmap for dmemfs yulei.kernel
2020-12-07 14:51   ` kernel test robot
2020-12-07 11:30 ` [RFC V2 06/37] dmemfs: support truncating inode down yulei.kernel
2020-12-07 11:31 ` [RFC V2 07/37] dmem: trace core functions yulei.kernel
2020-12-07 11:31 ` [RFC V2 08/37] dmem: show some statistic in debugfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 09/37] dmemfs: support remote access yulei.kernel
2020-12-07 11:31 ` [RFC V2 10/37] dmemfs: introduce max_alloc_try_dpages parameter yulei.kernel
2020-12-07 11:31 ` [RFC V2 11/37] mm: export mempolicy interfaces to serve dmem allocator yulei.kernel
2020-12-07 11:31 ` [RFC V2 12/37] dmem: introduce mempolicy support yulei.kernel
2020-12-07 11:31 ` [RFC V2 13/37] mm, dmem: introduce PFN_DMEM and pfn_t_dmem yulei.kernel
2020-12-07 11:31 ` [RFC V2 14/37] mm, dmem: differentiate dmem-pmd and thp-pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 15/37] mm: add pmd_special() check for pmd_trans_huge_lock() yulei.kernel
2020-12-07 11:31 ` [RFC V2 16/37] dmemfs: introduce ->split() to dmemfs_vm_ops yulei.kernel
2020-12-07 11:31 ` [RFC V2 17/37] mm, dmemfs: support unmap_page_range() for dmemfs pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 18/37] mm: follow_pmd_mask() for dmem huge pmd yulei.kernel
2020-12-07 11:31 ` [RFC V2 19/37] mm: gup_huge_pmd() " yulei.kernel
2020-12-07 11:31 ` [RFC V2 20/37] mm: support dmem huge pmd for vmf_insert_pfn_pmd() yulei.kernel
2020-12-07 11:31 ` [RFC V2 21/37] mm: support dmem huge pmd for follow_pfn() yulei.kernel
2020-12-07 11:31 ` [RFC V2 22/37] kvm, x86: Distinguish dmemfs page from mmio page yulei.kernel
2020-12-07 11:31 ` [RFC V2 23/37] kvm, x86: introduce VM_DMEM for syscall support usage yulei.kernel
2020-12-07 15:25   ` kernel test robot
2020-12-07 11:31 ` [RFC V2 24/37] dmemfs: support hugepage for dmemfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 25/37] mm, x86, dmem: fix estimation of reserved page for vaddr_get_pfn() yulei.kernel
2020-12-07 11:31 ` [RFC V2 26/37] mm, dmem: introduce pud_special() for dmem huge pud support yulei.kernel
2020-12-07 11:31 ` yulei.kernel [this message]
2020-12-07 11:31 ` [RFC V2 28/37] mm, dmemfs: support huge_fault() for dmemfs yulei.kernel
2020-12-07 11:31 ` [RFC V2 29/37] mm: add follow_pte_pud() to support huge pud look up yulei.kernel
2020-12-07 11:31 ` [RFC V2 30/37] dmem: introduce dmem_bitmap_alloc() and dmem_bitmap_free() yulei.kernel
2020-12-07 11:31 ` [RFC V2 31/37] dmem: introduce mce handler yulei.kernel
2020-12-07 11:31 ` [RFC V2 32/37] mm, dmemfs: register and handle the dmem mce yulei.kernel
2020-12-07 15:35   ` kernel test robot
2020-12-07 11:31 ` [RFC V2 33/37] kvm, x86: enable record_steal_time for dmem yulei.kernel
2020-12-07 11:31 ` [RFC V2 34/37] dmem: add dmem unit tests yulei.kernel
2020-12-07 11:31 ` [RFC V2 35/37] mm, dmem: introduce dregion->memmap for dmem yulei.kernel
2020-12-07 17:25   ` kernel test robot
2020-12-07 11:31 ` [RFC V2 36/37] vfio: support dmempage refcount for vfio yulei.kernel
2020-12-07 14:29   ` kernel test robot
2020-12-07 17:38   ` kernel test robot
2020-12-07 11:31 ` [RFC V2 37/37] Add documentation for dmemfs yulei.kernel
2020-12-24 18:27   ` Randy Dunlap
2020-12-07 12:02 ` [RFC V2 00/37] Enhance memory utilization with DMEMFS David Hildenbrand
2020-12-07 19:32   ` Dan Williams
2020-12-07 19:32     ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=daeb7315990415d272803d5ddbf46418dac4a8f8.1607332046.git.yuleixzhang@tencent.com \
    --to=yulei.kernel@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=joao.m.martins@oracle.com \
    --cc=kernellwp@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=lihaiwei.kernel@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=naoya.horiguchi@nec.com \
    --cc=pbonzini@redhat.com \
    --cc=rdunlap@infradead.org \
    --cc=sagazchen@tencent.com \
    --cc=sean.j.christopherson@intel.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=xiaoguangrong.eric@gmail.com \
    --cc=yuleixzhang@tencent.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.