All of lore.kernel.org
* [PATCH] numa: fix /proc/<pid>/numa_maps for THP
@ 2016-04-04 15:33 ` Gerald Schaefer
  0 siblings, 0 replies; 8+ messages in thread
From: Gerald Schaefer @ 2016-04-04 15:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Naoya Horiguchi, Kirill A. Shutemov,
	Konstantin Khlebnikov, Michal Hocko, Vlastimil Babka,
	Jerome Marchand, Johannes Weiner, Dave Hansen, Mel Gorman,
	Dan Williams, Martin Schwidefsky, Heiko Carstens,
	Michael Holzheu, Gerald Schaefer, # v4.3+

In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the
layouts may differ depending on the architecture. On s390 this will lead to
inaccurate numa_maps accounting in /proc because of misguided pte_present()
and pte_dirty() checks on the fake pte.
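
To make the failure mode concrete, the pattern in question reduces to the
following (an illustrative sketch of the hunk further below; "dirty" is a
stand-in local, not a variable from the actual code):

	bool dirty;

	/* before: pmd bits reinterpreted as pte bits */
	pte_t huge_pte = *(pte_t *)pmd;
	if (pte_present(huge_pte))	/* may test the wrong bit, e.g. on s390 */
		dirty = pte_dirty(huge_pte);

	/* after: dedicated pmd accessors match the real pmd layout */
	if (pmd_present(*pmd))
		dirty = pmd_dirty(*pmd);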

On other architectures pte_present() and pte_dirty() may work by chance, but
there may be an issue with direct-access (dax) mappings w/o underlying struct
pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page()
the fake pte will be checked with pte_special() and because there is no
"special" bit in a pmd, this will always return false and the VM_PFNMAP |
VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an
invalid struct page pointer would then be returned that can crash the kernel.
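
For reference, the pte_special() path in vm_normal_page() behaves roughly
like this (a condensed sketch, not the verbatim upstream code):

	struct page *vm_normal_page(struct vm_area_struct *vma,
				    unsigned long addr, pte_t pte)
	{
		unsigned long pfn = pte_pfn(pte);

		if (HAVE_PTE_SPECIAL) {
			if (likely(!pte_special(pte)))
				goto check_pfn;
			/*
			 * VM_PFNMAP/VM_MIXEDMAP and zero-page handling
			 * lives here and returns NULL for dax pfns.
			 */
			return NULL;
		}
	check_pfn:
		/*
		 * A fake pte built from a pmd never looks special, so it
		 * always falls through to here; for dax w/o struct pages
		 * the resulting pointer is invalid.
		 */
		return pfn_to_page(pfn);
	}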

This patch fixes the numa_maps THP handling by introducing new "_pmd" variants
of the can_gather_numa_stats() and vm_normal_page() functions.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: <stable@vger.kernel.org> # v4.3+
---
 fs/proc/task_mmu.c | 29 ++++++++++++++++++++++++++---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9df4316..a5fb353 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1518,6 +1518,30 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 	return page;
 }
 
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+					      struct vm_area_struct *vma,
+					      unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pmd_present(pmd))
+		return NULL;
+
+	page = vm_normal_page_pmd(vma, addr, pmd);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
@@ -1529,12 +1553,11 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
-		page = can_gather_numa_stats(huge_pte, vma, addr);
+		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pte_dirty(huge_pte),
+			gather_stats(page, md, pmd_dirty(*pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6bff79a..c5b8efc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1121,6 +1121,8 @@ struct zap_details {
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		pte_t pte);
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd);
 
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
diff --git a/mm/memory.c b/mm/memory.c
index 288a508..61460e2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -789,6 +789,44 @@ out:
 	return pfn_to_page(pfn);
 }
 
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	/*
+	 * There is no pmd_special() but there may be special pmds, e.g.
+	 * in a direct-access (dax) mapping, so let's just replicate the
+	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 */
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
+	}
+
+	if (is_zero_pfn(pfn))
+		return NULL;
+	if (unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page tables.
+	 * eg. VDSO mappings can cause them to exist.
+	 */
+out:
+	return pfn_to_page(pfn);
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
-- 
2.6.6

* Re: [PATCH] numa: fix /proc/<pid>/numa_maps for THP
  2016-04-04 15:33 ` Gerald Schaefer
@ 2016-04-04 15:51   ` kbuild test robot
  -1 siblings, 0 replies; 8+ messages in thread
From: kbuild test robot @ 2016-04-04 15:51 UTC (permalink / raw)
  To: Gerald Schaefer
  Cc: kbuild-all, Andrew Morton, linux-mm, linux-kernel,
	Naoya Horiguchi, Kirill A. Shutemov, Konstantin Khlebnikov,
	Michal Hocko, Vlastimil Babka, Jerome Marchand, Johannes Weiner,
	Dave Hansen, Mel Gorman, Dan Williams, Martin Schwidefsky,
	Heiko Carstens, Michael Holzheu, Gerald Schaefer, # v4.3+

[-- Attachment #1: Type: text/plain, Size: 1399 bytes --]

Hi Gerald,

[auto build test ERROR on v4.6-rc2]
[also build test ERROR on next-20160404]
[if your patch is applied to the wrong git tree, please drop us a note to help improving the system]

url:    https://github.com/0day-ci/linux/commits/Gerald-Schaefer/numa-fix-proc-pid-numa_maps-for-THP/20160404-233625
config: xtensa-allyesconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   mm/memory.c: In function 'vm_normal_page_pmd':
>> mm/memory.c:795:2: error: implicit declaration of function 'pmd_pfn' [-Werror=implicit-function-declaration]
     unsigned long pfn = pmd_pfn(pmd);
     ^
   cc1: some warnings being treated as errors

vim +/pmd_pfn +795 mm/memory.c

   789		return pfn_to_page(pfn);
   790	}
   791	
   792	struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
   793					pmd_t pmd)
   794	{
 > 795		unsigned long pfn = pmd_pfn(pmd);
   796	
   797		/*
   798		 * There is no pmd_special() but there may be special pmds, e.g.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 44886 bytes --]

* Re: [PATCH] numa: fix /proc/<pid>/numa_maps for THP
  2016-04-04 15:33 ` Gerald Schaefer
@ 2016-04-04 15:56   ` kbuild test robot
  -1 siblings, 0 replies; 8+ messages in thread
From: kbuild test robot @ 2016-04-04 15:56 UTC (permalink / raw)
  To: Gerald Schaefer
  Cc: kbuild-all, Andrew Morton, linux-mm, linux-kernel,
	Naoya Horiguchi, Kirill A. Shutemov, Konstantin Khlebnikov,
	Michal Hocko, Vlastimil Babka, Jerome Marchand, Johannes Weiner,
	Dave Hansen, Mel Gorman, Dan Williams, Martin Schwidefsky,
	Heiko Carstens, Michael Holzheu, Gerald Schaefer, # v4.3+

[-- Attachment #1: Type: text/plain, Size: 1350 bytes --]

Hi Gerald,

[auto build test ERROR on v4.6-rc2]
[also build test ERROR on next-20160404]
[if your patch is applied to the wrong git tree, please drop us a note to help improving the system]

url:    https://github.com/0day-ci/linux/commits/Gerald-Schaefer/numa-fix-proc-pid-numa_maps-for-THP/20160404-233625
config: tile-allyesconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=tile 

All errors (new ones prefixed by >>):

   fs/proc/task_mmu.c: In function 'gather_pte_stats':
>> fs/proc/task_mmu.c:1560:4: error: implicit declaration of function 'pmd_dirty'
   cc1: some warnings being treated as errors

vim +/pmd_dirty +1560 fs/proc/task_mmu.c

  1554		ptl = pmd_trans_huge_lock(pmd, vma);
  1555		if (ptl) {
  1556			struct page *page;
  1557	
  1558			page = can_gather_numa_stats_pmd(*pmd, vma, addr);
  1559			if (page)
> 1560				gather_stats(page, md, pmd_dirty(*pmd),
  1561					     HPAGE_PMD_SIZE/PAGE_SIZE);
  1562			spin_unlock(ptl);
  1563			return 0;
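
pmd_dirty() has the same dependency, so the THP branch in gather_pte_stats()
(together with the new can_gather_numa_stats_pmd() helper) would presumably
need a similar guard; a hypothetical sketch, not necessarily the merged fix:

	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (ptl) {
			struct page *page;

			page = can_gather_numa_stats_pmd(*pmd, vma, addr);
			if (page)
				gather_stats(page, md, pmd_dirty(*pmd),
					     HPAGE_PMD_SIZE/PAGE_SIZE);
			spin_unlock(ptl);
			return 0;
		}
	#endif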

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 43884 bytes --]

* Re: [PATCH] numa: fix /proc/<pid>/numa_maps for THP
  2016-04-04 15:24 Gerald Schaefer
@ 2016-04-04 18:45 ` Greg KH
  0 siblings, 0 replies; 8+ messages in thread
From: Greg KH @ 2016-04-04 18:45 UTC (permalink / raw)
  To: Gerald Schaefer; +Cc: # v4.3+

On Mon, Apr 04, 2016 at 05:24:00PM +0200, Gerald Schaefer wrote:
> In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the
> layouts may differ depending on the architecture. On s390 this will lead to
> inaccurate numa_maps accounting in /proc because of misguided pte_present()
> and pte_dirty() checks on the fake pte.
> 
> On other architectures pte_present() and pte_dirty() may work by chance, but
> there may be an issue with direct-access (dax) mappings w/o underlying struct
> pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page()
> the fake pte will be checked with pte_special() and because there is no
> "special" bit in a pmd, this will always return false and the VM_PFNMAP |
> VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an
> invalid struct page pointer would then be returned that can crash the kernel.
> 
> This patch fixes the numa_maps THP handling by introducing new "_pmd" variants
> of the can_gather_numa_stats() and vm_normal_page() functions.
> 
> Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
> Cc: <stable@vger.kernel.org> # v4.3+

What is the git commit id of this in Linus's tree?

thanks,

greg k-h

* [PATCH] numa: fix /proc/<pid>/numa_maps for THP
@ 2016-04-04 15:24 Gerald Schaefer
  2016-04-04 18:45 ` Greg KH
  0 siblings, 1 reply; 8+ messages in thread
From: Gerald Schaefer @ 2016-04-04 15:24 UTC (permalink / raw)
  To: gerald.schaefer; +Cc: # v4.3+

In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the
layouts may differ depending on the architecture. On s390 this will lead to
inaccurate numa_maps accounting in /proc because of misguided pte_present()
and pte_dirty() checks on the fake pte.

On other architectures pte_present() and pte_dirty() may work by chance, but
there may be an issue with direct-access (dax) mappings w/o underlying struct
pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page()
the fake pte will be checked with pte_special() and because there is no
"special" bit in a pmd, this will always return false and the VM_PFNMAP |
VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an
invalid struct page pointer would then be returned that can crash the kernel.

This patch fixes the numa_maps THP handling by introducing new "_pmd" variants
of the can_gather_numa_stats() and vm_normal_page() functions.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: <stable@vger.kernel.org> # v4.3+
---
 fs/proc/task_mmu.c | 29 ++++++++++++++++++++++++++---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9df4316..a5fb353 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1518,6 +1518,30 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 	return page;
 }
 
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+					      struct vm_area_struct *vma,
+					      unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pmd_present(pmd))
+		return NULL;
+
+	page = vm_normal_page_pmd(vma, addr, pmd);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
@@ -1529,12 +1553,11 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
-		page = can_gather_numa_stats(huge_pte, vma, addr);
+		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pte_dirty(huge_pte),
+			gather_stats(page, md, pmd_dirty(*pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6bff79a..c5b8efc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1121,6 +1121,8 @@ struct zap_details {
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		pte_t pte);
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd);
 
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
diff --git a/mm/memory.c b/mm/memory.c
index 288a508..61460e2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -789,6 +789,44 @@ out:
 	return pfn_to_page(pfn);
 }
 
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	/*
+	 * There is no pmd_special() but there may be special pmds, e.g.
+	 * in a direct-access (dax) mapping, so let's just replicate the
+	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 */
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
+	}
+
+	if (is_zero_pfn(pfn))
+		return NULL;
+	if (unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page tables.
+	 * eg. VDSO mappings can cause them to exist.
+	 */
+out:
+	return pfn_to_page(pfn);
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
-- 
2.6.6


end of thread, other threads:[~2016-04-04 18:45 UTC | newest]

Thread overview: 8+ messages
-- links below jump to the message on this page --
2016-04-04 15:33 [PATCH] numa: fix /proc/<pid>/numa_maps for THP Gerald Schaefer
2016-04-04 15:51 ` kbuild test robot
2016-04-04 15:56 ` kbuild test robot
  -- strict thread matches above, loose matches on Subject: below --
2016-04-04 15:24 Gerald Schaefer
2016-04-04 18:45 ` Greg KH
