* [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
@ 2016-11-17  0:28 ` Dave Hansen
  0 siblings, 0 replies; 18+ messages in thread
From: Dave Hansen @ 2016-11-17  0:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dave Hansen, hch, akpm, dan.j.williams, khandual, linux-mm


Changes from v1:
 * Do one 'Pte' line per pte size instead of mashing on one line
 * Use PMD_SIZE for pmds instead of PAGE_SIZE, whoops
 * Wrote some Documentation/

--

/proc/$pid/smaps has a number of fields that are intended to imply the
kinds of PTEs used to map memory.  "AnonHugePages" obviously tells you
how many PMDs are being used.  "MMUPageSize" along with the "Hugetlb"
fields tells you how many PTEs you have for a huge page.

The current mechanisms work fine when we have one or two page sizes.
But, they start to get a bit muddled when we mix page sizes inside
one VMA.  For instance, the DAX folks were proposing adding a set of
fields like:

	DevicePages:
	DeviceHugePages:
	DeviceGiganticPages:
	DeviceGinormousPages:

to unmuddle things when page sizes get mixed.  That's fine, but
it does require userspace know the mapping from our various
arbitrary names to hardware page sizes on each architecture and
kernel configuration.  That seems rather suboptimal.

What folks really want is to know how much memory is mapped with
each page size.  How about we just do *that*?

Patch attached.  Seems harmless enough.  Seems to compile on a
bunch of random architectures.  Makes smaps look like this:

Private_Hugetlb:       0 kB
Swap:                  0 kB
SwapPss:               0 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB
Locked:                0 kB
Ptes@4kB:	      32 kB
Ptes@2MB:	    2048 kB

The format I used here should be unlikely to break smaps parsers
unless they're looking for "kB" and now match the 'Ptes@4kB' instead
of the one at the end of the line.
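
As an illustration (this is just a sketch, not part of the patch), a parser
that keys on the field name before the ':' keeps working unmodified:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/self/smaps", "r");
		char line[256];
		unsigned long kb;

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			/* match the field name, not the first "kB" on the line */
			if (sscanf(line, "Ptes@4kB: %lu kB", &kb) == 1)
				printf("mapped with 4k ptes: %lu kB\n", kb);
			else if (sscanf(line, "Ptes@2MB: %lu kB", &kb) == 1)
				printf("mapped with 2M ptes: %lu kB\n", kb);
		}
		fclose(f);
		return 0;
	}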

1. I'd like to thank Dan Williams for showing me a mirror as I
   complained about the bozo that introduced 'AnonHugePages'.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: linux-mm@kvack.org

---

 b/Documentation/filesystems/proc.txt |    6 ++
 b/fs/proc/task_mmu.c                 |   81 ++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff -puN fs/proc/task_mmu.c~smaps-pte-sizes fs/proc/task_mmu.c
--- a/fs/proc/task_mmu.c~smaps-pte-sizes	2016-11-16 15:43:56.756991084 -0800
+++ b/fs/proc/task_mmu.c	2016-11-16 16:19:47.354789912 -0800
@@ -445,6 +445,9 @@ struct mem_size_stats {
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
+	unsigned long rss_pte;
+	unsigned long rss_pmd;
+	unsigned long rss_pud;
 	u64 pss;
 	u64 swap_pss;
 	bool check_shmem_swap;
@@ -519,6 +522,7 @@ static void smaps_pte_entry(pte_t *pte,
 
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
+		mss->rss_pte += PAGE_SIZE;
 	} else if (is_swap_pte(*pte)) {
 		swp_entry_t swpent = pte_to_swp_entry(*pte);
 
@@ -578,6 +582,7 @@ static void smaps_pmd_entry(pmd_t *pmd,
 		/* pass */;
 	else
 		VM_BUG_ON_PAGE(1, page);
+	mss->rss_pmd += PMD_SIZE;
 	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
 }
 #else
@@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
 	}
 	if (page) {
 		int mapcount = page_mapcount(page);
+		unsigned long hpage_size = huge_page_size(hstate_vma(vma));
 
+		mss->rss_pud += hpage_size;
 		if (mapcount >= 2)
-			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+			mss->shared_hugetlb += hpage_size;
 		else
-			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+			mss->private_hugetlb += hpage_size;
 	}
 	return 0;
 }
@@ -716,6 +723,75 @@ void __weak arch_show_smap(struct seq_fi
 {
 }
 
+/*
+ * What units should we use for a given number?  We want
+ * 2048 to be 2k, so we return 'k'.  1048576 should be
+ * 1M, so we return 'M'.
+ */
+static char size_unit(unsigned long long nr)
+{
+	/*
+	 * This ' ' might look a bit goofy in the output.  But, why
+	 * bother doing anything.  Do we even have a <1k page size?
+	 */
+	if (nr < (1ULL<<10))
+		return ' ';
+	if (nr < (1ULL<<20))
+		return 'k';
+	if (nr < (1ULL<<30))
+		return 'M';
+	if (nr < (1ULL<<40))
+		return 'G';
+	if (nr < (1ULL<<50))
+		return 'T';
+	if (nr < (1ULL<<60))
+		return 'P';
+	return 'E';
+}
+
+/*
+ * How should we shift down a given number to scale it
+ * with the units we are printing it as?  We want 2048 to be 2k,
+ * so we want it shifted down by 10.  1048576 should be
+ * 1M, so we want it shifted down by 20.
+ */
+static int size_shift(unsigned long long nr)
+{
+	if (nr < (1ULL<<10))
+		return 0;
+	if (nr < (1ULL<<20))
+		return 10;
+	if (nr < (1ULL<<30))
+		return 20;
+	if (nr < (1ULL<<40))
+		return 30;
+	if (nr < (1ULL<<50))
+		return 40;
+	if (nr < (1ULL<<60))
+		return 50;
+	return 60;
+}
+
+static void show_one_smap_pte(struct seq_file *m, unsigned long bytes_rss,
+		unsigned long pte_size)
+{
+	seq_printf(m, "Ptes@%ld%cB:	%8lu kB\n",
+			pte_size >> size_shift(pte_size),
+			size_unit(pte_size),
+			bytes_rss >> 10);
+}
+
+static void show_smap_ptes(struct seq_file *m, struct mem_size_stats *mss)
+{
+	/* Only print the entries for page sizes present in the VMA */
+	if (mss->rss_pte)
+		show_one_smap_pte(m, mss->rss_pte, PAGE_SIZE);
+	if (mss->rss_pmd)
+		show_one_smap_pte(m, mss->rss_pmd, PMD_SIZE);
+	if (mss->rss_pud)
+		show_one_smap_pte(m, mss->rss_pud, PUD_SIZE);
+}
+
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
 	struct vm_area_struct *vma = v;
@@ -799,6 +875,7 @@ static int show_smap(struct seq_file *m,
 		   (vma->vm_flags & VM_LOCKED) ?
 			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
+	show_smap_ptes(m, &mss);
 	arch_show_smap(m, vma);
 	show_smap_vma_flags(m, vma);
 	m_cache_vma(m, vma);
diff -puN Documentation/filesystems/proc.txt~smaps-pte-sizes Documentation/filesystems/proc.txt
--- a/Documentation/filesystems/proc.txt~smaps-pte-sizes	2016-11-16 16:10:48.707307044 -0800
+++ b/Documentation/filesystems/proc.txt	2016-11-16 16:10:52.172464547 -0800
@@ -418,6 +418,9 @@ SwapPss:               0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:                0 kB
+Ptes@4kB:	       4 kB
+Ptes@2MB:	    8192 kB
+
 VmFlags: rd ex mr mw me dw
 
 the first of these lines shows the same information as is displayed for the
@@ -450,6 +453,9 @@ replaced by copy-on-write) part of the u
 "SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
 does not take into account swapped out page of underlying shmem objects.
 "Locked" indicates whether the mapping is locked in memory or not.
+"Ptes@..." lines show how much memory is mapped by the page table entries
+currently in place and pointing to memory.  There is a line for each entry
+size present in the hardware page tables for this mapping.
 
 "VmFlags" field deserves a separate description. This member represents the kernel
 flags associated with the particular virtual memory area in two letter encoded
_

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-17  0:28 ` Dave Hansen
@ 2016-11-24 14:22   ` Vlastimil Babka
  -1 siblings, 0 replies; 18+ messages in thread
From: Vlastimil Babka @ 2016-11-24 14:22 UTC (permalink / raw)
  To: Dave Hansen, linux-kernel; +Cc: hch, akpm, dan.j.williams, khandual, linux-mm

On 11/17/2016 01:28 AM, Dave Hansen wrote:
> Changes from v1:
>  * Do one 'Pte' line per pte size instead of mashing on one line
>  * Use PMD_SIZE for pmds instead of PAGE_SIZE, whoops
>  * Wrote some Documentation/
>
> --
>
> /proc/$pid/smaps has a number of fields that are intended to imply the
> kinds of PTEs used to map memory.  "AnonHugePages" obviously tells you
> how many PMDs are being used.  "MMUPageSize" along with the "Hugetlb"
> fields tells you how many PTEs you have for a huge page.
>
> The current mechanisms work fine when we have one or two page sizes.
> But, they start to get a bit muddled when we mix page sizes inside
> one VMA.  For instance, the DAX folks were proposing adding a set of
> fields like:
>
> 	DevicePages:
> 	DeviceHugePages:
> 	DeviceGiganticPages:
> 	DeviceGinormousPages:
>
> to unmuddle things when page sizes get mixed.  That's fine, but
> it does require userspace know the mapping from our various
> arbitrary names to hardware page sizes on each architecture and
> kernel configuration.  That seems rather suboptimal.
>
> What folks really want is to know how much memory is mapped with
> each page size.  How about we just do *that*?
>
> Patch attached.  Seems harmless enough.  Seems to compile on a
> bunch of random architectures.  Makes smaps look like this:
>
> Private_Hugetlb:       0 kB
> Swap:                  0 kB
> SwapPss:               0 kB
> KernelPageSize:        4 kB
> MMUPageSize:           4 kB
> Locked:                0 kB
> Ptes@4kB:	      32 kB
> Ptes@2MB:	    2048 kB
>
> The format I used here should be unlikely to break smaps parsers
> unless they're looking for "kB" and now match the 'Ptes@4kB' instead
> of the one at the end of the line.
>
> 1. I'd like to thank Dan Williams for showing me a mirror as I
>    complained about the bozo that introduced 'AnonHugePages'.
>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
> Cc: linux-mm@kvack.org

Hmm, why not, I guess. But are HugeTLBs handled correctly?

> @@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
>  	}
>  	if (page) {
>  		int mapcount = page_mapcount(page);
> +		unsigned long hpage_size = huge_page_size(hstate_vma(vma));
>
> +		mss->rss_pud += hpage_size;

This hardcoded pud doesn't look right, doesn't the pmd/pud depend on 
hpage_size?

>  		if (mapcount >= 2)
> -			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
> +			mss->shared_hugetlb += hpage_size;
>  		else
> -			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
> +			mss->private_hugetlb += hpage_size;
>  	}
>  	return 0;
>  }

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-17  0:28 ` Dave Hansen
@ 2016-11-25  4:00   ` Anshuman Khandual
  -1 siblings, 0 replies; 18+ messages in thread
From: Anshuman Khandual @ 2016-11-25  4:00 UTC (permalink / raw)
  To: Dave Hansen, linux-kernel; +Cc: hch, akpm, dan.j.williams, linux-mm

On 11/17/2016 05:58 AM, Dave Hansen wrote:
> Changes from v1:
>  * Do one 'Pte' line per pte size instead of mashing on one line
>  * Use PMD_SIZE for pmds instead of PAGE_SIZE, whoops
>  * Wrote some Documentation/
> 
> --
> 
> /proc/$pid/smaps has a number of fields that are intended to imply the
> kinds of PTEs used to map memory.  "AnonHugePages" obviously tells you
> how many PMDs are being used.  "MMUPageSize" along with the "Hugetlb"
> fields tells you how many PTEs you have for a huge page.
> 
> The current mechanisms work fine when we have one or two page sizes.
> But, they start to get a bit muddled when we mix page sizes inside
> one VMA.  For instance, the DAX folks were proposing adding a set of
> fields like:

So is DAX the only case which creates this scenario of multiple page sizes in
the same VMA?  Are there any cases other than DAX mappings?

> 
> 	DevicePages:
> 	DeviceHugePages:
> 	DeviceGiganticPages:
> 	DeviceGinormousPages:

I guess these are the page sizes supported at the PTE, PMD, PUD and PGD levels.
Are all of these page sizes supported right now, or are we just creating
placeholders for the future?

> 
> to unmuddle things when page sizes get mixed.  That's fine, but
> it does require userspace know the mapping from our various
> arbitrary names to hardware page sizes on each architecture and
> kernel configuration.  That seems rather suboptimal.
> 
> What folks really want is to know how much memory is mapped with
> each page size.  How about we just do *that*?
> 
> Patch attached.  Seems harmless enough.  Seems to compile on a
> bunch of random architectures.  Makes smaps look like this:
> 
> Private_Hugetlb:       0 kB
> Swap:                  0 kB
> SwapPss:               0 kB
> KernelPageSize:        4 kB
> MMUPageSize:           4 kB
> Locked:                0 kB
> Ptes@4kB:	      32 kB
> Ptes@2MB:	    2048 kB

So in the left column we are explicitly indicating the size of the PTE
and expect the user to figure out where it really sits, whether at the PTE,
PMD, PUD, etc. level.  That's a little bit different from 'AnonHugePages' or
the Shared_HugeTLB/Private_HugeTLB pages, which we know are at the PMD/PUD
level.

> 
> The format I used here should be unlikely to break smaps parsers
> unless they're looking for "kB" and now match the 'Ptes@4kB' instead
> of the one at the end of the line.

Right. So you are dropping the idea of introducing these fields, which you
mentioned before for DAX mappings.

 	DevicePages:
 	DeviceHugePages:
 	DeviceGiganticPages:
 	DeviceGinormousPages:


> 
> 1. I'd like to thank Dan Williams for showing me a mirror as I
>    complained about the bozo that introduced 'AnonHugePages'.
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
> Cc: linux-mm@kvack.org
> 
> ---
> 
>  b/Documentation/filesystems/proc.txt |    6 ++
>  b/fs/proc/task_mmu.c                 |   81 ++++++++++++++++++++++++++++++++++-
>  2 files changed, 85 insertions(+), 2 deletions(-)
> 
> diff -puN fs/proc/task_mmu.c~smaps-pte-sizes fs/proc/task_mmu.c
> --- a/fs/proc/task_mmu.c~smaps-pte-sizes	2016-11-16 15:43:56.756991084 -0800
> +++ b/fs/proc/task_mmu.c	2016-11-16 16:19:47.354789912 -0800
> @@ -445,6 +445,9 @@ struct mem_size_stats {
>  	unsigned long swap;
>  	unsigned long shared_hugetlb;
>  	unsigned long private_hugetlb;
> +	unsigned long rss_pte;
> +	unsigned long rss_pmd;
> +	unsigned long rss_pud;
>  	u64 pss;
>  	u64 swap_pss;
>  	bool check_shmem_swap;
> @@ -519,6 +522,7 @@ static void smaps_pte_entry(pte_t *pte,
> 
>  	if (pte_present(*pte)) {
>  		page = vm_normal_page(vma, addr, *pte);
> +		mss->rss_pte += PAGE_SIZE;
>  	} else if (is_swap_pte(*pte)) {
>  		swp_entry_t swpent = pte_to_swp_entry(*pte);
> 
> @@ -578,6 +582,7 @@ static void smaps_pmd_entry(pmd_t *pmd,
>  		/* pass */;
>  	else
>  		VM_BUG_ON_PAGE(1, page);
> +	mss->rss_pmd += PMD_SIZE;
>  	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
>  }
>  #else
> @@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
>  	}
>  	if (page) {
>  		int mapcount = page_mapcount(page);
> +		unsigned long hpage_size = huge_page_size(hstate_vma(vma));
> 
> +		mss->rss_pud += hpage_size;
>  		if (mapcount >= 2)
> -			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
> +			mss->shared_hugetlb += hpage_size;
>  		else
> -			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
> +			mss->private_hugetlb += hpage_size;
>  	}
>  	return 0;

Hmm, is this related to these new changes?  Replacing
huge_page_size(hstate_vma(vma)) with 'hpage_size' can be done in a separate patch.

>  }
> @@ -716,6 +723,75 @@ void __weak arch_show_smap(struct seq_fi
>  {
>  }
> 
> +/*
> + * What units should we use for a given number?  We want
> + * 2048 to be 2k, so we return 'k'.  1048576 should be
> + * 1M, so we return 'M'.
> + */
> +static char size_unit(unsigned long long nr)
> +{
> +	/*
> +	 * This ' ' might look a bit goofy in the output.  But, why
> +	 * bother doing anything.  Do we even have a <1k page size?
> +	 */
> +	if (nr < (1ULL<<10))
> +		return ' ';
> +	if (nr < (1ULL<<20))
> +		return 'k';
> +	if (nr < (1ULL<<30))
> +		return 'M';
> +	if (nr < (1ULL<<40))
> +		return 'G';
> +	if (nr < (1ULL<<50))
> +		return 'T';
> +	if (nr < (1ULL<<60))
> +		return 'P';
> +	return 'E';
> +}
> +
> +/*
> + * How should we shift down a given number to scale it
> + * with the units we are printing it as?  We want 2048 to be 2k,
> + * so we want it shifted down by 10.  1048576 should be
> + * 1M, so we want it shifted down by 20.
> + */
> +static int size_shift(unsigned long long nr)
> +{
> +	if (nr < (1ULL<<10))
> +		return 0;
> +	if (nr < (1ULL<<20))
> +		return 10;
> +	if (nr < (1ULL<<30))
> +		return 20;
> +	if (nr < (1ULL<<40))
> +		return 30;
> +	if (nr < (1ULL<<50))
> +		return 40;
> +	if (nr < (1ULL<<60))
> +		return 50;
> +	return 60;
> +}
> +
> +static void show_one_smap_pte(struct seq_file *m, unsigned long bytes_rss,
> +		unsigned long pte_size)
> +{
> +	seq_printf(m, "Ptes@%ld%cB:	%8lu kB\n",
> +			pte_size >> size_shift(pte_size),
> +			size_unit(pte_size),
> +			bytes_rss >> 10);
> +}
> +
> +static void show_smap_ptes(struct seq_file *m, struct mem_size_stats *mss)
> +{
> +	/* Only print the entries for page sizes present in the VMA */
> +	if (mss->rss_pte)
> +		show_one_smap_pte(m, mss->rss_pte, PAGE_SIZE);
> +	if (mss->rss_pmd)
> +		show_one_smap_pte(m, mss->rss_pmd, PMD_SIZE);
> +	if (mss->rss_pud)
> +		show_one_smap_pte(m, mss->rss_pud, PUD_SIZE);
> +}
> +
>  static int show_smap(struct seq_file *m, void *v, int is_pid)
>  {
>  	struct vm_area_struct *vma = v;
> @@ -799,6 +875,7 @@ static int show_smap(struct seq_file *m,
>  		   (vma->vm_flags & VM_LOCKED) ?
>  			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
> 
> +	show_smap_ptes(m, &mss);
>  	arch_show_smap(m, vma);
>  	show_smap_vma_flags(m, vma);
>  	m_cache_vma(m, vma);
> diff -puN Documentation/filesystems/proc.txt~smaps-pte-sizes Documentation/filesystems/proc.txt
> --- a/Documentation/filesystems/proc.txt~smaps-pte-sizes	2016-11-16 16:10:48.707307044 -0800
> +++ b/Documentation/filesystems/proc.txt	2016-11-16 16:10:52.172464547 -0800
> @@ -418,6 +418,9 @@ SwapPss:               0 kB
>  KernelPageSize:        4 kB
>  MMUPageSize:           4 kB
>  Locked:                0 kB
> +Ptes@4kB:	       4 kB
> +Ptes@2MB:	    8192 kB
> +
>  VmFlags: rd ex mr mw me dw
> 
>  the first of these lines shows the same information as is displayed for the
> @@ -450,6 +453,9 @@ replaced by copy-on-write) part of the u
>  "SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
>  does not take into account swapped out page of underlying shmem objects.
>  "Locked" indicates whether the mapping is locked in memory or not.
> +"Ptes@..." lines show how much memory is mapped by the page table entries
> +currently in place and pointing to memory.  There is a line for each entry
> +size present in the hardware page tables for this mapping.
> 
>  "VmFlags" field deserves a separate description. This member represents the kernel
>  flags associated with the particular virtual memory area in two letter encoded
> _
> 

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-24 14:22   ` Vlastimil Babka
@ 2016-11-28 16:52     ` Dave Hansen
  -1 siblings, 0 replies; 18+ messages in thread
From: Dave Hansen @ 2016-11-28 16:52 UTC (permalink / raw)
  To: Vlastimil Babka, linux-kernel
  Cc: hch, akpm, dan.j.williams, khandual, linux-mm

On 11/24/2016 06:22 AM, Vlastimil Babka wrote:
> On 11/17/2016 01:28 AM, Dave Hansen wrote:
>> @@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
>>      }
>>      if (page) {
>>          int mapcount = page_mapcount(page);
>> +        unsigned long hpage_size = huge_page_size(hstate_vma(vma));
>>
>> +        mss->rss_pud += hpage_size;
> 
> This hardcoded pud doesn't look right, doesn't the pmd/pud depend on
> hpage_size?

Urg, nope.  Thanks for noticing that!  I think we'll need something
along the lines of:

                if (hpage_size == PUD_SIZE)
                        mss->rss_pud += PUD_SIZE;
                else if (hpage_size == PMD_SIZE)
                        mss->rss_pmd += PMD_SIZE;
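
In the context of the hunk quoted above, that would look roughly like this
(just a sketch; the actual respin may differ):

	if (page) {
		int mapcount = page_mapcount(page);
		unsigned long hpage_size = huge_page_size(hstate_vma(vma));

		/* attribute the mapping to the page table level it sits at */
		if (hpage_size == PUD_SIZE)
			mss->rss_pud += PUD_SIZE;
		else if (hpage_size == PMD_SIZE)
			mss->rss_pmd += PMD_SIZE;

		if (mapcount >= 2)
			mss->shared_hugetlb += hpage_size;
		else
			mss->private_hugetlb += hpage_size;
	}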

I'll respin and resend.

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-25  4:00   ` Anshuman Khandual
@ 2016-11-28 17:00     ` Dave Hansen
  -1 siblings, 0 replies; 18+ messages in thread
From: Dave Hansen @ 2016-11-28 17:00 UTC (permalink / raw)
  To: Anshuman Khandual, linux-kernel; +Cc: hch, akpm, dan.j.williams, linux-mm

On 11/24/2016 08:00 PM, Anshuman Khandual wrote:
...
>> The current mechanisms work fine when we have one or two page sizes.
>> But, they start to get a bit muddled when we mix page sizes inside
>> one VMA.  For instance, the DAX folks were proposing adding a set of
>> fields like:
> 
> So is DAX the only case which creates this scenario of multiple page sizes in
> the same VMA?  Are there any cases other than DAX mappings?

Both file and anonymous huge pages.  No other ones in the core VM that I
can think of.

>> 	DevicePages:
>> 	DeviceHugePages:
>> 	DeviceGiganticPages:
>> 	DeviceGinormousPages:
> 
> I guess these are the page sizes supported at the PTE, PMD, PUD and PGD levels.
> Are all of these page sizes supported right now, or are we just creating
> placeholders for the future?

I know there are patches for PUD level support in DAX, but I don't think
they're merged yet.  There is definitely *not* support for PGD level
since we don't have such support in hardware on x86 as far as I know.

>> SwapPss:               0 kB
>> KernelPageSize:        4 kB
>> MMUPageSize:           4 kB
>> Locked:                0 kB
>> Ptes@4kB:	      32 kB
>> Ptes@2MB:	    2048 kB
> 
> So in the left column we are explicitly indicating the size of the PTE
> and expect the user to figure out where it really sits, whether at the PTE,
> PMD, PUD, etc. level.  That's a little bit different from 'AnonHugePages' or
> the Shared_HugeTLB/Private_HugeTLB pages, which we know are at the PMD/PUD
> level.

Yeah, it's a little different from what we have.

>> The format I used here should be unlikely to break smaps parsers
>> unless they're looking for "kB" and now match the 'Ptes@4kB' instead
>> of the one at the end of the line.
> 
> Right. So you are dropping the idea of introducing these fields, which you
> mentioned before for DAX mappings.
> 
>  	DevicePages:
>  	DeviceHugePages:
>  	DeviceGiganticPages:
>  	DeviceGinormousPages:

Right.  We don't need those if we have this patch.

>>  	if (page) {
>>  		int mapcount = page_mapcount(page);
>> +		unsigned long hpage_size = huge_page_size(hstate_vma(vma));
>>
>> +		mss->rss_pud += hpage_size;
>>  		if (mapcount >= 2)
>> -			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
>> +			mss->shared_hugetlb += hpage_size;
>>  		else
>> -			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
>> +			mss->private_hugetlb += hpage_size;
>>  	}
>>  	return 0;
> 
> Hmm, is this related to these new changes?  Replacing
> huge_page_size(hstate_vma(vma)) with 'hpage_size' can be done in a separate patch.

Yes, this is theoretically unrelated, but I'm not breaking this 3-line
change up into a different patch unless there's a pretty good reason.

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-28 16:52     ` Dave Hansen
@ 2016-11-28 21:07       ` Vlastimil Babka
  -1 siblings, 0 replies; 18+ messages in thread
From: Vlastimil Babka @ 2016-11-28 21:07 UTC (permalink / raw)
  To: Dave Hansen, linux-kernel; +Cc: hch, akpm, dan.j.williams, khandual, linux-mm

On 11/28/2016 05:52 PM, Dave Hansen wrote:
> On 11/24/2016 06:22 AM, Vlastimil Babka wrote:
>> On 11/17/2016 01:28 AM, Dave Hansen wrote:
>>> @@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
>>>      }
>>>      if (page) {
>>>          int mapcount = page_mapcount(page);
>>> +        unsigned long hpage_size = huge_page_size(hstate_vma(vma));
>>>
>>> +        mss->rss_pud += hpage_size;
>>
>> This hardcoded pud doesn't look right, doesn't the pmd/pud depend on
>> hpage_size?
> 
> Urg, nope.  Thanks for noticing that!  I think we'll need something
> along the lines of:
> 
>                 if (hpage_size == PUD_SIZE)
>                         mss->rss_pud += PUD_SIZE;
>                 else if (hpage_size == PMD_SIZE)
>                         mss->rss_pmd += PMD_SIZE;

Sounds better, although I wonder whether there are some weird arches
supporting hugepage sizes that don't match page table levels. I recall
that e.g. MIPS could do arbitrary size, but dunno if the kernel supports
that...

> I'll respin and resend.
> 

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-28 21:07       ` Vlastimil Babka
@ 2016-11-28 21:39         ` Dave Hansen
  -1 siblings, 0 replies; 18+ messages in thread
From: Dave Hansen @ 2016-11-28 21:39 UTC (permalink / raw)
  To: Vlastimil Babka, linux-kernel
  Cc: hch, akpm, dan.j.williams, khandual, linux-mm, Catalin Marinas,
	Will Deacon

... cc'ing the arm64 maintainers

On 11/28/2016 01:07 PM, Vlastimil Babka wrote:
> On 11/28/2016 05:52 PM, Dave Hansen wrote:
>> On 11/24/2016 06:22 AM, Vlastimil Babka wrote:
>>> On 11/17/2016 01:28 AM, Dave Hansen wrote:
>>>> @@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
>>>>      }
>>>>      if (page) {
>>>>          int mapcount = page_mapcount(page);
>>>> +        unsigned long hpage_size = huge_page_size(hstate_vma(vma));
>>>>
>>>> +        mss->rss_pud += hpage_size;
>>>
>>> This hardcoded pud doesn't look right, doesn't the pmd/pud depend on
>>> hpage_size?
>>
>> Urg, nope.  Thanks for noticing that!  I think we'll need something
>> along the lines of:
>>
>>                 if (hpage_size == PUD_SIZE)
>>                         mss->rss_pud += PUD_SIZE;
>>                 else if (hpage_size == PMD_SIZE)
>>                         mss->rss_pmd += PMD_SIZE;
> 
> Sounds better, although I wonder whether there are some weird arches
> supporting hugepage sizes that don't match page table levels. I recall
> that e.g. MIPS could do arbitrary size, but dunno if the kernel supports
> that...

arm64 seems to have pretty arbitrary sizes, and seems to be able to
build them out of multiple hardware PTE sizes.  I think I can fix my
code to handle those:

                if (hpage_size >= PGD_SIZE)
                        mss->rss_pgd += PGD_SIZE;
                else if (hpage_size >= PUD_SIZE)
                        mss->rss_pud += PUD_SIZE;
                else if (hpage_size >= PMD_SIZE)
                        mss->rss_pmd += PMD_SIZE;
                else
                        mss->rss_pte += PAGE_SIZE;

But, I *think* that means that smaps_hugetlb_range() is *currently*
broken for these intermediate arm64 sizes.  The code does:

                if (mapcount >= 2)
                        mss->shared_hugetlb += hpage_size;
                else
                        mss->private_hugetlb += hpage_size;

So I *think* we may count a hugetlbfs arm64 CONT_PTES page multiple
times, and account hpage_size for *each* of the CONT_PTES.  That would
artificially inflate the smaps output for those pages.

Will / Catalin, is there something I'm missing?

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-28 21:39         ` Dave Hansen
@ 2016-11-29  8:01           ` Vlastimil Babka
  -1 siblings, 0 replies; 18+ messages in thread
From: Vlastimil Babka @ 2016-11-29  8:01 UTC (permalink / raw)
  To: Dave Hansen, linux-kernel
  Cc: hch, akpm, dan.j.williams, khandual, linux-mm, Catalin Marinas,
	Will Deacon

On 11/28/2016 10:39 PM, Dave Hansen wrote:
> ... cc'ing the arm64 maintainers
> 
> On 11/28/2016 01:07 PM, Vlastimil Babka wrote:
>> On 11/28/2016 05:52 PM, Dave Hansen wrote:
>>> On 11/24/2016 06:22 AM, Vlastimil Babka wrote:
>>>> On 11/17/2016 01:28 AM, Dave Hansen wrote:
>>>>> @@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
>>>>>      }
>>>>>      if (page) {
>>>>>          int mapcount = page_mapcount(page);
>>>>> +        unsigned long hpage_size = huge_page_size(hstate_vma(vma));
>>>>>
>>>>> +        mss->rss_pud += hpage_size;
>>>>
>>>> This hardcoded pud doesn't look right, doesn't the pmd/pud depend on
>>>> hpage_size?
>>>
>>> Urg, nope.  Thanks for noticing that!  I think we'll need something
>>> along the lines of:
>>>
>>>                 if (hpage_size == PUD_SIZE)
>>>                         mss->rss_pud += PUD_SIZE;
>>>                 else if (hpage_size == PMD_SIZE)
>>>                         mss->rss_pmd += PMD_SIZE;
>>
>> Sounds better, although I wonder whether there are some weird arches
>> supporting hugepage sizes that don't match page table levels. I recall
>> that e.g. MIPS could do arbitrary size, but dunno if the kernel supports
>> that...
> 
> arm64 seems to have pretty arbitrary sizes, and seems to be able to
> build them out of multiple hardware PTE sizes.  I think I can fix my
> code to handle those:
> 
>                 if (hpage_size >= PGD_SIZE)
>                         mss->rss_pgd += PGD_SIZE;
>                 else if (hpage_size >= PUD_SIZE)
>                         mss->rss_pud += PUD_SIZE;
>                 else if (hpage_size >= PMD_SIZE)
>                         mss->rss_pmd += PMD_SIZE;
>                 else
>                         mss->rss_pte += PAGE_SIZE;
> 
> But, I *think* that means that smaps_hugetlb_range() is *currently*
> broken for these intermediate arm64 sizes.  The code does:
> 
>                 if (mapcount >= 2)
>                         mss->shared_hugetlb += hpage_size;
>                 else
>                         mss->private_hugetlb += hpage_size;
> 
> So I *think* we may count a hugetlbfs arm64 CONT_PTES page multiple
> times, accounting hpage_size for *each* of the CONT_PTES.  That would
> artificially inflate the smaps output for those pages.

Hmm IIUC walk_hugetlb_range() will call the smaps_hugetlb_range()
callback once per hugepage, not once per "pte", no? See
hugetlb_entry_end(). In that case the current code should be OK and
yours would undercount?
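
Roughly, the stepping logic looks something like this (paraphrased from
mm/pagewalk.c, so treat it as a sketch rather than the exact code):

        /* Paraphrase of mm/pagewalk.c -- not verbatim. */
        static unsigned long hugetlb_entry_end(struct hstate *h,
                                               unsigned long addr,
                                               unsigned long end)
        {
                unsigned long boundary = (addr & huge_page_mask(h)) +
                                         huge_page_size(h);

                return boundary < end ? boundary : end;
        }

        static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                                      struct mm_walk *walk)
        {
                struct vm_area_struct *vma = walk->vma;
                struct hstate *h = hstate_vma(vma);
                unsigned long hmask = huge_page_mask(h);
                unsigned long next;
                pte_t *pte;
                int err = 0;

                do {
                        /* One step -- and one callback -- per hugepage. */
                        next = hugetlb_entry_end(h, addr, end);
                        pte = huge_pte_offset(walk->mm, addr & hmask);
                        if (pte && walk->hugetlb_entry)
                                err = walk->hugetlb_entry(pte, hmask, addr,
                                                          next, walk);
                        if (err)
                                break;
                } while (addr = next, addr != end);

                return err;
        }

So a CONT_PTES hugepage should produce exactly one callback, with
hpage_size covering the whole thing, rather than one callback per
constituent pte.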

> Will / Catalin, is there something I'm missing?
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] proc: mm: export PTE sizes directly in smaps (v2)
  2016-11-28 21:39         ` Dave Hansen
@ 2016-11-29 15:07           ` Catalin Marinas
  -1 siblings, 0 replies; 18+ messages in thread
From: Catalin Marinas @ 2016-11-29 15:07 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Vlastimil Babka, linux-kernel, hch, akpm, dan.j.williams,
	khandual, linux-mm, Will Deacon

On Mon, Nov 28, 2016 at 01:39:49PM -0800, Dave Hansen wrote:
> On 11/28/2016 01:07 PM, Vlastimil Babka wrote:
> > On 11/28/2016 05:52 PM, Dave Hansen wrote:
> >> On 11/24/2016 06:22 AM, Vlastimil Babka wrote:
> >>> On 11/17/2016 01:28 AM, Dave Hansen wrote:
> >>>> @@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
> >>>>      }
> >>>>      if (page) {
> >>>>          int mapcount = page_mapcount(page);
> >>>> +        unsigned long hpage_size = huge_page_size(hstate_vma(vma));
> >>>>
> >>>> +        mss->rss_pud += hpage_size;
> >>>
> >>> This hardcoded pud doesn't look right, doesn't the pmd/pud depend on
> >>> hpage_size?
> >>
> >> Urg, nope.  Thanks for noticing that!  I think we'll need something
> >> along the lines of:
> >>
> >>                 if (hpage_size == PUD_SIZE)
> >>                         mss->rss_pud += PUD_SIZE;
> >>                 else if (hpage_size == PMD_SIZE)
> >>                         mss->rss_pmd += PMD_SIZE;
> > 
> > Sounds better, although I wonder whether there are some weird arches
> > supporting hugepage sizes that don't match page table levels. I recall
> > that e.g. MIPS could do arbitrary size, but dunno if the kernel supports
> > that...
> 
> arm64 seems to have pretty arbitrary sizes, and seems to be able to
> build them out of multiple hardware PTE sizes.  I think I can fix my
> code to handle those:
> 
>                 if (hpage_size >= PGD_SIZE)
>                         mss->rss_pgd += PGD_SIZE;
>                 else if (hpage_size >= PUD_SIZE)
>                         mss->rss_pud += PUD_SIZE;
>                 else if (hpage_size >= PMD_SIZE)
>                         mss->rss_pmd += PMD_SIZE;
>                 else
>                         mss->rss_pte += PAGE_SIZE;
> 
> But, I *think* that means that smaps_hugetlb_range() is *currently*
> broken for these intermediate arm64 sizes.  The code does:
> 
>                 if (mapcount >= 2)
>                         mss->shared_hugetlb += hpage_size;
>                 else
>                         mss->private_hugetlb += hpage_size;
> 
> So I *think* we may count a hugetlbfs arm64 CONT_PTES page multiple
> times, accounting hpage_size for *each* of the CONT_PTES.  That would
> artificially inflate the smaps output for those pages.

I don't think it would count them multiple times. As Vlastimil
mentioned, huge_page_size() would return (CONT_PTES * PAGE_SIZE) in
such a case, so walk_hugetlb_range() skips the intermediate ptes. In
general, we try to keep the contiguous pte/pmd support visible only to
the arm64 hugetlb code and hidden from the core code.
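
As a concrete example -- assuming the common 4KB-granule configuration,
so the exact constants here are illustrative only -- the arm64 hugetlb
sizes work out to roughly:

        CONT_PTE hugepage:  16 contiguous PTEs x 4kB  =  64kB
        PMD hugepage:                                     2MB
        CONT_PMD hugepage:  16 contiguous PMDs x 2MB  =  32MB
        PUD hugepage:                                     1GB

huge_page_size() for a CONT_PTE vma is therefore 64kB, the walker steps
64kB at a time, and the callback sees each hugepage exactly once.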

-- 
Catalin

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2016-11-29 15:18 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-17  0:28 [PATCH] proc: mm: export PTE sizes directly in smaps (v2) Dave Hansen
2016-11-17  0:28 ` Dave Hansen
2016-11-24 14:22 ` Vlastimil Babka
2016-11-24 14:22   ` Vlastimil Babka
2016-11-28 16:52   ` Dave Hansen
2016-11-28 16:52     ` Dave Hansen
2016-11-28 21:07     ` Vlastimil Babka
2016-11-28 21:07       ` Vlastimil Babka
2016-11-28 21:39       ` Dave Hansen
2016-11-28 21:39         ` Dave Hansen
2016-11-29  8:01         ` Vlastimil Babka
2016-11-29  8:01           ` Vlastimil Babka
2016-11-29 15:07         ` Catalin Marinas
2016-11-29 15:07           ` Catalin Marinas
2016-11-25  4:00 ` Anshuman Khandual
2016-11-25  4:00   ` Anshuman Khandual
2016-11-28 17:00   ` Dave Hansen
2016-11-28 17:00     ` Dave Hansen
