[V4,01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE

Message ID 1548966486-49963-1-git-send-email-kan.liang@linux.intel.com
State New, archived
Series
  • [V4,01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE

Commit Message

Liang, Kan Jan. 31, 2019, 8:27 p.m. UTC
From: Kan Liang <kan.liang@linux.intel.com>

Currently perf can report both the virtual address and the physical
address of a sample, but not the page size. Users have no idea how
large the backing page is, so they cannot promote/demote large pages
to optimize memory use.

Add a new sample type for the data page size.

Perf already has a facility to collect the data virtual address. A
__weak function that retrieves the page size for a given virtual
address is introduced in the generic code; for now it always returns 0.
The function must be IRQ-safe.
This patch only implements an x86 specific version, which does a full
page-table walk of the given virtual address to retrieve the page size.
For x86, disabling IRQs over the walk is sufficient to prevent any
tear down of the page tables.
Other architectures can implement their own versions later.

The new sample type requires collecting the virtual address; the
virtual address will not be output unless PERF_SAMPLE_ADDR is also
applied.

A u64 is used for page_size because struct perf_sample_data is
____cacheline_aligned.

Large PEBS is disabled with this sample type, because flushing the
PEBS buffer for large PEBS would require munmap tracking, which perf
does not support yet. Large PEBS can be enabled separately later, once
munmap tracking is supported.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Use the real page size to replace enum.
- Modify the changelog to mention the generic support of
  __weak perf_get_page_size()

 arch/x86/events/core.c          | 31 +++++++++++++++++++++++++++++++
 arch/x86/events/intel/ds.c      |  3 ++-
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 15 +++++++++++++++
 5 files changed, 52 insertions(+), 2 deletions(-)
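
Purely for illustration (not part of this series): a minimal user-space
sketch of how the new sample type could be requested together with
PERF_SAMPLE_ADDR. The event encoding, sample period and precise_ip value
below are placeholders, not settings taken from the patch.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical usage sketch, assuming a kernel with this series applied. */
static int open_page_size_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_RAW;	/* placeholder memory-load event encoding */
	attr.sample_period = 10000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			   PERF_SAMPLE_DATA_PAGE_SIZE;
	attr.precise_ip = 2;		/* PEBS on x86; large PEBS stays disabled */
	attr.disabled = 1;

	/* current thread, any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

/*
 * With this sample_type, each PERF_RECORD_SAMPLE body carries, in order:
 *	u64 ip;			PERF_SAMPLE_IP
 *	u64 addr;		PERF_SAMPLE_ADDR
 *	u64 data_page_size;	PERF_SAMPLE_DATA_PAGE_SIZE (0 if unknown)
 */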

Comments

Peter Zijlstra Feb. 1, 2019, 9:22 a.m. UTC | #1
On Thu, Jan 31, 2019 at 12:27:54PM -0800, kan.liang@linux.intel.com wrote:
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 374a197..229a73b 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
>  	cap->events_mask_len	= x86_pmu.events_mask_len;
>  }
>  EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> +
> +u64 perf_get_page_size(u64 virt)
> +{
> +	unsigned long flags;
> +	unsigned int level;
> +	pte_t *pte;
> +
> +	if (!virt)
> +		return 0;
> +
> +	/*
> +	 * Interrupts are disabled, so it prevents any tear down
> +	 * of the page tables.
> +	 * See the comment near struct mmu_table_batch.
> +	 */
> +	local_irq_save(flags);
> +	if (virt >= TASK_SIZE)
> +		pte = lookup_address(virt, &level);
> +	else {
> +		if (current->mm) {
> +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> +						    virt, &level);
> +		} else
> +			level = PG_LEVEL_NUM;
> +	}
> +	local_irq_restore(flags);
> +	if (level >= PG_LEVEL_NUM)
> +		return 0;
> +
> +	return (u64)page_level_size(level);
> +}

*sigh* there really isn't anything x86 specific there.

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 236bb8d..d233f45 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
>  	return phys_addr;
>  }
>  
> +/* Return page size of given virtual address. IRQ-safe required. */
> +u64 __weak perf_get_page_size(u64 virt)
> +{
> +	return 0;
> +}
> +
>  static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>  
>  struct perf_callchain_entry *

How about something like so instead?

(completely untested, will likely make your grandma eat puppies)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6357,10 +6357,72 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
-/* Return page size of given virtual address. IRQ-safe required. */
-u64 __weak perf_get_page_size(u64 virt)
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
 {
-	return 0;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return 0;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return 0;
+
+	if (p4d_large(*p4d));
+		return 1ULL << P4D_SHIFT;
+
+	if (!p4d_present(*p4d))
+		return 0;
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return 0;
+
+	if (pud_large(*pud))
+		return 1ULL << PUD_SHIFT;
+
+	if (!pud_present(*pud))
+		return 0;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return 0;
+
+	if (pmd_large(*pmd))
+		return 1ULL << PMD_SHIFT;
+
+	if (!pmd_present(*pmd))
+		return 0;
+
+	return 1ULL << PAGE_SHIFT;
+}
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+	struct mm_struct *mm;
+	unsigned long flags;
+	u64 ret;
+
+	/*
+	 * Software page-table walkers must disable IRQs, see asm-generic/tlb.h.
+	 */
+	local_irq_save(flags);
+	mm = current->mm;
+	if (!mm) {
+		/*
+		 * For kernel threads and the like, use init_mm so that
+		 * we can find kernel memory.
+		 */
+		mm = &init_mm;
+	}
+	ret = __perf_get_page_size(mm, addr);
+	local_irq_restore(flags);
+
+	return ret;
 }
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
Peter Zijlstra Feb. 1, 2019, 10:03 a.m. UTC | #2
On Fri, Feb 01, 2019 at 10:22:40AM +0100, Peter Zijlstra wrote:

> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
>  {
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (pgd_none(*pgd))
> +		return 0;
> +
> +	p4d = p4d_offset(pgd, addr);
> +	if (p4d_none(*p4d))
> +		return 0;
> +
> +	if (p4d_large(*p4d));
> +		return 1ULL << P4D_SHIFT;
> +
> +	if (!p4d_present(*p4d))
> +		return 0;
> +
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return 0;
> +
> +	if (pud_large(*pud))
> +		return 1ULL << PUD_SHIFT;

Will just mentioned a lovely feature where some archs have multi entry
large pages.

Possible something like:

	if (pud_large(*pud)) {
		struct page *page = pud_page(*pud);
		int order = PUD_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			order += compound_order(page);
		}

		return 1ULL << order;
	}

works correctly.

> +
> +	if (!pud_present(*pud))
> +		return 0;
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (pmd_none(*pmd))
> +		return 0;
> +
> +	if (pmd_large(*pmd))
> +		return 1ULL << PMD_SHIFT;

And same here I suppose..

> +
> +	if (!pmd_present(*pmd))
> +		return 0;
> +
> +	return 1ULL << PAGE_SHIFT;
> +}
Kirill A. Shutemov Feb. 1, 2019, 10:34 a.m. UTC | #3
On Fri, Feb 01, 2019 at 10:22:40AM +0100, Peter Zijlstra wrote:
> On Thu, Jan 31, 2019 at 12:27:54PM -0800, kan.liang@linux.intel.com wrote:
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > index 374a197..229a73b 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
> >  	cap->events_mask_len	= x86_pmu.events_mask_len;
> >  }
> >  EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> > +
> > +u64 perf_get_page_size(u64 virt)
> > +{
> > +	unsigned long flags;
> > +	unsigned int level;
> > +	pte_t *pte;
> > +
> > +	if (!virt)
> > +		return 0;
> > +
> > +	/*
> > +	 * Interrupts are disabled, so it prevents any tear down
> > +	 * of the page tables.
> > +	 * See the comment near struct mmu_table_batch.
> > +	 */
> > +	local_irq_save(flags);
> > +	if (virt >= TASK_SIZE)
> > +		pte = lookup_address(virt, &level);
> > +	else {
> > +		if (current->mm) {
> > +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> > +						    virt, &level);
> > +		} else
> > +			level = PG_LEVEL_NUM;
> > +	}
> > +	local_irq_restore(flags);
> > +	if (level >= PG_LEVEL_NUM)
> > +		return 0;
> > +
> > +	return (u64)page_level_size(level);
> > +}
> 
> *sigh* there really isn't anything x86 specific there.
> 
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 236bb8d..d233f45 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
> >  	return phys_addr;
> >  }
> >  
> > +/* Return page size of given virtual address. IRQ-safe required. */
> > +u64 __weak perf_get_page_size(u64 virt)
> > +{
> > +	return 0;
> > +}
> > +
> >  static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
> >  
> >  struct perf_callchain_entry *
> 
> How about something like so instead?
> 
> (completely untested, will likely make your grandma eat puppies)
> 
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6357,10 +6357,72 @@ static u64 perf_virt_to_phys(u64 virt)
>  	return phys_addr;
>  }
>  
> -/* Return page size of given virtual address. IRQ-safe required. */
> -u64 __weak perf_get_page_size(u64 virt)
> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
>  {
> -	return 0;
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (pgd_none(*pgd))
> +		return 0;
> +
> +	p4d = p4d_offset(pgd, addr);
> +	if (p4d_none(*p4d))
> +		return 0;
> +
> +	if (p4d_large(*p4d));

We don't have 512GiB pages yet.

> +		return 1ULL << P4D_SHIFT;

		return P4D_SIZE;

And the same P?D_SIZE below.

> +
> +	if (!p4d_present(*p4d))
> +		return 0;

No need to check p4d_none() *and* p4d_present(). Just p4d_present() should
be enough. Large is still supposed to be present. The same for other levels.
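
Folding both remarks into the proposed walker (a single *_present() check
per level, and the P?D_SIZE macros instead of open-coded shifts) would give
a per-level pattern roughly like this untested sketch:

	/* pgd already looked up and checked, as in the proposal above */
	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return 0;
	if (p4d_large(*p4d))
		return P4D_SIZE;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return 0;
	if (pud_large(*pud))
		return PUD_SIZE;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return PMD_SIZE;

	return PAGE_SIZE;
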

> +
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return 0;
> +
> +	if (pud_large(*pud))
> +		return 1ULL << PUD_SHIFT;
> +
> +	if (!pud_present(*pud))
> +		return 0;
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (pmd_none(*pmd))
> +		return 0;
> +
> +	if (pmd_large(*pmd))
> +		return 1ULL << PMD_SHIFT;
> +
> +	if (!pmd_present(*pmd))
> +		return 0;
> +
> +	return 1ULL << PAGE_SHIFT;
> +}
> +
> +static u64 perf_get_page_size(unsigned long addr)
> +{
> +	struct mm_struct *mm;
> +	unsigned long flags;
> +	u64 ret;
> +
> +	/*
> +	 * Software page-table walkers must disable IRQs, see asm-generic/tlb.h.
> +	 */
> +	local_irq_save(flags);
> +	mm = current->mm;
> +	if (!mm) {
> +		/*
> +		 * For kernel threads and the like, use init_mm so that
> +		 * we can find kernel memory.
> +		 */
> +		mm = &init_mm;
> +	}
> +	ret = __perf_get_page_size(mm, addr);
> +	local_irq_restore(flags);
> +
> +	return ret;
>  }
>  
>  static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
Kirill A. Shutemov Feb. 1, 2019, 10:36 a.m. UTC | #4
On Fri, Feb 01, 2019 at 11:03:58AM +0100, Peter Zijlstra wrote:
> On Fri, Feb 01, 2019 at 10:22:40AM +0100, Peter Zijlstra wrote:
> 
> > +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> >  {
> > +	pgd_t *pgd;
> > +	p4d_t *p4d;
> > +	pud_t *pud;
> > +	pmd_t *pmd;
> > +
> > +	pgd = pgd_offset(mm, addr);
> > +	if (pgd_none(*pgd))
> > +		return 0;
> > +
> > +	p4d = p4d_offset(pgd, addr);
> > +	if (p4d_none(*p4d))
> > +		return 0;
> > +
> > +	if (p4d_large(*p4d));
> > +		return 1ULL << P4D_SHIFT;
> > +
> > +	if (!p4d_present(*p4d))
> > +		return 0;
> > +
> > +	pud = pud_offset(p4d, addr);
> > +	if (pud_none(*pud))
> > +		return 0;
> > +
> > +	if (pud_large(*pud))
> > +		return 1ULL << PUD_SHIFT;
> 
> Will just mentioned a lovely feature where some archs have multi entry
> large pages.
> 
> Possible something like:
> 
> 	if (pud_large(*pud)) {
> 		struct page *page = pud_page(*pud);
> 		int order = PUD_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			order += compound_order(page);
> 		}
> 
> 		return 1ULL << order;
> 	}
> 
> works correctly.

For more fun: some compound pages can be mapped with page table entries
not matching their compound size, i.e. 2M pages mapped with PTEs.
Peter Zijlstra Feb. 1, 2019, 12:43 p.m. UTC | #5
On Fri, Feb 01, 2019 at 01:36:00PM +0300, Kirill A. Shutemov wrote:
> On Fri, Feb 01, 2019 at 11:03:58AM +0100, Peter Zijlstra wrote:

> > Will just mentioned a lovely feature where some archs have multi entry
> > large pages.
> > 
> > Possible something like:
> > 
> > 	if (pud_large(*pud)) {
> > 		struct page *page = pud_page(*pud);
> > 		int order = PUD_SHIFT;
> > 
> > 		if (PageHuge(page)) {
> > 			page = compound_head(page);
> > 			order += compound_order(page);
> > 		}
> > 
> > 		return 1ULL << order;
> > 	}
> > 
> > works correctly.
> 
> For more fun: some compound pages can be mapped with page table entries
> not matching their compound size, i.e. 2M pages mapped with PTEs.

Surely not for PageHuge() ?! I thought the point of hugetlbfs was to
guarantee page granularity.

How is the below?

static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(mm, addr);
	if (pgd_none(*pgd))
		return 0;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_large(*p4d)) {
		struct page *page = p4d_page(*p4d);
		int shift = P4D_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			shift = PAGE_SHIFT + compound_order(page);
		}

		return 1ULL << shift;
	}

	if (!p4d_present(*p4d))
		return 0;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud)) {
		struct page *page = pud_page(*pud);
		int shift = P4D_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			shift = PAGE_SHIFT + compound_order(page);
		}

		return 1ULL << shift;
	}

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd)) {
		struct page *page = pud_page(*pud);
		int shift = P4D_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			shift = PAGE_SHIFT + compound_order(page);
		}

		return 1ULL << shift;
	}

	pte = pte_offset_map(pmd, addr);
	if (!pte_present(*pte)) {
		pte_unmap(pte);
		return 0;
	}

	pte_unmap(pte);
	return PAGE_SIZE;
}
Peter Zijlstra Feb. 1, 2019, 12:47 p.m. UTC | #6
On Fri, Feb 01, 2019 at 01:43:48PM +0100, Peter Zijlstra wrote:

> static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> {
> 	pgd_t *pgd;
> 	p4d_t *p4d;
> 	pud_t *pud;
> 	pmd_t *pmd;
> 	pte_t *pte;
> 
> 	pgd = pgd_offset(mm, addr);
> 	if (pgd_none(*pgd))
> 		return 0;
> 
> 	p4d = p4d_offset(pgd, addr);
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	if (p4d_large(*p4d)) {
> 		struct page *page = p4d_page(*p4d);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	pud = pud_offset(p4d, addr);
> 	if (!pud_present(*pud))
> 		return 0;
> 
> 	if (pud_large(*pud)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;

PUD_SHIFT

> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pmd = pmd_offset(pud, addr);
> 	if (!pmd_present(*pmd))
> 		return 0;
> 
> 	if (pmd_large(*pmd)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;

PMD_SHIFT

> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pte = pte_offset_map(pmd, addr);
> 	if (!pte_present(*pte)) {
> 		pte_unmap(pte);
> 		return 0;
> 	}
> 
> 	pte_unmap(pte);
> 	return PAGE_SIZE;
> }
Liang, Kan Feb. 1, 2019, 2:45 p.m. UTC | #7
On 2/1/2019 4:22 AM, Peter Zijlstra wrote:
> On Thu, Jan 31, 2019 at 12:27:54PM -0800, kan.liang@linux.intel.com wrote:
>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>> index 374a197..229a73b 100644
>> --- a/arch/x86/events/core.c
>> +++ b/arch/x86/events/core.c
>> @@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
>>   	cap->events_mask_len	= x86_pmu.events_mask_len;
>>   }
>>   EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
>> +
>> +u64 perf_get_page_size(u64 virt)
>> +{
>> +	unsigned long flags;
>> +	unsigned int level;
>> +	pte_t *pte;
>> +
>> +	if (!virt)
>> +		return 0;
>> +
>> +	/*
>> +	 * Interrupts are disabled, so it prevents any tear down
>> +	 * of the page tables.
>> +	 * See the comment near struct mmu_table_batch.
>> +	 */
>> +	local_irq_save(flags);
>> +	if (virt >= TASK_SIZE)
>> +		pte = lookup_address(virt, &level);
>> +	else {
>> +		if (current->mm) {
>> +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
>> +						    virt, &level);
>> +		} else
>> +			level = PG_LEVEL_NUM;
>> +	}
>> +	local_irq_restore(flags);
>> +	if (level >= PG_LEVEL_NUM)
>> +		return 0;
>> +
>> +	return (u64)page_level_size(level);
>> +}
> 
> *sigh* there really isn't anything x86 specific there.

OK. I will split the patch and move the common code to a dedicated patch 
in V5. I will try the proposed code and do some tests on X86.

>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 236bb8d..d233f45 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
>>   	return phys_addr;
>>   }
>>   
>> +/* Return page size of given virtual address. IRQ-safe required. */
>> +u64 __weak perf_get_page_size(u64 virt)
>> +{
>> +	return 0;
>> +}
>> +
>>   static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>>   
>>   struct perf_callchain_entry *
> 
> How about something like so instead?
> 
> (completely untested, will likely make your grandma eat puppies)

That's not funny!


Kan
Liang, Kan Feb. 1, 2019, 4:16 p.m. UTC | #8
On 2/1/2019 7:43 AM, Peter Zijlstra wrote:
> On Fri, Feb 01, 2019 at 01:36:00PM +0300, Kirill A. Shutemov wrote:
>> On Fri, Feb 01, 2019 at 11:03:58AM +0100, Peter Zijlstra wrote:
> 
>>> Will just mentioned a lovely feature where some archs have multi entry
>>> large pages.
>>>
>>> Possible something like:
>>>
>>> 	if (pud_large(*pud)) {
>>> 		struct page *page = pud_page(*pud);
>>> 		int order = PUD_SHIFT;
>>>
>>> 		if (PageHuge(page)) {
>>> 			page = compound_head(page);
>>> 			order += compound_order(page);
>>> 		}
>>>
>>> 		return 1ULL << order;
>>> 	}
>>>
>>> works correctly.
>>
>> For more fun: some compound pages can be mapped with page table entries
>> not matching their compound size, i.e. 2M pages mapped with PTEs.
> 
> Surely not for PageHuge() ?! I thought the point of hugetlbfs was to
> guarantee page granularity.
> 
> How is the below?
> 
> static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> {
> 	pgd_t *pgd;
> 	p4d_t *p4d;
> 	pud_t *pud;
> 	pmd_t *pmd;
> 	pte_t *pte;
> 
> 	pgd = pgd_offset(mm, addr);
> 	if (pgd_none(*pgd))
> 		return 0;
> 
> 	p4d = p4d_offset(pgd, addr);
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	if (p4d_large(*p4d)) {

This one looks like x86 specific?

Kan

> 		struct page *page = p4d_page(*p4d);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	pud = pud_offset(p4d, addr);
> 	if (!pud_present(*pud))
> 		return 0;
> 
> 	if (pud_large(*pud)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pmd = pmd_offset(pud, addr);
> 	if (!pmd_present(*pmd))
> 		return 0;
> 
> 	if (pmd_large(*pmd)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pte = pte_offset_map(pmd, addr);
> 	if (!pte_present(*pte)) {
> 		pte_unmap(pte);
> 		return 0;
> 	}
> 
> 	pte_unmap(pte);
> 	return PAGE_SIZE;
> }
>
Peter Zijlstra Feb. 4, 2019, 10:54 a.m. UTC | #9
On Fri, Feb 01, 2019 at 11:16:51AM -0500, Liang, Kan wrote:

> > 	if (p4d_large(*p4d)) {
> 
> This one looks like x86 specific?

> > 	if (pud_large(*pud)) {

> > 	if (pmd_large(*pmd)) {

Kirill did indeed note that p*_large() isn't universally available (but
there's definitely !x86 archs that have them). He also said it would
probably make sense to have them universally available and might help
clean up mm/gup.c a little.

A quick grep shows that: ARM, PowerPC, S390, Sparc and x86 have
'pmd_large'.

Anyway; it probably makes sense (and shouldn't be too hard) to fix up
all architectures to provide this.
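
One possible shape for that fixup, sketched here only to make the idea
concrete; it assumes architectures that already implement these helpers
also expose same-named macros, so the #ifndef tests only fire where the
helper is missing:

/* hypothetical fallbacks, e.g. in include/asm-generic/pgtable.h */
#ifndef p4d_large
#define p4d_large(p4d)	0	/* no leaf entries at this level */
#endif

#ifndef pud_large
#define pud_large(pud)	0
#endif

#ifndef pmd_large
#define pmd_large(pmd)	0
#endif

With such fallbacks a generic walker would simply never report a huge
mapping on architectures that do not provide the helpers.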
Liang, Kan Feb. 6, 2019, 8:23 p.m. UTC | #10
On 2/4/2019 5:54 AM, Peter Zijlstra wrote:
> On Fri, Feb 01, 2019 at 11:16:51AM -0500, Liang, Kan wrote:
> 
>>> 	if (p4d_large(*p4d)) {
>>
>> This one looks like x86 specific?
> 
>>> 	if (pud_large(*pud)) {
> 
>>> 	if (pmd_large(*pmd)) {
> 
> Kirill did indeed note that p*_large() isn't universally available (but
> there's definitely !x86 archs that have them). He also said it would
> probably make sense to have them universally available and might help
> clean up mm/gup.c a little.
> 
> A quick grep shows that: ARM, PowerPC, S390, Sparc and x86 have
> 'pmd_large'.
> 
> Anyway; it probably makes sense (and shouldn't be too hard) to fix up
> all architectures to provide this.
> 
Hi Peter and Kirill,

It looks like it's not easy to support get_page_size() universally.
Even the 'pmd_large' you mentioned is not universal; I got an error
message when building with ARCH=riscv.
There is even less support for pud_large and p4d_large.
We would have to check and add something like "#define p*d_large(a) 0"
to the page-table headers for each architecture. I think it's ugly.


> +               if (PageHuge(page)) {
> +                       page = compound_head(page);
> +                       shift = PAGE_SHIFT + compound_order(page);
> +               }

PageHuge() only returns true for hugetlbfs. Transparent huge pages
should also use compound pages, right? Besides hugetlbfs and THP, are
there any other cases which also use compound pages?
Can the code handle all of these cases?
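
For what it's worth, a variant of the PMD branch keyed off the compound
head, covering hugetlbfs and THP alike, might look like the sketch below.
Whether the compound size always matches what the entry actually maps is
exactly the open question above, so treat it as illustrative only:

	if (pmd_large(*pmd)) {
		struct page *page = pmd_page(*pmd);
		int shift = PMD_SHIFT;

		/* use the compound head for hugetlbfs and THP alike */
		if (PageCompound(page)) {
			page = compound_head(page);
			shift = PAGE_SHIFT + compound_order(page);
		}

		return 1ULL << shift;
	}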

> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
>  {
> +	pgd_t *pgd;
> +	p4d_t *p4d;

A universal get_page_size() function should not be implemented in perf;
it would be a problem for future maintenance.

All in all, I think we are far away from a universal get_page_size(). The
__weak function + x86 implementation proposed in this patch series
should be the better choice.
- Other architectures can add their own implementations later if they
want this feature.
- Standard page-table helper functions are used for x86, so maintenance
will not be a problem.

What do you think?

Thanks,
Kan

Patch

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 374a197..229a73b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2578,3 +2578,34 @@  void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 	cap->events_mask_len	= x86_pmu.events_mask_len;
 }
 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+
+u64 perf_get_page_size(u64 virt)
+{
+	unsigned long flags;
+	unsigned int level;
+	pte_t *pte;
+
+	if (!virt)
+		return 0;
+
+	/*
+	 * Interrupts are disabled, so it prevents any tear down
+	 * of the page tables.
+	 * See the comment near struct mmu_table_batch.
+	 */
+	local_irq_save(flags);
+	if (virt >= TASK_SIZE)
+		pte = lookup_address(virt, &level);
+	else {
+		if (current->mm) {
+			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
+						    virt, &level);
+		} else
+			level = PG_LEVEL_NUM;
+	}
+	local_irq_restore(flags);
+	if (level >= PG_LEVEL_NUM)
+		return 0;
+
+	return (u64)page_level_size(level);
+}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e9acf1d..720dc9e 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1274,7 +1274,8 @@  static void setup_pebs_sample_data(struct perf_event *event,
 	}
 
 
-	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR
+			    | PERF_SAMPLE_DATA_PAGE_SIZE)) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a79e59f..0e048ab 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -937,6 +937,7 @@  struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				data_page_size;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7198ddd..0e8d222 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@  enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -863,6 +864,7 @@  enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 236bb8d..d233f45 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1753,6 +1753,9 @@  static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		size += sizeof(data->data_page_size);
+
 	event->header_size = size;
 }
 
@@ -6305,6 +6308,9 @@  void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		perf_output_put(handle, data->data_page_size);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6352,6 +6358,12 @@  static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
+/* Return page size of given virtual address. IRQ-safe required. */
+u64 __weak perf_get_page_size(u64 virt)
+{
+	return 0;
+}
+
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
 struct perf_callchain_entry *
@@ -6493,6 +6505,9 @@  void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		data->data_page_size = perf_get_page_size(data->addr);
 }
 
 static __always_inline int