[V3,01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE

Message ID 1548858234-8872-1-git-send-email-kan.liang@linux.intel.com
State New
Series
  • [V3,01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE

Commit Message

Liang, Kan Jan. 30, 2019, 2:23 p.m. UTC
From: Kan Liang <kan.liang@linux.intel.com>

Current perf can report both the virtual and the physical address, but
it doesn't report the page size. Users have no idea how large the
utilized page is, so they cannot promote/demote large pages to optimize
memory use.

Add a new sample type for data page size.

Perf already has a facility to collect the data virtual address. A
function that retrieves the page size by a full page-table walk of a
given virtual address is introduced for x86. Other architectures can
implement their own functions separately later.
The function must be IRQ-safe. For x86, disabling IRQs over the walk is
sufficient to prevent any teardown of the page tables.

The new sample type requires collecting the virtual address. The
virtual address itself will not be output unless PERF_SAMPLE_ADDR is
also set.
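
For example, a hypothetical userspace setup (the event choice is purely
illustrative):

	struct perf_event_attr attr = {};

	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CACHE_MISSES;
	attr.sample_period = 10007;
	/*
	 * The page size is looked up from the sampled data address;
	 * the address itself is only emitted because PERF_SAMPLE_ADDR
	 * is set as well.
	 */
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			   PERF_SAMPLE_DATA_PAGE_SIZE;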

Although only a few bits are needed to indicate the page size, a u64 is
still claimed for data_page_size, because struct perf_sample_data is
____cacheline_aligned and a narrower type would not save any space.
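
On the sample record side the field is likewise a full u64, appended
after phys_addr. A minimal reader-side sketch, assuming sample_type is
exactly PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE:

	struct sample {
		struct perf_event_header	header;
		__u64	ip;			/* PERF_SAMPLE_IP */
		__u64	addr;			/* PERF_SAMPLE_ADDR */
		__u64	data_page_size;		/* PERF_SAMPLE_DATA_PAGE_SIZE */
	};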

Large PEBS is disabled with this sample type, because we would need to
track munmap to flush the PEBS buffer for large PEBS, and perf doesn't
support munmap tracking yet. Large PEBS can be enabled separately
later, once munmap tracking is supported.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V2
- Don't fetch the page size of a user address if current->mm is NULL.

 arch/x86/events/core.c          | 42 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/events/intel/ds.c      |  3 ++-
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h | 16 +++++++++++++++-
 kernel/events/core.c            | 15 +++++++++++++++
 5 files changed, 75 insertions(+), 2 deletions(-)

Comments

Peter Zijlstra Jan. 31, 2019, 12:37 p.m. UTC | #1
On Wed, Jan 30, 2019 at 06:23:42AM -0800, kan.liang@linux.intel.com wrote:
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 374a197..03bf45d 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2578,3 +2578,45 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
>  	cap->events_mask_len	= x86_pmu.events_mask_len;
>  }
>  EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> +
> +/*
> + * map x86 page levels to perf page sizes
> + */
> +static const enum perf_page_size perf_page_size_map[PG_LEVEL_NUM] = {
> +	[PG_LEVEL_NONE] = PERF_PAGE_SIZE_NONE,
> +	[PG_LEVEL_4K]   = PERF_PAGE_SIZE_4K,
> +	[PG_LEVEL_2M]   = PERF_PAGE_SIZE_2M,
> +	[PG_LEVEL_1G]   = PERF_PAGE_SIZE_1G,
> +	[PG_LEVEL_512G] = PERF_PAGE_SIZE_512G,
> +};
> +
> +u64 perf_get_page_size(u64 virt)
> +{
> +	unsigned long flags;
> +	unsigned int level;
> +	pte_t *pte;
> +
> +	if (!virt)
> +		return 0;
> +
> +	/*
> +	 * Interrupts are disabled, so it prevents any tear down
> +	 * of the page tables.
> +	 * See the comment near struct mmu_table_batch.
> +	 */
> +	local_irq_save(flags);
> +	if (virt >= TASK_SIZE)
> +		pte = lookup_address(virt, &level);
> +	else {
> +		if (current->mm)
> +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> +						    virt, &level);

Aside from all the missing {}, I'm fairly sure this is broken since this
happens from NMI context. This can interrupt switch_mm() and things like
use_temporary_mm().

Also; why does this live in the x86 code and not in the generic code?

> +		else
> +			level = PG_LEVEL_NUM;
> +	}
> +	local_irq_restore(flags);
> +	if (level >= PG_LEVEL_NUM)
> +		return PERF_PAGE_SIZE_NONE;
> +
> +	return (u64)perf_page_size_map[level];
> +}

> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 7198ddd..79daacd 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
>  	PERF_SAMPLE_TRANSACTION			= 1U << 17,
>  	PERF_SAMPLE_REGS_INTR			= 1U << 18,
>  	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
> +	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
>  
> -	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
> +	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
>  
>  	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
>  };
> @@ -863,6 +864,7 @@ enum perf_event_type {
>  	 *	{ u64			abi; # enum perf_sample_regs_abi
>  	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>  	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> +	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
>  	 * };
>  	 */
>  	PERF_RECORD_SAMPLE			= 9,
> @@ -1150,6 +1152,18 @@ union perf_mem_data_src {
>  #define PERF_MEM_S(a, s) \
>  	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>  
> +
> +enum perf_page_size {
> +	PERF_PAGE_SIZE_NONE,
> +	PERF_PAGE_SIZE_4K,
> +	PERF_PAGE_SIZE_8K,
> +	PERF_PAGE_SIZE_16K,
> +	PERF_PAGE_SIZE_64K,
> +	PERF_PAGE_SIZE_2M,
> +	PERF_PAGE_SIZE_1G,
> +	PERF_PAGE_SIZE_512G,
> +};

Since you have a u64 to store this in, WTH do you use this limited enum?
Are you very sure this covers all the possible page sizes for all
architectures?

Why not simply report the page size in bytes?
Peter Zijlstra Jan. 31, 2019, 12:59 p.m. UTC | #2
On Thu, Jan 31, 2019 at 01:37:25PM +0100, Peter Zijlstra wrote:
> On Wed, Jan 30, 2019 at 06:23:42AM -0800, kan.liang@linux.intel.com wrote:
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > index 374a197..03bf45d 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -2578,3 +2578,45 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
> >  	cap->events_mask_len	= x86_pmu.events_mask_len;
> >  }
> >  EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> > +
> > +/*
> > + * map x86 page levels to perf page sizes
> > + */
> > +static const enum perf_page_size perf_page_size_map[PG_LEVEL_NUM] = {
> > +	[PG_LEVEL_NONE] = PERF_PAGE_SIZE_NONE,
> > +	[PG_LEVEL_4K]   = PERF_PAGE_SIZE_4K,
> > +	[PG_LEVEL_2M]   = PERF_PAGE_SIZE_2M,
> > +	[PG_LEVEL_1G]   = PERF_PAGE_SIZE_1G,
> > +	[PG_LEVEL_512G] = PERF_PAGE_SIZE_512G,
> > +};
> > +
> > +u64 perf_get_page_size(u64 virt)
> > +{
> > +	unsigned long flags;
> > +	unsigned int level;
> > +	pte_t *pte;
> > +
> > +	if (!virt)
> > +		return 0;
> > +
> > +	/*
> > +	 * Interrupts are disabled, so it prevents any tear down
> > +	 * of the page tables.
> > +	 * See the comment near struct mmu_table_batch.
> > +	 */
> > +	local_irq_save(flags);
> > +	if (virt >= TASK_SIZE)
> > +		pte = lookup_address(virt, &level);
> > +	else {
> > +		if (current->mm)
> > +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> > +						    virt, &level);
> 
> Aside from all the missing {}, I'm fairly sure this is broken since this
> happens from NMI context. This can interrupt switch_mm() and things like
> use_temporary_mm().

Ah, I'm confused again. This is a software page-table walk and is not
affected by the current CR3 state, which is much safer.

The rest of the comment still applies, of course.
Andi Kleen Jan. 31, 2019, 1:10 p.m. UTC | #3
> 
> Aside from all the missing {}, I'm fairly sure this is broken since this
> happens from NMI context. This can interrupt switch_mm() and things like
> use_temporary_mm().

So the concern is that the sample is from before the switch, and we
then look it up in the wrong page tables if the PMI happens after the
switch due to sampling skid?

First, this can happen only with PEBS, which doesn't have that bad a
worst-case skid (perhaps tens of cycles).

I doubt it is very likely: the problem can only happen for user
addresses, because the kernel page tables don't change.

But we would be in the middle of the context switch (or
use_temporary_mm()) here, and there should be no user-space accesses
within a window of tens of cycles (except perhaps for the rseq address,
but that's not a very interesting case).

I assume the use_temporary_mm() cases are similar.

I suppose we could enforce flushing the PMU on such context switches,
but I suspect that, while this is a valid theoretical problem, it's
unlikely to be a real one in practice.

Likely it means that large-buffer PEBS can never be used with this
option, but I guess that's OK.

-Andi
Liang, Kan Jan. 31, 2019, 2:58 p.m. UTC | #4
On 1/31/2019 7:37 AM, Peter Zijlstra wrote:
> On Wed, Jan 30, 2019 at 06:23:42AM -0800, kan.liang@linux.intel.com wrote:
>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>> index 374a197..03bf45d 100644
>> --- a/arch/x86/events/core.c
>> +++ b/arch/x86/events/core.c
>> @@ -2578,3 +2578,45 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
>>   	cap->events_mask_len	= x86_pmu.events_mask_len;
>>   }
>>   EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
>> +
>> +/*
>> + * map x86 page levels to perf page sizes
>> + */
>> +static const enum perf_page_size perf_page_size_map[PG_LEVEL_NUM] = {
>> +	[PG_LEVEL_NONE] = PERF_PAGE_SIZE_NONE,
>> +	[PG_LEVEL_4K]   = PERF_PAGE_SIZE_4K,
>> +	[PG_LEVEL_2M]   = PERF_PAGE_SIZE_2M,
>> +	[PG_LEVEL_1G]   = PERF_PAGE_SIZE_1G,
>> +	[PG_LEVEL_512G] = PERF_PAGE_SIZE_512G,
>> +};
>> +
>> +u64 perf_get_page_size(u64 virt)
>> +{
>> +	unsigned long flags;
>> +	unsigned int level;
>> +	pte_t *pte;
>> +
>> +	if (!virt)
>> +		return 0;
>> +
>> +	/*
>> +	 * Interrupts are disabled, so it prevents any tear down
>> +	 * of the page tables.
>> +	 * See the comment near struct mmu_table_batch.
>> +	 */
>> +	local_irq_save(flags);
>> +	if (virt >= TASK_SIZE)
>> +		pte = lookup_address(virt, &level);
>> +	else {
>> +		if (current->mm)
>> +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
>> +						    virt, &level);
> 
> Aside from all the missing {}, I'm fairly sure this is broken since this
> happens from NMI context. This can interrupt switch_mm() and things like
> use_temporary_mm().
> 
> Also; why does this live in the x86 code and not in the generic code?
>

This is the x86 implementation.
In the generic code there is a __weak function. I'll make that clear in
the change log in v4.

+/* Return the page size of the given virtual address. Must be IRQ-safe. */
+u64 __weak perf_get_page_size(u64 virt)
+{
+	return PERF_PAGE_SIZE_NONE;
+}


>> +		else
>> +			level = PG_LEVEL_NUM;
>> +	}
>> +	local_irq_restore(flags);
>> +	if (level >= PG_LEVEL_NUM)
>> +		return PERF_PAGE_SIZE_NONE;
>> +
>> +	return (u64)perf_page_size_map[level];
>> +}
> 
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 7198ddd..79daacd 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
>>   	PERF_SAMPLE_TRANSACTION			= 1U << 17,
>>   	PERF_SAMPLE_REGS_INTR			= 1U << 18,
>>   	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
>> +	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
>>   
>> -	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
>> +	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
>>   
>>   	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
>>   };
>> @@ -863,6 +864,7 @@ enum perf_event_type {
>>   	 *	{ u64			abi; # enum perf_sample_regs_abi
>>   	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>>   	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
>> +	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
>>   	 * };
>>   	 */
>>   	PERF_RECORD_SAMPLE			= 9,
>> @@ -1150,6 +1152,18 @@ union perf_mem_data_src {
>>   #define PERF_MEM_S(a, s) \
>>   	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>>   
>> +
>> +enum perf_page_size {
>> +	PERF_PAGE_SIZE_NONE,
>> +	PERF_PAGE_SIZE_4K,
>> +	PERF_PAGE_SIZE_8K,
>> +	PERF_PAGE_SIZE_16K,
>> +	PERF_PAGE_SIZE_64K,
>> +	PERF_PAGE_SIZE_2M,
>> +	PERF_PAGE_SIZE_1G,
>> +	PERF_PAGE_SIZE_512G,
>> +};
> 
> Since you have a u64 to store this in, WTH do you use this limited enum?
> Are you very sure this covers all the possible page sizes for all
> architectures?
> 
> Why not simply report the page size in bytes?
> 

I will report the page size in bytes instead in V4.
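
Roughly like the following untested sketch, which also adds the missing
{} and drops the enum in favour of bytes via x86's page_level_size()
helper:

u64 perf_get_page_size(u64 virt)
{
	unsigned long flags;
	unsigned int level;
	pte_t *pte;

	if (!virt)
		return 0;

	/* See the comment near struct mmu_table_batch. */
	local_irq_save(flags);
	if (virt >= TASK_SIZE) {
		pte = lookup_address(virt, &level);
	} else if (current->mm) {
		pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
					    virt, &level);
	} else {
		pte = NULL;
	}
	local_irq_restore(flags);

	/* Report the page size in bytes; 0 when the walk found nothing. */
	return pte ? (u64)page_level_size(level) : 0;
}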

Thanks,
Kan
Will Deacon Feb. 1, 2019, 5:02 a.m. UTC | #5
On Thu, Jan 31, 2019 at 01:37:25PM +0100, Peter Zijlstra wrote:
> On Wed, Jan 30, 2019 at 06:23:42AM -0800, kan.liang@linux.intel.com wrote:
> > +enum perf_page_size {
> > +	PERF_PAGE_SIZE_NONE,
> > +	PERF_PAGE_SIZE_4K,
> > +	PERF_PAGE_SIZE_8K,
> > +	PERF_PAGE_SIZE_16K,
> > +	PERF_PAGE_SIZE_64K,
> > +	PERF_PAGE_SIZE_2M,
> > +	PERF_PAGE_SIZE_1G,
> > +	PERF_PAGE_SIZE_512G,
> > +};
> 
> Since you have a u64 to store this in, WTH do you use this limited enum?
> Are you very sure this covers all the possible page sizes for all
> architectures?

FWIW, this covers the basic page sizes on arm64, but it doesn't cover all of
the hugepage sizes.

Will
Thomas Gleixner Feb. 8, 2019, 10:39 a.m. UTC | #6
On Thu, 31 Jan 2019, Liang, Kan wrote:
> > > +u64 perf_get_page_size(u64 virt)
> > > +{
> > > +	unsigned long flags;
> > > +	unsigned int level;
> > > +	pte_t *pte;
> > > +
> > > +	if (!virt)
> > > +		return 0;
> > > +
> > > +	/*
> > > +	 * Interrupts are disabled, so it prevents any tear down
> > > +	 * of the page tables.
> > > +	 * See the comment near struct mmu_table_batch.
> > > +	 */
> > > +	local_irq_save(flags);
> > > +	if (virt >= TASK_SIZE)
> > > +		pte = lookup_address(virt, &level);
> > > +	else {
> > > +		if (current->mm)
> > > +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> > > +						    virt, &level);
> > 
> > Aside from all the missing {}, I'm fairly sure this is broken since this
> > happens from NMI context. This can interrupt switch_mm() and things like
> > use_temporary_mm().
> > 
> > Also; why does this live in the x86 code and not in the generic code?
> > 
> 
> This is the x86 implementation.
> In the generic code there is a __weak function. I'll make that clear in
> the change log in v4.

No, instead of hiding it in the changelog, split the patch into two:

 #1 Adding the core stuff including the weak function

 #2 Adding the x86 implementation.

Thanks,

	tglx
Liang, Kan Feb. 8, 2019, 1:35 p.m. UTC | #7
On 2/8/2019 5:39 AM, Thomas Gleixner wrote:
> On Thu, 31 Jan 2019, Liang, Kan wrote:
>>>> +u64 perf_get_page_size(u64 virt)
>>>> +{
>>>> +	unsigned long flags;
>>>> +	unsigned int level;
>>>> +	pte_t *pte;
>>>> +
>>>> +	if (!virt)
>>>> +		return 0;
>>>> +
>>>> +	/*
>>>> +	 * Interrupts are disabled, so it prevents any tear down
>>>> +	 * of the page tables.
>>>> +	 * See the comment near struct mmu_table_batch.
>>>> +	 */
>>>> +	local_irq_save(flags);
>>>> +	if (virt >= TASK_SIZE)
>>>> +		pte = lookup_address(virt, &level);
>>>> +	else {
>>>> +		if (current->mm)
>>>> +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
>>>> +						    virt, &level);
>>>
>>> Aside from all the missing {}, I'm fairly sure this is broken since this
>>> happens from NMI context. This can interrupt switch_mm() and things like
>>> use_temporary_mm().
>>>
>>> Also; why does this live in the x86 code and not in the generic code?
>>>
>>
>> This is the x86 implementation.
>> In the generic code there is a __weak function. I'll make that clear in
>> the change log in v4.
> 
> No, instead of hiding it in the changelog, split the patch into two:
> 
>   #1 Adding the core stuff including the weak function
> 
>   #2 Adding the x86 implementation.
> 

Thanks for the comments. I will do it in V5.

Thanks,
Kan

Patch

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 374a197..03bf45d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2578,3 +2578,45 @@  void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 	cap->events_mask_len	= x86_pmu.events_mask_len;
 }
 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+
+/*
+ * map x86 page levels to perf page sizes
+ */
+static const enum perf_page_size perf_page_size_map[PG_LEVEL_NUM] = {
+	[PG_LEVEL_NONE] = PERF_PAGE_SIZE_NONE,
+	[PG_LEVEL_4K]   = PERF_PAGE_SIZE_4K,
+	[PG_LEVEL_2M]   = PERF_PAGE_SIZE_2M,
+	[PG_LEVEL_1G]   = PERF_PAGE_SIZE_1G,
+	[PG_LEVEL_512G] = PERF_PAGE_SIZE_512G,
+};
+
+u64 perf_get_page_size(u64 virt)
+{
+	unsigned long flags;
+	unsigned int level;
+	pte_t *pte;
+
+	if (!virt)
+		return 0;
+
+	/*
+	 * Interrupts are disabled, so it prevents any tear down
+	 * of the page tables.
+	 * See the comment near struct mmu_table_batch.
+	 */
+	local_irq_save(flags);
+	if (virt >= TASK_SIZE)
+		pte = lookup_address(virt, &level);
+	else {
+		if (current->mm)
+			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
+						    virt, &level);
+		else
+			level = PG_LEVEL_NUM;
+	}
+	local_irq_restore(flags);
+	if (level >= PG_LEVEL_NUM)
+		return PERF_PAGE_SIZE_NONE;
+
+	return (u64)perf_page_size_map[level];
+}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e9acf1d..720dc9e 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1274,7 +1274,8 @@  static void setup_pebs_sample_data(struct perf_event *event,
 	}
 
 
-	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR
+			    | PERF_SAMPLE_DATA_PAGE_SIZE)) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a79e59f..0e048ab 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -937,6 +937,7 @@  struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				data_page_size;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7198ddd..79daacd 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@  enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -863,6 +864,7 @@  enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
@@ -1150,6 +1152,18 @@  union perf_mem_data_src {
 #define PERF_MEM_S(a, s) \
 	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
 
+
+enum perf_page_size {
+	PERF_PAGE_SIZE_NONE,
+	PERF_PAGE_SIZE_4K,
+	PERF_PAGE_SIZE_8K,
+	PERF_PAGE_SIZE_16K,
+	PERF_PAGE_SIZE_64K,
+	PERF_PAGE_SIZE_2M,
+	PERF_PAGE_SIZE_1G,
+	PERF_PAGE_SIZE_512G,
+};
+
 /*
  * single taken branch record layout:
  *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 236bb8d..eb721b5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1753,6 +1753,9 @@  static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		size += sizeof(data->data_page_size);
+
 	event->header_size = size;
 }
 
@@ -6305,6 +6308,9 @@  void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		perf_output_put(handle, data->data_page_size);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6352,6 +6358,12 @@  static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
+/* Return the page size of the given virtual address. Must be IRQ-safe. */
+u64 __weak perf_get_page_size(u64 virt)
+{
+	return PERF_PAGE_SIZE_NONE;
+}
+
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
 struct perf_callchain_entry *
@@ -6493,6 +6505,9 @@  void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		data->data_page_size = perf_get_page_size(data->addr);
 }
 
 static __always_inline int