All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
@ 2017-08-23 14:22 kan.liang
  2017-08-23 14:33 ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: kan.liang @ 2017-08-23 14:22 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel; +Cc: acme, jolsa, tglx, eranian, ak, Kan Liang

From: Kan Liang <kan.liang@intel.com>

For understanding how the workload maps to memory channels and hardware
behavior, it's very important to collect address maps with physical
addresses. For example, 3D XPoint access can only be found by filtering
the physical address.
However, perf doesn't collect physical address information in sampling.

The virtual address which stored in data->addr can be used to calculate
the physical address.
For kernel direct mapping addresses, virt_to_phys is used to convert the
virtual addresses to physical address.
For user virtual addresses, __get_user_pages_fast is used to walk the
pages tables for user physical address.
This does not work for vmalloc addresses. Right now these are not
resolved, but code to do that could be added.
For security, the physical address can only be exposed to root or
privileged user.
A new sample type PERF_SAMPLE_PHYS_ADDR is introduced to expose the
physical addresses.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---

This patch is kernel patch.
The user space patch can be found here.
https://www.spinics.net/lists/kernel/msg2587093.html

Changes since V5
 - Move virt_to_phys to generic code (PeterZ)

 arch/x86/events/perf_event.h    |  2 +-
 include/linux/perf_event.h      |  2 ++
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 47 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 476aec3..65bb91e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
 	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-	PERF_SAMPLE_TRANSACTION)
+	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b14095b..74fb87e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -944,6 +944,8 @@ struct perf_sample_data {
 
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
+
+	u64				phys_addr;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 642db5f..cbea02f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -814,6 +815,7 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d704e23..b991af3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1570,6 +1570,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -6012,6 +6015,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6027,6 +6033,38 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 }
 
+static u64 perf_virt_to_phys(u64 virt)
+{
+	u64 phys_addr = 0;
+	struct page *p = NULL;
+
+	if (!virt)
+		return 0;
+
+	if (virt >= TASK_SIZE) {
+		/* If it's vmalloc()d memory, leave phys_addr as 0 */
+		if (virt_addr_valid(virt) &&
+		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
+			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+	} else {
+		/*
+		 * Walking the pages tables for user address.
+		 * Interrupts are disabled, so it prevents any tear down
+		 * of the page tables.
+		 * Try IRQ-safe __get_user_pages_fast first.
+		 * If failed, leave phys_addr as 0.
+		 */
+		if ((current->mm != NULL) &&
+		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+		if (p)
+			put_page(p);
+	}
+
+	return phys_addr;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
 			 struct perf_sample_data *data,
 			 struct perf_event *event,
@@ -6145,6 +6183,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
 static void __always_inline
@@ -9892,6 +9933,12 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get kernel addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel() &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	if (!attr.sample_max_stack)
 		attr.sample_max_stack = sysctl_perf_event_max_stack;
 
-- 
2.4.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-23 14:22 [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
@ 2017-08-23 14:33 ` Peter Zijlstra
  2017-08-23 14:39   ` Peter Zijlstra
  2017-08-24  1:26   ` Michael Ellerman
  0 siblings, 2 replies; 10+ messages in thread
From: Peter Zijlstra @ 2017-08-23 14:33 UTC (permalink / raw)
  To: kan.liang; +Cc: mingo, linux-kernel, acme, jolsa, tglx, eranian, ak, mpe, maddy


Michael, Maddy,

Since PPC implements PERF_SAMPLE_ADDR this affects you guys too, please
have a look.

On Wed, Aug 23, 2017 at 10:22:46AM -0400, kan.liang@intel.com wrote:
> From: Kan Liang <kan.liang@intel.com>
> 
> For understanding how the workload maps to memory channels and hardware
> behavior, it's very important to collect address maps with physical
> addresses. For example, 3D XPoint access can only be found by filtering
> the physical address.
> However, perf doesn't collect physical address information in sampling.
> 
> The virtual address which stored in data->addr can be used to calculate
> the physical address.
> For kernel direct mapping addresses, virt_to_phys is used to convert the
> virtual addresses to physical address.
> For user virtual addresses, __get_user_pages_fast is used to walk the
> pages tables for user physical address.
> This does not work for vmalloc addresses. Right now these are not
> resolved, but code to do that could be added.
> For security, the physical address can only be exposed to root or
> privileged user.
> A new sample type PERF_SAMPLE_PHYS_ADDR is introduced to expose the
> physical addresses.

Please reflow that..

> Signed-off-by: Kan Liang <kan.liang@intel.com>
> ---
> 
> This patch is kernel patch.
> The user space patch can be found here.
> https://www.spinics.net/lists/kernel/msg2587093.html
> 
> Changes since V5
>  - Move virt_to_phys to generic code (PeterZ)
> 
>  arch/x86/events/perf_event.h    |  2 +-
>  include/linux/perf_event.h      |  2 ++
>  include/uapi/linux/perf_event.h |  4 +++-
>  kernel/events/core.c            | 47 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 53 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index 476aec3..65bb91e 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -91,7 +91,7 @@ struct amd_nb {
>  	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
>  	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
>  	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> -	PERF_SAMPLE_TRANSACTION)
> +	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
>  
>  /*
>   * A debug store configuration.
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index b14095b..74fb87e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -944,6 +944,8 @@ struct perf_sample_data {
>  
>  	struct perf_regs		regs_intr;
>  	u64				stack_user_size;
> +
> +	u64				phys_addr;
>  } ____cacheline_aligned;
>  
>  /* default value for data source */
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 642db5f..cbea02f 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -139,8 +139,9 @@ enum perf_event_sample_format {
>  	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
>  	PERF_SAMPLE_TRANSACTION			= 1U << 17,
>  	PERF_SAMPLE_REGS_INTR			= 1U << 18,
> +	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
>  
> -	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
> +	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
>  };
>  
>  /*
> @@ -814,6 +815,7 @@ enum perf_event_type {
>  	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
>  	 *	{ u64			abi; # enum perf_sample_regs_abi
>  	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> +	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
>  	 * };
>  	 */
>  	PERF_RECORD_SAMPLE			= 9,
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index d704e23..b991af3 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1570,6 +1570,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
>  	if (sample_type & PERF_SAMPLE_TRANSACTION)
>  		size += sizeof(data->txn);
>  
> +	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +		size += sizeof(data->phys_addr);
> +
>  	event->header_size = size;
>  }
>  
> @@ -6012,6 +6015,9 @@ void perf_output_sample(struct perf_output_handle *handle,
>  		}
>  	}
>  
> +	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +		perf_output_put(handle, data->phys_addr);
> +
>  	if (!event->attr.watermark) {
>  		int wakeup_events = event->attr.wakeup_events;
>  
> @@ -6027,6 +6033,38 @@ void perf_output_sample(struct perf_output_handle *handle,
>  	}
>  }
>  
> +static u64 perf_virt_to_phys(u64 virt)
> +{
> +	u64 phys_addr = 0;
> +	struct page *p = NULL;
> +
> +	if (!virt)
> +		return 0;
> +
> +	if (virt >= TASK_SIZE) {
> +		/* If it's vmalloc()d memory, leave phys_addr as 0 */
> +		if (virt_addr_valid(virt) &&
> +		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
> +			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
> +	} else {
> +		/*
> +		 * Walking the pages tables for user address.
> +		 * Interrupts are disabled, so it prevents any tear down
> +		 * of the page tables.
> +		 * Try IRQ-safe __get_user_pages_fast first.
> +		 * If failed, leave phys_addr as 0.
> +		 */
> +		if ((current->mm != NULL) &&
> +		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
> +			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
> +
> +		if (p)
> +			put_page(p);
> +	}
> +
> +	return phys_addr;
> +}

Michael, does this work for PPC as is?

> +
>  void perf_prepare_sample(struct perf_event_header *header,
>  			 struct perf_sample_data *data,
>  			 struct perf_event *event,
> @@ -6145,6 +6183,9 @@ void perf_prepare_sample(struct perf_event_header *header,
>  
>  		header->size += size;
>  	}
> +
> +	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +		data->phys_addr = perf_virt_to_phys(data->addr);

Only problem with this now is that it requires SAMPLE_ADDR to also be
set in order to obtain data->addr.

Either fix all to set data->attr for (SAMPLE_ADDR || SAMPLE_PHYS_ADDR)
or mandate SAMPLE_ADDR when SAMPLE_PHYS_ADDR.

>  }
>  
>  static void __always_inline
> @@ -9892,6 +9933,12 @@ SYSCALL_DEFINE5(perf_event_open,
>  			return -EINVAL;
>  	}
>  
> +	/* Only privileged users can get kernel addresses */
> +	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
> +	    perf_paranoid_kernel() &&
> +	    !capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
>  	if (!attr.sample_max_stack)
>  		attr.sample_max_stack = sysctl_perf_event_max_stack;
>  
> -- 
> 2.4.3
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-23 14:33 ` Peter Zijlstra
@ 2017-08-23 14:39   ` Peter Zijlstra
  2017-08-23 17:00     ` Stephane Eranian
  2017-08-23 18:01     ` Liang, Kan
  2017-08-24  1:26   ` Michael Ellerman
  1 sibling, 2 replies; 10+ messages in thread
From: Peter Zijlstra @ 2017-08-23 14:39 UTC (permalink / raw)
  To: kan.liang; +Cc: mingo, linux-kernel, acme, jolsa, tglx, eranian, ak, mpe, maddy

On Wed, Aug 23, 2017 at 04:33:08PM +0200, Peter Zijlstra wrote:
> > @@ -6145,6 +6183,9 @@ void perf_prepare_sample(struct perf_event_header *header,
> >  
> >  		header->size += size;
> >  	}
> > +
> > +	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> > +		data->phys_addr = perf_virt_to_phys(data->addr);
> 
> Only problem with this now is that it requires SAMPLE_ADDR to also be
> set in order to obtain data->addr.
> 
> Either fix all to set data->attr for (SAMPLE_ADDR || SAMPLE_PHYS_ADDR)
> or mandate SAMPLE_ADDR when SAMPLE_PHYS_ADDR.

I think the former suggestion is better, as it allows for smaller
samples.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-23 14:39   ` Peter Zijlstra
@ 2017-08-23 17:00     ` Stephane Eranian
  2017-08-31 16:44       ` Arnaldo Carvalho de Melo
  2017-08-23 18:01     ` Liang, Kan
  1 sibling, 1 reply; 10+ messages in thread
From: Stephane Eranian @ 2017-08-23 17:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Liang, Kan, mingo, LKML, Arnaldo Carvalho de Melo, Jiri Olsa,
	Thomas Gleixner, ak, Michael Ellerman, Madhavan Srinivasan

On Wed, Aug 23, 2017 at 7:39 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Aug 23, 2017 at 04:33:08PM +0200, Peter Zijlstra wrote:
>> > @@ -6145,6 +6183,9 @@ void perf_prepare_sample(struct perf_event_header *header,
>> >
>> >             header->size += size;
>> >     }
>> > +
>> > +   if (sample_type & PERF_SAMPLE_PHYS_ADDR)
>> > +           data->phys_addr = perf_virt_to_phys(data->addr);
>>
>> Only problem with this now is that it requires SAMPLE_ADDR to also be
>> set in order to obtain data->addr.
>>
>> Either fix all to set data->attr for (SAMPLE_ADDR || SAMPLE_PHYS_ADDR)
>> or mandate SAMPLE_ADDR when SAMPLE_PHYS_ADDR.
>
> I think the former suggestion is better, as it allows for smaller
> samples.

Agreed. I may only care about physical addresses, so SAMPLE_PHYS_ADDR
should suffice from the user.
The kernel must save the virtual address in data->addr, and then convert it.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-23 14:39   ` Peter Zijlstra
  2017-08-23 17:00     ` Stephane Eranian
@ 2017-08-23 18:01     ` Liang, Kan
  2017-08-23 18:26       ` Peter Zijlstra
  1 sibling, 1 reply; 10+ messages in thread
From: Liang, Kan @ 2017-08-23 18:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, acme, jolsa, tglx, eranian, ak, mpe, maddy

> On Wed, Aug 23, 2017 at 04:33:08PM +0200, Peter Zijlstra wrote:
> > > @@ -6145,6 +6183,9 @@ void perf_prepare_sample(struct
> > > perf_event_header *header,
> > >
> > >  		header->size += size;
> > >  	}
> > > +
> > > +	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> > > +		data->phys_addr = perf_virt_to_phys(data->addr);
> >
> > Only problem with this now is that it requires SAMPLE_ADDR to also be
> > set in order to obtain data->addr.
> >
> > Either fix all to set data->attr for (SAMPLE_ADDR || SAMPLE_PHYS_ADDR)
> > or mandate SAMPLE_ADDR when SAMPLE_PHYS_ADDR.
> 
> I think the former suggestion is better, as it allows for smaller samples.

For the latter, it's easy to be implemented, and already in the perf tool patch.
But yes, it will bring extra overhead, if the user doesn't care about the virtual
address. 

For the former, do you mean data->type or event->attr?
I don't think we can set either of them. 
It will impact the perf_output_sample and perf_event__header_size.

For x86, I think we can do something as below. But I'm not sure other architectures.
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a322fed..9bf29dc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1175,7 +1175,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
        else
                regs->flags &= ~PERF_EFLAGS_EXACT;

-       if ((sample_type & PERF_SAMPLE_ADDR) &&
+       if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR) &&
            x86_pmu.intel_cap.pebs_format >= 1)
                data->addr = pebs->dla;

Thanks,
Kan

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-23 18:01     ` Liang, Kan
@ 2017-08-23 18:26       ` Peter Zijlstra
  0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2017-08-23 18:26 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, linux-kernel, acme, jolsa, tglx, eranian, ak, mpe, maddy

On Wed, Aug 23, 2017 at 06:01:25PM +0000, Liang, Kan wrote:

> For x86, I think we can do something as below. But I'm not sure other architectures.

If you'd done: git grep PERF_SAMPLE_ADDR, you'd have found:

arch/powerpc/perf/core-book3s.c:                if (event->attr.sample_type & PERF_SAMPLE_ADDR)

Which suggests something rather similar..


> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index a322fed..9bf29dc 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -1175,7 +1175,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
>         else
>                 regs->flags &= ~PERF_EFLAGS_EXACT;
> 
> -       if ((sample_type & PERF_SAMPLE_ADDR) &&
> +       if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR) &&
>             x86_pmu.intel_cap.pebs_format >= 1)
>                 data->addr = pebs->dla;

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6c2d4168daec..c833fb26f0bb 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2039,7 +2039,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
 		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
-		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+		if (event->attr.sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
 			perf_get_data_addr(regs, &data.addr);
 
 		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-23 14:33 ` Peter Zijlstra
  2017-08-23 14:39   ` Peter Zijlstra
@ 2017-08-24  1:26   ` Michael Ellerman
  2017-08-24  8:56     ` Peter Zijlstra
  1 sibling, 1 reply; 10+ messages in thread
From: Michael Ellerman @ 2017-08-24  1:26 UTC (permalink / raw)
  To: Peter Zijlstra, kan.liang
  Cc: mingo, linux-kernel, acme, jolsa, tglx, eranian, ak, maddy

Peter Zijlstra <peterz@infradead.org> writes:
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index d704e23..b991af3 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6027,6 +6033,38 @@ void perf_output_sample(struct perf_output_handle *handle,
>>  	}
>>  }
>>  
>> +static u64 perf_virt_to_phys(u64 virt)
>> +{
>> +	u64 phys_addr = 0;
>> +	struct page *p = NULL;
>> +
>> +	if (!virt)
>> +		return 0;
>> +
>> +	if (virt >= TASK_SIZE) {
>> +		/* If it's vmalloc()d memory, leave phys_addr as 0 */
>> +		if (virt_addr_valid(virt) &&
>> +		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
>> +			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
>> +	} else {
>> +		/*
>> +		 * Walking the pages tables for user address.
>> +		 * Interrupts are disabled, so it prevents any tear down
>> +		 * of the page tables.
>> +		 * Try IRQ-safe __get_user_pages_fast first.
>> +		 * If failed, leave phys_addr as 0.
>> +		 */
>> +		if ((current->mm != NULL) &&
>> +		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
>> +			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
>> +
>> +		if (p)
>> +			put_page(p);
>> +	}
>> +
>> +	return phys_addr;
>> +}
>
> Michael, does this work for PPC as is?

I think so.

We have about 8 MMUs and 32-bit and 64-bit so it's a bit hard to say for
sure.

I'm pretty sure virt_addr_valid() will exclude everything except the
linear mapping for us, so the vmalloc check is redundant but that's
fine. So that looks safe AFAICS.

I'm not an expert on GUP fast, but AFAIK there's nothing special
required for us.

cheers

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-24  1:26   ` Michael Ellerman
@ 2017-08-24  8:56     ` Peter Zijlstra
  2017-08-24  9:52       ` Michael Ellerman
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2017-08-24  8:56 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: kan.liang, mingo, linux-kernel, acme, jolsa, tglx, eranian, ak, maddy

On Thu, Aug 24, 2017 at 11:26:17AM +1000, Michael Ellerman wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> >> diff --git a/kernel/events/core.c b/kernel/events/core.c
> >> index d704e23..b991af3 100644
> >> --- a/kernel/events/core.c
> >> +++ b/kernel/events/core.c
> >> @@ -6027,6 +6033,38 @@ void perf_output_sample(struct perf_output_handle *handle,
> >>  	}
> >>  }
> >>  
> >> +static u64 perf_virt_to_phys(u64 virt)
> >> +{
> >> +	u64 phys_addr = 0;
> >> +	struct page *p = NULL;
> >> +
> >> +	if (!virt)
> >> +		return 0;
> >> +
> >> +	if (virt >= TASK_SIZE) {
> >> +		/* If it's vmalloc()d memory, leave phys_addr as 0 */
> >> +		if (virt_addr_valid(virt) &&
> >> +		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
> >> +			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
> >> +	} else {
> >> +		/*
> >> +		 * Walking the pages tables for user address.
> >> +		 * Interrupts are disabled, so it prevents any tear down
> >> +		 * of the page tables.
> >> +		 * Try IRQ-safe __get_user_pages_fast first.
> >> +		 * If failed, leave phys_addr as 0.
> >> +		 */
> >> +		if ((current->mm != NULL) &&
> >> +		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
> >> +			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
> >> +
> >> +		if (p)
> >> +			put_page(p);
> >> +	}
> >> +
> >> +	return phys_addr;
> >> +}
> >
> > Michael, does this work for PPC as is?
> 
> I think so.
> 
> We have about 8 MMUs and 32-bit and 64-bit so it's a bit hard to say for
> sure.

You can limit it to those that include a PMU that supports data->addr,
which is book3s, whatever that gets you.

> I'm pretty sure virt_addr_valid() will exclude everything except the
> linear mapping for us, so the vmalloc check is redundant but that's
> fine. So that looks safe AFAICS.
> 
> I'm not an expert on GUP fast, but AFAIK there's nothing special
> required for us.

Yeah, gup fast should work for you.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-24  8:56     ` Peter Zijlstra
@ 2017-08-24  9:52       ` Michael Ellerman
  0 siblings, 0 replies; 10+ messages in thread
From: Michael Ellerman @ 2017-08-24  9:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kan.liang, mingo, linux-kernel, acme, jolsa, tglx, eranian, ak, maddy

Peter Zijlstra <peterz@infradead.org> writes:

> On Thu, Aug 24, 2017 at 11:26:17AM +1000, Michael Ellerman wrote:
>> Peter Zijlstra <peterz@infradead.org> writes:
>> >> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> >> index d704e23..b991af3 100644
>> >> --- a/kernel/events/core.c
>> >> +++ b/kernel/events/core.c
>> >> @@ -6027,6 +6033,38 @@ void perf_output_sample(struct perf_output_handle *handle,
>> >>  	}
>> >>  }
>> >>  
>> >> +static u64 perf_virt_to_phys(u64 virt)
>> >> +{
>> >> +	u64 phys_addr = 0;
>> >> +	struct page *p = NULL;
>> >> +
>> >> +	if (!virt)
>> >> +		return 0;
>> >> +
>> >> +	if (virt >= TASK_SIZE) {
>> >> +		/* If it's vmalloc()d memory, leave phys_addr as 0 */
>> >> +		if (virt_addr_valid(virt) &&
>> >> +		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
>> >> +			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
>> >> +	} else {
>> >> +		/*
>> >> +		 * Walking the pages tables for user address.
>> >> +		 * Interrupts are disabled, so it prevents any tear down
>> >> +		 * of the page tables.
>> >> +		 * Try IRQ-safe __get_user_pages_fast first.
>> >> +		 * If failed, leave phys_addr as 0.
>> >> +		 */
>> >> +		if ((current->mm != NULL) &&
>> >> +		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
>> >> +			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
>> >> +
>> >> +		if (p)
>> >> +			put_page(p);
>> >> +	}
>> >> +
>> >> +	return phys_addr;
>> >> +}
>> >
>> > Michael, does this work for PPC as is?
>> 
>> I think so.
>> 
>> We have about 8 MMUs and 32-bit and 64-bit so it's a bit hard to say for
>> sure.
>
> You can limit it to those that include a PMU that supports data->addr,
> which is book3s, whatever that gets you.

True. That's just hash and radix.

So that looks OK to me. At least OK enough to merge and we'll try and
test it ASAP to confirm it works.

cheers

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR
  2017-08-23 17:00     ` Stephane Eranian
@ 2017-08-31 16:44       ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 10+ messages in thread
From: Arnaldo Carvalho de Melo @ 2017-08-31 16:44 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Liang, Kan, mingo, LKML, Jiri Olsa,
	Thomas Gleixner, ak, Michael Ellerman, Madhavan Srinivasan

Em Wed, Aug 23, 2017 at 10:00:37AM -0700, Stephane Eranian escreveu:
> On Wed, Aug 23, 2017 at 7:39 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Wed, Aug 23, 2017 at 04:33:08PM +0200, Peter Zijlstra wrote:
> >> > @@ -6145,6 +6183,9 @@ void perf_prepare_sample(struct perf_event_header *header,
> >> >
> >> >             header->size += size;
> >> >     }
> >> > +
> >> > +   if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> >> > +           data->phys_addr = perf_virt_to_phys(data->addr);
> >>
> >> Only problem with this now is that it requires SAMPLE_ADDR to also be
> >> set in order to obtain data->addr.
> >>
> >> Either fix all to set data->attr for (SAMPLE_ADDR || SAMPLE_PHYS_ADDR)
> >> or mandate SAMPLE_ADDR when SAMPLE_PHYS_ADDR.
> >
> > I think the former suggestion is better, as it allows for smaller
> > samples.
> 
> Agreed. I may only care about physical addresses, so SAMPLE_PHYS_ADDR
> should suffice from the user.
> The kernel must save the virtual address in data->addr, and then convert it.

Stephane, have you took the time to test this together with Kan's latest
patchkit for tools/perf? v2? If so can I have your Tested-by or just an
Acked-by?

Thanks,

- Arnaldo

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-08-31 16:44 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-23 14:22 [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
2017-08-23 14:33 ` Peter Zijlstra
2017-08-23 14:39   ` Peter Zijlstra
2017-08-23 17:00     ` Stephane Eranian
2017-08-31 16:44       ` Arnaldo Carvalho de Melo
2017-08-23 18:01     ` Liang, Kan
2017-08-23 18:26       ` Peter Zijlstra
2017-08-24  1:26   ` Michael Ellerman
2017-08-24  8:56     ` Peter Zijlstra
2017-08-24  9:52       ` Michael Ellerman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.