[01/10] perf/core, x86: Add PERF_SAMPLE_LBR_TOS
diff mbox series

Message ID 20191007175910.2805-2-kan.liang@linux.intel.com
State Superseded
Headers show
Series
  • Stitch LBR call stack
Related show

Commit Message

Liang, Kan Oct. 7, 2019, 5:59 p.m. UTC
From: Kan Liang <kan.liang@linux.intel.com>

In LBR call stack mode, the depth of the reconstructed LBR call stack is
limited to the number of LBR registers. With LBR Top-of-Stack (TOS)
information, the perf tool may stitch the stacks of two samples, so the
reconstructed LBR call stack can exceed the HW limitation.

Add a new sample type for LBR TOS.

The PEBS record doesn't store TOS information. For single PEBS, TOS can be
read directly from the MSR, because the PMI is triggered immediately after
the PEBS record is written, so the TOS MSR is still unchanged.
For large PEBS, the TOS MSR holds a stale value. Set TOS to -1ULL to
indicate that the TOS information is not available.

Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/x86/events/intel/lbr.c     |  9 +++++++++
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 12 ++++++++++++
 4 files changed, 25 insertions(+), 1 deletion(-)

Comments

Peter Zijlstra Oct. 8, 2019, 8:31 a.m. UTC | #1
On Mon, Oct 07, 2019 at 10:59:01AM -0700, kan.liang@linux.intel.com wrote:
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 61448c19a132..ee9ef0c4cb08 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -100,6 +100,7 @@ struct perf_raw_record {
>   */
>  struct perf_branch_stack {
>  	__u64				nr;
> +	__u64				tos;
>  	struct perf_branch_entry	entries[0];
>  };
>  
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index bb7b271397a6..fe36ebb7dc2e 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
>  	PERF_SAMPLE_TRANSACTION			= 1U << 17,
>  	PERF_SAMPLE_REGS_INTR			= 1U << 18,
>  	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
> +	PERF_SAMPLE_LBR_TOS			= 1U << 20,
>  
> -	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
> +	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
>  
>  	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
>  };
> @@ -864,6 +865,7 @@ enum perf_event_type {
>  	 *	{ u64			abi; # enum perf_sample_regs_abi
>  	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>  	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> +	 *	{ u64			tos;} && PERF_SAMPLE_LBR_TOS
>  	 * };
>  	 */
>  	PERF_RECORD_SAMPLE			= 9,

I have problems with the API.. You're introducing the intel specific LBR
naming, and adding a whole new sample type vs extending the existing
BRANCH_STACK (like you really already do with struct perf_branch_stack).

So why not add a bit to PERF_SAMPLE_BRANCH_* to request the presence of
the TOS field in the PERF_SAMPLE_BRANCH_STACK output?
Liang, Kan Oct. 8, 2019, 1:53 p.m. UTC | #2
On 10/8/2019 4:31 AM, Peter Zijlstra wrote:
> On Mon, Oct 07, 2019 at 10:59:01AM -0700, kan.liang@linux.intel.com wrote:
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 61448c19a132..ee9ef0c4cb08 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -100,6 +100,7 @@ struct perf_raw_record {
>>    */
>>   struct perf_branch_stack {
>>   	__u64				nr;
>> +	__u64				tos;
>>   	struct perf_branch_entry	entries[0];
>>   };
>>   
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index bb7b271397a6..fe36ebb7dc2e 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
>>   	PERF_SAMPLE_TRANSACTION			= 1U << 17,
>>   	PERF_SAMPLE_REGS_INTR			= 1U << 18,
>>   	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
>> +	PERF_SAMPLE_LBR_TOS			= 1U << 20,
>>   
>> -	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
>> +	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
>>   
>>   	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
>>   };
>> @@ -864,6 +865,7 @@ enum perf_event_type {
>>   	 *	{ u64			abi; # enum perf_sample_regs_abi
>>   	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>>   	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
>> +	 *	{ u64			tos;} && PERF_SAMPLE_LBR_TOS
>>   	 * };
>>   	 */
>>   	PERF_RECORD_SAMPLE			= 9,
> 
> I have problems with the API.. You're introducing the intel specific LBR
> naming, and adding a whole new sample type vs extending the existing
> BRANCH_STACK (like you really already do with struct perf_branch_stack).
>
> So why not add a bit to PERF_SAMPLE_BRANCH_* to request the presence of
> the TOS field in the PERF_SAMPLE_BRANCH_STACK output?

We never store PERF_SAMPLE_BRANCH_* in a sample, so the perf tool cannot
tell whether the sample includes the TOS field.
There will be a problem when a new perf tool parses data generated
by an old kernel.


Can we rename the new sample type PERF_SAMPLE_BRANCH_STACK_EXTENSION?

{ u64			version;
   u64			tos;} 		&& PERF_SAMPLE_LBR_TOS

If other platforms want to add their extension, we just need to increase 
the version number. Perf tool will check the version before parsing the 
sample.

Thanks,
Kan
Peter Zijlstra Oct. 8, 2019, 2:38 p.m. UTC | #3
On Tue, Oct 08, 2019 at 09:53:24AM -0400, Liang, Kan wrote:
> 
> 
> On 10/8/2019 4:31 AM, Peter Zijlstra wrote:
> > On Mon, Oct 07, 2019 at 10:59:01AM -0700, kan.liang@linux.intel.com wrote:
> > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > > index 61448c19a132..ee9ef0c4cb08 100644
> > > --- a/include/linux/perf_event.h
> > > +++ b/include/linux/perf_event.h
> > > @@ -100,6 +100,7 @@ struct perf_raw_record {
> > >    */
> > >   struct perf_branch_stack {
> > >   	__u64				nr;
> > > +	__u64				tos;
> > >   	struct perf_branch_entry	entries[0];
> > >   };
> > > diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> > > index bb7b271397a6..fe36ebb7dc2e 100644
> > > --- a/include/uapi/linux/perf_event.h
> > > +++ b/include/uapi/linux/perf_event.h
> > > @@ -141,8 +141,9 @@ enum perf_event_sample_format {
> > >   	PERF_SAMPLE_TRANSACTION			= 1U << 17,
> > >   	PERF_SAMPLE_REGS_INTR			= 1U << 18,
> > >   	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
> > > +	PERF_SAMPLE_LBR_TOS			= 1U << 20,
> > > -	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
> > > +	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
> > >   	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
> > >   };
> > > @@ -864,6 +865,7 @@ enum perf_event_type {
> > >   	 *	{ u64			abi; # enum perf_sample_regs_abi
> > >   	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> > >   	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> > > +	 *	{ u64			tos;} && PERF_SAMPLE_LBR_TOS
> > >   	 * };
> > >   	 */
> > >   	PERF_RECORD_SAMPLE			= 9,
> > 
> > I have problems with the API.. You're introducing the intel specific LBR
> > naming, and adding a whole new sample type vs extending the existing
> > BRANCH_STACK (like you really already do with struct perf_branch_stack).
> >
> > So why not add a bit to PERF_SAMPLE_BRANCH_* to request the presence of
> > the TOS field in the PERF_SAMPLE_BRANCH_STACK output?
> 
> We never store PERF_SAMPLE_BRANCH_* in a sample. The perf tool cannot tell
> if the sample includes TOS field.

The perf tool bloody sets the perf_event_attr::branch_sample_type value!
Of course it knows to expect the TOS field when it asks for it in the
first place.

> There will be a problem when a new perf tool parsing the data generated by
> an old kernel.

ISTR perf stores the full perf_event_attr in the .data file these days,
and therefore such confusion should never happen.
Liang, Kan Oct. 8, 2019, 3:25 p.m. UTC | #4
On 10/8/2019 10:38 AM, Peter Zijlstra wrote:
> On Tue, Oct 08, 2019 at 09:53:24AM -0400, Liang, Kan wrote:
>>
>>
>> On 10/8/2019 4:31 AM, Peter Zijlstra wrote:
>>> On Mon, Oct 07, 2019 at 10:59:01AM -0700, kan.liang@linux.intel.com wrote:
>>>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>>>> index 61448c19a132..ee9ef0c4cb08 100644
>>>> --- a/include/linux/perf_event.h
>>>> +++ b/include/linux/perf_event.h
>>>> @@ -100,6 +100,7 @@ struct perf_raw_record {
>>>>     */
>>>>    struct perf_branch_stack {
>>>>    	__u64				nr;
>>>> +	__u64				tos;
>>>>    	struct perf_branch_entry	entries[0];
>>>>    };
>>>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>>>> index bb7b271397a6..fe36ebb7dc2e 100644
>>>> --- a/include/uapi/linux/perf_event.h
>>>> +++ b/include/uapi/linux/perf_event.h
>>>> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
>>>>    	PERF_SAMPLE_TRANSACTION			= 1U << 17,
>>>>    	PERF_SAMPLE_REGS_INTR			= 1U << 18,
>>>>    	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
>>>> +	PERF_SAMPLE_LBR_TOS			= 1U << 20,
>>>> -	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
>>>> +	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
>>>>    	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
>>>>    };
>>>> @@ -864,6 +865,7 @@ enum perf_event_type {
>>>>    	 *	{ u64			abi; # enum perf_sample_regs_abi
>>>>    	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>>>>    	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
>>>> +	 *	{ u64			tos;} && PERF_SAMPLE_LBR_TOS
>>>>    	 * };
>>>>    	 */
>>>>    	PERF_RECORD_SAMPLE			= 9,
>>>
>>> I have problems with the API.. You're introducing the intel specific LBR
>>> naming, and adding a whole new sample type vs extending the existing
>>> BRANCH_STACK (like you really already do with struct perf_branch_stack).
>>>
>>> So why not add a bit to PERF_SAMPLE_BRANCH_* to request the presence of
>>> the TOS field in the PERF_SAMPLE_BRANCH_STACK output?
>>
>> We never store PERF_SAMPLE_BRANCH_* in a sample. The perf tool cannot tell
>> if the sample includes TOS field.
> 
> The perf tool bloody sets the perf_event_attr::branch_sample_type value!
> Of course it knows to expect the TOS field when it asks for it in the
> first place.
>

Users may generate the perf.data on one machine and parse it on
another machine.
If the perf.data comes from a new kernel with a new perf tool on one
machine, but users parse it with an old perf tool on another machine,
the old perf tool doesn't know about the existence of the TOS field.


Thanks,
Kan

>> There will be a problem when a new perf tool parsing the data generated by
>> an old kernel.
> 
> ISTR perf stores the full perf_event_attr in the .data file these days,
> and therefore such confusion should never happen.
>
Peter Zijlstra Oct. 8, 2019, 4:32 p.m. UTC | #5
On Tue, Oct 08, 2019 at 11:25:01AM -0400, Liang, Kan wrote:
> > The perf tool bloody sets the perf_event_attr::branch_sample_type value!
> > Of course it knows to expect the TOS field when it asks for it in the
> > first place.
> > 
> 
> Users may generate the perf.data on one machine, and parse the data on
> another machine.
> If the perf.data is from a new kernel with a new perf tool on one machine,
> but users have an old perf tool on another machine to parse it. The old perf
> tool doesn't know the exists of TOS field.

Feh, the tool should check for unknown input bits in attr.

Anyway, the proposed API is horrendous crap, that's just not going to
happen.

Patch
diff mbox series

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index ea54634eabf3..4640ff1c9ecb 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -562,6 +562,7 @@  static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[i].reserved	= 0;
 	}
 	cpuc->lbr_stack.nr = i;
+	cpuc->lbr_stack.tos = tos;
 }
 
 /*
@@ -657,6 +658,7 @@  static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		out++;
 	}
 	cpuc->lbr_stack.nr = out;
+	cpuc->lbr_stack.tos = tos;
 }
 
 void intel_pmu_lbr_read(void)
@@ -1097,6 +1099,13 @@  void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr)
 	int i;
 
 	cpuc->lbr_stack.nr = x86_pmu.lbr_nr;
+
+	/* Cannot get TOS for large PEBS */
+	if (cpuc->n_pebs == cpuc->n_large_pebs)
+		cpuc->lbr_stack.tos = -1ULL;
+	else
+		cpuc->lbr_stack.tos = intel_pmu_lbr_tos();
+
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		u64 info = lbr->lbr[i].info;
 		struct perf_branch_entry *e = &cpuc->lbr_entries[i];
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61448c19a132..ee9ef0c4cb08 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -100,6 +100,7 @@  struct perf_raw_record {
  */
 struct perf_branch_stack {
 	__u64				nr;
+	__u64				tos;
 	struct perf_branch_entry	entries[0];
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index bb7b271397a6..fe36ebb7dc2e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@  enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_LBR_TOS			= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -864,6 +865,7 @@  enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			tos;} && PERF_SAMPLE_LBR_TOS
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 275eae05af20..6ab0913c7b36 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6468,6 +6468,15 @@  void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_LBR_TOS) {
+		u64 tos = -1ULL;
+
+		if (data->br_stack)
+			tos = data->br_stack->tos;
+
+		perf_output_put(handle, tos);
+	}
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6656,6 +6665,9 @@  void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (sample_type & PERF_SAMPLE_LBR_TOS)
+		header->size += sizeof(u64);
 }
 
 static __always_inline int