* [RFC][PATCH 0/2] On perf and clocks..
@ 2015-02-20 14:29 Peter Zijlstra
  2015-02-20 14:29 ` [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns() Peter Zijlstra
  2015-02-20 14:29 ` [RFC][PATCH 2/2] perf: Add per event clockid support Peter Zijlstra
  0 siblings, 2 replies; 10+ messages in thread
From: Peter Zijlstra @ 2015-02-20 14:29 UTC (permalink / raw)
  To: pawel.moll, adrian.hunter, john.stultz, mingo, eranian
  Cc: linux-kernel, acme, dsahern, fweisbec, jolsa, namhyung, paulus,
	tglx, rostedt, sonnyrao, ak, vincent.weaver, Peter Zijlstra

Hai,

Changelog for patch 2 has the goodies. Compile tested only..




* [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns()
  2015-02-20 14:29 [RFC][PATCH 0/2] On perf and clocks Peter Zijlstra
@ 2015-02-20 14:29 ` Peter Zijlstra
  2015-02-20 19:49   ` John Stultz
  2015-02-20 14:29 ` [RFC][PATCH 2/2] perf: Add per event clockid support Peter Zijlstra
  1 sibling, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2015-02-20 14:29 UTC (permalink / raw)
  To: pawel.moll, adrian.hunter, john.stultz, mingo, eranian
  Cc: linux-kernel, acme, dsahern, fweisbec, jolsa, namhyung, paulus,
	tglx, rostedt, sonnyrao, ak, vincent.weaver, Peter Zijlstra

[-- Attachment #1: peterz-timer-fast_mono_raw.patch --]
[-- Type: text/plain, Size: 3873 bytes --]

By popular demand, add a MONOTONIC_RAW variant.

The implementation is up for discussion; but given the need for the
dummy thing I thought it was easiest to just always update the
mono_raw clock state in timekeeping_update() for now, even though it's
mostly wasted cycles.

I suppose the right place would be a resume callback somewhere.
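
For illustration only (not part of the patch); the intended consumer
pattern is an NMI-safe read, e.g.:

	/* hypothetical consumer, e.g. called from a perf NMI handler */
	static u64 sample_raw_time(void)
	{
		/*
		 * NMI-safe: the latch reader never spins on a
		 * write-side seqcount, it falls back to the stable
		 * base[] copy.
		 */
		return ktime_get_mono_raw_fast_ns();
	}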

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/timekeeping.h |    1 +
 kernel/time/timekeeping.c   |   42 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 7 deletions(-)

--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -220,6 +220,7 @@ static inline u64 ktime_get_raw_ns(void)
 }
 
 extern u64 ktime_get_mono_fast_ns(void);
+extern u64 ktime_get_mono_raw_fast_ns(void);
 
 /*
  * Timespec interfaces utilizing the ktime based ones
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,7 +58,8 @@ struct tk_fast {
 	struct tk_read_base	base[2];
 };
 
-static struct tk_fast tk_fast_mono ____cacheline_aligned;
+static struct tk_fast tk_fast_mono     ____cacheline_aligned;
+static struct tk_fast tk_fast_mono_raw ____cacheline_aligned;
 
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
@@ -267,18 +268,18 @@ static inline s64 timekeeping_get_ns_raw
  * slightly wrong timestamp (a few nanoseconds). See
  * @ktime_get_mono_fast_ns.
  */
-static void update_fast_timekeeper(struct tk_read_base *tkr)
+static void update_fast_timekeeper(struct tk_fast *fast, struct tk_read_base *tkr)
 {
-	struct tk_read_base *base = tk_fast_mono.base;
+	struct tk_read_base *base = fast->base;
 
 	/* Force readers off to base[1] */
-	raw_write_seqcount_latch(&tk_fast_mono.seq);
+	raw_write_seqcount_latch(&fast->seq);
 
 	/* Update base[0] */
 	memcpy(base, tkr, sizeof(*base));
 
 	/* Force readers back to base[0] */
-	raw_write_seqcount_latch(&tk_fast_mono.seq);
+	raw_write_seqcount_latch(&fast->seq);
 
 	/* Update base[1] */
 	memcpy(base + 1, base, sizeof(*base));
@@ -332,6 +333,22 @@ u64 notrace ktime_get_mono_fast_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
 
+u64 notrace ktime_get_mono_raw_fast_ns(void)
+{
+	struct tk_read_base *tkr;
+	unsigned int seq;
+	u64 now;
+
+	do {
+		seq = raw_read_seqcount(&tk_fast_mono_raw.seq);
+		tkr = tk_fast_mono_raw.base + (seq & 0x01);
+		now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+
+	} while (read_seqcount_retry(&tk_fast_mono_raw.seq, seq));
+	return now;
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_raw_fast_ns);
+
 /* Suspend-time cycles value for halted fast timekeeper. */
 static cycle_t cycles_at_suspend;
 
@@ -358,7 +375,11 @@ static void halt_fast_timekeeper(struct
 	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
 	cycles_at_suspend = tkr->read(tkr->clock);
 	tkr_dummy.read = dummy_clock_read;
-	update_fast_timekeeper(&tkr_dummy);
+	update_fast_timekeeper(&tk_fast_mono,     &tkr_dummy);
+
+	tkr_dummy.mult  = tkr->clock->mult;
+	tkr_dummy.shift = tkr->clock->shift;
+	update_fast_timekeeper(&tk_fast_mono_raw, &tkr_dummy);
 }
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
@@ -475,6 +496,8 @@ static inline void tk_update_ktime_data(
 /* must hold timekeeper_lock */
 static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 {
+	static struct tk_read_base tkr;
+
 	if (action & TK_CLEAR_NTP) {
 		tk->ntp_error = 0;
 		ntp_clear();
@@ -489,7 +512,12 @@ static void timekeeping_update(struct ti
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
 
-	update_fast_timekeeper(&tk->tkr);
+	update_fast_timekeeper(&tk_fast_mono, &tk->tkr);
+
+	tkr = tk->tkr;
+	tkr.mult  = tk->tkr.clock->mult;
+	tkr.shift = tk->tkr.clock->shift;
+	update_fast_timekeeper(&tk_fast_mono_raw, &tkr);
 }
 
 /**




* [RFC][PATCH 2/2] perf: Add per event clockid support
  2015-02-20 14:29 [RFC][PATCH 0/2] On perf and clocks Peter Zijlstra
  2015-02-20 14:29 ` [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns() Peter Zijlstra
@ 2015-02-20 14:29 ` Peter Zijlstra
  2015-02-20 15:28   ` Pawel Moll
  2015-02-23  8:13   ` Adrian Hunter
  1 sibling, 2 replies; 10+ messages in thread
From: Peter Zijlstra @ 2015-02-20 14:29 UTC (permalink / raw)
  To: pawel.moll, adrian.hunter, john.stultz, mingo, eranian
  Cc: linux-kernel, acme, dsahern, fweisbec, jolsa, namhyung, paulus,
	tglx, rostedt, sonnyrao, ak, vincent.weaver, Peter Zijlstra

[-- Attachment #1: peterz-perf-clockzzz.patch --]
[-- Type: text/plain, Size: 8763 bytes --]

While thinking on the whole clock discussion it occurred to me we have
two distinct uses of time:

 1) the tracking of event/ctx/cgroup enabled/running/stopped times
    which includes the self-monitoring support in struct
    perf_event_mmap_page.

 2) the actual timestamps visible in the data records.

And we've been conflating them.

The first is all about tracking time deltas; nobody should really care
in what time base that happens. It's all relative information; as long
as it's internally consistent it works.

The second however is what people are worried about when having to
merge their data with external sources. And here we have the
discussion on MONOTONIC vs MONOTONIC_RAW etc..

Where MONOTONIC is good for correlating between machines (static
offset), MONOTONIC_RAW is required for correlating against a fixed-rate
hardware clock.

This means configurability; now 1) makes that hard because it needs to
be internally consistent across groups of unrelated events; which is
why we had to have a global perf_clock().

However, for 2) it doesn't really matter, perf itself doesn't care
what it writes into the buffer.

The below patch makes the distinction between these two cases by
adding perf_event_clock() which is used for the second case. It
further makes this configurable on a per-event basis, but adds a few
sanity checks such that we cannot combine events with different clocks
in confusing ways.

And since we then have per-event configurability we might as well
retain the 'legacy' behaviour as a default.
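
For illustration (userspace side, not part of this patch), opening an
event that timestamps its records in MONOTONIC_RAW would then look
roughly like this; open_raw_clock_event() is a hypothetical helper:

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <time.h>
	#include <unistd.h>

	static int open_raw_clock_event(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size        = sizeof(attr);
		attr.type        = PERF_TYPE_SOFTWARE;
		attr.config      = PERF_COUNT_SW_CPU_CLOCK;
		attr.sample_type = PERF_SAMPLE_TIME;
		attr.use_clockid = 1;			/* new bit */
		attr.clockid     = CLOCK_MONOTONIC_RAW;	/* new field */

		/* current task, any CPU, no group leader, no flags */
		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}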

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/cpu/perf_event.c |   14 ++++++-
 include/linux/perf_event.h       |    2 +
 include/linux/timekeeping.h      |    5 ++
 include/uapi/linux/perf_event.h  |    6 +--
 kernel/events/core.c             |   74 +++++++++++++++++++++++++++++++++++++--
 5 files changed, 93 insertions(+), 8 deletions(-)

--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1968,13 +1968,23 @@ void arch_perf_update_userpage(struct pe
 
 	data = cyc2ns_read_begin();
 
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always in the local_clock domain.
+	 */
 	userpg->cap_user_time = 1;
 	userpg->time_mult = data->cyc2ns_mul;
 	userpg->time_shift = data->cyc2ns_shift;
 	userpg->time_offset = data->cyc2ns_offset - now;
 
-	userpg->cap_user_time_zero = 1;
-	userpg->time_zero = data->cyc2ns_offset;
+	/*
+	 * cap_user_time_zero doesn't make sense when we're using a different
+	 * time base for the records.
+	 */
+	if (event->clock == &local_clock) {
+		userpg->cap_user_time_zero = 1;
+		userpg->time_zero = data->cyc2ns_offset;
+	}
 
 	cyc2ns_read_end(data);
 }
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -166,6 +166,7 @@ struct perf_event;
  * pmu::capabilities flags
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
+#define PERF_PMU_CAP_NO_NMI			0x02
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -438,6 +439,7 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	u64				(*clock)(void);
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -214,6 +214,11 @@ static inline u64 ktime_get_boot_ns(void
 	return ktime_to_ns(ktime_get_boottime());
 }
 
+static inline u64 ktime_get_tai_ns(void)
+{
+	return ktime_to_ns(ktime_get_clocktai());
+}
+
 static inline u64 ktime_get_raw_ns(void)
 {
 	return ktime_to_ns(ktime_get_raw());
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -305,7 +305,8 @@ struct perf_event_attr {
 				exclude_callchain_user   : 1, /* exclude user callchains */
 				mmap2          :  1, /* include mmap with inode data     */
 				comm_exec      :  1, /* flag comm events that are due to an exec */
-				__reserved_1   : 39;
+				use_clockid    :  1, /* use @clockid for time fields */
+				__reserved_1   : 38;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -334,8 +335,7 @@ struct perf_event_attr {
 	 */
 	__u32	sample_stack_user;
 
-	/* Align to u64. */
-	__u32	__reserved_2;
+	__u32	clockid;
 	/*
 	 * Defines set of regs to dump for each sample
 	 * state captured on:
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+	return event->clock();
+}
+
 static inline struct perf_cpu_context *
 __get_cpu_context(struct perf_event_context *ctx)
 {
@@ -4756,7 +4761,7 @@ static void __perf_event_header__init_id
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME)
-		data->time = perf_clock();
+		data->time = perf_event_clock(event);
 
 	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
 		data->id = primary_event_id(event);
@@ -5334,6 +5339,8 @@ static void perf_event_task_output(struc
 	task_event->event_id.tid = perf_event_tid(event, task);
 	task_event->event_id.ptid = perf_event_tid(event, current);
 
+	task_event->event_id.time = perf_event_clock(event);
+
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -5367,7 +5374,7 @@ static void perf_event_task(struct task_
 			/* .ppid */
 			/* .tid  */
 			/* .ptid */
-			.time = perf_clock(),
+			/* .time */
 		},
 	};
 
@@ -5743,7 +5750,7 @@ static void perf_log_throttle(struct per
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time		= perf_clock(),
+		.time		= perf_event_clock(event),
 		.id		= primary_event_id(event),
 		.stream_id	= event->id,
 	};
@@ -6286,6 +6293,8 @@ static int perf_swevent_init(struct perf
 static struct pmu perf_swevent = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= perf_swevent_init,
 	.add		= perf_swevent_add,
 	.del		= perf_swevent_del,
@@ -6403,6 +6412,8 @@ static int perf_tp_event_init(struct per
 static struct pmu perf_tracepoint = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= perf_tp_event_init,
 	.add		= perf_trace_add,
 	.del		= perf_trace_del,
@@ -6628,6 +6639,8 @@ static int cpu_clock_event_init(struct p
 static struct pmu perf_cpu_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
 	.del		= cpu_clock_event_del,
@@ -6706,6 +6719,8 @@ static int task_clock_event_init(struct
 static struct pmu perf_task_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
 	.del		= task_clock_event_del,
@@ -7188,6 +7203,10 @@ perf_event_alloc(struct perf_event_attr
 #endif
 	}
 
+	event->clock = &local_clock;
+	if (parent_event)
+		event->clock = parent_event->clock;
+
 	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
 		context = parent_event->overflow_handler_context;
@@ -7399,6 +7418,12 @@ perf_event_set_output(struct perf_event
 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 		goto out;
 
+	/*
+	 * Mixing clocks in the same buffer is trouble you don't need.
+	 */
+	if (output_event->clock != event->clock)
+		goto out;
+
 set:
 	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
@@ -7550,6 +7575,44 @@ SYSCALL_DEFINE5(perf_event_open,
 	 */
 	pmu = event->pmu;
 
+	if (attr.use_clockid) {
+		switch (attr.clockid) {
+		case CLOCK_MONOTONIC:
+			event->clock = &ktime_get_mono_fast_ns;
+			goto clock_set;
+
+		case CLOCK_MONOTONIC_RAW:
+			event->clock = &ktime_get_mono_raw_fast_ns;
+			goto clock_set;
+
+		default:
+			if (!(pmu->capabilities & PERF_PMU_CAP_NO_NMI)) {
+				err = -EINVAL;
+				goto err_alloc;
+			}
+		}
+
+		switch (attr.clockid) {
+		case CLOCK_REALTIME:
+			event->clock = &ktime_get_real_ns;
+			goto clock_set;
+
+		case CLOCK_BOOTTIME:
+			event->clock = &ktime_get_boot_ns;
+			goto clock_set;
+
+		case CLOCK_TAI:
+			event->clock = &ktime_get_tai_ns;
+			goto clock_set;
+
+		default:
+			/* XXX add: clock_id_valid() && clock_gettime_ns() ? */
+			err = -EINVAL;
+			goto err_alloc;
+		}
+	}
+clock_set:
+
 	if (group_leader &&
 	    (is_software_event(event) != is_software_event(group_leader))) {
 		if (is_software_event(event)) {
@@ -7599,6 +7662,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		if (group_leader->group_leader != group_leader)
 			goto err_context;
+
+		/* All events in a group should have the same clock */
+		if (group_leader->clock != event->clock)
+			goto err_context;
+
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:




* Re: [RFC][PATCH 2/2] perf: Add per event clockid support
  2015-02-20 14:29 ` [RFC][PATCH 2/2] perf: Add per event clockid support Peter Zijlstra
@ 2015-02-20 15:28   ` Pawel Moll
  2015-02-20 16:01     ` Peter Zijlstra
  2015-02-23  8:13   ` Adrian Hunter
  1 sibling, 1 reply; 10+ messages in thread
From: Pawel Moll @ 2015-02-20 15:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: adrian.hunter, john.stultz, mingo, eranian, linux-kernel, acme,
	dsahern, fweisbec, jolsa, namhyung, paulus, tglx, rostedt,
	sonnyrao, ak, vincent.weaver

On Fri, 2015-02-20 at 14:29 +0000, Peter Zijlstra wrote:
> The below patch makes the distinction between these two cases by
> adding perf_event_clock() which is used for the second case. It
> further makes this configurable on a per-event basis, but adds a few
> sanity checks such that we cannot combine events with different clocks
> in confusing ways.

The idea works for me (obviously :-)

> And since we then have per-event configurability we might as well
> retain the 'legacy' behaviour as a default.

Don't mind that at all.

> @@ -334,8 +335,7 @@ struct perf_event_attr {
>  	 */
>  	__u32	sample_stack_user;
>  
> -	/* Align to u64. */
> -	__u32	__reserved_2;
> +	__u32	clockid;

I thought about it, but was sort-of-afraid to propose it :-)

Now, one thing I'm not 100% sure about is it being unsigned, as
clockid_t is signed for a reason (negative values have meaning - e.g.
dynamic clocks, which could be useful in some circumstances). Of course
casting could be an answer, but is there any reason not to make it
__s32?
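
For reference, this is the convention that relies on the sign bit (a
sketch of the macros used by e.g. the PTP test tools):

	#define CLOCKFD			3
	#define FD_TO_CLOCKID(fd)	((~(clockid_t) (fd) << 3) | CLOCKFD)
	#define CLOCKID_TO_FD(clk)	((unsigned int) ~((clk) >> 3))

	/*
	 * e.g.: clockid_t clk = FD_TO_CLOCKID(open("/dev/ptp0", O_RDONLY));
	 * ~fd is negative for any valid fd, so dynamic clockids are < 0.
	 */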

> +		default:
> +			/* XXX add: clock_id_valid() && clock_gettime_ns() ? */
> +			err = -EINVAL;
> +			goto err_alloc;
> +		}

If you asked me, I'd say -EINVAL, no default.

Cheers!

Pawel



* Re: [RFC][PATCH 2/2] perf: Add per event clockid support
  2015-02-20 15:28   ` Pawel Moll
@ 2015-02-20 16:01     ` Peter Zijlstra
  0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2015-02-20 16:01 UTC (permalink / raw)
  To: Pawel Moll
  Cc: adrian.hunter, john.stultz, mingo, eranian, linux-kernel, acme,
	dsahern, fweisbec, jolsa, namhyung, paulus, tglx, rostedt,
	sonnyrao, ak, vincent.weaver

On Fri, Feb 20, 2015 at 03:28:49PM +0000, Pawel Moll wrote:
> On Fri, 2015-02-20 at 14:29 +0000, Peter Zijlstra wrote:
> > @@ -334,8 +335,7 @@ struct perf_event_attr {
> >  	 */
> >  	__u32	sample_stack_user;
> >  
> > -	/* Align to u64. */
> > -	__u32	__reserved_2;
> > +	__u32	clockid;
> 
> I thought about it, but was sort-of-afraid to propose it :-)
> 
> Now, one thing I'm not 100% sure about is it being unsigned, as
> clockid_t is signed for a reason (negative values have meaning - e.g.
> dynamic clocks, which could be useful in some circumstances). Of course
> casting could be an answer, but is there any reason not to make it
> __s32?

I did not spot that significance and cannot find mention of it in
clock_gettime(2) either, but I've no objection to making it __s32.

> > +		default:
> > +			/* XXX add: clock_id_valid() && clock_gettime_ns() ? */
> > +			err = -EINVAL;
> > +			goto err_alloc;
> > +		}
> 
> If you asked me, I'd say -EINVAL, no default.

Yeah, I should probably restructure that a wee bit.


* Re: [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns()
  2015-02-20 14:29 ` [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns() Peter Zijlstra
@ 2015-02-20 19:49   ` John Stultz
  2015-02-20 20:11     ` Peter Zijlstra
  2015-03-17 11:24     ` Peter Zijlstra
  0 siblings, 2 replies; 10+ messages in thread
From: John Stultz @ 2015-02-20 19:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paweł Moll, Adrian Hunter, Ingo Molnar, Stephane Eranian,
	lkml, Arnaldo Carvalho de Melo, David Ahern,
	Frédéric Weisbecker, Jiri Olsa, Namhyung Kim,
	Paul Mackerras, Thomas Gleixner, Steven Rostedt, Sonny Rao, ak,
	vincent.weaver

On Fri, Feb 20, 2015 at 6:29 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> By popular demand, add a MONOTONIC_RAW variant.
>
> The implementation is up for discussion; but given the need for the
> dummy thing I thought it was easiest to just always update the
> mono_raw clock state in timekeeping_update() for now, even though it's
> mostly wasted cycles.
>
> I suppose the right place would be a resume callback somewhere.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: John Stultz <john.stultz@linaro.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/timekeeping.h |    1 +
>  kernel/time/timekeeping.c   |   42 +++++++++++++++++++++++++++++++++++-------
>  2 files changed, 36 insertions(+), 7 deletions(-)
>
> --- a/include/linux/timekeeping.h
> +++ b/include/linux/timekeeping.h
> @@ -220,6 +220,7 @@ static inline u64 ktime_get_raw_ns(void)
>  }
>
>  extern u64 ktime_get_mono_fast_ns(void);
> +extern u64 ktime_get_mono_raw_fast_ns(void);
>
>  /*
>   * Timespec interfaces utilizing the ktime based ones
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -58,7 +58,8 @@ struct tk_fast {
>         struct tk_read_base     base[2];
>  };
>
> -static struct tk_fast tk_fast_mono ____cacheline_aligned;
> +static struct tk_fast tk_fast_mono     ____cacheline_aligned;
> +static struct tk_fast tk_fast_mono_raw ____cacheline_aligned;
>
>  /* flag for if timekeeping is suspended */
>  int __read_mostly timekeeping_suspended;
> @@ -267,18 +268,18 @@ static inline s64 timekeeping_get_ns_raw
>   * slightly wrong timestamp (a few nanoseconds). See
>   * @ktime_get_mono_fast_ns.
>   */
> -static void update_fast_timekeeper(struct tk_read_base *tkr)
> +static void update_fast_timekeeper(struct tk_fast *fast, struct tk_read_base *tkr)
>  {
> -       struct tk_read_base *base = tk_fast_mono.base;
> +       struct tk_read_base *base = fast->base;
>
>         /* Force readers off to base[1] */
> -       raw_write_seqcount_latch(&tk_fast_mono.seq);
> +       raw_write_seqcount_latch(&fast->seq);
>
>         /* Update base[0] */
>         memcpy(base, tkr, sizeof(*base));
>
>         /* Force readers back to base[0] */
> -       raw_write_seqcount_latch(&tk_fast_mono.seq);
> +       raw_write_seqcount_latch(&fast->seq);
>
>         /* Update base[1] */
>         memcpy(base + 1, base, sizeof(*base));
> @@ -332,6 +333,22 @@ u64 notrace ktime_get_mono_fast_ns(void)
>  }
>  EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
>
> +u64 notrace ktime_get_mono_raw_fast_ns(void)
> +{
> +       struct tk_read_base *tkr;
> +       unsigned int seq;
> +       u64 now;
> +
> +       do {
> +               seq = raw_read_seqcount(&tk_fast_mono_raw.seq);
> +               tkr = tk_fast_mono_raw.base + (seq & 0x01);
> +               now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);


So this doesn't look right. I think you want to use tk->base_raw and
timekeeping_get_ns_raw() here?


> +       } while (read_seqcount_retry(&tk_fast_mono_raw.seq, seq));
> +       return now;
> +}
> +EXPORT_SYMBOL_GPL(ktime_get_mono_raw_fast_ns);
> +
>  /* Suspend-time cycles value for halted fast timekeeper. */
>  static cycle_t cycles_at_suspend;
>
> @@ -358,7 +375,11 @@ static void halt_fast_timekeeper(struct
>         memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
>         cycles_at_suspend = tkr->read(tkr->clock);
>         tkr_dummy.read = dummy_clock_read;
> -       update_fast_timekeeper(&tkr_dummy);
> +       update_fast_timekeeper(&tk_fast_mono,     &tkr_dummy);
> +
> +       tkr_dummy.mult  = tkr->clock->mult;
> +       tkr_dummy.shift = tkr->clock->shift;
> +       update_fast_timekeeper(&tk_fast_mono_raw, &tkr_dummy);
>  }
>
>  #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
> @@ -475,6 +496,8 @@ static inline void tk_update_ktime_data(
>  /* must hold timekeeper_lock */
>  static void timekeeping_update(struct timekeeper *tk, unsigned int action)
>  {
> +       static struct tk_read_base tkr;
> +
>         if (action & TK_CLEAR_NTP) {
>                 tk->ntp_error = 0;
>                 ntp_clear();
> @@ -489,7 +512,12 @@ static void timekeeping_update(struct ti
>                 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
>                        sizeof(tk_core.timekeeper));
>
> -       update_fast_timekeeper(&tk->tkr);
> +       update_fast_timekeeper(&tk_fast_mono, &tk->tkr);
> +
> +       tkr = tk->tkr;
> +       tkr.mult  = tk->tkr.clock->mult;
> +       tkr.shift = tk->tkr.clock->shift;
> +       update_fast_timekeeper(&tk_fast_mono_raw, &tkr);

So this is sort of sneaky and subtle here, which will surely cause
problems later on. You're copying the original mult/shift pair into a
copy of the tkr, so you get similar results from timekeeping_get_ns()
as you'd want from timekeeping_get_ns_raw().  This results in multiple
ways of getting the raw clock.

I think it would be better to either add a new tkr structure for the
raw clock in the timekeeper, so you can directly copy it over, or
extend the tkr structure so it can contain the raw values as well.

Also, there's no real reason to have fast/non-fast versions of the raw
clock. Since it isn't affected by frequency changes, it can't have the
inconsistency issues the monotonic clock can see (which are documented
in the comment near ktime_get_mono_fast_ns()). So we can probably
condense these and avoid duplicative code.

thanks
-john


* Re: [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns()
  2015-02-20 19:49   ` John Stultz
@ 2015-02-20 20:11     ` Peter Zijlstra
  2015-03-17 11:24     ` Peter Zijlstra
  1 sibling, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2015-02-20 20:11 UTC (permalink / raw)
  To: John Stultz
  Cc: Paweł Moll, Adrian Hunter, Ingo Molnar, Stephane Eranian,
	lkml, Arnaldo Carvalho de Melo, David Ahern,
	Frédéric Weisbecker, Jiri Olsa, Namhyung Kim,
	Paul Mackerras, Thomas Gleixner, Steven Rostedt, Sonny Rao, ak,
	vincent.weaver

On Fri, Feb 20, 2015 at 11:49:49AM -0800, John Stultz wrote:
> On Fri, Feb 20, 2015 at 6:29 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > +u64 notrace ktime_get_mono_raw_fast_ns(void)
> > +{
> > +       struct tk_read_base *tkr;
> > +       unsigned int seq;
> > +       u64 now;
> > +
> > +       do {
> > +               seq = raw_read_seqcount(&tk_fast_mono_raw.seq);
> > +               tkr = tk_fast_mono_raw.base + (seq & 0x01);
> > +               now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
> 
> 
> So this doesn't look right. I think you want to use tk->base_raw and
> timekeeping_get_ns_raw() here?

No, the problem is that timekeeping_get_ns_raw() dereferences
tkr->clock. The idea was to, like ktime_get_mono_fast_ns(), only touch
the _1_ cacheline.

Clearly I've messed it up, but I didn't want it to go dereference
tkr->clock and pull all kinds of stuff from that second line.

> > +       tkr = tk->tkr;
> > +       tkr.mult  = tk->tkr.clock->mult;
> > +       tkr.shift = tk->tkr.clock->shift;
> > +       update_fast_timekeeper(&tk_fast_mono_raw, &tkr);
> 
> So this is sort of sneaky and subtle here, which will surely cause
> problems later on. You're copying the original mult/shift pair into a
> copy of the tkr, so you get similar results from timekeeping_get_ns()
> as you'd want from timekeeping_get_ns_raw().  This results in multiple
> ways of getting the raw clock.
> 
> I think it would be better to either add a new tkr structure for the
> raw clock in the timekeeper, so you can directly copy it over,

OK, this then.

> or
> extend the tkr structure so it can contain the raw values as well.

Can't; it would push tk_fast over the _1_ cacheline.

> Also, there's no real reason to have fast/non-fast versions of the raw
> clock. Since it isn't affected by frequency changes, it can't have the
> inconsistency issues the monotonic clock can see (which are documented
> in the comment near ktime_get_mono_fast_ns()). So we can probably
> condense these and avoid duplicative code.

The typical timekeeping_get_ns_raw() still got a seqcount around it
which can fail from NMI (inf loop for all).

So we do need the tk_fast dual-copy seqcount thing just the same, also as
per the above, cacheline cacheline cacheline :)

But yes, I think you're right in that we should be able to condense that
somewhat.


* Re: [RFC][PATCH 2/2] perf: Add per event clockid support
  2015-02-20 14:29 ` [RFC][PATCH 2/2] perf: Add per event clockid support Peter Zijlstra
  2015-02-20 15:28   ` Pawel Moll
@ 2015-02-23  8:13   ` Adrian Hunter
  1 sibling, 0 replies; 10+ messages in thread
From: Adrian Hunter @ 2015-02-23  8:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: pawel.moll, john.stultz, mingo, eranian, linux-kernel, acme,
	dsahern, fweisbec, jolsa, namhyung, paulus, tglx, rostedt,
	sonnyrao, ak, vincent.weaver

On 20/02/15 16:29, Peter Zijlstra wrote:
> While thinking on the whole clock discussion it occurred to me we have
> two distinct uses of time:
> 
>  1) the tracking of event/ctx/cgroup enabled/running/stopped times
>     which includes the self-monitoring support in struct
>     perf_event_mmap_page.
> 
>  2) the actual timestamps visible in the data records.
> 
> And we've been conflating them.
> 
> The first is all about tracking time deltas; nobody should really care
> in what time base that happens. It's all relative information; as long
> as it's internally consistent it works.
> 
> The second however is what people are worried about when having to
> merge their data with external sources. And here we have the
> discussion on MONOTONIC vs MONOTONIC_RAW etc..
> 
> Where MONOTONIC is good for correlating between machines (static
> offset), MONOTONIC_RAW is required for correlating against a fixed-rate
> hardware clock.
> 
> This means configurability; now 1) makes that hard because it needs to
> be internally consistent across groups of unrelated events; which is
> why we had to have a global perf_clock().
> 
> However, for 2) it doesn't really matter, perf itself doesn't care
> what it writes into the buffer.
> 
> The below patch makes the distinction between these two cases by
> adding perf_event_clock() which is used for the second case. It
> further makes this configurable on a per-event basis, but adds a few
> sanity checks such that we cannot combine events with different clocks
> in confusing ways.
> 
> And since we then have per-event configurability we might as well
> retain the 'legacy' behaviour as a default.

OK by me.



* Re: [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns()
  2015-02-20 19:49   ` John Stultz
  2015-02-20 20:11     ` Peter Zijlstra
@ 2015-03-17 11:24     ` Peter Zijlstra
  2015-03-18 19:48       ` John Stultz
  1 sibling, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2015-03-17 11:24 UTC (permalink / raw)
  To: John Stultz
  Cc: Paweł Moll, Adrian Hunter, Ingo Molnar, Stephane Eranian,
	lkml, Arnaldo Carvalho de Melo, David Ahern,
	Frédéric Weisbecker, Jiri Olsa, Namhyung Kim,
	Paul Mackerras, Thomas Gleixner, Steven Rostedt, Sonny Rao, ak,
	vincent.weaver

On Fri, Feb 20, 2015 at 11:49:49AM -0800, John Stultz wrote:
> On Fri, Feb 20, 2015 at 6:29 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > @@ -489,7 +512,12 @@ static void timekeeping_update(struct ti
> >                 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
> >                        sizeof(tk_core.timekeeper));
> >
> > -       update_fast_timekeeper(&tk->tkr);
> > +       update_fast_timekeeper(&tk_fast_mono, &tk->tkr);
> > +
> > +       tkr = tk->tkr;
> > +       tkr.mult  = tk->tkr.clock->mult;
> > +       tkr.shift = tk->tkr.clock->shift;
> > +       update_fast_timekeeper(&tk_fast_mono_raw, &tkr);
> 
> So this is sort of sneaky and subtle here, which will surely cause
> problems later on. You're copying the original mult/shift pair into a
> copy of the tkr, so you get similar results from timekeeping_get_ns()
> as you'd want from timekeeping_get_ns_raw().  This results in multiple
> ways of getting the raw clock.
> 
> I think it would be better to either add a new tkr structure for the
> raw clock in the timekeeper, so you can directly copy it over, or
> extend the tkr structure so it can contain the raw values as well.
> 
> Also, there's no real reason to have fast/non-fast versions of the raw
> clock. Since it isn't affected by frequency changes, it can't have the
> inconsistency issues the monotonic clock can see (which are documented
> in the comment near ktime_get_mono_fast_ns()). So we can probably
> condense these and avoid duplicative code.

So I was looking at this again; and I think we do indeed need a second
way to read mono_raw.

So the immediate problem is the tk_core.seq loop around
timekeeping_get_ns_raw(); if the NMI happens during the write side of
that we're stuck.
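
A sketch of that failure mode (illustrative, not the actual code):

	/*
	 * Plain seqcount, no latch:
	 *
	 *   write_seqcount_begin(&tk_core.seq);	seq is now odd
	 *     <NMI fires here>; the NMI handler does:
	 *       seq = read_seqcount_begin(&tk_core.seq);
	 *       read_seqcount_begin() spins until seq goes even, but
	 *       the interrupted writer cannot run again underneath
	 *       the NMI: infinite loop.
	 *   write_seqcount_end(&tk_core.seq);	never reached
	 *
	 * The latch in update_fast_timekeeper() avoids this: readers
	 * never wait, they select base[seq & 1], and because the
	 * writer cannot make progress underneath the NMI the retry
	 * check always terminates.
	 */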

Now the reason we need that seqcount is because of tk->tkr.cycle_last,
which, afaict, is the only bit that mono_raw needs that changes. With
the possible exception of the suspend/resume paths; we also need to deal
with NMIs requesting time during those.

And we need this silly cycle_last business because of short clocks,
which is something the generic code needs to deal with. And because we
have a bit of an error margin on that we can indeed use a 'stale'
cycle_last.
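
Roughly, this is the read path in question (a sketch, not a quote of
the code):

	/*
	 * A narrow counter is read relative to cycle_last and masked,
	 * so deltas must be taken before the counter wraps past it.
	 * A slightly stale cycle_last only inflates the delta a bit,
	 * which the error margin absorbs.
	 */
	cycle_t now   = tkr->read(tkr->clock);
	cycle_t delta = (now - tkr->cycle_last) & tkr->mask;
	u64 ns = (delta * tkr->mult) >> tkr->shift;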

As to the above suggestions:

 - we cannot add another tkr structure to struct timekeeper because that
   would duplicate a number of fields, including the above mentioned
   cycle_last, and having two of those is bound to cause confusion; and,

 - we cannot extend tkr because that would make it overflow the
   cacheline.

So as it is, I do not see any other way than the already proposed patch;
we even need the update from timekeeping_update() because we need an
up-to-date cycle_last.

So all I can really do is write a better changelog.


* Re: [RFC][PATCH 1/2] time: Add ktime_get_mono_raw_fast_ns()
  2015-03-17 11:24     ` Peter Zijlstra
@ 2015-03-18 19:48       ` John Stultz
  0 siblings, 0 replies; 10+ messages in thread
From: John Stultz @ 2015-03-18 19:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paweł Moll, Adrian Hunter, Ingo Molnar, Stephane Eranian,
	lkml, Arnaldo Carvalho de Melo, David Ahern,
	Frédéric Weisbecker, Jiri Olsa, Namhyung Kim,
	Paul Mackerras, Thomas Gleixner, Steven Rostedt, Sonny Rao, ak,
	vincent.weaver

On Tue, Mar 17, 2015 at 4:24 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Feb 20, 2015 at 11:49:49AM -0800, John Stultz wrote:
>> On Fri, Feb 20, 2015 at 6:29 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> > @@ -489,7 +512,12 @@ static void timekeeping_update(struct ti
>> >                 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
>> >                        sizeof(tk_core.timekeeper));
>> >
>> > -       update_fast_timekeeper(&tk->tkr);
>> > +       update_fast_timekeeper(&tk_fast_mono, &tk->tkr);
>> > +
>> > +       tkr = tk->tkr;
>> > +       tkr.mult  = tk->tkr.clock->mult;
>> > +       tkr.shift = tk->tkr.clock->shift;
>> > +       update_fast_timekeeper(&tk_fast_mono_raw, &tkr);
>>
>> So this is sort of sneaky and subtle here, which will surely cause
>> problems later on. You're copying the original mult/shift pair into a
>> copy of the tkr, so you get similar results from timekeeping_get_ns()
>> as you'd want from timekeeping_get_ns_raw().  This results in multiple
>> ways of getting the raw clock.
>>
>> I think it would be better to either add a new tkr structure for the
>> raw clock in the timekeeper, so you can directly copy it over, or
>> extend the tkr structure so it can contain the raw values as well.
>>
>> Also, there's no real reason to have fast/non-fast versions of the raw
>> clock. Since it isn't affected by frequency changes, it can't have the
>> inconsistency issues the monotonic clock can see (which are documented
>> in the comment near ktime_get_mono_fast_ns()). So we can probably
>> condense these and avoid duplicative code.
>
> So I was looking at this again; and I think we do indeed need a second
> way to read mono_raw.
>
> So the immediate problem is the tk_core.seq loop around
> timekeeping_get_ns_raw(); if the NMI happens during the write side of
> that we're stuck.
>
> Now the reason we need that seqcount is because of tk->tkr.cycle_last,
> which, afaict, is the only bit that mono_raw needs that changes. With
> the possible exception of the suspend/resume paths; we also need to deal
> with NMIs requesting time during those.
>
> And we need this silly cycle_last business because of short clocks,
> which is something the generic code needs to deal with. And because we
> have a bit of an error margin on that we can indeed use a 'stale'
> cycle_last.

Sorry.. I'm feeling daft here, but this isn't making sense to me.

I'm suggesting we can use the same fast mode to replace the normal
one, since there's no frequency changes on the raw clock, so using a
stale cycle_last isn't problematic. For fast-wrapping clocks, updating
cycle_last regularly is important, but we should still be ok if we
grab the previous cycle_last mid-update.

In fact, looking at it, I'm starting to think we should even be
reusing the ktime_get_mono_fast_ns() (only under a conventional
seqlock read) for the normal ktime_get calls... Just so we can avoid
all this repetitive and slightly different logic.
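
Something along these lines (hypothetical sketch, untested):

	/* ktime_get() as a seqlock-guarded wrapper around the same
	 * fast reader, instead of carrying its own read loop. */
	ktime_t ktime_get(void)
	{
		unsigned int seq;
		u64 ns;

		do {
			seq = read_seqcount_begin(&tk_core.seq);
			ns = ktime_get_mono_fast_ns();
		} while (read_seqcount_retry(&tk_core.seq, seq));

		return ns_to_ktime(ns);
	}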

> As to the above suggestions:
>
>  - we cannot add another tkr structure to struct timekeeper because that
>    would duplicate a number of fields, including the above mentioned
>    cycle_last, and having two of those is bound to cause confusion; and,

I'm not sure if duplicating a few entries in the tkr structure is that
confusing compared to the subtle update trick you're using though...
Especially since we're duplicating it all in the fast_tk structures.
And base_mono in the tkr could just be renamed to base, which would
allow us to drop the base_raw in the timekeeper. The
clock/read/mask/last fields would be duplicated, but if needed we
could rework the fast_tkr update to pull common elements from the
timekeeper.

>  - we cannot extend tkr because that would make it overflow the
>    cacheline.
>
> So as it is, I do not see any other way than the already proposed patch;
> we even need the update from timekeeping_update() because we need an
> up-to-date cycle_last.
>
> So all I can really do is write a better changelog.

So.. there was still the bug w/ using tk->base_mono and
timekeeping_get_ns() rather than tk->base_raw and
timekeeping_get_ns_raw(), no?   I get the tkr->mult/shift values are
faked so get_ns() is technically ok, but the base seems way wrong.

thanks
-john

