All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] pending patches
@ 2009-05-08 16:52 Peter Zijlstra
  2009-05-08 16:52 ` [PATCH 1/5] hrtimer: per-cpu cached values of ktime Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-08 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra,
	Thomas Gleixner

The first two cut down the tick overhead some.
The third ought to fix a problem Corey ran into.
The last two add some record data we talked about some time ago
but never got around to adding.

-- 


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 1/5] hrtimer: per-cpu cached values of ktime
  2009-05-08 16:52 [PATCH 0/5] pending patches Peter Zijlstra
@ 2009-05-08 16:52 ` Peter Zijlstra
  2009-05-08 23:10   ` Andrew Morton
  2009-05-09 19:45   ` Linus Torvalds
  2009-05-08 16:52 ` [PATCH 2/5] perf_counter: optimize perf_counter_task_tick() Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-08 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra,
	Thomas Gleixner

[-- Attachment #1: opt-tick-ktime_get.patch --]
[-- Type: text/plain, Size: 4197 bytes --]

The regular (hrtimer driven) tick calls ktime_get() 4 times:

hrtimer_interrupt()
  ktime_get()
  __run_hrtimer()
    tick_sched_timer()
      ktime_get()
      update_process_times()
        run_local_timers()
        rcu_pending()
        printk_tick()
        scheduler_tick()
          sched_clock_tick()
            ktime_get()
          perf_counter_task_tick()
        run_posix_cpu_timers()
      profile_tick()
      hrtimer_forward()
    hrtimer_enqueue()
      tick_program_event()
        tick_dev_program_event()
          ktime_get()

Reduce that to 2 by caching the 1st ktime_get(). By clearing the state on
irq_exit() this always works, even for !hrtimer ticks.

Getting rid of the last ktime_get() is a bit more involved as that code can
also get called from outside of irq context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/hrtimer.h  |    5 +++++
 kernel/hrtimer.c         |   22 ++++++++++++++++++++--
 kernel/sched_clock.c     |    3 ++-
 kernel/softirq.c         |    2 ++
 kernel/time/tick-sched.c |    2 +-
 5 files changed, 30 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/hrtimer.h
===================================================================
--- linux-2.6.orig/include/linux/hrtimer.h
+++ linux-2.6/include/linux/hrtimer.h
@@ -173,8 +173,13 @@ struct hrtimer_cpu_base {
 	int				hres_active;
 	unsigned long			nr_events;
 #endif
+	ktime_t				ktime_irq;
+	int				ktime_irq_set;
 };
 
+extern ktime_t ktime_irq_get(void);
+extern void ktime_irq_clear(void);
+
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->_expires = time;
Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -88,7 +88,6 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
  */
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
-
 	.clock_base =
 	{
 		{
@@ -104,6 +103,25 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, 
 	}
 };
 
+ktime_t ktime_irq_get(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+	if (!cpu_base->ktime_irq_set) {
+		cpu_base->ktime_irq = ktime_get();
+		cpu_base->ktime_irq_set = 1;
+	}
+
+	return cpu_base->ktime_irq;
+}
+
+void ktime_irq_clear(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+	cpu_base->ktime_irq_set = 0;
+}
+
 /**
  * ktime_get_ts - get the monotonic clock in timespec format
  * @ts:		pointer to timespec variable
@@ -1222,7 +1240,7 @@ void hrtimer_interrupt(struct clock_even
 	if (!(++nr_retries % 5))
 		hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
 
-	now = ktime_get();
+	now = ktime_irq_get();
 
 	expires_next.tv64 = KTIME_MAX;
 
Index: linux-2.6/kernel/sched_clock.c
===================================================================
--- linux-2.6.orig/kernel/sched_clock.c
+++ linux-2.6/kernel/sched_clock.c
@@ -219,7 +219,7 @@ void sched_clock_tick(void)
 	WARN_ON_ONCE(!irqs_disabled());
 
 	scd = this_scd();
-	now_gtod = ktime_to_ns(ktime_get());
+	now_gtod = ktime_to_ns(ktime_irq_get());
 	now = sched_clock();
 
 	__raw_spin_lock(&scd->lock);
@@ -246,6 +246,7 @@ void sched_clock_idle_wakeup_event(u64 d
 	if (timekeeping_suspended)
 		return;
 
+	ktime_irq_clear();
 	sched_clock_tick();
 	touch_softlockup_watchdog();
 }
Index: linux-2.6/kernel/softirq.c
===================================================================
--- linux-2.6.orig/kernel/softirq.c
+++ linux-2.6/kernel/softirq.c
@@ -282,6 +282,8 @@ void irq_enter(void)
 		tick_check_idle(cpu);
 	} else
 		__irq_enter();
+
+	ktime_irq_clear();
 }
 
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -629,7 +629,7 @@ static enum hrtimer_restart tick_sched_t
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
 	struct pt_regs *regs = get_irq_regs();
-	ktime_t now = ktime_get();
+	ktime_t now = ktime_irq_get();
 	int cpu = smp_processor_id();
 
 #ifdef CONFIG_NO_HZ

-- 


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 2/5] perf_counter: optimize perf_counter_task_tick()
  2009-05-08 16:52 [PATCH 0/5] pending patches Peter Zijlstra
  2009-05-08 16:52 ` [PATCH 1/5] hrtimer: per-cpu cached values of ktime Peter Zijlstra
@ 2009-05-08 16:52 ` Peter Zijlstra
  2009-05-08 18:39   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
  2009-05-08 16:52 ` [PATCH 3/5] perf_counter: rework ioctl()s Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-08 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra,
	Thomas Gleixner

[-- Attachment #1: perf_counter-opt-tick.patch --]
[-- Type: text/plain, Size: 1813 bytes --]

perf_counter_task_tick() does way too much work to find out there's nothing to
do. Provide an easy short-circuit for the normal case where there are no
counters on the system.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c |   13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -39,6 +39,7 @@ int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
+static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_tracking __read_mostly;
 static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
@@ -1076,8 +1077,14 @@ static void rotate_ctx(struct perf_count
 
 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	if (!atomic_read(&nr_counters))
+		return;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	ctx = &curr->perf_counter_ctx;
 
 	perf_counter_cpu_sched_out(cpuctx);
 	perf_counter_task_sched_out(curr, cpu);
@@ -1197,6 +1204,7 @@ static void free_counter(struct perf_cou
 {
 	perf_pending_sync(counter);
 
+	atomic_dec(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_dec(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)
@@ -2861,6 +2869,7 @@ done:
 
 	counter->pmu = pmu;
 
+	atomic_inc(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_inc(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)

-- 


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-08 16:52 [PATCH 0/5] pending patches Peter Zijlstra
  2009-05-08 16:52 ` [PATCH 1/5] hrtimer: per-cpu cached values of ktime Peter Zijlstra
  2009-05-08 16:52 ` [PATCH 2/5] perf_counter: optimize perf_counter_task_tick() Peter Zijlstra
@ 2009-05-08 16:52 ` Peter Zijlstra
  2009-05-08 18:39   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
                     ` (2 more replies)
  2009-05-08 16:52 ` [PATCH 4/5] perf_counter: PERF_RECORD_CONFIG Peter Zijlstra
  2009-05-08 16:52 ` [PATCH 5/5] perf_counter: PERF_RECORD_CPU Peter Zijlstra
  4 siblings, 3 replies; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-08 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra,
	Thomas Gleixner

[-- Attachment #1: perf_counter-family.patch --]
[-- Type: text/plain, Size: 5485 bytes --]

Corey noticed that ioctl()s on grouped counters didn't work on the whole group.
This extends the ioctl() interface to take a second argument that is
interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle
the behaviour.

Having this flag gives the greatest flexibility, allowing you to individually
enable/disable/reset counters in a group, or all together.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/perf_counter.h |   10 ++--
 kernel/perf_counter.c        |  104 +++++++++++++++++++++++--------------------
 2 files changed, 65 insertions(+), 49 deletions(-)

Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *co
 	 * add it straight to the context's counter list, or to the group
 	 * leader's sibling list:
 	 */
-	if (counter->group_leader == counter)
+	if (group_leader == counter)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 	else {
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
@@ -385,24 +385,6 @@ static void perf_counter_disable(struct 
 	spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Disable a counter and all its children.
- */
-static void perf_counter_disable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_disable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_disable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
@@ -753,24 +735,6 @@ static int perf_counter_refresh(struct p
 	return 0;
 }
 
-/*
- * Enable a counter and all its children.
- */
-static void perf_counter_enable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_enable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_enable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
@@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct fil
 
 static void perf_counter_reset(struct perf_counter *counter)
 {
+	(void)perf_counter_read(counter);
 	atomic_set(&counter->count, 0);
+	perf_counter_update_userpage(counter);
+}
+
+static void perf_counter_for_each_sibling(struct perf_counter *counter,
+					  void (*func)(struct perf_counter *))
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
+
+	spin_lock_irq(&ctx->lock);
+	counter = counter->group_leader;
+
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		func(sibling);
+	spin_unlock_irq(&ctx->lock);
+}
+
+static void perf_counter_for_each_child(struct perf_counter *counter,
+					void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	func(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		func(child);
+	mutex_unlock(&counter->mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+				  void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	perf_counter_for_each_sibling(counter, func);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_for_each_sibling(child, func);
+	mutex_unlock(&counter->mutex);
 }
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
-	int err = 0;
+	void (*func)(struct perf_counter *);
+	u32 flags = arg;
 
 	switch (cmd) {
 	case PERF_COUNTER_IOC_ENABLE:
-		perf_counter_enable_family(counter);
+		func = perf_counter_enable;
 		break;
 	case PERF_COUNTER_IOC_DISABLE:
-		perf_counter_disable_family(counter);
-		break;
-	case PERF_COUNTER_IOC_REFRESH:
-		err = perf_counter_refresh(counter, arg);
+		func = perf_counter_disable;
 		break;
 	case PERF_COUNTER_IOC_RESET:
-		perf_counter_reset(counter);
+		func = perf_counter_reset;
 		break;
+
+	case PERF_COUNTER_IOC_REFRESH:
+		return perf_counter_refresh(counter, arg);
 	default:
-		err = -ENOTTY;
+		return -ENOTTY;
 	}
-	return err;
+
+	if (flags & PERF_IOC_FLAG_GROUP)
+		perf_counter_for_each(counter, func);
+	else
+		perf_counter_for_each_child(counter, func);
+
+	return 0;
 }
 
 /*
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -157,10 +157,14 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
+#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
 #define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+
+enum perf_counter_ioc_flags {
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
 
 /*
  * Structure of the page that can be mapped via mmap

-- 


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 4/5] perf_counter: PERF_RECORD_CONFIG
  2009-05-08 16:52 [PATCH 0/5] pending patches Peter Zijlstra
                   ` (2 preceding siblings ...)
  2009-05-08 16:52 ` [PATCH 3/5] perf_counter: rework ioctl()s Peter Zijlstra
@ 2009-05-08 16:52 ` Peter Zijlstra
  2009-05-08 18:39   ` [tip:perfcounters/core] perf_counter: add PERF_RECORD_CONFIG tip-bot for Peter Zijlstra
  2009-05-08 16:52 ` [PATCH 5/5] perf_counter: PERF_RECORD_CPU Peter Zijlstra
  4 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-08 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra,
	Thomas Gleixner

[-- Attachment #1: perf_counter-config.patch --]
[-- Type: text/plain, Size: 1843 bytes --]

Much like PERF_RECORD_GROUP records the hw_event.config to identify the
values, allow recording this for all counters.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/perf_counter.h |    2 ++
 kernel/perf_counter.c        |    8 ++++++++
 2 files changed, 10 insertions(+)

Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -104,6 +104,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_ADDR	= 1U << 3,
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
+	PERF_RECORD_CONFIG	= 1U << 6,
 };
 
 /*
@@ -258,6 +259,7 @@ enum perf_event_type {
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
+	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1992,6 +1992,11 @@ static void perf_counter_output(struct p
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CONFIG) {
+		header.type |= PERF_RECORD_CONFIG;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2027,6 +2032,9 @@ static void perf_counter_output(struct p
 	if (record_type & PERF_RECORD_ADDR)
 		perf_output_put(&handle, addr);
 
+	if (record_type & PERF_RECORD_CONFIG)
+		perf_output_put(&handle, counter->hw_event.config);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */

-- 


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 5/5] perf_counter: PERF_RECORD_CPU
  2009-05-08 16:52 [PATCH 0/5] pending patches Peter Zijlstra
                   ` (3 preceding siblings ...)
  2009-05-08 16:52 ` [PATCH 4/5] perf_counter: PERF_RECORD_CONFIG Peter Zijlstra
@ 2009-05-08 16:52 ` Peter Zijlstra
  2009-05-08 18:40   ` [tip:perfcounters/core] perf_counter: add PERF_RECORD_CPU tip-bot for Peter Zijlstra
  4 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-08 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra,
	Thomas Gleixner

[-- Attachment #1: perf_counter-cpu.patch --]
[-- Type: text/plain, Size: 2281 bytes --]

Allow recording the CPU number the event was generated on.

RFC: this leaves a u32 as reserved, should we fill in the node_id() there,
     or leave this open for future extension, as userspace can already easily
     do the cpu->node mapping if needed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/perf_counter.h |    2 ++
 kernel/perf_counter.c        |   13 +++++++++++++
 2 files changed, 15 insertions(+)

Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -105,6 +105,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
 	PERF_RECORD_CONFIG	= 1U << 6,
+	PERF_RECORD_CPU		= 1U << 7,
 };
 
 /*
@@ -260,6 +261,7 @@ enum perf_event_type {
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
 	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
+	 * 	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1954,6 +1954,9 @@ static void perf_counter_output(struct p
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
+	struct {
+		u32 cpu, reserved;
+	} cpu_entry;
 
 	header.type = 0;
 	header.size = sizeof(header);
@@ -1997,6 +2000,13 @@ static void perf_counter_output(struct p
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CPU) {
+		header.type |= PERF_RECORD_CPU;
+		header.size += sizeof(cpu_entry);
+
+		cpu_entry.cpu = raw_smp_processor_id();
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2035,6 +2045,9 @@ static void perf_counter_output(struct p
 	if (record_type & PERF_RECORD_CONFIG)
 		perf_output_put(&handle, counter->hw_event.config);
 
+	if (record_type & PERF_RECORD_CPU)
+		perf_output_put(&handle, cpu_entry);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */

-- 


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [tip:perfcounters/core] perf_counter: optimize perf_counter_task_tick()
  2009-05-08 16:52 ` [PATCH 2/5] perf_counter: optimize perf_counter_task_tick() Peter Zijlstra
@ 2009-05-08 18:39   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 31+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-08 18:39 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor, mingo

Commit-ID:  7fc23a5380797012e92a9633169440f2f4a21253
Gitweb:     http://git.kernel.org/tip/7fc23a5380797012e92a9633169440f2f4a21253
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 8 May 2009 18:52:21 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 8 May 2009 20:36:57 +0200

perf_counter: optimize perf_counter_task_tick()

perf_counter_task_tick() does way too much work to find out
there's nothing to do. Provide an easy short-circuit for the
normal case where there are no counters on the system.

[ Impact: micro-optimization ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170028.750619201@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 kernel/perf_counter.c |   13 +++++++++++--
 1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 60e55f0..fdb0d24 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -39,6 +39,7 @@ int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
+static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_tracking __read_mostly;
 static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
@@ -1076,8 +1077,14 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 
 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	if (!atomic_read(&nr_counters))
+		return;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	ctx = &curr->perf_counter_ctx;
 
 	perf_counter_cpu_sched_out(cpuctx);
 	perf_counter_task_sched_out(curr, cpu);
@@ -1197,6 +1204,7 @@ static void free_counter(struct perf_counter *counter)
 {
 	perf_pending_sync(counter);
 
+	atomic_dec(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_dec(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)
@@ -2861,6 +2869,7 @@ done:
 
 	counter->pmu = pmu;
 
+	atomic_inc(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_inc(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [tip:perfcounters/core] perf_counter: rework ioctl()s
  2009-05-08 16:52 ` [PATCH 3/5] perf_counter: rework ioctl()s Peter Zijlstra
@ 2009-05-08 18:39   ` tip-bot for Peter Zijlstra
  2009-05-11  1:29   ` [PATCH 3/5] " Paul Mackerras
  2009-05-11 23:58   ` Arnd Bergmann
  2 siblings, 0 replies; 31+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-08 18:39 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor, mingo

Commit-ID:  3df5edad87a998273aa5a9a8c728c05d855ad00e
Gitweb:     http://git.kernel.org/tip/3df5edad87a998273aa5a9a8c728c05d855ad00e
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 8 May 2009 18:52:22 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 8 May 2009 20:36:58 +0200

perf_counter: rework ioctl()s

Corey noticed that ioctl()s on grouped counters didn't work on
the whole group. This extends the ioctl() interface to take a
second argument that is interpreted as a flags field. We then
provide PERF_IOC_FLAG_GROUP to toggle the behaviour.

Having this flag gives the greatest flexibility, allowing you
to individually enable/disable/reset counters in a group, or
all together.

[ Impact: fix group counter enable/disable semantics ]

Reported-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090508170028.837558214@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 include/linux/perf_counter.h |   10 +++-
 kernel/perf_counter.c        |  104 +++++++++++++++++++++++------------------
 2 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 00081d8..88f863e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -157,10 +157,14 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
+#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
 #define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+
+enum perf_counter_ioc_flags {
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
 
 /*
  * Structure of the page that can be mapped via mmap
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fdb0d24..f4883f1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	 * add it straight to the context's counter list, or to the group
 	 * leader's sibling list:
 	 */
-	if (counter->group_leader == counter)
+	if (group_leader == counter)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 	else {
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
@@ -385,24 +385,6 @@ static void perf_counter_disable(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Disable a counter and all its children.
- */
-static void perf_counter_disable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_disable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_disable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
@@ -753,24 +735,6 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh)
 	return 0;
 }
 
-/*
- * Enable a counter and all its children.
- */
-static void perf_counter_enable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_enable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_enable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
@@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 
 static void perf_counter_reset(struct perf_counter *counter)
 {
+	(void)perf_counter_read(counter);
 	atomic_set(&counter->count, 0);
+	perf_counter_update_userpage(counter);
+}
+
+static void perf_counter_for_each_sibling(struct perf_counter *counter,
+					  void (*func)(struct perf_counter *))
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
+
+	spin_lock_irq(&ctx->lock);
+	counter = counter->group_leader;
+
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		func(sibling);
+	spin_unlock_irq(&ctx->lock);
+}
+
+static void perf_counter_for_each_child(struct perf_counter *counter,
+					void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	func(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		func(child);
+	mutex_unlock(&counter->mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+				  void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	perf_counter_for_each_sibling(counter, func);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_for_each_sibling(child, func);
+	mutex_unlock(&counter->mutex);
 }
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
-	int err = 0;
+	void (*func)(struct perf_counter *);
+	u32 flags = arg;
 
 	switch (cmd) {
 	case PERF_COUNTER_IOC_ENABLE:
-		perf_counter_enable_family(counter);
+		func = perf_counter_enable;
 		break;
 	case PERF_COUNTER_IOC_DISABLE:
-		perf_counter_disable_family(counter);
-		break;
-	case PERF_COUNTER_IOC_REFRESH:
-		err = perf_counter_refresh(counter, arg);
+		func = perf_counter_disable;
 		break;
 	case PERF_COUNTER_IOC_RESET:
-		perf_counter_reset(counter);
+		func = perf_counter_reset;
 		break;
+
+	case PERF_COUNTER_IOC_REFRESH:
+		return perf_counter_refresh(counter, arg);
 	default:
-		err = -ENOTTY;
+		return -ENOTTY;
 	}
-	return err;
+
+	if (flags & PERF_IOC_FLAG_GROUP)
+		perf_counter_for_each(counter, func);
+	else
+		perf_counter_for_each_child(counter, func);
+
+	return 0;
 }
 
 /*

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [tip:perfcounters/core] perf_counter: add PERF_RECORD_CONFIG
  2009-05-08 16:52 ` [PATCH 4/5] perf_counter: PERF_RECORD_CONFIG Peter Zijlstra
@ 2009-05-08 18:39   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 31+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-08 18:39 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor, mingo

Commit-ID:  a85f61abe11a46553c4562e74edb27ebc782aeb7
Gitweb:     http://git.kernel.org/tip/a85f61abe11a46553c4562e74edb27ebc782aeb7
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 8 May 2009 18:52:23 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 8 May 2009 20:36:58 +0200

perf_counter: add PERF_RECORD_CONFIG

Much like PERF_RECORD_GROUP records the hw_event.config to
identify the values, allow recording this for all counters.

[ Impact: extend perfcounter output record format ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170028.923228280@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 include/linux/perf_counter.h |    2 ++
 kernel/perf_counter.c        |    8 ++++++++
 2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 88f863e..0e6303d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -104,6 +104,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_ADDR	= 1U << 3,
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
+	PERF_RECORD_CONFIG	= 1U << 6,
 };
 
 /*
@@ -258,6 +259,7 @@ enum perf_event_type {
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
+	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f4883f1..c615f52 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1994,6 +1994,11 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CONFIG) {
+		header.type |= PERF_RECORD_CONFIG;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2029,6 +2034,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_ADDR)
 		perf_output_put(&handle, addr);
 
+	if (record_type & PERF_RECORD_CONFIG)
+		perf_output_put(&handle, counter->hw_event.config);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [tip:perfcounters/core] perf_counter: add PERF_RECORD_CPU
  2009-05-08 16:52 ` [PATCH 5/5] perf_counter: PERF_RECORD_CPU Peter Zijlstra
@ 2009-05-08 18:40   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 31+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-08 18:40 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor, mingo

Commit-ID:  f370e1e2f195ec1e6420e26fc83e0319595db578
Gitweb:     http://git.kernel.org/tip/f370e1e2f195ec1e6420e26fc83e0319595db578
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 8 May 2009 18:52:24 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 8 May 2009 20:36:59 +0200

perf_counter: add PERF_RECORD_CPU

Allow recording the CPU number the event was generated on.

RFC: this leaves a u32 as reserved, should we fill in the
     node_id() there, or leave this open for future extension,
     as userspace can already easily do the cpu->node mapping
     if needed.

[ Impact: extend perfcounter output record format ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170029.008627711@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 include/linux/perf_counter.h |    2 ++
 kernel/perf_counter.c        |   13 +++++++++++++
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0e6303d..614f921 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -105,6 +105,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
 	PERF_RECORD_CONFIG	= 1U << 6,
+	PERF_RECORD_CPU		= 1U << 7,
 };
 
 /*
@@ -260,6 +261,7 @@ enum perf_event_type {
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
 	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
+	 * 	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c615f52..d850a1f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1956,6 +1956,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
+	struct {
+		u32 cpu, reserved;
+	} cpu_entry;
 
 	header.type = 0;
 	header.size = sizeof(header);
@@ -1999,6 +2002,13 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CPU) {
+		header.type |= PERF_RECORD_CPU;
+		header.size += sizeof(cpu_entry);
+
+		cpu_entry.cpu = raw_smp_processor_id();
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2037,6 +2047,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_CONFIG)
 		perf_output_put(&handle, counter->hw_event.config);
 
+	if (record_type & PERF_RECORD_CPU)
+		perf_output_put(&handle, cpu_entry);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/5] hrtimer: per-cpu cached values of ktime
  2009-05-08 16:52 ` [PATCH 1/5] hrtimer: per-cpu cached values of ktime Peter Zijlstra
@ 2009-05-08 23:10   ` Andrew Morton
  2009-05-09 19:45   ` Linus Torvalds
  1 sibling, 0 replies; 31+ messages in thread
From: Andrew Morton @ 2009-05-08 23:10 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: mingo, paulus, cjashfor, linux-kernel, a.p.zijlstra, tglx

On Fri, 08 May 2009 18:52:20 +0200
Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> --- linux-2.6.orig/include/linux/hrtimer.h
> +++ linux-2.6/include/linux/hrtimer.h
> @@ -173,8 +173,13 @@ struct hrtimer_cpu_base {
>  	int				hres_active;
>  	unsigned long			nr_events;
>  #endif
> +	ktime_t				ktime_irq;
> +	int				ktime_irq_set;
>  };

struct hrtimer_cpu_base has nice kerneldoc which (used to) describe
its fields.

Please please please always always always comment data structures.  The
bang-for-your-comment-buck is higher than anywhere else.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/5] hrtimer: per-cpu cached values of ktime
  2009-05-08 16:52 ` [PATCH 1/5] hrtimer: per-cpu cached values of ktime Peter Zijlstra
  2009-05-08 23:10   ` Andrew Morton
@ 2009-05-09 19:45   ` Linus Torvalds
  2009-05-09 19:59     ` Peter Zijlstra
  1 sibling, 1 reply; 31+ messages in thread
From: Linus Torvalds @ 2009-05-09 19:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Paul Mackerras, Corey Ashford, linux-kernel,
	Thomas Gleixner



On Fri, 8 May 2009, Peter Zijlstra wrote:
>
> The regular (hrtimer driven) tick calls ktime_get() 4 times:
> 
> hrtimer_interupt()
>   ktimer_get()
>   __run_hrtimer()
>     tick_sched_timer()
>       ktimer_get()
>       update_process_times()
>         run_local_timers()
>         rcu_pending()
>         printk_tick()
>         scheduler_tick()
>           sched_clock_tick()
>             ktime_get()
>           perf_counter_task_tick()
>         run_posix_cpu_timers()
>       profile_tick()
>       hrtimer_forward()
>     hrtimer_enqueue()
>       tick_program_event()
>         tick_dev_program_event()
>           ktime_get()
> 
> Reduce that to 2 by caching the 1st ktime_get(). By clearing the state on
> irq_exit() this always works, even for !hrtimer ticks.

Ugh. This looks pretty horrid. That special-case cache is just nasty.

Would it be entirely impossible to just instead do ktime_get() once at the 
top and just pass it down. Yes, yes, it would involve calling convention 
changes, but that sounds way saner than having an ad-hoc cache like this.

With this, you now have magic rules for when you can use the "get from 
cache" version, and time goes backwards if you mix them by mistake (ie you 
use a non-caching "ktime_get()" in between two caching "ktime_irq_get()" 
calls, and now time will have gone backwards from the second to the third 
case).

It just seems to be very hacky.

			Linus

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/5] hrtimer: per-cpu cached values of ktime
  2009-05-09 19:45   ` Linus Torvalds
@ 2009-05-09 19:59     ` Peter Zijlstra
  0 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-09 19:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Paul Mackerras, Corey Ashford, linux-kernel,
	Thomas Gleixner

On Sat, 2009-05-09 at 12:45 -0700, Linus Torvalds wrote:
> 
> On Fri, 8 May 2009, Peter Zijlstra wrote:
> >
> > The regular (hrtimer driven) tick calls ktime_get() 4 times:
> > 
> > hrtimer_interupt()
> >   ktimer_get()
> >   __run_hrtimer()
> >     tick_sched_timer()
> >       ktimer_get()
> >       update_process_times()
> >         run_local_timers()
> >         rcu_pending()
> >         printk_tick()
> >         scheduler_tick()
> >           sched_clock_tick()
> >             ktime_get()
> >           perf_counter_task_tick()
> >         run_posix_cpu_timers()
> >       profile_tick()
> >       hrtimer_forward()
> >     hrtimer_enqueue()
> >       tick_program_event()
> >         tick_dev_program_event()
> >           ktime_get()
> > 
> > Reduce that to 2 by caching the 1st ktime_get(). By clearing the state on
> > irq_exit() this always works, even for !hrtimer ticks.
> 
> Ugh. This looks pretty horrid. That special-case cache is just nasty.
> 
> Would it be entirely impossible to just instead do ktime_get() once at the 
> top and just pass it down. Yes, yes, it would involve calling convention 
> changes, but that sounds way saner than having an ad-hoc cache like this.
> 
> With this, you now have magic rules for when you can use the "get from 
> cache" version, and time goes backwards if you mix them by mistake (ie you 
> use a non-caching "ktime_get()" in between two caching "ktime_irq_get()" 
> calls, and now time will have gone backwards from the second to the third 
> case).
> 
> It just seems to be very hacky.

Agreed.. but that would mean passing it down to every hrtimer callback
function, most of whom won't be interested in it. Furthermore, we'd
have to audit all other 'legacy' timer functions -- but so be it I
guess.

Also, I've had a report that this patch causes some funnies, so there's
definitely something fishy.

I'll look at doing this differently.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-08 16:52 ` [PATCH 3/5] perf_counter: rework ioctl()s Peter Zijlstra
  2009-05-08 18:39   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
@ 2009-05-11  1:29   ` Paul Mackerras
  2009-05-11 15:12     ` Peter Zijlstra
  2009-05-11 18:12     ` Corey Ashford
  2009-05-11 23:58   ` Arnd Bergmann
  2 siblings, 2 replies; 31+ messages in thread
From: Paul Mackerras @ 2009-05-11  1:29 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Corey Ashford, linux-kernel, Thomas Gleixner

Peter Zijlstra writes:

> Corey noticed that ioctl()s on grouped counters didn't work on the whole group.
> This extends the ioctl() interface to take a second argument that is
> interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle
> the behaviour.
> 
> Having this flag gives the greatest flexibility, allowing you to individually
> enable/disable/reset counters in a group, or all together.

As far as enable/disable are concerned, I don't think this is really
necessary.  My intention was that if you want to enable/disable a
whole group you just enable/disable the leader and leave all its
siblings enabled, since if the leader is disabled the whole group
can't go on.

Corey's problem was that we have a bug where enabling the leader only
puts the leader on and not the enabled group members.  I meant to send
a patch to fix that ages ago but I got distracted.  I'll send out the
patch shortly.

Paul.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-11  1:29   ` [PATCH 3/5] " Paul Mackerras
@ 2009-05-11 15:12     ` Peter Zijlstra
  2009-05-12  6:24       ` Paul Mackerras
  2009-05-11 18:12     ` Corey Ashford
  1 sibling, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-11 15:12 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Ingo Molnar, Corey Ashford, linux-kernel, Thomas Gleixner

On Mon, 2009-05-11 at 11:29 +1000, Paul Mackerras wrote:
> Peter Zijlstra writes:
> 
> > Corey noticed that ioctl()s on grouped counters didn't work on the whole group.
> > This extends the ioctl() interface to take a second argument that is
> > interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle
> > the behaviour.
> > 
> > Having this flag gives the greatest flexibility, allowing you to individually
> > enable/disable/reset counters in a group, or all together.
> 
> As far as enable/disable are concerned, I don't think this is really
> necessary.  My intention was that if you want to enable/disable a
> whole group you just enable/disable the leader and leave all its
> siblings enabled, since if the leader is disabled the whole group
> can't go on.
> 
> Corey's problem was that we have a bug where enabling the leader only
> puts the leader on and not the enabled group members.  I meant to send
> a patch to fix that ages ago but I got distracted.  I'll send out the
> patch shortly.

Thanks, I missed that little detail.

Do we still want the new ioctl iteration flag?


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-11  1:29   ` [PATCH 3/5] " Paul Mackerras
  2009-05-11 15:12     ` Peter Zijlstra
@ 2009-05-11 18:12     ` Corey Ashford
  2009-05-11 20:52       ` Paul Mackerras
  1 sibling, 1 reply; 31+ messages in thread
From: Corey Ashford @ 2009-05-11 18:12 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner



Paul Mackerras wrote:
> Peter Zijlstra writes:
> 
>> Corey noticed that ioctl()s on grouped counters didn't work on the whole group.
>> This extends the ioctl() interface to take a second argument that is
>> interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle
>> the behaviour.
>>
>> Having this flag gives the greatest flexibility, allowing you to individually
>> enable/disable/reset counters in a group, or all together.
> 
> As far as enable/disable are concerned, I don't think this is really
> necessary.  My intention was that if you want to enable/disable a
> whole group you just enable/disable the leader and leave all its
> siblings enabled, since if the leader is disabled the whole group
> can't go on.
> 
> Corey's problem was that we have a bug where enabling the leader only
> puts the leader on and not the enabled group members.  I meant to send
> a patch to fix that ages ago but I got distracted.  I'll send out the
> patch shortly.
> 
> Paul.

One problem that I was seeing was that disabling only the group leader didn't 
keep the group member hardware counters from continuing to run and generate 
overflows, and causing sample records to be put in the mmap buffer.

Was this the intention?

For PAPI, we really need to be able to disable, at least, the sample records 
from going into the mmap buffers.  Even better would be to stop the hardware 
counters from counting while disabled.

Otherwise, the only way I have of stopping samples from getting into the buffer 
is to munmap the buffer.


- Corey

Corey Ashford
Software Engineer
IBM Linux Technology Center, Linux Toolchain
cjashfor@us.ibm.com


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-11 18:12     ` Corey Ashford
@ 2009-05-11 20:52       ` Paul Mackerras
  2009-05-11 22:37         ` Corey Ashford
  0 siblings, 1 reply; 31+ messages in thread
From: Paul Mackerras @ 2009-05-11 20:52 UTC (permalink / raw)
  To: Corey Ashford; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner

Corey Ashford writes:

> One problem that I was seeing was that disabling only the group leader didn't 
> keep the group member hardware counters from continuing to run and generate 
> overflows, and causing sample records to be put in the mmap buffer.

That's very strange.  Disabling the group leader should have taken the
whole group off the PMU and so none of the members should have been
able to generate any more overflows or sample records.  What machines
did you see this on?

> Was this the intention?

Not at all.

> For PAPI, we really need to be able to disable, at least, the sample records 
> from going into the mmap buffers.  Even better would be to stop the hardware 
> counters from counting while disabled.

Disabling the leader should be doing all that.

Paul.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-11 20:52       ` Paul Mackerras
@ 2009-05-11 22:37         ` Corey Ashford
  2009-05-12  6:16           ` Paul Mackerras
  0 siblings, 1 reply; 31+ messages in thread
From: Corey Ashford @ 2009-05-11 22:37 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner



Paul Mackerras wrote:
> Corey Ashford writes:
> 
>> One problem that I was seeing was that disabling only the group leader didn't 
>> keep the group member hardware counters from continuing to run and generate 
>> overflows, and causing sample records to be put in the mmap buffer.
> 
> That's very strange.  Disabling the group leader should have taken the
> whole group off the PMU and so none of the members should have been
> able to generate any more overflows or sample records.  What machines
> did you see this on?
> 
>> Was this the intention?
> 
> Not at all.
> 
>> For PAPI, we really need to be able to disable, at least, the sample records 
>> from going into the mmap buffers.  Even better would be to stop the hardware 
>> counters from counting while disabled.
> 
> Disabling the leader should be doing all that.

I was seeing that on Power5.  On 5/6/2009, I sent along a test case that was 
supposed to duplicate the sequence of operations, without using PAPI, to Peter 
Zijlstra, cc'ing you.  However, I was unable to get any signals delivered to 
user space with that code, so I was unable to reproduce the problem outside of 
the PAPI test case.  I don't know why signal delivery doesn't work with that 
test case... I spent quite a lot of time verifying that it was doing the right 
thing, but could never get it to work.

I hate to ask you to debug that test case's signal handling, but that might be 
the fastest way to reproduce the behavior I was seeing inside of PAPI.  If not, 
I can try to get back to the test case later this week and have another go at 
it, perhaps instrumenting the kernel to try to determine why it's not sending 
signals.

- Corey

Corey Ashford
Software Engineer
IBM Linux Technology Center, Linux Toolchain
cjashfor@us.ibm.com


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-08 16:52 ` [PATCH 3/5] perf_counter: rework ioctl()s Peter Zijlstra
  2009-05-08 18:39   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
  2009-05-11  1:29   ` [PATCH 3/5] " Paul Mackerras
@ 2009-05-11 23:58   ` Arnd Bergmann
  2009-05-12  6:11     ` Peter Zijlstra
  2 siblings, 1 reply; 31+ messages in thread
From: Arnd Bergmann @ 2009-05-11 23:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Paul Mackerras, Corey Ashford, linux-kernel,
	Thomas Gleixner

On Friday 08 May 2009, Peter Zijlstra wrote:
> -#define PERF_COUNTER_IOC_ENABLE                _IO ('$', 0)
> -#define PERF_COUNTER_IOC_DISABLE       _IO ('$', 1)
> +#define PERF_COUNTER_IOC_ENABLE                _IOW('$', 0, u32)
> +#define PERF_COUNTER_IOC_DISABLE       _IOW('$', 1, u32)
>  #define PERF_COUNTER_IOC_REFRESH       _IOW('$', 2, u32)
> -#define PERF_COUNTER_IOC_RESET         _IO ('$', 3)
> +#define PERF_COUNTER_IOC_RESET         _IOW('$', 3, u32)

These ioctl definitions look malformed:
_IOW('$', 0, u32) means that the ioctl will read a u32 in user
space pointed to by (u32 __user *)arg, while what perf_ioctl
actually does is cast arg to a u32. Moreover, exported headers
should use __u32 instead of u32.

PERF_COUNTER_IOC_REFRESH apparently was broken already, this
patch also breaks the other definitions.

	Arnd <><

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-11 23:58   ` Arnd Bergmann
@ 2009-05-12  6:11     ` Peter Zijlstra
  2009-05-12  6:22       ` Paul Mackerras
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-12  6:11 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Ingo Molnar, Paul Mackerras, Corey Ashford, linux-kernel,
	Thomas Gleixner

On Tue, 2009-05-12 at 01:58 +0200, Arnd Bergmann wrote:
> On Friday 08 May 2009, Peter Zijlstra wrote:
> > -#define PERF_COUNTER_IOC_ENABLE                _IO ('$', 0)
> > -#define PERF_COUNTER_IOC_DISABLE       _IO ('$', 1)
> > +#define PERF_COUNTER_IOC_ENABLE                _IOW('$', 0, u32)
> > +#define PERF_COUNTER_IOC_DISABLE       _IOW('$', 1, u32)
> >  #define PERF_COUNTER_IOC_REFRESH       _IOW('$', 2, u32)
> > -#define PERF_COUNTER_IOC_RESET         _IO ('$', 3)
> > +#define PERF_COUNTER_IOC_RESET         _IOW('$', 3, u32)
> 
> These ioctl definitions look malformed:
> _IOW('$', 0, u32) means that the ioctl will read a u32 in user
> space pointed to by (u32 __user *)arg, while what perf_ioctl
> actually does is cast arg to a u32. Moreover, exported headers
> should use __u32 instead of u32.
> 
> PERF_COUNTER_IOC_REFRESH apparently was broken already, this
> patch also breaks the other definitions.

Hmm, are you saying that the 3rd argument to unlocked_ioctl is actually
(void __user *) instead of unsigned long?




^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-11 22:37         ` Corey Ashford
@ 2009-05-12  6:16           ` Paul Mackerras
  2009-05-12 16:15             ` Corey Ashford
  0 siblings, 1 reply; 31+ messages in thread
From: Paul Mackerras @ 2009-05-12  6:16 UTC (permalink / raw)
  To: Corey Ashford; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner

Corey Ashford writes:

> I was seeing that on Power5.  On 5/6/2009, I sent along a test case that was 
> supposed to duplicate the sequence of operations, without using PAPI, to Peter 
> Zijlstra, cc'ing you.  However, I was unable to get any signals delivered to 
> user space with that code, so I was unable to reproduce the problem outside of 
> the PAPI test case.  I don't know why signal delivery doesn't work with that 
> test case... I spent quite a lot of time verifying that it was doing the right 
> thing, but could never get it to work.

I tried your test program out today, with the ioctl calls changed to
add an extra 0 argument, like this:

   ioctl(fd[0], PERF_COUNTER_IOC_ENABLE, 0);

and similarly for the disable call.  It all seemed to be fine.  There
is still a little problem with enabling counters - it doesn't happen
right away.  I have a patch to fix that, which I'll send out, and with
that patch I get:

# ./test_group_disable 
evt[0].config = 0
evt[1].config = 4
group_leader = -1
evt[i].wakeup_events = 0
successfully opened evt[0].  fd[0] = 3
group_leader = 3
evt[i].wakeup_events = 0
successfully opened evt[1].  fd[1] = 4
successfully attached signal handler
fd 3's mmap buffer is located at 0xfffab4b3000
successfully tuned up fd 3
fd 4's mmap buffer is located at 0xfffab4aa000
successfully tuned up fd 4
enable was successful
CYCLES: 20904
BRANCHES: 2413
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 4
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 4
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 4
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
signal received while counters are enabled. fd = 3
disable was successful
CYCLES: 180149676
BRANCHES: 30030826
CYCLES: 180149676
BRANCHES: 30030826
CYCLES data_head: 141b0
BRANCHES data_head: 35a0
#

So clearly it is getting signals when the leader is enabled, one per
page of events, and not when the leader is disabled.  I notice you
have evt[].wakeup_events set to 0, which is presumably why we don't
get more signals than we do.

Paul.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12  6:11     ` Peter Zijlstra
@ 2009-05-12  6:22       ` Paul Mackerras
  2009-05-12  6:27         ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Paul Mackerras @ 2009-05-12  6:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnd Bergmann, Ingo Molnar, Corey Ashford, linux-kernel, Thomas Gleixner

Peter Zijlstra writes:

> Hmm, are you saying that the 3rd argument to unlocked_ioctl is actually
> (void __user *) instead of unsigned long?

He's saying (correctly) that using _IOR or _IOW implies that the ioctl
is going to read or write the memory location pointed to by the 3rd
argument to unlocked_ioctl.  If the 3rd argument is just a number, not
an address, I believe you should use _IO.

Paul.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-11 15:12     ` Peter Zijlstra
@ 2009-05-12  6:24       ` Paul Mackerras
  0 siblings, 0 replies; 31+ messages in thread
From: Paul Mackerras @ 2009-05-12  6:24 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Corey Ashford, linux-kernel, Thomas Gleixner

Peter Zijlstra writes:

> Thanks, I missed that little detail.
> 
> Do we still want the new ioctl iteration flag?

Not sure; it might be useful sometimes, I guess.  We do need to clear
up the confusion about whether one does ioctl(fd, ..., flag) or
ioctl(fd, ..., &flag), and if the former we shouldn't be using _IOW,
as Arnd pointed out.

Paul.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12  6:22       ` Paul Mackerras
@ 2009-05-12  6:27         ` Peter Zijlstra
  2009-05-12  7:10           ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-12  6:27 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Arnd Bergmann, Ingo Molnar, Corey Ashford, linux-kernel, Thomas Gleixner

On Tue, 2009-05-12 at 16:22 +1000, Paul Mackerras wrote:
> Peter Zijlstra writes:
> 
> > Hmm, are you saying that the 3rd argument to unlocked_ioctl is actually
> > (void __user *) instead of unsigned long?
> 
> He's saying (correctly) that using _IOR or _IOW implies that the ioctl
> is going to read or write the memory location pointed to by the 3rd
> argument to unlocked_ioctl.  If the 3rd argument is just a number, not
> an address, I believe you should use _IO.

Oh, somewhat confusing all this. Would be good to spell out these things
somewhere. Documentation/ioctl/ seems less than helpful.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12  6:27         ` Peter Zijlstra
@ 2009-05-12  7:10           ` Peter Zijlstra
  2009-05-12  7:52             ` Arnd Bergmann
  2009-05-12 11:21             ` [PATCH 3/5] perf_counter: rework ioctl()s Paul Mackerras
  0 siblings, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-12  7:10 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Arnd Bergmann, Ingo Molnar, Corey Ashford, linux-kernel, Thomas Gleixner

On Tue, 2009-05-12 at 08:27 +0200, Peter Zijlstra wrote:
> On Tue, 2009-05-12 at 16:22 +1000, Paul Mackerras wrote:
> > Peter Zijlstra writes:
> > 
> > > Hmm, are you saying that the 3rd argument to unlocked_ioctl is actually
> > > (void __user *) instead of unsigned long?
> > 
> > He's saying (correctly) that using _IOR or _IOW implies that the ioctl
> > is going to read or write the memory location pointed to by the 3rd
> > argument to unlocked_ioctl.  If the 3rd argument is just a number, not
> > an address, I believe you should use _IO.
> 
> Oh, somewhat confusing all this. Would be good to spell out these things
> somewhere. Documentation/ioctl/ seems less than helpful.

Ah, so _IO() gets an unsigned long 3rd argument.
_IOW() treats the 3rd arg as a (type __user *) and copies the bits over
_IOR() copies the bits back out to userspace
_IORW() does both

In which case the below should fix things up, no?

---
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 614f921..6a9cebc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -159,10 +159,10 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
-#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
-#define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IO('$', 2)
+#define PERF_COUNTER_IOC_RESET		_IO('$', 3)
 
 enum perf_counter_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12  7:10           ` Peter Zijlstra
@ 2009-05-12  7:52             ` Arnd Bergmann
  2009-05-12 10:59               ` [PATCH] perf_counter: fix ioctl()s Peter Zijlstra
  2009-05-12 11:21             ` [PATCH 3/5] perf_counter: rework ioctl()s Paul Mackerras
  1 sibling, 1 reply; 31+ messages in thread
From: Arnd Bergmann @ 2009-05-12  7:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul Mackerras, Ingo Molnar, Corey Ashford, linux-kernel,
	Thomas Gleixner

On Tuesday 12 May 2009, Peter Zijlstra wrote:
> Ah, so _IO() gets an unsigned long 3rd argument.
> _IOW() treats the 3rd arg as a (type __user *) and copies the bits over
> _IOR() copies the bits back out to userspace
> _IORW() does both
> 
> In which case the below should fix things up, no?
> 

Yes, this looks good now.

Thanks,

	Arnd <><

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH] perf_counter: fix ioctl()s
  2009-05-12  7:52             ` Arnd Bergmann
@ 2009-05-12 10:59               ` Peter Zijlstra
  0 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2009-05-12 10:59 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Paul Mackerras, Ingo Molnar, Corey Ashford, linux-kernel,
	Thomas Gleixner

On Tue, 2009-05-12 at 09:52 +0200, Arnd Bergmann wrote:
> On Tuesday 12 May 2009, Peter Zijlstra wrote:
> > Ah, so _IO() gets an unsigned long 3rd argument.
> > _IOW() treats the 3rd arg as a (type __user *) and copies the bits over
> > _IOR() copies the bits back out to userspace
> > _IORW() does both
> > 
> > In which case the below should fix things up, no?
> > 
> 
> Yes, this looks good now.

---
Arnd spotted that I misunderstood and wrongly used the 3rd ioctl
argument. Fix up the perf counter ioctl()s to use the 3rd argument as
value -- as opposed to a pointer as would now be the case.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/perf_counter.h |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 614f921..6a9cebc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -159,10 +159,10 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
-#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
-#define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IO('$', 2)
+#define PERF_COUNTER_IOC_RESET		_IO('$', 3)
 
 enum perf_counter_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12  7:10           ` Peter Zijlstra
  2009-05-12  7:52             ` Arnd Bergmann
@ 2009-05-12 11:21             ` Paul Mackerras
  1 sibling, 0 replies; 31+ messages in thread
From: Paul Mackerras @ 2009-05-12 11:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnd Bergmann, Ingo Molnar, Corey Ashford, linux-kernel, Thomas Gleixner

Peter Zijlstra writes:

> Ah, so _IO() gets an unsigned long 3rd argument.
> _IOW() treats the 3rd arg as a (type __user *) and copies the bits over
> _IOR() copies the bits back out to userspace
> _IOWR() does both

Yep.

> In which case the below should fix things up, no?

Looks good.

Paul.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12  6:16           ` Paul Mackerras
@ 2009-05-12 16:15             ` Corey Ashford
  2009-05-12 22:18               ` Paul Mackerras
  0 siblings, 1 reply; 31+ messages in thread
From: Corey Ashford @ 2009-05-12 16:15 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner

Paul Mackerras wrote:
> Corey Ashford writes:
> 
>> I was seeing that on Power5.  On 5/6/2009, I sent along a test case that was 
>> supposed to duplicate the sequence of operations, without using PAPI, to Peter 
>> Zijlstra, cc'ing you.  However, I was unable to get any signals delivered to 
>> user space with that code, so I was unable to reproduce the problem outside of 
>> the PAPI test case.  I don't know why signal delivery doesn't work with that 
>> test case... I spent quite a lot of time verifying that it was doing the right 
>> thing, but could never get it to work.
> 
> I tried your test program out today, with the ioctl calls changed to
> add an extra 0 argument, like this:
> 
>    ioctl(fd[0], PERF_COUNTER_IOC_ENABLE, 0);
> 
> and similarly for the disable call.  It all seemed to be fine.  There
> is still a little problem with enabling counters - it doesn't happen
> right away.  I have a patch to fix that, which I'll send out, and with
> that patch I get:
> 
> # ./test_group_disable 
> evt[0].config = 0
> evt[1].config = 4
> group_leader = -1
> evt[i].wakeup_events = 0
> successfully opened evt[0].  fd[0] = 3
> group_leader = 3
> evt[i].wakeup_events = 0
> successfully opened evt[1].  fd[1] = 4
> successfully attached signal handler
> fd 3's mmap buffer is located at 0xfffab4b3000
> successfully tuned up fd 3
> fd 4's mmap buffer is located at 0xfffab4aa000
> successfully tuned up fd 4
> enable was successful
> CYCLES: 20904
> BRANCHES: 2413
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 4
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 4
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 4
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> signal received while counters are enabled. fd = 3
> disable was successful
> CYCLES: 180149676
> BRANCHES: 30030826
> CYCLES: 180149676
> BRANCHES: 30030826
> CYCLES data_head: 141b0
> BRANCHES data_head: 35a0
> #
> 
> So clearly it is getting signals when the leader is enabled, one per
> page of events, and not when the leader is disabled.  I notice you
> have evt[].wakeup_events set to 0, which is presumably why we don't
> get more signals than we do.

Interesting. Thanks for finding that ioctl error!  That really had me 
stumped.

Another hypothesis is that the old PAPI code would open, mmap, close, 
then reopen, and mmap again.  I wonder if because I wasn't munmap'ing 
before the close, that I got some strange behavior from the kernel. 
I'll try that today with that test case.

Thanks!

- Corey



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12 16:15             ` Corey Ashford
@ 2009-05-12 22:18               ` Paul Mackerras
  2009-05-12 22:51                 ` Corey Ashford
  0 siblings, 1 reply; 31+ messages in thread
From: Paul Mackerras @ 2009-05-12 22:18 UTC (permalink / raw)
  To: Corey Ashford; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner

Corey Ashford writes:

> Another hypothesis is that the old PAPI code would open, mmap, close, 
> then reopen, and mmap again.  I wonder if because I wasn't munmap'ing 
> before the close, that I got some strange behavior from the kernel. 

In general an mmap will keep a reference to the file it is mapping,
which means that if you do an mmap on a fd and then close the fd, the
file stays open until the region is munmapped.  So your old PAPI code
would have ended up with more counters active than you expected, if
you thought closing the fd would have destroyed the counter.

That is, you don't have to munmap before close, but you do have to do
both close and munmap to destroy the counter.

Paul.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/5] perf_counter: rework ioctl()s
  2009-05-12 22:18               ` Paul Mackerras
@ 2009-05-12 22:51                 ` Corey Ashford
  0 siblings, 0 replies; 31+ messages in thread
From: Corey Ashford @ 2009-05-12 22:51 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner

Paul Mackerras wrote:
> Corey Ashford writes:
> 
>> Another hypothesis is that the old PAPI code would open, mmap, close, 
>> then reopen, and mmap again.  I wonder if because I wasn't munmap'ing 
>> before the close, that I got some strange behavior from the kernel. 
> 
> In general an mmap will keep a reference to the file it is mapping,
> which means that if you do an mmap on a fd and then close the fd, the
> file stays open until the region is munmapped.  So your old PAPI code
> would have ended up with more counters active than you expected, if
> you thought closing the fd would have destroyed the counter.
> 
> That is, you don't have to munmap before close, but you do have to do
> both close and munmap to destroy the counter.


That makes perfect sense.  I'll bet that's what the problem was.  I think this 
issue can be closed.  Thank you Paul M. and Peter Z. for looking into this.

- Corey


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2009-05-12 22:51 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-05-08 16:52 [PATCH 0/5] pending patches Peter Zijlstra
2009-05-08 16:52 ` [PATCH 1/5] hrtimer: per-cpu cached values of ktime Peter Zijlstra
2009-05-08 23:10   ` Andrew Morton
2009-05-09 19:45   ` Linus Torvalds
2009-05-09 19:59     ` Peter Zijlstra
2009-05-08 16:52 ` [PATCH 2/5] perf_counter: optimize perf_counter_task_tick() Peter Zijlstra
2009-05-08 18:39   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-08 16:52 ` [PATCH 3/5] perf_counter: rework ioctl()s Peter Zijlstra
2009-05-08 18:39   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-11  1:29   ` [PATCH 3/5] " Paul Mackerras
2009-05-11 15:12     ` Peter Zijlstra
2009-05-12  6:24       ` Paul Mackerras
2009-05-11 18:12     ` Corey Ashford
2009-05-11 20:52       ` Paul Mackerras
2009-05-11 22:37         ` Corey Ashford
2009-05-12  6:16           ` Paul Mackerras
2009-05-12 16:15             ` Corey Ashford
2009-05-12 22:18               ` Paul Mackerras
2009-05-12 22:51                 ` Corey Ashford
2009-05-11 23:58   ` Arnd Bergmann
2009-05-12  6:11     ` Peter Zijlstra
2009-05-12  6:22       ` Paul Mackerras
2009-05-12  6:27         ` Peter Zijlstra
2009-05-12  7:10           ` Peter Zijlstra
2009-05-12  7:52             ` Arnd Bergmann
2009-05-12 10:59               ` [PATCH] perf_counter: fix ioctl()s Peter Zijlstra
2009-05-12 11:21             ` [PATCH 3/5] perf_counter: rework ioctl()s Paul Mackerras
2009-05-08 16:52 ` [PATCH 4/5] perf_counter: PERF_RECORD_CONFIG Peter Zijlstra
2009-05-08 18:39   ` [tip:perfcounters/core] perf_counter: add PERF_RECORD_CONFIG tip-bot for Peter Zijlstra
2009-05-08 16:52 ` [PATCH 5/5] perf_counter: PERF_RECORD_CPU Peter Zijlstra
2009-05-08 18:40   ` [tip:perfcounters/core] perf_counter: add PERF_RECORD_CPU tip-bot for Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.