* [PATCH 1/2] perf_events: add cgroup support (v8)
@ 2011-01-20 13:30 Stephane Eranian
  2011-01-20 14:39 ` Peter Zijlstra
  2011-02-07 16:10 ` Peter Zijlstra
  0 siblings, 2 replies; 28+ messages in thread
From: Stephane Eranian @ 2011-01-20 13:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, mingo, paulus, davem, fweisbec, perfmon2-devel, eranian,
	eranian, robert.richter, acme, lizf

This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.
    
The cgroup to monitor is passed as a file descriptor in the pid
argument to the syscall. The file descriptor must be opened on
the cgroup's directory in the cgroup filesystem. For instance, if
the cgroup is named foo and cgroupfs is mounted at /cgroup, then
the file descriptor is opened on /cgroup/foo. Cgroup mode is
activated by passing PERF_FLAG_PID_CGROUP in the flags argument
to the syscall.

For instance to measure in cgroup foo on CPU1 assuming
cgroupfs is mounted under /cgroup:

struct perf_event_attr attr;
int cgroup_fd, fd;

cgroup_fd = open("/cgroup/foo", O_RDONLY);
fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
close(cgroup_fd);
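
For reference, here is a self-contained version of that sequence
(illustrative only, not part of the patch): the attr setup as a hardware
cycles counter, the error handling, and the final read() of the count are
additions for the example, and it assumes a linux/perf_event.h that already
carries the PERF_FLAG_PID_CGROUP definition added by this patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	/* no glibc wrapper exists for perf_event_open() */
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int cgroup_fd, fd;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* fd on the cgroup directory selects the cgroup to monitor */
	cgroup_fd = open("/cgroup/foo", O_RDONLY);
	if (cgroup_fd < 0)
		return 1;

	/* measure cgroup "foo" on CPU1, no group leader */
	fd = sys_perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
	close(cgroup_fd);
	if (fd < 0)
		return 1;

	sleep(1);	/* count for one second */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles in cgroup foo on CPU1: %" PRIu64 "\n", count);

	close(fd);
	return 0;
}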

Signed-off-by: Stephane Eranian <eranian@google.com>
---

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ce104e3..780e88c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -626,6 +626,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
 unsigned short css_depth(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
 
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff0..cdbfcb8 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
 SUBSYS(blkio)
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+SUBSYS(perf)
+#endif
+
 /* */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dda5b0a..c905765 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -464,6 +464,7 @@ enum perf_callchain_context {
 
 #define PERF_FLAG_FD_NO_GROUP	(1U << 0)
 #define PERF_FLAG_FD_OUTPUT	(1U << 1)
+#define PERF_FLAG_PID_CGROUP	(1U << 2) /* pid=cgroup id, per-cpu mode only */
 
 #ifdef __KERNEL__
 /*
@@ -471,6 +472,7 @@ enum perf_callchain_context {
  */
 
 #ifdef CONFIG_PERF_EVENTS
+# include <linux/cgroup.h>
 # include <asm/perf_event.h>
 # include <asm/local64.h>
 #endif
@@ -716,6 +718,22 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
 
+#ifdef CONFIG_CGROUP_PERF
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64 time;
+	u64 timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state css;
+	struct perf_cgroup_info *info;	/* timing info, one per cpu */
+};
+#endif
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -832,6 +850,11 @@ struct perf_event {
 	struct event_filter		*filter;
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup		*cgrp; /* cgroup the event is attached to */
+	int				cgrp_defer_enabled;
+#endif
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -886,6 +909,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	struct rcu_head			rcu_head;
+	int				nr_cgroups; /* cgroup events present */
 };
 
 /*
@@ -1040,11 +1064,11 @@ have_event:
 	__perf_sw_event(event_id, nr, nmi, regs, addr);
 }
 
-extern atomic_t perf_task_events;
+extern atomic_t perf_sched_events;
 
 static inline void perf_event_task_sched_in(struct task_struct *task)
 {
-	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
 }
 
 static inline
@@ -1052,7 +1076,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex
 {
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
 
-	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -1121,6 +1145,29 @@ extern void perf_swevent_put_recursion_context(int rctx);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_task_tick(void);
+
+#ifdef CONFIG_CGROUP_PERF
+
+#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
+
+extern void perf_cgroup_switch(struct task_struct *tsk, int mode);
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+	/* if necessary, deschedule all events */
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+	/* if necessary, schedule cgroup events */
+	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+#else
+static inline void perf_cgroup_sched_out(struct task_struct *task)	{ }
+static inline void perf_cgroup_sched_in(struct task_struct *task)	{ }
+#endif
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task)			{ }
@@ -1134,6 +1181,8 @@ static inline void perf_event_delayed_put(struct task_struct *task)	{ }
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
+static inline void perf_cgroup_sched_out(struct task_struct *task)	{ }
+static inline void perf_cgroup_sched_in(struct task_struct *task)	{ }
 
 static inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi,
diff --git a/init/Kconfig b/init/Kconfig
index 4e33790..954eac3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  select this option (if, for some reason, they need to disable it
 	  then noswapaccount does the trick).
 
+config CGROUP_PERF
+	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
+	depends on PERF_EVENTS && CGROUPS
+	help
+	  This option extends the per-cpu mode to restrict monitoring to
+	  threads which belong to the cgroup specified and run on the
+	  designated cpu.
+
+	  Say N if unsure.
+
 menuconfig CGROUP_SCHED
 	bool "Group CPU scheduler"
 	depends on EXPERIMENTAL
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d702..a27f64a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
+#include <linux/perf_event.h>
 
 #include <asm/atomic.h>
 
@@ -4232,6 +4233,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
 	int i;
 	struct css_set *cg;
+	unsigned long flags;
 
 	if (run_callbacks && need_forkexit_callback) {
 		/*
@@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 
 	/* Reassign the task to the init_css_set. */
 	task_lock(tsk);
+	/*
+	 * we mask interrupts to prevent:
+	 * - timer tick to cause event rotation which
+	 *   could schedule back in cgroup events after
+	 *   they were switched out by perf_cgroup_sched_out()
+	 *
+	 * - preemption which could schedule back in cgroup events
+	 */
+	local_irq_save(flags);
+	perf_cgroup_sched_out(tsk);
 	cg = tsk->cgroups;
 	tsk->cgroups = &init_css_set;
+	perf_cgroup_sched_in(tsk);
+	local_irq_restore(flags);
 	task_unlock(tsk);
 	if (cg)
 		put_css_set_taskexit(cg);
@@ -4813,6 +4827,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
 	return ret;
 }
 
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+	struct cgroup *cgrp;
+	struct inode *inode;
+	struct cgroup_subsys_state *css;
+
+	inode = f->f_dentry->d_inode;
+	/* check in cgroup filesystem dir */
+	if (inode->i_op != &cgroup_dir_inode_operations)
+		return ERR_PTR(-EBADF);
+
+	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+		return ERR_PTR(-EINVAL);
+
+	/* get cgroup */
+	cgrp = __d_cgrp(f->f_dentry);
+	css = cgrp->subsys[id];
+	return css ? css : ERR_PTR(-ENOENT);
+}
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 						   struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 244ca3a..14bdefe 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,23 @@
 
 #include <asm/irq_regs.h>
 
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+		       PERF_FLAG_FD_OUTPUT  |\
+		       PERF_FLAG_PID_CGROUP)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
@@ -75,7 +85,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type);
+			     enum event_type_t event_type,
+			     struct task_struct *task, int cgrp_sw);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void)	{ }
 
@@ -89,6 +103,292 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+#ifdef CONFIG_CGROUP_PERF
+
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, perf_subsys_id),
+			struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+	struct perf_cgroup *cgrp = NULL;
+	if (task)
+		cgrp = perf_cgroup_from_task(task);
+	return !event->cgrp || event->cgrp == cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+	css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+	css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+	perf_put_cgroup(event);
+	event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	struct perf_cgroup_info *t;
+
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+	struct perf_cgroup_info *info;
+	u64 now;
+
+	now = perf_clock();
+
+	info = this_cpu_ptr(cgrp->info);
+
+	info->time += now - info->timestamp;
+	info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_task(struct task_struct *task)
+{
+	struct perf_cgroup *cgrp_out = perf_cgroup_from_task(task);
+	if (cgrp_out)
+		__update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+	struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
+	/*
+	 * do not update time when cgroup is not active
+	 */
+	if (!event->cgrp || cgrp != event->cgrp)
+		return;
+
+	__update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+	struct perf_cgroup *cgrp;
+	struct perf_cgroup_info *info;
+
+	if (!task)
+		return;
+
+	cgrp = perf_cgroup_from_task(task);
+	info = per_cpu_ptr(cgrp->info, smp_processor_id());
+	info->timestamp = now;
+}
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/*
+	 * disable interrupts to avoid getting nr_cgroup
+	 * changes via __perf_event_disable(). Also
+	 * avoids preemption.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * we reschedule only in the presence of cgroup
+	 * constrained events.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		/*
+		 * perf_cgroup_events says at least one
+		 * context on this CPU has cgroup events.
+		 *
+		 * ctx->nr_cgroups reports the number of cgroup
+		 * events for a context.
+		 */
+		if (cpuctx->ctx.nr_cgroups > 0) {
+
+			if (mode & PERF_CGROUP_SWOUT)
+				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+
+			if (mode & PERF_CGROUP_SWIN)
+				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 1);
+		}
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	struct perf_cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct file *file;
+	int ret = 0, fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+
+	css = cgroup_css_from_dir(file, perf_subsys_id);
+	if (IS_ERR(css))
+		return PTR_ERR(css);
+
+	cgrp = container_of(css, struct perf_cgroup, css);
+	event->cgrp = cgrp;
+
+	/*
+	 * all events in a group must monitor
+	 * the same cgroup because a task belongs
+	 * to only one perf cgroup at a time
+	 */
+	if (group_leader && group_leader->cgrp != cgrp) {
+		perf_detach_cgroup(event);
+		ret = -EINVAL;
+	} else {
+		/* must be done before we fput() the file */
+		perf_get_cgroup(event);
+	}
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+	struct perf_cgroup_info *t;
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
+{
+	/*
+	 * when the current task's perf cgroup does not match
+	 * the event's, we need to remember to call the
+	 * perf_mark_enable() function the first time a task with
+	 * a matching perf cgroup is scheduled in.
+	 */
+	if (is_cgroup_event(event) && !perf_cgroup_match(event, task))
+		event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
+
+	if (!event->cgrp_defer_enabled)
+		return;
+
+	event->cgrp_defer_enabled = 0;
+
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+			sub->cgrp_defer_enabled = 0;
+		}
+	}
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+	return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{}
+
+static inline void update_cgrp_time_from_task(struct task_struct *t)
+{}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
+{}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{}
+#endif
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -271,6 +571,10 @@ static void update_context_time(struct perf_event_context *ctx)
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
+
+	if (is_cgroup_event(event))
+		return perf_cgroup_event_time(event);
+
 	return ctx ? ctx->time : 0;
 }
 
@@ -285,9 +589,20 @@ static void update_event_times(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
-
-	if (ctx->is_active)
+	/*
+	 * in cgroup mode, time_enabled represents
+	 * the time the event was enabled AND active
+	 * tasks were in the monitored cgroup. This is
+	 * independent of the activity of the context as
+	 * there may be a mix of cgroup and non-cgroup events.
+	 *
+	 * That is why we treat cgroup events differently
+	 * here.
+	 */
+	if (is_cgroup_event(event))
 		run_end = perf_event_time(event);
+	else if (ctx->is_active)
+		run_end = ctx->time;
 	else
 		run_end = event->tstamp_stopped;
 
@@ -299,6 +614,7 @@ static void update_event_times(struct perf_event *event)
 		run_end = perf_event_time(event);
 
 	event->total_time_running = run_end - event->tstamp_running;
+
 }
 
 /*
@@ -347,6 +663,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups++;
+		/*
+		 * one more event:
+		 * - that has cgroup constraint on event->cpu
+		 * - that may need work on context switch
+		 */
+		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_inc(&perf_sched_events);
+	}
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -473,6 +800,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups--;
+		atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_dec(&perf_sched_events);
+	}
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -542,9 +875,10 @@ out:
 }
 
 static inline int
-event_filter_match(struct perf_event *event)
+event_filter_match(struct perf_event *event, struct task_struct *task)
 {
-	return event->cpu == -1 || event->cpu == smp_processor_id();
+	return (event->cpu == -1 || event->cpu == smp_processor_id())
+	    && perf_cgroup_match(event, task);
 }
 
 static void
@@ -561,8 +895,8 @@ event_sched_out(struct perf_event *event,
 	 * via read() for time_enabled, time_running:
 	 */
 	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
-		delta = ctx->time - event->tstamp_stopped;
+	    && !event_filter_match(event, current)) {
+		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
 	}
@@ -720,6 +1054,7 @@ static void __perf_event_disable(void *info)
 	 */
 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
 		update_group_times(event);
 		if (event == event->group_leader)
 			group_sched_out(event, cpuctx, ctx);
@@ -782,6 +1117,41 @@ retry:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+static void perf_set_shadow_time(struct perf_event *event,
+				 struct perf_event_context *ctx,
+				 u64 tstamp)
+{
+	/*
+	 * use the correct time source for the time snapshot
+	 *
+	 * We could get by without this by leveraging the
+	 * fact that to get to this function, the caller
+	 * has most likely already called update_context_time()
+	 * and update_cgrp_time_xx() and thus both timestamp
+	 * are identical (or very close). Given that tstamp is,
+	 * already adjusted for cgroup, we could say that:
+	 *    tstamp - ctx->timestamp
+	 * is equivalent to
+	 *    tstamp - cgrp->timestamp.
+	 *
+	 * Then, in perf_output_read(), the calculation would
+	 * work with no changes because:
+	 * - event is guaranteed scheduled in
+	 * - no scheduled out in between
+	 * - thus the timestamp would be the same
+	 *
+	 * But this is a bit hairy.
+	 *
+	 * So instead, we have an explicit cgroup call to remain
+	 * within the time source all along. We believe it
+	 * is cleaner and simpler to understand.
+	 */
+	if (is_cgroup_event(event))
+		perf_cgroup_set_shadow_time(event, tstamp);
+	else
+		event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -807,7 +1177,7 @@ event_sched_in(struct perf_event *event,
 
 	event->tstamp_running += tstamp - event->tstamp_stopped;
 
-	event->shadow_ctx_time = tstamp - ctx->timestamp;
+	perf_set_shadow_time(event, ctx, tstamp);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -923,9 +1293,9 @@ static void add_event_to_ctx(struct perf_event *event,
 
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	event->tstamp_enabled = tstamp;
 	event->tstamp_running = tstamp;
 	event->tstamp_stopped = tstamp;
+	event->tstamp_enabled = tstamp;
 }
 
 /*
@@ -957,10 +1327,16 @@ static void __perf_install_in_context(void *info)
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
+	/*
+	 * update cgrp time only if current cgrp
+	 * matches event->cgrp. Must be done before
+	 * calling add_event_to_ctx()
+	 */
+	update_cgrp_time_from_event(event);
 
 	add_event_to_ctx(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		goto unlock;
 
 	/*
@@ -1102,10 +1478,19 @@ static void __perf_event_enable(void *info)
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		goto unlock;
+
+	/*
+	 * set current task's cgroup time reference point
+	 */
+	perf_cgroup_set_timestamp(current, perf_clock());
+
 	__perf_event_mark_enabled(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current)) {
+		if (is_cgroup_event(event))
+			perf_cgroup_defer_enabled(event, current);
 		goto unlock;
+	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
@@ -1227,6 +1612,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
+	update_cgrp_time_from_task(current);
 
 	if (!ctx->nr_active)
 		goto out;
@@ -1416,6 +1802,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
 
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
+
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch out PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_out(task);
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1444,16 +1838,21 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 
 static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
+		    struct perf_cpu_context *cpuctx,
+		    struct task_struct *task, int cgrp_sw)
 {
 	struct perf_event *event;
 
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, task))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -1470,7 +1869,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 
 static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
+		      struct perf_cpu_context *cpuctx,
+		      struct task_struct *task, int cgrp_sw)
 {
 	struct perf_event *event;
 	int can_add_hw = 1;
@@ -1483,9 +1883,13 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, task))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -1496,36 +1900,41 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type)
+	     enum event_type_t event_type,
+	     struct task_struct *task, int cgrp_sw)
 {
+	u64 now;
+
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	if (likely(!ctx->nr_events))
 		goto out;
 
-	ctx->timestamp = perf_clock();
-
+	now = perf_clock();
+	ctx->timestamp = now;
+	perf_cgroup_set_timestamp(task, now);
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
 	if (event_type & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		ctx_pinned_sched_in(ctx, cpuctx, task, cgrp_sw);
 
 	/* Then walk through the lower prio flexible groups */
 	if (event_type & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+		ctx_flexible_sched_in(ctx, cpuctx, task, cgrp_sw);
 
 out:
 	raw_spin_unlock(&ctx->lock);
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
+			     enum event_type_t event_type,
+			     struct task_struct *task, int cgrp_sw)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task, cgrp_sw);
 }
 
 static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1537,11 +1946,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, NULL, 0);
 	cpuctx->task_ctx = ctx;
 }
 
-void perf_event_context_sched_in(struct perf_event_context *ctx)
+void perf_event_context_sched_in(struct perf_event_context *ctx,
+				 struct task_struct *task)
 {
 	struct perf_cpu_context *cpuctx;
 
@@ -1557,9 +1967,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
 	 */
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, 0);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, 0);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, 0);
 
 	cpuctx->task_ctx = ctx;
 
@@ -1592,8 +2002,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
 		if (likely(!ctx))
 			continue;
 
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, task);
 	}
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch in PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_in(task);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1710,7 +2127,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, current))
 			continue;
 
 		hwc = &event->hw;
@@ -1768,9 +2185,10 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 	struct perf_event_context *ctx = NULL;
 	int rotate = 0, remove = 1;
 
-	if (cpuctx->ctx.nr_events) {
+	ctx = &cpuctx->ctx;
+	if (ctx->nr_events) {
 		remove = 0;
-		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+		if (ctx->nr_events != ctx->nr_active)
 			rotate = 1;
 	}
 
@@ -1797,7 +2215,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current, 0);
 	if (ctx)
 		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
@@ -1876,7 +2294,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_context_sched_in(ctx);
+	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);
 }
@@ -1902,6 +2320,7 @@ static void __perf_event_read(void *info)
 
 	raw_spin_lock(&ctx->lock);
 	update_context_time(ctx);
+	update_cgrp_time_from_event(event);
 	update_event_times(event);
 	raw_spin_unlock(&ctx->lock);
 
@@ -1932,8 +2351,10 @@ static u64 perf_event_read(struct perf_event *event)
 		 * (e.g., thread is blocked), in that case
 		 * we cannot update context time
 		 */
-		if (ctx->is_active)
+		if (ctx->is_active) {
 			update_context_time(ctx);
+			update_cgrp_time_from_event(event);
+		}
 		update_event_times(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
@@ -2312,7 +2733,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec(&perf_task_events);
+			jump_label_dec(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2328,6 +2749,9 @@ static void free_event(struct perf_event *event)
 		event->buffer = NULL;
 	}
 
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -3912,7 +4336,7 @@ static int perf_event_task_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if (event->attr.comm || event->attr.mmap ||
@@ -4049,7 +4473,7 @@ static int perf_event_comm_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if (event->attr.comm)
@@ -4197,7 +4621,7 @@ static int perf_event_mmap_match(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if ((!executable && event->attr.mmap_data) ||
@@ -5217,6 +5641,7 @@ static void task_clock_event_read(struct perf_event *event)
 
 	if (!in_nmi()) {
 		update_context_time(event->ctx);
+		update_cgrp_time_from_event(event);
 		time = event->ctx->time;
 	} else {
 		u64 now = perf_clock();
@@ -5639,7 +6064,7 @@ done:
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_task_events);
+			jump_label_inc(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5814,7 +6239,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int err;
 
 	/* for future expandability... */
-	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+	if (flags & ~PERF_FLAG_ALL)
 		return -EINVAL;
 
 	err = perf_copy_attr(attr_uptr, &attr);
@@ -5831,6 +6256,15 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/*
+	 * In cgroup mode, the pid argument is used to pass the fd
+	 * opened to the cgroup directory in cgroupfs. The cpu argument
+	 * designates the cpu on which to monitor threads from that
+	 * cgroup.
+	 */
+	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+		return -EINVAL;
+
 	event_fd = get_unused_fd_flags(O_RDWR);
 	if (event_fd < 0)
 		return event_fd;
@@ -5848,7 +6282,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			group_leader = NULL;
 	}
 
-	if (pid != -1) {
+	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
 		task = find_lively_task_by_vpid(pid);
 		if (IS_ERR(task)) {
 			err = PTR_ERR(task);
@@ -5862,6 +6296,12 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_task;
 	}
 
+	if (flags & PERF_FLAG_PID_CGROUP) {
+		err = perf_cgroup_connect(pid, event, &attr, group_leader);
+		if (err)
+			goto err_alloc;
+	}
+
 	/*
 	 * Special case software events and allow them to be part of
 	 * any hardware group.
@@ -6718,3 +7158,49 @@ unlock:
 	return ret;
 }
 device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+	struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	struct perf_cgroup_info *t;
+	int c;
+
+	jc = kmalloc(sizeof(*jc), GFP_KERNEL);
+	if (!jc)
+		return ERR_PTR(-ENOMEM);
+
+	memset(jc, 0, sizeof(*jc));
+
+	jc->info = alloc_percpu(struct perf_cgroup_info);
+	if (!jc->info) {
+		kfree(jc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for_each_possible_cpu(c) {
+		t = per_cpu_ptr(jc->info, c);
+		t->time = 0;
+		t->timestamp = 0;
+	}
+	return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+			  struct perf_cgroup, css);
+	free_percpu(jc->info);
+	kfree(jc);
+}
+
+struct cgroup_subsys perf_subsys = {
+	.name = "perf_event",
+	.subsys_id = perf_subsys_id,
+	.create = perf_cgroup_create,
+	.destroy = perf_cgroup_destroy,
+};
+#endif /* CONFIG_CGROUP_PERF */
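
Usage note (not part of the patch): the changelog example above assumes the
cgroup /cgroup/foo already exists and contains the tasks of interest. With
the "perf_event" subsystem registered by this patch, such a setup could look
roughly like the sketch below; the /cgroup mount point comes from the
changelog, everything else (mounting the hierarchy, the group name, moving
the current task) is illustrative.

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mount.h>

int main(void)
{
	int fd;

	/* mount a cgroup hierarchy carrying only the new perf_event controller */
	mkdir("/cgroup", 0755);
	if (mount("none", "/cgroup", "cgroup", 0, "perf_event") && errno != EBUSY)
		return 1;

	/* create the cgroup that the changelog example opens as /cgroup/foo */
	if (mkdir("/cgroup/foo", 0755) && errno != EEXIST)
		return 1;

	/* move the current task (and its future children) into the cgroup */
	fd = open("/cgroup/foo/tasks", O_WRONLY);
	if (fd < 0)
		return 1;
	dprintf(fd, "%d\n", getpid());
	close(fd);
	return 0;
}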


* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-01-20 13:30 [PATCH 1/2] perf_events: add cgroup support (v8) Stephane Eranian
@ 2011-01-20 14:39 ` Peter Zijlstra
  2011-01-20 14:46   ` Stephane Eranian
  2011-02-02 11:29   ` Peter Zijlstra
  2011-02-07 16:10 ` Peter Zijlstra
  1 sibling, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-01-20 14:39 UTC (permalink / raw)
  To: eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf, Paul Menage

On Thu, 2011-01-20 at 15:30 +0200, Stephane Eranian wrote:
> @@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>  
>         /* Reassign the task to the init_css_set. */
>         task_lock(tsk);
> +       /*
> +        * we mask interrupts to prevent:
> +        * - timer tick to cause event rotation which
> +        *   could schedule back in cgroup events after
> +        *   they were switched out by perf_cgroup_sched_out()
> +        *
> +        * - preemption which could schedule back in cgroup events
> +        */
> +       local_irq_save(flags);
> +       perf_cgroup_sched_out(tsk);
>         cg = tsk->cgroups;
>         tsk->cgroups = &init_css_set;
> +       perf_cgroup_sched_in(tsk);
> +       local_irq_restore(flags);
>         task_unlock(tsk);
>         if (cg)
>                 put_css_set_taskexit(cg); 

So you too need a callback on cgroup change there.. Li, Paul, any chance
we can fix this cgroup_subsys::exit callback? The scheduler code needs
to do funny thing because its in the wrong place as well.


* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-01-20 14:39 ` Peter Zijlstra
@ 2011-01-20 14:46   ` Stephane Eranian
  2011-02-02 11:29   ` Peter Zijlstra
  1 sibling, 0 replies; 28+ messages in thread
From: Stephane Eranian @ 2011-01-20 14:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf, Paul Menage

Peter,

Note that I also had to drop my optimization which handled the cgroup
switch from perf_event_task_sched_out() (and which had to deal with
PF_EXITING). I realized there was an issue in sampling mode during
context switch. If you are measuring at the kernel level, you may get
samples from the previous task (which may not be in your cgroup) if a
counter overflows in the middle of the context switch, because the
cgroup has been switched before "current" is actually updated. Thus,
the perf_event interrupt handler associates the sample with the old
task.

So now I switch out everything from perf_event_task_sched_out() and
wait until perf_event_task_sched_in() to re-examine the situation for
the new task. That's the only way to maintain consistency with "current".


On Thu, Jan 20, 2011 at 3:39 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, 2011-01-20 at 15:30 +0200, Stephane Eranian wrote:
>> @@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>>
>>         /* Reassign the task to the init_css_set. */
>>         task_lock(tsk);
>> +       /*
>> +        * we mask interrupts to prevent:
>> +        * - timer tick to cause event rotation which
>> +        *   could schedule back in cgroup events after
>> +        *   they were switched out by perf_cgroup_sched_out()
>> +        *
>> +        * - preemption which could schedule back in cgroup events
>> +        */
>> +       local_irq_save(flags);
>> +       perf_cgroup_sched_out(tsk);
>>         cg = tsk->cgroups;
>>         tsk->cgroups = &init_css_set;
>> +       perf_cgroup_sched_in(tsk);
>> +       local_irq_restore(flags);
>>         task_unlock(tsk);
>>         if (cg)
>>                 put_css_set_taskexit(cg);
>
> So you too need a callback on cgroup change there.. Li, Paul, any chance
> we can fix this cgroup_subsys::exit callback? The scheduler code needs
> to do funny thing because its in the wrong place as well.
>


* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-01-20 14:39 ` Peter Zijlstra
  2011-01-20 14:46   ` Stephane Eranian
@ 2011-02-02 11:29   ` Peter Zijlstra
  2011-02-02 11:50     ` Balbir Singh
  1 sibling, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-02 11:29 UTC (permalink / raw)
  To: eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf, Paul Menage, Balbir Singh

On Thu, 2011-01-20 at 15:39 +0100, Peter Zijlstra wrote:
> On Thu, 2011-01-20 at 15:30 +0200, Stephane Eranian wrote:
> > @@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> >  
> >         /* Reassign the task to the init_css_set. */
> >         task_lock(tsk);
> > +       /*
> > +        * we mask interrupts to prevent:
> > +        * - timer tick to cause event rotation which
> > +        *   could schedule back in cgroup events after
> > +        *   they were switched out by perf_cgroup_sched_out()
> > +        *
> > +        * - preemption which could schedule back in cgroup events
> > +        */
> > +       local_irq_save(flags);
> > +       perf_cgroup_sched_out(tsk);
> >         cg = tsk->cgroups;
> >         tsk->cgroups = &init_css_set;
> > +       perf_cgroup_sched_in(tsk);
> > +       local_irq_restore(flags);
> >         task_unlock(tsk);
> >         if (cg)
> >                 put_css_set_taskexit(cg); 
> 
> So you too need a callback on cgroup change there.. Li, Paul, any chance
> we can fix this cgroup_subsys::exit callback? The scheduler code needs
> to do funny thing because its in the wrong place as well.

cgroup guys? Shall I just fix this exit thing since the only user seems
to be the scheduler and now perf for both of which its unfortunate at
best?

Balbir, memcontrol.c uses pre_destroy(), I pose that using this method
is broken per definition since it makes the cgroup empty notification
void.





* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-02 11:29   ` Peter Zijlstra
@ 2011-02-02 11:50     ` Balbir Singh
  2011-02-02 12:46       ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Balbir Singh @ 2011-02-02 11:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf, Paul Menage

* Peter Zijlstra <peterz@infradead.org> [2011-02-02 12:29:20]:

> On Thu, 2011-01-20 at 15:39 +0100, Peter Zijlstra wrote:
> > On Thu, 2011-01-20 at 15:30 +0200, Stephane Eranian wrote:
> > > @@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> > >  
> > >         /* Reassign the task to the init_css_set. */
> > >         task_lock(tsk);
> > > +       /*
> > > +        * we mask interrupts to prevent:
> > > +        * - timer tick to cause event rotation which
> > > +        *   could schedule back in cgroup events after
> > > +        *   they were switched out by perf_cgroup_sched_out()
> > > +        *
> > > +        * - preemption which could schedule back in cgroup events
> > > +        */
> > > +       local_irq_save(flags);
> > > +       perf_cgroup_sched_out(tsk);
> > >         cg = tsk->cgroups;
> > >         tsk->cgroups = &init_css_set;
> > > +       perf_cgroup_sched_in(tsk);
> > > +       local_irq_restore(flags);
> > >         task_unlock(tsk);
> > >         if (cg)
> > >                 put_css_set_taskexit(cg); 
> > 
> > So you too need a callback on cgroup change there.. Li, Paul, any chance
> > we can fix this cgroup_subsys::exit callback? The scheduler code needs
> > to do funny thing because its in the wrong place as well.
> 
> cgroup guys? Shall I just fix this exit thing since the only user seems
> to be the scheduler and now perf for both of which its unfortunate at
> best?

Are you suggesting that the cgroup_exit on task_exit notification should be
pulled out?

> 
> Balbir, memcontrol.c uses pre_destroy(), I pose that using this method
> is broken per definition since it makes the cgroup empty notification
> void.
>

We use pre_destroy() to reclaim, so that delete/rmdir() will be able
to clean up the node/group. I am not sure what you mean by it makes
the empty notification void and why pre_destroy() is broken?

-- 
	Three Cheers,
	Balbir


* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-02 11:50     ` Balbir Singh
@ 2011-02-02 12:46       ` Peter Zijlstra
  2011-02-02 19:02         ` Balbir Singh
  2011-02-07 19:29         ` [PATCH 1/2] perf_events: add cgroup support (v8) Paul Menage
  0 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-02 12:46 UTC (permalink / raw)
  To: balbir
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf, Paul Menage

On Wed, 2011-02-02 at 17:20 +0530, Balbir Singh wrote:
> * Peter Zijlstra <peterz@infradead.org> [2011-02-02 12:29:20]:
> 
> > On Thu, 2011-01-20 at 15:39 +0100, Peter Zijlstra wrote:
> > > On Thu, 2011-01-20 at 15:30 +0200, Stephane Eranian wrote:
> > > > @@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> > > >  
> > > >         /* Reassign the task to the init_css_set. */
> > > >         task_lock(tsk);
> > > > +       /*
> > > > +        * we mask interrupts to prevent:
> > > > +        * - timer tick to cause event rotation which
> > > > +        *   could schedule back in cgroup events after
> > > > +        *   they were switched out by perf_cgroup_sched_out()
> > > > +        *
> > > > +        * - preemption which could schedule back in cgroup events
> > > > +        */
> > > > +       local_irq_save(flags);
> > > > +       perf_cgroup_sched_out(tsk);
> > > >         cg = tsk->cgroups;
> > > >         tsk->cgroups = &init_css_set;
> > > > +       perf_cgroup_sched_in(tsk);
> > > > +       local_irq_restore(flags);
> > > >         task_unlock(tsk);
> > > >         if (cg)
> > > >                 put_css_set_taskexit(cg); 
> > > 
> > > So you too need a callback on cgroup change there.. Li, Paul, any chance
> > > we can fix this cgroup_subsys::exit callback? The scheduler code needs
> > > to do funny thing because its in the wrong place as well.
> > 
> > cgroup guys? Shall I just fix this exit thing since the only user seems
> > to be the scheduler and now perf for both of which its unfortunate at
> > best?
> 
> Are you suggesting that the cgroup_exit on task_exit notification should be
> pulled out?


No, just fixed. The callback as it exists isn't useful and leads to
hacks like the above.


> > Balbir, memcontrol.c uses pre_destroy(), I pose that using this method
> > is broken per definition since it makes the cgroup empty notification
> > void.
> >
> 
> We use pre_destroy() to reclaim, so that delete/rmdir() will be able
> to clean up the node/group. I am not sure what you mean by it makes
> the empty notification void and why pre_destroy() is broken?

A quick look at the code looked like it could return -EBUSY (and other
errors), in that case the rmdir of the empty cgroup will fail.

Therefore it can happen that after the last task is removed, and we get
the notification that the cgroup is empty, and we attempt the rmdir we
will fail.

This again means that all such notification handlers must poll state,
which is ridiculous.





* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-02 12:46       ` Peter Zijlstra
@ 2011-02-02 19:02         ` Balbir Singh
  2011-02-07 16:10           ` [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback Peter Zijlstra
  2011-02-07 19:29         ` [PATCH 1/2] perf_events: add cgroup support (v8) Paul Menage
  1 sibling, 1 reply; 28+ messages in thread
From: Balbir Singh @ 2011-02-02 19:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf, Paul Menage

* Peter Zijlstra <peterz@infradead.org> [2011-02-02 13:46:32]:

> On Wed, 2011-02-02 at 17:20 +0530, Balbir Singh wrote:
> > * Peter Zijlstra <peterz@infradead.org> [2011-02-02 12:29:20]:
> > 
> > > On Thu, 2011-01-20 at 15:39 +0100, Peter Zijlstra wrote:
> > > > On Thu, 2011-01-20 at 15:30 +0200, Stephane Eranian wrote:
> > > > > @@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> > > > >  
> > > > >         /* Reassign the task to the init_css_set. */
> > > > >         task_lock(tsk);
> > > > > +       /*
> > > > > +        * we mask interrupts to prevent:
> > > > > +        * - timer tick to cause event rotation which
> > > > > +        *   could schedule back in cgroup events after
> > > > > +        *   they were switched out by perf_cgroup_sched_out()
> > > > > +        *
> > > > > +        * - preemption which could schedule back in cgroup events
> > > > > +        */
> > > > > +       local_irq_save(flags);
> > > > > +       perf_cgroup_sched_out(tsk);
> > > > >         cg = tsk->cgroups;
> > > > >         tsk->cgroups = &init_css_set;
> > > > > +       perf_cgroup_sched_in(tsk);
> > > > > +       local_irq_restore(flags);
> > > > >         task_unlock(tsk);
> > > > >         if (cg)
> > > > >                 put_css_set_taskexit(cg); 
> > > > 
> > > > So you too need a callback on cgroup change there.. Li, Paul, any chance
> > > > we can fix this cgroup_subsys::exit callback? The scheduler code needs
> > > > to do funny thing because its in the wrong place as well.
> > > 
> > > cgroup guys? Shall I just fix this exit thing since the only user seems
> > > to be the scheduler and now perf for both of which its unfortunate at
> > > best?
> > 
> > Are you suggesting that the cgroup_exit on task_exit notification should be
> > pulled out?
> 
> 
> No, just fixed. The callback as it exists isn't useful and leads to
> hacks like the above.
>

OK
 
> 
> > > Balbir, memcontrol.c uses pre_destroy(), I pose that using this method
> > > is broken per definition since it makes the cgroup empty notification
> > > void.
> > >
> > 
> > We use pre_destroy() to reclaim, so that delete/rmdir() will be able
> > to clean up the node/group. I am not sure what you mean by it makes
> > the empty notification void and why pre_destroy() is broken?
> 
> A quick look at the code looked like it could return -EBUSY (and other
> errors), in that case the rmdir of the empty cgroup will fail.
> 
> Therefore it can happen that after the last task is removed, and we get
> the notification that the cgroup is empty, and we attempt the rmdir we
> will fail.
> 
> This again means that all such notification handlers must poll state,
> which is ridiculous.

The reason the failure occurs is that someone still holds an active
reference to the cgroup structure. In the case of memory, that used to
be every page_cgroup. The only reason a notification handler would have
to poll state is if

1. a notification is sent that there are no references and this group
   can be cleaned up, and
2. a new reference is acquired before the cleanup.

1 and 2 together are unlikely.


-- 
	Three Cheers,
	Balbir


* [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-02 19:02         ` Balbir Singh
@ 2011-02-07 16:10           ` Peter Zijlstra
  2011-02-07 19:28             ` Paul Menage
  2011-02-13 12:52             ` [RFC][PATCH] " Balbir Singh
  0 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-07 16:10 UTC (permalink / raw)
  To: balbir
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf, Paul Menage

On Thu, 2011-02-03 at 00:32 +0530, Balbir Singh wrote:
> > No, just fixed. The callback as it exists isn't useful and leads to
> > hacks like the above.

---
Subject: cgroup: Fix cgroup_subsys::exit callback
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon Feb 07 17:02:20 CET 2011

Make the ::exit method act like ::attach; it is, after all, very nearly
the same thing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
Index: linux-2.6/include/linux/cgroup.h
===================================================================
--- linux-2.6.orig/include/linux/cgroup.h
+++ linux-2.6/include/linux/cgroup.h
@@ -474,7 +474,8 @@ struct cgroup_subsys {
 			struct cgroup *old_cgrp, struct task_struct *tsk,
 			bool threadgroup);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
-	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
+	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			struct cgroup *old_cgrp, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
 			struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
Index: linux-2.6/kernel/cgroup.c
===================================================================
--- linux-2.6.orig/kernel/cgroup.c
+++ linux-2.6/kernel/cgroup.c
@@ -4230,20 +4230,10 @@ void cgroup_post_fork(struct task_struct
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-	int i;
 	struct css_set *cg;
+	int i;
 
-	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * modular subsystems can't use callbacks, so no need to lock
-		 * the subsys array
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss->exit)
-				ss->exit(ss, tsk);
-		}
-	}
+	mutex_lock(&cgroup_mutex);
 
 	/*
 	 * Unlink from the css_set task list if necessary.
@@ -4262,6 +4252,25 @@ void cgroup_exit(struct task_struct *tsk
 	cg = tsk->cgroups;
 	tsk->cgroups = &init_css_set;
 	task_unlock(tsk);
+
+	if (run_callbacks && need_forkexit_callback) {
+		/*
+		 * modular subsystems can't use callbacks, so no need to lock
+		 * the subsys array
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit) {
+				struct cgroup *old_cgrp =
+					rcu_dereference_raw(cg->subsys[i])->cgroup;
+				struct cgroup *cgrp = task_cgroup(tsk, i);
+				ss->exit(ss, cgrp, old_cgrp, tsk);
+			}
+		}
+	}
+
+	mutex_unlock(&cgroup_mutex);
+
 	if (cg)
 		put_css_set_taskexit(cg);
 }
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -606,9 +606,6 @@ static inline struct task_group *task_gr
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -9081,7 +9078,8 @@ cpu_cgroup_attach(struct cgroup_subsys *
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.
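
For illustration only, here is a sketch (not part of either posted patch) of
how the perf side could use this revised ::exit signature instead of
open-coding the switch in cgroup_exit(); it reuses perf_cgroup_switch(),
PERF_CGROUP_SWOUT/SWIN and the create/destroy methods from the cgroup-support
patch earlier in the thread, and the PF_EXITING check mirrors the
copy_process() failure-path note in cpu_cgroup_exit() above:

#ifdef CONFIG_CGROUP_PERF
static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
			     struct cgroup *old_cgrp, struct task_struct *task)
{
	/*
	 * cgroup_exit() is also called from the copy_process() failure
	 * path, where the task never ran; there is nothing to switch out.
	 */
	if (!(task->flags & PF_EXITING))
		return;

	/*
	 * The task has already been moved to init_css_set: switch out any
	 * events constrained to its old cgroup and re-evaluate for the new one.
	 */
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
}

/* as in the cgroup-support patch, plus the new exit hook */
struct cgroup_subsys perf_subsys = {
	.name		= "perf_event",
	.subsys_id	= perf_subsys_id,
	.create		= perf_cgroup_create,
	.destroy	= perf_cgroup_destroy,
	.exit		= perf_cgroup_exit,
};
#endif /* CONFIG_CGROUP_PERF */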




* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-01-20 13:30 [PATCH 1/2] perf_events: add cgroup support (v8) Stephane Eranian
  2011-01-20 14:39 ` Peter Zijlstra
@ 2011-02-07 16:10 ` Peter Zijlstra
  2011-02-07 20:30   ` Stephane Eranian
  2011-02-08 22:31   ` Stephane Eranian
  1 sibling, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-07 16:10 UTC (permalink / raw)
  To: eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf

Compile tested only, depends on the cgroup::exit patch

---
Subject: perf: Add cgroup support
From: Stephane Eranian <eranian@google.com>
Date: Mon Feb 07 17:02:25 CET 2011

This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.
    
The cgroup to monitor is passed as a file descriptor in the pid
argument to the syscall. The file descriptor must be opened on
the cgroup's directory in the cgroup filesystem. For instance, if
the cgroup is named foo and cgroupfs is mounted at /cgroup, then
the file descriptor is opened on /cgroup/foo. Cgroup mode is
activated by passing PERF_FLAG_PID_CGROUP in the flags argument
to the syscall.

For instance to measure in cgroup foo on CPU1 assuming
cgroupfs is mounted under /cgroup:

struct perf_event_attr attr;
int cgroup_fd, fd;

cgroup_fd = open("/cgroup/foo", O_RDONLY);
fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
close(cgroup_fd);

Signed-off-by: Stephane Eranian <eranian@google.com>
[ added perf_cgroup_{exit,attach} ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
 include/linux/cgroup.h        |    1 
 include/linux/cgroup_subsys.h |    4 
 include/linux/perf_event.h    |   33 +-
 init/Kconfig                  |   10 
 kernel/cgroup.c               |   23 +
 kernel/perf_event.c           |  641 +++++++++++++++++++++++++++++++++++++++---
 6 files changed, 665 insertions(+), 47 deletions(-)

Index: linux-2.6/include/linux/cgroup.h
===================================================================
--- linux-2.6.orig/include/linux/cgroup.h
+++ linux-2.6/include/linux/cgroup.h
@@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsy
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
 unsigned short css_depth(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
 
Index: linux-2.6/include/linux/cgroup_subsys.h
===================================================================
--- linux-2.6.orig/include/linux/cgroup_subsys.h
+++ linux-2.6/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
 SUBSYS(blkio)
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+SUBSYS(perf)
+#endif
+
 /* */
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -464,6 +464,7 @@ enum perf_callchain_context {
 
 #define PERF_FLAG_FD_NO_GROUP	(1U << 0)
 #define PERF_FLAG_FD_OUTPUT	(1U << 1)
+#define PERF_FLAG_PID_CGROUP	(1U << 2) /* pid=cgroup id, per-cpu mode only */
 
 #ifdef __KERNEL__
 /*
@@ -471,6 +472,7 @@ enum perf_callchain_context {
  */
 
 #ifdef CONFIG_PERF_EVENTS
+# include <linux/cgroup.h>
 # include <asm/perf_event.h>
 # include <asm/local64.h>
 #endif
@@ -716,6 +718,22 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
 
+#ifdef CONFIG_CGROUP_PERF
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64 time;
+	u64 timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state css;
+	struct perf_cgroup_info *info;	/* timing info, one per cpu */
+};
+#endif
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -832,6 +850,11 @@ struct perf_event {
 	struct event_filter		*filter;
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup		*cgrp; /* cgroup the event is attached to */
+	int				cgrp_defer_enabled;
+#endif
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -886,6 +909,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	struct rcu_head			rcu_head;
+	int				nr_cgroups; /* cgroup events present */
 };
 
 /*
@@ -905,6 +929,9 @@ struct perf_cpu_context {
 	struct list_head		rotation_list;
 	int				jiffies_interval;
 	struct pmu			*active_pmu;
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup		*cgrp;
+#endif
 };
 
 struct perf_output_handle {
@@ -1040,11 +1067,11 @@ perf_sw_event(u32 event_id, u64 nr, int
 	__perf_sw_event(event_id, nr, nmi, regs, addr);
 }
 
-extern atomic_t perf_task_events;
+extern atomic_t perf_sched_events;
 
 static inline void perf_event_task_sched_in(struct task_struct *task)
 {
-	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
 }
 
 static inline
@@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct ta
 {
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
 
-	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  select this option (if, for some reason, they need to disable it
 	  then noswapaccount does the trick).
 
+config CGROUP_PERF
+	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
+	depends on PERF_EVENTS && CGROUPS
+	help
+	  This option extends the per-cpu mode to restrict monitoring to
+	  threads which belong to the cgroup specified and run on the
+	  designated cpu.
+
+	  Say N if unsure.
+
 menuconfig CGROUP_SCHED
 	bool "Group CPU scheduler"
 	depends on EXPERIMENTAL
Index: linux-2.6/kernel/cgroup.c
===================================================================
--- linux-2.6.orig/kernel/cgroup.c
+++ linux-2.6/kernel/cgroup.c
@@ -4822,6 +4822,29 @@ css_get_next(struct cgroup_subsys *ss, i
 	return ret;
 }
 
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+	struct cgroup *cgrp;
+	struct inode *inode;
+	struct cgroup_subsys_state *css;
+
+	inode = f->f_dentry->d_inode;
+	/* check in cgroup filesystem dir */
+	if (inode->i_op != &cgroup_dir_inode_operations)
+		return ERR_PTR(-EBADF);
+
+	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+		return ERR_PTR(-EINVAL);
+
+	/* get cgroup */
+	cgrp = __d_cgrp(f->f_dentry);
+	css = cgrp->subsys[id];
+	return css ? css : ERR_PTR(-ENOENT);
+}
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 						   struct cgroup *cont)
Index: linux-2.6/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/kernel/perf_event.c
+++ linux-2.6/kernel/perf_event.c
@@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, in
 	return data.ret;
 }
 
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+		       PERF_FLAG_FD_OUTPUT  |\
+		       PERF_FLAG_PID_CGROUP)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
@@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct per
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type);
+			     enum event_type_t event_type,
+			     struct task_struct *task, int cgrp_sw);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void)	{ }
 
@@ -162,6 +176,315 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+#ifdef CONFIG_CGROUP_PERF
+
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, perf_subsys_id),
+			struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+	struct perf_cgroup *cgrp = NULL;
+	if (task)
+		cgrp = perf_cgroup_from_task(task);
+	return !event->cgrp || event->cgrp == cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+	css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+	css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+	perf_put_cgroup(event);
+	event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	struct perf_cgroup_info *t;
+
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+	struct perf_cgroup_info *info;
+	u64 now;
+
+	now = perf_clock();
+
+	info = this_cpu_ptr(cgrp->info);
+
+	info->time += now - info->timestamp;
+	info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+	if (cgrp_out)
+		__update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+	struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
+	/*
+	 * do not update time when cgroup is not active
+	 */
+	if (!event->cgrp || cgrp != event->cgrp)
+		return;
+
+	__update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+	struct perf_cgroup *cgrp;
+	struct perf_cgroup_info *info;
+
+	if (!task)
+		return;
+
+	cgrp = perf_cgroup_from_task(task);
+	info = per_cpu_ptr(cgrp->info, smp_processor_id());
+	info->timestamp = now;
+}
+
+#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/*
+	 * disable interrupts to avoid racing with nr_cgroups
+	 * changes via __perf_event_disable(). Also
+	 * avoids preemption.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * we reschedule only in the presence of cgroup
+	 * constrained events.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		/*
+		 * perf_cgroup_events says at least one
+		 * context on this CPU has cgroup events.
+		 *
+		 * ctx->nr_cgroups reports the number of cgroup
+		 * events for a context.
+		 */
+		if (cpuctx->ctx.nr_cgroups > 0) {
+
+			if (mode & PERF_CGROUP_SWOUT)
+				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+
+			if (mode & PERF_CGROUP_SWIN) {
+				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 1);
+				cpuctx->cgrp = perf_cgroup_from_task(task);
+			}
+		}
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	struct perf_cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct file *file;
+	int ret = 0, fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+
+	css = cgroup_css_from_dir(file, perf_subsys_id);
+	if (IS_ERR(css))
+		return PTR_ERR(css);
+
+	cgrp = container_of(css, struct perf_cgroup, css);
+	event->cgrp = cgrp;
+
+	/*
+	 * all events in a group must monitor
+	 * the same cgroup because a task belongs
+	 * to only one perf cgroup at a time
+	 */
+	if (group_leader && group_leader->cgrp != cgrp) {
+		perf_detach_cgroup(event);
+		ret = -EINVAL;
+	} else {
+		/* must be done before we fput() the file */
+		perf_get_cgroup(event);
+	}
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+	struct perf_cgroup_info *t;
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
+{
+	/*
+	 * when the current task's perf cgroup does not match
+	 * the event's, we need to remember to call the
+	 * perf_cgroup_mark_enabled() function the first time a task with
+	 * a matching perf cgroup is scheduled in.
+	 */
+	if (is_cgroup_event(event) && !perf_cgroup_match(event, task))
+		event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
+
+	if (!event->cgrp_defer_enabled)
+		return;
+
+	event->cgrp_defer_enabled = 0;
+
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+			sub->cgrp_defer_enabled = 0;
+		}
+	}
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+	return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
+{}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{}
+#endif
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -343,6 +666,10 @@ static void update_context_time(struct p
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
+
+	if (is_cgroup_event(event))
+		return perf_cgroup_event_time(event);
+
 	return ctx ? ctx->time : 0;
 }
 
@@ -357,9 +684,20 @@ static void update_event_times(struct pe
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
-
-	if (ctx->is_active)
+	/*
+	 * in cgroup mode, time_enabled represents
+	 * the time the event was enabled AND active
+	 * tasks were in the monitored cgroup. This is
+	 * independent of the activity of the context as
+	 * there may be a mix of cgroup and non-cgroup events.
+	 *
+	 * That is why we treat cgroup events differently
+	 * here.
+	 */
+	if (is_cgroup_event(event))
 		run_end = perf_event_time(event);
+	else if (ctx->is_active)
+		run_end = ctx->time;
 	else
 		run_end = event->tstamp_stopped;
 
@@ -371,6 +709,7 @@ static void update_event_times(struct pe
 		run_end = perf_event_time(event);
 
 	event->total_time_running = run_end - event->tstamp_running;
+
 }
 
 /*
@@ -419,6 +758,17 @@ list_add_event(struct perf_event *event,
 		list_add_tail(&event->group_entry, list);
 	}
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups++;
+		/*
+		 * one more event:
+		 * - that has cgroup constraint on event->cpu
+		 * - that may need work on context switch
+		 */
+		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_inc(&perf_sched_events);
+	}
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -545,6 +895,12 @@ list_del_event(struct perf_event *event,
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups--;
+		atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_dec(&perf_sched_events);
+	}
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -614,9 +970,10 @@ static void perf_group_detach(struct per
 }
 
 static inline int
-event_filter_match(struct perf_event *event)
+event_filter_match(struct perf_event *event, struct task_struct *task)
 {
-	return event->cpu == -1 || event->cpu == smp_processor_id();
+	return (event->cpu == -1 || event->cpu == smp_processor_id())
+	    && perf_cgroup_match(event, task);
 }
 
 static void
@@ -633,8 +990,8 @@ event_sched_out(struct perf_event *event
 	 * via read() for time_enabled, time_running:
 	 */
 	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
-		delta = ctx->time - event->tstamp_stopped;
+	    && !event_filter_match(event, current)) {
+		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
 	}
@@ -783,6 +1140,7 @@ static int __perf_event_disable(void *in
 	 */
 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
 		update_group_times(event);
 		if (event == event->group_leader)
 			group_sched_out(event, cpuctx, ctx);
@@ -851,6 +1209,41 @@ void perf_event_disable(struct perf_even
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+static void perf_set_shadow_time(struct perf_event *event,
+				 struct perf_event_context *ctx,
+				 u64 tstamp)
+{
+	/*
+	 * use the correct time source for the time snapshot
+	 *
+	 * We could get by without this by leveraging the
+	 * fact that to get to this function, the caller
+	 * has most likely already called update_context_time()
+	 * and update_cgrp_time_xx() and thus both timestamp
+	 * are identical (or very close). Given that tstamp is
+	 * already adjusted for cgroup, we could say that:
+	 *    tstamp - ctx->timestamp
+	 * is equivalent to
+	 *    tstamp - cgrp->timestamp.
+	 *
+	 * Then, in perf_output_read(), the calculation would
+	 * work with no changes because:
+	 * - event is guaranteed scheduled in
+	 * - no scheduled out in between
+	 * - thus the timestamp would be the same
+	 *
+	 * But this is a bit hairy.
+	 *
+	 * So instead, we have an explicit cgroup call to remain
+	 * within the time source all along. We believe it
+	 * is cleaner and simpler to understand.
+	 */
+	if (is_cgroup_event(event))
+		perf_cgroup_set_shadow_time(event, tstamp);
+	else
+		event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -876,7 +1269,7 @@ event_sched_in(struct perf_event *event,
 
 	event->tstamp_running += tstamp - event->tstamp_stopped;
 
-	event->shadow_ctx_time = tstamp - ctx->timestamp;
+	perf_set_shadow_time(event, ctx, tstamp);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -992,12 +1385,13 @@ static void add_event_to_ctx(struct perf
 
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	event->tstamp_enabled = tstamp;
 	event->tstamp_running = tstamp;
 	event->tstamp_stopped = tstamp;
+	event->tstamp_enabled = tstamp;
 }
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx);
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *tsk);
 
 /*
  * Cross CPU call to install and enable a performance event
@@ -1018,15 +1412,21 @@ static int  __perf_install_in_context(vo
 	 * which do context switches with IRQs enabled.
 	 */
 	if (ctx->task && !cpuctx->task_ctx)
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, ctx->task);
 
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
+	/*
+	 * update cgrp time only if current cgrp
+	 * matches event->cgrp. Must be done before
+	 * calling add_event_to_ctx()
+	 */
+	update_cgrp_time_from_event(event);
 
 	add_event_to_ctx(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		goto unlock;
 
 	/*
@@ -1160,10 +1560,19 @@ static int __perf_event_enable(void *inf
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		goto unlock;
+
+	/*
+	 * set current task's cgroup time reference point
+	 */
+	perf_cgroup_set_timestamp(current, perf_clock());
+
 	__perf_event_mark_enabled(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current)) {
+		if (is_cgroup_event(event))
+			perf_cgroup_defer_enabled(event, current);
 		goto unlock;
+	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
@@ -1292,6 +1701,7 @@ static void ctx_sched_out(struct perf_ev
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
+	update_cgrp_time_from_cpuctx(cpuctx);
 
 	if (!ctx->nr_active)
 		goto out;
@@ -1481,6 +1891,14 @@ void __perf_event_task_sched_out(struct
 
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
+
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch out PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_out(task);
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1509,16 +1927,21 @@ static void cpu_ctx_sched_out(struct per
 
 static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
+		    struct perf_cpu_context *cpuctx,
+		    struct task_struct *task, int cgrp_sw)
 {
 	struct perf_event *event;
 
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, task))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -1535,7 +1958,8 @@ ctx_pinned_sched_in(struct perf_event_co
 
 static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
+		      struct perf_cpu_context *cpuctx,
+		      struct task_struct *task, int cgrp_sw)
 {
 	struct perf_event *event;
 	int can_add_hw = 1;
@@ -1548,9 +1972,13 @@ ctx_flexible_sched_in(struct perf_event_
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, task))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -1561,36 +1989,41 @@ ctx_flexible_sched_in(struct perf_event_
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type)
+	     enum event_type_t event_type,
+	     struct task_struct *task, int cgrp_sw)
 {
+	u64 now;
+
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	if (likely(!ctx->nr_events))
 		goto out;
 
-	ctx->timestamp = perf_clock();
-
+	now = perf_clock();
+	ctx->timestamp = now;
+	perf_cgroup_set_timestamp(task, now);
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
 	if (event_type & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		ctx_pinned_sched_in(ctx, cpuctx, task, cgrp_sw);
 
 	/* Then walk through the lower prio flexible groups */
 	if (event_type & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+		ctx_flexible_sched_in(ctx, cpuctx, task, cgrp_sw);
 
 out:
 	raw_spin_unlock(&ctx->lock);
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
+			     enum event_type_t event_type,
+			     struct task_struct *task, int cgrp_sw)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task, cgrp_sw);
 }
 
 static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1602,11 +2035,12 @@ static void task_ctx_sched_in(struct per
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, NULL, 0);
 	cpuctx->task_ctx = ctx;
 }
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *task)
 {
 	struct perf_cpu_context *cpuctx;
 
@@ -1622,9 +2056,9 @@ static void perf_event_context_sched_in(
 	 */
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, 0);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, 0);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, 0);
 
 	cpuctx->task_ctx = ctx;
 
@@ -1657,8 +2091,15 @@ void __perf_event_task_sched_in(struct t
 		if (likely(!ctx))
 			continue;
 
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, task);
 	}
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch in PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_in(task);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1775,7 +2216,7 @@ static void perf_ctx_adjust_freq(struct
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, current))
 			continue;
 
 		hwc = &event->hw;
@@ -1833,9 +2274,10 @@ static void perf_rotate_context(struct p
 	struct perf_event_context *ctx = NULL;
 	int rotate = 0, remove = 1;
 
-	if (cpuctx->ctx.nr_events) {
+	ctx = &cpuctx->ctx;
+	if (ctx->nr_events) {
 		remove = 0;
-		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+		if (ctx->nr_events != ctx->nr_active)
 			rotate = 1;
 	}
 
@@ -1862,7 +2304,7 @@ static void perf_rotate_context(struct p
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current, 0);
 	if (ctx)
 		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
@@ -1941,7 +2383,7 @@ static void perf_event_enable_on_exec(st
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_context_sched_in(ctx);
+	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);
 }
@@ -1968,6 +2410,7 @@ static void __perf_event_read(void *info
 	raw_spin_lock(&ctx->lock);
 	if (ctx->is_active)
 		update_context_time(ctx);
+	update_cgrp_time_from_event(event);
 	update_event_times(event);
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
 		event->pmu->read(event);
@@ -1998,8 +2441,10 @@ static u64 perf_event_read(struct perf_e
 		 * (e.g., thread is blocked), in that case
 		 * we cannot update context time
 		 */
-		if (ctx->is_active)
+		if (ctx->is_active) {
 			update_context_time(ctx);
+			update_cgrp_time_from_event(event);
+		}
 		update_event_times(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
@@ -2384,7 +2829,7 @@ static void free_event(struct perf_event
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec(&perf_task_events);
+			jump_label_dec(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2400,6 +2845,9 @@ static void free_event(struct perf_event
 		event->buffer = NULL;
 	}
 
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -3984,7 +4432,7 @@ static int perf_event_task_match(struct
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if (event->attr.comm || event->attr.mmap ||
@@ -4121,7 +4569,7 @@ static int perf_event_comm_match(struct
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if (event->attr.comm)
@@ -4269,7 +4717,7 @@ static int perf_event_mmap_match(struct
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if ((!executable && event->attr.mmap_data) ||
@@ -5289,6 +5737,7 @@ static void task_clock_event_read(struct
 
 	if (!in_nmi()) {
 		update_context_time(event->ctx);
+		update_cgrp_time_from_event(event);
 		time = event->ctx->time;
 	} else {
 		u64 now = perf_clock();
@@ -5714,7 +6163,7 @@ perf_event_alloc(struct perf_event_attr
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_task_events);
+			jump_label_inc(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5889,7 +6338,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int err;
 
 	/* for future expandability... */
-	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+	if (flags & ~PERF_FLAG_ALL)
 		return -EINVAL;
 
 	err = perf_copy_attr(attr_uptr, &attr);
@@ -5906,6 +6355,15 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/*
+	 * In cgroup mode, the pid argument is used to pass the fd
+	 * opened to the cgroup directory in cgroupfs. The cpu argument
+	 * designates the cpu on which to monitor threads from that
+	 * cgroup.
+	 */
+	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+		return -EINVAL;
+
 	event_fd = get_unused_fd_flags(O_RDWR);
 	if (event_fd < 0)
 		return event_fd;
@@ -5923,7 +6381,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			group_leader = NULL;
 	}
 
-	if (pid != -1) {
+	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
 		task = find_lively_task_by_vpid(pid);
 		if (IS_ERR(task)) {
 			err = PTR_ERR(task);
@@ -5937,6 +6395,12 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_task;
 	}
 
+	if (flags & PERF_FLAG_PID_CGROUP) {
+		err = perf_cgroup_connect(pid, event, &attr, group_leader);
+		if (err)
+			goto err_alloc;
+	}
+
 	/*
 	 * Special case software events and allow them to be part of
 	 * any hardware group.
@@ -6797,3 +7261,92 @@ static int __init perf_event_sysfs_init(
 	return ret;
 }
 device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+	struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	struct perf_cgroup_info *t;
+	int c;
+
+	jc = kmalloc(sizeof(*jc), GFP_KERNEL);
+	if (!jc)
+		return ERR_PTR(-ENOMEM);
+
+	memset(jc, 0, sizeof(*jc));
+
+	jc->info = alloc_percpu(struct perf_cgroup_info);
+	if (!jc->info) {
+		kfree(jc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for_each_possible_cpu(c) {
+		t = per_cpu_ptr(jc->info, c);
+		t->time = 0;
+		t->timestamp = 0;
+	}
+	return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+			  struct perf_cgroup, css);
+	free_percpu(jc->info);
+	kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+	struct task_struct *task = info;
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	return 0;
+}
+
+static void perf_cgroup_move(struct task_struct *task)
+{
+	task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task,
+		bool threadgroup)
+{
+	perf_cgroup_move(task);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			perf_cgroup_move(c);
+		}
+		rcu_read_unlock();
+	}
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't run yet; this avoids
+	 * trying to poke a half-freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	perf_cgroup_move(task);
+}
+
+struct cgroup_subsys perf_subsys = {
+	.name = "perf_event",
+	.subsys_id = perf_subsys_id,
+	.create = perf_cgroup_create,
+	.destroy = perf_cgroup_destroy,
+	.exit = perf_cgroup_exit,
+	.attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */
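
For reference, one way to create and populate the cgroup used in the
changelog example, sketched in C (this assumes the perf_event subsystem
added above, a cgroupfs mount point at /cgroup, and a task pid to
monitor; names and paths are illustrative, error handling omitted):

/* mount cgroupfs with only the perf_event subsystem enabled */
mount("none", "/cgroup", "cgroup", 0, "perf_event");

/* create the cgroup and move the task into it */
mkdir("/cgroup/foo", 0755);

char buf[16];
int tfd = open("/cgroup/foo/tasks", O_WRONLY);
int len = snprintf(buf, sizeof(buf), "%d\n", (int)pid);
write(tfd, buf, len);
close(tfd);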



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-07 16:10           ` [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback Peter Zijlstra
@ 2011-02-07 19:28             ` Paul Menage
  2011-02-07 20:02               ` Peter Zijlstra
  2011-02-13 12:52             ` [RFC][PATCH] " Balbir Singh
  1 sibling, 1 reply; 28+ messages in thread
From: Paul Menage @ 2011-02-07 19:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Mon, Feb 7, 2011 at 8:10 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Make the ::exit method act like ::attach, it is after all very nearly
> the same thing.

The major difference between attach and exit is that the former is
only triggered in response to user cgroup-management action, whereas
the latter is triggered whenever a task exits, even if cgroups aren't
set up.


>  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>  {
> -       int i;
>        struct css_set *cg;
> +       int i;
>
> -       if (run_callbacks && need_forkexit_callback) {
> -               /*
> -                * modular subsystems can't use callbacks, so no need to lock
> -                * the subsys array
> -                */
> -               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> -                       struct cgroup_subsys *ss = subsys[i];
> -                       if (ss->exit)
> -                               ss->exit(ss, tsk);
> -               }
> -       }
> +       mutex_lock(&cgroup_mutex);

NACK - cgroup_mutex is way too heavy to take in the task exit path.
We'll need to find some other way to fix this if it's really needed.
task->alloc_lock is also normally a valid thing to synchronize against
cgroup moves, but I'd have to look at the exit path to see if it's
still valid there.

Paul

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-02 12:46       ` Peter Zijlstra
  2011-02-02 19:02         ` Balbir Singh
@ 2011-02-07 19:29         ` Paul Menage
  2011-02-07 20:09           ` Peter Zijlstra
  1 sibling, 1 reply; 28+ messages in thread
From: Paul Menage @ 2011-02-07 19:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Wed, Feb 2, 2011 at 4:46 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, 2011-02-02 at 17:20 +0530, Balbir Singh wrote:
>> * Peter Zijlstra <peterz@infradead.org> [2011-02-02 12:29:20]:
>>
>> > On Thu, 2011-01-20 at 15:39 +0100, Peter Zijlstra wrote:
>> > > On Thu, 2011-01-20 at 15:30 +0200, Stephane Eranian wrote:
>> > > > @@ -4259,8 +4261,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>> > > >
>> > > >         /* Reassign the task to the init_css_set. */
>> > > >         task_lock(tsk);
>> > > > +       /*
>> > > > +        * we mask interrupts to prevent:
>> > > > +        * - timer tick to cause event rotation which
>> > > > +        *   could schedule back in cgroup events after
>> > > > +        *   they were switched out by perf_cgroup_sched_out()
>> > > > +        *
>> > > > +        * - preemption which could schedule back in cgroup events
>> > > > +        */
>> > > > +       local_irq_save(flags);
>> > > > +       perf_cgroup_sched_out(tsk);
>> > > >         cg = tsk->cgroups;
>> > > >         tsk->cgroups = &init_css_set;
>> > > > +       perf_cgroup_sched_in(tsk);
>> > > > +       local_irq_restore(flags);
>> > > >         task_unlock(tsk);
>> > > >         if (cg)
>> > > >                 put_css_set_taskexit(cg);
>> > >
>> > > So you too need a callback on cgroup change there.. Li, Paul, any chance
>> > > we can fix this cgroup_subsys::exit callback? The scheduler code needs
>> > > to do funny thing because its in the wrong place as well.
>> >
>> > cgroup guys? Shall I just fix this exit thing since the only user seems
>> > to be the scheduler and now perf for both of which its unfortunate at
>> > best?
>>
>> Are you suggesting that the cgroup_exit on task_exit notification should be
>> pulled out?
>
>
> No, just fixed. The callback as it exists isn't useful and leads to
> hacks like the above.
>
>
>> > Balbir, memcontrol.c uses pre_destroy(), I pose that using this method
>> > is broken per definition since it makes the cgroup empty notification
>> > void.
>> >
>>
>> We use pre_destroy() to reclaim, so that delete/rmdir() will be able
>> to clean up the node/group. I am not sure what you mean by it makes
>> the empty notification void and why pre_destroy() is broken?
>
> A quick look at the code looked like it could return -EBUSY (and other
> errors), in that case the rmdir of the empty cgroup will fail.
>
> Therefore it can happen that after the last task is removed, and we get
> the notification that the cgroup is empty, and we attempt the rmdir we
> will fail.
>
> This again means that all such notification handlers must poll state,
> which is ridiculous.
>

Not necessarily - we could make a failed rmdir() set a bit that
causes the empty notification to fire again once the final refcount
on the cgroup is dropped.
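
In rough pseudo-code, the idea would be something like this (flag and
helper names are illustrative, not actual cgroup code):

	/* in rmdir(), when the cgroup still has css references: */
	if (cgroup_has_refs(cgrp)) {
		set_bit(CGRP_RETRY_RELEASE, &cgrp->flags);
		return -EBUSY;
	}

	/* ...and wherever the final reference on the cgroup is dropped: */
	if (test_and_clear_bit(CGRP_RETRY_RELEASE, &cgrp->flags))
		check_for_release(cgrp); /* fire the empty notification again */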

Paul

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-07 19:28             ` Paul Menage
@ 2011-02-07 20:02               ` Peter Zijlstra
  2011-02-07 21:21                 ` Paul Menage
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-07 20:02 UTC (permalink / raw)
  To: Paul Menage
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Mon, 2011-02-07 at 11:28 -0800, Paul Menage wrote:
> On Mon, Feb 7, 2011 at 8:10 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Make the ::exit method act like ::attach, it is after all very nearly
> > the same thing.
> 
> The major difference between attach and exit is that the former is
> only triggered in response to user cgroup-management action, whereas
> the latter is triggered whenever a task exits, even if cgroups aren't
> set up.

And the major likeness is that they both migrate a task from one cgroup
to another. You cannot simply ignore that.

> >  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> >  {
> > -       int i;
> >        struct css_set *cg;
> > +       int i;
> >
> > -       if (run_callbacks && need_forkexit_callback) {
> > -               /*
> > -                * modular subsystems can't use callbacks, so no need to lock
> > -                * the subsys array
> > -                */
> > -               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> > -                       struct cgroup_subsys *ss = subsys[i];
> > -                       if (ss->exit)
> > -                               ss->exit(ss, tsk);
> > -               }
> > -       }
> > +       mutex_lock(&cgroup_mutex);
> 
> NACK - cgroup_mutex is way too heavy to take in the task exit path.
> We'll need to find some other way to fix this if it's really needed.
> task->alloc_lock is also normally a valid thing to synchronize against
> cgroup moves, but I'd have to look at the exit path to see if it's
> still valid there.

If you'd respond more often than about once every two months,
I might actually care what you think.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-07 19:29         ` [PATCH 1/2] perf_events: add cgroup support (v8) Paul Menage
@ 2011-02-07 20:09           ` Peter Zijlstra
  2011-02-07 21:33             ` Paul Menage
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-07 20:09 UTC (permalink / raw)
  To: Paul Menage
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Mon, 2011-02-07 at 11:29 -0800, Paul Menage wrote:
> > This again means that all such notification handlers must poll state,
> > which is ridiculous.
> >
> 
> Not necessarily - we could make it that a failed rmdir() sets a bit
> that causes a notification again once the final refcount is dropped
> again on the cgroup. 

But that doesn't work if you fail for any reason other than a
refcount, which is something you yourself proposed a while back.

Also, the core cgroup code should deal with the cgroup refcount, not
individual controllers.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-07 16:10 ` Peter Zijlstra
@ 2011-02-07 20:30   ` Stephane Eranian
  2011-02-08 22:31   ` Stephane Eranian
  1 sibling, 0 replies; 28+ messages in thread
From: Stephane Eranian @ 2011-02-07 20:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf

Peter,

I will try your changes and report back tomorrow.
Thanks.


On Mon, Feb 7, 2011 at 5:10 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> Compile tested only, depends on the cgroup::exit patch
>
> ---
> Subject: perf: Add cgroup support
> From: Stephane Eranian <eranian@google.com>
> Date: Mon Feb 07 17:02:25 CET 2011
>
> This kernel patch adds the ability to filter monitoring based on
> container groups (cgroups). This is for use in per-cpu mode only.
>
> The cgroup to monitor is passed as a file descriptor in the pid
> argument to the syscall. The file descriptor must be opened to
> the cgroup name in the cgroup filesystem. For instance, if the
> cgroup name is foo and cgroupfs is mounted in /cgroup, then the
> file descriptor is opened to /cgroup/foo. Cgroup mode is
> activated by passing PERF_FLAG_PID_CGROUP in the flags argument
> to the syscall.
>
> For instance to measure in cgroup foo on CPU1 assuming
> cgroupfs is mounted under /cgroup:
>
> struct perf_event_attr attr;
> int cgroup_fd, fd;
>
> cgroup_fd = open("/cgroup/foo", O_RDONLY);
> fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
> close(cgroup_fd);
>
> Signed-off-by: Stephane Eranian <eranian@google.com>
> [ added perf_cgroup_{exit,attach} ]
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> LKML-Reference: <new-submission>
> ---
>  include/linux/cgroup.h        |    1
>  include/linux/cgroup_subsys.h |    4
>  include/linux/perf_event.h    |   33 +-
>  init/Kconfig                  |   10
>  kernel/cgroup.c               |   23 +
>  kernel/perf_event.c           |  641 +++++++++++++++++++++++++++++++++++++++---
>  6 files changed, 665 insertions(+), 47 deletions(-)
>
> Index: linux-2.6/include/linux/cgroup.h
> ===================================================================
> --- linux-2.6.orig/include/linux/cgroup.h
> +++ linux-2.6/include/linux/cgroup.h
> @@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsy
>  /* Get id and depth of css */
>  unsigned short css_id(struct cgroup_subsys_state *css);
>  unsigned short css_depth(struct cgroup_subsys_state *css);
> +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
>
>  #else /* !CONFIG_CGROUPS */
>
> Index: linux-2.6/include/linux/cgroup_subsys.h
> ===================================================================
> --- linux-2.6.orig/include/linux/cgroup_subsys.h
> +++ linux-2.6/include/linux/cgroup_subsys.h
> @@ -65,4 +65,8 @@ SUBSYS(net_cls)
>  SUBSYS(blkio)
>  #endif
>
> +#ifdef CONFIG_CGROUP_PERF
> +SUBSYS(perf)
> +#endif
> +
>  /* */
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -464,6 +464,7 @@ enum perf_callchain_context {
>
>  #define PERF_FLAG_FD_NO_GROUP  (1U << 0)
>  #define PERF_FLAG_FD_OUTPUT    (1U << 1)
> +#define PERF_FLAG_PID_CGROUP   (1U << 2) /* pid=cgroup id, per-cpu mode only */
>
>  #ifdef __KERNEL__
>  /*
> @@ -471,6 +472,7 @@ enum perf_callchain_context {
>  */
>
>  #ifdef CONFIG_PERF_EVENTS
> +# include <linux/cgroup.h>
>  # include <asm/perf_event.h>
>  # include <asm/local64.h>
>  #endif
> @@ -716,6 +718,22 @@ struct swevent_hlist {
>  #define PERF_ATTACH_GROUP      0x02
>  #define PERF_ATTACH_TASK       0x04
>
> +#ifdef CONFIG_CGROUP_PERF
> +/*
> + * perf_cgroup_info keeps track of time_enabled for a cgroup.
> + * This is a per-cpu dynamically allocated data structure.
> + */
> +struct perf_cgroup_info {
> +       u64 time;
> +       u64 timestamp;
> +};
> +
> +struct perf_cgroup {
> +       struct cgroup_subsys_state css;
> +       struct perf_cgroup_info *info;  /* timing info, one per cpu */
> +};
> +#endif
> +
>  /**
>  * struct perf_event - performance event kernel representation:
>  */
> @@ -832,6 +850,11 @@ struct perf_event {
>        struct event_filter             *filter;
>  #endif
>
> +#ifdef CONFIG_CGROUP_PERF
> +       struct perf_cgroup              *cgrp; /* cgroup event is attach to */
> +       int                             cgrp_defer_enabled;
> +#endif
> +
>  #endif /* CONFIG_PERF_EVENTS */
>  };
>
> @@ -886,6 +909,7 @@ struct perf_event_context {
>        u64                             generation;
>        int                             pin_count;
>        struct rcu_head                 rcu_head;
> +       int                             nr_cgroups; /* cgroup events present */
>  };
>
>  /*
> @@ -905,6 +929,9 @@ struct perf_cpu_context {
>        struct list_head                rotation_list;
>        int                             jiffies_interval;
>        struct pmu                      *active_pmu;
> +#ifdef CONFIG_CGROUP_PERF
> +       struct perf_cgroup              *cgrp;
> +#endif
>  };
>
>  struct perf_output_handle {
> @@ -1040,11 +1067,11 @@ perf_sw_event(u32 event_id, u64 nr, int
>        __perf_sw_event(event_id, nr, nmi, regs, addr);
>  }
>
> -extern atomic_t perf_task_events;
> +extern atomic_t perf_sched_events;
>
>  static inline void perf_event_task_sched_in(struct task_struct *task)
>  {
> -       COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
> +       COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
>  }
>
>  static inline
> @@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct ta
>  {
>        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
>
> -       COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
> +       COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
>  }
>
>  extern void perf_event_mmap(struct vm_area_struct *vma);
> Index: linux-2.6/init/Kconfig
> ===================================================================
> --- linux-2.6.orig/init/Kconfig
> +++ linux-2.6/init/Kconfig
> @@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>          select this option (if, for some reason, they need to disable it
>          then noswapaccount does the trick).
>
> +config CGROUP_PERF
> +       bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
> +       depends on PERF_EVENTS && CGROUPS
> +       help
> +         This option extends the per-cpu mode to restrict monitoring to
> +         threads which belong to the cgroup specificied and run on the
> +         designated cpu.
> +
> +         Say N if unsure.
> +
>  menuconfig CGROUP_SCHED
>        bool "Group CPU scheduler"
>        depends on EXPERIMENTAL
> Index: linux-2.6/kernel/cgroup.c
> ===================================================================
> --- linux-2.6.orig/kernel/cgroup.c
> +++ linux-2.6/kernel/cgroup.c
> @@ -4822,6 +4822,29 @@ css_get_next(struct cgroup_subsys *ss, i
>        return ret;
>  }
>
> +/*
> + * get corresponding css from file open on cgroupfs directory
> + */
> +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
> +{
> +       struct cgroup *cgrp;
> +       struct inode *inode;
> +       struct cgroup_subsys_state *css;
> +
> +       inode = f->f_dentry->d_inode;
> +       /* check in cgroup filesystem dir */
> +       if (inode->i_op != &cgroup_dir_inode_operations)
> +               return ERR_PTR(-EBADF);
> +
> +       if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
> +               return ERR_PTR(-EINVAL);
> +
> +       /* get cgroup */
> +       cgrp = __d_cgrp(f->f_dentry);
> +       css = cgrp->subsys[id];
> +       return css ? css : ERR_PTR(-ENOENT);
> +}
> +
>  #ifdef CONFIG_CGROUP_DEBUG
>  static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
>                                                   struct cgroup *cont)
> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, in
>        return data.ret;
>  }
>
> +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
> +                      PERF_FLAG_FD_OUTPUT  |\
> +                      PERF_FLAG_PID_CGROUP)
> +
>  enum event_type_t {
>        EVENT_FLEXIBLE = 0x1,
>        EVENT_PINNED = 0x2,
>        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
>  };
>
> -atomic_t perf_task_events __read_mostly;
> +/*
> + * perf_sched_events : >0 events exist
> + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
> + */
> +atomic_t perf_sched_events __read_mostly;
> +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
> +
>  static atomic_t nr_mmap_events __read_mostly;
>  static atomic_t nr_comm_events __read_mostly;
>  static atomic_t nr_task_events __read_mostly;
> @@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct per
>                              enum event_type_t event_type);
>
>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> -                            enum event_type_t event_type);
> +                            enum event_type_t event_type,
> +                            struct task_struct *task, int cgrp_sw);
> +
> +static void update_context_time(struct perf_event_context *ctx);
> +static u64 perf_event_time(struct perf_event *event);
>
>  void __weak perf_event_print_debug(void)       { }
>
> @@ -162,6 +176,315 @@ static inline u64 perf_clock(void)
>        return local_clock();
>  }
>
> +#ifdef CONFIG_CGROUP_PERF
> +
> +static inline struct perf_cgroup *
> +perf_cgroup_from_task(struct task_struct *task)
> +{
> +       return container_of(task_subsys_state(task, perf_subsys_id),
> +                       struct perf_cgroup, css);
> +}
> +
> +static inline bool
> +perf_cgroup_match(struct perf_event *event, struct task_struct *task)
> +{
> +       struct perf_cgroup *cgrp = NULL;
> +       if (task)
> +               cgrp = perf_cgroup_from_task(task);
> +       return !event->cgrp || event->cgrp == cgrp;
> +}
> +
> +static inline void perf_get_cgroup(struct perf_event *event)
> +{
> +       css_get(&event->cgrp->css);
> +}
> +
> +static inline void perf_put_cgroup(struct perf_event *event)
> +{
> +       css_put(&event->cgrp->css);
> +}
> +
> +static inline void perf_detach_cgroup(struct perf_event *event)
> +{
> +       perf_put_cgroup(event);
> +       event->cgrp = NULL;
> +}
> +
> +static inline int is_cgroup_event(struct perf_event *event)
> +{
> +       return event->cgrp != NULL;
> +}
> +
> +static inline u64 perf_cgroup_event_time(struct perf_event *event)
> +{
> +       struct perf_cgroup_info *t;
> +
> +       t = per_cpu_ptr(event->cgrp->info, event->cpu);
> +       return t->time;
> +}
> +
> +static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
> +{
> +       struct perf_cgroup_info *info;
> +       u64 now;
> +
> +       now = perf_clock();
> +
> +       info = this_cpu_ptr(cgrp->info);
> +
> +       info->time += now - info->timestamp;
> +       info->timestamp = now;
> +}
> +
> +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
> +{
> +       struct perf_cgroup *cgrp_out = cpuctx->cgrp;
> +       if (cgrp_out)
> +               __update_cgrp_time(cgrp_out);
> +}
> +
> +static inline void update_cgrp_time_from_event(struct perf_event *event)
> +{
> +       struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
> +       /*
> +        * do not update time when cgroup is not active
> +        */
> +       if (!event->cgrp || cgrp != event->cgrp)
> +               return;
> +
> +       __update_cgrp_time(event->cgrp);
> +}
> +
> +static inline void
> +perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
> +{
> +       struct perf_cgroup *cgrp;
> +       struct perf_cgroup_info *info;
> +
> +       if (!task)
> +               return;
> +
> +       cgrp = perf_cgroup_from_task(task);
> +       info = per_cpu_ptr(cgrp->info, smp_processor_id());
> +       info->timestamp = now;
> +}
> +
> +#define PERF_CGROUP_SWOUT      0x1 /* cgroup switch out every event */
> +#define PERF_CGROUP_SWIN       0x2 /* cgroup switch in events based on task */
> +
> +/*
> + * reschedule events based on the cgroup constraint of task.
> + *
> + * mode SWOUT : schedule out everything
> + * mode SWIN : schedule in based on cgroup for next
> + */
> +void perf_cgroup_switch(struct task_struct *task, int mode)
> +{
> +       struct perf_cpu_context *cpuctx;
> +       struct pmu *pmu;
> +       unsigned long flags;
> +
> +       /*
> +        * disable interrupts to avoid geting nr_cgroup
> +        * changes via __perf_event_disable(). Also
> +        * avoids preemption.
> +        */
> +       local_irq_save(flags);
> +
> +       /*
> +        * we reschedule only in the presence of cgroup
> +        * constrained events.
> +        */
> +       rcu_read_lock();
> +
> +       list_for_each_entry_rcu(pmu, &pmus, entry) {
> +
> +               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> +
> +               perf_pmu_disable(cpuctx->ctx.pmu);
> +
> +               /*
> +                * perf_cgroup_events says at least one
> +                * context on this CPU has cgroup events.
> +                *
> +                * ctx->nr_cgroups reports the number of cgroup
> +                * events for a context.
> +                */
> +               if (cpuctx->ctx.nr_cgroups > 0) {
> +
> +                       if (mode & PERF_CGROUP_SWOUT)
> +                               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> +
> +                       if (mode & PERF_CGROUP_SWIN) {
> +                               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 1);
> +                               cpuctx->cgrp = perf_cgroup_from_task(task);
> +                       }
> +               }
> +
> +               perf_pmu_enable(cpuctx->ctx.pmu);
> +       }
> +
> +       rcu_read_unlock();
> +
> +       local_irq_restore(flags);
> +}
> +
> +static inline void perf_cgroup_sched_out(struct task_struct *task)
> +{
> +       perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
> +}
> +
> +static inline void perf_cgroup_sched_in(struct task_struct *task)
> +{
> +       perf_cgroup_switch(task, PERF_CGROUP_SWIN);
> +}
> +
> +static inline int perf_cgroup_connect(int fd, struct perf_event *event,
> +                                     struct perf_event_attr *attr,
> +                                     struct perf_event *group_leader)
> +{
> +       struct perf_cgroup *cgrp;
> +       struct cgroup_subsys_state *css;
> +       struct file *file;
> +       int ret = 0, fput_needed;
> +
> +       file = fget_light(fd, &fput_needed);
> +       if (!file)
> +               return -EBADF;
> +
> +       css = cgroup_css_from_dir(file, perf_subsys_id);
> +       if (IS_ERR(css))
> +               return PTR_ERR(css);
> +
> +       cgrp = container_of(css, struct perf_cgroup, css);
> +       event->cgrp = cgrp;
> +
> +       /*
> +        * all events in a group must monitor
> +        * the same cgroup because a task belongs
> +        * to only one perf cgroup at a time
> +        */
> +       if (group_leader && group_leader->cgrp != cgrp) {
> +               perf_detach_cgroup(event);
> +               ret = -EINVAL;
> +       } else {
> +               /* must be done before we fput() the file */
> +               perf_get_cgroup(event);
> +       }
> +       fput_light(file, fput_needed);
> +       return ret;
> +}
> +
> +static inline void
> +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
> +{
> +       struct perf_cgroup_info *t;
> +       t = per_cpu_ptr(event->cgrp->info, event->cpu);
> +       event->shadow_ctx_time = now - t->timestamp;
> +}
> +
> +static inline void
> +perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
> +{
> +       /*
> +        * when the current task's perf cgroup does not match
> +        * the event's, we need to remember to call the
> +        * perf_mark_enable() function the first time a task with
> +        * a matching perf cgroup is scheduled in.
> +        */
> +       if (is_cgroup_event(event) && !perf_cgroup_match(event, task))
> +               event->cgrp_defer_enabled = 1;
> +}
> +
> +static inline void
> +perf_cgroup_mark_enabled(struct perf_event *event,
> +                        struct perf_event_context *ctx)
> +{
> +       struct perf_event *sub;
> +       u64 tstamp = perf_event_time(event);
> +
> +       if (!event->cgrp_defer_enabled)
> +               return;
> +
> +       event->cgrp_defer_enabled = 0;
> +
> +       event->tstamp_enabled = tstamp - event->total_time_enabled;
> +       list_for_each_entry(sub, &event->sibling_list, group_entry) {
> +               if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
> +                       sub->tstamp_enabled = tstamp - sub->total_time_enabled;
> +                       sub->cgrp_defer_enabled = 0;
> +               }
> +       }
> +}
> +#else /* !CONFIG_CGROUP_PERF */
> +
> +static inline bool
> +perf_cgroup_match(struct perf_event *event, struct task_struct *task)
> +{
> +       return true;
> +}
> +
> +static inline void perf_detach_cgroup(struct perf_event *event)
> +{}
> +
> +static inline int is_cgroup_event(struct perf_event *event)
> +{
> +       return 0;
> +}
> +
> +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
> +{
> +       return 0;
> +}
> +
> +static inline void update_cgrp_time_from_event(struct perf_event *event)
> +{}
> +
> +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
> +{}
> +
> +static inline void perf_cgroup_sched_out(struct task_struct *task)
> +{
> +}
> +
> +static inline void perf_cgroup_sched_in(struct task_struct *task)
> +{
> +}
> +
> +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
> +                                     struct perf_event_attr *attr,
> +                                     struct perf_event *group_leader)
> +{
> +       return -EINVAL;
> +}
> +
> +static inline void
> +perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
> +{}
> +
> +void
> +perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
> +{}
> +
> +static inline void
> +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
> +{}
> +
> +static inline u64 perf_cgroup_event_time(struct perf_event *event)
> +{
> +       return 0;
> +}
> +
> +static inline void
> +perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
> +{}
> +
> +static inline void
> +perf_cgroup_mark_enabled(struct perf_event *event,
> +                        struct perf_event_context *ctx)
> +{}
> +#endif
> +
>  void perf_pmu_disable(struct pmu *pmu)
>  {
>        int *count = this_cpu_ptr(pmu->pmu_disable_count);
> @@ -343,6 +666,10 @@ static void update_context_time(struct p
>  static u64 perf_event_time(struct perf_event *event)
>  {
>        struct perf_event_context *ctx = event->ctx;
> +
> +       if (is_cgroup_event(event))
> +               return perf_cgroup_event_time(event);
> +
>        return ctx ? ctx->time : 0;
>  }
>
> @@ -357,9 +684,20 @@ static void update_event_times(struct pe
>        if (event->state < PERF_EVENT_STATE_INACTIVE ||
>            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
>                return;
> -
> -       if (ctx->is_active)
> +       /*
> +        * in cgroup mode, time_enabled represents
> +        * the time the event was enabled AND active
> +        * tasks were in the monitored cgroup. This is
> +        * independent of the activity of the context as
> +        * there may be a mix of cgroup and non-cgroup events.
> +        *
> +        * That is why we treat cgroup events differently
> +        * here.
> +        */
> +       if (is_cgroup_event(event))
>                run_end = perf_event_time(event);
> +       else if (ctx->is_active)
> +               run_end = ctx->time;
>        else
>                run_end = event->tstamp_stopped;
>
> @@ -371,6 +709,7 @@ static void update_event_times(struct pe
>                run_end = perf_event_time(event);
>
>        event->total_time_running = run_end - event->tstamp_running;
> +
>  }
>
>  /*
> @@ -419,6 +758,17 @@ list_add_event(struct perf_event *event,
>                list_add_tail(&event->group_entry, list);
>        }
>
> +       if (is_cgroup_event(event)) {
> +               ctx->nr_cgroups++;
> +               /*
> +                * one more event:
> +                * - that has cgroup constraint on event->cpu
> +                * - that may need work on context switch
> +                */
> +               atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
> +               jump_label_inc(&perf_sched_events);
> +       }
> +
>        list_add_rcu(&event->event_entry, &ctx->event_list);
>        if (!ctx->nr_events)
>                perf_pmu_rotate_start(ctx->pmu);
> @@ -545,6 +895,12 @@ list_del_event(struct perf_event *event,
>
>        event->attach_state &= ~PERF_ATTACH_CONTEXT;
>
> +       if (is_cgroup_event(event)) {
> +               ctx->nr_cgroups--;
> +               atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
> +               jump_label_dec(&perf_sched_events);
> +       }
> +
>        ctx->nr_events--;
>        if (event->attr.inherit_stat)
>                ctx->nr_stat--;
> @@ -614,9 +970,10 @@ static void perf_group_detach(struct per
>  }
>
>  static inline int
> -event_filter_match(struct perf_event *event)
> +event_filter_match(struct perf_event *event, struct task_struct *task)
>  {
> -       return event->cpu == -1 || event->cpu == smp_processor_id();
> +       return (event->cpu == -1 || event->cpu == smp_processor_id())
> +           && perf_cgroup_match(event, task);
>  }
>
>  static void
> @@ -633,8 +990,8 @@ event_sched_out(struct perf_event *event
>         * via read() for time_enabled, time_running:
>         */
>        if (event->state == PERF_EVENT_STATE_INACTIVE
> -           && !event_filter_match(event)) {
> -               delta = ctx->time - event->tstamp_stopped;
> +           && !event_filter_match(event, current)) {
> +               delta = tstamp - event->tstamp_stopped;
>                event->tstamp_running += delta;
>                event->tstamp_stopped = tstamp;
>        }
> @@ -783,6 +1140,7 @@ static int __perf_event_disable(void *in
>         */
>        if (event->state >= PERF_EVENT_STATE_INACTIVE) {
>                update_context_time(ctx);
> +               update_cgrp_time_from_event(event);
>                update_group_times(event);
>                if (event == event->group_leader)
>                        group_sched_out(event, cpuctx, ctx);
> @@ -851,6 +1209,41 @@ void perf_event_disable(struct perf_even
>        raw_spin_unlock_irq(&ctx->lock);
>  }
>
> +static void perf_set_shadow_time(struct perf_event *event,
> +                                struct perf_event_context *ctx,
> +                                u64 tstamp)
> +{
> +       /*
> +        * use the correct time source for the time snapshot
> +        *
> +        * We could get by without this by leveraging the
> +        * fact that to get to this function, the caller
> +        * has most likely already called update_context_time()
> +        * and update_cgrp_time_xx() and thus both timestamps
> +        * are identical (or very close). Given that tstamp is
> +        * already adjusted for cgroup, we could say that:
> +        *    tstamp - ctx->timestamp
> +        * is equivalent to
> +        *    tstamp - cgrp->timestamp.
> +        *
> +        * Then, in perf_output_read(), the calculation would
> +        * work with no changes because:
> +        * - event is guaranteed scheduled in
> +        * - no scheduled out in between
> +        * - thus the timestamp would be the same
> +        *
> +        * But this is a bit hairy.
> +        *
> +        * So instead, we have an explicit cgroup call to remain
> +        * within the time source all along. We believe it
> +        * is cleaner and simpler to understand.
> +        */
> +       if (is_cgroup_event(event))
> +               perf_cgroup_set_shadow_time(event, tstamp);
> +       else
> +               event->shadow_ctx_time = tstamp - ctx->timestamp;
> +}
> +
>  static int
>  event_sched_in(struct perf_event *event,
>                 struct perf_cpu_context *cpuctx,
> @@ -876,7 +1269,7 @@ event_sched_in(struct perf_event *event,
>
>        event->tstamp_running += tstamp - event->tstamp_stopped;
>
> -       event->shadow_ctx_time = tstamp - ctx->timestamp;
> +       perf_set_shadow_time(event, ctx, tstamp);
>
>        if (!is_software_event(event))
>                cpuctx->active_oncpu++;
> @@ -992,12 +1385,13 @@ static void add_event_to_ctx(struct perf
>
>        list_add_event(event, ctx);
>        perf_group_attach(event);
> -       event->tstamp_enabled = tstamp;
>        event->tstamp_running = tstamp;
>        event->tstamp_stopped = tstamp;
> +       event->tstamp_enabled = tstamp;
>  }
>
> -static void perf_event_context_sched_in(struct perf_event_context *ctx);
> +static void perf_event_context_sched_in(struct perf_event_context *ctx,
> +                                       struct task_struct *tsk);
>
>  /*
>  * Cross CPU call to install and enable a performance event
> @@ -1018,15 +1412,21 @@ static int  __perf_install_in_context(vo
>         * which do context switches with IRQs enabled.
>         */
>        if (ctx->task && !cpuctx->task_ctx)
> -               perf_event_context_sched_in(ctx);
> +               perf_event_context_sched_in(ctx, ctx->task);
>
>        raw_spin_lock(&ctx->lock);
>        ctx->is_active = 1;
>        update_context_time(ctx);
> +       /*
> +        * update cgrp time only if current cgrp
> +        * matches event->cgrp. Must be done before
> +        * calling add_event_to_ctx()
> +        */
> +       update_cgrp_time_from_event(event);
>
>        add_event_to_ctx(event, ctx);
>
> -       if (!event_filter_match(event))
> +       if (!event_filter_match(event, current))
>                goto unlock;
>
>        /*
> @@ -1160,10 +1560,19 @@ static int __perf_event_enable(void *inf
>
>        if (event->state >= PERF_EVENT_STATE_INACTIVE)
>                goto unlock;
> +
> +       /*
> +        * set current task's cgroup time reference point
> +        */
> +       perf_cgroup_set_timestamp(current, perf_clock());
> +
>        __perf_event_mark_enabled(event, ctx);
>
> -       if (!event_filter_match(event))
> +       if (!event_filter_match(event, current)) {
> +               if (is_cgroup_event(event))
> +                       perf_cgroup_defer_enabled(event, current);
>                goto unlock;
> +       }
>
>        /*
>         * If the event is in a group and isn't the group leader,
> @@ -1292,6 +1701,7 @@ static void ctx_sched_out(struct perf_ev
>        if (likely(!ctx->nr_events))
>                goto out;
>        update_context_time(ctx);
> +       update_cgrp_time_from_cpuctx(cpuctx);
>
>        if (!ctx->nr_active)
>                goto out;
> @@ -1481,6 +1891,14 @@ void __perf_event_task_sched_out(struct
>
>        for_each_task_context_nr(ctxn)
>                perf_event_context_sched_out(task, ctxn, next);
> +
> +       /*
> +        * if cgroup events exist on this CPU, then we need
> +        * to check if we have to switch out PMU state.
> +        * cgroup events are in system-wide mode only
> +        */
> +       if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
> +               perf_cgroup_sched_out(task);
>  }
>
>  static void task_ctx_sched_out(struct perf_event_context *ctx,
> @@ -1509,16 +1927,21 @@ static void cpu_ctx_sched_out(struct per
>
>  static void
>  ctx_pinned_sched_in(struct perf_event_context *ctx,
> -                   struct perf_cpu_context *cpuctx)
> +                   struct perf_cpu_context *cpuctx,
> +                   struct task_struct *task, int cgrp_sw)
>  {
>        struct perf_event *event;
>
>        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
>                if (event->state <= PERF_EVENT_STATE_OFF)
>                        continue;
> -               if (!event_filter_match(event))
> +               if (!event_filter_match(event, task))
>                        continue;
>
> +               /* may need to reset tstamp_enabled */
> +               if (is_cgroup_event(event))
> +                       perf_cgroup_mark_enabled(event, ctx);
> +
>                if (group_can_go_on(event, cpuctx, 1))
>                        group_sched_in(event, cpuctx, ctx);
>
> @@ -1535,7 +1958,8 @@ ctx_pinned_sched_in(struct perf_event_co
>
>  static void
>  ctx_flexible_sched_in(struct perf_event_context *ctx,
> -                     struct perf_cpu_context *cpuctx)
> +                     struct perf_cpu_context *cpuctx,
> +                     struct task_struct *task, int cgrp_sw)
>  {
>        struct perf_event *event;
>        int can_add_hw = 1;
> @@ -1548,9 +1972,13 @@ ctx_flexible_sched_in(struct perf_event_
>                 * Listen to the 'cpu' scheduling filter constraint
>                 * of events:
>                 */
> -               if (!event_filter_match(event))
> +               if (!event_filter_match(event, task))
>                        continue;
>
> +               /* may need to reset tstamp_enabled */
> +               if (is_cgroup_event(event))
> +                       perf_cgroup_mark_enabled(event, ctx);
> +
>                if (group_can_go_on(event, cpuctx, can_add_hw)) {
>                        if (group_sched_in(event, cpuctx, ctx))
>                                can_add_hw = 0;
> @@ -1561,36 +1989,41 @@ ctx_flexible_sched_in(struct perf_event_
>  static void
>  ctx_sched_in(struct perf_event_context *ctx,
>             struct perf_cpu_context *cpuctx,
> -            enum event_type_t event_type)
> +            enum event_type_t event_type,
> +            struct task_struct *task, int cgrp_sw)
>  {
> +       u64 now;
> +
>        raw_spin_lock(&ctx->lock);
>        ctx->is_active = 1;
>        if (likely(!ctx->nr_events))
>                goto out;
>
> -       ctx->timestamp = perf_clock();
> -
> +       now = perf_clock();
> +       ctx->timestamp = now;
> +       perf_cgroup_set_timestamp(task, now);
>        /*
>         * First go through the list and put on any pinned groups
>         * in order to give them the best chance of going on.
>         */
>        if (event_type & EVENT_PINNED)
> -               ctx_pinned_sched_in(ctx, cpuctx);
> +               ctx_pinned_sched_in(ctx, cpuctx, task, cgrp_sw);
>
>        /* Then walk through the lower prio flexible groups */
>        if (event_type & EVENT_FLEXIBLE)
> -               ctx_flexible_sched_in(ctx, cpuctx);
> +               ctx_flexible_sched_in(ctx, cpuctx, task, cgrp_sw);
>
>  out:
>        raw_spin_unlock(&ctx->lock);
>  }
>
>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> -                            enum event_type_t event_type)
> +                            enum event_type_t event_type,
> +                            struct task_struct *task, int cgrp_sw)
>  {
>        struct perf_event_context *ctx = &cpuctx->ctx;
>
> -       ctx_sched_in(ctx, cpuctx, event_type);
> +       ctx_sched_in(ctx, cpuctx, event_type, task, cgrp_sw);
>  }
>
>  static void task_ctx_sched_in(struct perf_event_context *ctx,
> @@ -1602,11 +2035,12 @@ static void task_ctx_sched_in(struct per
>        if (cpuctx->task_ctx == ctx)
>                return;
>
> -       ctx_sched_in(ctx, cpuctx, event_type);
> +       ctx_sched_in(ctx, cpuctx, event_type, NULL, 0);
>        cpuctx->task_ctx = ctx;
>  }
>
> -static void perf_event_context_sched_in(struct perf_event_context *ctx)
> +static void perf_event_context_sched_in(struct perf_event_context *ctx,
> +                                       struct task_struct *task)
>  {
>        struct perf_cpu_context *cpuctx;
>
> @@ -1622,9 +2056,9 @@ static void perf_event_context_sched_in(
>         */
>        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>
> -       ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
> -       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
> -       ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
> +       ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, 0);
> +       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, 0);
> +       ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, 0);
>
>        cpuctx->task_ctx = ctx;
>
> @@ -1657,8 +2091,15 @@ void __perf_event_task_sched_in(struct t
>                if (likely(!ctx))
>                        continue;
>
> -               perf_event_context_sched_in(ctx);
> +               perf_event_context_sched_in(ctx, task);
>        }
> +       /*
> +        * if cgroup events exist on this CPU, then we need
> +        * to check if we have to switch in PMU state.
> +        * cgroup events are in system-wide mode only
> +        */
> +       if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
> +               perf_cgroup_sched_in(task);
>  }
>
>  #define MAX_INTERRUPTS (~0ULL)
> @@ -1775,7 +2216,7 @@ static void perf_ctx_adjust_freq(struct
>                if (event->state != PERF_EVENT_STATE_ACTIVE)
>                        continue;
>
> -               if (!event_filter_match(event))
> +               if (!event_filter_match(event, current))
>                        continue;
>
>                hwc = &event->hw;
> @@ -1833,9 +2274,10 @@ static void perf_rotate_context(struct p
>        struct perf_event_context *ctx = NULL;
>        int rotate = 0, remove = 1;
>
> -       if (cpuctx->ctx.nr_events) {
> +       ctx = &cpuctx->ctx;
> +       if (ctx->nr_events) {
>                remove = 0;
> -               if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
> +               if (ctx->nr_events != ctx->nr_active)
>                        rotate = 1;
>        }
>
> @@ -1862,7 +2304,7 @@ static void perf_rotate_context(struct p
>        if (ctx)
>                rotate_ctx(ctx);
>
> -       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
> +       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current, 0);
>        if (ctx)
>                task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
>
> @@ -1941,7 +2383,7 @@ static void perf_event_enable_on_exec(st
>
>        raw_spin_unlock(&ctx->lock);
>
> -       perf_event_context_sched_in(ctx);
> +       perf_event_context_sched_in(ctx, ctx->task);
>  out:
>        local_irq_restore(flags);
>  }
> @@ -1968,6 +2410,7 @@ static void __perf_event_read(void *info
>        raw_spin_lock(&ctx->lock);
>        if (ctx->is_active)
>                update_context_time(ctx);
> +       update_cgrp_time_from_event(event);
>        update_event_times(event);
>        if (event->state == PERF_EVENT_STATE_ACTIVE)
>                event->pmu->read(event);
> @@ -1998,8 +2441,10 @@ static u64 perf_event_read(struct perf_e
>                 * (e.g., thread is blocked), in that case
>                 * we cannot update context time
>                 */
> -               if (ctx->is_active)
> +               if (ctx->is_active) {
>                        update_context_time(ctx);
> +                       update_cgrp_time_from_event(event);
> +               }
>                update_event_times(event);
>                raw_spin_unlock_irqrestore(&ctx->lock, flags);
>        }
> @@ -2384,7 +2829,7 @@ static void free_event(struct perf_event
>
>        if (!event->parent) {
>                if (event->attach_state & PERF_ATTACH_TASK)
> -                       jump_label_dec(&perf_task_events);
> +                       jump_label_dec(&perf_sched_events);
>                if (event->attr.mmap || event->attr.mmap_data)
>                        atomic_dec(&nr_mmap_events);
>                if (event->attr.comm)
> @@ -2400,6 +2845,9 @@ static void free_event(struct perf_event
>                event->buffer = NULL;
>        }
>
> +       if (is_cgroup_event(event))
> +               perf_detach_cgroup(event);
> +
>        if (event->destroy)
>                event->destroy(event);
>
> @@ -3984,7 +4432,7 @@ static int perf_event_task_match(struct
>        if (event->state < PERF_EVENT_STATE_INACTIVE)
>                return 0;
>
> -       if (!event_filter_match(event))
> +       if (!event_filter_match(event, current))
>                return 0;
>
>        if (event->attr.comm || event->attr.mmap ||
> @@ -4121,7 +4569,7 @@ static int perf_event_comm_match(struct
>        if (event->state < PERF_EVENT_STATE_INACTIVE)
>                return 0;
>
> -       if (!event_filter_match(event))
> +       if (!event_filter_match(event, current))
>                return 0;
>
>        if (event->attr.comm)
> @@ -4269,7 +4717,7 @@ static int perf_event_mmap_match(struct
>        if (event->state < PERF_EVENT_STATE_INACTIVE)
>                return 0;
>
> -       if (!event_filter_match(event))
> +       if (!event_filter_match(event, current))
>                return 0;
>
>        if ((!executable && event->attr.mmap_data) ||
> @@ -5289,6 +5737,7 @@ static void task_clock_event_read(struct
>
>        if (!in_nmi()) {
>                update_context_time(event->ctx);
> +               update_cgrp_time_from_event(event);
>                time = event->ctx->time;
>        } else {
>                u64 now = perf_clock();
> @@ -5714,7 +6163,7 @@ perf_event_alloc(struct perf_event_attr
>
>        if (!event->parent) {
>                if (event->attach_state & PERF_ATTACH_TASK)
> -                       jump_label_inc(&perf_task_events);
> +                       jump_label_inc(&perf_sched_events);
>                if (event->attr.mmap || event->attr.mmap_data)
>                        atomic_inc(&nr_mmap_events);
>                if (event->attr.comm)
> @@ -5889,7 +6338,7 @@ SYSCALL_DEFINE5(perf_event_open,
>        int err;
>
>        /* for future expandability... */
> -       if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
> +       if (flags & ~PERF_FLAG_ALL)
>                return -EINVAL;
>
>        err = perf_copy_attr(attr_uptr, &attr);
> @@ -5906,6 +6355,15 @@ SYSCALL_DEFINE5(perf_event_open,
>                        return -EINVAL;
>        }
>
> +       /*
> +        * In cgroup mode, the pid argument is used to pass the fd
> +        * opened to the cgroup directory in cgroupfs. The cpu argument
> +        * designates the cpu on which to monitor threads from that
> +        * cgroup.
> +        */
> +       if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
> +               return -EINVAL;
> +
>        event_fd = get_unused_fd_flags(O_RDWR);
>        if (event_fd < 0)
>                return event_fd;
> @@ -5923,7 +6381,7 @@ SYSCALL_DEFINE5(perf_event_open,
>                        group_leader = NULL;
>        }
>
> -       if (pid != -1) {
> +       if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
>                task = find_lively_task_by_vpid(pid);
>                if (IS_ERR(task)) {
>                        err = PTR_ERR(task);
> @@ -5937,6 +6395,12 @@ SYSCALL_DEFINE5(perf_event_open,
>                goto err_task;
>        }
>
> +       if (flags & PERF_FLAG_PID_CGROUP) {
> +               err = perf_cgroup_connect(pid, event, &attr, group_leader);
> +               if (err)
> +                       goto err_alloc;
> +       }
> +
>        /*
>         * Special case software events and allow them to be part of
>         * any hardware group.
> @@ -6797,3 +7261,92 @@ static int __init perf_event_sysfs_init(
>        return ret;
>  }
>  device_initcall(perf_event_sysfs_init);
> +
> +#ifdef CONFIG_CGROUP_PERF
> +static struct cgroup_subsys_state *perf_cgroup_create(
> +       struct cgroup_subsys *ss, struct cgroup *cont)
> +{
> +       struct perf_cgroup *jc;
> +       struct perf_cgroup_info *t;
> +       int c;
> +
> +       jc = kmalloc(sizeof(*jc), GFP_KERNEL);
> +       if (!jc)
> +               return ERR_PTR(-ENOMEM);
> +
> +       memset(jc, 0, sizeof(*jc));
> +
> +       jc->info = alloc_percpu(struct perf_cgroup_info);
> +       if (!jc->info) {
> +               kfree(jc);
> +               return ERR_PTR(-ENOMEM);
> +       }
> +
> +       for_each_possible_cpu(c) {
> +               t = per_cpu_ptr(jc->info, c);
> +               t->time = 0;
> +               t->timestamp = 0;
> +       }
> +       return &jc->css;
> +}
> +
> +static void perf_cgroup_destroy(struct cgroup_subsys *ss,
> +                               struct cgroup *cont)
> +{
> +       struct perf_cgroup *jc;
> +       jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
> +                         struct perf_cgroup, css);
> +       free_percpu(jc->info);
> +       kfree(jc);
> +}
> +
> +static int __perf_cgroup_move(void *info)
> +{
> +       struct task_struct *task = info;
> +       perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
> +       return 0;
> +}
> +
> +static void perf_cgroup_move(struct task_struct *task)
> +{
> +       task_function_call(task, __perf_cgroup_move, task);
> +}
> +
> +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +               struct cgroup *old_cgrp, struct task_struct *task,
> +               bool threadgroup)
> +{
> +       perf_cgroup_move(task);
> +       if (threadgroup) {
> +               struct task_struct *c;
> +               rcu_read_lock();
> +               list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> +                       perf_cgroup_move(c);
> +               }
> +               rcu_read_unlock();
> +       }
> +}
> +
> +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +               struct cgroup *old_cgrp, struct task_struct *task)
> +{
> +       /*
> +        * cgroup_exit() is called in the copy_process() failure path.
> +        * Ignore this case since the task hasn't run yet, this avoids
> +        * trying to poke a half freed task state from generic code.
> +        */
> +       if (!(task->flags & PF_EXITING))
> +               return;
> +
> +       perf_cgroup_move(task);
> +}
> +
> +struct cgroup_subsys perf_subsys = {
> +       .name = "perf_event",
> +       .subsys_id = perf_subsys_id,
> +       .create = perf_cgroup_create,
> +       .destroy = perf_cgroup_destroy,
> +       .exit = perf_cgroup_exit,
> +       .attach = perf_cgroup_attach,
> +};
> +#endif /* CONFIG_CGROUP_PERF */
>
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-07 20:02               ` Peter Zijlstra
@ 2011-02-07 21:21                 ` Paul Menage
  2011-02-08 10:24                   ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Paul Menage @ 2011-02-07 21:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Mon, Feb 7, 2011 at 12:02 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, 2011-02-07 at 11:28 -0800, Paul Menage wrote:
>> On Mon, Feb 7, 2011 at 8:10 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> >
>> > Make the ::exit method act like ::attach, it is after all very nearly
>> > the same thing.
>>
>> The major difference between attach and exit is that the former is
>> only triggered in response to user cgroup-management action, whereas
>> the latter is triggered whenever a task exits, even if cgroups aren't
>> set up.
>
> And the major likeness is that they both migrate a task from one cgroup
> to another. You cannot simply ignore that.

True, and the exit path for cgroups has always been a bit fuzzy - that
was kind of inherited from cpusets, where this worked out OK, but the
CPU subsystem has more interesting requirements.

An important semantic difference between attach and exit is that in
the exit case the destination is always the root, and the task in
question is going to be exiting before doing anything else
interesting. So it should be possible to optimize that path a lot
compared to the regular attach (many things related to resource usage
can be ignored, since the task won't have time to actually use any
non-trivial amount of resources).

>
> If maybe you're like respond more often than about once every two months
> I might actually care what you think.

Yes, sadly since switching groups at Google, cgroups has become pretty
much just a spare-time activity for me - but that in itself doesn't
automatically invalidate my opinion when I do have time to respond.
It's still the case that cgroup_mutex is an incredibly heavyweight
mutex that has no business being in the task exit path. Or do you just
believe that ad hominem is a valid style of argument?

How about if the exit callback was moved before the preceding
task_unlock()? Since I think the scheduler is still the only user of
the exit callback, redefining the locking semantics should be fine.

Paul

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-07 20:09           ` Peter Zijlstra
@ 2011-02-07 21:33             ` Paul Menage
  0 siblings, 0 replies; 28+ messages in thread
From: Paul Menage @ 2011-02-07 21:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Mon, Feb 7, 2011 at 12:09 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Also, the core cgroup code should deal with the cgroup refcount, not
> individual controllers.

Right, that's what I meant - make it so that an rmdir that fails
because a subsystem is still holding a refcount puts/leaves the cgroup
in a state where userspace will still get a notification when the last
refcount is dropped. Not something that the subsystem would have to
worry about.

Paul

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-07 21:21                 ` Paul Menage
@ 2011-02-08 10:24                   ` Peter Zijlstra
  2011-02-10  2:04                     ` Li Zefan
                                       ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-08 10:24 UTC (permalink / raw)
  To: Paul Menage
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Mon, 2011-02-07 at 13:21 -0800, Paul Menage wrote:
> On Mon, Feb 7, 2011 at 12:02 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Mon, 2011-02-07 at 11:28 -0800, Paul Menage wrote:
> >> On Mon, Feb 7, 2011 at 8:10 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >> >
> >> > Make the ::exit method act like ::attach, it is after all very nearly
> >> > the same thing.
> >>
> >> The major difference between attach and exit is that the former is
> >> only triggered in response to user cgroup-management action, whereas
> >> the latter is triggered whenever a task exits, even if cgroups aren't
> >> set up.
> >
> > And the major likeness is that they both migrate a task from one cgroup
> > to another. You cannot simply ignore that.
> 
> True, and the exit path for cgroups has always been a bit fuzzy - that
> was kind of inherited from cpusets, where this worked out OK, but the
> CPU subsystem has more interesting requirements.

Yeah, from that pov we're actually still a running task that can and
will still schedule.

> An important semantic difference between attach and exit is that in
> the exit case the destination is always the root, and the task in
> question is going to be exiting before doing anything else
> interesting. So it should be possible to optimize that path a lot
> compared to the regular attach (many things related to resource usage
> can be ignored, since the task won't have time to actually use any
> non-trivial amount of resources).

One could also argue that the accumulated time (/other resources) of all
tasks exiting might end up being a significant amount, but yeah
whatever :-)

I'm mostly concerned with not wrecking state (and crashing/leaking etc).

> >
> > If maybe you're like respond more often than about once every two months
> > I might actually care what you think.
> 
> Yes, sadly since switching groups at Google, cgroups has become pretty
> much just a spare-time activity for me 

And here I thought Google was starting to understand what community
participation meant.. is there anybody you know who can play a more
active role in the whole cgroup thing?

> - but that in itself doesn't
> automatically invalidate my opinion when I do have time to respond.
> It's still the case that cgroup_mutex is an incredibly heavyweight
> mutex that has no business being in the task exit path. Or do you just
> believe that ad hominem is a valid style of argument?

No, it was mostly frustration talking.

> How about if the exit callback was moved before the preceding
> task_unlock()? Since I think the scheduler is still the only user of
> the exit callback, redefining the locking semantics should be fine.

Like the below? Both the perf and sched exit callbacks are fine with
being called under task_lock afaict, but I haven't actually run with
lockdep enabled to see if I missed something.

I also pondered doing the cgroup exit from __put_task_struct() but that
had another problem iirc.

---
Index: linux-2.6/include/linux/cgroup.h
===================================================================
--- linux-2.6.orig/include/linux/cgroup.h
+++ linux-2.6/include/linux/cgroup.h
@@ -474,7 +474,8 @@ struct cgroup_subsys {
 			struct cgroup *old_cgrp, struct task_struct *tsk,
 			bool threadgroup);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
-	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
+	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			struct cgroup *old_cgrp, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
 			struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
Index: linux-2.6/kernel/cgroup.c
===================================================================
--- linux-2.6.orig/kernel/cgroup.c
+++ linux-2.6/kernel/cgroup.c
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-	int i;
 	struct css_set *cg;
-
-	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * modular subsystems can't use callbacks, so no need to lock
-		 * the subsys array
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss->exit)
-				ss->exit(ss, tsk);
-		}
-	}
+	int i;
 
 	/*
 	 * Unlink from the css_set task list if necessary.
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk
 	task_lock(tsk);
 	cg = tsk->cgroups;
 	tsk->cgroups = &init_css_set;
+
+	if (run_callbacks && need_forkexit_callback) {
+		/*
+		 * modular subsystems can't use callbacks, so no need to lock
+		 * the subsys array
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit) {
+				struct cgroup *old_cgrp =
+					rcu_dereference_raw(cg->subsys[i])->cgroup;
+				struct cgroup *cgrp = task_cgroup(tsk, i);
+				ss->exit(ss, cgrp, old_cgrp, tsk);
+			}
+		}
+	}
 	task_unlock(tsk);
+
 	if (cg)
 		put_css_set_taskexit(cg);
 }
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -606,9 +606,6 @@ static inline struct task_group *task_gr
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -9081,7 +9078,8 @@ cpu_cgroup_attach(struct cgroup_subsys *
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-07 16:10 ` Peter Zijlstra
  2011-02-07 20:30   ` Stephane Eranian
@ 2011-02-08 22:31   ` Stephane Eranian
  2011-02-09  9:47     ` Peter Zijlstra
  1 sibling, 1 reply; 28+ messages in thread
From: Stephane Eranian @ 2011-02-08 22:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf

Peter,

See comments below.


On Mon, Feb 7, 2011 at 5:10 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> Compile tested only, depends on the cgroup::exit patch
>
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -905,6 +929,9 @@ struct perf_cpu_context {
>        struct list_head                rotation_list;
>        int                             jiffies_interval;
>        struct pmu                      *active_pmu;
> +#ifdef CONFIG_CGROUP_PERF
> +       struct perf_cgroup              *cgrp;
> +#endif
>  };
>
I don't quite understand the motivation for adding cgrp to cpuctx.

> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
> +{
> +       struct perf_cgroup *cgrp_out = cpuctx->cgrp;
> +       if (cgrp_out)
> +               __update_cgrp_time(cgrp_out);
> +}
> +
What's the benefit of this form compared to the original from_task() version?

> +void perf_cgroup_switch(struct task_struct *task, int mode)
> +{
> +       struct perf_cpu_context *cpuctx;
> +       struct pmu *pmu;
> +       unsigned long flags;
> +
> +       /*
> +        * disable interrupts to avoid getting nr_cgroup
> +        * changes via __perf_event_disable(). Also
> +        * avoids preemption.
> +        */
> +       local_irq_save(flags);
> +
> +       /*
> +        * we reschedule only in the presence of cgroup
> +        * constrained events.
> +        */
> +       rcu_read_lock();
> +
> +       list_for_each_entry_rcu(pmu, &pmus, entry) {
> +
> +               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> +
> +               perf_pmu_disable(cpuctx->ctx.pmu);
> +
> +               /*
> +                * perf_cgroup_events says at least one
> +                * context on this CPU has cgroup events.
> +                *
> +                * ctx->nr_cgroups reports the number of cgroup
> +                * events for a context.
> +                */
> +               if (cpuctx->ctx.nr_cgroups > 0) {
> +
> +                       if (mode & PERF_CGROUP_SWOUT)
> +                               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> +
> +                       if (mode & PERF_CGROUP_SWIN) {
> +                               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 1);
> +                               cpuctx->cgrp = perf_cgroup_from_task(task);
> +                       }
> +               }
I think there is a risk of cpuctx->cgrp pointing to stale cgrp information.
Shouldn't we also set cpuctx->cgrp = NULL on SWOUT?

> +static int __perf_cgroup_move(void *info)
> +{
> +       struct task_struct *task = info;
> +       perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
> +       return 0;
> +}
> +
> +static void perf_cgroup_move(struct task_struct *task)
> +{
> +       task_function_call(task, __perf_cgroup_move, task);
> +}
> +
> +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +               struct cgroup *old_cgrp, struct task_struct *task,
> +               bool threadgroup)
> +{
> +       perf_cgroup_move(task);
> +       if (threadgroup) {
> +               struct task_struct *c;
> +               rcu_read_lock();
> +               list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> +                       perf_cgroup_move(c);
> +               }
> +               rcu_read_unlock();
> +       }
> +}
> +
I suspect my original patch was not necessarily handling the attach completely
when you move an existing task into a cgroup which was already monitored.
I think you may have had to wait until a ctxsw. Looks like this callback handles
this better.

Let me make sure I understand the threadgroup iteration, though. I suspect
this handles the situation where a multi-threaded app is moved into a cgroup
while there is already cgroup monitoring active. In that case and if we do not
want to wait until there is at least one ctxsw on all CPUs, then we have to
check if the other threads are not already running on the other CPUs. If so,
we need to do a cgroup switch on those CPUs. Otherwise, we have nothing to
do. Am I getting this right?

> +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +               struct cgroup *old_cgrp, struct task_struct *task)
> +{
> +       /*
> +        * cgroup_exit() is called in the copy_process() failure path.
> +        * Ignore this case since the task hasn't ran yet, this avoids
> +        * trying to poke a half freed task state from generic code.
> +        */
> +       if (!(task->flags & PF_EXITING))
> +               return;
> +
> +       perf_cgroup_move(task);
> +}
> +
Those callbacks look good to me. They certainly alleviate the need for the
hack in cgroup_exit().

Thanks for fixing this.

> +struct cgroup_subsys perf_subsys = {
> +       .name = "perf_event",
> +       .subsys_id = perf_subsys_id,
> +       .create = perf_cgroup_create,
> +       .destroy = perf_cgroup_destroy,
> +       .exit = perf_cgroup_exit,
> +       .attach = perf_cgroup_attach,
> +};
> +#endif /* CONFIG_CGROUP_PERF */
>
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-08 22:31   ` Stephane Eranian
@ 2011-02-09  9:47     ` Peter Zijlstra
  2011-02-10 11:47       ` Stephane Eranian
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-09  9:47 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf

On Tue, 2011-02-08 at 23:31 +0100, Stephane Eranian wrote:
> Peter,
> 
> See comments below.
> 
> 
> On Mon, Feb 7, 2011 at 5:10 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> > Compile tested only, depends on the cgroup::exit patch
> >
> > --- linux-2.6.orig/include/linux/perf_event.h
> > +++ linux-2.6/include/linux/perf_event.h
> > @@ -905,6 +929,9 @@ struct perf_cpu_context {
> >        struct list_head                rotation_list;
> >        int                             jiffies_interval;
> >        struct pmu                      *active_pmu;
> > +#ifdef CONFIG_CGROUP_PERF
> > +       struct perf_cgroup              *cgrp;
> > +#endif
> >  };
> >
> I don't quite understand the motivation for adding cgrp to cpuctx.
> 
> > --- linux-2.6.orig/kernel/perf_event.c
> > +++ linux-2.6/kernel/perf_event.c
> > +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
> > +{
> > +       struct perf_cgroup *cgrp_out = cpuctx->cgrp;
> > +       if (cgrp_out)
> > +               __update_cgrp_time(cgrp_out);
> > +}
> > +
> What's the benefit of this form compared to the original from_task() version?

Answering both questions, I did this so we could still do the
sched_out() while the task has already been flipped to a new cgroup.
Note that both attach and the new exit cgroup_subsys methods are called
after they update the task's cgroup. While they do provide the old
cgroup as an argument, making use of that requires passing that along
which would have been a much more invasive change.
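
[ Editor's note: an illustration-only fragment, not standalone code, using
  only names from the patch above. The cgroup core flips task->cgroups
  before invoking the subsystem callbacks, so at switch-out time
  perf_cgroup_from_task() already returns the new cgroup; only the pointer
  cached in the cpuctx still identifies the cgroup that was being measured. ]

	/* cgroup core (attach/exit), roughly:
	 *   task->cgroups = new_cgrp;
	 *   ss->attach() / ss->exit()
	 *     -> perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	 */

	/* SWIN: remember which cgroup this CPU now monitors */
	cpuctx->cgrp = perf_cgroup_from_task(task);

	/* SWOUT: charge elapsed time to the cgroup we had been monitoring,
	 * not to whatever task->cgroups points at by now */
	if (cpuctx->cgrp)
		__update_cgrp_time(cpuctx->cgrp);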


> > +                       if (mode & PERF_CGROUP_SWOUT)
> > +                               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> > +
> > +                       if (mode & PERF_CGROUP_SWIN) {
> > +                               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 1);
> > +                               cpuctx->cgrp = perf_cgroup_from_task(task);
> > +                       }
> > +               }
> I think there is a risk on cpuctx->cgrp pointing to stale cgrp information.
> Shouldn't we also set cpuctx->cgrp = NULL on SWOUT?

Yeah, we probably should.

> > +static int __perf_cgroup_move(void *info)
> > +{
> > +       struct task_struct *task = info;
> > +       perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
> > +       return 0;
> > +}
> > +
> > +static void perf_cgroup_move(struct task_struct *task)
> > +{
> > +       task_function_call(task, __perf_cgroup_move, task);
> > +}
> > +
> > +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > +               struct cgroup *old_cgrp, struct task_struct *task,
> > +               bool threadgroup)
> > +{
> > +       perf_cgroup_move(task);
> > +       if (threadgroup) {
> > +               struct task_struct *c;
> > +               rcu_read_lock();
> > +               list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> > +                       perf_cgroup_move(c);
> > +               }
> > +               rcu_read_unlock();
> > +       }
> > +}
> > +
> I suspect my original patch was not necessarily handling the attach completely
> when you move an existing task into a cgroup which was already monitored.
> I think you may have had to wait until a ctxsw. Looks like this callback handles
> this better.

Right, this deals with moving a task into a cgroup that isn't currently
being monitored and its converse, moving it out of a cgroup that is
being monitored.

> Let me make sure I understand the threadgroup iteration, though. I suspect
> this handles the situation where a multi-threaded app is moved into a cgroup

Indeed.

> while there is already cgroup monitoring active. In that case and if we do not
> want to wait until there is at least one ctxsw on all CPUs, then we have to
> check if the other threads are not already running on the other CPUs.If so,
> we need to do a cgroup switch on those CPUs. Otherwise, we have nothing to
> do. Am I getting this right?

Right, so if any of those tasks is currently running, that cpu will be
monitoring their old cgroup, hence we send an IPI to flip cgroups.

> > +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > +               struct cgroup *old_cgrp, struct task_struct *task)
> > +{
> > +       /*
> > +        * cgroup_exit() is called in the copy_process() failure path.
> > +        * Ignore this case since the task hasn't ran yet, this avoids
> > +        * trying to poke a half freed task state from generic code.
> > +        */
> > +       if (!(task->flags & PF_EXITING))
> > +               return;
> > +
> > +       perf_cgroup_move(task);
> > +}
> > +
> Those callbacks looks good to me. They certainly alleviate the need for the
> hack in cgorup_exit().
> 
> Thanks for fixing this.

n/p, now all we need to do is get this cgroup_subsys::exit stuff
sorted ;-)




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-08 10:24                   ` Peter Zijlstra
@ 2011-02-10  2:04                     ` Li Zefan
  2011-02-11 12:13                       ` Peter Zijlstra
  2011-02-14  4:32                     ` Paul Menage
  2011-02-16 13:46                     ` [tip:perf/core] " tip-bot for Peter Zijlstra
  2 siblings, 1 reply; 28+ messages in thread
From: Li Zefan @ 2011-02-10  2:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul Menage, balbir, eranian, linux-kernel, mingo, paulus, davem,
	fweisbec, perfmon2-devel, eranian, robert.richter, acme

(just came back from vacation)

>> How about if the exit callback was moved before the preceding
>> task_unlock()? Since I think the scheduler is still the only user of
>> the exit callback, redefining the locking semantics should be fine.
> 
> Like the below? Both the perf and sched exit callback are fine with
> being called under task_lock afaict, but I haven't actually ran with
> lockdep enabled to see if I missed something.
> 

This change should be fine.

...

> +
> +	if (run_callbacks && need_forkexit_callback) {
> +		/*
> +		 * modular subsystems can't use callbacks, so no need to lock
> +		 * the subsys array
> +		 */
> +		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> +			struct cgroup_subsys *ss = subsys[i];
> +			if (ss->exit) {
> +				struct cgroup *old_cgrp =
> +					rcu_dereference_raw(cg->subsys[i])->cgroup;
> +				struct cgroup *cgrp = task_cgroup(tsk, i);
> +				ss->exit(ss, cgrp, old_cgrp, tsk);

Since both sched and perf won't use the 2 args @cgrp and @old_cgrp,
don't bother to change the ->exit interface?

> +			}
> +		}
> +	}
>  	task_unlock(tsk);
> +
>  	if (cg)
>  		put_css_set_taskexit(cg);
>  }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-09  9:47     ` Peter Zijlstra
@ 2011-02-10 11:47       ` Stephane Eranian
  2011-02-11  0:55         ` Li Zefan
  0 siblings, 1 reply; 28+ messages in thread
From: Stephane Eranian @ 2011-02-10 11:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, perfmon2-devel,
	eranian, robert.richter, acme, lizf, Paul Menage

On Wed, Feb 9, 2011 at 10:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, 2011-02-08 at 23:31 +0100, Stephane Eranian wrote:
>> Peter,
>>
>> See comments below.
>>
>>
>> On Mon, Feb 7, 2011 at 5:10 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>> > Compile tested only, depends on the cgroup::exit patch
>> >
>> > --- linux-2.6.orig/include/linux/perf_event.h
>> > +++ linux-2.6/include/linux/perf_event.h
>> > @@ -905,6 +929,9 @@ struct perf_cpu_context {
>> >        struct list_head                rotation_list;
>> >        int                             jiffies_interval;
>> >        struct pmu                      *active_pmu;
>> > +#ifdef CONFIG_CGROUP_PERF
>> > +       struct perf_cgroup              *cgrp;
>> > +#endif
>> >  };
>> >
>> I don't quite understand the motivation for adding cgrp to cpuctx.
>>
>> > --- linux-2.6.orig/kernel/perf_event.c
>> > +++ linux-2.6/kernel/perf_event.c
>> > +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
>> > +{
>> > +       struct perf_cgroup *cgrp_out = cpuctx->cgrp;
>> > +       if (cgrp_out)
>> > +               __update_cgrp_time(cgrp_out);
>> > +}
>> > +
>> What's the benefit of this form compared to the original from_task() version?
>
> Answering both questions, I did this so we could still do the
> sched_out() while the task has already been flipped to a new cgroup.
> Note that both attach and the new exit cgroup_subsys methods are called
> after they update the task's cgroup. While they do provide the old
> cgroup as an argument, making use of that requires passing that along
> which would have been a much more invasive change.
>
>
>> > +                       if (mode & PERF_CGROUP_SWOUT)
>> > +                               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> > +
>> > +                       if (mode & PERF_CGROUP_SWIN) {
>> > +                               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 1);
>> > +                               cpuctx->cgrp = perf_cgroup_from_task(task);
>> > +                       }
>> > +               }
>> I think there is a risk on cpuctx->cgrp pointing to stale cgrp information.
>> Shouldn't we also set cpuctx->cgrp = NULL on SWOUT?
>
> Yeah, we probably should.
>
>> > +static int __perf_cgroup_move(void *info)
>> > +{
>> > +       struct task_struct *task = info;
>> > +       perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
>> > +       return 0;
>> > +}
>> > +
>> > +static void perf_cgroup_move(struct task_struct *task)
>> > +{
>> > +       task_function_call(task, __perf_cgroup_move, task);
>> > +}
>> > +
>> > +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
>> > +               struct cgroup *old_cgrp, struct task_struct *task,
>> > +               bool threadgroup)
>> > +{
>> > +       perf_cgroup_move(task);
>> > +       if (threadgroup) {
>> > +               struct task_struct *c;
>> > +               rcu_read_lock();
>> > +               list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
>> > +                       perf_cgroup_move(c);
>> > +               }
>> > +               rcu_read_unlock();
>> > +       }
>> > +}
>> > +
>> I suspect my original patch was not necessarily handling the attach completely
>> when you move an existing task into a cgroup which was already monitored.
>> I think you may have had to wait until a ctxsw. Looks like this callback handles
>> this better.
>
> Right, this deals with moving a task into a cgroup that isn't currently
> being monitored and its converse, moving it out of a cgroup that is
> being monitored.
>
>> Let me make sure I understand the threadgroup iteration, though. I suspect
>> this handles the situation where a multi-threaded app is moved into a cgroup
>
> Indeed.
>
>> while there is already cgroup monitoring active. In that case and if we do not
>> want to wait until there is at least one ctxsw on all CPUs, then we have to
>> check if the other threads are not already running on the other CPUs.If so,
>> we need to do a cgroup switch on those CPUs. Otherwise, we have nothing to
>> do. Am I getting this right?
>
> Right, so if any of those tasks is currently running, that cpu will be
> monitoring their old cgroup, hence we send an IPI to flip cgroups.
>
I have built a test case where this would trigger. I launched a multi-threaded
app, and then I moved the pid into a cgroup via: echo PID >/cgroup/tests/tasks.
I don't see any perf_cgroup move beyond the PID passed.

I looked at kernel/cgroup.c and I could not find an invocation of
ss->attach() that
would pass threadgroup = true. So I am confused here.

I wonder how the cgroupfs 'echo PID >tasks' interface would make the distinction
between PID and TID. It seems possible to move one thread of a multi-threaded
process into a cgroup but not the others.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-10 11:47       ` Stephane Eranian
@ 2011-02-11  0:55         ` Li Zefan
  2011-02-11  9:56           ` Stephane Eranian
  0 siblings, 1 reply; 28+ messages in thread
From: Li Zefan @ 2011-02-11  0:55 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, Paul Menage

>>> while there is already cgroup monitoring active. In that case and if we do not
>>> want to wait until there is at least one ctxsw on all CPUs, then we have to
>>> check if the other threads are not already running on the other CPUs.If so,
>>> we need to do a cgroup switch on those CPUs. Otherwise, we have nothing to
>>> do. Am I getting this right?
>>
>> Right, so if any of those tasks is currently running, that cpu will be
>> monitoring their old cgroup, hence we send an IPI to flip cgroups.
>>
> I have built a test case where this would trigger. I launched a multi-threaded
> app, and then I move the pid into a cgroup via: echo PID >/cgroup/tests/tasks.
> I don't see any perf_cgroup move beyond the PID passed.
> 
> I looked at kernel/cgroup.c and I could not find a invocation of
> ss->attach() that
> would pass threadgroup = true. So I am confused here.
> 
> I wonder how the cgroupfs 'echo PID >tasks' interface would make the distinction
> between PID and TID. It seems possible to move one thread of a multi-threaded
> process into a cgroup but not the others.
> 

You can do this:

	# echo PID > cgroup.procs

once the patchset that implements the above feature is accepted. See:

	https://lkml.org/lkml/2011/2/7/418

The below commit that confused you is actually a part of the above patchset,
but it sneaked into the kernel accidentally:

commit be367d09927023d081f9199665c8500f69f14d22
Author: Ben Blum <bblum@google.com>
Date:   Wed Sep 23 15:56:31 2009 -0700

    cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time
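
[ Editor's note: a minimal userspace sketch of the TID/TGID distinction, not
  part of any patch in this thread. It assumes cgroupfs is mounted at /cgroup
  with a "tests" group as in the test above, and a kernel where cgroup.procs
  is writable per the patchset referenced above. ]

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

static int write_id(const char *path, long id)
{
	char buf[32];
	int n, fd;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = snprintf(buf, sizeof(buf), "%ld", id);
	if (write(fd, buf, n) != n) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* writing a TID to "tasks" moves only that one thread */
	if (write_id("/cgroup/tests/tasks", syscall(SYS_gettid)))
		perror("tasks");

	/* writing the TGID to "cgroup.procs" moves the whole thread group */
	if (write_id("/cgroup/tests/cgroup.procs", (long)getpid()))
		perror("cgroup.procs");

	return 0;
}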

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-11  0:55         ` Li Zefan
@ 2011-02-11  9:56           ` Stephane Eranian
  2011-02-11 13:36             ` Stephane Eranian
  0 siblings, 1 reply; 28+ messages in thread
From: Stephane Eranian @ 2011-02-11  9:56 UTC (permalink / raw)
  To: Li Zefan
  Cc: Peter Zijlstra, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, Paul Menage

Li,

On Fri, Feb 11, 2011 at 1:55 AM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>> while there is already cgroup monitoring active. In that case and if we do not
>>>> want to wait until there is at least one ctxsw on all CPUs, then we have to
>>>> check if the other threads are not already running on the other CPUs.If so,
>>>> we need to do a cgroup switch on those CPUs. Otherwise, we have nothing to
>>>> do. Am I getting this right?
>>>
>>> Right, so if any of those tasks is currently running, that cpu will be
>>> monitoring their old cgroup, hence we send an IPI to flip cgroups.
>>>
>> I have built a test case where this would trigger. I launched a multi-threaded
>> app, and then I move the pid into a cgroup via: echo PID >/cgroup/tests/tasks.
>> I don't see any perf_cgroup move beyond the PID passed.
>>
>> I looked at kernel/cgroup.c and I could not find a invocation of
>> ss->attach() that
>> would pass threadgroup = true. So I am confused here.
>>
>> I wonder how the cgroupfs 'echo PID >tasks' interface would make the distinction
>> between PID and TID. It seems possible to move one thread of a multi-threaded
>> process into a cgroup but not the others.
>>
>
> You will be able to do this:
>
>        # echo PID > cgroup.procs
>
> once the patchset that implements this feature is accepted. See:
>
>        https://lkml.org/lkml/2011/2/7/418
>
> The below commit that confused you is actually a part of the above patchset,
> but it sneaked into the kernel accidentally:
>
> commit be367d09927023d081f9199665c8500f69f14d22
> Author: Ben Blum <bblum@google.com>
> Date:   Wed Sep 23 15:56:31 2009 -0700
>
>    cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time
>
Ok, that makes more sense now. I will try with the above patchset applied to
verify this works as expected.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-10  2:04                     ` Li Zefan
@ 2011-02-11 12:13                       ` Peter Zijlstra
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-02-11 12:13 UTC (permalink / raw)
  To: Li Zefan
  Cc: Paul Menage, balbir, eranian, linux-kernel, mingo, paulus, davem,
	fweisbec, perfmon2-devel, eranian, robert.richter, acme

On Thu, 2011-02-10 at 10:04 +0800, Li Zefan wrote:
> Since both sched and perf won't use the 2 args @cgrp and @old_cgrp,
> don't bother to change the ->exit interface? 

Symmetry with ->attach: both sched and perf use a common method to
implement attach and exit, so if one needs it the other would too.
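
For reference, the ->attach and new ->exit prototypes line up like this (the
leading line of ->attach is not visible in the quoted hunks, so it is filled
in here from the surrounding context and may not match the tree exactly):

	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
			struct cgroup *old_cgrp, struct task_struct *tsk,
			bool threadgroup);
	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
			struct cgroup *old_cgrp, struct task_struct *task);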




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] perf_events: add cgroup support (v8)
  2011-02-11  9:56           ` Stephane Eranian
@ 2011-02-11 13:36             ` Stephane Eranian
  0 siblings, 0 replies; 28+ messages in thread
From: Stephane Eranian @ 2011-02-11 13:36 UTC (permalink / raw)
  To: Li Zefan
  Cc: Peter Zijlstra, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, Paul Menage

On Fri, Feb 11, 2011 at 10:56 AM, Stephane Eranian <eranian@google.com> wrote:
> Li,
>
> On Fri, Feb 11, 2011 at 1:55 AM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>>> while there is already cgroup monitoring active. In that case and if we do not
>>>>> want to wait until there is at least one ctxsw on all CPUs, then we have to
>>>>> check if the other threads are not already running on the other CPUs. If so,
>>>>> we need to do a cgroup switch on those CPUs. Otherwise, we have nothing to
>>>>> do. Am I getting this right?
>>>>
>>>> Right, so if any of those tasks is currently running, that cpu will be
>>>> monitoring their old cgroup, hence we send an IPI to flip cgroups.
>>>>
>>> I have built a test case where this would trigger. I launched a multi-threaded
>>> app, and then I moved the pid into a cgroup via: echo PID >/cgroup/tests/tasks.
>>> I don't see any perf_cgroup move beyond the PID passed.
>>>
>>> I looked at kernel/cgroup.c and I could not find an invocation of
>>> ss->attach() that
>>> would pass threadgroup = true. So I am confused here.
>>>
>>> I wonder how the cgroupfs 'echo PID >tasks' interface would make the distinction
>>> between PID and TID. It seems possible to move one thread of a multi-threaded
>>> process into a cgroup but not the others.
>>>
>>
>> You will be able to do this:
>>
>>        # echo PID > cgroup.procs
>>
>> once the patchset that implements this feature is accepted. See:
>>
>>        https://lkml.org/lkml/2011/2/7/418
>>
>> The below commit that confused you is actually a part of the above patchset,
>> but it sneaked into the kernel accidentally:
>>
>> commit be367d09927023d081f9199665c8500f69f14d22
>> Author: Ben Blum <bblum@google.com>
>> Date:   Wed Sep 23 15:56:31 2009 -0700
>>
>>    cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time
>>
> Ok, that makes more sense now. I will try with the above patchset applied to
> verify this works as expected.
>
The above patchset seems to work fine. I had to switch perf_event from
ss->attach() to ss->attach_task() to make it work with multi-threaded apps.
The good thing is that it simplifies the code: all we have to do now is
call perf_cgroup_move(), and the caller does the thread iteration.
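
A minimal sketch of that switch, assuming the attach_task() callback from the
cgroup.procs patchset takes a (cgroup, task) pair and that perf_cgroup_move()
is the per-task helper referred to above; this is only an illustration, not
the resubmitted patch:

static void perf_cgroup_attach_task(struct cgroup *cgrp,
				    struct task_struct *task)
{
	/*
	 * The cgroup core now iterates the thread group and invokes
	 * this once per thread, so the subsystem only has to move the
	 * events of the one task it was handed.
	 */
	perf_cgroup_move(task);
}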

During this exercise, I found a minor issue in event_sched_out() with the
current perf cgroup patch. I will resubmit the patch including Peter's changes,
some more cleanups, and the fix, along with the updated perf cgroup patch.

Thanks.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-07 16:10           ` [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback Peter Zijlstra
  2011-02-07 19:28             ` Paul Menage
@ 2011-02-13 12:52             ` Balbir Singh
  1 sibling, 0 replies; 28+ messages in thread
From: Balbir Singh @ 2011-02-13 12:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf, Paul Menage

* Peter Zijlstra <peterz@infradead.org> [2011-02-07 17:10:33]:

> On Thu, 2011-02-03 at 00:32 +0530, Balbir Singh wrote:
> > > No, just fixed. The callback as it exists isn't useful and leads to
> > > hacks like the above.
> 
> ---
> Subject: cgroup: Fix cgroup_subsys::exit callback
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Mon Feb 07 17:02:20 CET 2011
> 
> Make the ::exit method act like ::attach, it is after all very nearly
> the same thing.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> LKML-Reference: <new-submission>
> ---
> Index: linux-2.6/include/linux/cgroup.h
> ===================================================================
> --- linux-2.6.orig/include/linux/cgroup.h
> +++ linux-2.6/include/linux/cgroup.h
> @@ -474,7 +474,8 @@ struct cgroup_subsys {
>  			struct cgroup *old_cgrp, struct task_struct *tsk,
>  			bool threadgroup);
>  	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
> -	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
> +	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +			struct cgroup *old_cgrp, struct task_struct *task);

The effective changes I see are:

1. mutex_lock() being held
2. The old cgroup being passed as part of the notification

Is (1) required? I don't see anything about it in the changelog. For (2), I
don't see it being used; is the use in the scheduler cgroup path/patch?

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback
  2011-02-08 10:24                   ` Peter Zijlstra
  2011-02-10  2:04                     ` Li Zefan
@ 2011-02-14  4:32                     ` Paul Menage
  2011-02-16 13:46                     ` [tip:perf/core] " tip-bot for Peter Zijlstra
  2 siblings, 0 replies; 28+ messages in thread
From: Paul Menage @ 2011-02-14  4:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: balbir, eranian, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, lizf

On Tue, Feb 8, 2011 at 2:24 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> And here I thought Google was starting to understand what community
> participation meant.. is there anybody you know who can play a more
> active role in the whole cgroup thing?

Google has plenty of people actively working on various aspects of
cgroups (particularly memory and I/O scheduling) in mainline. There's
no one tasked with core cgroups framework maintenance, but there are
folks from Fujitsu and IBM (Li Zefan, Balbir Singh, etc) who have more
experience and state in the core framework than anyone else in Google
anyway.

>
> Like the below? Both the perf and sched exit callbacks are fine with
> being called under task_lock afaict, but I haven't actually run with

Sounds good to me.

Acked-by: Paul Menage <menage@google.com>

> lockdep enabled to see if I missed something.
>
> I also pondered doing the cgroup exit from __put_task_struct() but that
> had another problem iirc.

My guess is that by that time, so much of the task's context has been
destroyed that it comes too late in the tear-down for some/many
subsystems? Proving that guess either true or false (for all current
and future potential subsystems) would probably be tricky, though.

>
> ---
> Index: linux-2.6/include/linux/cgroup.h
> ===================================================================
> --- linux-2.6.orig/include/linux/cgroup.h
> +++ linux-2.6/include/linux/cgroup.h
> @@ -474,7 +474,8 @@ struct cgroup_subsys {
>                        struct cgroup *old_cgrp, struct task_struct *tsk,
>                        bool threadgroup);
>        void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
> -       void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
> +       void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +                       struct cgroup *old_cgrp, struct task_struct *task);
>        int (*populate)(struct cgroup_subsys *ss,
>                        struct cgroup *cgrp);
>        void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
> Index: linux-2.6/kernel/cgroup.c
> ===================================================================

Probably also worth including a note in the docs:

--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -602,6 +602,7 @@ void fork(struct cgroup_subsy *ss, struct task_struct *task)
 Called when a task is forked into a cgroup.

 void exit(struct cgroup_subsys *ss, struct task_struct *task)
+(task->alloc_lock held by caller)

 Called during task exit.


> --- linux-2.6.orig/kernel/cgroup.c
> +++ linux-2.6/kernel/cgroup.c
> @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct
>  */
>  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>  {
> -       int i;
>        struct css_set *cg;
> -
> -       if (run_callbacks && need_forkexit_callback) {
> -               /*
> -                * modular subsystems can't use callbacks, so no need to lock
> -                * the subsys array
> -                */
> -               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> -                       struct cgroup_subsys *ss = subsys[i];
> -                       if (ss->exit)
> -                               ss->exit(ss, tsk);
> -               }
> -       }
> +       int i;
>
>        /*
>         * Unlink from the css_set task list if necessary.
> @@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk
>        task_lock(tsk);
>        cg = tsk->cgroups;
>        tsk->cgroups = &init_css_set;
> +
> +       if (run_callbacks && need_forkexit_callback) {
> +               /*
> +                * modular subsystems can't use callbacks, so no need to lock
> +                * the subsys array
> +                */
> +               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> +                       struct cgroup_subsys *ss = subsys[i];
> +                       if (ss->exit) {
> +                               struct cgroup *old_cgrp =
> +                                       rcu_dereference_raw(cg->subsys[i])->cgroup;
> +                               struct cgroup *cgrp = task_cgroup(tsk, i);
> +                               ss->exit(ss, cgrp, old_cgrp, tsk);
> +                       }
> +               }
> +       }
>        task_unlock(tsk);
> +
>        if (cg)
>                put_css_set_taskexit(cg);
>  }
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -606,9 +606,6 @@ static inline struct task_group *task_gr
>        struct task_group *tg;
>        struct cgroup_subsys_state *css;
>
> -       if (p->flags & PF_EXITING)
> -               return &root_task_group;
> -
>        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
>                        lockdep_is_held(&task_rq(p)->lock));
>        tg = container_of(css, struct task_group, css);
> @@ -9081,7 +9078,8 @@ cpu_cgroup_attach(struct cgroup_subsys *
>  }
>
>  static void
> -cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
> +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +               struct cgroup *old_cgrp, struct task_struct *task)
>  {
>        /*
>         * cgroup_exit() is called in the copy_process() failure path.
>
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip:perf/core] cgroup: Fix cgroup_subsys::exit callback
  2011-02-08 10:24                   ` Peter Zijlstra
  2011-02-10  2:04                     ` Li Zefan
  2011-02-14  4:32                     ` Paul Menage
@ 2011-02-16 13:46                     ` tip-bot for Peter Zijlstra
  2 siblings, 0 replies; 28+ messages in thread
From: tip-bot for Peter Zijlstra @ 2011-02-16 13:46 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, a.p.zijlstra, tglx, mingo, menage

Commit-ID:  d41d5a01631af821d3a3447e6613a316f5ee6c25
Gitweb:     http://git.kernel.org/tip/d41d5a01631af821d3a3447e6613a316f5ee6c25
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Mon, 7 Feb 2011 17:02:20 +0100
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 16 Feb 2011 13:30:47 +0100

cgroup: Fix cgroup_subsys::exit callback

Make the ::exit method act like ::attach, it is after all very nearly
the same thing.

The bug had no effect on correctness - fixing it is an optimization for
the scheduler. Also, later perf-cgroups patches rely on it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Menage <menage@google.com>
LKML-Reference: <1297160655.13327.92.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cgroup.h |    3 ++-
 kernel/cgroup.c        |   31 ++++++++++++++++++-------------
 kernel/sched.c         |    6 ++----
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ce104e3..38117d9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -474,7 +474,8 @@ struct cgroup_subsys {
 			struct cgroup *old_cgrp, struct task_struct *tsk,
 			bool threadgroup);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
-	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
+	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			struct cgroup *old_cgrp, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
 			struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d702..f6495f3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-	int i;
 	struct css_set *cg;
-
-	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * modular subsystems can't use callbacks, so no need to lock
-		 * the subsys array
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss->exit)
-				ss->exit(ss, tsk);
-		}
-	}
+	int i;
 
 	/*
 	 * Unlink from the css_set task list if necessary.
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	task_lock(tsk);
 	cg = tsk->cgroups;
 	tsk->cgroups = &init_css_set;
+
+	if (run_callbacks && need_forkexit_callback) {
+		/*
+		 * modular subsystems can't use callbacks, so no need to lock
+		 * the subsys array
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit) {
+				struct cgroup *old_cgrp =
+					rcu_dereference_raw(cg->subsys[i])->cgroup;
+				struct cgroup *cgrp = task_cgroup(tsk, i);
+				ss->exit(ss, cgrp, old_cgrp, tsk);
+			}
+		}
+	}
 	task_unlock(tsk);
+
 	if (cg)
 		put_css_set_taskexit(cg);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index e142e92..79e611c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -8863,7 +8860,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.

^ permalink raw reply related	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2011-02-16 13:46 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-20 13:30 [PATCH 1/2] perf_events: add cgroup support (v8) Stephane Eranian
2011-01-20 14:39 ` Peter Zijlstra
2011-01-20 14:46   ` Stephane Eranian
2011-02-02 11:29   ` Peter Zijlstra
2011-02-02 11:50     ` Balbir Singh
2011-02-02 12:46       ` Peter Zijlstra
2011-02-02 19:02         ` Balbir Singh
2011-02-07 16:10           ` [RFC][PATCH] cgroup: Fix cgroup_subsys::exit callback Peter Zijlstra
2011-02-07 19:28             ` Paul Menage
2011-02-07 20:02               ` Peter Zijlstra
2011-02-07 21:21                 ` Paul Menage
2011-02-08 10:24                   ` Peter Zijlstra
2011-02-10  2:04                     ` Li Zefan
2011-02-11 12:13                       ` Peter Zijlstra
2011-02-14  4:32                     ` Paul Menage
2011-02-16 13:46                     ` [tip:perf/core] " tip-bot for Peter Zijlstra
2011-02-13 12:52             ` [RFC][PATCH] " Balbir Singh
2011-02-07 19:29         ` [PATCH 1/2] perf_events: add cgroup support (v8) Paul Menage
2011-02-07 20:09           ` Peter Zijlstra
2011-02-07 21:33             ` Paul Menage
2011-02-07 16:10 ` Peter Zijlstra
2011-02-07 20:30   ` Stephane Eranian
2011-02-08 22:31   ` Stephane Eranian
2011-02-09  9:47     ` Peter Zijlstra
2011-02-10 11:47       ` Stephane Eranian
2011-02-11  0:55         ` Li Zefan
2011-02-11  9:56           ` Stephane Eranian
2011-02-11 13:36             ` Stephane Eranian
