linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Vikas Shivappa <vikas.shivappa@linux.intel.com>
To: vikas.shivappa@intel.com, vikas.shivappa@linux.intel.com
Cc: linux-kernel@vger.kernel.org, x86@kernel.org, tglx@linutronix.de,
	peterz@infradead.org, ravi.v.shankar@intel.com,
	tony.luck@intel.com, fenghua.yu@intel.com, andi.kleen@intel.com,
	davidcc@google.com, eranian@google.com, hpa@zytor.com
Subject: [PATCH 09/14] x86/cqm: Add Continuous cgroup monitoring
Date: Fri, 16 Dec 2016 15:13:03 -0800	[thread overview]
Message-ID: <1481929988-31569-10-git-send-email-vikas.shivappa@linux.intel.com> (raw)
In-Reply-To: <1481929988-31569-1-git-send-email-vikas.shivappa@linux.intel.com>

This patch adds support for continuous cgroup monitoring, which makes it
possible to start monitoring a cgroup by toggling the cont_mon field in
the cgroup, without any perf overhead.
The cgroup is monitored from the time this field is set, and the user
can fetch the data from perf whenever the data is needed.
This avoids perf overhead for the entire time that the cgroup is being
monitored: if one has to monitor a cgroup for its lifetime, perf does
not need to run the whole time.

A new file, cont_mon, is introduced in the cgroup. Once this is enabled,
a new RMID is assigned to the cgroup. If an event is later created to
monitor this cgroup, the event just reuses the same RMID. At switch_to
time, we add a check to see if there is cont_monitoring. During read,
data is fetched by reading the counters in the same way as it is done
for other cgroups.

Tests: One should be able to monitor a cgroup continuously, without
running perf, by toggling the new cont_mon file in the cgroup.

Patch is based on David Carrillo-Cisneros <davidcc@google.com> patches
in cqm2 series.

Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
 arch/x86/events/intel/cqm.c | 119 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 8017886..73f566a 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -521,6 +521,7 @@ static int cqm_assign_rmid(struct perf_event *event, u32 *rmid)
 static int intel_cqm_setup_event(struct perf_event *event,
 				  struct perf_event **group)
 {
+	struct cgrp_cqm_info *cqm_info;
 	struct perf_event *iter;
 	u32 *rmid, sizet;
 
@@ -537,6 +538,18 @@ static int intel_cqm_setup_event(struct perf_event *event,
 			return 0;
 		}
 	}
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * For continously monitored cgroups, *rmid is allocated already.
+	 */
+	if (event->cgrp) {
+		cqm_info = cgrp_to_cqm_info(event->cgrp);
+		if (cqm_info->cont_mon) {
+			event->hw.cqm_rmid = cqm_info->rmid;
+			return 0;
+		}
+	}
+#endif
 
 	/*
 	 * RMIDs are allocated in LAZY mode by default only when
@@ -547,6 +560,8 @@ static int intel_cqm_setup_event(struct perf_event *event,
 	if (!event->hw.cqm_rmid)
 		return -ENOMEM;
 
+	cqm_assign_rmid(event, event->hw.cqm_rmid);
+
 	return 0;
 }
 
@@ -843,18 +858,23 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
 	return 0;
 }
 
+static inline bool is_cont_mon_event(struct perf_event *event);
+
 static inline void
 	cqm_event_free_rmid(struct perf_event *event)
 {
 	u32 *rmid = event->hw.cqm_rmid;
 	int d;
 
-	for (d = 0; d < cqm_socket_max; d++) {
-		if (__rmid_valid(rmid[d]))
-			__put_rmid(rmid[d], d);
+	if (!is_cont_mon_event(event)) {
+
+		for (d = 0; d < cqm_socket_max; d++) {
+			if (__rmid_valid(rmid[d]))
+				__put_rmid(rmid[d], d);
+		}
+		cqm_assign_rmid(event, NULL);
+		kfree(event->hw.cqm_rmid);
 	}
-	kfree(event->hw.cqm_rmid);
-	cqm_assign_rmid(event, NULL);
 	list_del(&event->hw.cqm_groups_entry);
 }
 
@@ -1122,6 +1142,11 @@ static int intel_cqm_event_init(struct perf_event *event)
 };
 
 #ifdef CONFIG_CGROUP_PERF
+static inline bool is_cont_mon_event(struct perf_event *event)
+{
+	return (is_cgroup_event(event) && cgrp_to_cqm_info(event->cgrp)->cont_mon);
+}
+
 int perf_cgroup_arch_css_alloc(struct cgroup_subsys_state *parent_css,
 				      struct cgroup_subsys_state *new_css)
 {
@@ -1230,6 +1255,90 @@ int perf_cgroup_arch_can_attach(struct cgroup_taskset *tset)
 
 	return 0;
 }
+
+/* kernfs guarantees that css doesn't need to be pinned. */
+static u64 cqm_cont_monitoring_read_u64(struct cgroup_subsys_state *css,
+					struct cftype *cft)
+{
+	int ret = -1;
+
+	mutex_lock(&cache_mutex);
+	ret = css_to_cqm_info(css)->cont_mon;
+	mutex_unlock(&cache_mutex);
+
+	return ret;
+}
+
+/* kernfs guarantees that css doesn't need to be pinned. */
+static int cqm_cont_monitoring_write_u64(struct cgroup_subsys_state *css,
+					 struct cftype *cft, u64 value)
+{
+	struct cgrp_cqm_info *cqm_info;
+	unsigned long flags;
+	int ret = 0, d;
+
+	if (value > 1)
+		return -1;
+
+	mutex_lock(&cache_mutex);
+
+	/* Root cgroup cannot stop being monitored. */
+	if (!css->parent)
+		goto out;
+
+	cqm_info = css_to_cqm_info(css);
+
+	/*
+	 * Alloc and free rmid when cont monitoring is being set
+	 * and reset.
+	 */
+	if (!cqm_info->cont_mon && value && !cqm_info->rmid) {
+		cqm_info->rmid =
+			kzalloc(sizeof(u32) * cqm_socket_max, GFP_KERNEL);
+		if (!cqm_info->rmid) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		cqm_assign_hier_rmid(css, cqm_info->rmid);
+	}
+
+	if (cqm_info->cont_mon && !value) {
+		u32 *rmid = cqm_info->rmid;
+
+		raw_spin_lock_irqsave(&cache_lock, flags);
+		for (d = 0; d < cqm_socket_max; d++) {
+			if (__rmid_valid(rmid[d]))
+				__put_rmid(rmid[d], d);
+		}
+		raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+		kfree(cqm_info->rmid);
+		cqm_assign_hier_rmid(css, NULL);
+	}
+
+	cqm_info->cont_mon = value;
+out:
+	mutex_unlock(&cache_mutex);
+
+	return ret;
+}
+
+struct cftype perf_event_cgrp_arch_subsys_cftypes[] = {
+	{
+		.name = "cqm_cont_monitoring",
+		.read_u64 = cqm_cont_monitoring_read_u64,
+		.write_u64 = cqm_cont_monitoring_write_u64,
+	},
+
+	{}	/* terminate */
+};
+#else
+
+static inline bool is_cont_mon_event(struct perf_event *event)
+{
+	return false;
+}
 #endif
 
 static inline void cqm_pick_event_reader(int cpu)
-- 
1.9.1

  parent reply	other threads:[~2016-12-16 23:15 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-12-16 23:12 [PATCH V4 00/14] Cqm2: Intel Cache Monitoring fixes and enhancements Vikas Shivappa
2016-12-16 23:12 ` [PATCH 01/14] x86/cqm: Intel Resource Monitoring Documentation Vikas Shivappa
2016-12-23 12:32   ` Peter Zijlstra
2016-12-23 19:35     ` Shivappa Vikas
2016-12-23 20:33       ` Peter Zijlstra
2016-12-23 21:41         ` Shivappa Vikas
2016-12-25  1:51         ` Shivappa Vikas
2016-12-27  7:13           ` David Carrillo-Cisneros
2016-12-27 20:00           ` Andi Kleen
2016-12-27 20:21             ` Shivappa Vikas
2016-12-27 21:38               ` David Carrillo-Cisneros
2016-12-27 21:33             ` David Carrillo-Cisneros
2016-12-27 23:10               ` Andi Kleen
2016-12-28  1:23                 ` David Carrillo-Cisneros
2016-12-28 20:03                   ` Shivappa Vikas
2016-12-16 23:12 ` [PATCH 02/14] x86/cqm: Remove cqm recycling/conflict handling Vikas Shivappa
2016-12-16 23:12 ` [PATCH 03/14] x86/rdt: Add rdt common/cqm compile option Vikas Shivappa
2016-12-16 23:12 ` [PATCH 04/14] x86/cqm: Add Per pkg rmid support Vikas Shivappa
2016-12-16 23:12 ` [PATCH 05/14] x86/cqm,perf/core: Cgroup support prepare Vikas Shivappa
2016-12-16 23:13 ` [PATCH 06/14] x86/cqm: Add cgroup hierarchical monitoring support Vikas Shivappa
2016-12-16 23:13 ` [PATCH 07/14] x86/rdt,cqm: Scheduling support update Vikas Shivappa
2016-12-16 23:13 ` [PATCH 08/14] x86/cqm: Add support for monitoring task and cgroup together Vikas Shivappa
2016-12-16 23:13 ` Vikas Shivappa [this message]
2016-12-16 23:13 ` [PATCH 10/14] x86/cqm: Add RMID reuse Vikas Shivappa
2016-12-16 23:13 ` [PATCH 11/14] x86/cqm: Add failure on open and read Vikas Shivappa
2016-12-23 11:58   ` David Carrillo-Cisneros
2016-12-16 23:13 ` [PATCH 12/14] perf/core,x86/cqm: Add read for Cgroup events,per pkg reads Vikas Shivappa
2016-12-16 23:13 ` [PATCH 13/14] perf/stat: fix bug in handling events in error state Vikas Shivappa
2016-12-16 23:13 ` [PATCH 14/14] perf/stat: revamp read error handling, snapshot and per_pkg events Vikas Shivappa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1481929988-31569-10-git-send-email-vikas.shivappa@linux.intel.com \
    --to=vikas.shivappa@linux.intel.com \
    --cc=andi.kleen@intel.com \
    --cc=davidcc@google.com \
    --cc=eranian@google.com \
    --cc=fenghua.yu@intel.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=ravi.v.shankar@intel.com \
    --cc=tglx@linutronix.de \
    --cc=tony.luck@intel.com \
    --cc=vikas.shivappa@intel.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).