[PATCH v3 41/46] perf/x86/intel/cmt: add rotation minimum progress SLO - David Carrillo-Cisneros

From: David Carrillo-Cisneros <davidcc@google.com>
To: linux-kernel@vger.kernel.org
Cc: "x86@kernel.org" <x86@kernel.org>, Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andi Kleen <ak@linux.intel.com>, Kan Liang <kan.liang@intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Vegard Nossum <vegard.nossum@gmail.com>,
	Marcelo Tosatti <mtosatti@redhat.com>,
	Nilay Vaish <nilayvaish@gmail.com>, Borislav Petkov <bp@suse.de>,
	Vikas Shivappa <vikas.shivappa@linux.intel.com>,
	Ravi V Shankar <ravi.v.shankar@intel.com>,
	Fenghua Yu <fenghua.yu@intel.com>, Paul Turner <pjt@google.com>,
	Stephane Eranian <eranian@google.com>,
	David Carrillo-Cisneros <davidcc@google.com>
Subject: [PATCH v3 41/46] perf/x86/intel/cmt: add rotation minimum progress SLO
Date: Sat, 29 Oct 2016 17:38:38 -0700	[thread overview]
Message-ID: <1477787923-61185-42-git-send-email-davidcc@google.com> (raw)
In-Reply-To: <1477787923-61185-1-git-send-email-davidcc@google.com>

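Add a rotation service-level objective (SLO): try to move pmonrs to Active
state at a rate of at least __cmt_min_progress_rate pmonrs per second. When
the goal is not met, the dirty threshold used to recycle rmids is raised (up
to __cmt_max_threshold), trading occupancy error for progress.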

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
---
 arch/x86/events/intel/cmt.c | 274 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 273 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/cmt.c b/arch/x86/events/intel/cmt.c
index 8bf6aa5..ba82f95 100644
--- a/arch/x86/events/intel/cmt.c
+++ b/arch/x86/events/intel/cmt.c
@@ -79,6 +79,14 @@ static u64 __cmt_min_mon_slice;
 static unsigned int __cmt_max_threshold;	/* bytes */
 
 /*
+ * Rotation SLO of all monrs events (including those without llc_occupancy):
+ * @__cmt_min_progress_rate: Minimum number of pmonrs that must go to Active
+ * state per second; otherwise, the recycling occupancy error is increased.
+ */
+#define CMT_DEFAULT_MIN_PROGRESS_RATE 2		/* pmonrs per sec */
+static unsigned int __cmt_min_progress_rate = CMT_DEFAULT_MIN_PROGRESS_RATE;
+
+/*
  * If @pkgd == NULL, return first online, pkg_data in cmt_pkgs_data.
  * Otherwise next online pkg_data or NULL if no more.
  */
@@ -466,6 +474,21 @@ static void pmonr_dep_dirty_to_active(struct pmonr *pmonr)
 	__pmonr_dep_to_active_helper(pmonr, rmids.read_rmid);
 }
 
+/*
+ * Transition @pmonr from Dep_Dirty to Dep_Idle, releasing its read rmid.
+ *
+ * @rmids.read_rmid is marked in pkgd->free_rmids, so the caller must have
+ * established that the dirty rmid is clean enough to be reused. @pmonr is
+ * queued at the tail of pkgd->dep_idle_pmonrs and pmonr_set_rmids() is called
+ * with @rmids.sched_rmid and INVALID_RMID (presumably the new sched and read
+ * rmids -- confirm against pmonr_set_rmids()).
+ *
+ * The visible caller invokes this with pkgd->lock held.
+ */
+static void pmonr_dep_dirty_to_dep_idle_helper(struct pmonr *pmonr,
+					       union pmonr_rmids rmids)
+{
+	struct pkg_data *pkg = pmonr->pkgd;
+
+	/* Package accounting: one dirty rmid less, one more dep pmonr. */
+	pkg->nr_dirty_rmids--;
+	pkg->nr_dep_pmonrs++;
+
+	/* Hand the rmid back and queue the pmonr behind the other idle ones. */
+	__set_bit(rmids.read_rmid, pkg->free_rmids);
+	list_move_tail(&pmonr->rot_entry, &pkg->dep_idle_pmonrs);
+
+	pmonr->state = PMONR_DEP_IDLE;
+	pmonr_set_rmids(pmonr, rmids.sched_rmid, INVALID_RMID);
+}
+
 static void monr_dealloc(struct monr *monr)
 {
 	u16 p, nr_pkgs = topology_max_packages();
@@ -1311,6 +1334,242 @@ static void smp_call_rmid_read(void *data)
 	atomic_set(&ccsd->on_read, 0);
 }
 
+/*
+ * Try to reuse dirty rmids for the Dep_Dirty pmonrs at the front of
+ * pkgd->dep_pmonrs.
+ *
+ * Pmonrs are taken from the head of the list and activated with
+ * pmonr_dep_dirty_to_active() until the list is empty or its first entry is
+ * in PMONR_DEP_IDLE state (that pmonr has no rmid of its own to reuse).
+ *
+ * Must be called with pkgd->lock held.
+ *
+ * NOTE(review): loop termination relies on pmonr_dep_dirty_to_active()
+ * unlinking the pmonr from pkgd->dep_pmonrs -- confirm in the helper.
+ *
+ * Returns the number of pmonrs activated.
+ */
+static int __try_activate_dep_dirty_pmonrs(struct pkg_data *pkgd)
+{
+	int reused = 0;
+	struct pmonr *pmonr;
+	struct list_head *lhead = &pkgd->dep_pmonrs;
+
+	lockdep_assert_held(&pkgd->lock);
+
+	/* The loop condition already rejects NULL; only the state is tested. */
+	while ((pmonr = list_first_entry_or_null(
+				lhead, struct pmonr, pkgd_deps_entry))) {
+		if (pmonr->state == PMONR_DEP_IDLE)
+			break;
+		pmonr_dep_dirty_to_active(pmonr);
+		reused++;
+	}
+
+	return reused;
+}
+
+/*
+ * Locked variant of __try_activate_dep_dirty_pmonrs(): takes pkgd->lock with
+ * interrupts disabled around the call and returns its result unchanged.
+ */
+static int try_activate_dep_dirty_pmonrs(struct pkg_data *pkgd)
+{
+	unsigned long irq_flags;
+	int reused;
+
+	raw_spin_lock_irqsave(&pkgd->lock, irq_flags);
+	reused = __try_activate_dep_dirty_pmonrs(pkgd);
+	raw_spin_unlock_irqrestore(&pkgd->lock, irq_flags);
+
+	return reused;
+}
+
+/*
+ * Give @rmid to the first pmonr waiting in pkgd->dep_idle_pmonrs, if any, and
+ * then activate whatever Dep_Dirty pmonrs became eligible as a result.
+ *
+ * Must be called with pkgd->lock held.
+ *
+ * Returns 0 when no Dep_Idle pmonr is waiting (the rmid is left untouched),
+ * otherwise the total number of pmonrs activated (at least 1).
+ */
+static inline int __try_use_free_rmid(struct pkg_data *pkgd, u32 rmid)
+{
+	struct pmonr *idle;
+	int activated;
+
+	lockdep_assert_held(&pkgd->lock);
+
+	idle = list_first_entry_or_null(&pkgd->dep_idle_pmonrs,
+					struct pmonr, rot_entry);
+	if (!idle)
+		return 0;
+
+	/* The state transition will move the rmid to the active list. */
+	pmonr_dep_idle_to_active(idle, rmid);
+	activated = 1;
+
+	activated += __try_activate_dep_dirty_pmonrs(pkgd);
+
+	return activated;
+}
+
+/*
+ * Offer each rmid currently set in pkgd->free_rmids to a waiting pmonr,
+ * stopping as soon as __try_use_free_rmid() reports that nothing was
+ * activated.
+ *
+ * Must be called with pkgd->lock held (asserted by __try_use_free_rmid()).
+ *
+ * Returns the total number of pmonrs activated.
+ */
+static int __try_use_free_rmids(struct pkg_data *pkgd)
+{
+	int total = 0, rmid;
+
+	for_each_set_bit(rmid, pkgd->free_rmids, CMT_MAX_NR_RMIDS) {
+		/* Removes the rmid from free list if succeeds. */
+		int got = __try_use_free_rmid(pkgd, rmid);
+
+		if (got == 0)
+			break;
+		total += got;
+	}
+
+	return total;
+}
+
+/*
+ * Decide whether @rmid must still be considered dirty.
+ * @pkgd:	package data (not used by this function).
+ * @rmid:	rmid whose occupancy is checked.
+ * @do_read:	read the occupancy with cmt_rmid_read(). When false, no
+ *		occupancy value is available and @rmid is reported as dirty.
+ * @dirty_thld:	an occupancy strictly above this value is dirty.
+ * @min_dirty:	in/out, lowered to the smallest dirty occupancy observed.
+ *
+ * Returns true if @rmid is dirty, if it was not read, or if reading it
+ * failed; false if its occupancy is at or below @dirty_thld.
+ */
+static bool is_rmid_dirty(struct pkg_data *pkgd, u32 rmid, bool do_read,
+			  unsigned int dirty_thld, unsigned int *min_dirty)
+{
+	u64 val;
+
+	/*
+	 * Without a read there is no occupancy to compare against (val would
+	 * be used uninitialized), so conservatively keep the rmid dirty rather
+	 * than risk freeing it.
+	 * NOTE(review): if skipping the read is meant to reuse a previously
+	 * read occupancy, that value must be cached and passed in by callers.
+	 */
+	if (!do_read)
+		return true;
+	if (WARN_ON_ONCE(cmt_rmid_read(rmid, &val)))
+		return true;
+	if (val > dirty_thld) {
+		if (val < *min_dirty)
+			*min_dirty = val;
+		return true;
+	}
+
+	return false;
+}
+
+static int try_free_dep_dirty_pmonrs(struct pkg_data *pkgd,
+				     bool do_read,
+				     unsigned int dirty_thld,
+				     unsigned int *min_dirty)
+{
+	struct pmonr *pmonr, *tmp;
+	union pmonr_rmids rmids;
+	int nr_activated = 0;
+	unsigned long flags;
+
+	/*
+	 * Walk pkgd->dep_dirty_pmonrs and, for every pmonr whose read_rmid is
+	 * no longer dirty according to is_rmid_dirty(), move the pmonr to
+	 * Dep_Idle and offer the released rmid to __try_use_free_rmid().
+	 * @do_read, @dirty_thld and @min_dirty are passed through to
+	 * is_rmid_dirty(). Returns the sum of the values returned by
+	 * __try_use_free_rmid().
+	 *
+	 * No need to acquire pkg lock for pkgd->dep_dirty_pmonrs because
+	 * rotation logic is the only user of this list.
+	 *
+	 * pkgd->lock is taken only around each state transition, so the
+	 * is_rmid_dirty() check runs unlocked.
+	 *
+	 * NOTE(review): __try_use_free_rmid() may also activate Dep_Dirty
+	 * pmonrs; if that unlinks their rot_entry from this list, @tmp could
+	 * be stale on the next iteration -- confirm against
+	 * pmonr_dep_dirty_to_active().
+	 */
+	list_for_each_entry_safe(pmonr, tmp,
+				 &pkgd->dep_dirty_pmonrs, rot_entry) {
+		rmids.value = atomic64_read(&pmonr->atomic_rmids);
+		if (is_rmid_dirty(pkgd, rmids.read_rmid,
+					do_read, dirty_thld, min_dirty))
+			continue;	/* still dirty: leave the pmonr as is */
+
+		raw_spin_lock_irqsave(&pkgd->lock, flags);
+		pmonr_dep_dirty_to_dep_idle_helper(pmonr, rmids);
+		nr_activated += __try_use_free_rmid(pkgd, rmids.read_rmid);
+		raw_spin_unlock_irqrestore(&pkgd->lock, flags);
+	}
+
+	return nr_activated;
+}
+
+static int try_free_dirty_rmids(struct pkg_data *pkgd,
+				bool do_read,
+				unsigned int dirty_thld,
+				unsigned int *min_dirty,
+				unsigned long *rmids_bm)
+{
+	int nr_activated = 0, r;
+	unsigned long flags;
+
+	/*
+	 * Move rmids from pkgd->dirty_rmids to pkgd->free_rmids once
+	 * is_rmid_dirty() no longer reports them dirty, and offer each one to
+	 * __try_use_free_rmid(). @do_read, @dirty_thld and @min_dirty are
+	 * passed through to is_rmid_dirty(). @rmids_bm is caller-provided
+	 * scratch space of at least CMT_MAX_NR_RMIDS_BYTES bytes. Returns the
+	 * sum of the values returned by __try_use_free_rmid().
+	 *
+	 * To avoid holding pkgd->lock while reading rmids in hw (slow), hold
+	 * once and save all rmids that must be read. Then read them while
+	 * unlocked.
+	 *
+	 * NOTE(review): the snapshot can be stale by the time the lock is
+	 * retaken; this presumably relies on rotation being the only path
+	 * that clears bits in pkgd->dirty_rmids -- confirm.
+	 */
+	raw_spin_lock_irqsave(&pkgd->lock, flags);
+	memcpy(rmids_bm, pkgd->dirty_rmids, CMT_MAX_NR_RMIDS_BYTES);
+	raw_spin_unlock_irqrestore(&pkgd->lock, flags);
+
+	for_each_set_bit(r, rmids_bm, CMT_MAX_NR_RMIDS) {
+		if (is_rmid_dirty(pkgd, r, do_read, dirty_thld, min_dirty))
+			continue;	/* still dirty: keep it in dirty_rmids */
+
+		raw_spin_lock_irqsave(&pkgd->lock, flags);
+
+		pkgd->nr_dirty_rmids--;
+		__clear_bit(r, pkgd->dirty_rmids);
+		__set_bit(r, pkgd->free_rmids);
+		nr_activated += __try_use_free_rmid(pkgd, r);
+
+		raw_spin_unlock_irqrestore(&pkgd->lock, flags);
+	}
+
+	return nr_activated;
+}
+
+/**
+ * __intel_cmt_rmid_rotate - Rotate rmids among pmonrs and handle dirty rmids.
+ * @pkgd:		The package data to rotate rmids on.
+ * @active_goal:	Target min nr of pmonrs to put in Active state.
+ * @max_dirty_thld:	Upper bound for dirty_thld, in CMT cache units.
+ *
+ * The goals for each iteration of rotation logic are:
+ *   1) to activate @active_goal pmonrs.
+ *
+ * In order to activate Dep_{Dirty,Idle} pmonrs, rotation logic:
+ *   1) activate eligible Dep_Dirty pmonrs: These pmonrs can reuse their former
+ *   rmid, even if it is not clean, without increasing the error.
+ *   2) take clean rmids from Dep_Dirty pmonrs and reuse them for other pmonrs
+ *   or add them to pool of free rmids.
+ *   3) use free rmids to activate Dep_Idle pmonrs.
+ *
+ * Rotation logic also checks the occupancy of dirty rmids and, if now clean,
+ * uses them or adds them to free rmids.
+ * When a Dep_Idle pmonr is activated, any Dep_Dirty pmonr that is immediately
+ * after it in the pkg->dep_pmonrs list can be activated reusing its dirty
+ * rmid.
+ *
+ * The dirty threshold starts at 0. While fewer than @active_goal pmonrs have
+ * been activated and dep pmonrs remain, it is raised to the smallest occupancy
+ * observed above the previous threshold, capped at @max_dirty_thld, and the
+ * pass is repeated.
+ *
+ * Must be called with pkgd->mutex held. May sleep (GFP_KERNEL allocation).
+ *
+ * Return: 0 on success, -ENOMEM if the scratch rmid bitmap cannot be
+ * allocated.
+ */
+static int __intel_cmt_rmid_rotate(struct pkg_data *pkgd,
+		unsigned int active_goal, unsigned int max_dirty_thld)
+{
+	unsigned int dirty_thld = 0, min_dirty, nr_activated;
+	unsigned int nr_dep_pmonrs;
+	unsigned long flags, *rmids_bm;
+	bool do_active_goal, dirty_is_max;
+	/*
+	 * Occupancy values are not cached between passes, so dirty rmids are
+	 * read again on every pass.
+	 * NOTE(review): the original code carried a "do not read occupancy for
+	 * dirty rmids twice" comment but never disabled the read; skipping it
+	 * would need cached occupancy values.
+	 */
+	const bool read_dirty = true;
+
+	lockdep_assert_held(&pkgd->mutex);
+
+	rmids_bm = kzalloc(CMT_MAX_NR_RMIDS_BYTES, GFP_KERNEL);
+	if (!rmids_bm)
+		return -ENOMEM;
+
+	nr_activated = try_activate_dep_dirty_pmonrs(pkgd);
+
+again:
+	/* Smallest occupancy above dirty_thld seen during this pass. */
+	min_dirty = UINT_MAX;
+
+	/* retry every iteration since dirty_thld may have changed. */
+	nr_activated += try_free_dirty_rmids(pkgd, read_dirty,
+					     dirty_thld, &min_dirty, rmids_bm);
+
+	raw_spin_lock_irqsave(&pkgd->lock, flags);
+	nr_activated += __try_use_free_rmids(pkgd);
+	raw_spin_unlock_irqrestore(&pkgd->lock, flags);
+
+	nr_activated += try_free_dep_dirty_pmonrs(pkgd, read_dirty,
+						  dirty_thld, &min_dirty);
+
+	raw_spin_lock_irqsave(&pkgd->lock, flags);
+	nr_activated += __try_use_free_rmids(pkgd);
+	nr_dep_pmonrs = pkgd->nr_dep_pmonrs;
+	raw_spin_unlock_irqrestore(&pkgd->lock, flags);
+
+	/*
+	 * If there is no room to increase dirty_thld, then no more dirty rmids
+	 * could be reused and must give up active goal.
+	 */
+	dirty_is_max = dirty_thld >= max_dirty_thld;
+	do_active_goal = nr_activated < active_goal && !dirty_is_max;
+
+	/*
+	 * Since Dep_Dirty pmonrs have their own dirty rmid, only Dep_Idle
+	 * pmonrs are waiting for a rmid to be available. Stop if no pmonr
+	 * wait for rmid or no goals to pursue.
+	 */
+	if (!nr_dep_pmonrs || !do_active_goal)
+		goto exit;
+
+	/*
+	 * The active goal is still pending: try to activate more pmonrs by
+	 * increasing the dirty threshold. Using the minimum observed occupancy
+	 * in dirty rmids aims to recover at least one rmid per iteration.
+	 * dirty_thld strictly grows (min_dirty > dirty_thld) or hits
+	 * max_dirty_thld, so the loop terminates.
+	 */
+	dirty_thld = min(min_dirty, max_dirty_thld);
+	goto again;
+
+exit:
+	kfree(rmids_bm);
+
+	return 0;
+}
+
 static struct pmu intel_cmt_pmu;
 
 /* Schedule rotation in one package. */
@@ -1360,10 +1619,20 @@ static bool intel_cmt_need_rmid_rotation(struct pkg_data *pkgd)
 
 /*
  * Rotation function, runs per-package.
+ * If rmids are needed in a package it will steal rmids from pmonrs that have
+ * been active longer than __cmt_pre_mon_slice + __cmt_min_mon_slice.
+ * The hardware doesn't provide a way to free occupancy for a rmid that will
+ * be reused. Therefore, before reusing a rmid, it should stay unscheduled for
+ * a while, hoping that the cache lines counted towards this rmid will
+ * eventually be replaced and the rmid occupancy will decrease below
+ * __cmt_max_threshold.
  */
 static void intel_cmt_rmid_rotation_work(struct work_struct *work)
 {
 	struct pkg_data *pkgd;
+	/* not precise elapsed time, but good enough for rotation purposes. */
+	unsigned int elapsed_ms = intel_cmt_pmu.hrtimer_interval_ms;
+	unsigned int active_goal, max_dirty_threshold;
 
 	pkgd = container_of(to_delayed_work(work),
 			    struct pkg_data, rotation_work);
@@ -1377,7 +1646,10 @@ static void intel_cmt_rmid_rotation_work(struct work_struct *work)
 	if (!intel_cmt_need_rmid_rotation(pkgd))
 		goto exit;
 
-	/* To add call to rotation function in next patch */
+	active_goal = max(1u, (elapsed_ms * __cmt_min_progress_rate) / 1000);
+	max_dirty_threshold = READ_ONCE(__cmt_max_threshold) / cmt_l3_scale;
+
+	__intel_cmt_rmid_rotate(pkgd, active_goal, max_dirty_threshold);
 
 	if (intel_cmt_need_rmid_rotation(pkgd))
 		__intel_cmt_schedule_rotation_for_pkg(pkgd);
-- 
2.8.0.rc3.226.g39d4020