From: David Carrillo-Cisneros <davidcc@google.com>
To: linux-kernel@vger.kernel.org
Cc: "x86@kernel.org" <x86@kernel.org>, Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andi Kleen <ak@linux.intel.com>, Kan Liang <kan.liang@intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Vegard Nossum <vegard.nossum@gmail.com>,
	Marcelo Tosatti <mtosatti@redhat.com>,
	Nilay Vaish <nilayvaish@gmail.com>, Borislav Petkov <bp@suse.de>,
	Vikas Shivappa <vikas.shivappa@linux.intel.com>,
	Ravi V Shankar <ravi.v.shankar@intel.com>,
	Fenghua Yu <fenghua.yu@intel.com>, Paul Turner <pjt@google.com>,
	Stephane Eranian <eranian@google.com>,
	David Carrillo-Cisneros <davidcc@google.com>
Subject: [PATCH v3 42/46] perf/x86/intel/cmt: add rmid stealing
Date: Sat, 29 Oct 2016 17:38:39 -0700
Message-ID: <1477787923-61185-43-git-send-email-davidcc@google.com>
In-Reply-To: <1477787923-61185-1-git-send-email-davidcc@google.com>

Add rmid rotation code to steal rmids whenever not enough pmonrs
are being reactivated.

More details in the code's comments.

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
---
 arch/x86/events/intel/cmt.c | 149 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 144 insertions(+), 5 deletions(-)
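
Note for reviewers (not part of the patch): below is a minimal userspace
sketch of the three steal conditions documented above can_steal_rmid() in
the diff that follows. The struct, the min_hold parameter (standing in for
__cmt_pre_mon_slice + __cmt_min_mon_slice) and the precomputed
lower_pkg_in_first_cycle flag (which summarizes the per-package RCU walk)
are illustrative assumptions, not kernel state or APIs.

  #include <stdbool.h>
  #include <stdint.h>

  struct monr_model {
          uint64_t last_rmid_recoup;       /* 0: no rmid was ever stolen */
          unsigned int nr_dep_pmonrs;      /* pmonrs waiting for a rmid */
          bool lower_pkg_in_first_cycle;   /* condition 3, precomputed */
  };

  static bool can_steal_rmid_model(const struct monr_model *m,
                                   uint64_t now, uint64_t min_hold)
  {
          if (!m->nr_dep_pmonrs) {
                  /* Condition 1: nothing was ever stolen from this monr. */
                  if (!m->last_rmid_recoup)
                          return true;
                  /* Condition 2: recouped and held for at least min_hold. */
                  return now >= m->last_rmid_recoup + min_hold;
          }
          /* Condition 3: a lower-pkgid pmonr has an unfinished rmid cycle. */
          return m->lower_pkg_in_first_cycle;
  }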

diff --git a/arch/x86/events/intel/cmt.c b/arch/x86/events/intel/cmt.c
index ba82f95..e677511 100644
--- a/arch/x86/events/intel/cmt.c
+++ b/arch/x86/events/intel/cmt.c
@@ -1368,6 +1368,106 @@ static int try_activate_dep_dirty_pmonrs(struct pkg_data *pkgd)
 	return nr_reused;
 }
 
+/**
+ * can_steal_rmid() - Tell whether this pmonr's rmid can be stolen.
+ *
+ * The "rmid cycle" for a pmonr starts when an Active pmonr gets its rmid
+ * stolen and completes when it receives a rmid again.
+ * A monr "rmid recoup" occurs when all of its non Off/Unused pmonrs
+ * obtain a rmid (i.e. when all pmonrs that need a rmid have one).
+ *
+ * A pmonr's rmid can be stolen if any of the following holds:
+ *   1) No other pmonr in pmonr's monr has been stolen before.
+ *   2) Some pmonrs have had rmids stolen, but rmids for all pmonrs have been
+ *   recovered (rmid recoup) and kept for at least
+ *     __cmt_pre_mon_slice + __cmt_min_mon_slice time.
+ *   3) At least one of the pmonrs with pkgid smaller than @pmonr's has not
+ *   completed its first "rmid cycle". Once this condition is false, the pmonr
+ *   will have completed its last "rmid cycle" and stealing will no longer be
+ *   allowed.
+ *   This guarantees that the last "rmid cycle" of a pmonr occurs in
+ *   pkgid order, preventing rmid deadlocks. It also guarantees that all
+ *   pmonrs eventually have a last "rmid cycle", recovering all
+ *   required rmids.
+ */
+static bool can_steal_rmid(struct pmonr *pmonr)
+{
+	union pmonr_rmids rmids;
+	struct monr *monr = pmonr->monr;
+	struct pkg_data *pkgd = NULL;
+	struct pmonr *pos_pmonr;
+	bool need_rmid_state;
+	u64 last_all_active, next_steal_time, last_pmonr_active;
+
+	last_all_active = atomic64_read(&monr->last_rmid_recoup);
+	/*
+	 * Can steal if no pmonr has been stolen, or if all non-Unused pmonrs
+	 * have been in Active state for long enough.
+	 */
+	if (!atomic_read(&monr->nr_dep_pmonrs)) {
+		/* Check steal condition 1. */
+		if (!last_all_active)
+			return true;
+		next_steal_time = last_all_active +
+				__cmt_pre_mon_slice + __cmt_min_mon_slice;
+		/* Check steal condition 2. */
+		if (time_after64(get_jiffies_64(), next_steal_time))
+			return true;
+
+		return false;
+	}
+
+	rcu_read_lock();
+
+	/* Check for steal condition 3 without locking. */
+	while ((pkgd = cmt_pkgs_data_next_rcu(pkgd))) {
+		/* To avoid deadlocks, wait for pmonrs in pkgid order. */
+		if (pkgd->pkgid >= pmonr->pkgd->pkgid)
+			break;
+		pos_pmonr = pkgd_pmonr(pkgd, monr);
+		rmids.value = atomic64_read(&pos_pmonr->atomic_rmids);
+		last_pmonr_active = atomic64_read(
+				&pos_pmonr->last_enter_active);
+
+		/* pmonrs in Dep_{Idle,Dirty} states are waiting for a rmid. */
+		need_rmid_state = rmids.sched_rmid != INVALID_RMID &&
+				  rmids.sched_rmid != rmids.read_rmid;
+
+		/* Steal condition 3: pos_pmonr's rmid cycle is incomplete. */
+		if (need_rmid_state && last_all_active <= last_pmonr_active) {
+			rcu_read_unlock();
+
+			return true;
+		}
+	}
+	rcu_read_unlock();
+
+	return false;
+}
+
+/* Steal as many rmids as possible, up to @max_to_steal. */
+static int try_steal_active_pmonrs(struct pkg_data *pkgd,
+				   unsigned int max_to_steal)
+{
+	struct pmonr *pmonr, *tmp;
+	unsigned long flags;
+	int nr_stolen = 0;
+
+	raw_spin_lock_irqsave(&pkgd->lock, flags);
+
+	list_for_each_entry_safe(pmonr, tmp, &pkgd->active_pmonrs, rot_entry) {
+		if (!can_steal_rmid(pmonr))
+			continue;
+		pmonr_active_to_dep_dirty(pmonr);
+		nr_stolen++;
+		if (nr_stolen == max_to_steal)
+			break;
+	}
+	raw_spin_unlock_irqrestore(&pkgd->lock, flags);
+
+	return nr_stolen;
+}
+
 static inline int __try_use_free_rmid(struct pkg_data *pkgd, u32 rmid)
 {
 	struct pmonr *pmonr;
@@ -1485,9 +1585,17 @@ static int try_free_dirty_rmids(struct pkg_data *pkgd,
  * @pkgd:		The package data to rotate rmids on.
  * @active_goal:	Target min nr of pmonrs to put in Active state.
  * @max_dirty_thld:	Upper bound for dirty_thld, in CMT cache units.
+ * @max_dirty_goal:	Max nr of rmids to leave dirty, waiting to drop
+ *			occupancy.
+ * @dirty_cushion:	Nr of rmids to try to keep dirty on top of the
+ *			nr of pmonrs that need a rmid (Dep_Idle), in case
+ *			some dirty rmids do not drop occupancy fast enough.
  *
  * The goals for each iteration of rotation logic are:
  *   1) to activate @active_goal pmonrs.
+ *   2) if any pmonr is waiting for a rmid (Dep_Idle), to steal enough rmids
+ *   to meet the dirty_goal. The dirty_goal is an estimate of the number of
+ *   dirty rmids required so that the next call reaches its @active_goal.
  *
  * In order to activate Dep_{Dirty,Idle} pmonrs, rotation logic:
  *   1) activate eligible Dep_Dirty pmonrs: These pmonrs can reuse their former
@@ -1503,12 +1611,14 @@ static int try_free_dirty_rmids(struct pkg_data *pkgd,
  * rmid.
  */
 static int __intel_cmt_rmid_rotate(struct pkg_data *pkgd,
-		unsigned int active_goal, unsigned int max_dirty_thld)
+		unsigned int active_goal, unsigned int max_dirty_thld,
+		unsigned int max_dirty_goal, unsigned int dirty_cushion)
 {
 	unsigned int dirty_thld = 0, min_dirty, nr_activated;
-	unsigned int nr_dep_pmonrs;
+	unsigned int nr_to_steal, nr_stolen;
+	unsigned int nr_dirty, dirty_goal, nr_dep_pmonrs;
 	unsigned long flags, *rmids_bm = NULL;
-	bool do_active_goal, read_dirty = true, dirty_is_max;
+	bool do_active_goal, do_dirty_goal, read_dirty = true, dirty_is_max;
 
 	lockdep_assert_held(&pkgd->mutex);
 
@@ -1534,6 +1644,7 @@ static int __intel_cmt_rmid_rotate(struct pkg_data *pkgd,
 
 	raw_spin_lock_irqsave(&pkgd->lock, flags);
 	nr_activated += __try_use_free_rmids(pkgd);
+	nr_dirty = pkgd->nr_dirty_rmids;
 	nr_dep_pmonrs = pkgd->nr_dep_pmonrs;
 	raw_spin_unlock_irqrestore(&pkgd->lock, flags);
 
@@ -1544,14 +1655,27 @@ static int __intel_cmt_rmid_rotate(struct pkg_data *pkgd,
 	dirty_is_max = dirty_thld >= max_dirty_thld;
 	do_active_goal = nr_activated < active_goal && !dirty_is_max;
 
+	dirty_goal = min(max_dirty_goal, nr_dep_pmonrs + dirty_cushion);
+	do_dirty_goal = nr_dirty < dirty_goal;
+
 	/*
 	 * Since Dep_Dirty pmonrs have their own dirty rmid, only Dep_Idle
 	 * pmonrs are waiting for a rmid to be available. Stop if no pmonr
 	 * wait for rmid or no goals to pursue.
 	 */
-	if (!nr_dep_pmonrs || !do_active_goal)
+	if (!nr_dep_pmonrs || (!do_dirty_goal && !do_active_goal))
 		goto exit;
 
+	if (do_dirty_goal) {
+		nr_to_steal = dirty_goal - nr_dirty;
+		nr_stolen = try_steal_active_pmonrs(pkgd, nr_to_steal);
+		/*
+		 * We already tried to steal from all Active pmonrs; no point
+		 * in reattempting.
+		 */
+		max_dirty_goal = 0;
+	}
+
 	/*
 	 * Try to activate more pmonrs by increasing the dirty threshold.
 	 * Using the minimum observed occupancy in dirty rmids guarantees to
@@ -1633,6 +1757,7 @@ static void intel_cmt_rmid_rotation_work(struct work_struct *work)
 	/* not precise elapsed time, but good enough for rotation purposes. */
 	unsigned int elapsed_ms = intel_cmt_pmu.hrtimer_interval_ms;
 	unsigned int active_goal, max_dirty_threshold;
+	unsigned int dirty_cushion, max_dirty_goal;
 
 	pkgd = container_of(to_delayed_work(work),
 			    struct pkg_data, rotation_work);
@@ -1649,7 +1774,21 @@ static void intel_cmt_rmid_rotation_work(struct work_struct *work)
 	active_goal = max(1u, (elapsed_ms * __cmt_min_progress_rate) / 1000);
 	max_dirty_threshold = READ_ONCE(__cmt_max_threshold) / cmt_l3_scale;
 
-	__intel_cmt_rmid_rotate(pkgd, active_goal, max_dirty_threshold);
+	/*
+	 * Upper bound for the nr of rmids to be dirty in order to have a good
+	 * chance of finding enough rmids in the next rotation iteration.
+	 */
+	max_dirty_goal = min(active_goal + 1, (pkgd->max_rmid + 1) / 4);
+
+	/*
+	 * Nr of extra rmids to keep dirty in case some don't drop occupancy.
+	 * To be calculated in a sensible manner once statistics about rmid
+	 * recycling rate are in place.
+	 */
+	dirty_cushion = 2;
+
+	__intel_cmt_rmid_rotate(pkgd, active_goal, max_dirty_threshold,
+				max_dirty_goal, dirty_cushion);
 
 	if (intel_cmt_need_rmid_rotation(pkgd))
 		__intel_cmt_schedule_rotation_for_pkg(pkgd);
-- 
2.8.0.rc3.226.g39d4020

Thread overview: 59+ messages
2016-10-30  0:37 [PATCH v3 00/46] Cache Monitoring Technology (aka CQM) David Carrillo-Cisneros
2016-10-30  0:37 ` [PATCH v3 01/46] perf/x86/intel/cqm: remove previous version of CQM and MBM David Carrillo-Cisneros
2016-10-30  0:37 ` [PATCH v3 02/46] perf/x86/intel: rename CQM cpufeatures to CMT David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 03/46] x86/intel: add CONFIG_INTEL_RDT_M configuration flag David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 04/46] perf/x86/intel/cmt: add device initialization and CPU hotplug support David Carrillo-Cisneros
2016-11-10 15:19   ` Thomas Gleixner
2016-10-30  0:38 ` [PATCH v3 05/46] perf/x86/intel/cmt: add per-package locks David Carrillo-Cisneros
2016-11-10 21:23   ` Thomas Gleixner
2016-11-11  2:22     ` David Carrillo-Cisneros
2016-11-11  7:21       ` Peter Zijlstra
2016-11-11  7:32         ` Ingo Molnar
2016-11-11  9:41         ` Thomas Gleixner
2016-11-11 17:21           ` David Carrillo-Cisneros
2016-11-13 10:58             ` Thomas Gleixner
2016-11-15  4:53         ` David Carrillo-Cisneros
2016-11-16 19:00           ` Thomas Gleixner
2016-10-30  0:38 ` [PATCH v3 06/46] perf/x86/intel/cmt: add intel_cmt pmu David Carrillo-Cisneros
2016-11-10 21:27   ` Thomas Gleixner
2016-10-30  0:38 ` [PATCH v3 07/46] perf/core: add RDT Monitoring attributes to struct hw_perf_event David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 08/46] perf/x86/intel/cmt: add MONitored Resource (monr) initialization David Carrillo-Cisneros
2016-11-10 23:09   ` Thomas Gleixner
2016-10-30  0:38 ` [PATCH v3 09/46] perf/x86/intel/cmt: add basic monr hierarchy David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 10/46] perf/x86/intel/cmt: add Package MONitored Resource (pmonr) initialization David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 11/46] perf/x86/intel/cmt: add cmt_user_flags (uflags) to monr David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 12/46] perf/x86/intel/cmt: add per-package rmid pools David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 13/46] perf/x86/intel/cmt: add pmonr's Off and Unused states David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 14/46] perf/x86/intel/cmt: add Active and Dep_{Idle, Dirty} states David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 15/46] perf/x86/intel: encapsulate rmid and closid updates in pqr cache David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 16/46] perf/x86/intel/cmt: set sched rmid and complete pmu start/stop/add/del David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 17/46] perf/x86/intel/cmt: add uflag CMT_UF_NOLAZY_RMID David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 18/46] perf/core: add arch_info field to struct perf_cgroup David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 19/46] perf/x86/intel/cmt: add support for cgroup events David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 20/46] perf/core: add pmu::event_terminate David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 21/46] perf/x86/intel/cmt: use newly introduced event_terminate David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 22/46] perf/x86/intel/cmt: sync cgroups and intel_cmt device start/stop David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 23/46] perf/core: hooks to add architecture specific features in perf_cgroup David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 24/46] perf/x86/intel/cmt: add perf_cgroup_arch_css_{online,offline} David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 25/46] perf/x86/intel/cmt: add monr->flags and CMT_MONR_ZOMBIE David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 26/46] sched: introduce the finish_arch_pre_lock_switch() scheduler hook David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 27/46] perf/x86/intel: add pqr cache flags and intel_pqr_ctx_switch David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 28/46] perf,perf/x86,perf/powerpc,perf/arm,perf/*: add int error return to pmu::read David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 29/46] perf/x86/intel/cmt: add error handling to intel_cmt_event_read David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 30/46] perf/x86/intel/cmt: add asynchronous read for task events David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 31/46] perf/x86/intel/cmt: add subtree read for cgroup events David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 32/46] perf/core: Add PERF_EV_CAP_READ_ANY_{CPU_,}PKG flags David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 33/46] perf/x86/intel/cmt: use PERF_EV_CAP_READ_{,CPU_}PKG flags in Intel cmt David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 34/46] perf/core: introduce PERF_EV_CAP_CGROUP_NO_RECURSION David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 35/46] perf/x86/intel/cmt: use PERF_EV_CAP_CGROUP_NO_RECURSION in intel_cmt David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 36/46] perf/core: add perf_event cgroup hooks for subsystem attributes David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 37/46] perf/x86/intel/cmt: add cont_monitoring to perf cgroup David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 38/46] perf/x86/intel/cmt: introduce read SLOs for rotation David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 39/46] perf/x86/intel/cmt: add max_recycle_threshold sysfs attribute David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 40/46] perf/x86/intel/cmt: add rotation scheduled work David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 41/46] perf/x86/intel/cmt: add rotation minimum progress SLO David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 42/46] perf/x86/intel/cmt: add rmid stealing David Carrillo-Cisneros [this message]
2016-10-30  0:38 ` [PATCH v3 43/46] perf/x86/intel/cmt: add CMT_UF_NOSTEAL_RMID flag David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 44/46] perf/x86/intel/cmt: add debugfs intel_cmt directory David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 45/46] perf/stat: fix bug in handling events in error state David Carrillo-Cisneros
2016-10-30  0:38 ` [PATCH v3 46/46] perf/stat: revamp read error handling, snapshot and per_pkg events David Carrillo-Cisneros
