linux-mm.kvack.org archive mirror
* [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
@ 2017-06-08 19:19 josef
  2017-06-08 19:19 ` [PATCH 2/2] mm: make kswapd try harder to keep active pages in cache josef
  2017-06-13  5:28 ` [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation Minchan Kim
  0 siblings, 2 replies; 23+ messages in thread
From: josef @ 2017-06-08 19:19 UTC (permalink / raw)
  To: hannes, riel, akpm, linux-mm, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

When testing a slab heavy workload I noticed that we often would barely
reclaim anything at all from slab when kswapd started doing reclaim.
This is because we use the ratio of nr_scanned / nr_lru to determine how
much of slab we should reclaim.  But in a slab only/mostly workload we
will not have much page cache to reclaim, and thus our ratio will be
really low and not at all related to where the memory on the system is.
Instead we want to use a ratio of the reclaimable slab to the actual
reclaimable space on the system.  That way if we are slab heavy we work
harder to reclaim slab.
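
To make the difference concrete, here is a rough illustration with
made-up numbers.  Take a node with 1GB of reclaimable slab (262144 4k
pages), 100MB of page cache (25600 pages), and a first-pass scan of 25
pages.  The old pressure ratio handed to the shrinkers is

    nr_scanned / nr_lru            =     25 / 25600   (~0.1% of freeable)

while the new one is

    reclaimable slab / reclaimable = 262144 / 25600   (~10x freeable)

so the shrinkers finally see pressure proportional to where the memory
actually sits.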

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 mm/vmscan.c | 71 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 28 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f84cdd3..16add44 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -307,8 +307,8 @@ EXPORT_SYMBOL(unregister_shrinker);
 
 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 				    struct shrinker *shrinker,
-				    unsigned long nr_scanned,
-				    unsigned long nr_eligible)
+				    unsigned long numerator,
+				    unsigned long denominator)
 {
 	unsigned long freed = 0;
 	unsigned long long delta;
@@ -333,9 +333,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 
 	total_scan = nr;
-	delta = (4 * nr_scanned) / shrinker->seeks;
+	delta = (4 * numerator) / shrinker->seeks;
 	delta *= freeable;
-	do_div(delta, nr_eligible + 1);
+	do_div(delta, denominator + 1);
 	total_scan += delta;
 	if (total_scan < 0) {
 		pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -369,7 +369,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		total_scan = freeable * 2;
 
 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-				   nr_scanned, nr_eligible,
+				   numerator, denominator,
 				   freeable, delta, total_scan);
 
 	/*
@@ -429,8 +429,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * @gfp_mask: allocation context
  * @nid: node whose slab caches to target
  * @memcg: memory cgroup whose slab caches to target
- * @nr_scanned: pressure numerator
- * @nr_eligible: pressure denominator
+ * @numerator: pressure numerator
+ * @denominator: pressure denominator
  *
  * Call the shrink functions to age shrinkable caches.
  *
@@ -442,20 +442,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * objects from the memory cgroup specified. Otherwise, only unaware
  * shrinkers are called.
  *
- * @nr_scanned and @nr_eligible form a ratio that indicate how much of
- * the available objects should be scanned.  Page reclaim for example
- * passes the number of pages scanned and the number of pages on the
- * LRU lists that it considered on @nid, plus a bias in @nr_scanned
- * when it encountered mapped pages.  The ratio is further biased by
- * the ->seeks setting of the shrink function, which indicates the
- * cost to recreate an object relative to that of an LRU page.
+ * @numerator and @denominator form a ratio that indicates how much of
+ * the available objects should be scanned.  Global reclaim for example
+ * uses the ratio of reclaimable slab to the LRU sizes.
  *
  * Returns the number of reclaimed slab objects.
  */
 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 				 struct mem_cgroup *memcg,
-				 unsigned long nr_scanned,
-				 unsigned long nr_eligible)
+				 unsigned long numerator,
+				 unsigned long denominator)
 {
 	struct shrinker *shrinker;
 	unsigned long freed = 0;
@@ -463,9 +459,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 	if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
 		return 0;
 
-	if (nr_scanned == 0)
-		nr_scanned = SWAP_CLUSTER_MAX;
-
 	if (!down_read_trylock(&shrinker_rwsem)) {
 		/*
 		 * If we would return 0, our callers would understand that we
@@ -496,7 +489,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 			sc.nid = 0;
 
-		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+		freed += do_shrink_slab(&sc, shrinker, numerator, denominator);
 	}
 
 	up_read(&shrinker_rwsem);
@@ -2558,12 +2551,34 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 	return true;
 }
 
+static unsigned long lruvec_reclaimable_pages(struct lruvec *lruvec)
+{
+	unsigned long nr;
+
+	nr = lruvec_page_state(lruvec, NR_ACTIVE_FILE) +
+	     lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+	     lruvec_page_state(lruvec, NR_ISOLATED_FILE);
+
+	if (get_nr_swap_pages() > 0)
+		nr += lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+		      lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+		      lruvec_page_state(lruvec, NR_ISOLATED_ANON);
+
+	return nr;
+}
+
 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
+	unsigned long greclaim = 1, gslab = 1;
 	bool reclaimable = false;
 
+	if (global_reclaim(sc)) {
+		gslab = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
+		greclaim = pgdat_reclaimable_pages(pgdat);
+	}
+
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
 		struct mem_cgroup_reclaim_cookie reclaim = {
@@ -2578,6 +2593,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 		memcg = mem_cgroup_iter(root, NULL, &reclaim);
 		do {
+			struct lruvec *lruvec = mem_cgroup_lruvec(pgdat,
+								  memcg);
+			unsigned long nr_slab, nr_reclaim;
 			unsigned long lru_pages;
 			unsigned long reclaimed;
 			unsigned long scanned;
@@ -2592,14 +2610,16 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 			reclaimed = sc->nr_reclaimed;
 			scanned = sc->nr_scanned;
+			nr_slab = lruvec_page_state(lruvec,
+						    NR_SLAB_RECLAIMABLE);
+			nr_reclaim = lruvec_reclaimable_pages(lruvec);
 
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
 			node_lru_pages += lru_pages;
 
 			if (memcg)
 				shrink_slab(sc->gfp_mask, pgdat->node_id,
-					    memcg, sc->nr_scanned - scanned,
-					    lru_pages);
+					    memcg, nr_slab, nr_reclaim);
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
@@ -2623,14 +2643,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			}
 		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
-		/*
-		 * Shrink the slab caches in the same proportion that
-		 * the eligible LRU pages were scanned.
-		 */
 		if (global_reclaim(sc))
 			shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
-				    sc->nr_scanned - nr_scanned,
-				    node_lru_pages);
+				    gslab, greclaim);
 
 		/*
 		 * Record the subtree's reclaim efficiency. The reclaimed
-- 
2.7.4


* [PATCH 2/2] mm: make kswapd try harder to keep active pages in cache
  2017-06-08 19:19 [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation josef
@ 2017-06-08 19:19 ` josef
  2017-06-13  5:28 ` [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation Minchan Kim
  1 sibling, 0 replies; 23+ messages in thread
From: josef @ 2017-06-08 19:19 UTC (permalink / raw)
  To: hannes, riel, akpm, linux-mm, kernel-team; +Cc: Josef Bacik

From: Josef Bacik <jbacik@fb.com>

While testing slab reclaim I noticed that if we are running a workload
that uses most of the system memory for its working set and we start
putting a lot of reclaimable slab pressure on the system (think find /,
or some other silliness), we will happily evict the active pages over
the slab cache.  This is backwards, as we want to do all that we can to
keep the active working set in memory and instead evict these
short-lived objects.  The same thing occurs when, say, you do a yum
update of a few packages while your working set takes up most of RAM:
the inactive lists end up relatively small, so we reclaim active pages
even though we could reclaim these short-lived inactive pages.

My approach here is twofold.  First, keep track of the difference in
inactive and slab pages since the last time kswapd ran.  In the first
run this will just be the overall counts of inactive and slab, but for
each subsequent run we'll have a good idea of where the memory pressure
is coming from.  Then we use this information to put pressure on either
the inactive lists or the slab caches, depending on where the pressure
is coming from.
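
As a rough example (made-up numbers): if slab grew by 400k pages since
the last kswapd run while inactive grew by only 50k, the pressure is
clearly coming from slab, so we feed that 400k:50k ratio to the
shrinkers; if instead inactive grew faster, we flush the inactive lists
and leave slab alone for the first pass.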

If this optimization does not work, then we fall back to the previous
method of reclaiming space, with a slight adjustment.  Instead of using
the overall scan rate of page cache to determine the scan rate for slab,
we use the total usage of slab compared to the reclaimable page cache on
the box.  This allows us to put an appropriate amount of pressure on the
slab shrinkers if we are a mostly-slab workload.

I have two tests I was using to watch either side of this problem.  The
first test kept 2 files that took up 3/4 of the memory, and then started
creating a bunch of empty files.  Without this patch we would have to
re-read both files in their entirety at least 3 times during the run.
With this patch the active pages are never evicted.

The second test read and stat'ed all the files in a directory, which
again took up about 3/4 of the memory with slab cache.  Then I cat'ed a
100GiB file into /dev/null and verified that none of the file pages were
evicted.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 mm/vmscan.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 158 insertions(+), 15 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 16add44..141a860 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -110,11 +110,20 @@ struct scan_control {
 	/* One of the zones is ready for compaction */
 	unsigned int compaction_ready:1;
 
+	/* Only reclaim inactive page cache or slab. */
+	unsigned int inactive_only:1;
+
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
 	/* Number of pages freed so far during a call to shrink_zones() */
 	unsigned long nr_reclaimed;
+
+	/* Number of inactive pages added since last kswapd run. */
+	unsigned long inactive_diff;
+
+	/* Number of slab pages added since last kswapd run. */
+	unsigned long slab_diff;
 };
 
 #ifdef ARCH_HAS_PREFETCH
@@ -308,7 +317,8 @@ EXPORT_SYMBOL(unregister_shrinker);
 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 				    struct shrinker *shrinker,
 				    unsigned long numerator,
-				    unsigned long denominator)
+				    unsigned long denominator,
+				    unsigned long *slab_scanned)
 {
 	unsigned long freed = 0;
 	unsigned long long delta;
@@ -409,6 +419,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		next_deferred -= scanned;
 	else
 		next_deferred = 0;
+	if (slab_scanned)
+		(*slab_scanned) += scanned;
+
 	/*
 	 * move the unused scan count back into the shrinker in a
 	 * manner that handles concurrent updates. If we exhausted the
@@ -451,7 +464,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 				 struct mem_cgroup *memcg,
 				 unsigned long numerator,
-				 unsigned long denominator)
+				 unsigned long denominator,
+				 unsigned long *slab_scanned)
 {
 	struct shrinker *shrinker;
 	unsigned long freed = 0;
@@ -489,7 +503,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 			sc.nid = 0;
 
-		freed += do_shrink_slab(&sc, shrinker, numerator, denominator);
+		freed += do_shrink_slab(&sc, shrinker, numerator, denominator,
+					slab_scanned);
 	}
 
 	up_read(&shrinker_rwsem);
@@ -508,7 +523,7 @@ void drop_slab_node(int nid)
 		freed = 0;
 		do {
 			freed += shrink_slab(GFP_KERNEL, nid, memcg,
-					     1000, 1000);
+					     1000, 1000, NULL);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 	} while (freed > 10);
 }
@@ -2139,6 +2154,7 @@ enum scan_balance {
 	SCAN_FRACT,
 	SCAN_ANON,
 	SCAN_FILE,
+	SCAN_INACTIVE,
 };
 
 /*
@@ -2165,6 +2181,11 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	unsigned long ap, fp;
 	enum lru_list lru;
 
+	if (sc->inactive_only) {
+		scan_balance = SCAN_INACTIVE;
+		goto out;
+	}
+
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
 		scan_balance = SCAN_FILE;
@@ -2338,6 +2359,14 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 				scan = 0;
 			}
 			break;
+		case SCAN_INACTIVE:
+			if (file && !is_active_lru(lru)) {
+				scan = max(scan, sc->nr_to_reclaim);
+			} else {
+				size = 0;
+				scan = 0;
+			}
+			break;
 		default:
 			/* Look ma, no brain */
 			BUG();
@@ -2571,12 +2600,59 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
-	unsigned long greclaim = 1, gslab = 1;
+	unsigned long greclaim = 1, gslab = 1, total_high_wmark = 0, nr_inactive;
 	bool reclaimable = false;
+	bool skip_slab = false;
 
 	if (global_reclaim(sc)) {
+		int z;
+		for (z = 0; z < MAX_NR_ZONES; z++) {
+			struct zone *zone = &pgdat->node_zones[z];
+			if (!managed_zone(zone))
+				continue;
+			total_high_wmark += high_wmark_pages(zone);
+		}
+		nr_inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
 		gslab = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
 		greclaim = pgdat_reclaimable_pages(pgdat);
+	} else {
+		struct lruvec *lruvec =
+			mem_cgroup_lruvec(pgdat, sc->target_mem_cgroup);
+		total_high_wmark = sc->nr_to_reclaim;
+		nr_inactive = lruvec_page_state(lruvec, NR_INACTIVE_FILE);
+		gslab = lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
+	}
+
+	/*
+	 * If we don't have a lot of inactive or slab pages then there's no
+	 * point in trying to free them exclusively, do the normal scan stuff.
+	 */
+	if (nr_inactive + gslab < total_high_wmark)
+		sc->inactive_only = 0;
+
+	/*
+	 * We still want to slightly prefer slab over inactive, so if the
+	 * inactive on this node is large enough and is what is pushing us into
+	 * reclaim territory, then limit our flushing to the inactive list for
+	 * the first go-around.
+	 *
+	 * The idea is that with a memcg configured system we will still reclaim
+	 * memcg aware shrinkers, which includes the super block shrinkers.  So
+	 * if our steady state is keeping fs objects in cache for our workload
+	 * we'll still put a certain amount of pressure on them anyway.  To
+	 * avoid evicting things we actually care about we want to skip slab
+	 * reclaim altogether.
+	 *
+	 * However we still want to account for slab and inactive growing at the
+	 * same rate, so if that is the case just carry on shrinking inactive
+	 * and slab together.
+	 */
+	if (nr_inactive > total_high_wmark &&
+	    sc->inactive_diff > sc->slab_diff) {
+		unsigned long tmp = sc->inactive_diff >> 1;
+
+		if (tmp >= sc->slab_diff)
+			skip_slab = true;
 	}
 
 	do {
@@ -2586,6 +2662,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			.priority = sc->priority,
 		};
 		unsigned long node_lru_pages = 0;
+		unsigned long slab_reclaimed = 0;
+		unsigned long slab_scanned = 0;
 		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
@@ -2617,9 +2695,18 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
 			node_lru_pages += lru_pages;
 
-			if (memcg)
-				shrink_slab(sc->gfp_mask, pgdat->node_id,
-					    memcg, nr_slab, nr_reclaim);
+			if (memcg && !skip_slab) {
+				unsigned long numerator = nr_slab;
+				unsigned long denominator = nr_reclaim;
+				if (sc->inactive_only) {
+					numerator = sc->slab_diff;
+					denominator = sc->inactive_diff;
+				}
+				slab_reclaimed +=
+					shrink_slab(sc->gfp_mask, pgdat->node_id,
+						    memcg, numerator, denominator,
+						    &slab_scanned);
+			}
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
@@ -2643,9 +2730,18 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			}
 		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
-		if (global_reclaim(sc))
-			shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
-				    gslab, greclaim);
+		if (!skip_slab && global_reclaim(sc)) {
+			unsigned long numerator = gslab;
+			unsigned long denominator = greclaim;
+			if (sc->inactive_only) {
+				numerator = sc->slab_diff;
+				denominator = sc->inactive_diff;
+			}
+			slab_reclaimed += shrink_slab(sc->gfp_mask,
+						      pgdat->node_id, NULL,
+						      numerator, denominator,
+						      &slab_scanned);
+		}
 
 		/*
 		 * Record the subtree's reclaim efficiency. The reclaimed
@@ -2664,9 +2760,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			reclaim_state->reclaimed_slab = 0;
 		}
 
-		if (sc->nr_reclaimed - nr_reclaimed)
+		if (sc->nr_reclaimed - nr_reclaimed) {
 			reclaimable = true;
+		} else if (sc->inactive_only && !skip_slab) {
+			unsigned long percent = 0;
 
+			/*
+			 * We didn't reclaim anything this go around, so the
+			 * inactive list is likely spent.  If we're reclaiming
+			 * less than half of the objects in slab that we're
+			 * scanning then just stop doing the inactive only scan.
+			 * Otherwise ramp up the pressure on the slab caches
+			 * hoping that eventually we'll start freeing enough
+			 * objects to reclaim space.
+			 */
+			if (slab_scanned)
+				percent = slab_reclaimed * 100 / slab_scanned;
+			if (percent < 50)
+				sc->inactive_only = 0;
+			else
+				gslab <<= 1;
+		}
+		skip_slab = false;
 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
 
@@ -3309,7 +3424,8 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
  * or lower is eligible for reclaim until at least one usable zone is
  * balanced.
  */
-static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx,
+			 unsigned long inactive_diff, unsigned long slab_diff)
 {
 	int i;
 	unsigned long nr_soft_reclaimed;
@@ -3322,6 +3438,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.inactive_only = 1,
+		.inactive_diff = inactive_diff,
+		.slab_diff = slab_diff,
 	};
 	count_vm_event(PAGEOUTRUN);
 
@@ -3541,7 +3660,7 @@ static int kswapd(void *p)
 	unsigned int classzone_idx = MAX_NR_ZONES - 1;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
-
+	unsigned long nr_slab = 0, nr_inactive = 0;
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
@@ -3571,6 +3690,7 @@ static int kswapd(void *p)
 	pgdat->kswapd_order = 0;
 	pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
 	for ( ; ; ) {
+		unsigned long slab_diff, inactive_diff;
 		bool ret;
 
 		alloc_order = reclaim_order = pgdat->kswapd_order;
@@ -3598,6 +3718,23 @@ static int kswapd(void *p)
 			continue;
 
 		/*
+		 * We want to know where we're adding pages so we can make
+		 * smarter decisions about where we're going to put pressure
+		 * when shrinking.
+		 */
+		slab_diff = sum_zone_node_page_state(pgdat->node_id,
+						     NR_SLAB_RECLAIMABLE);
+		inactive_diff = node_page_state(pgdat, NR_INACTIVE_FILE);
+		if (nr_slab > slab_diff)
+			slab_diff = 0;
+		else
+			slab_diff -= nr_slab;
+		if (inactive_diff < nr_inactive)
+			inactive_diff = 0;
+		else
+			inactive_diff -= nr_inactive;
+
+		/*
 		 * Reclaim begins at the requested order but if a high-order
 		 * reclaim fails then kswapd falls back to reclaiming for
 		 * order-0. If that happens, kswapd will consider sleeping
@@ -3607,7 +3744,11 @@ static int kswapd(void *p)
 		 */
 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
 						alloc_order);
-		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx,
+					      inactive_diff, slab_diff);
+		nr_inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
+		nr_slab = sum_zone_node_page_state(pgdat->node_id,
+						   NR_SLAB_RECLAIMABLE);
 		if (reclaim_order < alloc_order)
 			goto kswapd_try_sleep;
 	}
@@ -3859,6 +4000,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
 		.reclaim_idx = gfp_zone(gfp_mask),
+		.slab_diff = 1,
+		.inactive_diff = 1,
 	};
 
 	cond_resched();
-- 
2.7.4


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-08 19:19 [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation josef
  2017-06-08 19:19 ` [PATCH 2/2] mm: make kswapd try harder to keep active pages in cache josef
@ 2017-06-13  5:28 ` Minchan Kim
  2017-06-13 12:01   ` Josef Bacik
  1 sibling, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2017-06-13  5:28 UTC (permalink / raw)
  To: josef; +Cc: hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik

Hello,

On Thu, Jun 08, 2017 at 03:19:05PM -0400, josef@toxicpanda.com wrote:
> From: Josef Bacik <jbacik@fb.com>
> 
> When testing a slab heavy workload I noticed that we often would barely
> reclaim anything at all from slab when kswapd started doing reclaim.
> This is because we use the ratio of nr_scanned / nr_lru to determine how
> much of slab we should reclaim.  But in a slab only/mostly workload we
> will not have much page cache to reclaim, and thus our ratio will be
> really low and not at all related to where the memory on the system is.

I want to understand this clearly.
Why is nr_scanned / nr_lru low if the system doesn't have much page cache?
Could you elaborate a bit?

Thanks.

> Instead we want to use a ratio of the reclaimable slab to the actual
> reclaimable space on the system.  That way if we are slab heavy we work
> harder to reclaim slab.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  mm/vmscan.c | 71 +++++++++++++++++++++++++++++++++++++------------------------
>  1 file changed, 43 insertions(+), 28 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index f84cdd3..16add44 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -307,8 +307,8 @@ EXPORT_SYMBOL(unregister_shrinker);
>  
>  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>  				    struct shrinker *shrinker,
> -				    unsigned long nr_scanned,
> -				    unsigned long nr_eligible)
> +				    unsigned long numerator,
> +				    unsigned long denominator)
>  {
>  	unsigned long freed = 0;
>  	unsigned long long delta;
> @@ -333,9 +333,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>  	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
>  
>  	total_scan = nr;
> -	delta = (4 * nr_scanned) / shrinker->seeks;
> +	delta = (4 * numerator) / shrinker->seeks;
>  	delta *= freeable;
> -	do_div(delta, nr_eligible + 1);
> +	do_div(delta, denominator + 1);
>  	total_scan += delta;
>  	if (total_scan < 0) {
>  		pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
> @@ -369,7 +369,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>  		total_scan = freeable * 2;
>  
>  	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
> -				   nr_scanned, nr_eligible,
> +				   numerator, denominator,
>  				   freeable, delta, total_scan);
>  
>  	/*
> @@ -429,8 +429,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>   * @gfp_mask: allocation context
>   * @nid: node whose slab caches to target
>   * @memcg: memory cgroup whose slab caches to target
> - * @nr_scanned: pressure numerator
> - * @nr_eligible: pressure denominator
> + * @numerator: pressure numerator
> + * @denominator: pressure denominator
>   *
>   * Call the shrink functions to age shrinkable caches.
>   *
> @@ -442,20 +442,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>   * objects from the memory cgroup specified. Otherwise, only unaware
>   * shrinkers are called.
>   *
> - * @nr_scanned and @nr_eligible form a ratio that indicate how much of
> - * the available objects should be scanned.  Page reclaim for example
> - * passes the number of pages scanned and the number of pages on the
> - * LRU lists that it considered on @nid, plus a bias in @nr_scanned
> - * when it encountered mapped pages.  The ratio is further biased by
> - * the ->seeks setting of the shrink function, which indicates the
> - * cost to recreate an object relative to that of an LRU page.
> + * @numerator and @denominator form a ratio that indicates how much of
> + * the available objects should be scanned.  Global reclaim for example
> + * uses the ratio of reclaimable slab to the LRU sizes.
>   *
>   * Returns the number of reclaimed slab objects.
>   */
>  static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  				 struct mem_cgroup *memcg,
> -				 unsigned long nr_scanned,
> -				 unsigned long nr_eligible)
> +				 unsigned long numerator,
> +				 unsigned long denominator)
>  {
>  	struct shrinker *shrinker;
>  	unsigned long freed = 0;
> @@ -463,9 +459,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  	if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
>  		return 0;
>  
> -	if (nr_scanned == 0)
> -		nr_scanned = SWAP_CLUSTER_MAX;
> -
>  	if (!down_read_trylock(&shrinker_rwsem)) {
>  		/*
>  		 * If we would return 0, our callers would understand that we
> @@ -496,7 +489,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
>  			sc.nid = 0;
>  
> -		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
> +		freed += do_shrink_slab(&sc, shrinker, numerator, denominator);
>  	}
>  
>  	up_read(&shrinker_rwsem);
> @@ -2558,12 +2551,34 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
>  	return true;
>  }
>  
> +static unsigned long lruvec_reclaimable_pages(struct lruvec *lruvec)
> +{
> +	unsigned long nr;
> +
> +	nr = lruvec_page_state(lruvec, NR_ACTIVE_FILE) +
> +	     lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
> +	     lruvec_page_state(lruvec, NR_ISOLATED_FILE);
> +
> +	if (get_nr_swap_pages() > 0)
> +		nr += lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
> +		      lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
> +		      lruvec_page_state(lruvec, NR_ISOLATED_ANON);
> +
> +	return nr;
> +}
> +
>  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  {
>  	struct reclaim_state *reclaim_state = current->reclaim_state;
>  	unsigned long nr_reclaimed, nr_scanned;
> +	unsigned long greclaim = 1, gslab = 1;
>  	bool reclaimable = false;
>  
> +	if (global_reclaim(sc)) {
> +		gslab = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
> +		greclaim = pgdat_reclaimable_pages(pgdat);
> +	}
> +
>  	do {
>  		struct mem_cgroup *root = sc->target_mem_cgroup;
>  		struct mem_cgroup_reclaim_cookie reclaim = {
> @@ -2578,6 +2593,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  
>  		memcg = mem_cgroup_iter(root, NULL, &reclaim);
>  		do {
> +			struct lruvec *lruvec = mem_cgroup_lruvec(pgdat,
> +								  memcg);
> +			unsigned long nr_slab, nr_reclaim;
>  			unsigned long lru_pages;
>  			unsigned long reclaimed;
>  			unsigned long scanned;
> @@ -2592,14 +2610,16 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  
>  			reclaimed = sc->nr_reclaimed;
>  			scanned = sc->nr_scanned;
> +			nr_slab = lruvec_page_state(lruvec,
> +						    NR_SLAB_RECLAIMABLE);
> +			nr_reclaim = lruvec_reclaimable_pages(lruvec);
>  
>  			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
>  			node_lru_pages += lru_pages;
>  
>  			if (memcg)
>  				shrink_slab(sc->gfp_mask, pgdat->node_id,
> -					    memcg, sc->nr_scanned - scanned,
> -					    lru_pages);
> +					    memcg, nr_slab, nr_reclaim);
>  
>  			/* Record the group's reclaim efficiency */
>  			vmpressure(sc->gfp_mask, memcg, false,
> @@ -2623,14 +2643,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  			}
>  		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
>  
> -		/*
> -		 * Shrink the slab caches in the same proportion that
> -		 * the eligible LRU pages were scanned.
> -		 */
>  		if (global_reclaim(sc))
>  			shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
> -				    sc->nr_scanned - nr_scanned,
> -				    node_lru_pages);
> +				    gslab, greclaim);
>  
>  		/*
>  		 * Record the subtree's reclaim efficiency. The reclaimed
> -- 
> 2.7.4
> 


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-13  5:28 ` [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation Minchan Kim
@ 2017-06-13 12:01   ` Josef Bacik
  2017-06-14  6:40     ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Josef Bacik @ 2017-06-13 12:01 UTC (permalink / raw)
  To: Minchan Kim; +Cc: josef, hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik

On Tue, Jun 13, 2017 at 02:28:02PM +0900, Minchan Kim wrote:
> Hello,
> 
> On Thu, Jun 08, 2017 at 03:19:05PM -0400, josef@toxicpanda.com wrote:
> > From: Josef Bacik <jbacik@fb.com>
> > 
> > When testing a slab heavy workload I noticed that we often would barely
> > reclaim anything at all from slab when kswapd started doing reclaim.
> > This is because we use the ratio of nr_scanned / nr_lru to determine how
> > much of slab we should reclaim.  But in a slab only/mostly workload we
> > will not have much page cache to reclaim, and thus our ratio will be
> > really low and not at all related to where the memory on the system is.
> 
> I want to understand this clearly.
> Why nr_scanned / nr_lru is low if system doesnt' have much page cache?
> Could you elaborate it a bit?
> 

Yeah so for example on my freshly booted test box I have this

Active:            58840 kB
Inactive:          46860 kB

Every time we do a get_scan_count() we do this

scan = size >> sc->priority

where sc->priority starts at DEF_PRIORITY, which is 12.  The first loop through
reclaim would result in a scan target of 2 pages to 11715 total inactive pages,
and 3 pages to 14710 total active pages.  This is a really really small target
for a system that is entirely slab pages.  And this is super optimistic, this
assumes we even get to scan these pages.  We don't increment sc->nr_scanned
unless we 1) isolate the page, which assumes it's not in use, and 2) can lock
the page.  Under pressure these numbers could probably go down; I'm sure there
are some random pages from daemons that aren't actually in use, so the targets
get even smaller.
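
To make the shape of that curve concrete, here's a throwaway userspace
sketch (not kernel code) of the scan targets the box above would get at
each priority level:

#include <stdio.h>

int main(void)
{
	unsigned long inactive = 11715, active = 14710;	/* pages, from above */
	int prio;

	/* scan = size >> sc->priority, DEF_PRIORITY == 12 */
	for (prio = 12; prio >= 0; prio--)
		printf("prio %2d: scan %5lu inactive, %5lu active\n",
		       prio, inactive >> prio, active >> prio);
	return 0;
}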

We have to get sc->priority down a lot before we start to get to the 1:1 ratio
that would even start to be useful for reclaim in this scenario.  Add to this
that most shrinkable slabs have this idea that their objects have to loop
through the LRU twice (no longer icache/dcache as Al took my patch to fix that
thankfully) and you end up spending a lot of time looping and reclaiming
nothing.  Basing it on actual slab usage makes more sense logically and avoids
this kind of problem.  Thanks,

Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-13 12:01   ` Josef Bacik
@ 2017-06-14  6:40     ` Minchan Kim
  2017-06-19 15:11       ` Josef Bacik
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2017-06-14  6:40 UTC (permalink / raw)
  To: Josef Bacik; +Cc: hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik

On Tue, Jun 13, 2017 at 08:01:57AM -0400, Josef Bacik wrote:
> On Tue, Jun 13, 2017 at 02:28:02PM +0900, Minchan Kim wrote:
> > Hello,
> > 
> > On Thu, Jun 08, 2017 at 03:19:05PM -0400, josef@toxicpanda.com wrote:
> > > From: Josef Bacik <jbacik@fb.com>
> > > 
> > > When testing a slab heavy workload I noticed that we often would barely
> > > reclaim anything at all from slab when kswapd started doing reclaim.
> > > This is because we use the ratio of nr_scanned / nr_lru to determine how
> > > much of slab we should reclaim.  But in a slab only/mostly workload we
> > > will not have much page cache to reclaim, and thus our ratio will be
> > > really low and not at all related to where the memory on the system is.
> > 
> > I want to understand this clearly.
> > Why nr_scanned / nr_lru is low if system doesnt' have much page cache?
> > Could you elaborate it a bit?
> > 
> 
> Yeah so for example on my freshly booted test box I have this
> 
> Active:            58840 kB
> Inactive:          46860 kB
> 
> Every time we do a get_scan_count() we do this
> 
> scan = size >> sc->priority
> 
> where sc->priority starts at DEF_PRIORITY, which is 12.  The first loop through
> reclaim would result in a scan target of 2 pages to 11715 total inactive pages,
> and 3 pages to 14710 total active pages.  This is a really really small target
> for a system that is entirely slab pages.  And this is super optimistic, this
> assumes we even get to scan these pages.  We don't increment sc->nr_scanned
> unless we 1) isolate the page, which assumes it's not in use, and 2) can lock
> the page.  Under pressure these numbers could probably go down, I'm sure there's
> some random pages from daemons that aren't actually in use, so the targets get
> even smaller.
> 
> We have to get sc->priority down a lot before we start to get to the 1:1 ratio
> that would even start to be useful for reclaim in this scenario.  Add to this
> that most shrinkable slabs have this idea that their objects have to loop
> through the LRU twice (no longer icache/dcache as Al took my patch to fix that
> thankfully) and you end up spending a lot of time looping and reclaiming
> nothing.  Basing it on actual slab usage makes more sense logically and avoids
> this kind of problem.  Thanks,

Thanks. I understand now.

As I see your change, it seems rather aggressive to me.

        node_slab = lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
        shrink_slab(,,, node_slab >> sc->priority, node_slab);

The point is that when we finish reclaiming from direct/background (ie,
kswapd) reclaim, it makes sure the VM has scanned slab objects up to twice
the total size, which is consistent with LRU pages.

What do you think about this?


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-14  6:40     ` Minchan Kim
@ 2017-06-19 15:11       ` Josef Bacik
  2017-06-20  2:46         ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Josef Bacik @ 2017-06-19 15:11 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik

On Wed, Jun 14, 2017 at 03:40:45PM +0900, Minchan Kim wrote:
> On Tue, Jun 13, 2017 at 08:01:57AM -0400, Josef Bacik wrote:
> > On Tue, Jun 13, 2017 at 02:28:02PM +0900, Minchan Kim wrote:
> > > Hello,
> > > 
> > > On Thu, Jun 08, 2017 at 03:19:05PM -0400, josef@toxicpanda.com wrote:
> > > > From: Josef Bacik <jbacik@fb.com>
> > > > 
> > > > When testing a slab heavy workload I noticed that we often would barely
> > > > reclaim anything at all from slab when kswapd started doing reclaim.
> > > > This is because we use the ratio of nr_scanned / nr_lru to determine how
> > > > much of slab we should reclaim.  But in a slab only/mostly workload we
> > > > will not have much page cache to reclaim, and thus our ratio will be
> > > > really low and not at all related to where the memory on the system is.
> > > 
> > > I want to understand this clearly.
> > > Why nr_scanned / nr_lru is low if system doesnt' have much page cache?
> > > Could you elaborate it a bit?
> > > 
> > 
> > Yeah so for example on my freshly booted test box I have this
> > 
> > Active:            58840 kB
> > Inactive:          46860 kB
> > 
> > Every time we do a get_scan_count() we do this
> > 
> > scan = size >> sc->priority
> > 
> > where sc->priority starts at DEF_PRIORITY, which is 12.  The first loop through
> > reclaim would result in a scan target of 2 pages to 11715 total inactive pages,
> > and 3 pages to 14710 total active pages.  This is a really really small target
> > for a system that is entirely slab pages.  And this is super optimistic, this
> > assumes we even get to scan these pages.  We don't increment sc->nr_scanned
> > unless we 1) isolate the page, which assumes it's not in use, and 2) can lock
> > the page.  Under pressure these numbers could probably go down, I'm sure there's
> > some random pages from daemons that aren't actually in use, so the targets get
> > even smaller.
> > 
> > We have to get sc->priority down a lot before we start to get to the 1:1 ratio
> > that would even start to be useful for reclaim in this scenario.  Add to this
> > that most shrinkable slabs have this idea that their objects have to loop
> > through the LRU twice (no longer icache/dcache as Al took my patch to fix that
> > thankfully) and you end up spending a lot of time looping and reclaiming
> > nothing.  Basing it on actual slab usage makes more sense logically and avoids
> > this kind of problem.  Thanks,
> 
> Thanks. I got understood now.
> 
> As I see your change, it seems to be rather aggressive to me.
> 
>         node_slab = lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
>         shrink_slab(,,, node_slab >> sc->priority, node_slab);
> 
> The point is when we finish reclaiming from direct/background(ie, kswapd),
> it makes sure that VM scanned slab object up to twice of the size which
> is consistent with LRU pages.
> 
> What do you think about this?

Sorry for the delay, I was on a short vacation.  At first I thought this was a
decent idea, so I went to put it in there.  But there were some problems with
it, and with sc->priority itself, I believe.  First, the results were not
great; we still end up not doing a lot of reclaim until we get down to the
lower priority numbers.

The thing that's different with slab vs everybody else is that these numbers are
a ratio, not a specific scan target amount.  With the other LRU's we do

scan = total >> sc->priority

and then we look through 'scan' number of pages, which means we're usually
reclaiming enough stuff to make progress at each priority level.  Slab is
different, pages != slab objects.  Plus we have this common pattern of putting
every object onto our lru list, and letting the scanning mechanism figure out
which objects are actually not in use any more, which means each scan is likely
to not make progress until we've gone through the entire lru.

You are worried that we are just going to empty the slab every time, and that is
totally a valid concern.  But we have checks in place to make sure that our
total_scan (the number of objects we scan) doesn't end up hugely bonkers so we
don't waste time scanning through objects.  If we wanted to be even more careful
we could add some checks in do_shrink_slab/shrink_slab to bail as soon as we hit
our reclaim targets, instead of having just the one check in shrink_node.
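
Something along these lines (untested sketch on top of this series;
'target' is hypothetical and would have to be plumbed in from the
caller):

	list_for_each_entry(shrinker, &shrinker_list, list) {
		...
		freed += do_shrink_slab(&sc, shrinker, numerator, denominator,
					slab_scanned);
		/* hypothetical early bail-out once we've freed enough */
		if (target && freed >= target)
			break;
	}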

As for sc->priority, I think it doesn't make much sense in general.  It makes
total sense to limit the number of pages scanned per LRU, but we can accomplish
this with ratios of each lru to the overall state of the system.  The fact is we
want to keep scanning and reclaiming until we hit our reclaim target, so using
the sc->priority thing is just kind of clunky and sometimes results in us
looping needlessly out to get the priority lowered, when we could just apply
ratio based pressure to the LRU's/slab until we hit our targets, and then bail
out.  I could be wrong and that seems like a big can of worms I don't want to
open right now, but for sure I don't think it's a good fit for slab shrinking
because of the disconnect of nr_slab_pages to actual slab objects.  Thanks,

Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-19 15:11       ` Josef Bacik
@ 2017-06-20  2:46         ` Minchan Kim
  2017-06-27 13:59           ` Josef Bacik
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2017-06-20  2:46 UTC (permalink / raw)
  To: Josef Bacik; +Cc: hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik

Hello Josef,

On Mon, Jun 19, 2017 at 11:11:21AM -0400, Josef Bacik wrote:
> On Wed, Jun 14, 2017 at 03:40:45PM +0900, Minchan Kim wrote:
> > On Tue, Jun 13, 2017 at 08:01:57AM -0400, Josef Bacik wrote:
> > > On Tue, Jun 13, 2017 at 02:28:02PM +0900, Minchan Kim wrote:
> > > > Hello,
> > > > 
> > > > On Thu, Jun 08, 2017 at 03:19:05PM -0400, josef@toxicpanda.com wrote:
> > > > > From: Josef Bacik <jbacik@fb.com>
> > > > > 
> > > > > When testing a slab heavy workload I noticed that we often would barely
> > > > > reclaim anything at all from slab when kswapd started doing reclaim.
> > > > > This is because we use the ratio of nr_scanned / nr_lru to determine how
> > > > > much of slab we should reclaim.  But in a slab only/mostly workload we
> > > > > will not have much page cache to reclaim, and thus our ratio will be
> > > > > really low and not at all related to where the memory on the system is.
> > > > 
> > > > I want to understand this clearly.
> > > > Why nr_scanned / nr_lru is low if system doesnt' have much page cache?
> > > > Could you elaborate it a bit?
> > > > 
> > > 
> > > Yeah so for example on my freshly booted test box I have this
> > > 
> > > Active:            58840 kB
> > > Inactive:          46860 kB
> > > 
> > > Every time we do a get_scan_count() we do this
> > > 
> > > scan = size >> sc->priority
> > > 
> > > where sc->priority starts at DEF_PRIORITY, which is 12.  The first loop through
> > > reclaim would result in a scan target of 2 pages to 11715 total inactive pages,
> > > and 3 pages to 14710 total active pages.  This is a really really small target
> > > for a system that is entirely slab pages.  And this is super optimistic, this
> > > assumes we even get to scan these pages.  We don't increment sc->nr_scanned
> > > unless we 1) isolate the page, which assumes it's not in use, and 2) can lock
> > > the page.  Under pressure these numbers could probably go down, I'm sure there's
> > > some random pages from daemons that aren't actually in use, so the targets get
> > > even smaller.
> > > 
> > > We have to get sc->priority down a lot before we start to get to the 1:1 ratio
> > > that would even start to be useful for reclaim in this scenario.  Add to this
> > > that most shrinkable slabs have this idea that their objects have to loop
> > > through the LRU twice (no longer icache/dcache as Al took my patch to fix that
> > > thankfully) and you end up spending a lot of time looping and reclaiming
> > > nothing.  Basing it on actual slab usage makes more sense logically and avoids
> > > this kind of problem.  Thanks,
> > 
> > Thanks. I got understood now.
> > 
> > As I see your change, it seems to be rather aggressive to me.
> > 
> >         node_slab = lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
> >         shrink_slab(,,, node_slab >> sc->priority, node_slab);
> > 
> > The point is when we finish reclaiming from direct/background(ie, kswapd),
> > it makes sure that VM scanned slab object up to twice of the size which
> > is consistent with LRU pages.
> > 
> > What do you think about this?
> 
> Sorry for the delay, I was on a short vacation.  At first I thought this was a
> decent idea so I went to put it in there.  But there were some problems with it,
> and with sc->priority itself I beleive.  First the results were not great, we
> still end up not doing a lot of reclaim until we get down to the lower priority
> numbers.
> 
> The thing that's different with slab vs everybody else is that these numbers are
> a ratio, not a specific scan target amount.  With the other LRU's we do

Hmm, I don't get why the ratio model is a problem.
My suggestion is to aim for scanning the entire available object list twice
in a reclaim cycle (priority from 12 to 0), which is consistent with LRU
reclaim. IOW, (1/4096 + 1/2048 + ... 1/1) * available objects.
If it cannot reclaim pages at low priority (ie, 12), it tries to reclaim
more objects at higher priority and finally it tries to reclaim every object
at priority 1, with one more chance at priority 0.
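
As a quick sanity check of that series, a throwaway C snippet (integer
shifts, as get_scan_count does):

#include <stdio.h>

int main(void)
{
	unsigned long objs = 1UL << 20, total = 0;	/* say, 1M objects */
	int prio;

	for (prio = 12; prio >= 0; prio--)
		total += objs >> prio;	/* 1/4096 + 1/2048 + ... + 1/1 */

	/* prints ~2.00x: the whole object list, twice */
	printf("scanned %lu of %lu (%.2fx)\n", total, objs,
	       (double)total / objs);
	return 0;
}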

> 
> scan = total >> sc->priority
> 
> and then we look through 'scan' number of pages, which means we're usually
> reclaiming enough stuff to make progress at each priority level.  Slab is
> different, pages != slab objects.  Plus we have this common pattern of putting

Aha, I see your concern. The problem is that although we can reclaim an object
from slab, that doesn't mean we reclaim a page, so the VM should go on to the
next priority cycle. If so, I think we can make the cost model more aggressive
with the ratio approach (1/12 + 2/12 + 3/12 ...). With this, in a reclaim
cycle (12..0), we guarantee that the entire object list is scanned four times
while the LRU is scanned two times. As well, it might be helpful to have a
tunable knob for slab reclaim to tune its aggressiveness, like swappiness for
anonymous pages.

> every object onto our lru list, and letting the scanning mechanism figure out
> which objects are actually not in use any more, which means each scan is likely
> to not make progress until we've gone through the entire lru.

Sorry, I didn't understand. Could you elaborate a bit if it's an important
point in this discussion?

> 
> You are worried that we are just going to empty the slab every time, and that is
> totally a valid concern.  But we have checks in place to make sure that our
> total_scan (the number of objects we scan) doesn't end up hugely bonkers so we
> don't waste time scanning through objects.  If we wanted to be even more careful
> we could add some checks in do_shrink_slab/shrink_slab to bail as soon as we hit
> our reclaim targets, instead of having just the one check in shrink_node.

Actually, my main worry is the expression (gslab/greclaimable).
What's the rationale for such a model in your mind?
Without understanding that, it's hard to say whether it's proper.

For example, with your expression, if nr_slab == nr_lru, it scans all objects
of slab. Why?  At that point the VM doesn't even scan the full LRU.
I really want to make them consistent, so that when a reclaim cycle is done
we guarantee some amount of scanning happens.
In my idea, LRU scanning is 2x the LRU pages and slab scanning is 4x the
slab objects.

> 
> As for sc->priority, I think it doesn't make much sense in general.  It makes
> total sense to limit the number of pages scanned per LRU, but we can accomplish
> this with ratios of each lru to the overall state of the system.  The fact is we
> want to keep scanning and reclaiming until we hit our reclaim target, so using
> the sc->priority thing is just kind of clunky and sometimes results in us
> looping needlessly out to get the priority lowered, when we could just apply
> ratio based pressure to the LRU's/slab until we hit our targets, and then bail
> out.  I could be wrong and that seems like a big can of worms I don't want to
> open right now, but for sure I don't think it's a good fit for slab shrinking
> because of the disconnect of nr_slab_pages to actual slab objects.  Thanks,
> 
> Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-20  2:46         ` Minchan Kim
@ 2017-06-27 13:59           ` Josef Bacik
  2017-06-30  2:17             ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Josef Bacik @ 2017-06-27 13:59 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik

On Tue, Jun 20, 2017 at 11:46:45AM +0900, Minchan Kim wrote:
> Hello Josef,
> 
> On Mon, Jun 19, 2017 at 11:11:21AM -0400, Josef Bacik wrote:
> > On Wed, Jun 14, 2017 at 03:40:45PM +0900, Minchan Kim wrote:
> > > On Tue, Jun 13, 2017 at 08:01:57AM -0400, Josef Bacik wrote:
> > > > On Tue, Jun 13, 2017 at 02:28:02PM +0900, Minchan Kim wrote:
> > > > > Hello,
> > > > > 
> > > > > On Thu, Jun 08, 2017 at 03:19:05PM -0400, josef@toxicpanda.com wrote:
> > > > > > From: Josef Bacik <jbacik@fb.com>
> > > > > > 
> > > > > > When testing a slab heavy workload I noticed that we often would barely
> > > > > > reclaim anything at all from slab when kswapd started doing reclaim.
> > > > > > This is because we use the ratio of nr_scanned / nr_lru to determine how
> > > > > > much of slab we should reclaim.  But in a slab only/mostly workload we
> > > > > > will not have much page cache to reclaim, and thus our ratio will be
> > > > > > really low and not at all related to where the memory on the system is.
> > > > > 
> > > > > I want to understand this clearly.
> > > > > Why nr_scanned / nr_lru is low if system doesnt' have much page cache?
> > > > > Could you elaborate it a bit?
> > > > > 
> > > > 
> > > > Yeah so for example on my freshly booted test box I have this
> > > > 
> > > > Active:            58840 kB
> > > > Inactive:          46860 kB
> > > > 
> > > > Every time we do a get_scan_count() we do this
> > > > 
> > > > scan = size >> sc->priority
> > > > 
> > > > where sc->priority starts at DEF_PRIORITY, which is 12.  The first loop through
> > > > reclaim would result in a scan target of 2 pages to 11715 total inactive pages,
> > > > and 3 pages to 14710 total active pages.  This is a really really small target
> > > > for a system that is entirely slab pages.  And this is super optimistic, this
> > > > assumes we even get to scan these pages.  We don't increment sc->nr_scanned
> > > > unless we 1) isolate the page, which assumes it's not in use, and 2) can lock
> > > > the page.  Under pressure these numbers could probably go down, I'm sure there's
> > > > some random pages from daemons that aren't actually in use, so the targets get
> > > > even smaller.
> > > > 
> > > > We have to get sc->priority down a lot before we start to get to the 1:1 ratio
> > > > that would even start to be useful for reclaim in this scenario.  Add to this
> > > > that most shrinkable slabs have this idea that their objects have to loop
> > > > through the LRU twice (no longer icache/dcache as Al took my patch to fix that
> > > > thankfully) and you end up spending a lot of time looping and reclaiming
> > > > nothing.  Basing it on actual slab usage makes more sense logically and avoids
> > > > this kind of problem.  Thanks,
> > > 
> > > Thanks. I got understood now.
> > > 
> > > As I see your change, it seems to be rather aggressive to me.
> > > 
> > >         node_slab = lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
> > >         shrink_slab(,,, node_slab >> sc->priority, node_slab);
> > > 
> > > The point is when we finish reclaiming from direct/background(ie, kswapd),
> > > it makes sure that VM scanned slab object up to twice of the size which
> > > is consistent with LRU pages.
> > > 
> > > What do you think about this?
> > 
> > Sorry for the delay, I was on a short vacation.  At first I thought this was a
> > decent idea so I went to put it in there.  But there were some problems with it,
> > and with sc->priority itself I beleive.  First the results were not great, we
> > still end up not doing a lot of reclaim until we get down to the lower priority
> > numbers.
> > 
> > The thing that's different with slab vs everybody else is that these numbers are
> > a ratio, not a specific scan target amount.  With the other LRU's we do
> 
> Hmm, I don't get it why the ratio model is a problem.
> My suggestion is to aim for scanning entire available objects list twice
> in a reclaim cycle(priority from 12 to 0) which is consistent with LRU
> reclaim. IOW, (1/4096 + 1/2048 + ... 1/1) * available object.
> If it cannot reclaim pages with low priority(ie, 12), it try to reclaim
> more objects in higher priority and finally, it try to reclaim every objects
> at priority 1 and one more chance with priority 0.

Because this static step down wastes cycles.  Why loop 10 times when you could
set the target at actual usage and try to get everything in one go?  Most
shrinkable slabs adhere to this default in-use-first model, which means that
we have to hit an object in the lru twice before it is freed.  So in order to
reclaim anything we have to scan a slab cache's entire lru at least once before
any reclaim starts happening.  If we're doing this static step down thing we
scan some of it, then some more, then some more, then finally we get priority
down enough that we scan a huge swath of the list, enough to start reclaiming
objects.

With the usage ratio in place it's based on actual system usage, so we waste
less time dropping the priority and spend more time actively trying to free
objects.

And keep in mind this is the first patch, that sets the baseline.  The next
patch makes it so we don't even really use this ratio that often; we use the
ratio of changed slab pages to changed inactive pages, so that can be even more
aggressive.

> 
> > 
> > scan = total >> sc->priority
> > 
> > and then we look through 'scan' number of pages, which means we're usually
> > reclaiming enough stuff to make progress at each priority level.  Slab is
> > different, pages != slab objects.  Plus we have this common pattern of putting
> 
> Aha, I see your concern. The problem is although we can reclaim a object from
> slab, it doesn't mean to reclaim a page so VM should go with next priority
> cycle. If so, I think we can adjust the the cost model more agressive with
> ratio approach. (1/12 + 2/12 + 3/12 ...) With this, in a reclaim cycle(12..0),
> we guarantees that scanning of entire objects list four times while LRU is
> two times. As well, it might be a helpful if we can have slab's reclaim tunable
> knob to tune reclaim agressiveness of slab like swappiness for anonymous pages.
> 

Death to all tunables ;).  I prefer this mechanism: it's based on what is
happening on the system currently, and not some weird static thing that can be
fucked up by some idiot system administrator inside Facebook who thinks he's
God's gift to kernel tuning.

I think the 1/12->2/12->3/12 thing is a better solution than your other option,
but again I argue that statically stepping down just wastes time in a majority
of the cases.

Thinking of it a different way, going to higher and higher priority to
hopefully put enough pressure on the slab has the side-effect of putting more
and more pressure on active/inactive.  If our app needs its hot stuff in the
active pages, we'd really rather not evict all of it just so we could get the
ratios right to discard the slab pages we didn't care about in the first place.

> > every object onto our lru list, and letting the scanning mechanism figure out
> > which objects are actually not in use any more, which means each scan is likely
> > to not make progress until we've gone through the entire lru.
> 
> Sorry, I didn't understand. Could you elaborate a bit if it's important point
> in this discussion?
> 

I expanded on this above, but I'll give a more concrete example.  Consider xfs
metadata: we allocate a slab object and read in our page, use it, and then free
the buffer, which puts it on the lru list.  XFS marks this with an INUSE flag,
which must be cleared before it is freed from the LRU.  We scan through the
LRU, clearing the INUSE flag and moving the object to the back of the LRU, but
not actually freeing it.  This happens for all (well, most) objects that end up
on the LRU, and this design pattern is used _everywhere_.  Until recently it
was used for the super shrinkers, but I changed that, so thankfully the bulk of
the problem is gone.  However if you do a find / -exec stat {} \;, and then do
it again, you'll end up with the same scenario for the super shrinker.  There's
no aging except via pressure on the slabs, so in the worst case we always have
to scan the entire slab object lru once before we start reclaiming anything.
Being aggressive here I think is ok; we have things in place to make sure we
don't over-reclaim.
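
To make that pattern concrete, here's a rough sketch of that style of LRU
walk, loosely modeled on the list_lru walk callbacks (the demo_* names are
made up and the signature is simplified, so don't take it as the actual XFS
code):

	/* Illustrative only: the "scan twice before freeing" pattern. */
	static enum lru_status demo_walk(struct list_head *item,
					 spinlock_t *lock, void *arg)
	{
		struct demo_obj *obj = container_of(item, struct demo_obj, lru);

		if (obj->flags & DEMO_INUSE) {
			/* First encounter: clear the flag and rotate the
			 * object to the back of the LRU instead of freeing. */
			obj->flags &= ~DEMO_INUSE;
			return LRU_ROTATE;
		}

		/* Second encounter: nothing touched it in between, free it. */
		list_del_init(item);
		demo_free_obj(obj);
		return LRU_REMOVED;
	}

So the first full pass over the LRU frees nothing; it only strips INUSE.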

> > 
> > You are worried that we are just going to empty the slab every time, and that is
> > totally a valid concern.  But we have checks in place to make sure that our
> > total_scan (the number of objects we scan) doesn't end up hugely bonkers so we
> > don't waste time scanning through objects.  If we wanted to be even more careful
> > we could add some checks in do_shrink_slab/shrink_slab to bail as soon as we hit
> > our reclaim targets, instead of having just the one check in shrink_node.
> 
> Acutually, my main worry is the expression(gslab/greclaimable).
> What's the rationale for such modeling in you mind?
> Without understanding that, it's hard to say whether it's proper.
> 
> For exmaple, with your expression, if nr_slab == nr_lru, it scans all objects
> of slab. Why?  At that time, VM even doesn't scan full LRU.
> I really want to make them consistent so when a reclaim cycle is done,
> we guarantees to happen some amount of scanning.
> In my idea, LRU scanning is x2 of LRU pages and slab scanning is x4 of
> slab object.

I really should read the whole email before I start replying to parts ;).  I
think I've explained my rationale above, but I'll summarize here:

1) Slab reclaim isn't like page reclaim.
  a) slab objects != slab pages, there are more objects to reclaim than pages,
     and fragmentation can fuck us.
  b) scanning/reclaiming slab objects is generally faster than page reclaim, so
     scanning more of them isn't a huge cpu cost; generally we don't have to do
     IO in order to reclaim (*cough*except for xfs*cough*).
  c) most slab LRUs institute that INUSE flag bullshit that forces us to scan
     the whole LRU once before reclaim occurs.
2) gslab/greclaimable equates to actual system usage.  With a normal machine
   nothing really changes, slab will be some single digit %, and nobody will
   notice; however with a mostly slab workload the slab LRUs can be huge and
   then small static targets get us nowhere (see 1c).
3) My next patch means we don't actually use gslab/greclaimable in reality that
   often; we'll use delta_slab/delta_inactive, so changing this here doesn't
   matter much unless we also want to debate my second patch as well.

Sorry for the long delay Minchan, I'm not trying to ignore you; I've been
trying to track down a cgroup cpu thing.  I'll try to be more responsive from
now on, as I'd like to get these patches into the next merge window.  Thanks,

Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-27 13:59           ` Josef Bacik
@ 2017-06-30  2:17             ` Minchan Kim
  2017-06-30 15:03               ` Josef Bacik
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2017-06-30  2:17 UTC (permalink / raw)
  To: Josef Bacik
  Cc: hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik, mhocko,
	cl, david

On Tue, Jun 27, 2017 at 09:59:32AM -0400, Josef Bacik wrote:
> On Tue, Jun 20, 2017 at 11:46:45AM +0900, Minchan Kim wrote:
> > Hello Josef,
> > 
> > On Mon, Jun 19, 2017 at 11:11:21AM -0400, Josef Bacik wrote:
> > > On Wed, Jun 14, 2017 at 03:40:45PM +0900, Minchan Kim wrote:
> > > > On Tue, Jun 13, 2017 at 08:01:57AM -0400, Josef Bacik wrote:
> > > > > On Tue, Jun 13, 2017 at 02:28:02PM +0900, Minchan Kim wrote:
> > > > > > Hello,
> > > > > > 
> > > > > > On Thu, Jun 08, 2017 at 03:19:05PM -0400, josef@toxicpanda.com wrote:
> > > > > > > From: Josef Bacik <jbacik@fb.com>
> > > > > > > 
> > > > > > > When testing a slab heavy workload I noticed that we often would barely
> > > > > > > reclaim anything at all from slab when kswapd started doing reclaim.
> > > > > > > This is because we use the ratio of nr_scanned / nr_lru to determine how
> > > > > > > much of slab we should reclaim.  But in a slab only/mostly workload we
> > > > > > > will not have much page cache to reclaim, and thus our ratio will be
> > > > > > > really low and not at all related to where the memory on the system is.
> > > > > > 
> > > > > > I want to understand this clearly.
> > > > > > Why nr_scanned / nr_lru is low if system doesnt' have much page cache?
> > > > > > Could you elaborate it a bit?
> > > > > > 
> > > > > 
> > > > > Yeah so for example on my freshly booted test box I have this
> > > > > 
> > > > > Active:            58840 kB
> > > > > Inactive:          46860 kB
> > > > > 
> > > > > Every time we do a get_scan_count() we do this
> > > > > 
> > > > > scan = size >> sc->priority
> > > > > 
> > > > > where sc->priority starts at DEF_PRIORITY, which is 12.  The first loop through
> > > > > reclaim would result in a scan target of 2 pages to 11715 total inactive pages,
> > > > > and 3 pages to 14710 total active pages.  This is a really really small target
> > > > > for a system that is entirely slab pages.  And this is super optimistic, this
> > > > > assumes we even get to scan these pages.  We don't increment sc->nr_scanned
> > > > > unless we 1) isolate the page, which assumes it's not in use, and 2) can lock
> > > > > the page.  Under pressure these numbers could probably go down, I'm sure there's
> > > > > some random pages from daemons that aren't actually in use, so the targets get
> > > > > even smaller.
> > > > > 
> > > > > We have to get sc->priority down a lot before we start to get to the 1:1 ratio
> > > > > that would even start to be useful for reclaim in this scenario.  Add to this
> > > > > that most shrinkable slabs have this idea that their objects have to loop
> > > > > through the LRU twice (no longer icache/dcache as Al took my patch to fix that
> > > > > thankfully) and you end up spending a lot of time looping and reclaiming
> > > > > nothing.  Basing it on actual slab usage makes more sense logically and avoids
> > > > > this kind of problem.  Thanks,
> > > > 
> > > > Thanks. I got understood now.
> > > > 
> > > > As I see your change, it seems to be rather aggressive to me.
> > > > 
> > > >         node_slab = lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
> > > >         shrink_slab(,,, node_slab >> sc->priority, node_slab);
> > > > 
> > > > The point is when we finish reclaiming from direct/background(ie, kswapd),
> > > > it makes sure that VM scanned slab object up to twice of the size which
> > > > is consistent with LRU pages.
> > > > 
> > > > What do you think about this?
> > > 
> > > Sorry for the delay, I was on a short vacation.  At first I thought this was a
> > > decent idea so I went to put it in there.  But there were some problems with it,
> > > and with sc->priority itself I beleive.  First the results were not great, we
> > > still end up not doing a lot of reclaim until we get down to the lower priority
> > > numbers.
> > > 
> > > The thing that's different with slab vs everybody else is that these numbers are
> > > a ratio, not a specific scan target amount.  With the other LRU's we do
> > 
> > Hmm, I don't get it why the ratio model is a problem.
> > My suggestion is to aim for scanning entire available objects list twice
> > in a reclaim cycle(priority from 12 to 0) which is consistent with LRU
> > reclaim. IOW, (1/4096 + 1/2048 + ... 1/1) * available object.
> > If it cannot reclaim pages with low priority(ie, 12), it try to reclaim
> > more objects in higher priority and finally, it try to reclaim every objects
> > at priority 1 and one more chance with priority 0.
> 
> Because this static step down wastes cycles.  Why loop 10 times when you could
> set the target at actual usage and try to get everything in one go?  Most
> shrinkable slabs adhere to this default of in use first model, which means that
> we have to hit an object in the lru twice before it is freed.  So in order to

I didn't know that.

> reclaim anything we have to scan a slab cache's entire lru at least once before
> any reclaim starts happening.  If we're doing this static step down thing we

If it's really true, I think that shrinker should be fixed first.

> scan some of it, then some more, then some more, then finally we get priority
> down enough that we scan a huge swatch of the list enough to start reclaiming
> objects.
> 
> With the usage ratio in place it's based on actual system usage, so we waste
> less time dropping the priority and spend more time actively trying to free
> objects.

However, I think your idea is too aggressive.

        100M LRU, 1000M SLAB

With your idea, it scans all objects in the shrinker 10 times, which ends up
reclaiming every slab page, I guess.

I think your idea comes from some specific slab shrinkers, as you mentioned.
I guess at the first stage all of the objects in a shrinker could be in the
INUSE state as you said; however, in the steady state they would work like a
real LRU and reflect recency.  Otherwise I want to call it broken, and we
shouldn't design the general slab aging model around those specific ones.

> 
> And keep in mind this is the first patch, that sets the baseline.  The next
> patch makes it so we don't even really use this ratio that often, we use the
> ratio of changed slab pages to changed inactive pages, so that can be even more
> agressive.

I intentionally didn't read your next patch, because without a clear
understanding of the prior patch it's hard to digest the second one, so I
wanted to discuss this one first.  However, if the second patch makes the
situation better I will read it, though I doubt it, because you said it would
be even more aggressive, which is my concern.

> 
> > 
> > > 
> > > scan = total >> sc->priority
> > > 
> > > and then we look through 'scan' number of pages, which means we're usually
> > > reclaiming enough stuff to make progress at each priority level.  Slab is
> > > different, pages != slab objects.  Plus we have this common pattern of putting
> > 
> > Aha, I see your concern. The problem is although we can reclaim a object from
> > slab, it doesn't mean to reclaim a page so VM should go with next priority
> > cycle. If so, I think we can adjust the the cost model more agressive with
> > ratio approach. (1/12 + 2/12 + 3/12 ...) With this, in a reclaim cycle(12..0),
> > we guarantees that scanning of entire objects list four times while LRU is
> > two times. As well, it might be a helpful if we can have slab's reclaim tunable
> > knob to tune reclaim agressiveness of slab like swappiness for anonymous pages.
> > 
> 
> Death to all tunables ;).  I prefer this mechanism, it's based on what is
> happening on the system currently, and not some weird static thing that can be
> fucked up by some idiot system administrator inside Facebook who thinks he's
> God's gift to kernel tuning.
> 
> I think the 1/12->2/12->3/12 thing is a better solution than your other option,
> but again I argue that statically stepping down just wastes time in a majority
> of the cases.
> 
> Thinking of it a different way, going to higher and higher priority to hopefully
> put enough pressure on the slab is going to have the side-effect of putting more
> and more pressure on active/inactive.  If our app needs its hot stuff in the
> active pages we'd really rather not evict all of it so we could get the ratios
> right to discard the slab pages we didn't care about in the first place.
> 
> > > every object onto our lru list, and letting the scanning mechanism figure out
> > > which objects are actually not in use any more, which means each scan is likely
> > > to not make progress until we've gone through the entire lru.
> > 
> > Sorry, I didn't understand. Could you elaborate a bit if it's important point
> > in this discussion?
> > 
> 
> I expanded on this above, but I'll give a more concrete example.  Consider xfs
> metadata, we allocate a slab object and read in our page, use it, and then free
> the buffer which put's it on the lru list.  XFS marks this with a INUSE flag,
> which must be cleared before it is free'd from the LRU.  We scan through the
> LRU, clearing the INUSE flag and moving the object to the back of the LRU, but
> not actually free'ing it.  This happens for all (well most) objects that end up
> on the LRU, and this design pattern is used _everywhere_.  Until recently it was
> used for the super shrinkers, but I changed that so thankfully the bulk of the
> problem is gone.  However if you do a find / -exec stat {} \;, and then do it
> again, you'll end up with the same scenario for the super shrinker.   There's no
> aging except via pressure on the slabs, so worst case we always have to scan the
> entire slab object lru once before we start reclaiming anything.  Being
> agressive here I think is ok, we have things in place to make sure we don't over
> reclaim.

Thanks for the detailed example.  Now I understand, but the question is: is it
always true?  I mean, at the first stage (ie, the first population of objects)
it seems to be, but at the steady stage I guess some objects have INUSE and
others don't, depending on the access pattern, so it emulates the LRU model.
No?

> 
> > > 
> > > You are worried that we are just going to empty the slab every time, and that is
> > > totally a valid concern.  But we have checks in place to make sure that our
> > > total_scan (the number of objects we scan) doesn't end up hugely bonkers so we
> > > don't waste time scanning through objects.  If we wanted to be even more careful
> > > we could add some checks in do_shrink_slab/shrink_slab to bail as soon as we hit
> > > our reclaim targets, instead of having just the one check in shrink_node.
> > 
> > Acutually, my main worry is the expression(gslab/greclaimable).
> > What's the rationale for such modeling in you mind?
> > Without understanding that, it's hard to say whether it's proper.
> > 
> > For exmaple, with your expression, if nr_slab == nr_lru, it scans all objects
> > of slab. Why?  At that time, VM even doesn't scan full LRU.
> > I really want to make them consistent so when a reclaim cycle is done,
> > we guarantees to happen some amount of scanning.
> > In my idea, LRU scanning is x2 of LRU pages and slab scanning is x4 of
> > slab object.
> 
> I really should read the whole email before I start replying to parts ;).  I
> think I've explained my rationale above, but I'll summarize here
> 
> 1) Slab reclaim isn't like page reclaim.
>   a) slab objects != slab pages, there's more objects to reclaim than pages, and
>      fragmentation can fuck us.


True.  There were some discussions about improving it.  Yes, it's not a
trivial job, but at least it would be better to revisit those ideas before
making slab reclaim too aggressive.

Ccing Dave, Christoph and other guys who might have an interest in slab
reclaim.

        page-based slab reclaim,
        https://lkml.org/lkml/2010/2/8/329

        slab defragmentation
        https://lkml.org/lkml/2007/7/7/175

>   b) scanning/reclaiming slab objects is generally faster than page reclaim, so
>      scanning more of them is a higher cpu cost, generally we don't have to do
>      IO in order to reclaim (*cough*except for xfs*cough*).
>   c) most slab lru's institute that INUSE flag bullshit that forces us to scan
>      the whole LRU once before reclaim occurs.

If that's really true, I think we should fix it rather than making the VM slab
reclaim logic too aggressive, which will affect other sane shrinkers.

> 2) gslab/greclaimable equates to actual system usage.  With a normal machine
>    nothing really changes, slab will be some single digit %, and nobody will
>    notice, however with a mostly slab workload the slab lru's can be huge and
>    then small static targets get us no where (see 1c).
> 3) My next patch means we don't actually use gslab/greclaimable in reality that
>    often, we'll use delta_slab/delta_inactive, so changing this here doesn't
>    matter much unless we also want to debate my second patch as well.
> 
> Sorry for the long delay Minchin, I'm not trying to ignore you, been trying to
> track down a cgroup cpu thing, I'll try to be more responsive from now on as I'd
> like to get these patches into the next merge window.  Thanks,

No problem, Josef.  I believe it would be a good chance to think over a slab
reclaim redesign.

Thanks.


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-30  2:17             ` Minchan Kim
@ 2017-06-30 15:03               ` Josef Bacik
  2017-07-02  1:58                 ` Dave Chinner
  2017-07-03  1:33                 ` Minchan Kim
  0 siblings, 2 replies; 23+ messages in thread
From: Josef Bacik @ 2017-06-30 15:03 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl, david

On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:

<snip>

> > 
> > Because this static step down wastes cycles.  Why loop 10 times when you could
> > set the target at actual usage and try to get everything in one go?  Most
> > shrinkable slabs adhere to this default of in use first model, which means that
> > we have to hit an object in the lru twice before it is freed.  So in order to
> 
> I didn't know that.
> 
> > reclaim anything we have to scan a slab cache's entire lru at least once before
> > any reclaim starts happening.  If we're doing this static step down thing we
> 
> If it's really true, I think that shrinker should be fixed first.
> 

Easier said than done.  I've fixed this for the super shrinkers, but like I said
below, all it takes is some asshole doing find / -exec stat {} \; twice to put
us back in the same situation again.  There's no aging mechanism other than
memory reclaim, so we get into this shitty situation of aging+reclaiming at the
same time.

> > scan some of it, then some more, then some more, then finally we get priority
> > down enough that we scan a huge swatch of the list enough to start reclaiming
> > objects.
> > 
> > With the usage ratio in place it's based on actual system usage, so we waste
> > less time dropping the priority and spend more time actively trying to free
> > objects.
> 
> However, I think your idea is too much agressive.
> 
>         100M LRU, 1000M SLAB
> 
> With your idea, it scans 10 times of all objects in shrinker which ends up
> reclaim every slab pages, I guess.
> 

No, we have limits in do_shrink_slab, so in this case we will limit the scan
count to twice the LRU size, which accounts for this INUSE design pattern
everybody loves.  Plus we have the early bailout logic, so when we reclaim
enough we are done; we don't just reclaim the whole thing.
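
(For reference, the clamp I mean in do_shrink_slab looks roughly like this,
where freeable is the shrinker's own count of reclaimable objects:

	if (total_scan > freeable * 2)
		total_scan = freeable * 2;

so no matter how big the initial ratio gets, one invocation never scans more
than two full passes of the cache.)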

> I think your idea comes from some of slab shrinker as you mentioned.
> I guess at the first time, all of objects in shrinker could be INUSE state
> as you said, however, on steady state, they would work like real LRU
> to reflect recency, otherwise, I want to call it broken and we shouldn't
> design general slab aging model for those specific one.
> 

Yeah that's totally a valid argument to make, but the idea of coming up with
something completely different hurts my head, and I'm trying to fix this problem
right now, not in 6 cycles when we all finally agree on the new mechanism.

> > 
> > And keep in mind this is the first patch, that sets the baseline.  The next
> > patch makes it so we don't even really use this ratio that often, we use the
> > ratio of changed slab pages to changed inactive pages, so that can be even more
> > agressive.
> 
> Intentionally, I didn't read your next patch because without clear understanding
> of prior patch, it's hard to digest second one so wanted to discuss this one
> first. However, if second patch makes the situation better, I will read but
> doubt because you said it would make more aggressive which is my concern.
> 

Right, so it adjusts the aggressiveness based on the change in slab vs inactive
lru size, so if we're generating a lot of slab pages and no inactive pages then
it'll look just as aggressive.

I think you're getting too scared of the scale of aggressiveness these numbers
generate.  We have a bunch of logic to trim these numbers down to reasonable
scan targets and to bail out when we've hit our reclaim target.  We end up with
bonkers numbers in bonkers situations, and those numbers are curtailed to
reasonable things later on, so the initial pass isn't that important.  What
_is_ important is that we are actually aggressive enough, because right now we
aren't and it hurts badly.  We can afford to be overly aggressive because we
have checks in place to back off.

> > 
> > > 
> > > > 
> > > > scan = total >> sc->priority
> > > > 
> > > > and then we look through 'scan' number of pages, which means we're usually
> > > > reclaiming enough stuff to make progress at each priority level.  Slab is
> > > > different, pages != slab objects.  Plus we have this common pattern of putting
> > > 
> > > Aha, I see your concern. The problem is although we can reclaim a object from
> > > slab, it doesn't mean to reclaim a page so VM should go with next priority
> > > cycle. If so, I think we can adjust the the cost model more agressive with
> > > ratio approach. (1/12 + 2/12 + 3/12 ...) With this, in a reclaim cycle(12..0),
> > > we guarantees that scanning of entire objects list four times while LRU is
> > > two times. As well, it might be a helpful if we can have slab's reclaim tunable
> > > knob to tune reclaim agressiveness of slab like swappiness for anonymous pages.
> > > 
> > 
> > Death to all tunables ;).  I prefer this mechanism, it's based on what is
> > happening on the system currently, and not some weird static thing that can be
> > fucked up by some idiot system administrator inside Facebook who thinks he's
> > God's gift to kernel tuning.
> > 
> > I think the 1/12->2/12->3/12 thing is a better solution than your other option,
> > but again I argue that statically stepping down just wastes time in a majority
> > of the cases.
> > 
> > Thinking of it a different way, going to higher and higher priority to hopefully
> > put enough pressure on the slab is going to have the side-effect of putting more
> > and more pressure on active/inactive.  If our app needs its hot stuff in the
> > active pages we'd really rather not evict all of it so we could get the ratios
> > right to discard the slab pages we didn't care about in the first place.
> > 
> > > > every object onto our lru list, and letting the scanning mechanism figure out
> > > > which objects are actually not in use any more, which means each scan is likely
> > > > to not make progress until we've gone through the entire lru.
> > > 
> > > Sorry, I didn't understand. Could you elaborate a bit if it's important point
> > > in this discussion?
> > > 
> > 
> > I expanded on this above, but I'll give a more concrete example.  Consider xfs
> > metadata, we allocate a slab object and read in our page, use it, and then free
> > the buffer which put's it on the lru list.  XFS marks this with a INUSE flag,
> > which must be cleared before it is free'd from the LRU.  We scan through the
> > LRU, clearing the INUSE flag and moving the object to the back of the LRU, but
> > not actually free'ing it.  This happens for all (well most) objects that end up
> > on the LRU, and this design pattern is used _everywhere_.  Until recently it was
> > used for the super shrinkers, but I changed that so thankfully the bulk of the
> > problem is gone.  However if you do a find / -exec stat {} \;, and then do it
> > again, you'll end up with the same scenario for the super shrinker.   There's no
> > aging except via pressure on the slabs, so worst case we always have to scan the
> > entire slab object lru once before we start reclaiming anything.  Being
> > agressive here I think is ok, we have things in place to make sure we don't over
> > reclaim.
> 
> Thanks for the detail example. Now I understood but the question is it is
> always true? I mean at the first stage(ie, first population of objects), it
> seems to be but at the steady stage, I guess some of objects have INUSE,
> others not by access pattern so it emulates LRU model. No?
> 

Sort of.  Let's take icache/dcache.  We open a file and close a file; this gets
added to the LRU.  We open the file and close the file again; it's already on
the LRU so it stays where it was and gets the INUSE flag set.  Once an object
is on the LRU it doesn't move unless we hit it via the shrinker.  Even if we
open and close the file, and then open and keep it open, the file stays on the
LRU, and is only removed once the shrinker hits it, sees its refcount is > 1,
and removes it from the list.

With my patch in place we will have a smattering of objects that are not in use
but have the INUSE flag set, objects with no INUSE that get freed immediately
(hooray!), and objects that really are in use that get removed from the LRU
once we encounter them.

Even with this "ideal" (or less shitty, depending on your point of view), our
LRU is going to be made up of a pretty significant number of objects that can't
be freed right away in the worst case.

> > 
> > > > 
> > > > You are worried that we are just going to empty the slab every time, and that is
> > > > totally a valid concern.  But we have checks in place to make sure that our
> > > > total_scan (the number of objects we scan) doesn't end up hugely bonkers so we
> > > > don't waste time scanning through objects.  If we wanted to be even more careful
> > > > we could add some checks in do_shrink_slab/shrink_slab to bail as soon as we hit
> > > > our reclaim targets, instead of having just the one check in shrink_node.
> > > 
> > > Acutually, my main worry is the expression(gslab/greclaimable).
> > > What's the rationale for such modeling in you mind?
> > > Without understanding that, it's hard to say whether it's proper.
> > > 
> > > For exmaple, with your expression, if nr_slab == nr_lru, it scans all objects
> > > of slab. Why?  At that time, VM even doesn't scan full LRU.
> > > I really want to make them consistent so when a reclaim cycle is done,
> > > we guarantees to happen some amount of scanning.
> > > In my idea, LRU scanning is x2 of LRU pages and slab scanning is x4 of
> > > slab object.
> > 
> > I really should read the whole email before I start replying to parts ;).  I
> > think I've explained my rationale above, but I'll summarize here
> > 
> > 1) Slab reclaim isn't like page reclaim.
> >   a) slab objects != slab pages, there's more objects to reclaim than pages, and
> >      fragmentation can fuck us.
> 
> 
> True. there were some discussion to improve it better. yes, that's not trivial
> job but at least it would be better to revisit ideas before making slab
> reclaim too aggressive.
> 
> Ccing Dave, Christoph and others guys might have a interest on slab reclaim.
> 
>         page-based slab reclaim,
>         https://lkml.org/lkml/2010/2/8/329
> 
>         slab defragmentation
>         https://lkml.org/lkml/2007/7/7/175
> 

This is the first path I went down, and I burned and salted the earth on my way
back.  The problem with trying to reclaim slab pages is you have to have some
insight into the objects contained on the page.  That gets you into weird
locking scenarios and ends up pretty yucky.

The next approach I took was having a slab lru, and then the reclaim would tell
the file system "I want to reclaim this page."  The problem is there's a
disconnect between what the vm thinks was last touched vs what was actually
last touched, which could lead to us freeing hotter objects when there are
cooler ones to free.

Admittedly I didn't spend much time on these solutions; all I really want is to
make the current situation less shitty right now so I can go back to being a
btrfs developer and not an everything-else fixer ;).

> >   b) scanning/reclaiming slab objects is generally faster than page reclaim, so
> >      scanning more of them is a higher cpu cost, generally we don't have to do
> >      IO in order to reclaim (*cough*except for xfs*cough*).
> >   c) most slab lru's institute that INUSE flag bullshit that forces us to scan
> >      the whole LRU once before reclaim occurs.
> 
> If it's really true, I think we should fix it rather than making VM slab reclaim
> logic too agressive which will affect other sane shrinkers.
> 

Agreed, and it's been fixed for the super shrinkers which are the largest pool.
However you can still force the bad behavior, so we still need to be able to
handle the worst case.

> > 2) gslab/greclaimable equates to actual system usage.  With a normal machine
> >    nothing really changes, slab will be some single digit %, and nobody will
> >    notice, however with a mostly slab workload the slab lru's can be huge and
> >    then small static targets get us no where (see 1c).
> > 3) My next patch means we don't actually use gslab/greclaimable in reality that
> >    often, we'll use delta_slab/delta_inactive, so changing this here doesn't
> >    matter much unless we also want to debate my second patch as well.
> > 
> > Sorry for the long delay Minchin, I'm not trying to ignore you, been trying to
> > track down a cgroup cpu thing, I'll try to be more responsive from now on as I'd
> > like to get these patches into the next merge window.  Thanks,
> 
> No problem, Josef. I belive it would be a good chance to think over slab reclaim
> redesign.
> 

Yeah I've had a smattering of talks with various people over the last year, and
I've tried to implement a few of the ideas, but nothing has turned out to be
really viable.

What I'm hoping to convince you of is that yes, the initial numbers are fucking
huge, and that does make us more aggressive.  However there are checks to pull
these numbers down to reasonable counts, so we are never in danger of scanning
all slab objects 10 times in one reclaim loop.  Coupled with the early bailout
logic, these things keep the worst case insanity down to something sane.
Thanks,

Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-30 15:03               ` Josef Bacik
@ 2017-07-02  1:58                 ` Dave Chinner
  2017-07-03 13:52                   ` Josef Bacik
  2017-07-03  1:33                 ` Minchan Kim
  1 sibling, 1 reply; 23+ messages in thread
From: Dave Chinner @ 2017-07-02  1:58 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Minchan Kim, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl

[I don't have the full context, haven't seen the proposed patches,
etc., so I'm only commenting on small bits of what I was cc'd on]

On Fri, Jun 30, 2017 at 11:03:24AM -0400, Josef Bacik wrote:
> On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:
> 
> <snip>
> 
> > > 
> > > Because this static step down wastes cycles.  Why loop 10 times when you could
> > > set the target at actual usage and try to get everything in one go?  Most
> > > shrinkable slabs adhere to this default of in use first model, which means that
> > > we have to hit an object in the lru twice before it is freed.  So in order to
> > 
> > I didn't know that.
> > 
> > > reclaim anything we have to scan a slab cache's entire lru at least once before
> > > any reclaim starts happening.  If we're doing this static step down thing we
> > 
> > If it's really true, I think that shrinker should be fixed first.
> > 
> 
> Easier said than done.  I've fixed this for the super shrinkers, but like I said
> below, all it takes is some asshole doing find / -exec stat {} \; twice to put
> us back in the same situation again.  There's no aging mechanism other than
> memory reclaim, so we get into this shitty situation of aging+reclaiming at the
> same time.

To me, this sounds like the problem that needs fixing, i.e.
threshold detection to trigger demand-based aging of shrinkable
caches at allocation time rather than waiting for memory pressure to
trigger aging.

> > I guess at the first time, all of objects in shrinker could be INUSE state
> > as you said, however, on steady state, they would work like real LRU
> > to reflect recency, otherwise, I want to call it broken and we shouldn't
> > design general slab aging model for those specific one.
> 
> Yeah that's totally a valid argument to make, but the idea of coming up with
> something completely different hurts my head, and I'm trying to fix this problem
> right now, not in 6 cycles when we all finally agree on the new mechanism.

Add a shrinker callback to the slab allocation code that checks the
rate of growth of the cache. If the cache is growing fast and
getting large, run the shrinker every so often to slow down the rate
of growth. The larger the cache, the slower the allowed rate of
growth. It's the feedback loop that we are missing between slab
allocation and slab reclaim that prevents us from controlling slab
cache growth and aging cleanly.
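
Something like the following, as a purely illustrative sketch - none of
these demo_* hooks exist, and the thresholds are made up:

	/* Illustrative: age a cached-object slab from the alloc path. */
	void demo_cache_alloc_hook(struct demo_cache *cache)
	{
		unsigned long nr = atomic_long_inc_return(&cache->nr_objs);

		/* The larger the cache, the lower the growth rate we
		 * tolerate before asking the shrinker to age entries out. */
		if (nr > cache->soft_limit &&
		    demo_growth_rate(cache) > demo_allowed_rate(nr)) {
			struct shrink_control sc = {
				.gfp_mask   = GFP_KERNEL,
				.nr_to_scan = nr >> 6,	/* made-up ratio */
			};
			cache->shrinker.scan_objects(&cache->shrinker, &sc);
		}
	}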

This would also allow us to cap the size of caching slabs easily
- something people have been asking us to do for years....

Note: we need to make a clear distinction between slabs used as
"heap" memory and slabs used for allocating large numbers of objects
that are cached for performance reasons. The problem here is cache
management - the fact we are using slabs to allocate the memory for
the objects is largely irrelevant...

> > > I expanded on this above, but I'll give a more concrete example.  Consider xfs
> > > metadata, we allocate a slab object and read in our page, use it, and then free
> > > the buffer which put's it on the lru list.  XFS marks this with a INUSE flag,
> > > which must be cleared before it is free'd from the LRU.  We scan through the
> > > LRU, clearing the INUSE flag and moving the object to the back of the LRU, but
> > > not actually free'ing it.  This happens for all (well most) objects that end up
> > > on the LRU, and this design pattern is used _everywhere_.  Until recently it was
> > > used for the super shrinkers, but I changed that so thankfully the bulk of the
> > > problem is gone.  However if you do a find / -exec stat {} \;, and then do it
> > > again, you'll end up with the same scenario for the super shrinker.   There's no
> > > aging except via pressure on the slabs, so worst case we always have to scan the
> > > entire slab object lru once before we start reclaiming anything.  Being
> > > agressive here I think is ok, we have things in place to make sure we don't over
> > > reclaim.
> > 
> > Thanks for the detail example. Now I understood but the question is it is
> > always true? I mean at the first stage(ie, first population of objects), it
> > seems to be but at the steady stage, I guess some of objects have INUSE,
> > others not by access pattern so it emulates LRU model. No?
> > 
> 
> Sort of.  Lets take icache/dcache.  We open a file and close a file, this get's
> added to the LRU.  We open the file and close the file again, it's already on
> the LRU so it stays where it was and gets the INUSE flag set.  Once an object is
> on the LRU it doesn't move unless we hit it via the shrinker.  Even if we open
> and close the file, and then open and keep it open, the file stays on the LRU,
> and is only removed once the shrinker hits it, sees it's refcount is > 1, and
> removes it from the list.

Yes, but the lazy LRU removal is a performance feature - the cost
in terms of lock contention of removing dentries and inodes from the LRU
on first reference is prohibitive, especially for short term usage
like 'find / -exec stat {} \;' workloads. Git does this sort of
traverse/stat a lot, so making this path any slower would make lots
of people unhappy...

> This is the first path I went down, and I burned and salted the earth on my way
> back.  The problem with trying to reclaim slab pages is you have to have some
> insight into the objects contained on the page.  That gets you into weird
> locking scenarios and end's up pretty yucky.

Yup, that's also the reason we don't have slab defragmentation.

> The next approach I took was having a slab lru, and then the reclaim would tell
> the file system "I want to reclaim this page."  The problem is there's a
> disconnect between what the vm will think is last touched vs what is actually
> last touched, which could lead to us free'ing hotter objects when there are
> cooler ones to free.

*nod*

Which is why I think having the slab allocator simply call the
shrinker with a slightly different set of constraints (e.g. max
size/max growth rate) would work better because it doesn't have any
of those problems...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-06-30 15:03               ` Josef Bacik
  2017-07-02  1:58                 ` Dave Chinner
@ 2017-07-03  1:33                 ` Minchan Kim
  2017-07-03 13:50                   ` Josef Bacik
  1 sibling, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2017-07-03  1:33 UTC (permalink / raw)
  To: Josef Bacik
  Cc: hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik, mhocko,
	cl, david

Hello,

On Fri, Jun 30, 2017 at 11:03:24AM -0400, Josef Bacik wrote:
> On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:
> 
> <snip>
> 
> > > 
> > > Because this static step down wastes cycles.  Why loop 10 times when you could
> > > set the target at actual usage and try to get everything in one go?  Most
> > > shrinkable slabs adhere to this default of in use first model, which means that
> > > we have to hit an object in the lru twice before it is freed.  So in order to
> > 
> > I didn't know that.
> > 
> > > reclaim anything we have to scan a slab cache's entire lru at least once before
> > > any reclaim starts happening.  If we're doing this static step down thing we
> > 
> > If it's really true, I think that shrinker should be fixed first.
> > 
> 
> Easier said than done.  I've fixed this for the super shrinkers, but like I said
> below, all it takes is some asshole doing find / -exec stat {} \; twice to put
> us back in the same situation again.  There's no aging mechanism other than
> memory reclaim, so we get into this shitty situation of aging+reclaiming at the
> same time.

What's different from the normal page cache problem?

It has the same problem you mentioned, so we need to look at what the VM does
to address it.

It has two LRU lists, active and inactive, and maintains a 1:1 size ratio.
A new page goes on the inactive list, and if it is touched twice it is
promoted to the active list, which is the same problem.
However, once reclaim is triggered, the VM can quickly move pages from
active to inactive, clearing the referenced flag, until the ratio is matched.
So the VM can easily reclaim pages from the inactive list.

Can we apply a similar mechanism to the problematic slabs?
How about adding shrink_slab(xxx, ACTIVE|INACTIVE) somewhere in the VM
for demotion of objects from the active list, and adding logic to move an
inactive object to the active list when a cache hit happens in the FS?
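
As a purely illustrative sketch of what I mean (every name below is
hypothetical):

	/* Hypothetical two-list aging for a shrinker's objects. */
	enum demo_lru { DEMO_INACTIVE, DEMO_ACTIVE };

	/* FS calls this on cache hit, like mark_page_accessed(). */
	void demo_obj_accessed(struct demo_obj *obj)
	{
		if (obj->which == DEMO_INACTIVE)
			demo_move(obj, DEMO_ACTIVE);
	}

	/* Called from the shrink path: demote until the lists balance,
	 * then reclaim only from the inactive list, mirroring the page
	 * LRU balancing. */
	void demo_shrink(struct demo_cache *c, unsigned long nr)
	{
		while (demo_count(c, DEMO_ACTIVE) > demo_count(c, DEMO_INACTIVE))
			demo_demote_one(c);	/* active -> inactive */
		demo_reclaim(c, DEMO_INACTIVE, nr);
	}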

> 
> > > scan some of it, then some more, then some more, then finally we get priority
> > > down enough that we scan a huge swatch of the list enough to start reclaiming
> > > objects.
> > > 
> > > With the usage ratio in place it's based on actual system usage, so we waste
> > > less time dropping the priority and spend more time actively trying to free
> > > objects.
> > 
> > However, I think your idea is too much agressive.
> > 
> >         100M LRU, 1000M SLAB
> > 
> > With your idea, it scans 10 times of all objects in shrinker which ends up
> > reclaim every slab pages, I guess.
> > 
> 
> No, we have limits in do_shrink_slab, so in this case we will limit the scan
> count to twice the LRU size, which accounts for this INUSE design pattern

Aha, that sounds logical. Twice the LRU size is enough to sweep out all
objects: first round, remove the INUSE flag; second round, free the objects?
IOW, if the LRU size is half of the slab in your logic, it could reclaim all
slab objects.

> everybody loves.  Plus we have the early bailout logic so when we reclaim enough
> we are done, we don't just reclaim the whole thing.

I cannot see which bailout logic you are referring to.
Once shrink_slab is called with a big #slab/#LRU gap in your logic,
it could reclaim every object in every shrinker.

> 
> > I think your idea comes from some of slab shrinker as you mentioned.
> > I guess at the first time, all of objects in shrinker could be INUSE state
> > as you said, however, on steady state, they would work like real LRU
> > to reflect recency, otherwise, I want to call it broken and we shouldn't
> > design general slab aging model for those specific one.
> > 
> 
> Yeah that's totally a valid argument to make, but the idea of coming up with
> something completely different hurts my head, and I'm trying to fix this problem
> right now, not in 6 cycles when we all finally agree on the new mechanism.
> 
> > > 
> > > And keep in mind this is the first patch, that sets the baseline.  The next
> > > patch makes it so we don't even really use this ratio that often, we use the
> > > ratio of changed slab pages to changed inactive pages, so that can be even more
> > > agressive.
> > 
> > Intentionally, I didn't read your next patch because without clear understanding
> > of prior patch, it's hard to digest second one so wanted to discuss this one
> > first. However, if second patch makes the situation better, I will read but
> > doubt because you said it would make more aggressive which is my concern.
> > 
> 
> Right so it adjusts the aggressiveness on the change in slab vs inactive lru
> size, so if we're generating a lot of slab pages and no inactive pages then
> it'll look just as agressive.
> 
> I think you're getting too scared of the scale of aggressiveness these numbers
> generate.  We have a bunch of logic to trim down these numbers to reasonable
> scan targets and to bail out when we've hit our reclaim target.  We end up with
> bonkers numbers in bonkers situations, and these numbers are curtailed to
> reasonable things later on, so the initial pass isn't that important.  What _is_
> important is that we are actually agressive enough, because right now we aren't
> and it hurts badly.  We can be overly agressive because we have checks in place
> to back off.

Sorry, but I cannot see what kind of bailout logic you are referring to.
Please elaborate a bit. I hope it is general VM logic,
not FS specific.

> 
> > > 
> > > > 
> > > > > 
> > > > > scan = total >> sc->priority
> > > > > 
> > > > > and then we look through 'scan' number of pages, which means we're usually
> > > > > reclaiming enough stuff to make progress at each priority level.  Slab is
> > > > > different, pages != slab objects.  Plus we have this common pattern of putting
> > > > 
> > > > Aha, I see your concern. The problem is although we can reclaim a object from
> > > > slab, it doesn't mean to reclaim a page so VM should go with next priority
> > > > cycle. If so, I think we can adjust the the cost model more agressive with
> > > > ratio approach. (1/12 + 2/12 + 3/12 ...) With this, in a reclaim cycle(12..0),
> > > > we guarantees that scanning of entire objects list four times while LRU is
> > > > two times. As well, it might be a helpful if we can have slab's reclaim tunable
> > > > knob to tune reclaim agressiveness of slab like swappiness for anonymous pages.
> > > > 
> > > 
> > > Death to all tunables ;).  I prefer this mechanism, it's based on what is
> > > happening on the system currently, and not some weird static thing that can be
> > > fucked up by some idiot system administrator inside Facebook who thinks he's
> > > God's gift to kernel tuning.
> > > 
> > > I think the 1/12->2/12->3/12 thing is a better solution than your other option,
> > > but again I argue that statically stepping down just wastes time in a majority
> > > of the cases.
> > > 
> > > Thinking of it a different way, going to higher and higher priority to hopefully
> > > put enough pressure on the slab is going to have the side-effect of putting more
> > > and more pressure on active/inactive.  If our app needs its hot stuff in the
> > > active pages we'd really rather not evict all of it so we could get the ratios
> > > right to discard the slab pages we didn't care about in the first place.
> > > 
> > > > > every object onto our lru list, and letting the scanning mechanism figure out
> > > > > which objects are actually not in use any more, which means each scan is likely
> > > > > to not make progress until we've gone through the entire lru.
> > > > 
> > > > Sorry, I didn't understand. Could you elaborate a bit if it's important point
> > > > in this discussion?
> > > > 
> > > 
> > > I expanded on this above, but I'll give a more concrete example.  Consider xfs
> > > metadata, we allocate a slab object and read in our page, use it, and then free
> > > the buffer which put's it on the lru list.  XFS marks this with a INUSE flag,
> > > which must be cleared before it is free'd from the LRU.  We scan through the
> > > LRU, clearing the INUSE flag and moving the object to the back of the LRU, but
> > > not actually free'ing it.  This happens for all (well most) objects that end up
> > > on the LRU, and this design pattern is used _everywhere_.  Until recently it was
> > > used for the super shrinkers, but I changed that so thankfully the bulk of the
> > > problem is gone.  However if you do a find / -exec stat {} \;, and then do it
> > > again, you'll end up with the same scenario for the super shrinker.   There's no
> > > aging except via pressure on the slabs, so worst case we always have to scan the
> > > entire slab object lru once before we start reclaiming anything.  Being
> > > agressive here I think is ok, we have things in place to make sure we don't over
> > > reclaim.
> > 
> > Thanks for the detail example. Now I understood but the question is it is
> > always true? I mean at the first stage(ie, first population of objects), it
> > seems to be but at the steady stage, I guess some of objects have INUSE,
> > others not by access pattern so it emulates LRU model. No?
> > 
> 
> Sort of.  Lets take icache/dcache.  We open a file and close a file, this get's
> added to the LRU.  We open the file and close the file again, it's already on
> the LRU so it stays where it was and gets the INUSE flag set.  Once an object is
> on the LRU it doesn't move unless we hit it via the shrinker.  Even if we open
> and close the file, and then open and keep it open, the file stays on the LRU,
> and is only removed once the shrinker hits it, sees it's refcount is > 1, and
> removes it from the list.

How about my suggestion above, which adds active/inactive LRU lists
for the slab and adds shrink_slab(xxx, active|inactive) in shrink_list,
for example? Of course, it needs more tweaks for shrinkers that can
use a lot of memory as a cache, but I think that's the tradeoff for
potential memory hogs.

> 
> With my patch in place we will have a smattering of objects that are not in use
> with the INUSE flag set, objects with no INUSE so get free'd immediately
> (hooray!), and objects that really are in use that get removed from the LRU once
> we encounter them.
> 
> Even with this "ideal" (or less shitty depending on your point of view), our LRU
> is going to be made up with a pretty significant number of objects that can't be
> free'd right away in the worst case.
> 
> > > 
> > > > > 
> > > > > You are worried that we are just going to empty the slab every time, and that is
> > > > > totally a valid concern.  But we have checks in place to make sure that our
> > > > > total_scan (the number of objects we scan) doesn't end up hugely bonkers so we
> > > > > don't waste time scanning through objects.  If we wanted to be even more careful
> > > > > we could add some checks in do_shrink_slab/shrink_slab to bail as soon as we hit
> > > > > our reclaim targets, instead of having just the one check in shrink_node.
> > > > 
> > > > Acutually, my main worry is the expression(gslab/greclaimable).
> > > > What's the rationale for such modeling in you mind?
> > > > Without understanding that, it's hard to say whether it's proper.
> > > > 
> > > > For exmaple, with your expression, if nr_slab == nr_lru, it scans all objects
> > > > of slab. Why?  At that time, VM even doesn't scan full LRU.
> > > > I really want to make them consistent so when a reclaim cycle is done,
> > > > we guarantees to happen some amount of scanning.
> > > > In my idea, LRU scanning is x2 of LRU pages and slab scanning is x4 of
> > > > slab object.
> > > 
> > > I really should read the whole email before I start replying to parts ;).  I
> > > think I've explained my rationale above, but I'll summarize here
> > > 
> > > 1) Slab reclaim isn't like page reclaim.
> > >   a) slab objects != slab pages, there's more objects to reclaim than pages, and
> > >      fragmentation can fuck us.
> > 
> > 
> > True. there were some discussion to improve it better. yes, that's not trivial
> > job but at least it would be better to revisit ideas before making slab
> > reclaim too aggressive.
> > 
> > Ccing Dave, Christoph and others guys might have a interest on slab reclaim.
> > 
> >         page-based slab reclaim,
> >         https://lkml.org/lkml/2010/2/8/329
> > 
> >         slab defragmentation
> >         https://lkml.org/lkml/2007/7/7/175
> > 
> 
> This is the first path I went down, and I burned and salted the earth on my way
> back.  The problem with trying to reclaim slab pages is you have to have some
> insight into the objects contained on the page.  That gets you into weird
> locking scenarios and end's up pretty yucky.
> 
> The next approach I took was having a slab lru, and then the reclaim would tell
> the file system "I want to reclaim this page."  The problem is there's a
> disconnect between what the vm will think is last touched vs what is actually
> last touched, which could lead to us free'ing hotter objects when there are
> cooler ones to free.
> 
> Admittedly I didn't spend much time on these solutions, all I really want is to
> make the current situation less shitty right now so I can go back to being a
> btrfs developer and not a everything else fixer ;).
> 
> > >   b) scanning/reclaiming slab objects is generally faster than page reclaim, so
> > >      scanning more of them is a higher cpu cost, generally we don't have to do
> > >      IO in order to reclaim (*cough*except for xfs*cough*).
> > >   c) most slab lru's institute that INUSE flag bullshit that forces us to scan
> > >      the whole LRU once before reclaim occurs.
> > 
> > If it's really true, I think we should fix it rather than making VM slab reclaim
> > logic too agressive which will affect other sane shrinkers.
> > 
> 
> Agreed, and it's been fixed for the super shrinkers which are the largest pool.
> However you can still force the bad behavior, so we still need to be able to
> handle the worst case.
> 
> > > 2) gslab/greclaimable equates to actual system usage.  With a normal machine
> > >    nothing really changes, slab will be some single digit %, and nobody will
> > >    notice, however with a mostly slab workload the slab lru's can be huge and
> > >    then small static targets get us no where (see 1c).
> > > 3) My next patch means we don't actually use gslab/greclaimable in reality that
> > >    often, we'll use delta_slab/delta_inactive, so changing this here doesn't
> > >    matter much unless we also want to debate my second patch as well.
> > > 
> > > Sorry for the long delay Minchin, I'm not trying to ignore you, been trying to
> > > track down a cgroup cpu thing, I'll try to be more responsive from now on as I'd
> > > like to get these patches into the next merge window.  Thanks,
> > 
> > No problem, Josef. I belive it would be a good chance to think over slab reclaim
> > redesign.
> > 
> 
> Yeah I've had a smattering of talks with various people over the last year, and
> I've tried to implement a few of the ideas, but nothing has turned out to be
> really viable.
> 
> What I'm hoping to convince you of is that yes, the initial numbers are fucking
> huge, and that does make us more agressive.  However there are checks to pull
> these numbers down to reasonable counts, so we are never in danger of scanning
> all slab objects 10 times for one reclaim loop.  Coupled with the early bailout
> logic these things keep the worst case insanity down to something sane.  Thanks,

Before answering, I need to know what you mean by the early bailout logic.


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-03  1:33                 ` Minchan Kim
@ 2017-07-03 13:50                   ` Josef Bacik
  2017-07-04  3:01                     ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Josef Bacik @ 2017-07-03 13:50 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl, david

On Mon, Jul 03, 2017 at 10:33:03AM +0900, Minchan Kim wrote:
> Hello,
> 
> On Fri, Jun 30, 2017 at 11:03:24AM -0400, Josef Bacik wrote:
> > On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:
> > 
> > <snip>
> > 
> > > > 
> > > > Because this static step down wastes cycles.  Why loop 10 times when you could
> > > > set the target at actual usage and try to get everything in one go?  Most
> > > > shrinkable slabs adhere to this default of in use first model, which means that
> > > > we have to hit an object in the lru twice before it is freed.  So in order to
> > > 
> > > I didn't know that.
> > > 
> > > > reclaim anything we have to scan a slab cache's entire lru at least once before
> > > > any reclaim starts happening.  If we're doing this static step down thing we
> > > 
> > > If it's really true, I think that shrinker should be fixed first.
> > > 
> > 
> > Easier said than done.  I've fixed this for the super shrinkers, but like I said
> > below, all it takes is some asshole doing find / -exec stat {} \; twice to put
> > us back in the same situation again.  There's no aging mechanism other than
> > memory reclaim, so we get into this shitty situation of aging+reclaiming at the
> > same time.
> 
> What's different from the normal page cache problem?
> 

What's different is that reclaiming a page from pagecache gives you a page,
while reclaiming 10k objects from slab may only give you one page if you are
super unlucky.  I'm nothing in life if I'm not unlucky.
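
(To put rough numbers on that: assuming ~200-byte objects packed ~20 to a 4K
page - sizes invented purely for illustration - freeing 10,000 objects gets
you back at most ~500 pages if the frees happen to empty whole slab pages, and
0 pages if every slab page keeps even one live object.)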

> It has the same problem you mentioned, so we need to peek at what the VM does
> to address it.
> 
> It has two LRU lists, active and inactive, and maintains a 1:1 size ratio.
> A new page goes on the inactive list, and if it is touched twice it is
> promoted to the active list, which is the same problem.
> However, once reclaim is triggered, the VM can quickly move pages from
> active to inactive, clearing the referenced flag, until the ratio is matched.
> So the VM can easily reclaim pages from the inactive list.
> 
> Can we apply a similar mechanism to the problematic slab?
> How about adding shrink_slab(xxx, ACTIVE|INACTIVE) somewhere in the VM
> to demote objects from the active list, and adding logic to move an
> inactive object to the active list when a cache hit happens in the FS?
> 

I did this too!  It worked out ok, but was a bit complex, and the problem was
solved just as well by dropping the INUSE-first approach.  I think that Dave's
approach of having a separate aging mechanism is a good complement to these
patches.

> > 
> > > > scan some of it, then some more, then some more, then finally we get priority
> > > > down enough that we scan a huge swath of the list, enough to start reclaiming
> > > > objects.
> > > > 
> > > > With the usage ratio in place it's based on actual system usage, so we waste
> > > > less time dropping the priority and spend more time actively trying to free
> > > > objects.
> > > 
> > > However, I think your idea is too aggressive.
> > > 
> > >         100M LRU, 1000M SLAB
> > > 
> > > With your idea, it scans all objects in the shrinker 10 times, which ends up
> > > reclaiming every slab page, I guess.
> > > 
> > 
> > No, we have limits in do_shrink_slab, so in this case we will limit the scan
> > count to twice the LRU size, which accounts for this INUSE design pattern
> 
> > Aha, that sounds logical. Twice the LRU size is enough to sweep out all
> > objects: first round, remove the INUSE flag; second round, free objects?
> > IOW, if the LRU size is half of the slab in your logic, it could reclaim
> > all slab objects.
> 
> > everybody loves.  Plus we have the early bailout logic so when we reclaim enough
> > we are done, we don't just reclaim the whole thing.
> 
> I cannot see which bailout logic you mean.
> Once shrink_slab is called with a big #slab/#LRU gap in your logic,
> it could reclaim every object in every shrinker.
> 
> > 
> > > I think your idea comes from some specific slab shrinkers, as you mentioned.
> > > I guess at first all objects in a shrinker could be in the INUSE state
> > > as you said; however, in the steady state they would work like a real LRU
> > > and reflect recency. Otherwise, I want to call it broken, and we shouldn't
> > > design the general slab aging model around those specific ones.
> > > 
> > 
> > Yeah that's totally a valid argument to make, but the idea of coming up with
> > something completely different hurts my head, and I'm trying to fix this problem
> > right now, not in 6 cycles when we all finally agree on the new mechanism.
> > 
> > > > 
> > > > And keep in mind this is the first patch, which sets the baseline.  The next
> > > > patch makes it so we don't even really use this ratio that often; we use the
> > > > ratio of changed slab pages to changed inactive pages, so that can be even more
> > > > aggressive.
> > > 
> > > I intentionally didn't read your next patch because, without a clear
> > > understanding of the prior patch, it's hard to digest the second one, so I
> > > wanted to discuss this one first. However, if the second patch makes the
> > > situation better, I will read it, though I doubt it will, because you said
> > > it would be more aggressive, which is my concern.
> > > 
> > 
> > Right, so it adjusts the aggressiveness based on the change in slab vs. inactive
> > LRU size, so if we're generating a lot of slab pages and no inactive pages then
> > it'll look just as aggressive.
> > 
> > I think you're getting too scared of the scale of aggressiveness these numbers
> > generate.  We have a bunch of logic to trim down these numbers to reasonable
> > scan targets and to bail out when we've hit our reclaim target.  We end up with
> > bonkers numbers in bonkers situations, and these numbers are curtailed to
> > reasonable things later on, so the initial pass isn't that important.  What _is_
> > important is that we are actually aggressive enough, because right now we aren't
> > and it hurts badly.  We can be overly aggressive because we have checks in place
> > to back off.
> 
> Sorry, but I cannot see what kind of bailout logic you mean.
> Please elaborate a bit. I hope it is general VM logic,
> not FS specific.
>

Jesus, I'm sorry Minchan, I've done so many different things trying to solve
this problem that I've blended together what the code looks like and the other
things I've tried.  At some point in the past I had moved the bailout code into
do_shrink_slab to keep us from over-reclaiming slab, but I dropped it later and
just went on thinking that's how it all worked.  Sorry again.  I'll respin
these patches and add the early bailout conditions as well to address this
problem.
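
Roughly, the check I have in mind would sit in do_shrink_slab's scan loop,
something like this (sketch only - the nr_to_reclaim field on shrink_control
is hypothetical and doesn't exist today):

	while (total_scan >= batch_size) {
		shrinkctl->nr_to_scan = batch_size;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;
		/* hypothetical early bailout: we've reclaimed enough, stop */
		if (freed >= shrinkctl->nr_to_reclaim)
			break;
		total_scan -= batch_size;
		cond_resched();
	}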

Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-02  1:58                 ` Dave Chinner
@ 2017-07-03 13:52                   ` Josef Bacik
  0 siblings, 0 replies; 23+ messages in thread
From: Josef Bacik @ 2017-07-03 13:52 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Josef Bacik, Minchan Kim, hannes, riel, akpm, linux-mm,
	kernel-team, Josef Bacik, mhocko, cl

On Sun, Jul 02, 2017 at 11:58:43AM +1000, Dave Chinner wrote:
> [I don't have the full context, haven't seen the proposed patches,
> etc so I'm only commenting on small bits of what I was cc'd on]
> 
> On Fri, Jun 30, 2017 at 11:03:24AM -0400, Josef Bacik wrote:
> > On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:
> > 
> > <snip>
> > 
> > > > 
> > > > Because this static step down wastes cycles.  Why loop 10 times when you could
> > > > set the target at actual usage and try to get everything in one go?  Most
> > > > shrinkable slabs adhere to this default of in use first model, which means that
> > > > we have to hit an object in the lru twice before it is freed.  So in order to
> > > 
> > > I didn't know that.
> > > 
> > > > reclaim anything we have to scan a slab cache's entire lru at least once before
> > > > any reclaim starts happening.  If we're doing this static step down thing we
> > > 
> > > If it's really true, I think that shrinker should be fixed first.
> > > 
> > 
> > Easier said than done.  I've fixed this for the super shrinkers, but like I said
> > below, all it takes is some asshole doing find / -exec stat {} \; twice to put
> > us back in the same situation again.  There's no aging mechanism other than
> > memory reclaim, so we get into this shitty situation of aging+reclaiming at the
> > same time.
> 
> To me, this sounds like the problem that needs fixing. i.e.
> threshold detection to trigger demand-based aging of shrinkable
> caches at allocation time rather than waiting for memory pressure to
> trigger aging.
> 
> > > I guess at first all objects in a shrinker could be in the INUSE state
> > > as you said; however, in the steady state they would work like a real LRU
> > > and reflect recency. Otherwise, I want to call it broken, and we shouldn't
> > > design the general slab aging model around those specific ones.
> > 
> > Yeah that's totally a valid argument to make, but the idea of coming up with
> > something completely different hurts my head, and I'm trying to fix this problem
> > right now, not in 6 cycles when we all finally agree on the new mechanism.
> 
> Add a shrinker callback to the slab allocation code that checks the
> rate of growth of the cache. If the cache is growing fast and
> getting large, run the shrinker every so often to slow down the rate
> of growth. The larger the cache, the slower the allowed rate of
> growth. It's the feedback loop that we are missing between slab
> allocation and slab reclaim that prevents us from controlling slab
> cache growth and aging cleanly.
> 
> This would also allow us to cap the size of caching slabs easily
> - something people have been asking us to do for years....
> 
> Note: we need to make a clear distinction between slabs used as
> "heap" memory and slabs used for allocating large numbers of objects
> that are cached for performance reasons. The problem here is cache
> management - the fact we are using slabs to allocate the memory for
> the objects is largely irrelevant...
> 
> > > > I expanded on this above, but I'll give a more concrete example.  Consider xfs
> > > > metadata, we allocate a slab object and read in our page, use it, and then free
> > > > the buffer, which puts it on the LRU list.  XFS marks this with an INUSE flag,
> > > > which must be cleared before it is free'd from the LRU.  We scan through the
> > > > LRU, clearing the INUSE flag and moving the object to the back of the LRU, but
> > > > not actually free'ing it.  This happens for all (well most) objects that end up
> > > > on the LRU, and this design pattern is used _everywhere_.  Until recently it was
> > > > used for the super shrinkers, but I changed that so thankfully the bulk of the
> > > > problem is gone.  However if you do a find / -exec stat {} \;, and then do it
> > > > again, you'll end up with the same scenario for the super shrinker.   There's no
> > > > aging except via pressure on the slabs, so worst case we always have to scan the
> > > > entire slab object lru once before we start reclaiming anything.  Being
> > > > aggressive here I think is ok, we have things in place to make sure we don't over
> > > > reclaim.
> > > 
> > > Thanks for the detailed example. Now I understand, but the question is: is it
> > > always true? I mean at the first stage (i.e., first population of objects), it
> > > seems to be, but at the steady stage, I guess some objects have INUSE and
> > > others don't, depending on access pattern, so it emulates the LRU model. No?
> > > 
> > 
> > Sort of.  Let's take icache/dcache.  We open a file and close a file; this gets
> > added to the LRU.  We open the file and close the file again; it's already on
> > the LRU so it stays where it was and gets the INUSE flag set.  Once an object is
> > on the LRU it doesn't move unless we hit it via the shrinker.  Even if we open
> > and close the file, and then open and keep it open, the file stays on the LRU,
> > and is only removed once the shrinker hits it, sees its refcount is > 1, and
> > removes it from the list.
> 
> Yes, but the lazy LRU removal is a performance feature - the cost
> in terms of lock contention of removing dentries and inodes from the LRU
> on first reference is prohibitive, especially for short term usage
> like 'find / -exec stat {} \;' workloads. Git does this sort of
> traverse/stat a lot, so making this path any slower would make lots
> of people unhappy...
>

Sure, we want it when it's useful, but oftentimes it ends up causing latencies
due to reclaim.  I like your aging idea; it would definitely help in our worst
case.  I'll give that a whirl after my vacation.  Thanks,

Josef 


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-03 13:50                   ` Josef Bacik
@ 2017-07-04  3:01                     ` Minchan Kim
  2017-07-04 13:21                       ` Josef Bacik
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2017-07-04  3:01 UTC (permalink / raw)
  To: Josef Bacik
  Cc: hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik, mhocko,
	cl, david

On Mon, Jul 03, 2017 at 09:50:07AM -0400, Josef Bacik wrote:
> On Mon, Jul 03, 2017 at 10:33:03AM +0900, Minchan Kim wrote:
> > Hello,
> > 
> > On Fri, Jun 30, 2017 at 11:03:24AM -0400, Josef Bacik wrote:
> > > On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:
> > > 
> > > <snip>
> > > 
> > > > > 
> > > > > Because this static step down wastes cycles.  Why loop 10 times when you could
> > > > > set the target at actual usage and try to get everything in one go?  Most
> > > > > shrinkable slabs adhere to this default of in use first model, which means that
> > > > > we have to hit an object in the lru twice before it is freed.  So in order to
> > > > 
> > > > I didn't know that.
> > > > 
> > > > > reclaim anything we have to scan a slab cache's entire lru at least once before
> > > > > any reclaim starts happening.  If we're doing this static step down thing we
> > > > 
> > > > If it's really true, I think that shrinker should be fixed first.
> > > > 
> > > 
> > > Easier said than done.  I've fixed this for the super shrinkers, but like I said
> > > below, all it takes is some asshole doing find / -exec stat {} \; twice to put
> > > us back in the same situation again.  There's no aging mechanism other than
> > > memory reclaim, so we get into this shitty situation of aging+reclaiming at the
> > > same time.
> > 
> > What's different from the normal page cache problem?
> > 
> 
> What's different is that reclaiming a page from pagecache gives you a page,
> while reclaiming 10k objects from slab may only give you one page if you are
> super unlucky.  I'm nothing in life if I'm not unlucky.
> 
> > It has the same problem you mentioned, so we need to peek at what the VM does
> > to address it.
> > 
> > It has two LRU lists, active and inactive, and maintains a 1:1 size ratio.
> > A new page goes on the inactive list, and if it is touched twice it is
> > promoted to the active list, which is the same problem.
> > However, once reclaim is triggered, the VM can quickly move pages from
> > active to inactive, clearing the referenced flag, until the ratio is matched.
> > So the VM can easily reclaim pages from the inactive list.
> > 
> > Can we apply a similar mechanism to the problematic slab?
> > How about adding shrink_slab(xxx, ACTIVE|INACTIVE) somewhere in the VM
> > to demote objects from the active list, and adding logic to move an
> > inactive object to the active list when a cache hit happens in the FS?
> > 
> 
> I did this too!  It worked out ok, but was a bit complex, and the problem was
> solved just as well by dropping the INUSE-first approach.  I think that Dave's
> approach of having a separate aging mechanism is a good complement to these
> patches.

There are two problems you are trying to address.

1. slab *page* reclaim

Your claim is that it's hard to reclaim a page due to slab fragmentation, so we
need to reclaim objects more aggressively.

Basically, aggressive scanning doesn't guarantee reclaiming a page; it just
increases the possibility.  Even considering that slab works with the merging
feature (i.e., it mixes several object types of the same size in one slab), the
possibility drops hugely if you try to bail out on a certain shrinker.  So to
work well, we would have to increase aggressiveness enough to sweep every object
from every shrinker; I guess that's why your patch makes the logic very
aggressive.  My concern with that aggressiveness is that it reclaims all objects
too early and ends up voiding the caching scheme.  I'm not sure it's a gain in
the end.

2. stream-workload

Your claim is that every object can have the INUSE flag in that workload, so we
need a full scan cycle to remove the flag before, in the next cycle, objects can
finally be reclaimed.  In that situation, static incremental scanning would
cause a deep priority drop, which wastes unnecessary CPU cycles.

Actually, there isn't a nice solution for that at the moment.  The page cache
tries to solve it with a multi-level LRU and, as you said, that would solve the
problem.  However, it would be too complicated, so you could be okay with Dave's
suggestion of periodic aging (i.e., LFU), but that isn't free, so it could
increase runtime latency.

The point is that such a workload is hard to solve in general, and blanket
aggressive scanning is not a good solution because it can sweep other shrinkers
which don't have this problem, so I hope it is solved by the specific shrinker
itself rather than at the general VM level.


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-04  3:01                     ` Minchan Kim
@ 2017-07-04 13:21                       ` Josef Bacik
  2017-07-04 22:57                         ` Dave Chinner
  2017-07-05  4:43                         ` Minchan Kim
  0 siblings, 2 replies; 23+ messages in thread
From: Josef Bacik @ 2017-07-04 13:21 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl, david

On Tue, Jul 04, 2017 at 12:01:00PM +0900, Minchan Kim wrote:
> On Mon, Jul 03, 2017 at 09:50:07AM -0400, Josef Bacik wrote:
> > On Mon, Jul 03, 2017 at 10:33:03AM +0900, Minchan Kim wrote:
> > > Hello,
> > > 
> > > On Fri, Jun 30, 2017 at 11:03:24AM -0400, Josef Bacik wrote:
> > > > On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:
> > > > 
> > > > <snip>
> > > > 
> > > > > > 
> > > > > > Because this static step down wastes cycles.  Why loop 10 times when you could
> > > > > > set the target at actual usage and try to get everything in one go?  Most
> > > > > > shrinkable slabs adhere to this default of in use first model, which means that
> > > > > > we have to hit an object in the lru twice before it is freed.  So in order to
> > > > > 
> > > > > I didn't know that.
> > > > > 
> > > > > > reclaim anything we have to scan a slab cache's entire lru at least once before
> > > > > > any reclaim starts happening.  If we're doing this static step down thing we
> > > > > 
> > > > > If it's really true, I think that shrinker should be fixed first.
> > > > > 
> > > > 
> > > > Easier said than done.  I've fixed this for the super shrinkers, but like I said
> > > > below, all it takes is some asshole doing find / -exec stat {} \; twice to put
> > > > us back in the same situation again.  There's no aging mechanism other than
> > > > memory reclaim, so we get into this shitty situation of aging+reclaiming at the
> > > > same time.
> > > 
> > > What's different from the normal page cache problem?
> > > 
> > 
> > What's different is that reclaiming a page from pagecache gives you a page,
> > while reclaiming 10k objects from slab may only give you one page if you are
> > super unlucky.  I'm nothing in life if I'm not unlucky.
> > 
> > > It has the same problem you mentioned, so we need to peek at what the VM does
> > > to address it.
> > > 
> > > It has two LRU lists, active and inactive, and maintains a 1:1 size ratio.
> > > A new page goes on the inactive list, and if it is touched twice it is
> > > promoted to the active list, which is the same problem.
> > > However, once reclaim is triggered, the VM can quickly move pages from
> > > active to inactive, clearing the referenced flag, until the ratio is matched.
> > > So the VM can easily reclaim pages from the inactive list.
> > > 
> > > Can we apply a similar mechanism to the problematic slab?
> > > How about adding shrink_slab(xxx, ACTIVE|INACTIVE) somewhere in the VM
> > > to demote objects from the active list, and adding logic to move an
> > > inactive object to the active list when a cache hit happens in the FS?
> > > 
> > 
> > I did this too!  It worked out ok, but was a bit complex, and the problem was
> > solved just as well by dropping the INUSE-first approach.  I think that Dave's
> > approach of having a separate aging mechanism is a good complement to these
> > patches.
> 
> There are two problems you are trying to address.
> 
> 1. slab *page* reclaim
> 
> Your claim is that it's hard to reclaim a page due to slab fragmentation, so we
> need to reclaim objects more aggressively.
> 
> Basically, aggressive scanning doesn't guarantee reclaiming a page; it just
> increases the possibility.  Even considering that slab works with the merging
> feature (i.e., it mixes several object types of the same size in one slab), the
> possibility drops hugely if you try to bail out on a certain shrinker.  So to
> work well, we would have to increase aggressiveness enough to sweep every object
> from every shrinker; I guess that's why your patch makes the logic very
> aggressive.  My concern with that aggressiveness is that it reclaims all objects
> too early and ends up voiding the caching scheme.  I'm not sure it's a gain in
> the end.
>

Well the fact is what we have doesn't work, and I've been staring at this
problem for a few months and I don't have a better solution.

And keep in mind we're talking about a purely slab workload, something that
isn't likely to be a common case.  And even if our scan target is 2x, we aren't
going to reclaim the entire cache before we bail out.  We only scan in
'batch_size' chunks, which generally is 1024.  In the worst case, where we have
one in-use object on every slab page we own, then yes we're fucked, but we're
still fucked with the current code; only with the current code it'll take us 20
minutes of looping in the VM vs. seconds scanning the whole list twice.
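
For reference, the structure I'm relying on in do_shrink_slab looks roughly
like this (simplified sketch, not the exact mm/vmscan.c code):

	/* never sweep the cache more than twice in one call */
	if (total_scan > freeable * 2)
		total_scan = freeable * 2;

	while (total_scan >= batch_size) {
		shrinkctl->nr_to_scan = batch_size;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)	/* shrinker asked us to stop */
			break;
		freed += ret;
		total_scan -= batch_size;
		cond_resched();
	}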

> 2. stream-workload
> 
> Your claim is that every object can have the INUSE flag in that workload, so we
> need a full scan cycle to remove the flag before, in the next cycle, objects
> can finally be reclaimed.  In that situation, static incremental scanning
> would cause a deep priority drop, which wastes unnecessary CPU cycles.
> 
> Actually, there isn't a nice solution for that at the moment.  The page cache
> tries to solve it with a multi-level LRU and, as you said, that would solve
> the problem.  However, it would be too complicated, so you could be okay with
> Dave's suggestion of periodic aging (i.e., LFU), but that isn't free, so it
> could increase runtime latency.
> 
> The point is that such a workload is hard to solve in general, and blanket
> aggressive scanning is not a good solution because it can sweep other
> shrinkers which don't have this problem, so I hope it is solved by the
> specific shrinker itself rather than at the general VM level.

The only problem I see here is that our shrinker list is just a list; there's no
order or anything, and we just walk through it one entry at a time.  We could mitigate
this problem by ordering the list based on objects, but this isn't necessarily a
good indication of overall size.  Consider xfs_buf, where each slab object is
also hiding 1 page, so for every slab object we free we also free 1 page.  This
may appear to be a smaller slab by object measures, but may actually be larger.
We could definitely make this aspect of the shrinker smarter, but these patches
here need to still be in place in general to solve the problem of us not being
aggressive enough currently.  Thanks,

Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-04 13:21                       ` Josef Bacik
@ 2017-07-04 22:57                         ` Dave Chinner
  2017-07-05  4:59                           ` Minchan Kim
  2017-07-05 13:33                           ` Josef Bacik
  2017-07-05  4:43                         ` Minchan Kim
  1 sibling, 2 replies; 23+ messages in thread
From: Dave Chinner @ 2017-07-04 22:57 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Minchan Kim, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl

On Tue, Jul 04, 2017 at 09:21:37AM -0400, Josef Bacik wrote:
> On Tue, Jul 04, 2017 at 12:01:00PM +0900, Minchan Kim wrote:
> > 1. slab *page* reclaim
> > 
> > Your claim is that it's hard to reclaim a page due to slab fragmentation, so we
> > need to reclaim objects more aggressively.
> > 
> > Basically, aggressive scanning doesn't guarantee reclaiming a page; it just
> > increases the possibility.  Even considering that slab works with the merging
> > feature (i.e., it mixes several object types of the same size in one slab), the
> > possibility drops hugely if you try to bail out on a certain shrinker.  So to
> > work well, we would have to increase aggressiveness enough to sweep every object
> > from every shrinker; I guess that's why your patch makes the logic very
> > aggressive.  My concern with that aggressiveness is that it reclaims all objects
> > too early and ends up voiding the caching scheme.  I'm not sure it's a gain in
> > the end.
> >
> 
> Well the fact is what we have doesn't work, and I've been staring at this
> problem for a few months and I don't have a better solution.
> 
> And keep in mind we're talking about a purely slab workload, something that
> isn't likely to be a common case.  And even if our scan target is 2x, we aren't
> going to reclaim the entire cache before we bail out.  We only scan in
> 'batch_size' chunks, which generally is 1024.  In the worst case, where we have
> one in-use object on every slab page we own, then yes we're fucked, but we're
> still fucked with the current code; only with the current code it'll take us 20
> minutes of looping in the VM vs. seconds scanning the whole list twice.

Right - this is where growth/allocation rate based aging scans
come into play, rather than waiting for the VM to hit some unknown
ceiling and do an unpredictable amount of scanning.

> > 2. stream-workload
> > 
> > Your claim is that every object can have the INUSE flag in that workload, so we
> > need a full scan cycle to remove the flag before, in the next cycle, objects
> > can finally be reclaimed.  In that situation, static incremental scanning
> > would cause a deep priority drop, which wastes unnecessary CPU cycles.
> > 
> > Actually, there isn't a nice solution for that at the moment.  The page cache
> > tries to solve it with a multi-level LRU and, as you said, that would solve
> > the problem.  However, it would be too complicated, so you could be okay with
> > Dave's suggestion of periodic aging (i.e., LFU), but that isn't free, so it
> > could increase runtime latency.
> > 
> > The point is that such a workload is hard to solve in general, and blanket
> > aggressive scanning is not a good solution because it can sweep other
> > shrinkers which don't have this problem, so I hope it is solved by the
> > specific shrinker itself rather than at the general VM level.
> 
> The only problem I see here is that our shrinker list is just a list; there's no
> order or anything, and we just walk through it one entry at a time.

That's because we don't really need an ordered list - all shrinkable
caches need to have the same amount of work done on them for a
given memory pressure. That's how we maintain balance between
caches.

> We could mitigate
> this problem by ordering the list based on objects, but this isn't necessarily a
> good indication of overall size.

shrinkers don't just run on caches, and some of the things they run
against have variable object size. Some of them report reclaimable
memory in bytes rather than object counts to the shrinker
infrastructure.  i.e. the shrinker infrastructure is abstracted
sufficiently that the accounting of memory used/reclaimed is defined
by the individual subsystems, not the shrinker infrastructure....
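
To illustrate, the whole contract is just two callbacks; here's a bare-bones
sketch of the in-tree API, with all the foo_* names and counters made up:

	static unsigned long foo_count(struct shrinker *s,
				       struct shrink_control *sc)
	{
		/* whatever unit the subsystem accounts in: objects, bytes, ... */
		return atomic_long_read(&foo_nr_reclaimable);
	}

	static unsigned long foo_scan(struct shrinker *s,
				      struct shrink_control *sc)
	{
		/* free up to sc->nr_to_scan units, return how many went away */
		return foo_reclaim(sc->nr_to_scan);
	}

	static struct shrinker foo_shrinker = {
		.count_objects	= foo_count,
		.scan_objects	= foo_scan,
		.seeks		= DEFAULT_SEEKS,
	};

register_shrinker(&foo_shrinker) is all it takes; the core never interprets
the units beyond feeding the count back into scan targets.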

> Consider xfs_buf, where each slab object is also hiding 1 page, so
> for every slab object we free we also free 1 page.

Well, that's a very simplistic view - there are objects that hold a
page, but it's a variable size object cache. An xfs_buf can point to
heap memory or multiple pages.

IOWs, the xfs_buf is not a *slab cache*. It's a *buffer cache*, but
we control its size via a shrinker because that's the only
mechanism we have that provides subsystems with memory pressure
callbacks. This is a clear example of what I said above about
shrinkers being much more than just a mechanism to control the size
of a slab...

> This may
> appear to be a smaller slab by object measures, but may actually
> be larger.

Right, but that's for the subsystem to sort out the working set
balance against all the other caches - the shrinker infrastructure
cannot determine how important different subsystems are relative to
each other, so memory reclaim must try to do the same percentage of
reclaim work across all of them so that everything remains globally
balanced.

My original plan years ago was to make the shrinker infrastructure
API work on "bytes" rather than "subsystem defined objects", but
there were so many broken shrinkers I burnt out before I got that
far....

My suggestion of allocation based aging callbacks is something for
specific caches to be able to run based on their own or the user's
size/growth/performance constraints. It's independent of memory
reclaim behaviour and so can be as strongly biased as the user wants.
Memory reclaim will just maintain whatever balance that exists
between the different caches as a result of the subsystem specific
aging callbacks.

> We could definitely make this aspect of the shrinker
> smarter, but these patches here need to still be in place in
> general to solve the problem of us not being aggressive enough
> currently.  Thanks,

Remember that the shrinker callback into a subsystem is just a
mechanism for scanning a subsystem's reclaim list and performing
reclaim. We're not limited to only calling them from
do_shrink_slab() - a historic name that doesn't reflect the reality
of shrinkers these days - if we have a superblock, we can call the
shrinker....

FWIW, we have per-object init callbacks in the slab infrastructure -
ever thought of maybe using them for controlling cache aging
behaviour? e.g. accounting to trigger background aging scans...
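
Something along these lines, say - purely a sketch, every foo_* name here is
invented and the batch threshold would need real tuning:

	/* slab ctor: runs once per object as slab pages are populated */
	static void foo_ctor(void *obj)
	{
		foo_init_object(obj);

		/* cheap accounting; kick an async aging scan every so often */
		if (atomic_long_inc_return(&foo_nr_objects) % FOO_AGING_BATCH == 0)
			queue_work(system_long_wq, &foo_aging_work);
	}

	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_RECLAIM_ACCOUNT, foo_ctor);

The aging work would then walk the cache's LRU clearing reference bits,
entirely outside the allocation and reclaim paths.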

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-04 13:21                       ` Josef Bacik
  2017-07-04 22:57                         ` Dave Chinner
@ 2017-07-05  4:43                         ` Minchan Kim
  1 sibling, 0 replies; 23+ messages in thread
From: Minchan Kim @ 2017-07-05  4:43 UTC (permalink / raw)
  To: Josef Bacik
  Cc: hannes, riel, akpm, linux-mm, kernel-team, Josef Bacik, mhocko,
	cl, david

On Tue, Jul 04, 2017 at 09:21:37AM -0400, Josef Bacik wrote:
> On Tue, Jul 04, 2017 at 12:01:00PM +0900, Minchan Kim wrote:
> > On Mon, Jul 03, 2017 at 09:50:07AM -0400, Josef Bacik wrote:
> > > On Mon, Jul 03, 2017 at 10:33:03AM +0900, Minchan Kim wrote:
> > > > Hello,
> > > > 
> > > > On Fri, Jun 30, 2017 at 11:03:24AM -0400, Josef Bacik wrote:
> > > > > On Fri, Jun 30, 2017 at 11:17:13AM +0900, Minchan Kim wrote:
> > > > > 
> > > > > <snip>
> > > > > 
> > > > > > > 
> > > > > > > Because this static step down wastes cycles.  Why loop 10 times when you could
> > > > > > > set the target at actual usage and try to get everything in one go?  Most
> > > > > > > shrinkable slabs adhere to this default of in use first model, which means that
> > > > > > > we have to hit an object in the lru twice before it is freed.  So in order to
> > > > > > 
> > > > > > I didn't know that.
> > > > > > 
> > > > > > > reclaim anything we have to scan a slab cache's entire lru at least once before
> > > > > > > any reclaim starts happening.  If we're doing this static step down thing we
> > > > > > 
> > > > > > If it's really true, I think that shrinker should be fixed first.
> > > > > > 
> > > > > 
> > > > > Easier said than done.  I've fixed this for the super shrinkers, but like I said
> > > > > below, all it takes is some asshole doing find / -exec stat {} \; twice to put
> > > > > us back in the same situation again.  There's no aging mechanism other than
> > > > > memory reclaim, so we get into this shitty situation of aging+reclaiming at the
> > > > > same time.
> > > > 
> > > > What's different from the normal page cache problem?
> > > > 
> > > 
> > > What's different is that reclaiming a page from pagecache gives you a page,
> > > while reclaiming 10k objects from slab may only give you one page if you are
> > > super unlucky.  I'm nothing in life if I'm not unlucky.
> > > 
> > > > It has the same problem you mentioned, so we need to peek at what the VM does
> > > > to address it.
> > > > 
> > > > It has two LRU lists, active and inactive, and maintains a 1:1 size ratio.
> > > > A new page goes on the inactive list, and if it is touched twice it is
> > > > promoted to the active list, which is the same problem.
> > > > However, once reclaim is triggered, the VM can quickly move pages from
> > > > active to inactive, clearing the referenced flag, until the ratio is matched.
> > > > So the VM can easily reclaim pages from the inactive list.
> > > > 
> > > > Can we apply a similar mechanism to the problematic slab?
> > > > How about adding shrink_slab(xxx, ACTIVE|INACTIVE) somewhere in the VM
> > > > to demote objects from the active list, and adding logic to move an
> > > > inactive object to the active list when a cache hit happens in the FS?
> > > > 
> > > 
> > > I did this too!  It worked out ok, but was a bit complex, and the problem was
> > > solved just as well by dropping the INUSE-first approach.  I think that Dave's
> > > approach of having a separate aging mechanism is a good complement to these
> > > patches.
> > 
> > There are two problems you are trying to address.
> > 
> > 1. slab *page* reclaim
> > 
> > Your claim is that it's hard to reclaim a page due to slab fragmentation, so we
> > need to reclaim objects more aggressively.
> > 
> > Basically, aggressive scanning doesn't guarantee reclaiming a page; it just
> > increases the possibility.  Even considering that slab works with the merging
> > feature (i.e., it mixes several object types of the same size in one slab), the
> > possibility drops hugely if you try to bail out on a certain shrinker.  So to
> > work well, we would have to increase aggressiveness enough to sweep every object
> > from every shrinker; I guess that's why your patch makes the logic very
> > aggressive.  My concern with that aggressiveness is that it reclaims all objects
> > too early and ends up voiding the caching scheme.  I'm not sure it's a gain in
> > the end.
> >
> 
> Well the fact is what we have doesn't work, and I've been staring at this
> problem for a few months and I don't have a better solution.
> 
> And keep in mind we're talking about a purely slab workload, something that
> isn't likely to be a common case.  And even if our scan target is 2x, we aren't
> going to reclaim the entire cache before we bail out.  We only scan in
> > 'batch_size' chunks, which generally is 1024.  In the worst case, where we have

I replied to your new patchset. It breaks fair aging.

> one in-use object on every slab page we own, then yes we're fucked, but we're
> still fucked with the current code; only with the current code it'll take us 20
> minutes of looping in the VM vs. seconds scanning the whole list twice.

Don't get me wrong. As I replied in the first discussion, I tend to agree with
increasing aggressiveness. What I dislike is that your patch increases it too
much via the SLAB/LRU ratio. It can reclaim all of the objects even though
memory pressure is not severe, just because the LRU is small and the SLAB is
big.

You said we can bail out, but that breaks fair aging between shrinkers, as I
noted in my reply to the new patch.

As well, the gslab/lru ratio is totally out of the VM's control.
IMO, the VM wants to define the hot working set as the pages that survive
roughly two full scans (each reclaim cycle scans 1/4096 + 1/2048 + ... + 1/1
of the LRU, which sums to about 2), so if the VM cannot see any progress in
that full scan within MAX_RECLAIM_RETRIES (16), it decides to kill some
process even though there are reclaimable pages left.

Like that, the VM needs some guideline it can guarantee. However, if we use
the SLAB/LRU ratio, slab can be scanned twice in full from the very beginning,
which means up to 24 full scans in one reclaim iteration. Also, it changes
with the SLAB/LRU ratio, so it's really unpredictable.

> 
> > 2. stream-workload
> > 
> > Your claim is that every object can have the INUSE flag in that workload, so we
> > need a full scan cycle to remove the flag before, in the next cycle, objects
> > can finally be reclaimed.  In that situation, static incremental scanning
> > would cause a deep priority drop, which wastes unnecessary CPU cycles.
> > 
> > Actually, there isn't a nice solution for that at the moment.  The page cache
> > tries to solve it with a multi-level LRU and, as you said, that would solve
> > the problem.  However, it would be too complicated, so you could be okay with
> > Dave's suggestion of periodic aging (i.e., LFU), but that isn't free, so it
> > could increase runtime latency.
> > 
> > The point is that such a workload is hard to solve in general, and blanket
> > aggressive scanning is not a good solution because it can sweep other
> > shrinkers which don't have this problem, so I hope it is solved by the
> > specific shrinker itself rather than at the general VM level.
> 
> The only problem I see here is that our shrinker list is just a list; there's no
> order or anything, and we just walk through it one entry at a time.  We could mitigate

I don't get it. Why do you think an ordered list solves this stream-workload
issue?

> this problem by ordering the list based on objects, but this isn't necessarily a
> good indication of overall size.  Consider xfs_buf, where each slab object is
> also hiding 1 page, so for every slab object we free we also free 1 page.  This
> may appear to be a smaller slab by object measures, but may actually be larger.
> We could definitely make this aspect of the shrinker smarter, but these patches
> here need to still be in place in general to solve the problem of us not being
> aggressive enough currently.  Thanks,
> 
> Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-04 22:57                         ` Dave Chinner
@ 2017-07-05  4:59                           ` Minchan Kim
  2017-07-05 23:58                             ` Dave Chinner
  2017-07-05 13:33                           ` Josef Bacik
  1 sibling, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2017-07-05  4:59 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl

Hi Dave,

On Wed, Jul 05, 2017 at 08:57:58AM +1000, Dave Chinner wrote:
> On Tue, Jul 04, 2017 at 09:21:37AM -0400, Josef Bacik wrote:
> > On Tue, Jul 04, 2017 at 12:01:00PM +0900, Minchan Kim wrote:
> > > 1. slab *page* reclaim
> > > 
> > > Your claim is that it's hard to reclaim a page due to slab fragmentation, so we
> > > need to reclaim objects more aggressively.
> > > 
> > > Basically, aggressive scanning doesn't guarantee reclaiming a page; it just
> > > increases the possibility.  Even considering that slab works with the merging
> > > feature (i.e., it mixes several object types of the same size in one slab), the
> > > possibility drops hugely if you try to bail out on a certain shrinker.  So to
> > > work well, we would have to increase aggressiveness enough to sweep every object
> > > from every shrinker; I guess that's why your patch makes the logic very
> > > aggressive.  My concern with that aggressiveness is that it reclaims all objects
> > > too early and ends up voiding the caching scheme.  I'm not sure it's a gain in
> > > the end.
> > >
> > 
> > Well the fact is what we have doesn't work, and I've been staring at this
> > problem for a few months and I don't have a better solution.
> > 
> > And keep in mind we're talking about a purely slab workload, something that
> > isn't likely to be a common case.  And even if our scan target is 2x, we aren't
> > going to reclaim the entire cache before we bail out.  We only scan in
> > 'batch_size' chunks, which generally is 1024.  In the worst case, where we have
> > one in-use object on every slab page we own, then yes we're fucked, but we're
> > still fucked with the current code; only with the current code it'll take us 20
> > minutes of looping in the VM vs. seconds scanning the whole list twice.
> 
> Right - this is where growth/allocation rate based aging scans
> come into play, rather than waiting for the VM to hit some unknown
> ceiling and do an unpredictable amount of scanning.

http://www.spinics.net/lists/linux-mm/msg129470.html

I suggested a static scanning increase (1/12 + 2/12 + 3/12 ...), which is
more aggressive compared to what we have now. With this, in a reclaim cycle
(priority 12..0), we guarantee scanning the entire object list four times
while the LRU is scanned two times. Although I believe we don't need four
times (i.e., two times is enough), it's just a compromise with Josef's much
too aggressive slab reclaim.
It would be more predictable and aggressive from the VM's point of view.

If some shrinker cannot be happy with this policy, it could accelerate the
scanning for only that shrinker under the shrink_slab call, although I don't
like that because it's out of the VM's control, so I'm okay with your
per-shrinker aging callback independent of shrink_slab. My point is that if
some shrinker is painful to reclaim, it should have its own model to solve
that rather than making the general slab reclaim strategy very aggressive.

However, Josef's current approach changes the slab scanning aggressiveness
by using the SLAB/LRU ratio (please see his patchset 3/4) and bails out only
once sc->nr_to_reclaim is met. To me, it's much too aggressive and breaks
fair aging between shrinkers.

> 
> > > 2. stream-workload
> > > 
> > > Your claim is that every object can have the INUSE flag in that workload, so we
> > > need a full scan cycle to remove the flag before, in the next cycle, objects
> > > can finally be reclaimed.  In that situation, static incremental scanning
> > > would cause a deep priority drop, which wastes unnecessary CPU cycles.
> > > 
> > > Actually, there isn't a nice solution for that at the moment.  The page cache
> > > tries to solve it with a multi-level LRU and, as you said, that would solve
> > > the problem.  However, it would be too complicated, so you could be okay with
> > > Dave's suggestion of periodic aging (i.e., LFU), but that isn't free, so it
> > > could increase runtime latency.
> > > 
> > > The point is that such a workload is hard to solve in general, and blanket
> > > aggressive scanning is not a good solution because it can sweep other
> > > shrinkers which don't have this problem, so I hope it is solved by the
> > > specific shrinker itself rather than at the general VM level.
> > 
> > The only problem I see here is that our shrinker list is just a list; there's no
> > order or anything, and we just walk through it one entry at a time.
> 
> That's because we don't really need an ordered list - all shrinkable
> caches need to have the same amount of work done on them for a
> given memory pressure. That's how we maintain balance between
> caches.

True.

> 
> > We could mitigate
> > this problem by ordering the list based on objects, but this isn't necessarily a
> > good indication of overall size.
> 
> shrinkers don't just run on caches, and some of the things they run
> against have variable object size. Some of them report reclaimable
> memory in bytes rather than object counts to the shrinker
> infrastructure.  i.e. the shrinker infrastructure is abstracted
> sufficiently that the accounting of memory used/reclaimed is defined
> by the individual subsystems, not the shrinker infrastructure....
> 
> > Consider xfs_buf, where each slab object is also hiding 1 page, so
> > for every slab object we free we also free 1 page.
> 
> Well, that's a very simplistic view - there are objects that hold a
> page, but it's a variable size object cache. An xfs_buf can point to
> heap memory or multiple pages.
> 
> IOWs, the xfs_buf is not a *slab cache*. It's a *buffer cache*, but
> we control its size via a shrinker because that's the only
> mechanism we have that provides subsystems with memory pressure
> callbacks. This is a clear example of what I said above about
> shrinkers being much more than just a mechanism to control the size
> of a slab...
> 
> > This may
> > appear to be a smaller slab by object measures, but may actually
> > be larger.
> 
> Right, but that's for the subsystem to sort out the working set
> balance against all the other caches - the shrinker infrastructure
> cannot determine how important different subsystems are relative to
> each other, so memory reclaim must try to do the same percentage of
> reclaim work across all of them so that everything remains globally
> balanced.

True.

> 
> My original plan years ago was to make the shrinker infrastructure
> API work on "bytes" rather than "subsystem defined objects", but
> there were so many broken shrinkers I burnt out before I got that
> far....
> 
> My suggestion of allocation based aging callbacks is something for
> specific caches to be able to run based on their own or the user's
> size/growth/performance constraints. It's independent of memory
> reclaim behaviour and so can be as strongly biased as the user wants.
> Memory reclaim will just maintain whatever balance that exists
> between the different caches as a result of the subsystem specific
> aging callbacks.

Absolutely agree.

> 
> > We could definitely make this aspect of the shrinker
> > smarter, but these patches here need to still be in place in
> > general to solve the problem of us not being aggressive enough
> > currently.  Thanks,
> 
> Remember that the shrinker callback into a subsystem is just a
> mechanism for scanning a subsystem's reclaim list and performing
> reclaim. We're not limited to only calling them from
> do_shrink_slab() - a historic name that doesn't reflect the reality
> of shrinkers these days - if we have a superblock, we can call the
> shrinker....
> 
> FWIW, we have per-object init callbacks in the slab infrastructure -
> ever thought of maybe using them for controlling cache aging
> behaviour? e.g. accounting to trigger background aging scans...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-04 22:57                         ` Dave Chinner
  2017-07-05  4:59                           ` Minchan Kim
@ 2017-07-05 13:33                           ` Josef Bacik
  2017-07-05 23:30                             ` Dave Chinner
  1 sibling, 1 reply; 23+ messages in thread
From: Josef Bacik @ 2017-07-05 13:33 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Josef Bacik, Minchan Kim, hannes, riel, akpm, linux-mm,
	kernel-team, Josef Bacik, mhocko, cl

On Wed, Jul 05, 2017 at 08:57:58AM +1000, Dave Chinner wrote:
> On Tue, Jul 04, 2017 at 09:21:37AM -0400, Josef Bacik wrote:
> > On Tue, Jul 04, 2017 at 12:01:00PM +0900, Minchan Kim wrote:
> > > 1. slab *page* reclaim
> > > 
> > > Your claim is that it's hard to reclaim a page due to slab fragmentation, so we
> > > need to reclaim objects more aggressively.
> > > 
> > > Basically, aggressive scanning doesn't guarantee reclaiming a page; it just
> > > increases the possibility.  Even considering that slab works with the merging
> > > feature (i.e., it mixes several object types of the same size in one slab), the
> > > possibility drops hugely if you try to bail out on a certain shrinker.  So to
> > > work well, we would have to increase aggressiveness enough to sweep every object
> > > from every shrinker; I guess that's why your patch makes the logic very
> > > aggressive.  My concern with that aggressiveness is that it reclaims all objects
> > > too early and ends up voiding the caching scheme.  I'm not sure it's a gain in
> > > the end.
> > >
> > 
> > Well the fact is what we have doesn't work, and I've been staring at this
> > problem for a few months and I don't have a better solution.
> > 
> > And keep in mind we're talking about a purely slab workload, something that
> > isn't likely to be a common case.  And even if our scan target is 2x, we aren't
> > going to reclaim the entire cache before we bail out.  We only scan in
> > 'batch_size' chunks, which generally is 1024.  In the worst case, where we have
> > one in-use object on every slab page we own, then yes we're fucked, but we're
> > still fucked with the current code; only with the current code it'll take us 20
> > minutes of looping in the VM vs. seconds scanning the whole list twice.
> 
> Right - this is where growth/allocation rate based aging scans
> come into play, rather than waiting for the VM to hit some unknown
> ceiling and do an unpredictable amount of scanning.
> 
> > > 2. stream-workload
> > > 
> > Your claim is that every object can have the INUSE flag in that workload, so we
> > need a full scan cycle to remove the flag before, in the next cycle, objects
> > can finally be reclaimed.  In that situation, static incremental scanning
> > would cause a deep priority drop, which wastes unnecessary CPU cycles.
> > 
> > Actually, there isn't a nice solution for that at the moment.  The page cache
> > tries to solve it with a multi-level LRU and, as you said, that would solve
> > the problem.  However, it would be too complicated, so you could be okay with
> > Dave's suggestion of periodic aging (i.e., LFU), but that isn't free, so it
> > could increase runtime latency.
> > 
> > The point is that such a workload is hard to solve in general, and blanket
> > aggressive scanning is not a good solution because it can sweep other
> > shrinkers which don't have this problem, so I hope it is solved by the
> > specific shrinker itself rather than at the general VM level.
> > 
> > The only problem I see here is that our shrinker list is just a list; there's no
> > order or anything, and we just walk through it one entry at a time.
> 
> That's because we don't really need an ordered list - all shrinkable
> caches need to have the same amount of work done on them for a
> given memory pressure. That's how we maintain balance between
> caches.
> 
> > We could mitigate
> > this problem by ordering the list based on objects, but this isn't necessarily a
> > good indication of overall size.
> 
> shrinkers don't just run on caches, and some of the things they run
> against have variable object size. Some of them report reclaimable
> memory in bytes rather than object counts to the shrinker
> infrastructure.  i.e. the shrinker infrastructure is abstracted
> sufficiently that the accounting of memory used/reclaimed is defined
> by the individual subsystems, not the shrinker infrastructure....
> 
> > Consider xfs_buf, where each slab object is also hiding 1 page, so
> > for every slab object we free we also free 1 page.
> 
> Well, that's a very simplistic view - there are objects that hold a
> page, but it's a variable size object cache. An xfs_buf can point to
> heap memory or multiple pages.
> 
> IOWs, the xfs_buf is not a *slab cache*. It's a *buffer cache*, but
> we control its size via a shrinker because that's the only
> mechanism we have that provides subsystems with memory pressure
> callbacks. This is a clear example of what I said above about
> shrinkers being much more than just a mechanism to control the size
> of a slab...
> 
> > This may
> > appear to be a smaller slab by object measures, but may actually
> > be larger.
> 
> Right, but that's for the subsystem to sort out the working set
> balance against all the other caches - the shrinker infrastructure
> cannot determine how important different subsystems are relative to
> each other, so memory reclaim must try to do the same percentage of
> reclaim work across all of them so that everything remains globally
> balanced.
> 
> My original plan years ago was to make the shrinker infrastructure
> API work on "bytes" rather than "subsystem defined objects", but
> there were so many broken shrinkers I burnt out before I got that
> far....
> 

Ha I ran into the same problem.  The device-mapper ones were particularly
tricky.  This would make it easier for the VM to be fairer, so maybe I just need
to suck it up and do this work.

> My suggestion of allocation based aging callbacks is something for
> specific caches to be able to run based on their own or the user's
> size/growth/performance constraints. It's independent of memory
> reclaim behaviour and so can be as strongly biased as the user wants.
> Memory reclaim will just maintain whatever balance that exists
> between the different caches as a result of the subsystem specific
> aging callbacks.
> 

Ok so how does a scheme like this look?  The shrinking stuff can be relatively
heavy because generally speaking it's always run asynchronously by kswapd, so
the only latency it induces to normal workloads is the CPU time it takes away
from processes we care about.

With an aging callback at allocation time we're inducing latency for the user on
every allocation.  So we want to do as little as possible here, but what do we
need to determine if there's something to do?  Do we just have a static "I'm
over limit X objects, start a worker thread to check if we need to reclaim"?  Or
do we have it be actually smart, checking the overall count and checking it
against some configurable growth rate?  That's going to be expensive on a per
allocation basis.

I'm having a hard time envisioning how this works in a way that doesn't induce
a bunch of latency.
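
(To make the static option concrete, here's a sketch of what I'd consider
acceptable in the hot path - one counter increment and a compare, with
everything heavy punted to a workqueue.  All the my_* names and the threshold
policy are invented:)

	static inline void my_cache_account_alloc(struct my_cache_stats *st)
	{
		/* common case: just the increment and one compare */
		if (atomic_long_inc_return(&st->nr_allocs) < st->aging_threshold)
			return;

		atomic_long_set(&st->nr_allocs, 0);
		/* growth-rate checks and LRU walking happen async */
		queue_work(system_unbound_wq, &st->aging_work);
	}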

> > We could definitely make this aspect of the shrinker
> > smarter, but these patches here need to still be in place in
> > general to solve the problem of us not being aggressive enough
> > currently.  Thanks,
> 
> Remember that the shrinker callback into a subsystem is just a
> mechanism for scanning a subsystem's reclaim list and performing
> reclaim. We're not limited to only calling them from
> do_shrink_slab() - a historic name that doesn't reflect the reality
> of shrinkers these days - if we have a superblock, we can call the
> shrinker....
> 
> FWIW, we have per-object init callbacks in the slab infrastructure -
> ever thought of maybe using them for controlling cache aging
> behaviour? e.g. accounting to trigger background aging scans...

I suppose we could keep track of how often we're allocating pages per slab cache
that has a shrinker, and use that as a basis for when to kick off background
aging.  I'm fixing to go on vacation for a few days, I'll think about this some
more and come up with something when I get home next week.  Thanks,

Josef


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-05 13:33                           ` Josef Bacik
@ 2017-07-05 23:30                             ` Dave Chinner
  0 siblings, 0 replies; 23+ messages in thread
From: Dave Chinner @ 2017-07-05 23:30 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Minchan Kim, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl

On Wed, Jul 05, 2017 at 09:33:45AM -0400, Josef Bacik wrote:
> On Wed, Jul 05, 2017 at 08:57:58AM +1000, Dave Chinner wrote:
> > My suggestion of allocation based aging callbacks is something for
> > specific caches to be able to run based on their own or the users
> > size/growth/performance constraints. It's independent of memory
> > reclaim behaviour and so can be as strongly biased as the user wants.
> > Memory reclaim will just maintain whatever balance that exists
> > between the different caches as a result of the subsystem specific
> > aging callbacks.
> > 
> 
> Ok so how does a scheme like this look?  The shrinking stuff can be
> relatively heavy, because generally speaking it's always run
> asynchronously by kswapd, so the only latency it induces in normal
> workloads is the CPU time it takes away from the processes we care
> about.
> 
> With an aging callback at allocation time we're inducing latency for
> the user on every allocation.  So we want to do as little as possible
> here, but what do we need in order to determine whether there's
> something to do?  Do we just have a static "I'm over limit X objects,
> start a worker thread to check if we need to reclaim"?  Or do we have
> it be actually smart, checking the overall count against some
> configurable growth rate?  That's going to be expensive on a
> per-allocation basis.
> 
> I'm having a hard time envisioning a version of this that doesn't
> induce a bunch of latency.

I was thinking the aging would also be async, like kswapd, and the
only thing the allocation does is accounting.  The actual aging scans
don't need to be done in the foreground and get in the way of the
current allocation, because aging doesn't need precise control or
behaviour...
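
Something like this is all I'm imagining - a delayed work item that
periodically pokes the cache's own shrinker, entirely outside of
do_shrink_slab() (names and numbers are illustrative only):

	static struct shrinker foo_shrinker;	/* registered as usual */
	static atomic_long_t foo_alloc_count;	/* bumped at allocation */
	/* INIT_DELAYED_WORK(&foo_aging_work, foo_aging_workfn) at init */
	static struct delayed_work foo_aging_work;

	static void foo_aging_workfn(struct work_struct *work)
	{
		struct shrink_control sc = {
			.gfp_mask	= GFP_KERNEL,
			.nid		= numa_node_id(),
			.nr_to_scan	= 1024,		/* gentle batch */
		};

		foo_shrinker.scan_objects(&foo_shrinker, &sc);

		/* keep aging while the cache keeps growing */
		if (atomic_long_xchg(&foo_alloc_count, 0) > 0)
			schedule_delayed_work(&foo_aging_work, HZ);
	}

The allocation side then collapses to an atomic_long_inc() and, when a
threshold is crossed, a schedule_delayed_work() - that's the
"accounting only" part.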

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-05  4:59                           ` Minchan Kim
@ 2017-07-05 23:58                             ` Dave Chinner
  2017-07-06  3:56                               ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Dave Chinner @ 2017-07-05 23:58 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl

On Wed, Jul 05, 2017 at 01:59:12PM +0900, Minchan Kim wrote:
> Hi Dave,
> 
> On Wed, Jul 05, 2017 at 08:57:58AM +1000, Dave Chinner wrote:
> > On Tue, Jul 04, 2017 at 09:21:37AM -0400, Josef Bacik wrote:
> > > On Tue, Jul 04, 2017 at 12:01:00PM +0900, Minchan Kim wrote:
> > > > 1. slab *page* reclaim
> > > > 
> > > > Your claim is that it's hard to reclaim a page because of slab
> > > > fragmentation, so we need to reclaim objects more aggressively.
> > > > 
> > > > Basically, aggressive scanning doesn't guarantee reclaiming a page,
> > > > it just increases the possibility.  Even accounting for the slab
> > > > merging feature (i.e., several object types of the same size mixed
> > > > in one slab), the possibility drops hugely if you bail out on a
> > > > certain shrinker.  So to work well we would have to increase the
> > > > aggressiveness enough to sweep every object from every shrinker.
> > > > I guess that's why your patch makes the logic very aggressive.  My
> > > > concern with that aggressiveness is that we reclaim all objects too
> > > > early and end up voiding the caching scheme.  I'm not sure it's a
> > > > gain in the end.
> > > >
> > > 
> > > Well the fact is what we have doesn't work, and I've been staring at
> > > this problem for a few months and I don't have a better solution.
> > > 
> > > And keep in mind we're talking about a purely slab workload, something
> > > that isn't likely to be a common case.  And even if our scan target is
> > > 2x, we aren't going to reclaim the entire cache before we bail out.  We
> > > only scan in 'batch_size' chunks, which is generally 1024.  In the
> > > worst case, where we have one in-use object on every slab page we own,
> > > then yes we're fucked - but we're still fucked with the current code,
> > > only with the current code it'll take us 20 minutes of looping in the
> > > vm vs. seconds scanning the whole list twice.
> > 
> > Right - this is where growth/allocation rate based aging scans
> > come into play, rather than waiting for the VM to hit some unknown
> > ceiling and do an unpredictable amount of scanning.
> 
> http://www.spinics.net/lists/linux-mm/msg129470.html
> 
> I suggested a static scanning increase (1/12 + 2/12 + 3/12...) which is
> more aggressive than what we have now.  With this, in a reclaim cycle
> (priority 12..0), we guarantee scanning the entire object list four
> times while the LRU is scanned twice.  Although I believe we don't need
> four passes (i.e., two would be enough), it's a compromise with Josef's
> much more aggressive slab reclaim.  It would be more predictable and
> aggressive from the VM's point of view.

Yes, I read and understood that post, but you are talking about
changing reclaim behaviour when there is low memory, not dealing
with the aging problem. It's a brute force big hammer approach, yet
we already know that increasingly aggressive reclaim of caches is a
problem for filesystems in that it drives us into OOM conditions
faster, i.e. aggressive shrinker reclaim trashes the working set of
cached filesystem metadata and so increases the GFP_NOFS memory
allocation demand required by the filesystem at times of critically
low memory....

> If a shrinker cannot be happy with this policy, it could accelerate the
> scanning for only that shrinker under the shrink_slab call, although I
> don't like that because it's out of the VM's control, so I'm okay with
> your per-shrinker aging callback independent of shrink_slab.  My point
> is that if a shrinker is painful to reclaim from, it should have its
> own model to solve that rather than making the general slab reclaim
> strategy very aggressive.

We already do this in various filesystems.  The issue is that we
don't know that we should reclaim caches until shrinker callbacks
start happening, i.e. there's *no feedback mechanism* that allows us
to age shrinker-controlled caches over time.  Memory reclaim is a
feedback loop, but if we never hit low memory then it's never
invoked until we actually run out of memory, and so it drowns in
aging work rather than reclaim work when it does get run.

Stop looking at the code and start thinking about the architecture -
how the subsystems connect and what control/feedback mechanisms are
required to allow them to work correctly together. We solve balance
and breakdown problems by identifying the missing/sub-optimal
feedback loops and fixing them. In this case, what we are missing is
the mechanism to detect and control "cache growth in single use
workloads when there is no memory pressure". Sustained cache
allocation should trigger some amount of aging regardless of how
much free memory we have, otherwise we simply fill up memory with
objects we're never going to use again.....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation
  2017-07-05 23:58                             ` Dave Chinner
@ 2017-07-06  3:56                               ` Minchan Kim
  0 siblings, 0 replies; 23+ messages in thread
From: Minchan Kim @ 2017-07-06  3:56 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Josef Bacik, hannes, riel, akpm, linux-mm, kernel-team,
	Josef Bacik, mhocko, cl

On Thu, Jul 06, 2017 at 09:58:23AM +1000, Dave Chinner wrote:
> On Wed, Jul 05, 2017 at 01:59:12PM +0900, Minchan Kim wrote:
> > Hi Dave,
> > 
> > On Wed, Jul 05, 2017 at 08:57:58AM +1000, Dave Chinner wrote:
> > > On Tue, Jul 04, 2017 at 09:21:37AM -0400, Josef Bacik wrote:
> > > > On Tue, Jul 04, 2017 at 12:01:00PM +0900, Minchan Kim wrote:
> > > > > 1. slab *page* reclaim
> > > > > 
> > > > > Your claim is that it's hard to reclaim a page because of slab
> > > > > fragmentation, so we need to reclaim objects more aggressively.
> > > > > 
> > > > > Basically, aggressive scanning doesn't guarantee reclaiming a page,
> > > > > it just increases the possibility.  Even accounting for the slab
> > > > > merging feature (i.e., several object types of the same size mixed
> > > > > in one slab), the possibility drops hugely if you bail out on a
> > > > > certain shrinker.  So to work well we would have to increase the
> > > > > aggressiveness enough to sweep every object from every shrinker.
> > > > > I guess that's why your patch makes the logic very aggressive.  My
> > > > > concern with that aggressiveness is that we reclaim all objects too
> > > > > early and end up voiding the caching scheme.  I'm not sure it's a
> > > > > gain in the end.
> > > > >
> > > > 
> > > > Well the fact is what we have doesn't work, and I've been staring at
> > > > this problem for a few months and I don't have a better solution.
> > > > 
> > > > And keep in mind we're talking about a purely slab workload, something
> > > > that isn't likely to be a common case.  And even if our scan target is
> > > > 2x, we aren't going to reclaim the entire cache before we bail out.  We
> > > > only scan in 'batch_size' chunks, which is generally 1024.  In the
> > > > worst case, where we have one in-use object on every slab page we own,
> > > > then yes we're fucked - but we're still fucked with the current code,
> > > > only with the current code it'll take us 20 minutes of looping in the
> > > > vm vs. seconds scanning the whole list twice.
> > > 
> > > Right - this is where growth/allocation rate based aging scans
> > > come into play, rather than waiting for the VM to hit some unknown
> > > ceiling and do an unpredictable amount of scanning.
> > 
> > http://www.spinics.net/lists/linux-mm/msg129470.html
> > 
> > I suggested a static scanning increase (1/12 + 2/12 + 3/12...) which is
> > more aggressive than what we have now.  With this, in a reclaim cycle
> > (priority 12..0), we guarantee scanning the entire object list four
> > times while the LRU is scanned twice.  Although I believe we don't need
> > four passes (i.e., two would be enough), it's a compromise with Josef's
> > much more aggressive slab reclaim.  It would be more predictable and
> > aggressive from the VM's point of view.
> 
> Yes, I read and understood that post, but you are talking about
> changing reclaim behaviour when there is low memory, not dealing
> with the aging problem. It's a brute force big hammer approach, yet

The reason I mentioned reclaim behaviour is that his patch makes it
really aggressive, and one of the reasons he did that is that we cannot
free a full slab page because of slab fragmentation.  That's what I
really dislike.  If slab fragmentation is really the issue, it should
be solved by [anti|de]-fragmentation or whatever, not by increasing
reclaim aggressiveness, which is just a band-aid and could even hurt
system performance via heavy reclaiming.

> we already know that increasingly aggressive reclaim of caches is a
> problem for filesystems in that it drives us into OOM conditions
> faster, i.e. aggressive shrinker reclaim trashes the working set of
> cached filesystem metadata and so increases the GFP_NOFS memory

True.  That is my point, so please do not make reclaim behaviour
much too aggressive.

> allocation demand required by the filesystem at times of critically
> low memory....

Indeed.

> 
> > If a shrinker cannot be happy with this policy, it could accelerate the
> > scanning for only that shrinker under the shrink_slab call, although I
> > don't like that because it's out of the VM's control, so I'm okay with
> > your per-shrinker aging callback independent of shrink_slab.  My point
> > is that if a shrinker is painful to reclaim from, it should have its
> > own model to solve that rather than making the general slab reclaim
> > strategy very aggressive.
> 
> We already do this in various filesystems.  The issue is that we
> don't know that we should reclaim caches until shrinker callbacks
> start happening, i.e. there's *no feedback mechanism* that allows us
> to age shrinker-controlled caches over time.  Memory reclaim is a
> feedback loop, but if we never hit low memory then it's never
> invoked until we actually run out of memory, and so it drowns in
> aging work rather than reclaim work when it does get run.

Yes, it's a problem for the page cache as well as the slab cache.
In the case of the page cache, as you know well, the VM makes a best
effort with two-level LRU promotion/demotion.  Although it's not
perfect, it has worked.

Couldn't the FS slab caches handle aging more smartly?  The LFU you
suggested would be better than what we have, but it would be hard to
avoid the latency.

Anyway, I'm okay with each shrinker having its own aging method, as
long as it doesn't make slab reclaim behaviour much more aggressive
without a convincing reason.

> 
> Stop looking at the code and start thinking about the architecture -
> how the subsystems connect and what control/feedback mechanisms are
> required to allow them to work correctly together. We solve balance
> and breakdown problems by identifying the missing/sub-optimal
> feedback loops and fixing them. In this case, what we are missing is
> the mechanism to detect and control "cache growth in single use
> workloads when there is no memory pressure". Sustained cache
> allocation should trigger some amount of aging regardless of how
> much free memory we have, otherwise we simply fill up memory with
> objects we're never going to use again.....
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com


Thread overview: 23+ messages
2017-06-08 19:19 [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation josef
2017-06-08 19:19 ` [PATCH 2/2] mm: make kswapd try harder to keep active pages in cache josef
2017-06-13  5:28 ` [PATCH 1/2] mm: use slab size in the slab shrinking ratio calculation Minchan Kim
2017-06-13 12:01   ` Josef Bacik
2017-06-14  6:40     ` Minchan Kim
2017-06-19 15:11       ` Josef Bacik
2017-06-20  2:46         ` Minchan Kim
2017-06-27 13:59           ` Josef Bacik
2017-06-30  2:17             ` Minchan Kim
2017-06-30 15:03               ` Josef Bacik
2017-07-02  1:58                 ` Dave Chinner
2017-07-03 13:52                   ` Josef Bacik
2017-07-03  1:33                 ` Minchan Kim
2017-07-03 13:50                   ` Josef Bacik
2017-07-04  3:01                     ` Minchan Kim
2017-07-04 13:21                       ` Josef Bacik
2017-07-04 22:57                         ` Dave Chinner
2017-07-05  4:59                           ` Minchan Kim
2017-07-05 23:58                             ` Dave Chinner
2017-07-06  3:56                               ` Minchan Kim
2017-07-05 13:33                           ` Josef Bacik
2017-07-05 23:30                             ` Dave Chinner
2017-07-05  4:43                         ` Minchan Kim
