From: Roman Gushchin <guro@fb.com>
To: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Michal Hocko <mhocko@suse.com>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	"cgroups@vger.kernel.org" <cgroups@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Kernel Team <Kernel-team@fb.com>
Subject: Re: [PATCH 8/8] mm: vmscan: harmonize writeback congestion tracking for nodes & memcgs
Date: Tue, 22 Oct 2019 21:03:57 +0000
Message-ID: <20191022210352.GB24142@tower.DHCP.thefacebook.com>
In-Reply-To: <20191022144803.302233-9-hannes@cmpxchg.org>

On Tue, Oct 22, 2019 at 10:48:03AM -0400, Johannes Weiner wrote:
> The current writeback congestion tracking has separate flags for
> kswapd reclaim (node level) and cgroup limit reclaim (memcg-node
> level). This is unnecessarily complicated: the lruvec is an existing
> abstraction layer for that node-memcg intersection.
> 
> Introduce lruvec->flags and LRUVEC_CONGESTED. Then track that at the
> reclaim root level, which is either the NUMA node for global reclaim,
> or the cgroup-node intersection for cgroup reclaim.

Good idea!

Reviewed-by: Roman Gushchin <guro@fb.com>
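
For reference, the new scheme boils down to roughly the following
(a minimal user-space sketch using the names introduced by the patch;
the set_bit/clear_bit/test_bit helpers here are illustrative stand-ins
for the kernel's atomic bitops, and main() is just a driver, not
anything in the kernel):

    /* Simplified sketch of per-reclaim-root congestion tracking.
     * The reclaim root's lruvec carries the flag: the root memcg's
     * node lruvec for global (kswapd) reclaim, the memcg-node
     * lruvec for cgroup limit reclaim. */
    #include <stdbool.h>
    #include <stdio.h>

    enum lruvec_flags {
            LRUVEC_CONGESTED,       /* lruvec has many dirty pages
                                     * backed by a congested BDI */
    };

    struct lruvec {
            unsigned long flags;
    };

    /* Stand-ins for the kernel's atomic bitops. */
    static void set_bit(int nr, unsigned long *addr)
    {
            *addr |= 1UL << nr;
    }
    static void clear_bit(int nr, unsigned long *addr)
    {
            *addr &= ~(1UL << nr);
    }
    static bool test_bit(int nr, const unsigned long *addr)
    {
            return (*addr >> nr) & 1UL;
    }

    int main(void)
    {
            struct lruvec node_lruvec = { 0 };   /* global reclaim root */
            struct lruvec memcg_lruvec = { 0 };  /* cgroup reclaim root */

            /* kswapd tags the node-level (root memcg) lruvec ... */
            set_bit(LRUVEC_CONGESTED, &node_lruvec.flags);
            /* ... cgroup limit reclaim tags its own cgroup-node lruvec. */
            set_bit(LRUVEC_CONGESTED, &memcg_lruvec.flags);

            printf("node congested: %d, memcg congested: %d\n",
                   test_bit(LRUVEC_CONGESTED, &node_lruvec.flags),
                   test_bit(LRUVEC_CONGESTED, &memcg_lruvec.flags));

            /* Cleared when the respective reclaim root is done:
             * clear_pgdat_congested() for the node, the end of
             * do_try_to_free_pages() for cgroup reclaim. */
            clear_bit(LRUVEC_CONGESTED, &node_lruvec.flags);
            clear_bit(LRUVEC_CONGESTED, &memcg_lruvec.flags);
            return 0;
    }

So there is one flag word and one set of bitops regardless of who is
reclaiming, instead of the separate pgdat flag and per-node memcg bool.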

> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/memcontrol.h |  6 +--
>  include/linux/mmzone.h     | 11 ++++--
>  mm/vmscan.c                | 80 ++++++++++++--------------------------
>  3 files changed, 36 insertions(+), 61 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 498cea07cbb1..d8ffcf60440c 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -133,9 +133,6 @@ struct mem_cgroup_per_node {
>  	unsigned long		usage_in_excess;/* Set to the value by which */
>  						/* the soft limit is exceeded*/
>  	bool			on_tree;
> -	bool			congested;	/* memcg has many dirty pages */
> -						/* backed by a congested BDI */
> -
>  	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
>  						/* use container_of	   */
>  };
> @@ -412,6 +409,9 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
>  		goto out;
>  	}
>  
> +	if (!memcg)
> +		memcg = root_mem_cgroup;
> +
>  	mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
>  	lruvec = &mz->lruvec;
>  out:
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 449a44171026..c04b4c1f01fa 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -296,6 +296,12 @@ struct zone_reclaim_stat {
>  	unsigned long		recent_scanned[2];
>  };
>  
> +enum lruvec_flags {
> +	LRUVEC_CONGESTED,		/* lruvec has many dirty pages
> +					 * backed by a congested BDI
> +					 */
> +};
> +
>  struct lruvec {
>  	struct list_head		lists[NR_LRU_LISTS];
>  	struct zone_reclaim_stat	reclaim_stat;
> @@ -303,6 +309,8 @@ struct lruvec {
>  	atomic_long_t			inactive_age;
>  	/* Refaults at the time of last reclaim cycle */
>  	unsigned long			refaults;
> +	/* Various lruvec state flags (enum lruvec_flags) */
> +	unsigned long			flags;
>  #ifdef CONFIG_MEMCG
>  	struct pglist_data *pgdat;
>  #endif
> @@ -572,9 +580,6 @@ struct zone {
>  } ____cacheline_internodealigned_in_smp;
>  
>  enum pgdat_flags {
> -	PGDAT_CONGESTED,		/* pgdat has many dirty pages backed by
> -					 * a congested BDI
> -					 */
>  	PGDAT_DIRTY,			/* reclaim scanning has recently found
>  					 * many dirty file pages at the tail
>  					 * of the LRU.
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 65baa89740dd..3e21166d5198 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -267,29 +267,6 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  #endif
>  	return false;
>  }
> -
> -static void set_memcg_congestion(pg_data_t *pgdat,
> -				struct mem_cgroup *memcg,
> -				bool congested)
> -{
> -	struct mem_cgroup_per_node *mn;
> -
> -	if (!memcg)
> -		return;
> -
> -	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
> -	WRITE_ONCE(mn->congested, congested);
> -}
> -
> -static bool memcg_congested(pg_data_t *pgdat,
> -			struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup_per_node *mn;
> -
> -	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
> -	return READ_ONCE(mn->congested);
> -
> -}
>  #else
>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>  {
> @@ -309,18 +286,6 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  {
>  	return true;
>  }
> -
> -static inline void set_memcg_congestion(struct pglist_data *pgdat,
> -				struct mem_cgroup *memcg, bool congested)
> -{
> -}
> -
> -static inline bool memcg_congested(struct pglist_data *pgdat,
> -			struct mem_cgroup *memcg)
> -{
> -	return false;
> -
> -}
>  #endif
>  
>  /*
> @@ -2716,12 +2681,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
>  	return inactive_lru_pages > pages_for_compaction;
>  }
>  
> -static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
> -{
> -	return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
> -		(memcg && memcg_congested(pgdat, memcg));
> -}
> -
>  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
>  {
>  	struct mem_cgroup *root = sc->target_mem_cgroup;
> @@ -2785,8 +2744,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  	struct reclaim_state *reclaim_state = current->reclaim_state;
>  	struct mem_cgroup *root = sc->target_mem_cgroup;
>  	unsigned long nr_reclaimed, nr_scanned;
> +	struct lruvec *target_lruvec;
>  	bool reclaimable = false;
>  
> +	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
> +
>  again:
>  	memset(&sc->nr, 0, sizeof(sc->nr));
>  
> @@ -2829,14 +2791,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
>  			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
>  
> -		/*
> -		 * Tag a node as congested if all the dirty pages
> -		 * scanned were backed by a congested BDI and
> -		 * wait_iff_congested will stall.
> -		 */
> -		if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -			set_bit(PGDAT_CONGESTED, &pgdat->flags);
> -
>  		/* Allow kswapd to start writing pages during reclaim.*/
>  		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
>  			set_bit(PGDAT_DIRTY, &pgdat->flags);
> @@ -2852,12 +2806,17 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  	}
>  
>  	/*
> +	 * Tag a node/memcg as congested if all the dirty pages
> +	 * scanned were backed by a congested BDI and
> +	 * wait_iff_congested will stall.
> +	 *
>  	 * Legacy memcg will stall in page writeback so avoid forcibly
>  	 * stalling in wait_iff_congested().
>  	 */
> -	if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> +	if ((current_is_kswapd() ||
> +	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
>  	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -		set_memcg_congestion(pgdat, root, true);
> +		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
>  
>  	/*
>  	 * Stall direct reclaim for IO completions if underlying BDIs
> @@ -2865,8 +2824,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  	 * starts encountering unqueued dirty pages or cycling through
>  	 * the LRU too quickly.
>  	 */
> -	if (!sc->hibernation_mode && !current_is_kswapd() &&
> -	    current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> +	if (!current_is_kswapd() && current_may_throttle() &&
> +	    !sc->hibernation_mode &&
> +	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
>  		wait_iff_congested(BLK_RW_ASYNC, HZ/10);
>  
>  	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> @@ -3080,8 +3040,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
>  		if (zone->zone_pgdat == last_pgdat)
>  			continue;
>  		last_pgdat = zone->zone_pgdat;
> +
>  		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
> -		set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
> +
> +		if (cgroup_reclaim(sc)) {
> +			struct lruvec *lruvec;
> +
> +			lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
> +						   zone->zone_pgdat);
> +			clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
> +		}
>  	}
>  
>  	delayacct_freepages_end();
> @@ -3461,7 +3429,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
>  /* Clear pgdat state for congested, dirty or under writeback. */
>  static void clear_pgdat_congested(pg_data_t *pgdat)
>  {
> -	clear_bit(PGDAT_CONGESTED, &pgdat->flags);
> +	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
> +
> +	clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
>  	clear_bit(PGDAT_DIRTY, &pgdat->flags);
>  	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
>  }
> -- 
> 2.23.0
> 
> 
