From: js1304@gmail.com
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>, Hugh Dickins <hughd@google.com>,
	Minchan Kim <minchan@kernel.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	Mel Gorman <mgorman@techsingularity.net>,
	kernel-team@lge.com, Joonsoo Kim <iamjoonsoo.kim@lge.com>
Subject: [PATCH v5 08/10] mm/swap: do not readahead if the previous owner of the swap entry isn't me
Date: Fri,  3 Apr 2020 14:40:46 +0900
Message-ID: <1585892447-32059-9-git-send-email-iamjoonsoo.kim@lge.com>
In-Reply-To: <1585892447-32059-1-git-send-email-iamjoonsoo.kim@lge.com>

From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

To implement workingset detection for anonymous pages, a page that is
swapped in by readahead but not yet touched is also charged to the
memcg while it sits in the swap cache. However, if those readahead
pages belong to someone else, they are wrongly charged to us and our
memcg can come under spurious pressure.

To solve this problem, this patch skips readahead when the previous
owner of the swap entry isn't us. With this change, readahead only
happens for our own entries, so the spurious memcg pressure cannot
occur.

The purpose of readahead is to improve performance by exploiting the
locality of the swap space. If an entry isn't ours, there is no
locality to exploit, so it is safe to skip readahead for it.
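
In code terms, the ownership check added to add_to_swap_cache() boils
down to the sketch below (the same logic as the hunk further down,
shown standalone; this assumes 'mm' and the memcg helpers are already
in scope, as set up by the earlier charging patch in this series).
On a mismatch, the -EINVAL return makes the retry loop in
__read_swap_cache_async() give up, so the readahead loops simply move
on to the next entry:

	if (readahead) {
		void *shadow = get_shadow_from_swap_cache(entry);

		/* the shadow entry remembers the previous owner's memcg id */
		if (shadow) {
			memcg = get_mem_cgroup_from_mm(mm);

			/* not our entry: don't bother reading it ahead */
			if (memcg && !shadow_from_memcg(shadow, memcg)) {
				mem_cgroup_put(memcg);
				return -EINVAL;
			}
			mem_cgroup_put(memcg);
		}
	}

Non-readahead callers (plain swap-in, shmem writeback, add_to_swap()
and zswap) pass readahead == false, so their behavior is unchanged.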

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 include/linux/swap.h |  9 ++++++---
 mm/shmem.c           |  2 +-
 mm/swap_state.c      | 34 ++++++++++++++++++++++++++--------
 mm/workingset.c      | 16 ++++++++++++++++
 mm/zswap.c           |  2 +-
 5 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 97e8a2e..d204cc7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,6 +307,7 @@ struct vma_swap_readahead {
 };
 
 /* linux/mm/workingset.c */
+bool shadow_from_memcg(void *shadow, struct mem_cgroup *memcg);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
 void workingset_refault(struct page *page, void *shadow);
 void workingset_activation(struct page *page);
@@ -410,7 +411,8 @@ extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *page);
 extern void *get_shadow_from_swap_cache(swp_entry_t entry);
 extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
-			struct vm_area_struct *vma, gfp_t gfp, void **shadowp);
+			struct vm_area_struct *vma, gfp_t gfp,
+			void **shadowp, bool readahead);
 extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
 extern void __delete_from_swap_cache(struct page *page,
 			swp_entry_t entry, void *shadow);
@@ -425,7 +427,7 @@ extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
 			bool do_poll);
 extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr,
-			bool *new_page_allocated);
+			bool *new_page_allocated, bool readahead);
 extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				struct vm_fault *vmf);
 extern struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
@@ -573,7 +575,8 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
 }
 
 static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
-			struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
+				struct vm_area_struct *vma, gfp_t gfp,
+				void **shadowp, bool readahead)
 {
 	return -1;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 8e28c1f..483c32d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1371,7 +1371,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 
 	if (add_to_swap_cache(page, swap, NULL,
 			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
-			NULL) == 0) {
+			NULL, false) == 0) {
 		spin_lock_irq(&info->lock);
 		shmem_recalc_inode(inode);
 		info->swapped++;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index de994f9..9bf0251f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -126,7 +126,8 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
  * but sets SwapCache flag and private instead of mapping and index.
  */
 int add_to_swap_cache(struct page *page, swp_entry_t entry,
-			struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
+			struct vm_area_struct *vma, gfp_t gfp,
+			void **shadowp, bool readahead)
 {
 	struct address_space *address_space = swap_address_space(entry);
 	pgoff_t idx = swp_offset(entry);
@@ -143,6 +144,23 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
 	VM_BUG_ON_PAGE(PageSwapCache(page), page);
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
+	if (readahead) {
+		void *shadow = get_shadow_from_swap_cache(entry);
+
+		/*
+		 * In the readahead case, check the memcg id of the shadow
+		 * entry in order to avoid reading ahead someone else's page.
+		 */
+		if (shadow) {
+			memcg = get_mem_cgroup_from_mm(mm);
+			if (memcg && !shadow_from_memcg(shadow, memcg)) {
+				mem_cgroup_put(memcg);
+				return -EINVAL;
+			}
+			mem_cgroup_put(memcg);
+		}
+	}
+
 	page_ref_add(page, nr);
 	/* PageSwapCache() prevent the page from being re-charged */
 	SetPageSwapCache(page);
@@ -253,7 +271,7 @@ int add_to_swap(struct page *page)
 	 * Add it to the swap cache.
 	 */
 	err = add_to_swap_cache(page, entry, NULL,
-			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
+			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL, false);
 	if (err)
 		/*
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -402,7 +420,7 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr,
-			bool *new_page_allocated)
+			bool *new_page_allocated, bool readahead)
 {
 	struct page *found_page = NULL, *new_page = NULL;
 	struct swap_info_struct *si;
@@ -465,7 +483,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		__SetPageSwapBacked(new_page);
 		shadow = NULL;
 		err = add_to_swap_cache(new_page, entry, vma,
-				gfp_mask & GFP_KERNEL, &shadow);
+				gfp_mask & GFP_KERNEL, &shadow, readahead);
 		if (likely(!err)) {
 			/* Initiate read into locked page */
 			SetPageWorkingset(new_page);
@@ -481,7 +499,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * clear SWAP_HAS_CACHE flag.
 		 */
 		put_swap_page(new_page, entry);
-	} while (err != -ENOMEM);
+	} while (err != -ENOMEM && err != -EINVAL);
 
 	if (new_page)
 		put_page(new_page);
@@ -499,7 +517,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 {
 	bool page_was_allocated;
 	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
-			vma, addr, &page_was_allocated);
+			vma, addr, &page_was_allocated, false);
 
 	if (page_was_allocated)
 		swap_readpage(retpage, do_poll);
@@ -624,7 +642,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
 			swp_entry(swp_type(entry), offset),
-			gfp_mask, vma, addr, &page_allocated);
+			gfp_mask, vma, addr, &page_allocated, true);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -796,7 +814,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		if (unlikely(non_swap_entry(entry)))
 			continue;
 		page = __read_swap_cache_async(entry, gfp_mask, vma,
-					       vmf->address, &page_allocated);
+					vmf->address, &page_allocated, true);
 		if (!page)
 			continue;
 		if (page_allocated) {
diff --git a/mm/workingset.c b/mm/workingset.c
index 871b867..1fa46cf 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -231,6 +231,22 @@ static void unpack_shadow(void *shadow, int *memcgidp, int *page_memcg_tagp,
 	*workingsetp = workingset;
 }
 
+bool shadow_from_memcg(void *shadow, struct mem_cgroup *memcg)
+{
+	int memcgid, page_memcg_tag;
+	struct pglist_data *pgdat;
+	unsigned long eviction;
+	bool workingset;
+	int memcg_tag;
+
+	unpack_shadow(shadow, &memcgid, &page_memcg_tag,
+			&pgdat, &eviction, &workingset);
+	memcg_tag = mem_cgroup_id(memcg);
+	memcg_tag &= (1UL << PAGE_MEMCG_TAG_SHIFT) - 1;
+
+	return memcg_tag == page_memcg_tag;
+}
+
 static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat,
 				bool file)
 {
diff --git a/mm/zswap.c b/mm/zswap.c
index 55094e6..92f1463 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -848,7 +848,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
 	bool page_was_allocated;
 
 	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
-			NULL, 0, &page_was_allocated);
+			NULL, 0, &page_was_allocated, false);
 	if (page_was_allocated)
 		return ZSWAP_SWAPCACHE_NEW;
 	if (!*retpage)
-- 
2.7.4

