From: Konstantin Khlebnikov <khlebnikov@openvz.org>
To: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org
Cc: Hugh Dickins <hughd@google.com>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Subject: [PATCH v2 05/22] mm: replace per-cpu lru-add page-vectors with page-lists
Date: Mon, 20 Feb 2012 21:22:54 +0400
Message-ID: <20120220172254.22196.24192.stgit@zurg>
In-Reply-To: <20120220171138.22196.65847.stgit@zurg>

This patch replaces page-vectors with page-lists in the lru_cache_add*() functions.
We can use page->lru for linking, because the page is obviously not on an LRU list yet.

The per-cpu batch is now limited by the total size of its pages rather than by
their count; otherwise it could grow extremely large if it held many huge pages:
PAGEVEC_SIZE * HPAGE_SIZE = 28MB, per CPU!
These pages are hidden from the memory reclaimer for a while.
The new limit is LRU_CACHE_ADD_BATCH = 64 pages (* PAGE_SIZE = 256KB).
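
For illustration only (not part of the patch), the worst-case per-cpu footprint
under the old and new limits, assuming the common x86 values PAGEVEC_SIZE = 14,
HPAGE_SIZE = 2MB and PAGE_SIZE = 4KB:

#include <stdio.h>

int main(void)
{
	/* Old scheme: a pagevec holding PAGEVEC_SIZE huge pages. */
	unsigned long old_max = 14UL * 2 * 1024 * 1024;
	/* New scheme: LRU_CACHE_ADD_BATCH base pages worth of memory. */
	unsigned long new_max = 64UL * 4 * 1024;

	printf("old worst case: %lu MB per cpu\n", old_max >> 20); /* 28 MB */
	printf("new worst case: %lu KB per cpu\n", new_max >> 10); /* 256 KB */
	return 0;
}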

As a result, adding a huge page will now always drain the per-cpu list. Huge-page
allocation and preparation is a long procedure, so nobody will notice this draining.

The draining procedure disables preemption only while isolating the page list,
so the batch size can be increased without a negative effect on latency.
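
For readers skimming the commit message before the diff, here is an annotated
copy of the fast/slow-path split implemented by lru_cache_add_list() in this
patch (the per-cpu variables and helpers are the patch's own):

static void batch_add_sketch(struct list_head *pages, int size, enum lru_list lru)
{
	preempt_disable();
	/* O(1): splice the new pages onto this cpu's pending list. */
	list_splice_tail_init(pages, __this_cpu_ptr(lru_add_pages + lru));
	if (__this_cpu_add_return(lru_add_pending[lru], size) <=
				LRU_CACHE_ADD_BATCH) {
		preempt_enable();	/* batch not yet full: fast path ends */
		return;
	}
	/* O(1): steal the whole batch while preemption is still disabled. */
	list_replace_init(__this_cpu_ptr(lru_add_pages + lru), pages);
	__this_cpu_write(lru_add_pending[lru], 0);
	preempt_enable();
	/* The expensive part, taking zone->lru_lock per zone, runs with
	 * preemption enabled again. */
	__lru_cache_add_list(pages, lru);
	INIT_LIST_HEAD(pages);
}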

This patch also introduces a new function, lru_cache_add_list(), and uses it in
mpage_readpages() and read_pages(), where the pages are already collected in a
list. Unlike the single-page lru-add, the list-add reuses the page references
held by the caller, saving one page_cache_get()/page_cache_release() pair per page.
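
A minimal sketch of the caller-side contract (the caller below is hypothetical,
not from the patch): each page on the list already carries one reference owned
by the caller, and lru_cache_add_list() hands that reference over to the LRU.
For base pages @size equals the page count; for huge pages it would be the sum
of hpage_nr_pages() over the list.

/* Hypothetical caller, for illustration only. */
static void add_collected_pages(struct list_head *pages, int nr_pages)
{
	/* No page_cache_get() per page here, unlike __lru_cache_add():
	 * the reference we already hold travels with each page. */
	lru_cache_add_list(pages, nr_pages, LRU_INACTIVE_FILE);
	/* 'pages' is now an empty, reinitialized list head. */
}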

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
---
 fs/mpage.c           |   21 ++++++----
 include/linux/swap.h |    2 +
 mm/readahead.c       |   15 ++++---
 mm/swap.c            |  104 ++++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 119 insertions(+), 23 deletions(-)

diff --git a/fs/mpage.c b/fs/mpage.c
index 643e9f5..6474f41 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/kdev_t.h>
 #include <linux/gfp.h>
 #include <linux/bio.h>
@@ -367,29 +368,33 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block)
 {
 	struct bio *bio = NULL;
-	unsigned page_idx;
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
+	struct page *page, *next;
+	int nr_added = 0;
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
-	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		struct page *page = list_entry(pages->prev, struct page, lru);
 
+	list_for_each_entry_safe(page, next, pages, lru) {
 		prefetchw(&page->flags);
-		list_del(&page->lru);
-		if (!add_to_page_cache_lru(page, mapping,
+		if (!add_to_page_cache(page, mapping,
 					page->index, GFP_KERNEL)) {
 			bio = do_mpage_readpage(bio, page,
-					nr_pages - page_idx,
+					nr_pages,
 					&last_block_in_bio, &map_bh,
 					&first_logical_block,
 					get_block);
+			nr_added++;
+		} else {
+			list_del(&page->lru);
+			page_cache_release(page);
 		}
-		page_cache_release(page);
+		nr_pages--;
 	}
-	BUG_ON(!list_empty(pages));
+	BUG_ON(nr_pages);
+	lru_cache_add_list(pages, nr_added, LRU_INACTIVE_FILE);
 	if (bio)
 		mpage_bio_submit(READ, bio);
 	return 0;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ba2c8d7..7394100 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -226,6 +226,8 @@ extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
 extern void lru_add_page_tail(struct zone* zone,
 			      struct page *page, struct page *page_tail);
+extern void lru_cache_add_list(struct list_head *pages,
+			       int size, enum lru_list lru);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02..2f6fe4b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/export.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -110,7 +111,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		struct list_head *pages, unsigned nr_pages)
 {
 	struct blk_plug plug;
-	unsigned page_idx;
+	struct page *page, *next;
 	int ret;
 
 	blk_start_plug(&plug);
@@ -122,15 +123,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		goto out;
 	}
 
-	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		struct page *page = list_to_page(pages);
-		list_del(&page->lru);
-		if (!add_to_page_cache_lru(page, mapping,
+	list_for_each_entry_safe(page, next, pages, lru) {
+		if (!add_to_page_cache(page, mapping,
 					page->index, GFP_KERNEL)) {
 			mapping->a_ops->readpage(filp, page);
+		} else {
+			list_del(&page->lru);
+			page_cache_release(page);
+			nr_pages--;
 		}
-		page_cache_release(page);
 	}
+	lru_cache_add_list(pages, nr_pages, LRU_INACTIVE_FILE);
 	ret = 0;
 
 out:
diff --git a/mm/swap.c b/mm/swap.c
index 38b2686..303fbc3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -36,7 +36,12 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
+/* How many pages may be in per-cpu lru-add pending list */
+#define LRU_CACHE_ADD_BATCH	64
+
+static DEFINE_PER_CPU(struct list_head[NR_LRU_LISTS], lru_add_pages);
+static DEFINE_PER_CPU(int[NR_LRU_LISTS], lru_add_pending);
+
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 
@@ -371,14 +376,83 @@ void mark_page_accessed(struct page *page)
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
+static void __lru_cache_add_list(struct list_head *pages, enum lru_list lru)
+{
+	int file = is_file_lru(lru);
+	int active = is_active_lru(lru);
+	struct page *page, *next;
+	struct zone *pagezone, *zone = NULL;
+	unsigned long uninitialized_var(flags);
+	LIST_HEAD(free_pages);
+
+	list_for_each_entry_safe(page, next, pages, lru) {
+		pagezone = page_zone(page);
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+			zone = pagezone;
+			spin_lock_irqsave(&zone->lru_lock, flags);
+		}
+		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(PageUnevictable(page));
+		VM_BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+		if (active)
+			SetPageActive(page);
+		update_page_reclaim_stat(zone, page, file, active);
+		add_page_to_lru_list(zone, page, lru);
+		if (unlikely(put_page_testzero(page))) {
+			__ClearPageLRU(page);
+			__ClearPageActive(page);
+			del_page_from_lru_list(zone, page, lru);
+			if (unlikely(PageCompound(page))) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				zone = NULL;
+				(*get_compound_page_dtor(page))(page);
+			} else
+				list_add_tail(&page->lru, &free_pages);
+		}
+	}
+	if (zone)
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+
+	free_hot_cold_page_list(&free_pages, 0);
+}
+
+/**
+ * lru_cache_add_list - add a list of pages to the lru, consuming the
+ *			caller's page references, and reinitialize the list.
+ * @pages:	list of pages to add
+ * @size:	total size of the pages in the list, in base pages
+ * @lru:	the LRU list to which the pages are added
+ */
+void lru_cache_add_list(struct list_head *pages, int size, enum lru_list lru)
+{
+	struct list_head *list;
+
+	preempt_disable();
+	list = __this_cpu_ptr(lru_add_pages + lru);
+	list_splice_tail_init(pages, list);
+	if (likely(__this_cpu_add_return(lru_add_pending[lru], size) <=
+				LRU_CACHE_ADD_BATCH)) {
+		preempt_enable();
+		return;
+	}
+	list_replace_init(list, pages);
+	__this_cpu_write(lru_add_pending[lru], 0);
+	preempt_enable();
+	__lru_cache_add_list(pages, lru);
+	INIT_LIST_HEAD(pages);
+}
+EXPORT_SYMBOL(lru_cache_add_list);
+
 void __lru_cache_add(struct page *page, enum lru_list lru)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
+	struct list_head pages = LIST_HEAD_INIT(page->lru);
+	int size = hpage_nr_pages(page);
 
 	page_cache_get(page);
-	if (!pagevec_add(pvec, page))
-		__pagevec_lru_add(pvec, lru);
-	put_cpu_var(lru_add_pvecs);
+	lru_cache_add_list(&pages, size, lru);
 }
 EXPORT_SYMBOL(__lru_cache_add);
 
@@ -498,14 +572,16 @@ static void lru_deactivate_fn(struct page *page, void *arg)
  */
 void lru_add_drain_cpu(int cpu)
 {
-	struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
+	struct list_head *pages = per_cpu(lru_add_pages, cpu);
 	struct pagevec *pvec;
 	int lru;
 
 	for_each_lru(lru) {
-		pvec = &pvecs[lru - LRU_BASE];
-		if (pagevec_count(pvec))
-			__pagevec_lru_add(pvec, lru);
+		if (!list_empty(pages + lru)) {
+			__lru_cache_add_list(pages + lru, lru);
+			INIT_LIST_HEAD(pages + lru);
+			per_cpu(lru_add_pending[lru], cpu) = 0;
+		}
 	}
 
 	pvec = &per_cpu(lru_rotate_pvecs, cpu);
@@ -780,3 +856,13 @@ void __init swap_setup(void)
 	 * _really_ don't want to cluster much more
 	 */
 }
+
+void __init lru_cache_add_init(void)
+{
+	int cpu, lru;
+
+	for_each_possible_cpu(cpu)
+		for_each_lru(lru)
+			INIT_LIST_HEAD(per_cpu(lru_add_pages, cpu) + lru);
+}
+core_initcall(lru_cache_add_init);


