LKML Archive on lore.kernel.org
 help / color / Atom feed
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: "Theodore Ts'o" <tytso@mit.edu>,
	Andreas Dilger <adilger.kernel@dilger.ca>,
	Jan Kara <jack@suse.com>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
	Hugh Dickins <hughd@google.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Vlastimil Babka <vbabka@suse.cz>,
	Matthew Wilcox <willy@infradead.org>,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-block@vger.kernel.org,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv6 01/37] mm, shmem: swich huge tmpfs to multi-order radix-tree entries
Date: Thu, 26 Jan 2017 14:57:43 +0300
Message-ID: <20170126115819.58875-2-kirill.shutemov@linux.intel.com> (raw)
In-Reply-To: <20170126115819.58875-1-kirill.shutemov@linux.intel.com>

We would need to use multi-order radix-tree entires for ext4 and other
filesystems to have coherent view on tags (dirty/towrite) in the tree.

This patch converts huge tmpfs implementation to multi-order entries, so
we will be able to use the same code patch for all filesystems.

We also change interface for page-cache lookup function:

  - functions that lookup for pages[1] would return subpages of THP
    relevant for requested indexes;

  - functions that lookup for entries[2] would return one entry per-THP
    and index will point to index of head page (basically, round down to
    HPAGE_PMD_NR);

This would provide balanced exposure of multi-order entires to the rest
of the kernel.

[1] find_get_pages(), pagecache_get_page(), pagevec_lookup(), etc.
[2] find_get_entry(), find_get_entries(), pagevec_lookup_entries(), etc.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 include/linux/pagemap.h |   9 ++
 mm/filemap.c            | 236 ++++++++++++++++++++++++++----------------------
 mm/huge_memory.c        |  48 +++++++---
 mm/khugepaged.c         |  26 ++----
 mm/shmem.c              | 117 ++++++++++--------------
 mm/truncate.c           |  15 ++-
 6 files changed, 235 insertions(+), 216 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 324c8dbad1e1..ad63a7be5a5e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -332,6 +332,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
 			mapping_gfp_mask(mapping));
 }
 
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+	VM_BUG_ON_PAGE(PageTail(page), page);
+	VM_BUG_ON_PAGE(page->index > offset, page);
+	VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) < offset,
+			page);
+	return page - page->index + offset;
+}
+
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
diff --git a/mm/filemap.c b/mm/filemap.c
index b772a33ef640..837a71a2a412 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -150,7 +150,9 @@ static int page_cache_tree_insert(struct address_space *mapping,
 static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 {
-	int i, nr;
+	struct radix_tree_node *node;
+	void **slot;
+	int nr;
 
 	/* hugetlb pages are represented by one entry in the radix tree */
 	nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
@@ -159,19 +161,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(nr != 1 && shadow, page);
 
-	for (i = 0; i < nr; i++) {
-		struct radix_tree_node *node;
-		void **slot;
+	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+	VM_BUG_ON_PAGE(!node && nr != 1, page);
 
-		__radix_tree_lookup(&mapping->page_tree, page->index + i,
-				    &node, &slot);
-
-		VM_BUG_ON_PAGE(!node && nr != 1, page);
-
-		radix_tree_clear_tags(&mapping->page_tree, node, slot);
-		__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
-				     workingset_update_node, mapping);
-	}
+	radix_tree_clear_tags(&mapping->page_tree, node, slot);
+	__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+			workingset_update_node, mapping);
 
 	if (shadow) {
 		mapping->nrexceptional += nr;
@@ -285,12 +280,7 @@ void delete_from_page_cache(struct page *page)
 	if (freepage)
 		freepage(page);
 
-	if (PageTransHuge(page) && !PageHuge(page)) {
-		page_ref_sub(page, HPAGE_PMD_NR);
-		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
-	} else {
-		put_page(page);
-	}
+	put_page(page);
 }
 EXPORT_SYMBOL(delete_from_page_cache);
 
@@ -1172,7 +1162,7 @@ EXPORT_SYMBOL(page_cache_prev_hole);
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 {
 	void **pagep;
-	struct page *head, *page;
+	struct page *page;
 
 	rcu_read_lock();
 repeat:
@@ -1193,15 +1183,8 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 			goto out;
 		}
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
-			goto repeat;
-
-		/* The page was split under us? */
-		if (compound_head(page) != head) {
-			put_page(head);
+		if (!page_cache_get_speculative(page))
 			goto repeat;
-		}
 
 		/*
 		 * Has the page moved?
@@ -1209,7 +1192,7 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 		 * include/linux/pagemap.h for details.
 		 */
 		if (unlikely(page != *pagep)) {
-			put_page(head);
+			put_page(page);
 			goto repeat;
 		}
 	}
@@ -1250,7 +1233,6 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
 			put_page(page);
 			goto repeat;
 		}
-		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
 	}
 	return page;
 }
@@ -1307,7 +1289,6 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 			put_page(page);
 			goto repeat;
 		}
-		VM_BUG_ON_PAGE(page->index != offset, page);
 	}
 
 	if (page && (fgp_flags & FGP_ACCESSED))
@@ -1342,6 +1323,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 		}
 	}
 
+	if (page)
+		page = find_subpage(page, offset);
 	return page;
 }
 EXPORT_SYMBOL(pagecache_get_page);
@@ -1382,7 +1365,7 @@ unsigned find_get_entries(struct address_space *mapping,
 
 	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-		struct page *head, *page;
+		struct page *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1400,19 +1383,12 @@ unsigned find_get_entries(struct address_space *mapping,
 			goto export;
 		}
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto repeat;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head) {
-			put_page(head);
-			goto repeat;
-		}
-
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			put_page(head);
+			put_page(page);
 			goto repeat;
 		}
 export:
@@ -1446,14 +1422,17 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	struct radix_tree_iter iter;
 	void **slot;
-	unsigned ret = 0;
+	unsigned refs, ret = 0;
 
 	if (unlikely(!nr_pages))
 		return 0;
 
 	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-		struct page *head, *page;
+		struct page *page;
+		unsigned long index = iter.index;
+		if (index < start)
+			index = start;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1472,25 +1451,35 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			continue;
 		}
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
-			goto repeat;
-
-		/* The page was split under us? */
-		if (compound_head(page) != head) {
-			put_page(head);
+		if (!page_cache_get_speculative(page))
 			goto repeat;
-		}
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			put_page(head);
+			put_page(page);
 			goto repeat;
 		}
 
+		/* For multi-order entries, find relevant subpage */
+		if (PageTransHuge(page)) {
+			VM_BUG_ON(index - page->index < 0);
+			VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+			page += index - page->index;
+		}
+
 		pages[ret] = page;
 		if (++ret == nr_pages)
 			break;
+		if (!PageTransCompound(page))
+			continue;
+		for (refs = 0; ret < nr_pages &&
+				(index + 1) % HPAGE_PMD_NR;
+				ret++, refs++, index++)
+			pages[ret] = ++page;
+		if (refs)
+			page_ref_add(compound_head(page), refs);
+		if (ret == nr_pages)
+			break;
 	}
 
 	rcu_read_unlock();
@@ -1500,7 +1489,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 /**
  * find_get_pages_contig - gang contiguous pagecache lookup
  * @mapping:	The address_space to search
- * @index:	The starting page index
+ * @start:	The starting page index
  * @nr_pages:	The maximum number of pages
  * @pages:	Where the resulting pages are placed
  *
@@ -1509,19 +1498,22 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
  *
  * find_get_pages_contig() returns the number of pages which were found.
  */
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 			       unsigned int nr_pages, struct page **pages)
 {
 	struct radix_tree_iter iter;
 	void **slot;
-	unsigned int ret = 0;
+	unsigned int refs, ret = 0;
 
 	if (unlikely(!nr_pages))
 		return 0;
 
 	rcu_read_lock();
-	radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
-		struct page *head, *page;
+	radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, start) {
+		struct page *page;
+		unsigned long index = iter.index;
+		if (index < start)
+			index = start;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		/* The hole, there no reason to continue */
@@ -1541,19 +1533,12 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 			break;
 		}
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto repeat;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head) {
-			put_page(head);
-			goto repeat;
-		}
-
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			put_page(head);
+			put_page(page);
 			goto repeat;
 		}
 
@@ -1562,14 +1547,31 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 		 * otherwise we can get both false positives and false
 		 * negatives, which is just confusing to the caller.
 		 */
-		if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
+		if (page->mapping == NULL || page_to_pgoff(page) != index) {
 			put_page(page);
 			break;
 		}
 
+		/* For multi-order entries, find relevant subpage */
+		if (PageTransHuge(page)) {
+			VM_BUG_ON(index - page->index < 0);
+			VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+			page += index - page->index;
+		}
+
 		pages[ret] = page;
 		if (++ret == nr_pages)
 			break;
+		if (!PageTransCompound(page))
+			continue;
+		for (refs = 0; ret < nr_pages &&
+				(index + 1) % HPAGE_PMD_NR;
+				ret++, refs++, index++)
+			pages[ret] = ++page;
+		if (refs)
+			page_ref_add(compound_head(page), refs);
+		if (ret == nr_pages)
+			break;
 	}
 	rcu_read_unlock();
 	return ret;
@@ -1579,7 +1581,7 @@ EXPORT_SYMBOL(find_get_pages_contig);
 /**
  * find_get_pages_tag - find and return pages that match @tag
  * @mapping:	the address_space to search
- * @index:	the starting page index
+ * @indexp:	the starting page index
  * @tag:	the tag index
  * @nr_pages:	the maximum number of pages
  * @pages:	where the resulting pages are placed
@@ -1587,20 +1589,23 @@ EXPORT_SYMBOL(find_get_pages_contig);
  * Like find_get_pages, except we only return pages which are tagged with
  * @tag.   We update @index to index the next page for the traversal.
  */
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *indexp,
 			int tag, unsigned int nr_pages, struct page **pages)
 {
 	struct radix_tree_iter iter;
 	void **slot;
-	unsigned ret = 0;
+	unsigned refs, ret = 0;
 
 	if (unlikely(!nr_pages))
 		return 0;
 
 	rcu_read_lock();
 	radix_tree_for_each_tagged(slot, &mapping->page_tree,
-				   &iter, *index, tag) {
-		struct page *head, *page;
+				   &iter, *indexp, tag) {
+		struct page *page;
+		unsigned long index = iter.index;
+		if (index < *indexp)
+			index = *indexp;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1625,31 +1630,41 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			continue;
 		}
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
-			goto repeat;
-
-		/* The page was split under us? */
-		if (compound_head(page) != head) {
-			put_page(head);
+		if (!page_cache_get_speculative(page))
 			goto repeat;
-		}
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			put_page(head);
+			put_page(page);
 			goto repeat;
 		}
 
+		/* For multi-order entries, find relevant subpage */
+		if (PageTransHuge(page)) {
+			VM_BUG_ON(index - page->index < 0);
+			VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+			page += index - page->index;
+		}
+
 		pages[ret] = page;
 		if (++ret == nr_pages)
 			break;
+		if (!PageTransCompound(page))
+			continue;
+		for (refs = 0; ret < nr_pages &&
+				(index + 1) % HPAGE_PMD_NR;
+				ret++, refs++, index++)
+			pages[ret] = ++page;
+		if (refs)
+			page_ref_add(compound_head(page), refs);
+		if (ret == nr_pages)
+			break;
 	}
 
 	rcu_read_unlock();
 
 	if (ret)
-		*index = pages[ret - 1]->index + 1;
+		*indexp = page_to_pgoff(pages[ret - 1]) + 1;
 
 	return ret;
 }
@@ -1681,7 +1696,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
 	rcu_read_lock();
 	radix_tree_for_each_tagged(slot, &mapping->page_tree,
 				   &iter, start, tag) {
-		struct page *head, *page;
+		struct page *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1700,19 +1715,12 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
 			goto export;
 		}
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
-			goto repeat;
-
-		/* The page was split under us? */
-		if (compound_head(page) != head) {
-			put_page(head);
+		if (!page_cache_get_speculative(page))
 			goto repeat;
-		}
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			put_page(head);
+			put_page(page);
 			goto repeat;
 		}
 export:
@@ -2310,12 +2318,15 @@ void filemap_map_pages(struct vm_fault *vmf,
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
 	loff_t size;
-	struct page *head, *page;
+	struct page *page;
 
 	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
 			start_pgoff) {
-		if (iter.index > end_pgoff)
+		unsigned long index = iter.index;
+		if (index < start_pgoff)
+			index = start_pgoff;
+		if (index > end_pgoff)
 			break;
 repeat:
 		page = radix_tree_deref_slot(slot);
@@ -2326,25 +2337,26 @@ void filemap_map_pages(struct vm_fault *vmf,
 				slot = radix_tree_iter_retry(&iter);
 				continue;
 			}
+			page = NULL;
 			goto next;
 		}
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto repeat;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head) {
-			put_page(head);
-			goto repeat;
-		}
-
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			put_page(head);
+			put_page(page);
 			goto repeat;
 		}
 
+		/* For multi-order entries, find relevant subpage */
+		if (PageTransHuge(page)) {
+			VM_BUG_ON(index - page->index < 0);
+			VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+			page += index - page->index;
+		}
+
 		if (!PageUptodate(page) ||
 				PageReadahead(page) ||
 				PageHWPoison(page))
@@ -2352,20 +2364,20 @@ void filemap_map_pages(struct vm_fault *vmf,
 		if (!trylock_page(page))
 			goto skip;
 
-		if (page->mapping != mapping || !PageUptodate(page))
+		if (page_mapping(page) != mapping || !PageUptodate(page))
 			goto unlock;
 
 		size = round_up(i_size_read(mapping->host), PAGE_SIZE);
-		if (page->index >= size >> PAGE_SHIFT)
+		if (compound_head(page)->index >= size >> PAGE_SHIFT)
 			goto unlock;
 
 		if (file->f_ra.mmap_miss > 0)
 			file->f_ra.mmap_miss--;
 
-		vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+		vmf->address += (index - last_pgoff) << PAGE_SHIFT;
 		if (vmf->pte)
-			vmf->pte += iter.index - last_pgoff;
-		last_pgoff = iter.index;
+			vmf->pte += index - last_pgoff;
+		last_pgoff = index;
 		if (alloc_set_pte(vmf, NULL, page))
 			goto unlock;
 		unlock_page(page);
@@ -2378,8 +2390,14 @@ void filemap_map_pages(struct vm_fault *vmf,
 		/* Huge page is mapped? No need to proceed. */
 		if (pmd_trans_huge(*vmf->pmd))
 			break;
-		if (iter.index == end_pgoff)
+		if (index == end_pgoff)
 			break;
+		if (page && PageTransCompound(page) &&
+				(index & (HPAGE_PMD_NR - 1)) !=
+				HPAGE_PMD_NR - 1) {
+			index++;
+			goto repeat;
+		}
 	}
 	rcu_read_unlock();
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ca7855f857fa..f383cb801e34 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1928,6 +1928,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
+	struct page *subpage;
 	pgoff_t end = -1;
 	int i;
 
@@ -1936,8 +1937,27 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	if (!PageAnon(page))
-		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
+	if (!PageAnon(head)) {
+		struct address_space *mapping = head->mapping;
+		struct radix_tree_iter iter;
+		void **slot;
+
+		__dec_node_page_state(head, NR_SHMEM_THPS);
+
+		radix_tree_split(&mapping->page_tree, head->index, 0);
+		radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+				head->index) {
+			if (iter.index >= head->index + HPAGE_PMD_NR)
+				break;
+			subpage = head + iter.index - head->index;
+			radix_tree_replace_slot(&mapping->page_tree,
+					slot, subpage);
+			VM_BUG_ON_PAGE(compound_head(subpage) != head, subpage);
+		}
+		radix_tree_preload_end();
+
+		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+	}
 
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
@@ -1966,7 +1986,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	unfreeze_page(head);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		struct page *subpage = head + i;
+		subpage = head + i;
 		if (subpage == page)
 			continue;
 		unlock_page(subpage);
@@ -2123,8 +2143,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			goto out;
 		}
 
-		/* Addidional pins from radix tree */
-		extra_pins = HPAGE_PMD_NR;
+		/* Addidional pin from radix tree */
+		extra_pins = 1;
 		anon_vma = NULL;
 		i_mmap_lock_read(mapping);
 	}
@@ -2146,6 +2166,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
+	if (mapping && radix_tree_split_preload(HPAGE_PMD_ORDER, 0,
+				GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto unfreeze;
+	}
+
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
 
@@ -2155,10 +2181,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		spin_lock(&mapping->tree_lock);
 		pslot = radix_tree_lookup_slot(&mapping->page_tree,
 				page_index(head));
-		/*
-		 * Check if the head page is present in radix tree.
-		 * We assume all tail are present too, if head is there.
-		 */
+		/* Check if the page is present in radix tree */
 		if (radix_tree_deref_slot_protected(pslot,
 					&mapping->tree_lock) != head)
 			goto fail;
@@ -2173,8 +2196,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			pgdata->split_queue_len--;
 			list_del(page_deferred_list(head));
 		}
-		if (mapping)
-			__dec_node_page_state(page, NR_SHMEM_THPS);
 		spin_unlock(&pgdata->split_queue_lock);
 		__split_huge_page(page, list, flags);
 		ret = 0;
@@ -2188,9 +2209,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			BUG();
 		}
 		spin_unlock(&pgdata->split_queue_lock);
-fail:		if (mapping)
+fail:		if (mapping) {
 			spin_unlock(&mapping->tree_lock);
+			radix_tree_preload_end();
+		}
 		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+unfreeze:
 		unfreeze_page(head);
 		ret = -EBUSY;
 	}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 77ae3239c3de..1910048bf63a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1350,10 +1350,8 @@ static void collapse_shmem(struct mm_struct *mm,
 			break;
 		}
 		nr_none += n;
-		for (; index < min(iter.index, end); index++) {
-			radix_tree_insert(&mapping->page_tree, index,
-					new_page + (index % HPAGE_PMD_NR));
-		}
+		for (; index < min(iter.index, end); index++)
+			radix_tree_insert(&mapping->page_tree, index, new_page);
 
 		/* We are done. */
 		if (index >= end)
@@ -1425,8 +1423,7 @@ static void collapse_shmem(struct mm_struct *mm,
 		list_add_tail(&page->lru, &pagelist);
 
 		/* Finally, replace with the new page. */
-		radix_tree_replace_slot(&mapping->page_tree, slot,
-				new_page + (index % HPAGE_PMD_NR));
+		radix_tree_replace_slot(&mapping->page_tree, slot, new_page);
 
 		slot = radix_tree_iter_resume(slot, &iter);
 		index++;
@@ -1444,24 +1441,17 @@ static void collapse_shmem(struct mm_struct *mm,
 		break;
 	}
 
-	/*
-	 * Handle hole in radix tree at the end of the range.
-	 * This code only triggers if there's nothing in radix tree
-	 * beyond 'end'.
-	 */
-	if (result == SCAN_SUCCEED && index < end) {
+	if (result == SCAN_SUCCEED) {
 		int n = end - index;
 
-		if (!shmem_charge(mapping->host, n)) {
+		if (n && !shmem_charge(mapping->host, n)) {
 			result = SCAN_FAIL;
 			goto tree_locked;
 		}
-
-		for (; index < end; index++) {
-			radix_tree_insert(&mapping->page_tree, index,
-					new_page + (index % HPAGE_PMD_NR));
-		}
 		nr_none += n;
+
+		radix_tree_join(&mapping->page_tree, start,
+				HPAGE_PMD_ORDER, new_page);
 	}
 
 tree_locked:
diff --git a/mm/shmem.c b/mm/shmem.c
index bb53285a1d99..ff017bbde4b3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -544,33 +544,14 @@ static int shmem_add_to_page_cache(struct page *page,
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 	VM_BUG_ON(expected && PageTransHuge(page));
 
-	page_ref_add(page, nr);
+	get_page(page);
 	page->mapping = mapping;
 	page->index = index;
 
 	spin_lock_irq(&mapping->tree_lock);
-	if (PageTransHuge(page)) {
-		void __rcu **results;
-		pgoff_t idx;
-		int i;
-
-		error = 0;
-		if (radix_tree_gang_lookup_slot(&mapping->page_tree,
-					&results, &idx, index, 1) &&
-				idx < index + HPAGE_PMD_NR) {
-			error = -EEXIST;
-		}
-
-		if (!error) {
-			for (i = 0; i < HPAGE_PMD_NR; i++) {
-				error = radix_tree_insert(&mapping->page_tree,
-						index + i, page + i);
-				VM_BUG_ON(error);
-			}
-			count_vm_event(THP_FILE_ALLOC);
-		}
-	} else if (!expected) {
-		error = radix_tree_insert(&mapping->page_tree, index, page);
+	if (!expected) {
+		error = __radix_tree_insert(&mapping->page_tree, index,
+				compound_order(page), page);
 	} else {
 		error = shmem_radix_tree_replace(mapping, index, expected,
 								 page);
@@ -578,15 +559,17 @@ static int shmem_add_to_page_cache(struct page *page,
 
 	if (!error) {
 		mapping->nrpages += nr;
-		if (PageTransHuge(page))
+		if (PageTransHuge(page)) {
+			count_vm_event(THP_FILE_ALLOC);
 			__inc_node_page_state(page, NR_SHMEM_THPS);
+		}
 		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
 		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
 		spin_unlock_irq(&mapping->tree_lock);
 	} else {
 		page->mapping = NULL;
 		spin_unlock_irq(&mapping->tree_lock);
-		page_ref_sub(page, nr);
+		put_page(page);
 	}
 	return error;
 }
@@ -727,8 +710,9 @@ void shmem_unlock_mapping(struct address_space *mapping)
 					   PAGEVEC_SIZE, pvec.pages, indices);
 		if (!pvec.nr)
 			break;
-		index = indices[pvec.nr - 1] + 1;
 		pagevec_remove_exceptionals(&pvec);
+		index = indices[pvec.nr - 1] +
+			hpage_nr_pages(pvec.pages[pvec.nr - 1]);
 		check_move_unevictable_pages(pvec.pages, pvec.nr);
 		pagevec_release(&pvec);
 		cond_resched();
@@ -785,23 +769,25 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (!trylock_page(page))
 				continue;
 
-			if (PageTransTail(page)) {
-				/* Middle of THP: zero out the page */
-				clear_highpage(page);
-				unlock_page(page);
-				continue;
-			} else if (PageTransHuge(page)) {
+			if (PageTransHuge(page)) {
+				/* Range starts in the middle of THP */
+				if (start > page->index) {
+					pgoff_t i;
+					index += HPAGE_PMD_NR;
+					page += start - page->index;
+					for (i = start; i < index; i++, page++)
+						clear_highpage(page);
+					unlock_page(page - 1);
+					continue;
+				}
+
+				/* Range ends in the middle of THP */
 				if (index == round_down(end, HPAGE_PMD_NR)) {
-					/*
-					 * Range ends in the middle of THP:
-					 * zero out the page
-					 */
-					clear_highpage(page);
+					while (index++ < end)
+						clear_highpage(page++);
 					unlock_page(page);
 					continue;
 				}
-				index += HPAGE_PMD_NR - 1;
-				i += HPAGE_PMD_NR - 1;
 			}
 
 			if (!unfalloc || !PageUptodate(page)) {
@@ -814,9 +800,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			unlock_page(page);
 		}
 		pagevec_remove_exceptionals(&pvec);
+		index += pvec.nr ? hpage_nr_pages(pvec.pages[pvec.nr - 1]) : 1;
 		pagevec_release(&pvec);
 		cond_resched();
-		index++;
 	}
 
 	if (partial_start) {
@@ -874,8 +860,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 					continue;
 				if (shmem_free_swap(mapping, index, page)) {
 					/* Swap was replaced by page: retry */
-					index--;
-					break;
+					goto retry;
 				}
 				nr_swaps_freed++;
 				continue;
@@ -883,30 +868,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 
 			lock_page(page);
 
-			if (PageTransTail(page)) {
-				/* Middle of THP: zero out the page */
-				clear_highpage(page);
-				unlock_page(page);
-				/*
-				 * Partial thp truncate due 'start' in middle
-				 * of THP: don't need to look on these pages
-				 * again on !pvec.nr restart.
-				 */
-				if (index != round_down(end, HPAGE_PMD_NR))
-					start++;
-				continue;
-			} else if (PageTransHuge(page)) {
+			if (PageTransHuge(page)) {
+				/* Range starts in the middle of THP */
+				if (start > page->index) {
+					index += HPAGE_PMD_NR;
+					page += start - page->index;
+					while (start++ < index)
+						clear_highpage(page++);
+					unlock_page(page - 1);
+					continue;
+				}
+
+				/* Range ends in the middle of THP */
 				if (index == round_down(end, HPAGE_PMD_NR)) {
-					/*
-					 * Range ends in the middle of THP:
-					 * zero out the page
-					 */
-					clear_highpage(page);
+					while (index++ < end)
+						clear_highpage(page++);
 					unlock_page(page);
 					continue;
 				}
-				index += HPAGE_PMD_NR - 1;
-				i += HPAGE_PMD_NR - 1;
 			}
 
 			if (!unfalloc || !PageUptodate(page)) {
@@ -917,15 +896,18 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 				} else {
 					/* Page was replaced by swap: retry */
 					unlock_page(page);
-					index--;
-					break;
+					goto retry;
 				}
 			}
 			unlock_page(page);
 		}
 		pagevec_remove_exceptionals(&pvec);
+		index += pvec.nr ? hpage_nr_pages(pvec.pages[pvec.nr - 1]) : 1;
+		pagevec_release(&pvec);
+		continue;
+retry:
+		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		index++;
 	}
 
 	spin_lock_irq(&info->lock);
@@ -1762,8 +1744,7 @@ alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
 				PageTransHuge(page));
 		if (error)
 			goto unacct;
-		error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
-				compound_order(page));
+		error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, hindex,
 							NULL);
@@ -1837,7 +1818,7 @@ alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
 		error = -EINVAL;
 		goto unlock;
 	}
-	*pagep = page + index - hindex;
+	*pagep = find_subpage(page, index);
 	return 0;
 
 	/*
diff --git a/mm/truncate.c b/mm/truncate.c
index dd7b24e083c5..3a1a1c1a654e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -524,16 +524,13 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 
 			WARN_ON(page_to_index(page) != index);
 
-			/* Middle of THP: skip */
-			if (PageTransTail(page)) {
+			/* Is 'start' or 'end' in the middle of THP ? */
+			if (PageTransHuge(page) &&
+				(start > index ||
+				 (index ==  round_down(end, HPAGE_PMD_NR)))) {
+				/* skip */
 				unlock_page(page);
 				continue;
-			} else if (PageTransHuge(page)) {
-				index += HPAGE_PMD_NR - 1;
-				i += HPAGE_PMD_NR - 1;
-				/* 'end' is in the middle of THP */
-				if (index ==  round_down(end, HPAGE_PMD_NR))
-					continue;
 			}
 
 			ret = invalidate_inode_page(page);
@@ -547,9 +544,9 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 			count += ret;
 		}
 		pagevec_remove_exceptionals(&pvec);
+		index += pvec.nr ? hpage_nr_pages(pvec.pages[pvec.nr - 1]) : 1;
 		pagevec_release(&pvec);
 		cond_resched();
-		index++;
 	}
 	return count;
 }
-- 
2.11.0

  reply index

Thread overview: 67+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-01-26 11:57 [PATCHv6 00/37] ext4: support of huge pages Kirill A. Shutemov
2017-01-26 11:57 ` Kirill A. Shutemov [this message]
2017-02-09  3:57   ` [PATCHv6 01/37] mm, shmem: swich huge tmpfs to multi-order radix-tree entries Matthew Wilcox
2017-02-09 16:58     ` Kirill A. Shutemov
2017-02-13 13:43       ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 02/37] Revert "radix-tree: implement radix_tree_maybe_preload_order()" Kirill A. Shutemov
2017-01-26 15:38   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 03/37] page-flags: relax page flag policy for few flags Kirill A. Shutemov
2017-02-09  4:01   ` Matthew Wilcox
2017-02-13 13:59     ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 04/37] mm, rmap: account file thp pages Kirill A. Shutemov
2017-02-09 20:17   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 05/37] thp: try to free page's buffers before attempt split Kirill A. Shutemov
2017-02-09 20:14   ` Matthew Wilcox
2017-02-13 14:32     ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 06/37] thp: handle write-protection faults for file THP Kirill A. Shutemov
2017-01-26 15:44   ` Matthew Wilcox
2017-01-26 15:57     ` Kirill A. Shutemov
2017-02-09 20:19   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 07/37] filemap: allocate huge page in page_cache_read(), if allowed Kirill A. Shutemov
2017-02-09 21:18   ` Matthew Wilcox
2017-02-13 15:17     ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 08/37] filemap: handle huge pages in do_generic_file_read() Kirill A. Shutemov
2017-02-09 21:55   ` Matthew Wilcox
2017-02-13 15:33     ` Kirill A. Shutemov
2017-02-13 16:01       ` Matthew Wilcox
2017-02-13 16:09         ` Matthew Wilcox
2017-02-13 16:28   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 09/37] filemap: allocate huge page in pagecache_get_page(), if allowed Kirill A. Shutemov
2017-02-09 21:59   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 10/37] filemap: handle huge pages in filemap_fdatawait_range() Kirill A. Shutemov
2017-02-09 23:03   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 11/37] HACK: readahead: alloc huge pages, if allowed Kirill A. Shutemov
2017-02-09 23:34   ` Matthew Wilcox
2017-02-10  0:23     ` Andreas Dilger
2017-02-10 14:51       ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 12/37] brd: make it handle huge pages Kirill A. Shutemov
2017-02-10 17:24   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 13/37] mm: make write_cache_pages() work on " Kirill A. Shutemov
2017-02-10 17:46   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 14/37] thp: introduce hpage_size() and hpage_mask() Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 15/37] thp: do not threat slab pages as huge in hpage_{nr_pages,size,mask} Kirill A. Shutemov
2017-02-10 22:13   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 16/37] thp: make thp_get_unmapped_area() respect S_HUGE_MODE Kirill A. Shutemov
2017-02-10 17:50   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 17/37] fs: make block_read_full_page() be able to read huge page Kirill A. Shutemov
2017-02-10 17:58   ` Matthew Wilcox
2017-01-26 11:58 ` [PATCHv6 18/37] fs: make block_write_{begin,end}() be able to handle huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 19/37] fs: make block_page_mkwrite() aware about " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 20/37] truncate: make truncate_inode_pages_range() " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 21/37] truncate: make invalidate_inode_pages2_range() " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 22/37] mm, hugetlb: switch hugetlbfs to multi-order radix-tree entries Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 23/37] mm: account huge pages to dirty, writaback, reclaimable, etc Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 24/37] ext4: make ext4_mpage_readpages() hugepage-aware Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 25/37] ext4: make ext4_writepage() work on huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 26/37] ext4: handle huge pages in ext4_page_mkwrite() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 27/37] ext4: handle huge pages in __ext4_block_zero_page_range() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 28/37] ext4: make ext4_block_write_begin() aware about huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 29/37] ext4: handle huge pages in ext4_da_write_end() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 30/37] ext4: make ext4_da_page_release_reservation() aware about huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 31/37] ext4: handle writeback with " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 32/37] ext4: make EXT4_IOC_MOVE_EXT work " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 33/37] ext4: fix SEEK_DATA/SEEK_HOLE for " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 34/37] ext4: make fallocate() operations work with " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 35/37] ext4: reserve larger jounral transaction for " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 36/37] mm, fs, ext4: expand use of page_mapping() and page_to_pgoff() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 37/37] ext4, vfs: add huge= mount option Kirill A. Shutemov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170126115819.58875-2-kirill.shutemov@linux.intel.com \
    --to=kirill.shutemov@linux.intel.com \
    --cc=aarcange@redhat.com \
    --cc=adilger.kernel@dilger.ca \
    --cc=akpm@linux-foundation.org \
    --cc=dave.hansen@intel.com \
    --cc=hughd@google.com \
    --cc=jack@suse.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ross.zwisler@linux.intel.com \
    --cc=tytso@mit.edu \
    --cc=vbabka@suse.cz \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git