From: Zi Yan <ziy@nvidia.com>
To: Matthew Wilcox <willy@infradead.org>
Cc: <linux-mm@kvack.org>, <linux-fsdevel@vger.kernel.org>
Subject: Re: [PATCH 02/19] mm: Use multi-index entries in the page cache
Date: Thu, 29 Oct 2020 16:49:39 -0400 [thread overview]
Message-ID: <4D931CDD-2CB1-4129-974C-12255156154E@nvidia.com> (raw)
In-Reply-To: <20201029193405.29125-3-willy@infradead.org>
[-- Attachment #1: Type: text/plain, Size: 9351 bytes --]
On 29 Oct 2020, at 15:33, Matthew Wilcox (Oracle) wrote:
> We currently store order-N THPs as 2^N consecutive entries. While this
> consumes rather more memory than necessary, it also turns out to be buggy.
> A writeback operation which starts in the middle of a dirty THP will not
> notice as the dirty bit is only set on the head index. With multi-index
> entries, the dirty bit will be found no matter where in the THP the
> iteration starts.
A multi-index entry can point to a THP of any size, and the code relies
on thp_last_tail() to check whether it has finished processing the page
pointed to by the entry. Is that how this change works?
>
> This does end up simplifying the page cache slightly, although not as
> much as I had hoped.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
> include/linux/pagemap.h | 10 -------
> mm/filemap.c | 62 ++++++++++++++++++++++++-----------------
> mm/huge_memory.c | 19 ++++++++++---
> mm/khugepaged.c | 12 +++++++-
> mm/migrate.c | 8 ------
> mm/shmem.c | 11 ++------
> 6 files changed, 65 insertions(+), 57 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 62b759f92e36..00288ed24698 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -912,16 +912,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
> VM_BUG_ON_PAGE(PageTail(page), page);
> array[i++] = page;
> rac->_batch_count += thp_nr_pages(page);
> -
> - /*
> - * The page cache isn't using multi-index entries yet,
> - * so the xas cursor needs to be manually moved to the
> - * next index. This can be removed once the page cache
> - * is converted.
> - */
> - if (PageHead(page))
> - xas_set(&xas, rac->_index + rac->_batch_count);
> -
> if (i == array_sz)
> break;
> }
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 5c4db536fff4..8537ee86f99f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -127,13 +127,12 @@ static void page_cache_delete(struct address_space *mapping,
>
> /* hugetlb pages are represented by a single entry in the xarray */
> if (!PageHuge(page)) {
> - xas_set_order(&xas, page->index, compound_order(page));
> - nr = compound_nr(page);
> + xas_set_order(&xas, page->index, thp_order(page));
> + nr = thp_nr_pages(page);
> }
>
> VM_BUG_ON_PAGE(!PageLocked(page), page);
> VM_BUG_ON_PAGE(PageTail(page), page);
> - VM_BUG_ON_PAGE(nr != 1 && shadow, page);
>
> xas_store(&xas, shadow);
> xas_init_marks(&xas);
> @@ -311,19 +310,12 @@ static void page_cache_delete_batch(struct address_space *mapping,
>
> WARN_ON_ONCE(!PageLocked(page));
>
> - if (page->index == xas.xa_index)
> - page->mapping = NULL;
> + page->mapping = NULL;
> /* Leave page->index set: truncation lookup relies on it */
>
> - /*
> - * Move to the next page in the vector if this is a regular
> - * page or the index is of the last sub-page of this compound
> - * page.
> - */
> - if (page->index + compound_nr(page) - 1 == xas.xa_index)
> - i++;
> + i++;
> xas_store(&xas, NULL);
> - total_pages++;
> + total_pages += thp_nr_pages(page);
> }
> mapping->nrpages -= total_pages;
> }
> @@ -1956,20 +1948,24 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
> indices[pvec->nr] = xas.xa_index;
> if (!pagevec_add(pvec, page))
> break;
> - goto next;
> + continue;
> unlock:
> unlock_page(page);
> put:
> put_page(page);
> -next:
> - if (!xa_is_value(page) && PageTransHuge(page))
> - xas_set(&xas, page->index + thp_nr_pages(page));
> }
> rcu_read_unlock();
>
> return pagevec_count(pvec);
> }
>
> +static inline bool thp_last_tail(struct page *head, pgoff_t index)
> +{
> + if (!PageTransCompound(head) || PageHuge(head))
> + return true;
> + return index == head->index + thp_nr_pages(head) - 1;
> +}
> +
> /**
> * find_get_pages_range - gang pagecache lookup
> * @mapping: The address_space to search
> @@ -2008,11 +2004,17 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
> if (xa_is_value(page))
> continue;
>
> +again:
> pages[ret] = find_subpage(page, xas.xa_index);
> if (++ret == nr_pages) {
> *start = xas.xa_index + 1;
> goto out;
> }
> + if (!thp_last_tail(page, xas.xa_index)) {
> + xas.xa_index++;
> + page_ref_inc(page);
> + goto again;
> + }
> }
>
> /*
> @@ -3018,6 +3020,12 @@ void filemap_map_pages(struct vm_fault *vmf,
> struct page *head, *page;
> unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
>
> + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> + if (max_idx == 0)
> + return;
> + if (end_pgoff >= max_idx)
> + end_pgoff = max_idx - 1;
> +
> rcu_read_lock();
> xas_for_each(&xas, head, end_pgoff) {
> if (xas_retry(&xas, head))
> @@ -3037,20 +3045,16 @@ void filemap_map_pages(struct vm_fault *vmf,
> /* Has the page moved or been split? */
> if (unlikely(head != xas_reload(&xas)))
> goto skip;
> - page = find_subpage(head, xas.xa_index);
> -
> - if (!PageUptodate(head) ||
> - PageReadahead(page) ||
> - PageHWPoison(page))
> + if (!PageUptodate(head) || PageReadahead(head))
> goto skip;
> if (!trylock_page(head))
> goto skip;
> -
> if (head->mapping != mapping || !PageUptodate(head))
> goto unlock;
>
> - max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> - if (xas.xa_index >= max_idx)
> + page = find_subpage(head, xas.xa_index);
> +again:
> + if (PageHWPoison(page))
> goto unlock;
>
> if (mmap_miss > 0)
> @@ -3062,6 +3066,14 @@ void filemap_map_pages(struct vm_fault *vmf,
> last_pgoff = xas.xa_index;
> if (alloc_set_pte(vmf, page))
> goto unlock;
> + if (!thp_last_tail(head, xas.xa_index)) {
> + xas.xa_index++;
> + page++;
> + page_ref_inc(head);
> + if (xas.xa_index >= end_pgoff)
> + goto unlock;
> + goto again;
> + }
> unlock_page(head);
> goto next;
> unlock:
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f99167d74cbc..0e900e594e77 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2626,6 +2626,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> struct page *head = compound_head(page);
> struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
> struct deferred_split *ds_queue = get_deferred_split_queue(head);
> + XA_STATE(xas, &head->mapping->i_pages, head->index);
> struct anon_vma *anon_vma = NULL;
> struct address_space *mapping = NULL;
> int count, mapcount, extra_pins, ret;
> @@ -2690,19 +2691,28 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> unmap_page(head);
> VM_BUG_ON_PAGE(compound_mapcount(head), head);
>
> + if (mapping) {
> + xas_split_alloc(&xas, head, thp_order(head),
> + mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
> + if (xas_error(&xas)) {
> + ret = xas_error(&xas);
> + goto out_unlock;
> + }
> + }
> +
> /* prevent PageLRU to go away from under us, and freeze lru stats */
> spin_lock_irqsave(&pgdata->lru_lock, flags);
>
> if (mapping) {
> - XA_STATE(xas, &mapping->i_pages, page_index(head));
> -
> /*
> * Check if the head page is present in page cache.
> * We assume all tail are present too, if head is there.
> */
> - xa_lock(&mapping->i_pages);
> + xas_lock(&xas);
> + xas_reset(&xas);
> if (xas_load(&xas) != head)
> goto fail;
> + xas_split(&xas, head, thp_order(head));
> }
>
> /* Prevent deferred_split_scan() touching ->_refcount */
> @@ -2735,7 +2745,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> }
> spin_unlock(&ds_queue->split_queue_lock);
> fail: if (mapping)
> - xa_unlock(&mapping->i_pages);
> + xas_unlock(&xas);
> spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> remap_page(head, thp_nr_pages(head));
> ret = -EBUSY;
> @@ -2749,6 +2759,7 @@ fail: if (mapping)
> if (mapping)
> i_mmap_unlock_read(mapping);
> out:
> + xas_destroy(&xas);
> count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> return ret;
> }
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 2cb93aa8bf84..230e62a92ae7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1645,7 +1645,10 @@ static void collapse_file(struct mm_struct *mm,
> }
> count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
>
> - /* This will be less messy when we use multi-index entries */
> + /*
> + * Ensure we have slots for all the pages in the range. This is
> + * almost certainly a no-op because most of the pages must be present
> + */
> do {
> xas_lock_irq(&xas);
> xas_create_range(&xas);
> @@ -1851,6 +1854,9 @@ static void collapse_file(struct mm_struct *mm,
> __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> }
>
> + /* Join all the small entries into a single multi-index entry */
> + xas_set_order(&xas, start, HPAGE_PMD_ORDER);
Would thp_order(new_page) be better than HPAGE_PMD_ORDER?
—
Best Regards,
Yan Zi
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]
next prev parent reply other threads:[~2020-10-29 20:49 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-29 19:33 [PATCH 00/19] Transparent Hugepages for non-tmpfs filesystems Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 01/19] XArray: Expose xas_destroy Matthew Wilcox (Oracle)
2020-10-29 20:33 ` Zi Yan
2020-10-29 19:33 ` [PATCH 02/19] mm: Use multi-index entries in the page cache Matthew Wilcox (Oracle)
2020-10-29 20:49 ` Zi Yan [this message]
2020-10-29 21:54 ` Matthew Wilcox
2020-10-30 14:48 ` Zi Yan
2020-11-03 14:04 ` Kirill A. Shutemov
2020-10-29 19:33 ` [PATCH 03/19] mm: Support arbitrary THP sizes Matthew Wilcox (Oracle)
2020-10-29 20:50 ` Zi Yan
2020-10-29 19:33 ` [PATCH 04/19] mm: Change NR_FILE_THPS to account in base pages Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 05/19] mm/filemap: Rename generic_file_buffered_read subfunctions Matthew Wilcox (Oracle)
2020-10-30 0:04 ` Kent Overstreet
2020-10-30 8:56 ` Christoph Hellwig
2020-11-03 14:16 ` Kirill A. Shutemov
2020-11-03 14:40 ` Matthew Wilcox
2020-11-03 15:02 ` Kirill A. Shutemov
2020-10-29 19:33 ` [PATCH 06/19] mm/filemap: Change calling convention for gfbr_ functions Matthew Wilcox (Oracle)
2020-10-30 0:05 ` Kent Overstreet
2020-10-29 19:33 ` [PATCH 07/19] mm/filemap: Use head pages in generic_file_buffered_read Matthew Wilcox (Oracle)
2020-10-30 0:19 ` Kent Overstreet
2020-10-30 1:03 ` Matthew Wilcox
2020-10-29 19:33 ` [PATCH 08/19] mm/filemap: Add __page_cache_alloc_order Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 09/19] mm/filemap: Allow THPs to be added to the page cache Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 10/19] mm/vmscan: Optimise shrink_page_list for smaller THPs Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 11/19] mm/filemap: Allow PageReadahead to be set on head pages Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 12/19] mm: Pass a sleep state to put_and_wait_on_page_locked Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 13/19] mm/filemap: Support readpage splitting a page Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 14/19] mm/filemap: Inline __wait_on_page_locked_async into caller Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 15/19] mm/readahead: Add THP readahead Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 16/19] mm/readahead: Align THP mappings for non-DAX Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 17/19] mm/readahead: Switch to page_cache_ra_order Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 18/19] mm/filemap: Support VM_HUGEPAGE for file mappings Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 19/19] selftests/vm/transhuge-stress: Support file-backed THPs Matthew Wilcox (Oracle)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4D931CDD-2CB1-4129-974C-12255156154E@nvidia.com \
--to=ziy@nvidia.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).