From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: Matthew Wilcox <willy@infradead.org>
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 10/15] mm: Allow find_get_page to be used for large pages
Date: Tue, 1 Oct 2019 13:32:45 +0300	[thread overview]
Message-ID: <20191001103245.zmgbtoer4f7ytxg2@box> (raw)
In-Reply-To: <20190925005214.27240-11-willy@infradead.org>

On Tue, Sep 24, 2019 at 05:52:09PM -0700, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> Add FGP_PMD to indicate that we're trying to find-or-create a page that
> is at least PMD_ORDER in size.  The internal 'conflict' entry usage
> is modelled after that in DAX, but the implementations are different
> due to DAX using multi-order entries and the page cache using multiple
> order-0 entries.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/pagemap.h | 13 ++++++
>  mm/filemap.c            | 99 +++++++++++++++++++++++++++++++++++------
>  2 files changed, 99 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index d610a49be571..d6d97f9fb762 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -248,6 +248,19 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
>  #define FGP_NOFS		0x00000010
>  #define FGP_NOWAIT		0x00000020
>  #define FGP_FOR_MMAP		0x00000040
> +/*
> + * If you add more flags, increment FGP_ORDER_SHIFT (no further than 25).
> + * Do not insert flags above the FGP order bits.
> + */
> +#define FGP_ORDER_SHIFT		7
> +#define FGP_PMD			((PMD_SHIFT - PAGE_SHIFT) << FGP_ORDER_SHIFT)
> +#define FGP_PUD			((PUD_SHIFT - PAGE_SHIFT) << FGP_ORDER_SHIFT)
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +#define fgp_order(fgp)		((fgp) >> FGP_ORDER_SHIFT)
> +#else
> +#define fgp_order(fgp)		0
> +#endif
>  
>  struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
>  		int fgp_flags, gfp_t cache_gfp_mask);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index afe8f5d95810..8eca91547e40 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1576,7 +1576,71 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
>  
>  	return page;
>  }
> -EXPORT_SYMBOL(find_get_entry);
> +
> +static bool pagecache_is_conflict(struct page *page)
> +{
> +	return page == XA_RETRY_ENTRY;
> +}
> +
> +/**
> + * __find_get_page - Find and get a page cache entry.
> + * @mapping: The address_space to search.
> + * @offset: The page cache index.
> + * @order: The minimum order of the entry to return.
> + *
> + * Looks up the page cache entries at @mapping between @offset and
> + * @offset + 2^@order.  If there is a page cache page, it is returned with
> + * an increased refcount unless it is smaller than @order.
> + *
> + * If the slot holds a shadow entry of a previously evicted page, or a
> + * swap entry from shmem/tmpfs, it is returned.
> + *
> + * Return: the found page, a value indicating a conflicting page or %NULL if
> + * there are no pages in this range.
> + */
> +static struct page *__find_get_page(struct address_space *mapping,
> +		unsigned long offset, unsigned int order)
> +{
> +	XA_STATE(xas, &mapping->i_pages, offset);
> +	struct page *page;
> +
> +	rcu_read_lock();
> +repeat:
> +	xas_reset(&xas);
> +	page = xas_find(&xas, offset | ((1UL << order) - 1));
> +	if (xas_retry(&xas, page))
> +		goto repeat;
> +	/*
> +	 * A shadow entry of a recently evicted page, or a swap entry from
> +	 * shmem/tmpfs.  Skip it; keep looking for pages.
> +	 */
> +	if (xa_is_value(page))
> +		goto repeat;
> +	if (!page)
> +		goto out;
> +	if (compound_order(page) < order) {
> +		page = XA_RETRY_ENTRY;
> +		goto out;
> +	}
> +
> +	if (!page_cache_get_speculative(page))
> +		goto repeat;
> +
> +	/*
> +	 * Has the page moved or been split?
> +	 * This is part of the lockless pagecache protocol. See
> +	 * include/linux/pagemap.h for details.
> +	 */
> +	if (unlikely(page != xas_reload(&xas))) {
> +		put_page(page);
> +		goto repeat;
> +	}

You need to re-check compound_order() after obtaining a reference to the
page. Otherwise the page could be split under you.

> +	page = find_subpage(page, offset);
> +out:
> +	rcu_read_unlock();
> +
> +	return page;
> +}
>  
>  /**
>   * find_lock_entry - locate, pin and lock a page cache entry
> @@ -1618,12 +1682,12 @@ EXPORT_SYMBOL(find_lock_entry);
>   * pagecache_get_page - find and get a page reference
>   * @mapping: the address_space to search
>   * @offset: the page index
> - * @fgp_flags: PCG flags
> + * @fgp_flags: FGP flags
>   * @gfp_mask: gfp mask to use for the page cache data page allocation
>   *
>   * Looks up the page cache slot at @mapping & @offset.
>   *
> - * PCG flags modify how the page is returned.
> + * FGP flags modify how the page is returned.
>   *
>   * @fgp_flags can be:
>   *
> @@ -1636,6 +1700,10 @@ EXPORT_SYMBOL(find_lock_entry);
>   * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
>   *   its own locking dance if the page is already in cache, or unlock the page
>   *   before returning if we had to add the page to pagecache.
> + * - FGP_PMD: We're only interested in pages at PMD granularity.  If there
> + *   is no page here (and FGP_CREAT is set), we'll create one large enough.
> + *   If there is a smaller page in the cache that overlaps the PMD page, we
> + *   return %NULL and do not attempt to create a page.

I still think it's a suboptimal interface. It's okay to ask for a PMD
page, but if there's already a small page in the cache, the caller should
be handed it and deal with it itself. Otherwise the caller will do one
additional xarray lookup on the fallback path for no real reason.
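
Something along these lines is what I have in mind (a sketch only: the
bool *conflict out-parameter is made up, and it also folds in the
compound_order() re-check noted above). On conflict, return the small
page itself with a reference held instead of a token:

static struct page *__find_get_page(struct address_space *mapping,
		unsigned long offset, unsigned int order, bool *conflict)
{
	XA_STATE(xas, &mapping->i_pages, offset);
	struct page *page;

	*conflict = false;
	rcu_read_lock();
repeat:
	xas_reset(&xas);
	page = xas_find(&xas, offset | ((1UL << order) - 1));
	if (xas_retry(&xas, page))
		goto repeat;
	/* Skip shadow/swap entries, as in the original. */
	if (xa_is_value(page))
		goto repeat;
	if (!page)
		goto out;

	if (!page_cache_get_speculative(page))
		goto repeat;

	/* Page moved or split? See the lockless pagecache protocol. */
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		goto repeat;
	}

	/*
	 * Checked only after we hold a reference, so a concurrent
	 * split can't invalidate the answer. On conflict, hand back
	 * the undersized page rather than a token; the caller can use
	 * it on the fallback path without a second xarray walk.
	 */
	if (compound_order(page) < order)
		*conflict = true;
	else
		page = find_subpage(page, offset);
out:
	rcu_read_unlock();
	return page;
}

pagecache_get_page() could then test *conflict and reuse the page for its
order-0 fallback instead of returning NULL and re-walking the tree.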

>   *
>   * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
>   * if the GFP flags specified for FGP_CREAT are atomic.
> @@ -1649,10 +1717,11 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
>  {
>  	struct page *page;
>  
> +	BUILD_BUG_ON(((63 << FGP_ORDER_SHIFT) >> FGP_ORDER_SHIFT) != 63);
>  repeat:
> -	page = find_get_entry(mapping, offset);
> -	if (xa_is_value(page))
> -		page = NULL;
> +	page = __find_get_page(mapping, offset, fgp_order(fgp_flags));
> +	if (pagecache_is_conflict(page))
> +		return NULL;
>  	if (!page)
>  		goto no_page;
>  
> @@ -1686,7 +1755,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
>  		if (fgp_flags & FGP_NOFS)
>  			gfp_mask &= ~__GFP_FS;
>  
> -		page = __page_cache_alloc(gfp_mask);
> +		page = __page_cache_alloc_order(gfp_mask, fgp_order(fgp_flags));
>  		if (!page)
>  			return NULL;
>  
> @@ -1704,13 +1773,17 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
>  			if (err == -EEXIST)
>  				goto repeat;
>  		}
> +		if (page) {
> +			if (fgp_order(fgp_flags))
> +				count_vm_event(THP_FILE_ALLOC);
>  
> -		/*
> -		 * add_to_page_cache_lru locks the page, and for mmap we expect
> -		 * an unlocked page.
> -		 */
> -		if (page && (fgp_flags & FGP_FOR_MMAP))
> -			unlock_page(page);
> +			/*
> +			 * add_to_page_cache_lru locks the page, and
> +			 * for mmap we expect an unlocked page.
> +			 */
> +			if (fgp_flags & FGP_FOR_MMAP)
> +				unlock_page(page);
> +		}
>  	}
>  
>  	return page;
> -- 
> 2.23.0
> 

-- 
 Kirill A. Shutemov
