From: Zi Yan <ziy@nvidia.com>
To: Matthew Wilcox <willy@infradead.org>
Cc: <linux-mm@kvack.org>, <linux-fsdevel@vger.kernel.org>
Subject: Re: [PATCH 02/19] mm: Use multi-index entries in the page cache
Date: Thu, 29 Oct 2020 16:49:39 -0400
Message-ID: <4D931CDD-2CB1-4129-974C-12255156154E@nvidia.com>
In-Reply-To: <20201029193405.29125-3-willy@infradead.org>

On 29 Oct 2020, at 15:33, Matthew Wilcox (Oracle) wrote:

> We currently store order-N THPs as 2^N consecutive entries.  While this
> consumes rather more memory than necessary, it also turns out to be buggy.
> A writeback operation which starts in the middle of a dirty THP will not
> notice as the dirty bit is only set on the head index.  With multi-index
> entries, the dirty bit will be found no matter where in the THP the
> iteration starts.

A multi-index entry can point to a THP of any size, and the code relies
on thp_last_tail() to check whether it has finished processing the page
pointed to by the entry. Is that how this change works?
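
To make sure I follow, here is a toy sketch of how I understand the
multi-index storage (purely illustrative, my own fragment, locking and
error handling omitted): one xas_store() after xas_set_order() covers
the whole index range of the THP, so a lookup or a marked walk starting
at any sub-index finds the same entry and its marks:

    struct page *head;
    XA_STATE(xas, &mapping->i_pages, page->index);

    /* One entry covering page->index .. page->index + 3 (order 2). */
    xas_set_order(&xas, page->index, 2);
    xas_store(&xas, page);
    xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);

    /* Any index in the range resolves to the same entry, so writeback
     * starting in the middle of the THP still sees the dirty mark. */
    head = xa_load(&mapping->i_pages, page->index + 2);    /* == page */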

>
> This does end up simplifying the page cache slightly, although not as
> much as I had hoped.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/pagemap.h | 10 -------
>  mm/filemap.c            | 62 ++++++++++++++++++++++++-----------------
>  mm/huge_memory.c        | 19 ++++++++++---
>  mm/khugepaged.c         | 12 +++++++-
>  mm/migrate.c            |  8 ------
>  mm/shmem.c              | 11 ++------
>  6 files changed, 65 insertions(+), 57 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 62b759f92e36..00288ed24698 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -912,16 +912,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
>  		VM_BUG_ON_PAGE(PageTail(page), page);
>  		array[i++] = page;
>  		rac->_batch_count += thp_nr_pages(page);
> -
> -		/*
> -		 * The page cache isn't using multi-index entries yet,
> -		 * so the xas cursor needs to be manually moved to the
> -		 * next index.  This can be removed once the page cache
> -		 * is converted.
> -		 */
> -		if (PageHead(page))
> -			xas_set(&xas, rac->_index + rac->_batch_count);
> -
>  		if (i == array_sz)
>  			break;
>  	}
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 5c4db536fff4..8537ee86f99f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -127,13 +127,12 @@ static void page_cache_delete(struct address_space *mapping,
>
>  	/* hugetlb pages are represented by a single entry in the xarray */
>  	if (!PageHuge(page)) {
> -		xas_set_order(&xas, page->index, compound_order(page));
> -		nr = compound_nr(page);
> +		xas_set_order(&xas, page->index, thp_order(page));
> +		nr = thp_nr_pages(page);
>  	}
>
>  	VM_BUG_ON_PAGE(!PageLocked(page), page);
>  	VM_BUG_ON_PAGE(PageTail(page), page);
> -	VM_BUG_ON_PAGE(nr != 1 && shadow, page);
>
>  	xas_store(&xas, shadow);
>  	xas_init_marks(&xas);
> @@ -311,19 +310,12 @@ static void page_cache_delete_batch(struct address_space *mapping,
>
>  		WARN_ON_ONCE(!PageLocked(page));
>
> -		if (page->index == xas.xa_index)
> -			page->mapping = NULL;
> +		page->mapping = NULL;
>  		/* Leave page->index set: truncation lookup relies on it */
>
> -		/*
> -		 * Move to the next page in the vector if this is a regular
> -		 * page or the index is of the last sub-page of this compound
> -		 * page.
> -		 */
> -		if (page->index + compound_nr(page) - 1 == xas.xa_index)
> -			i++;
> +		i++;
>  		xas_store(&xas, NULL);
> -		total_pages++;
> +		total_pages += thp_nr_pages(page);
>  	}
>  	mapping->nrpages -= total_pages;
>  }
> @@ -1956,20 +1948,24 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
>  		indices[pvec->nr] = xas.xa_index;
>  		if (!pagevec_add(pvec, page))
>  			break;
> -		goto next;
> +		continue;
>  unlock:
>  		unlock_page(page);
>  put:
>  		put_page(page);
> -next:
> -		if (!xa_is_value(page) && PageTransHuge(page))
> -			xas_set(&xas, page->index + thp_nr_pages(page));
>  	}
>  	rcu_read_unlock();
>
>  	return pagevec_count(pvec);
>  }
>
> +static inline bool thp_last_tail(struct page *head, pgoff_t index)
> +{
> +	if (!PageTransCompound(head) || PageHuge(head))
> +		return true;
> +	return index == head->index + thp_nr_pages(head) - 1;
> +}
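
Trying a concrete case to check the helper (my own worked example, not
from the patch): for an order-2 head at index 8, so thp_nr_pages() == 4,
I would expect:

    /* head->index == 8, thp_nr_pages(head) == 4 */
    thp_last_tail(head, 8);     /* false */
    thp_last_tail(head, 9);     /* false */
    thp_last_tail(head, 10);    /* false */
    thp_last_tail(head, 11);    /* true: last subpage, callers stop advancing */

    /* Order-0 and hugetlb pages return true immediately. */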
> +
>  /**
>   * find_get_pages_range - gang pagecache lookup
>   * @mapping:	The address_space to search
> @@ -2008,11 +2004,17 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
>  		if (xa_is_value(page))
>  			continue;
>
> +again:
>  		pages[ret] = find_subpage(page, xas.xa_index);
>  		if (++ret == nr_pages) {
>  			*start = xas.xa_index + 1;
>  			goto out;
>  		}
> +		if (!thp_last_tail(page, xas.xa_index)) {
> +			xas.xa_index++;
> +			page_ref_inc(page);
> +			goto again;
> +		}
>  	}
>
>  	/*
> @@ -3018,6 +3020,12 @@ void filemap_map_pages(struct vm_fault *vmf,
>  	struct page *head, *page;
>  	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
>
> +	max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> +	if (max_idx == 0)
> +		return;
> +	if (end_pgoff >= max_idx)
> +		end_pgoff = max_idx - 1;
> +
>  	rcu_read_lock();
>  	xas_for_each(&xas, head, end_pgoff) {
>  		if (xas_retry(&xas, head))
> @@ -3037,20 +3045,16 @@ void filemap_map_pages(struct vm_fault *vmf,
>  		/* Has the page moved or been split? */
>  		if (unlikely(head != xas_reload(&xas)))
>  			goto skip;
> -		page = find_subpage(head, xas.xa_index);
> -
> -		if (!PageUptodate(head) ||
> -				PageReadahead(page) ||
> -				PageHWPoison(page))
> +		if (!PageUptodate(head) || PageReadahead(head))
>  			goto skip;
>  		if (!trylock_page(head))
>  			goto skip;
> -
>  		if (head->mapping != mapping || !PageUptodate(head))
>  			goto unlock;
>
> -		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> -		if (xas.xa_index >= max_idx)
> +		page = find_subpage(head, xas.xa_index);
> +again:
> +		if (PageHWPoison(page))
>  			goto unlock;
>
>  		if (mmap_miss > 0)
> @@ -3062,6 +3066,14 @@ void filemap_map_pages(struct vm_fault *vmf,
>  		last_pgoff = xas.xa_index;
>  		if (alloc_set_pte(vmf, page))
>  			goto unlock;
> +		if (!thp_last_tail(head, xas.xa_index)) {
> +			xas.xa_index++;
> +			page++;
> +			page_ref_inc(head);
> +			if (xas.xa_index >= end_pgoff)
> +				goto unlock;
> +			goto again;
> +		}
>  		unlock_page(head);
>  		goto next;
>  unlock:
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f99167d74cbc..0e900e594e77 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2626,6 +2626,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  	struct page *head = compound_head(page);
>  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
>  	struct deferred_split *ds_queue = get_deferred_split_queue(head);
> +	XA_STATE(xas, &head->mapping->i_pages, head->index);
>  	struct anon_vma *anon_vma = NULL;
>  	struct address_space *mapping = NULL;
>  	int count, mapcount, extra_pins, ret;
> @@ -2690,19 +2691,28 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  	unmap_page(head);
>  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
>
> +	if (mapping) {
> +		xas_split_alloc(&xas, head, thp_order(head),
> +				mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
> +		if (xas_error(&xas)) {
> +			ret = xas_error(&xas);
> +			goto out_unlock;
> +		}
> +	}
> +
>  	/* prevent PageLRU to go away from under us, and freeze lru stats */
>  	spin_lock_irqsave(&pgdata->lru_lock, flags);
>
>  	if (mapping) {
> -		XA_STATE(xas, &mapping->i_pages, page_index(head));
> -
>  		/*
>  		 * Check if the head page is present in page cache.
>  		 * We assume all tail are present too, if head is there.
>  		 */
> -		xa_lock(&mapping->i_pages);
> +		xas_lock(&xas);
> +		xas_reset(&xas);
>  		if (xas_load(&xas) != head)
>  			goto fail;
> +		xas_split(&xas, head, thp_order(head));
>  	}
>
>  	/* Prevent deferred_split_scan() touching ->_refcount */
> @@ -2735,7 +2745,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  		}
>  		spin_unlock(&ds_queue->split_queue_lock);
>  fail:		if (mapping)
> -			xa_unlock(&mapping->i_pages);
> +			xas_unlock(&xas);
>  		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
>  		remap_page(head, thp_nr_pages(head));
>  		ret = -EBUSY;
> @@ -2749,6 +2759,7 @@ fail:		if (mapping)
>  	if (mapping)
>  		i_mmap_unlock_read(mapping);
>  out:
> +	xas_destroy(&xas);
>  	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
>  	return ret;
>  }
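
The split ordering looks right to me: the xa_node allocations happen via
xas_split_alloc() before lru_lock/xas_lock are taken, since they can
sleep, and the actual split is then done under the lock. A condensed
sketch of the pattern as I read it (illustrative only; GFP flags
simplified, error paths and the THP bookkeeping trimmed):

    XA_STATE(xas, &mapping->i_pages, head->index);

    /* May sleep: preallocate the nodes needed to later break the
     * multi-index entry for head into individual order-0 entries. */
    xas_split_alloc(&xas, head, thp_order(head), GFP_KERNEL);

    xas_lock(&xas);
    if (xas_load(&xas) == head)
        xas_split(&xas, head, thp_order(head));  /* uses the preallocation */
    xas_unlock(&xas);

    xas_destroy(&xas);  /* frees any unused preallocated nodes */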
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 2cb93aa8bf84..230e62a92ae7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1645,7 +1645,10 @@ static void collapse_file(struct mm_struct *mm,
>  	}
>  	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
>
> -	/* This will be less messy when we use multi-index entries */
> +	/*
> +	 * Ensure we have slots for all the pages in the range.  This is
> +	 * almost certainly a no-op because most of the pages must be present
> +	 */
>  	do {
>  		xas_lock_irq(&xas);
>  		xas_create_range(&xas);
> @@ -1851,6 +1854,9 @@ static void collapse_file(struct mm_struct *mm,
>  			__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
>  	}
>
> +	/* Join all the small entries into a single multi-index entry */
> +	xas_set_order(&xas, start, HPAGE_PMD_ORDER);

Would thp_order(new_page) be better than HPAGE_PMD_ORDER?
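
To make the question concrete (purely illustrative): the two should be
equivalent today, since khugepaged only collapses to PMD-sized pages,
but spelling out the page's own order would keep the join correct if
collapse ever produces other sizes:

    /* Equivalent today: new_page is allocated at HPAGE_PMD_ORDER. */
    xas_set_order(&xas, start, HPAGE_PMD_ORDER);        /* as in the patch */
    xas_set_order(&xas, start, thp_order(new_page));    /* what I am suggesting */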


—
Best Regards,
Yan Zi

