From: Zi Yan <ziy@nvidia.com>
To: Matthew Wilcox <willy@infradead.org>
Cc: <linux-mm@kvack.org>, <linux-fsdevel@vger.kernel.org>
Subject: Re: [PATCH 02/19] mm: Use multi-index entries in the page cache
Date: Thu, 29 Oct 2020 16:49:39 -0400 [thread overview]
Message-ID: <4D931CDD-2CB1-4129-974C-12255156154E@nvidia.com> (raw)
In-Reply-To: <20201029193405.29125-3-willy@infradead.org>
[-- Attachment #1: Type: text/plain, Size: 9351 bytes --]
On 29 Oct 2020, at 15:33, Matthew Wilcox (Oracle) wrote:
> We currently store order-N THPs as 2^N consecutive entries. While this
> consumes rather more memory than necessary, it also turns out to be buggy.
> A writeback operation which starts in the middle of a dirty THP will not
> notice as the dirty bit is only set on the head index. With multi-index
> entries, the dirty bit will be found no matter where in the THP the
> iteration starts.
A multi-index entry can point to a THP of any size, and the code relies
on thp_last_tail() to check whether it has finished processing the page
pointed to by the entry. Is that how this change works?
>
> This does end up simplifying the page cache slightly, although not as
> much as I had hoped.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
> include/linux/pagemap.h | 10 -------
> mm/filemap.c | 62 ++++++++++++++++++++++++-----------------
> mm/huge_memory.c | 19 ++++++++++---
> mm/khugepaged.c | 12 +++++++-
> mm/migrate.c | 8 ------
> mm/shmem.c | 11 ++------
> 6 files changed, 65 insertions(+), 57 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 62b759f92e36..00288ed24698 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -912,16 +912,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
> VM_BUG_ON_PAGE(PageTail(page), page);
> array[i++] = page;
> rac->_batch_count += thp_nr_pages(page);
> -
> - /*
> - * The page cache isn't using multi-index entries yet,
> - * so the xas cursor needs to be manually moved to the
> - * next index. This can be removed once the page cache
> - * is converted.
> - */
> - if (PageHead(page))
> - xas_set(&xas, rac->_index + rac->_batch_count);
> -
> if (i == array_sz)
> break;
> }
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 5c4db536fff4..8537ee86f99f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -127,13 +127,12 @@ static void page_cache_delete(struct address_space *mapping,
>
> /* hugetlb pages are represented by a single entry in the xarray */
> if (!PageHuge(page)) {
> - xas_set_order(&xas, page->index, compound_order(page));
> - nr = compound_nr(page);
> + xas_set_order(&xas, page->index, thp_order(page));
> + nr = thp_nr_pages(page);
> }
>
> VM_BUG_ON_PAGE(!PageLocked(page), page);
> VM_BUG_ON_PAGE(PageTail(page), page);
> - VM_BUG_ON_PAGE(nr != 1 && shadow, page);
>
> xas_store(&xas, shadow);
> xas_init_marks(&xas);
> @@ -311,19 +310,12 @@ static void page_cache_delete_batch(struct address_space *mapping,
>
> WARN_ON_ONCE(!PageLocked(page));
>
> - if (page->index == xas.xa_index)
> - page->mapping = NULL;
> + page->mapping = NULL;
> /* Leave page->index set: truncation lookup relies on it */
>
> - /*
> - * Move to the next page in the vector if this is a regular
> - * page or the index is of the last sub-page of this compound
> - * page.
> - */
> - if (page->index + compound_nr(page) - 1 == xas.xa_index)
> - i++;
> + i++;
> xas_store(&xas, NULL);
> - total_pages++;
> + total_pages += thp_nr_pages(page);
> }
> mapping->nrpages -= total_pages;
> }
> @@ -1956,20 +1948,24 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
> indices[pvec->nr] = xas.xa_index;
> if (!pagevec_add(pvec, page))
> break;
> - goto next;
> + continue;
> unlock:
> unlock_page(page);
> put:
> put_page(page);
> -next:
> - if (!xa_is_value(page) && PageTransHuge(page))
> - xas_set(&xas, page->index + thp_nr_pages(page));
> }
> rcu_read_unlock();
>
> return pagevec_count(pvec);
> }
>
> +static inline bool thp_last_tail(struct page *head, pgoff_t index)
> +{
> + if (!PageTransCompound(head) || PageHuge(head))
> + return true;
> + return index == head->index + thp_nr_pages(head) - 1;
> +}
> +
> /**
> * find_get_pages_range - gang pagecache lookup
> * @mapping: The address_space to search
> @@ -2008,11 +2004,17 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
> if (xa_is_value(page))
> continue;
>
> +again:
> pages[ret] = find_subpage(page, xas.xa_index);
> if (++ret == nr_pages) {
> *start = xas.xa_index + 1;
> goto out;
> }
> + if (!thp_last_tail(page, xas.xa_index)) {
> + xas.xa_index++;
> + page_ref_inc(page);
> + goto again;
> + }
> }
>
> /*
> @@ -3018,6 +3020,12 @@ void filemap_map_pages(struct vm_fault *vmf,
> struct page *head, *page;
> unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
>
> + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> + if (max_idx == 0)
> + return;
> + if (end_pgoff >= max_idx)
> + end_pgoff = max_idx - 1;
> +
> rcu_read_lock();
> xas_for_each(&xas, head, end_pgoff) {
> if (xas_retry(&xas, head))
> @@ -3037,20 +3045,16 @@ void filemap_map_pages(struct vm_fault *vmf,
> /* Has the page moved or been split? */
> if (unlikely(head != xas_reload(&xas)))
> goto skip;
> - page = find_subpage(head, xas.xa_index);
> -
> - if (!PageUptodate(head) ||
> - PageReadahead(page) ||
> - PageHWPoison(page))
> + if (!PageUptodate(head) || PageReadahead(head))
> goto skip;
> if (!trylock_page(head))
> goto skip;
> -
> if (head->mapping != mapping || !PageUptodate(head))
> goto unlock;
>
> - max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> - if (xas.xa_index >= max_idx)
> + page = find_subpage(head, xas.xa_index);
> +again:
> + if (PageHWPoison(page))
> goto unlock;
>
> if (mmap_miss > 0)
> @@ -3062,6 +3066,14 @@ void filemap_map_pages(struct vm_fault *vmf,
> last_pgoff = xas.xa_index;
> if (alloc_set_pte(vmf, page))
> goto unlock;
> + if (!thp_last_tail(head, xas.xa_index)) {
> + xas.xa_index++;
> + page++;
> + page_ref_inc(head);
> + if (xas.xa_index >= end_pgoff)
> + goto unlock;
> + goto again;
> + }
> unlock_page(head);
> goto next;
> unlock:
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f99167d74cbc..0e900e594e77 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2626,6 +2626,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> struct page *head = compound_head(page);
> struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
> struct deferred_split *ds_queue = get_deferred_split_queue(head);
> + XA_STATE(xas, &head->mapping->i_pages, head->index);
> struct anon_vma *anon_vma = NULL;
> struct address_space *mapping = NULL;
> int count, mapcount, extra_pins, ret;
> @@ -2690,19 +2691,28 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> unmap_page(head);
> VM_BUG_ON_PAGE(compound_mapcount(head), head);
>
> + if (mapping) {
> + xas_split_alloc(&xas, head, thp_order(head),
> + mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
> + if (xas_error(&xas)) {
> + ret = xas_error(&xas);
> + goto out_unlock;
> + }
> + }
> +
> /* prevent PageLRU to go away from under us, and freeze lru stats */
> spin_lock_irqsave(&pgdata->lru_lock, flags);
>
> if (mapping) {
> - XA_STATE(xas, &mapping->i_pages, page_index(head));
> -
> /*
> * Check if the head page is present in page cache.
> * We assume all tail are present too, if head is there.
> */
> - xa_lock(&mapping->i_pages);
> + xas_lock(&xas);
> + xas_reset(&xas);
> if (xas_load(&xas) != head)
> goto fail;
> + xas_split(&xas, head, thp_order(head));
> }
>
> /* Prevent deferred_split_scan() touching ->_refcount */
> @@ -2735,7 +2745,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> }
> spin_unlock(&ds_queue->split_queue_lock);
> fail: if (mapping)
> - xa_unlock(&mapping->i_pages);
> + xas_unlock(&xas);
> spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> remap_page(head, thp_nr_pages(head));
> ret = -EBUSY;
> @@ -2749,6 +2759,7 @@ fail: if (mapping)
> if (mapping)
> i_mmap_unlock_read(mapping);
> out:
> + xas_destroy(&xas);
> count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> return ret;
> }
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 2cb93aa8bf84..230e62a92ae7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1645,7 +1645,10 @@ static void collapse_file(struct mm_struct *mm,
> }
> count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
>
> - /* This will be less messy when we use multi-index entries */
> + /*
> + * Ensure we have slots for all the pages in the range. This is
> + * almost certainly a no-op because most of the pages must be present
> + */
> do {
> xas_lock_irq(&xas);
> xas_create_range(&xas);
> @@ -1851,6 +1854,9 @@ static void collapse_file(struct mm_struct *mm,
> __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> }
>
> + /* Join all the small entries into a single multi-index entry */
> + xas_set_order(&xas, start, HPAGE_PMD_ORDER);
Would thp_order(new_page) be better than HPAGE_PMD_ORDER?
—
Best Regards,
Yan Zi
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]
next prev parent reply other threads:[~2020-10-29 20:49 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-29 19:33 [PATCH 00/19] Transparent Hugepages for non-tmpfs filesystems Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 01/19] XArray: Expose xas_destroy Matthew Wilcox (Oracle)
2020-10-29 20:33 ` Zi Yan
2020-10-29 19:33 ` [PATCH 02/19] mm: Use multi-index entries in the page cache Matthew Wilcox (Oracle)
2020-10-29 20:49 ` Zi Yan [this message]
2020-10-29 21:54 ` Matthew Wilcox
2020-10-30 14:48 ` Zi Yan
2020-11-03 14:04 ` Kirill A. Shutemov
2020-10-29 19:33 ` [PATCH 03/19] mm: Support arbitrary THP sizes Matthew Wilcox (Oracle)
2020-10-29 20:50 ` Zi Yan
2020-10-29 19:33 ` [PATCH 04/19] mm: Change NR_FILE_THPS to account in base pages Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 05/19] mm/filemap: Rename generic_file_buffered_read subfunctions Matthew Wilcox (Oracle)
2020-10-30 0:04 ` Kent Overstreet
2020-10-30 8:56 ` Christoph Hellwig
2020-11-03 14:16 ` Kirill A. Shutemov
2020-11-03 14:40 ` Matthew Wilcox
2020-11-03 15:02 ` Kirill A. Shutemov
2020-10-29 19:33 ` [PATCH 06/19] mm/filemap: Change calling convention for gfbr_ functions Matthew Wilcox (Oracle)
2020-10-30 0:05 ` Kent Overstreet
2020-10-29 19:33 ` [PATCH 07/19] mm/filemap: Use head pages in generic_file_buffered_read Matthew Wilcox (Oracle)
2020-10-30 0:19 ` Kent Overstreet
2020-10-30 1:03 ` Matthew Wilcox
2020-10-29 19:33 ` [PATCH 08/19] mm/filemap: Add __page_cache_alloc_order Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 09/19] mm/filemap: Allow THPs to be added to the page cache Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 10/19] mm/vmscan: Optimise shrink_page_list for smaller THPs Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 11/19] mm/filemap: Allow PageReadahead to be set on head pages Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 12/19] mm: Pass a sleep state to put_and_wait_on_page_locked Matthew Wilcox (Oracle)
2020-10-29 19:33 ` [PATCH 13/19] mm/filemap: Support readpage splitting a page Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 14/19] mm/filemap: Inline __wait_on_page_locked_async into caller Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 15/19] mm/readahead: Add THP readahead Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 16/19] mm/readahead: Align THP mappings for non-DAX Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 17/19] mm/readahead: Switch to page_cache_ra_order Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 18/19] mm/filemap: Support VM_HUGEPAGE for file mappings Matthew Wilcox (Oracle)
2020-10-29 19:34 ` [PATCH 19/19] selftests/vm/transhuge-stress: Support file-backed THPs Matthew Wilcox (Oracle)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4D931CDD-2CB1-4129-974C-12255156154E@nvidia.com \
--to=ziy@nvidia.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).