From: Song Liu <songliubraving@fb.com>
To: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Linux-MM <linux-mm@kvack.org>,
	"matthew.wilcox@oracle.com" <matthew.wilcox@oracle.com>,
	"kirill.shutemov@linux.intel.com"
	<kirill.shutemov@linux.intel.com>,
	"peterz@infradead.org" <peterz@infradead.org>,
	"oleg@redhat.com" <oleg@redhat.com>,
	"rostedt@goodmis.org" <rostedt@goodmis.org>,
	Kernel Team <Kernel-team@fb.com>,
	"william.kucharski@oracle.com" <william.kucharski@oracle.com>
Subject: Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
Date: Mon, 24 Jun 2019 14:25:42 +0000
Message-ID: <24FB1072-E355-4F9D-855F-337C855C9AF9@fb.com>
In-Reply-To: <20190624131934.m6gbktixyykw65ws@box>



> On Jun 24, 2019, at 6:19 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> 
> On Sat, Jun 22, 2019 at 10:48:28PM -0700, Song Liu wrote:
>> khugepaged needs exclusive mmap_sem to access the page table. When it
>> fails to lock mmap_sem, the page will fault in as a pte-mapped THP. As
>> the page is already a THP, khugepaged will not handle this pmd again.
>> 
>> This patch enables khugepaged to retry retract_page_tables().
>> 
>> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
>> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
>> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
>> compound pages in this address_space.
>> 
>> Since the collapse may happen at a later time, some pages may have
>> already faulted in. To handle these pages properly, it is necessary to
>> prepare the pmd before collapsing. prepare_pmd_for_collapse() is
>> introduced to prepare the pmd by removing rmap and adjusting the
>> refcount and mm_counter.
>> 
>> prepare_pmd_for_collapse() also double-checks that all ptes in this
>> pmd map to the same THP. This is necessary because a subpage of the
>> THP may have been replaced, for example by uprobe. In such cases, it
>> is not possible to collapse the pmd, so we fall back.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> include/linux/pagemap.h |  1 +
>> mm/khugepaged.c         | 69 +++++++++++++++++++++++++++++++++++------
>> 2 files changed, 60 insertions(+), 10 deletions(-)
>> 
>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>> index 9ec3544baee2..eac881de2a46 100644
>> --- a/include/linux/pagemap.h
>> +++ b/include/linux/pagemap.h
>> @@ -29,6 +29,7 @@ enum mapping_flags {
>> 	AS_EXITING	= 4, 	/* final truncate in progress */
>> 	/* writeback related tags are not used */
>> 	AS_NO_WRITEBACK_TAGS = 5,
>> +	AS_COLLAPSE_PMD = 6,	/* try collapse pmd for THP */
>> };
>> 
>> /**
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index a4f90a1b06f5..9b980327fd9b 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
>> }
>> 
>> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
>> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>> +
>> +/* return whether the pmd is ready for collapse */
>> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
>> +			      struct page *hpage, pmd_t *pmd)
>> +{
>> +	unsigned long haddr = page_address_in_vma(hpage, vma);
>> +	unsigned long addr;
>> +	int i, count = 0;
>> +
>> +	/* step 1: check all mapped PTEs are to this huge page */
>> +	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>> +		pte_t *pte = pte_offset_map(pmd, addr);
>> +
>> +		if (pte_none(*pte))
>> +			continue;
>> +
>> +		if (hpage + i != vm_normal_page(vma, addr, *pte))
>> +			return false;
>> +		count++;
>> +	}
>> +
>> +	/* step 2: adjust rmap */
>> +	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>> +		pte_t *pte = pte_offset_map(pmd, addr);
>> +		struct page *page;
>> +
>> +		if (pte_none(*pte))
>> +			continue;
>> +		page = vm_normal_page(vma, addr, *pte);
>> +		page_remove_rmap(page, false);
>> +	}
>> +
>> +	/* step 3: set proper refcount and mm_counters. */
>> +	page_ref_sub(hpage, count);
>> +	add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
>> +	return true;
>> +}
>> +
>> +extern pid_t sysctl_dump_pt_pid;
>> +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>> +				struct page *hpage)
>> {
>> 	struct vm_area_struct *vma;
>> 	unsigned long addr;
>> @@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>> 		pmd = mm_find_pmd(vma->vm_mm, addr);
>> 		if (!pmd)
>> 			continue;
>> -		/*
>> -		 * We need exclusive mmap_sem to retract page table.
>> -		 * If trylock fails we would end up with pte-mapped THP after
>> -		 * re-fault. Not ideal, but it's more important to not disturb
>> -		 * the system too much.
>> -		 */
>> 		if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
>> 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
>> -			/* assume page table is clear */
>> +
>> +			if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
>> +				spin_unlock(ptl);
>> +				up_write(&vma->vm_mm->mmap_sem);
>> +				continue;
>> +			}
>> 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
>> 			spin_unlock(ptl);
>> 			up_write(&vma->vm_mm->mmap_sem);
>> 			mm_dec_nr_ptes(vma->vm_mm);
>> 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
>> -		}
>> +		} else
>> +			set_bit(AS_COLLAPSE_PMD, &mapping->flags);
>> 	}
>> 	i_mmap_unlock_write(mapping);
>> }
>> @@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
>> 		/*
>> 		 * Remove pte page tables, so we can re-fault the page as huge.
>> 		 */
>> -		retract_page_tables(mapping, start);
>> +		retract_page_tables(mapping, start, new_page);
>> 		*hpage = NULL;
>> 
>> 		khugepaged_pages_collapsed++;
>> @@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>> 	int present, swap;
>> 	int node = NUMA_NO_NODE;
>> 	int result = SCAN_SUCCEED;
>> +	bool collapse_pmd = false;
>> 
>> 	present = 0;
>> 	swap = 0;
>> @@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>> 		}
>> 
>> 		if (PageTransCompound(page)) {
>> +			if (collapse_pmd ||
>> +			    test_and_clear_bit(AS_COLLAPSE_PMD,
>> +					       &mapping->flags)) {
> 
> Who said it's the only PMD range that's subject to collapse? The bit has
> to be per-PMD, not per-mapping.

I didn't assume this is the only PMD range that is subject to collapse.
Once khugepaged finds AS_COLLAPSE_PMD, it continues scanning the whole
mapping: it calls retract_page_tables() for each compound page it finds,
then continues the scan.
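
Roughly, the retry path in khugepaged_scan_file() looks like this (a
simplified sketch of the logic in this patch, not the exact hunk; the
pgoff argument passed to retract_page_tables() is illustrative):

	if (PageTransCompound(page)) {
		/* Sketch: once AS_COLLAPSE_PMD is seen, remember it in
		 * collapse_pmd so every later compound page found in this
		 * scan of the mapping is retried as well.
		 */
		if (collapse_pmd ||
		    test_and_clear_bit(AS_COLLAPSE_PMD, &mapping->flags)) {
			collapse_pmd = true;
			retract_page_tables(mapping, page->index, page);
		}
		/* rest of the PageTransCompound handling is unchanged */
	}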

> 
> I believe we can store the bit in the struct page of the PTE page table,
> clearing it if we've mapped anything that doesn't belong there from the
> fault path.
> 
> And in general this calls for a more substantial re-design of khugepaged:
> we might want to split it into two different kernel threads. One works on
> collapsing small pages into compound pages, and the other changes the
> virtual address space to map the page as a PMD.

I had almost the same idea of splitting khugepaged into two threads, or
asking one thread to scan two lists: one of pages to collapse, the other
of page tables to map as PMD. However, that does need a substantial
re-design.
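
For reference, the per-PMD idea could look roughly like the sketch below.
This is purely hypothetical: the *PageCollapseCandidate() helpers do not
exist and stand in for a new bit (a page flag, or something in
page->private) on the struct page backing the PTE page table. It also
assumes an architecture where pgtable_t is a struct page *.

	/* Hypothetical: tag the PTE page table itself as a collapse
	 * candidate instead of setting a bit on the whole mapping.
	 */
	static void mark_pt_collapse_candidate(pmd_t *pmd)
	{
		struct page *pt_page = pmd_pgtable(*pmd);

		SetPageCollapseCandidate(pt_page);
	}

	/* The fault path would clear the tag whenever it maps anything
	 * that does not belong to the expected THP:
	 *
	 *	if (PageCollapseCandidate(pt_page))
	 *		ClearPageCollapseCandidate(pt_page);
	 */

Then khugepaged could retract exactly the tagged page tables instead of
rescanning the whole mapping.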

On the other hand, this patch is much simpler and does the work fine. It
is not optimal, as it scans the whole mapping for an opportunity in just
one PMD range. But the overhead is not in any hot path, so it is
acceptable. The extra double check in prepare_pmd_for_collapse() makes
sure we never collapse the wrong pages into a pmd mapping: for example,
if uprobe has replaced one subpage, vm_normal_page() returns a page other
than hpage + i, and prepare_pmd_for_collapse() bails out before touching
any state.

Overall, I believe this is a correct solution to the problem. We can
further improve it, but I really hope to do those improvements after
these two patchsets get in.

Thanks,
Song


