* [PATCH v6 1/6] mm: move memcmp_pages() and pages_identical()
2019-06-23 5:48 [PATCH v6 0/6] THP aware uprobe Song Liu
@ 2019-06-23 5:48 ` Song Liu
2019-06-23 5:48 ` [PATCH v6 2/6] uprobe: use original page when all uprobes are removed Song Liu
` (4 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2019-06-23 5:48 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: matthew.wilcox, kirill.shutemov, peterz, oleg, rostedt,
kernel-team, william.kucharski, Song Liu
This patch moves memcmp_pages() to mm/util.c and pages_identical() to
mm.h, so that we can use them in other files.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/mm.h | 7 +++++++
mm/ksm.c | 18 ------------------
mm/util.c | 13 +++++++++++++
3 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd0b5f4e1e45..0ab8c7d84cd0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2891,5 +2891,12 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif
+extern int memcmp_pages(struct page *page1, struct page *page2);
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+ return !memcmp_pages(page1, page2);
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/mm/ksm.c b/mm/ksm.c
index 81c20ed57bf6..6f153f976c4c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1030,24 +1030,6 @@ static u32 calc_checksum(struct page *page)
return checksum;
}
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
-}
-
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
diff --git a/mm/util.c b/mm/util.c
index 9834c4ab7d8e..750e586d50bc 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -755,3 +755,16 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
out:
return res;
}
+
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
--
2.17.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v6 2/6] uprobe: use original page when all uprobes are removed
2019-06-23 5:48 [PATCH v6 0/6] THP aware uprobe Song Liu
2019-06-23 5:48 ` [PATCH v6 1/6] mm: move memcmp_pages() and pages_identical() Song Liu
@ 2019-06-23 5:48 ` Song Liu
2019-06-23 5:48 ` [PATCH v6 3/6] mm, thp: introduce FOLL_SPLIT_PMD Song Liu
` (3 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2019-06-23 5:48 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: matthew.wilcox, kirill.shutemov, peterz, oleg, rostedt,
kernel-team, william.kucharski, Song Liu
Currently, uprobe swaps the target page with an anonymous page in both
install_breakpoint() and remove_breakpoint(). When all uprobes on a page
are removed, the given mm is still using an anonymous page (not the
original page).
This patch allows uprobe to use original page when possible (all uprobes
on the page are already removed).
Signed-off-by: Song Liu <songliubraving@fb.com>
---
kernel/events/uprobes.c | 45 +++++++++++++++++++++++++++++++++--------
1 file changed, 37 insertions(+), 8 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 78f61bfc6b79..f7c61a1ef720 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -160,16 +160,19 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
int err;
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
+ bool orig = new_page->mapping != NULL; /* new_page == orig_page */
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
- false);
- if (err)
- return err;
+ if (!orig) {
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false);
+ if (err)
+ return err;
+ }
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
@@ -177,15 +180,24 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (!orig)
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ if (orig) {
+ lock_page(new_page); /* for page_add_file_rmap() */
+ page_add_file_rmap(new_page, false);
+ unlock_page(new_page);
+ inc_mm_counter(mm, mm_counter_file(new_page));
+ dec_mm_counter(mm, MM_ANONPAGES);
+ } else {
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ }
if (!PageAnon(old_page)) {
dec_mm_counter(mm, mm_counter_file(old_page));
@@ -501,6 +513,23 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (!is_register) {
+ struct page *orig_page;
+ pgoff_t index;
+
+ index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+ orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
+ index);
+
+ if (orig_page) {
+ if (pages_identical(new_page, orig_page)) {
+ put_page(new_page);
+ new_page = orig_page;
+ } else
+ put_page(orig_page);
+ }
+ }
+
ret = __replace_page(vma, vaddr, old_page, new_page);
put_page(new_page);
put_old:
--
2.17.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v6 3/6] mm, thp: introduce FOLL_SPLIT_PMD
2019-06-23 5:48 [PATCH v6 0/6] THP aware uprobe Song Liu
2019-06-23 5:48 ` [PATCH v6 1/6] mm: move memcmp_pages() and pages_identical() Song Liu
2019-06-23 5:48 ` [PATCH v6 2/6] uprobe: use original page when all uprobes are removed Song Liu
@ 2019-06-23 5:48 ` Song Liu
2019-06-23 5:48 ` [PATCH v6 4/6] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT Song Liu
` (2 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2019-06-23 5:48 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: matthew.wilcox, kirill.shutemov, peterz, oleg, rostedt,
kernel-team, william.kucharski, Song Liu
This patch introduces a new foll_flag: FOLL_SPLIT_PMD. As the name says,
FOLL_SPLIT_PMD splits the huge pmd for the given mm_struct; the underlying huge
page stays as-is.
FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
but would like to switch back to the huge page and huge pmd later. One such
example is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/mm.h | 1 +
mm/gup.c | 8 ++++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ab8c7d84cd0..e605acc4fc81 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2642,6 +2642,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
/*
* NOTE on FOLL_LONGTERM:
diff --git a/mm/gup.c b/mm/gup.c
index ddde097cf9e4..41f2a1fcc6f0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -398,7 +398,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT) {
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -407,7 +407,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else {
+ } else if (flags & FOLL_SPLIT) {
if (unlikely(!try_get_page(page))) {
spin_unlock(ptl);
return ERR_PTR(-ENOMEM);
@@ -419,6 +419,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
put_page(page);
if (pmd_none(*pmd))
return no_page_table(vma, flags);
+ } else { /* flags & FOLL_SPLIT_PMD */
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ ret = pte_alloc(mm, pmd);
}
return ret ? ERR_PTR(ret) :
--
2.17.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v6 4/6] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT
2019-06-23 5:48 [PATCH v6 0/6] THP aware uprobe Song Liu
` (2 preceding siblings ...)
2019-06-23 5:48 ` [PATCH v6 3/6] mm, thp: introduce FOLL_SPLIT_PMD Song Liu
@ 2019-06-23 5:48 ` Song Liu
2019-06-23 5:48 ` [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP Song Liu
2019-06-23 5:48 ` [PATCH v6 6/6] uprobe: collapse THP pmd after removing all uprobes Song Liu
5 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2019-06-23 5:48 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: matthew.wilcox, kirill.shutemov, peterz, oleg, rostedt,
kernel-team, william.kucharski, Song Liu
This patch uses the newly added FOLL_SPLIT_PMD in uprobe. This enables easy
regrouping of the huge pmd after the uprobe is disabled (in the next patch).
Signed-off-by: Song Liu <songliubraving@fb.com>
---
kernel/events/uprobes.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f7c61a1ef720..a20d7b43a056 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -153,7 +153,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
- .page = old_page,
+ .page = compound_head(old_page),
.vma = vma,
.address = addr,
};
@@ -165,8 +165,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
- VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
if (!orig) {
err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
&memcg, false);
@@ -483,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1,
- FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+ FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);
if (ret <= 0)
return ret;
--
2.17.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-23 5:48 [PATCH v6 0/6] THP aware uprobe Song Liu
` (3 preceding siblings ...)
2019-06-23 5:48 ` [PATCH v6 4/6] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT Song Liu
@ 2019-06-23 5:48 ` Song Liu
2019-06-24 13:19 ` Kirill A. Shutemov
2019-06-24 22:06 ` Song Liu
2019-06-23 5:48 ` [PATCH v6 6/6] uprobe: collapse THP pmd after removing all uprobes Song Liu
5 siblings, 2 replies; 14+ messages in thread
From: Song Liu @ 2019-06-23 5:48 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: matthew.wilcox, kirill.shutemov, peterz, oleg, rostedt,
kernel-team, william.kucharski, Song Liu
khugepaged needs exclusive mmap_sem to access page table. When it fails
to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
is already a THP, khugepaged will not handle this pmd again.
This patch enables the khugepaged to retry retract_page_tables().
A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
compound pages in this address_space.
Since collapse may happen at a later time, some pages may already fault
in. To handle these pages properly, it is necessary to prepare the pmd
before collapsing. prepare_pmd_for_collapse() is introduced to prepare
the pmd by removing rmap, adjusting refcount and mm_counter.
prepare_pmd_for_collapse() also double checks whether all ptes in this
pmd are mapping to the same THP. This is necessary because some subpage
of the THP may be replaced, for example by uprobe. In such cases, it
is not possible to collapse the pmd, so we fall back.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/pagemap.h | 1 +
mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
2 files changed, 60 insertions(+), 10 deletions(-)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9ec3544baee2..eac881de2a46 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -29,6 +29,7 @@ enum mapping_flags {
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
+ AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
};
/**
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4f90a1b06f5..9b980327fd9b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
}
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
-static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+
+/* return whether the pmd is ready for collapse */
+bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
+ struct page *hpage, pmd_t *pmd)
+{
+ unsigned long haddr = page_address_in_vma(hpage, vma);
+ unsigned long addr;
+ int i, count = 0;
+
+ /* step 1: check all mapped PTEs are to this huge page */
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t *pte = pte_offset_map(pmd, addr);
+
+ if (pte_none(*pte))
+ continue;
+
+ if (hpage + i != vm_normal_page(vma, addr, *pte))
+ return false;
+ count++;
+ }
+
+ /* step 2: adjust rmap */
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t *pte = pte_offset_map(pmd, addr);
+ struct page *page;
+
+ if (pte_none(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ page_remove_rmap(page, false);
+ }
+
+ /* step 3: set proper refcount and mm_counters. */
+ page_ref_sub(hpage, count);
+ add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ return true;
+}
+
+extern pid_t sysctl_dump_pt_pid;
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+ struct page *hpage)
{
struct vm_area_struct *vma;
unsigned long addr;
@@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
pmd = mm_find_pmd(vma->vm_mm, addr);
if (!pmd)
continue;
- /*
- * We need exclusive mmap_sem to retract page table.
- * If trylock fails we would end up with pte-mapped THP after
- * re-fault. Not ideal, but it's more important to not disturb
- * the system too much.
- */
if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
+
+ if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
+ spin_unlock(ptl);
+ up_write(&vma->vm_mm->mmap_sem);
+ continue;
+ }
_pmd = pmdp_collapse_flush(vma, addr, pmd);
spin_unlock(ptl);
up_write(&vma->vm_mm->mmap_sem);
mm_dec_nr_ptes(vma->vm_mm);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
- }
+ } else
+ set_bit(AS_COLLAPSE_PMD, &mapping->flags);
}
i_mmap_unlock_write(mapping);
}
@@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
/*
* Remove pte page tables, so we can re-fault the page as huge.
*/
- retract_page_tables(mapping, start);
+ retract_page_tables(mapping, start, new_page);
*hpage = NULL;
khugepaged_pages_collapsed++;
@@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
int present, swap;
int node = NUMA_NO_NODE;
int result = SCAN_SUCCEED;
+ bool collapse_pmd = false;
present = 0;
swap = 0;
@@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
}
if (PageTransCompound(page)) {
+ if (collapse_pmd ||
+ test_and_clear_bit(AS_COLLAPSE_PMD,
+ &mapping->flags)) {
+ collapse_pmd = true;
+ retract_page_tables(mapping, start,
+ compound_head(page));
+ continue;
+ }
result = SCAN_PAGE_COMPOUND;
break;
}
--
2.17.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-23 5:48 ` [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP Song Liu
@ 2019-06-24 13:19 ` Kirill A. Shutemov
2019-06-24 14:25 ` Song Liu
2019-06-24 22:06 ` Song Liu
1 sibling, 1 reply; 14+ messages in thread
From: Kirill A. Shutemov @ 2019-06-24 13:19 UTC (permalink / raw)
To: Song Liu
Cc: linux-kernel, linux-mm, matthew.wilcox, kirill.shutemov, peterz,
oleg, rostedt, kernel-team, william.kucharski
On Sat, Jun 22, 2019 at 10:48:28PM -0700, Song Liu wrote:
> khugepaged needs exclusive mmap_sem to access page table. When it fails
> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
> is already a THP, khugepaged will not handle this pmd again.
>
> This patch enables the khugepaged to retry retract_page_tables().
>
> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
> compound pages in this address_space.
>
> Since collapse may happen at an later time, some pages may already fault
> in. To handle these pages properly, it is necessary to prepare the pmd
> before collapsing. prepare_pmd_for_collapse() is introduced to prepare
> the pmd by removing rmap, adjusting refcount and mm_counter.
>
> prepare_pmd_for_collapse() also double checks whether all ptes in this
> pmd are mapping to the same THP. This is necessary because some subpage
> of the THP may be replaced, for example by uprobe. In such cases, it
> is not possible to collapse the pmd, so we fall back.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> include/linux/pagemap.h | 1 +
> mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
> 2 files changed, 60 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 9ec3544baee2..eac881de2a46 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -29,6 +29,7 @@ enum mapping_flags {
> AS_EXITING = 4, /* final truncate in progress */
> /* writeback related tags are not used */
> AS_NO_WRITEBACK_TAGS = 5,
> + AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
> };
>
> /**
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a4f90a1b06f5..9b980327fd9b 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
> }
>
> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> +
> +/* return whether the pmd is ready for collapse */
> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
> + struct page *hpage, pmd_t *pmd)
> +{
> + unsigned long haddr = page_address_in_vma(hpage, vma);
> + unsigned long addr;
> + int i, count = 0;
> +
> + /* step 1: check all mapped PTEs are to this huge page */
> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> + pte_t *pte = pte_offset_map(pmd, addr);
> +
> + if (pte_none(*pte))
> + continue;
> +
> + if (hpage + i != vm_normal_page(vma, addr, *pte))
> + return false;
> + count++;
> + }
> +
> + /* step 2: adjust rmap */
> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> + pte_t *pte = pte_offset_map(pmd, addr);
> + struct page *page;
> +
> + if (pte_none(*pte))
> + continue;
> + page = vm_normal_page(vma, addr, *pte);
> + page_remove_rmap(page, false);
> + }
> +
> + /* step 3: set proper refcount and mm_counters. */
> + page_ref_sub(hpage, count);
> + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
> + return true;
> +}
> +
> +extern pid_t sysctl_dump_pt_pid;
> +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
> + struct page *hpage)
> {
> struct vm_area_struct *vma;
> unsigned long addr;
> @@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> pmd = mm_find_pmd(vma->vm_mm, addr);
> if (!pmd)
> continue;
> - /*
> - * We need exclusive mmap_sem to retract page table.
> - * If trylock fails we would end up with pte-mapped THP after
> - * re-fault. Not ideal, but it's more important to not disturb
> - * the system too much.
> - */
> if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
> spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
> - /* assume page table is clear */
> +
> + if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
> + spin_unlock(ptl);
> + up_write(&vma->vm_mm->mmap_sem);
> + continue;
> + }
> _pmd = pmdp_collapse_flush(vma, addr, pmd);
> spin_unlock(ptl);
> up_write(&vma->vm_mm->mmap_sem);
> mm_dec_nr_ptes(vma->vm_mm);
> pte_free(vma->vm_mm, pmd_pgtable(_pmd));
> - }
> + } else
> + set_bit(AS_COLLAPSE_PMD, &mapping->flags);
> }
> i_mmap_unlock_write(mapping);
> }
> @@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
> /*
> * Remove pte page tables, so we can re-fault the page as huge.
> */
> - retract_page_tables(mapping, start);
> + retract_page_tables(mapping, start, new_page);
> *hpage = NULL;
>
> khugepaged_pages_collapsed++;
> @@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> int present, swap;
> int node = NUMA_NO_NODE;
> int result = SCAN_SUCCEED;
> + bool collapse_pmd = false;
>
> present = 0;
> swap = 0;
> @@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> }
>
> if (PageTransCompound(page)) {
> + if (collapse_pmd ||
> + test_and_clear_bit(AS_COLLAPSE_PMD,
> + &mapping->flags)) {
Who said it's the only PMD range that's subject to collapse? The bit has
to be per-PMD, not per-mapping.
I believe we can store the bit in the struct page of the PTE page table, clearing
it if we've mapped anything that doesn't belong there from the fault path.
And in general this calls for more substantial re-design for khugepaged:
we might want to split if into two different kernel threads. One works on
collapsing small pages into compound and the other changes virtual address
space to map the page as PMD.
Even if only the first step is successful, it's still useful: the new
mapping of the file will get a huge page, even if the old one is still
PTE-mapped.
--
Kirill A. Shutemov
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-24 13:19 ` Kirill A. Shutemov
@ 2019-06-24 14:25 ` Song Liu
2019-06-25 10:34 ` Kirill A. Shutemov
0 siblings, 1 reply; 14+ messages in thread
From: Song Liu @ 2019-06-24 14:25 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: LKML, Linux-MM, matthew.wilcox, kirill.shutemov, peterz, oleg,
rostedt, Kernel Team, william.kucharski
> On Jun 24, 2019, at 6:19 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Sat, Jun 22, 2019 at 10:48:28PM -0700, Song Liu wrote:
>> khugepaged needs exclusive mmap_sem to access page table. When it fails
>> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
>> is already a THP, khugepaged will not handle this pmd again.
>>
>> This patch enables the khugepaged to retry retract_page_tables().
>>
>> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
>> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
>> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
>> compound pages in this address_space.
>>
>> Since collapse may happen at an later time, some pages may already fault
>> in. To handle these pages properly, it is necessary to prepare the pmd
>> before collapsing. prepare_pmd_for_collapse() is introduced to prepare
>> the pmd by removing rmap, adjusting refcount and mm_counter.
>>
>> prepare_pmd_for_collapse() also double checks whether all ptes in this
>> pmd are mapping to the same THP. This is necessary because some subpage
>> of the THP may be replaced, for example by uprobe. In such cases, it
>> is not possible to collapse the pmd, so we fall back.
>>
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> include/linux/pagemap.h | 1 +
>> mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
>> 2 files changed, 60 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>> index 9ec3544baee2..eac881de2a46 100644
>> --- a/include/linux/pagemap.h
>> +++ b/include/linux/pagemap.h
>> @@ -29,6 +29,7 @@ enum mapping_flags {
>> AS_EXITING = 4, /* final truncate in progress */
>> /* writeback related tags are not used */
>> AS_NO_WRITEBACK_TAGS = 5,
>> + AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
>> };
>>
>> /**
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index a4f90a1b06f5..9b980327fd9b 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
>> }
>>
>> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
>> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>> +
>> +/* return whether the pmd is ready for collapse */
>> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
>> + struct page *hpage, pmd_t *pmd)
>> +{
>> + unsigned long haddr = page_address_in_vma(hpage, vma);
>> + unsigned long addr;
>> + int i, count = 0;
>> +
>> + /* step 1: check all mapped PTEs are to this huge page */
>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>> + pte_t *pte = pte_offset_map(pmd, addr);
>> +
>> + if (pte_none(*pte))
>> + continue;
>> +
>> + if (hpage + i != vm_normal_page(vma, addr, *pte))
>> + return false;
>> + count++;
>> + }
>> +
>> + /* step 2: adjust rmap */
>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>> + pte_t *pte = pte_offset_map(pmd, addr);
>> + struct page *page;
>> +
>> + if (pte_none(*pte))
>> + continue;
>> + page = vm_normal_page(vma, addr, *pte);
>> + page_remove_rmap(page, false);
>> + }
>> +
>> + /* step 3: set proper refcount and mm_counters. */
>> + page_ref_sub(hpage, count);
>> + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
>> + return true;
>> +}
>> +
>> +extern pid_t sysctl_dump_pt_pid;
>> +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>> + struct page *hpage)
>> {
>> struct vm_area_struct *vma;
>> unsigned long addr;
>> @@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>> pmd = mm_find_pmd(vma->vm_mm, addr);
>> if (!pmd)
>> continue;
>> - /*
>> - * We need exclusive mmap_sem to retract page table.
>> - * If trylock fails we would end up with pte-mapped THP after
>> - * re-fault. Not ideal, but it's more important to not disturb
>> - * the system too much.
>> - */
>> if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
>> spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
>> - /* assume page table is clear */
>> +
>> + if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
>> + spin_unlock(ptl);
>> + up_write(&vma->vm_mm->mmap_sem);
>> + continue;
>> + }
>> _pmd = pmdp_collapse_flush(vma, addr, pmd);
>> spin_unlock(ptl);
>> up_write(&vma->vm_mm->mmap_sem);
>> mm_dec_nr_ptes(vma->vm_mm);
>> pte_free(vma->vm_mm, pmd_pgtable(_pmd));
>> - }
>> + } else
>> + set_bit(AS_COLLAPSE_PMD, &mapping->flags);
>> }
>> i_mmap_unlock_write(mapping);
>> }
>> @@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
>> /*
>> * Remove pte page tables, so we can re-fault the page as huge.
>> */
>> - retract_page_tables(mapping, start);
>> + retract_page_tables(mapping, start, new_page);
>> *hpage = NULL;
>>
>> khugepaged_pages_collapsed++;
>> @@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>> int present, swap;
>> int node = NUMA_NO_NODE;
>> int result = SCAN_SUCCEED;
>> + bool collapse_pmd = false;
>>
>> present = 0;
>> swap = 0;
>> @@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>> }
>>
>> if (PageTransCompound(page)) {
>> + if (collapse_pmd ||
>> + test_and_clear_bit(AS_COLLAPSE_PMD,
>> + &mapping->flags)) {
>
> Who said it's the only PMD range that's subject to collapse? The bit has
> to be per-PMD, not per-mapping.
I didn't assume this is the only PMD range that is subject to collapse.
So once we find AS_COLLAPSE_PMD, we continue to scan the whole mapping:
retract_page_tables(), then continue.
>
> I beleive we can store the bit in struct page of PTE page table, clearing
> it if we've mapped anyting that doesn't belong to there from fault path.
>
> And in general this calls for more substantial re-design for khugepaged:
> we might want to split if into two different kernel threads. One works on
> collapsing small pages into compound and the other changes virtual address
> space to map the page as PMD.
I had almost the same idea of splitting it into two threads. Or ask one thread to
scan two lists: one for pages to collapse, the other for page tables to
map as PMD. However, that does need substantial re-design.
On the other hand, this patch is much simpler and does the work fine. It is
not optimal, as it scans the whole mapping for opportunity in just one PMD
range. But the overhead is not in any hot path, so it just works. The extra
double check in prepare_pmd_for_collapse() makes sure it will not collapse
wrong pages into pmd mapped.
Overall, I believe this is an accurate solution for the problem. We can
further improve it. But I really hope to do the improvements after these
two patchsets get in.
Thanks,
Song
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-24 14:25 ` Song Liu
@ 2019-06-25 10:34 ` Kirill A. Shutemov
2019-06-25 12:33 ` Song Liu
0 siblings, 1 reply; 14+ messages in thread
From: Kirill A. Shutemov @ 2019-06-25 10:34 UTC (permalink / raw)
To: Song Liu
Cc: LKML, Linux-MM, matthew.wilcox, kirill.shutemov, peterz, oleg,
rostedt, Kernel Team, william.kucharski
On Mon, Jun 24, 2019 at 02:25:42PM +0000, Song Liu wrote:
>
>
> > On Jun 24, 2019, at 6:19 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Sat, Jun 22, 2019 at 10:48:28PM -0700, Song Liu wrote:
> >> khugepaged needs exclusive mmap_sem to access page table. When it fails
> >> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
> >> is already a THP, khugepaged will not handle this pmd again.
> >>
> >> This patch enables the khugepaged to retry retract_page_tables().
> >>
> >> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
> >> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
> >> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
> >> compound pages in this address_space.
> >>
> >> Since collapse may happen at a later time, some pages may already fault
> >> in. To handle these pages properly, it is necessary to prepare the pmd
> >> before collapsing. prepare_pmd_for_collapse() is introduced to prepare
> >> the pmd by removing rmap, adjusting refcount and mm_counter.
> >>
> >> prepare_pmd_for_collapse() also double checks whether all ptes in this
> >> pmd are mapping to the same THP. This is necessary because some subpage
> >> of the THP may be replaced, for example by uprobe. In such cases, it
> >> is not possible to collapse the pmd, so we fall back.
> >>
> >> Signed-off-by: Song Liu <songliubraving@fb.com>
> >> ---
> >> include/linux/pagemap.h | 1 +
> >> mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
> >> 2 files changed, 60 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> >> index 9ec3544baee2..eac881de2a46 100644
> >> --- a/include/linux/pagemap.h
> >> +++ b/include/linux/pagemap.h
> >> @@ -29,6 +29,7 @@ enum mapping_flags {
> >> AS_EXITING = 4, /* final truncate in progress */
> >> /* writeback related tags are not used */
> >> AS_NO_WRITEBACK_TAGS = 5,
> >> + AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
> >> };
> >>
> >> /**
> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >> index a4f90a1b06f5..9b980327fd9b 100644
> >> --- a/mm/khugepaged.c
> >> +++ b/mm/khugepaged.c
> >> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
> >> }
> >>
> >> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
> >> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >> +
> >> +/* return whether the pmd is ready for collapse */
> >> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
> >> + struct page *hpage, pmd_t *pmd)
> >> +{
> >> + unsigned long haddr = page_address_in_vma(hpage, vma);
> >> + unsigned long addr;
> >> + int i, count = 0;
> >> +
> >> + /* step 1: check all mapped PTEs are to this huge page */
> >> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> >> + pte_t *pte = pte_offset_map(pmd, addr);
> >> +
> >> + if (pte_none(*pte))
> >> + continue;
> >> +
> >> + if (hpage + i != vm_normal_page(vma, addr, *pte))
> >> + return false;
> >> + count++;
> >> + }
> >> +
> >> + /* step 2: adjust rmap */
> >> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> >> + pte_t *pte = pte_offset_map(pmd, addr);
> >> + struct page *page;
> >> +
> >> + if (pte_none(*pte))
> >> + continue;
> >> + page = vm_normal_page(vma, addr, *pte);
> >> + page_remove_rmap(page, false);
> >> + }
> >> +
> >> + /* step 3: set proper refcount and mm_counters. */
> >> + page_ref_sub(hpage, count);
> >> + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
> >> + return true;
> >> +}
> >> +
> >> +extern pid_t sysctl_dump_pt_pid;
> >> +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
> >> + struct page *hpage)
> >> {
> >> struct vm_area_struct *vma;
> >> unsigned long addr;
> >> @@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >> pmd = mm_find_pmd(vma->vm_mm, addr);
> >> if (!pmd)
> >> continue;
> >> - /*
> >> - * We need exclusive mmap_sem to retract page table.
> >> - * If trylock fails we would end up with pte-mapped THP after
> >> - * re-fault. Not ideal, but it's more important to not disturb
> >> - * the system too much.
> >> - */
> >> if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
> >> spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
> >> - /* assume page table is clear */
> >> +
> >> + if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
> >> + spin_unlock(ptl);
> >> + up_write(&vma->vm_mm->mmap_sem);
> >> + continue;
> >> + }
> >> _pmd = pmdp_collapse_flush(vma, addr, pmd);
> >> spin_unlock(ptl);
> >> up_write(&vma->vm_mm->mmap_sem);
> >> mm_dec_nr_ptes(vma->vm_mm);
> >> pte_free(vma->vm_mm, pmd_pgtable(_pmd));
> >> - }
> >> + } else
> >> + set_bit(AS_COLLAPSE_PMD, &mapping->flags);
> >> }
> >> i_mmap_unlock_write(mapping);
> >> }
> >> @@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
> >> /*
> >> * Remove pte page tables, so we can re-fault the page as huge.
> >> */
> >> - retract_page_tables(mapping, start);
> >> + retract_page_tables(mapping, start, new_page);
> >> *hpage = NULL;
> >>
> >> khugepaged_pages_collapsed++;
> >> @@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> >> int present, swap;
> >> int node = NUMA_NO_NODE;
> >> int result = SCAN_SUCCEED;
> >> + bool collapse_pmd = false;
> >>
> >> present = 0;
> >> swap = 0;
> >> @@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> >> }
> >>
> >> if (PageTransCompound(page)) {
> >> + if (collapse_pmd ||
> >> + test_and_clear_bit(AS_COLLAPSE_PMD,
> >> + &mapping->flags)) {
> >
> > Who said it's the only PMD range that's subject to collapse? The bit has
> > to be per-PMD, not per-mapping.
>
> I didn't assume this is the only PMD range that subject to collapse.
> So once we found AS_COLLAPSE_PMD, it will continue scan the whole mapping:
> retract_page_tables(), then continue.
I still don't get it.
Assume we have two ranges that are subject to collapse. khugepaged_scan_file()
sees and clears AS_COLLAPSE_PMD. Tries to collapse the first range, fails
and sets the bit again. khugepaged_scan_file() sees the second range,
clears the bit, but this time collapse is successful: the bit is still not
set, but it should be.
> > I beleive we can store the bit in struct page of PTE page table, clearing
> > it if we've mapped anyting that doesn't belong to there from fault path.
> >
> > And in general this calls for more substantial re-design for khugepaged:
> > we might want to split if into two different kernel threads. One works on
> > collapsing small pages into compound and the other changes virtual address
> > space to map the page as PMD.
>
> I had almost same idea of splitting into two threads. Or ask one thread to
> scan two lists: one for pages to collapse, the other for page tables to
> map as PMD. However, that does need substantial re-design.
>
> On the other hand, this patch is much simpler and does the work fine. It is
> not optimal, as it scans the whole mapping for opportunity in just one PMD
> range. But the overhead is not in any hot path, so it just works. The extra
> double check in prepare_pmd_for_collapse() makes sure it will not collapse
> wrong pages into pmd mapped.
>
> Overall, I believe this is an accurate solution for the problem. We can
> further improve it. But I really hope to do the improvements after these
> two patchsets get in.
I don't think AS_COLLAPSE_PMD adds much value. I don't see it reliably
serving the purpose.
--
Kirill A. Shutemov
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-25 10:34 ` Kirill A. Shutemov
@ 2019-06-25 12:33 ` Song Liu
2019-06-25 14:12 ` Kirill A. Shutemov
0 siblings, 1 reply; 14+ messages in thread
From: Song Liu @ 2019-06-25 12:33 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: LKML, Linux-MM, matthew.wilcox, kirill.shutemov, peterz, oleg,
rostedt, Kernel Team, william.kucharski
> On Jun 25, 2019, at 3:34 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Mon, Jun 24, 2019 at 02:25:42PM +0000, Song Liu wrote:
>>
>>
>>> On Jun 24, 2019, at 6:19 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>>>
>>> On Sat, Jun 22, 2019 at 10:48:28PM -0700, Song Liu wrote:
>>>> khugepaged needs exclusive mmap_sem to access page table. When it fails
>>>> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
>>>> is already a THP, khugepaged will not handle this pmd again.
>>>>
>>>> This patch enables the khugepaged to retry retract_page_tables().
>>>>
>>>> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
>>>> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
>>>> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
>>>> compound pages in this address_space.
>>>>
>>>> Since collapse may happen at a later time, some pages may already fault
>>>> in. To handle these pages properly, it is necessary to prepare the pmd
>>>> before collapsing. prepare_pmd_for_collapse() is introduced to prepare
>>>> the pmd by removing rmap, adjusting refcount and mm_counter.
>>>>
>>>> prepare_pmd_for_collapse() also double checks whether all ptes in this
>>>> pmd are mapping to the same THP. This is necessary because some subpage
>>>> of the THP may be replaced, for example by uprobe. In such cases, it
>>>> is not possible to collapse the pmd, so we fall back.
>>>>
>>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>>> ---
>>>> include/linux/pagemap.h | 1 +
>>>> mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
>>>> 2 files changed, 60 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>>>> index 9ec3544baee2..eac881de2a46 100644
>>>> --- a/include/linux/pagemap.h
>>>> +++ b/include/linux/pagemap.h
>>>> @@ -29,6 +29,7 @@ enum mapping_flags {
>>>> AS_EXITING = 4, /* final truncate in progress */
>>>> /* writeback related tags are not used */
>>>> AS_NO_WRITEBACK_TAGS = 5,
>>>> + AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
>>>> };
>>>>
>>>> /**
>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>> index a4f90a1b06f5..9b980327fd9b 100644
>>>> --- a/mm/khugepaged.c
>>>> +++ b/mm/khugepaged.c
>>>> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
>>>> }
>>>>
>>>> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
>>>> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>>>> +
>>>> +/* return whether the pmd is ready for collapse */
>>>> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
>>>> + struct page *hpage, pmd_t *pmd)
>>>> +{
>>>> + unsigned long haddr = page_address_in_vma(hpage, vma);
>>>> + unsigned long addr;
>>>> + int i, count = 0;
>>>> +
>>>> + /* step 1: check all mapped PTEs are to this huge page */
>>>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>>> + pte_t *pte = pte_offset_map(pmd, addr);
>>>> +
>>>> + if (pte_none(*pte))
>>>> + continue;
>>>> +
>>>> + if (hpage + i != vm_normal_page(vma, addr, *pte))
>>>> + return false;
>>>> + count++;
>>>> + }
>>>> +
>>>> + /* step 2: adjust rmap */
>>>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>>> + pte_t *pte = pte_offset_map(pmd, addr);
>>>> + struct page *page;
>>>> +
>>>> + if (pte_none(*pte))
>>>> + continue;
>>>> + page = vm_normal_page(vma, addr, *pte);
>>>> + page_remove_rmap(page, false);
>>>> + }
>>>> +
>>>> + /* step 3: set proper refcount and mm_counters. */
>>>> + page_ref_sub(hpage, count);
>>>> + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
>>>> + return true;
>>>> +}
>>>> +
>>>> +extern pid_t sysctl_dump_pt_pid;
>>>> +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>>>> + struct page *hpage)
>>>> {
>>>> struct vm_area_struct *vma;
>>>> unsigned long addr;
>>>> @@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>>>> pmd = mm_find_pmd(vma->vm_mm, addr);
>>>> if (!pmd)
>>>> continue;
>>>> - /*
>>>> - * We need exclusive mmap_sem to retract page table.
>>>> - * If trylock fails we would end up with pte-mapped THP after
>>>> - * re-fault. Not ideal, but it's more important to not disturb
>>>> - * the system too much.
>>>> - */
>>>> if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
>>>> spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
>>>> - /* assume page table is clear */
>>>> +
>>>> + if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
>>>> + spin_unlock(ptl);
>>>> + up_write(&vma->vm_mm->mmap_sem);
>>>> + continue;
>>>> + }
>>>> _pmd = pmdp_collapse_flush(vma, addr, pmd);
>>>> spin_unlock(ptl);
>>>> up_write(&vma->vm_mm->mmap_sem);
>>>> mm_dec_nr_ptes(vma->vm_mm);
>>>> pte_free(vma->vm_mm, pmd_pgtable(_pmd));
>>>> - }
>>>> + } else
>>>> + set_bit(AS_COLLAPSE_PMD, &mapping->flags);
>>>> }
>>>> i_mmap_unlock_write(mapping);
>>>> }
>>>> @@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
>>>> /*
>>>> * Remove pte page tables, so we can re-fault the page as huge.
>>>> */
>>>> - retract_page_tables(mapping, start);
>>>> + retract_page_tables(mapping, start, new_page);
>>>> *hpage = NULL;
>>>>
>>>> khugepaged_pages_collapsed++;
>>>> @@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>>>> int present, swap;
>>>> int node = NUMA_NO_NODE;
>>>> int result = SCAN_SUCCEED;
>>>> + bool collapse_pmd = false;
>>>>
>>>> present = 0;
>>>> swap = 0;
>>>> @@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>>>> }
>>>>
>>>> if (PageTransCompound(page)) {
>>>> + if (collapse_pmd ||
>>>> + test_and_clear_bit(AS_COLLAPSE_PMD,
>>>> + &mapping->flags)) {
>>>
>>> Who said it's the only PMD range that's subject to collapse? The bit has
>>> to be per-PMD, not per-mapping.
>>
>> I didn't assume this is the only PMD range that subject to collapse.
>> So once we found AS_COLLAPSE_PMD, it will continue scan the whole mapping:
>> retract_page_tables(), then continue.
>
> I still don't get it.
>
> Assume we have two ranges that subject to collapse. khugepaged_scan_file()
> sees and clears AS_COLLAPSE_PMD. Tries to collapse the first range, fails
> and set the bit again. khugepaged_scan_file() sees the second range,
> clears the bit, but this time collapse is successful: the bit is still not
> set, but it should be.
Yeah, you are right. The current logic only covers multiple THPs within a
single call of khugepaged_scan_file(). I missed the case you just described.
What do you think about the first 4 patches of this set and the other set? If
these patches look good, how about we get them in first? I will work on a
proper fix (or re-design) for 5/6 and 6/6 in the meantime.
Thanks,
Song
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-25 12:33 ` Song Liu
@ 2019-06-25 14:12 ` Kirill A. Shutemov
2019-06-25 15:10 ` Song Liu
0 siblings, 1 reply; 14+ messages in thread
From: Kirill A. Shutemov @ 2019-06-25 14:12 UTC (permalink / raw)
To: Song Liu
Cc: LKML, Linux-MM, matthew.wilcox, kirill.shutemov, peterz, oleg,
rostedt, Kernel Team, william.kucharski
On Tue, Jun 25, 2019 at 12:33:03PM +0000, Song Liu wrote:
>
>
> > On Jun 25, 2019, at 3:34 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Mon, Jun 24, 2019 at 02:25:42PM +0000, Song Liu wrote:
> >>
> >>
> >>> On Jun 24, 2019, at 6:19 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >>>
> >>> On Sat, Jun 22, 2019 at 10:48:28PM -0700, Song Liu wrote:
> >>>> khugepaged needs exclusive mmap_sem to access page table. When it fails
> >>>> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
> >>>> is already a THP, khugepaged will not handle this pmd again.
> >>>>
> >>>> This patch enables the khugepaged to retry retract_page_tables().
> >>>>
> >>>> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
> >>>> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
> >>>> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
> >>>> compound pages in this address_space.
> >>>>
> >>>> Since collapse may happen at a later time, some pages may already fault
> >>>> in. To handle these pages properly, it is necessary to prepare the pmd
> >>>> before collapsing. prepare_pmd_for_collapse() is introduced to prepare
> >>>> the pmd by removing rmap, adjusting refcount and mm_counter.
> >>>>
> >>>> prepare_pmd_for_collapse() also double checks whether all ptes in this
> >>>> pmd are mapping to the same THP. This is necessary because some subpage
> >>>> of the THP may be replaced, for example by uprobe. In such cases, it
> >>>> is not possible to collapse the pmd, so we fall back.
> >>>>
> >>>> Signed-off-by: Song Liu <songliubraving@fb.com>
> >>>> ---
> >>>> include/linux/pagemap.h | 1 +
> >>>> mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
> >>>> 2 files changed, 60 insertions(+), 10 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> >>>> index 9ec3544baee2..eac881de2a46 100644
> >>>> --- a/include/linux/pagemap.h
> >>>> +++ b/include/linux/pagemap.h
> >>>> @@ -29,6 +29,7 @@ enum mapping_flags {
> >>>> AS_EXITING = 4, /* final truncate in progress */
> >>>> /* writeback related tags are not used */
> >>>> AS_NO_WRITEBACK_TAGS = 5,
> >>>> + AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
> >>>> };
> >>>>
> >>>> /**
> >>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>>> index a4f90a1b06f5..9b980327fd9b 100644
> >>>> --- a/mm/khugepaged.c
> >>>> +++ b/mm/khugepaged.c
> >>>> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
> >>>> }
> >>>>
> >>>> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
> >>>> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >>>> +
> >>>> +/* return whether the pmd is ready for collapse */
> >>>> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
> >>>> + struct page *hpage, pmd_t *pmd)
> >>>> +{
> >>>> + unsigned long haddr = page_address_in_vma(hpage, vma);
> >>>> + unsigned long addr;
> >>>> + int i, count = 0;
> >>>> +
> >>>> + /* step 1: check all mapped PTEs are to this huge page */
> >>>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> >>>> + pte_t *pte = pte_offset_map(pmd, addr);
> >>>> +
> >>>> + if (pte_none(*pte))
> >>>> + continue;
> >>>> +
> >>>> + if (hpage + i != vm_normal_page(vma, addr, *pte))
> >>>> + return false;
> >>>> + count++;
> >>>> + }
> >>>> +
> >>>> + /* step 2: adjust rmap */
> >>>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> >>>> + pte_t *pte = pte_offset_map(pmd, addr);
> >>>> + struct page *page;
> >>>> +
> >>>> + if (pte_none(*pte))
> >>>> + continue;
> >>>> + page = vm_normal_page(vma, addr, *pte);
> >>>> + page_remove_rmap(page, false);
> >>>> + }
> >>>> +
> >>>> + /* step 3: set proper refcount and mm_counters. */
> >>>> + page_ref_sub(hpage, count);
> >>>> + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
> >>>> + return true;
> >>>> +}
> >>>> +
> >>>> +extern pid_t sysctl_dump_pt_pid;
> >>>> +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
> >>>> + struct page *hpage)
> >>>> {
> >>>> struct vm_area_struct *vma;
> >>>> unsigned long addr;
> >>>> @@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >>>> pmd = mm_find_pmd(vma->vm_mm, addr);
> >>>> if (!pmd)
> >>>> continue;
> >>>> - /*
> >>>> - * We need exclusive mmap_sem to retract page table.
> >>>> - * If trylock fails we would end up with pte-mapped THP after
> >>>> - * re-fault. Not ideal, but it's more important to not disturb
> >>>> - * the system too much.
> >>>> - */
> >>>> if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
> >>>> spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
> >>>> - /* assume page table is clear */
> >>>> +
> >>>> + if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
> >>>> + spin_unlock(ptl);
> >>>> + up_write(&vma->vm_mm->mmap_sem);
> >>>> + continue;
> >>>> + }
> >>>> _pmd = pmdp_collapse_flush(vma, addr, pmd);
> >>>> spin_unlock(ptl);
> >>>> up_write(&vma->vm_mm->mmap_sem);
> >>>> mm_dec_nr_ptes(vma->vm_mm);
> >>>> pte_free(vma->vm_mm, pmd_pgtable(_pmd));
> >>>> - }
> >>>> + } else
> >>>> + set_bit(AS_COLLAPSE_PMD, &mapping->flags);
> >>>> }
> >>>> i_mmap_unlock_write(mapping);
> >>>> }
> >>>> @@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
> >>>> /*
> >>>> * Remove pte page tables, so we can re-fault the page as huge.
> >>>> */
> >>>> - retract_page_tables(mapping, start);
> >>>> + retract_page_tables(mapping, start, new_page);
> >>>> *hpage = NULL;
> >>>>
> >>>> khugepaged_pages_collapsed++;
> >>>> @@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> >>>> int present, swap;
> >>>> int node = NUMA_NO_NODE;
> >>>> int result = SCAN_SUCCEED;
> >>>> + bool collapse_pmd = false;
> >>>>
> >>>> present = 0;
> >>>> swap = 0;
> >>>> @@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> >>>> }
> >>>>
> >>>> if (PageTransCompound(page)) {
> >>>> + if (collapse_pmd ||
> >>>> + test_and_clear_bit(AS_COLLAPSE_PMD,
> >>>> + &mapping->flags)) {
> >>>
> >>> Who said it's the only PMD range that's subject to collapse? The bit has
> >>> to be per-PMD, not per-mapping.
> >>
> >> I didn't assume this is the only PMD range that subject to collapse.
> >> So once we found AS_COLLAPSE_PMD, it will continue scan the whole mapping:
> >> retract_page_tables(), then continue.
> >
> > I still don't get it.
> >
> > Assume we have two ranges that subject to collapse. khugepaged_scan_file()
> > sees and clears AS_COLLAPSE_PMD. Tries to collapse the first range, fails
> > and set the bit again. khugepaged_scan_file() sees the second range,
> > clears the bit, but this time collapse is successful: the bit is still not
> > set, but it should be.
>
> Yeah, you are right. Current logic only covers multiple THPs within single
> call of khugepaged_scan_file(). I missed the case you just described.
>
> What do you think about the first 4 patches of set and the other set? If
> these patches look good, how about we get them in first? I will work on
> proper fix (or re-design) for 5/6 and 6/6 in the meanwhile.
You can use
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
for the first 4 patches.
--
Kirill A. Shutemov
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-25 14:12 ` Kirill A. Shutemov
@ 2019-06-25 15:10 ` Song Liu
0 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2019-06-25 15:10 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: LKML, Linux-MM, matthew.wilcox, kirill.shutemov, peterz, oleg,
rostedt, Kernel Team, william.kucharski
> On Jun 25, 2019, at 7:12 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Tue, Jun 25, 2019 at 12:33:03PM +0000, Song Liu wrote:
>>
>>
>>> On Jun 25, 2019, at 3:34 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>>>
>>> On Mon, Jun 24, 2019 at 02:25:42PM +0000, Song Liu wrote:
>>>>
>>>>
>>>>> On Jun 24, 2019, at 6:19 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>>>>>
>>>>> On Sat, Jun 22, 2019 at 10:48:28PM -0700, Song Liu wrote:
>>>>>> khugepaged needs exclusive mmap_sem to access page table. When it fails
>>>>>> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
>>>>>> is already a THP, khugepaged will not handle this pmd again.
>>>>>>
>>>>>> This patch enables the khugepaged to retry retract_page_tables().
>>>>>>
>>>>>> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
>>>>>> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
>>>>>> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
>>>>>> compound pages in this address_space.
>>>>>>
>>>>>> Since collapse may happen at a later time, some pages may already fault
>>>>>> in. To handle these pages properly, it is necessary to prepare the pmd
>>>>>> before collapsing. prepare_pmd_for_collapse() is introduced to prepare
>>>>>> the pmd by removing rmap, adjusting refcount and mm_counter.
>>>>>>
>>>>>> prepare_pmd_for_collapse() also double checks whether all ptes in this
>>>>>> pmd are mapping to the same THP. This is necessary because some subpage
>>>>>> of the THP may be replaced, for example by uprobe. In such cases, it
>>>>>> is not possible to collapse the pmd, so we fall back.
>>>>>>
>>>>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>>>>> ---
>>>>>> include/linux/pagemap.h | 1 +
>>>>>> mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
>>>>>> 2 files changed, 60 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>>>>>> index 9ec3544baee2..eac881de2a46 100644
>>>>>> --- a/include/linux/pagemap.h
>>>>>> +++ b/include/linux/pagemap.h
>>>>>> @@ -29,6 +29,7 @@ enum mapping_flags {
>>>>>> AS_EXITING = 4, /* final truncate in progress */
>>>>>> /* writeback related tags are not used */
>>>>>> AS_NO_WRITEBACK_TAGS = 5,
>>>>>> + AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
>>>>>> };
>>>>>>
>>>>>> /**
>>>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>>>> index a4f90a1b06f5..9b980327fd9b 100644
>>>>>> --- a/mm/khugepaged.c
>>>>>> +++ b/mm/khugepaged.c
>>>>>> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
>>>>>> }
>>>>>>
>>>>>> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
>>>>>> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>>>>>> +
>>>>>> +/* return whether the pmd is ready for collapse */
>>>>>> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
>>>>>> + struct page *hpage, pmd_t *pmd)
>>>>>> +{
>>>>>> + unsigned long haddr = page_address_in_vma(hpage, vma);
>>>>>> + unsigned long addr;
>>>>>> + int i, count = 0;
>>>>>> +
>>>>>> + /* step 1: check all mapped PTEs are to this huge page */
>>>>>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>>>>> + pte_t *pte = pte_offset_map(pmd, addr);
>>>>>> +
>>>>>> + if (pte_none(*pte))
>>>>>> + continue;
>>>>>> +
>>>>>> + if (hpage + i != vm_normal_page(vma, addr, *pte))
>>>>>> + return false;
>>>>>> + count++;
>>>>>> + }
>>>>>> +
>>>>>> + /* step 2: adjust rmap */
>>>>>> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>>>>> + pte_t *pte = pte_offset_map(pmd, addr);
>>>>>> + struct page *page;
>>>>>> +
>>>>>> + if (pte_none(*pte))
>>>>>> + continue;
>>>>>> + page = vm_normal_page(vma, addr, *pte);
>>>>>> + page_remove_rmap(page, false);
>>>>>> + }
>>>>>> +
>>>>>> + /* step 3: set proper refcount and mm_counters. */
>>>>>> + page_ref_sub(hpage, count);
>>>>>> + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
>>>>>> + return true;
>>>>>> +}
>>>>>> +
>>>>>> +extern pid_t sysctl_dump_pt_pid;
>>>>>> +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>>>>>> + struct page *hpage)
>>>>>> {
>>>>>> struct vm_area_struct *vma;
>>>>>> unsigned long addr;
>>>>>> @@ -1273,21 +1313,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>>>>>> pmd = mm_find_pmd(vma->vm_mm, addr);
>>>>>> if (!pmd)
>>>>>> continue;
>>>>>> - /*
>>>>>> - * We need exclusive mmap_sem to retract page table.
>>>>>> - * If trylock fails we would end up with pte-mapped THP after
>>>>>> - * re-fault. Not ideal, but it's more important to not disturb
>>>>>> - * the system too much.
>>>>>> - */
>>>>>> if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
>>>>>> spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
>>>>>> - /* assume page table is clear */
>>>>>> +
>>>>>> + if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
>>>>>> + spin_unlock(ptl);
>>>>>> + up_write(&vma->vm_mm->mmap_sem);
>>>>>> + continue;
>>>>>> + }
>>>>>> _pmd = pmdp_collapse_flush(vma, addr, pmd);
>>>>>> spin_unlock(ptl);
>>>>>> up_write(&vma->vm_mm->mmap_sem);
>>>>>> mm_dec_nr_ptes(vma->vm_mm);
>>>>>> pte_free(vma->vm_mm, pmd_pgtable(_pmd));
>>>>>> - }
>>>>>> + } else
>>>>>> + set_bit(AS_COLLAPSE_PMD, &mapping->flags);
>>>>>> }
>>>>>> i_mmap_unlock_write(mapping);
>>>>>> }
>>>>>> @@ -1561,7 +1601,7 @@ static void collapse_file(struct mm_struct *mm,
>>>>>> /*
>>>>>> * Remove pte page tables, so we can re-fault the page as huge.
>>>>>> */
>>>>>> - retract_page_tables(mapping, start);
>>>>>> + retract_page_tables(mapping, start, new_page);
>>>>>> *hpage = NULL;
>>>>>>
>>>>>> khugepaged_pages_collapsed++;
>>>>>> @@ -1622,6 +1662,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>>>>>> int present, swap;
>>>>>> int node = NUMA_NO_NODE;
>>>>>> int result = SCAN_SUCCEED;
>>>>>> + bool collapse_pmd = false;
>>>>>>
>>>>>> present = 0;
>>>>>> swap = 0;
>>>>>> @@ -1640,6 +1681,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>>>>>> }
>>>>>>
>>>>>> if (PageTransCompound(page)) {
>>>>>> + if (collapse_pmd ||
>>>>>> + test_and_clear_bit(AS_COLLAPSE_PMD,
>>>>>> + &mapping->flags)) {
>>>>>
>>>>> Who said it's the only PMD range that's subject to collapse? The bit has
>>>>> to be per-PMD, not per-mapping.
>>>>
>>>> I didn't assume this is the only PMD range that subject to collapse.
>>>> So once we found AS_COLLAPSE_PMD, it will continue scan the whole mapping:
>>>> retract_page_tables(), then continue.
>>>
>>> I still don't get it.
>>>
>>> Assume we have two ranges that subject to collapse. khugepaged_scan_file()
>>> sees and clears AS_COLLAPSE_PMD. Tries to collapse the first range, fails
>>> and set the bit again. khugepaged_scan_file() sees the second range,
>>> clears the bit, but this time collapse is successful: the bit is still not
>>> set, but it should be.
>>
>> Yeah, you are right. Current logic only covers multiple THPs within single
>> call of khugepaged_scan_file(). I missed the case you just described.
>>
>> What do you think about the first 4 patches of set and the other set? If
>> these patches look good, how about we get them in first? I will work on
>> proper fix (or re-design) for 5/6 and 6/6 in the meanwhile.
>
> You can use
>
> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>
> for the first 4 patches.
Thanks Kirill!
Song
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP
2019-06-23 5:48 ` [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP Song Liu
2019-06-24 13:19 ` Kirill A. Shutemov
@ 2019-06-24 22:06 ` Song Liu
1 sibling, 0 replies; 14+ messages in thread
From: Song Liu @ 2019-06-24 22:06 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: matthew.wilcox, kirill.shutemov, peterz, oleg, rostedt,
Kernel Team, william.kucharski
> On Jun 22, 2019, at 10:48 PM, Song Liu <songliubraving@fb.com> wrote:
>
> khugepaged needs exclusive mmap_sem to access page table. When it fails
> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
> is already a THP, khugepaged will not handle this pmd again.
>
> This patch enables khugepaged to retry retract_page_tables().
>
> A new flag AS_COLLAPSE_PMD is introduced to show the address_space may
> contain pte-mapped THPs. When khugepaged fails to trylock the mmap_sem,
> it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged will retry
> compound pages in this address_space.
>
> Since collapse may happen at a later time, some pages may already fault
> in. To handle these pages properly, it is necessary to prepare the pmd
> before collapsing. prepare_pmd_for_collapse() is introduced to prepare
> the pmd by removing rmap, adjusting refcount and mm_counter.
>
> prepare_pmd_for_collapse() also double checks whether all ptes in this
> pmd are mapping to the same THP. This is necessary because some subpage
> of the THP may be replaced, for example by uprobe. In such cases, it
> is not possible to collapse the pmd, so we fall back.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> include/linux/pagemap.h | 1 +
> mm/khugepaged.c | 69 +++++++++++++++++++++++++++++++++++------
> 2 files changed, 60 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 9ec3544baee2..eac881de2a46 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -29,6 +29,7 @@ enum mapping_flags {
> AS_EXITING = 4, /* final truncate in progress */
> /* writeback related tags are not used */
> AS_NO_WRITEBACK_TAGS = 5,
> + AS_COLLAPSE_PMD = 6, /* try collapse pmd for THP */
> };
>
> /**
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a4f90a1b06f5..9b980327fd9b 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1254,7 +1254,47 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
> }
>
> #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
> -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> +
> +/* return whether the pmd is ready for collapse */
> +bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
> + struct page *hpage, pmd_t *pmd)
kbuild test robot reported I missed "static" here. But I am holding off on
sending a newer version that just fixes this.
Thanks,
Song
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v6 6/6] uprobe: collapse THP pmd after removing all uprobes
2019-06-23 5:48 [PATCH v6 0/6] THP aware uprobe Song Liu
` (4 preceding siblings ...)
2019-06-23 5:48 ` [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP Song Liu
@ 2019-06-23 5:48 ` Song Liu
5 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2019-06-23 5:48 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: matthew.wilcox, kirill.shutemov, peterz, oleg, rostedt,
kernel-team, william.kucharski, Song Liu
After all uprobes are removed from the huge page (with PTE pgtable), it
is possible to collapse the pmd and benefit from THP again. This patch
does the collapse by setting AS_COLLAPSE_PMD. khugepaged would then
retract the page table.
A check for vma->anon_vma is removed from retract_page_tables(). The
check was initially marked as "probably overkill". The code works well
without the check.
An issue on earlier version was discovered by kbuild test robot.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
kernel/events/uprobes.c | 6 +++++-
mm/khugepaged.c | 3 ---
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a20d7b43a056..418382259f61 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -474,6 +474,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
struct page *old_page, *new_page;
struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
+ struct page *orig_page = NULL;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
@@ -512,7 +513,6 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
if (!is_register) {
- struct page *orig_page;
pgoff_t index;
index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
@@ -540,6 +540,10 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
if (ret && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
+ if (!ret && orig_page && PageTransCompound(orig_page))
+ set_bit(AS_COLLAPSE_PMD,
+ &compound_head(orig_page)->mapping->flags);
+
return ret;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9b980327fd9b..2e277a2d731f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1302,9 +1302,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- /* probably overkill */
- if (vma->anon_vma)
- continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK)
continue;
--
2.17.1
^ permalink raw reply related [flat|nested] 14+ messages in thread