* [PATCH v10 1/4] mm: move memcmp_pages() and pages_identical()
2019-07-30 5:23 [PATCH v10 0/4] THP aware uprobe Song Liu
@ 2019-07-30 5:23 ` Song Liu
2019-07-30 5:23 ` [PATCH v10 2/4] uprobe: use original page when all uprobes are removed Song Liu
` (2 subsequent siblings)
3 siblings, 0 replies; 11+ messages in thread
From: Song Liu @ 2019-07-30 5:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, akpm
Cc: matthew.wilcox, kirill.shutemov, oleg, kernel-team,
william.kucharski, srikar, Song Liu
This patch moves memcmp_pages() to mm/util.c and pages_identical() to
mm.h, so that we can use them in other files.
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/mm.h | 7 +++++++
mm/ksm.c | 18 ------------------
mm/util.c | 13 +++++++++++++
3 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..f189176dabed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2891,5 +2891,12 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif
+extern int memcmp_pages(struct page *page1, struct page *page2);
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+ return !memcmp_pages(page1, page2);
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/mm/ksm.c b/mm/ksm.c
index 3dc4346411e4..dbee2eb4dd05 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page)
return checksum;
}
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
-}
-
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
diff --git a/mm/util.c b/mm/util.c
index e6351a80f248..0d5e2f425612 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -783,3 +783,16 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
out:
return res;
}
+
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
--
2.17.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v10 2/4] uprobe: use original page when all uprobes are removed
2019-07-30 5:23 [PATCH v10 0/4] THP aware uprobe Song Liu
2019-07-30 5:23 ` [PATCH v10 1/4] mm: move memcmp_pages() and pages_identical() Song Liu
@ 2019-07-30 5:23 ` Song Liu
2019-07-30 16:58 ` Oleg Nesterov
2019-07-30 5:23 ` [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD Song Liu
2019-07-30 5:23 ` [PATCH v10 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT Song Liu
3 siblings, 1 reply; 11+ messages in thread
From: Song Liu @ 2019-07-30 5:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, akpm
Cc: matthew.wilcox, kirill.shutemov, oleg, kernel-team,
william.kucharski, srikar, Song Liu
Currently, uprobe swaps the target page with an anonymous page in both
install_breakpoint() and remove_breakpoint(). When all uprobes on a page
are removed, the given mm is still using an anonymous page (not the
original page).
This patch allows uprobe to use original page when possible (all uprobes
on the page are already removed). As suggested by Oleg, we unmap the
old_page and let the original page fault in.
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
kernel/events/uprobes.c | 66 +++++++++++++++++++++++++++++++----------
1 file changed, 51 insertions(+), 15 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 84fa00497c49..648f47553bff 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -143,10 +143,12 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
*
* @vma: vma that holds the pte pointing to page
* @addr: address the old @page is mapped at
- * @page: the cowed page we are replacing by kpage
- * @kpage: the modified page we replace page by
+ * @old_page: the page we are replacing by new_page
+ * @new_page: the modified page we replace page by
*
- * Returns 0 on success, -EFAULT on failure.
+ * If @new_page is NULL, only unmap @old_page.
+ *
+ * Returns 0 on success, negative error code otherwise.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
@@ -166,10 +168,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
- false);
- if (err)
- return err;
+ if (new_page) {
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false);
+ if (err)
+ return err;
+ }
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
@@ -177,15 +181,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (new_page)
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ if (new_page) {
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ } else
+ /* no new page, just dec_mm_counter for old_page */
+ dec_mm_counter(mm, MM_ANONPAGES);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, mm_counter_file(old_page));
@@ -194,8 +203,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
ptep_clear_flush_notify(vma, addr, pvmw.pte);
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ if (new_page)
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
@@ -488,6 +498,10 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
ref_ctr_updated = 1;
}
+ ret = 0;
+ if (!is_register && !PageAnon(old_page))
+ goto put_old;
+
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
@@ -501,8 +515,30 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (!is_register) {
+ struct page *orig_page;
+ pgoff_t index;
+
+ VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
+
+ index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+ orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
+ index);
+
+ if (orig_page) {
+ if (PageUptodate(orig_page) &&
+ pages_identical(new_page, orig_page)) {
+ /* let go new_page */
+ put_page(new_page);
+ new_page = NULL;
+ }
+ put_page(orig_page);
+ }
+ }
+
ret = __replace_page(vma, vaddr, old_page, new_page);
- put_page(new_page);
+ if (new_page)
+ put_page(new_page);
put_old:
put_page(old_page);
--
2.17.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v10 2/4] uprobe: use original page when all uprobes are removed
2019-07-30 5:23 ` [PATCH v10 2/4] uprobe: use original page when all uprobes are removed Song Liu
@ 2019-07-30 16:58 ` Oleg Nesterov
0 siblings, 0 replies; 11+ messages in thread
From: Oleg Nesterov @ 2019-07-30 16:58 UTC (permalink / raw)
To: Song Liu
Cc: linux-kernel, linux-mm, akpm, matthew.wilcox, kirill.shutemov,
kernel-team, william.kucharski, srikar
On 07/29, Song Liu wrote:
>
> This patch allows uprobe to use original page when possible (all uprobes
> on the page are already removed).
Again, the changelog is not 100% accurate. all uprobes removed _and_
the original page is in page cache and uptodate.
Otherwise looks correct...
Oleg.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD
2019-07-30 5:23 [PATCH v10 0/4] THP aware uprobe Song Liu
2019-07-30 5:23 ` [PATCH v10 1/4] mm: move memcmp_pages() and pages_identical() Song Liu
2019-07-30 5:23 ` [PATCH v10 2/4] uprobe: use original page when all uprobes are removed Song Liu
@ 2019-07-30 5:23 ` Song Liu
2019-07-30 16:11 ` Oleg Nesterov
2019-07-30 5:23 ` [PATCH v10 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT Song Liu
3 siblings, 1 reply; 11+ messages in thread
From: Song Liu @ 2019-07-30 5:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, akpm
Cc: matthew.wilcox, kirill.shutemov, oleg, kernel-team,
william.kucharski, srikar, Song Liu
This patch introduces a new foll_flag: FOLL_SPLIT_PMD. As the name says
FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlying huge
page stays as-is.
FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
but would switch back to huge page and huge pmd later on. One such example
is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe.
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/mm.h | 1 +
mm/gup.c | 8 ++++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f189176dabed..74db879711eb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2614,6 +2614,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
/*
* NOTE on FOLL_LONGTERM:
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..3c514e223ce3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT) {
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else {
+ } else if (flags & FOLL_SPLIT) {
if (unlikely(!try_get_page(page))) {
spin_unlock(ptl);
return ERR_PTR(-ENOMEM);
@@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
put_page(page);
if (pmd_none(*pmd))
return no_page_table(vma, flags);
+ } else { /* flags & FOLL_SPLIT_PMD */
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ ret = pte_alloc(mm, pmd);
}
return ret ? ERR_PTR(ret) :
--
2.17.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD
2019-07-30 5:23 ` [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD Song Liu
@ 2019-07-30 16:11 ` Oleg Nesterov
2019-07-30 17:42 ` Song Liu
0 siblings, 1 reply; 11+ messages in thread
From: Oleg Nesterov @ 2019-07-30 16:11 UTC (permalink / raw)
To: Song Liu
Cc: linux-kernel, linux-mm, akpm, matthew.wilcox, kirill.shutemov,
kernel-team, william.kucharski, srikar
I don't understand this code, so I can't review, but.
On 07/29, Song Liu wrote:
>
> This patches introduces a new foll_flag: FOLL_SPLIT_PMD. As the name says
> FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlining huge
> page stays as-is.
>
> FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
> but would switch back to huge page and huge pmd on. One of such example
> is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe.
So after the next patch we have a single user of FOLL_SPLIT_PMD (uprobes)
and a single user of FOLL_SPLIT: arch/s390/mm/gmap.c:thp_split_mm().
Hmm.
> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> spin_unlock(ptl);
> return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> }
> - if (flags & FOLL_SPLIT) {
> + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
> int ret;
> page = pmd_page(*pmd);
> if (is_huge_zero_page(page)) {
> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> split_huge_pmd(vma, pmd, address);
> if (pmd_trans_unstable(pmd))
> ret = -EBUSY;
> - } else {
> + } else if (flags & FOLL_SPLIT) {
> if (unlikely(!try_get_page(page))) {
> spin_unlock(ptl);
> return ERR_PTR(-ENOMEM);
> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> put_page(page);
> if (pmd_none(*pmd))
> return no_page_table(vma, flags);
> + } else { /* flags & FOLL_SPLIT_PMD */
> + spin_unlock(ptl);
> + split_huge_pmd(vma, pmd, address);
> + ret = pte_alloc(mm, pmd);
I fail to understand why this differs from the is_huge_zero_page() case above.
Anyway, ret = pte_alloc(mm, pmd) can't be correct. If __pte_alloc() fails pte_alloc()
will return 1. This will fool the IS_ERR(page) check in __get_user_pages().
Oleg.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD
2019-07-30 16:11 ` Oleg Nesterov
@ 2019-07-30 17:42 ` Song Liu
2019-07-31 15:18 ` Oleg Nesterov
0 siblings, 1 reply; 11+ messages in thread
From: Song Liu @ 2019-07-30 17:42 UTC (permalink / raw)
To: Oleg Nesterov
Cc: lkml, linux-mm, akpm, matthew.wilcox, kirill.shutemov,
Kernel Team, william.kucharski, srikar
> On Jul 30, 2019, at 9:11 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>
> I don't understand this code, so I can't review, but.
>
> On 07/29, Song Liu wrote:
>>
>> This patches introduces a new foll_flag: FOLL_SPLIT_PMD. As the name says
>> FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlining huge
>> page stays as-is.
>>
>> FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
>> but would switch back to huge page and huge pmd on. One of such example
>> is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe.
>
> So after the next patch we have a single user of FOLL_SPLIT_PMD (uprobes)
> and a single user of FOLL_SPLIT: arch/s390/mm/gmap.c:thp_split_mm().
>
> Hmm.
I think this is what we want. :)
FOLL_SPLIT is the fallback solution for users who cannot handle THP. With
more THP aware code, there will be fewer users of FOLL_SPLIT.
>
>> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> spin_unlock(ptl);
>> return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>> }
>> - if (flags & FOLL_SPLIT) {
>> + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>> int ret;
>> page = pmd_page(*pmd);
>> if (is_huge_zero_page(page)) {
>> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> split_huge_pmd(vma, pmd, address);
>> if (pmd_trans_unstable(pmd))
>> ret = -EBUSY;
>> - } else {
>> + } else if (flags & FOLL_SPLIT) {
>> if (unlikely(!try_get_page(page))) {
>> spin_unlock(ptl);
>> return ERR_PTR(-ENOMEM);
>> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> put_page(page);
>> if (pmd_none(*pmd))
>> return no_page_table(vma, flags);
>> + } else { /* flags & FOLL_SPLIT_PMD */
>> + spin_unlock(ptl);
>> + split_huge_pmd(vma, pmd, address);
>> + ret = pte_alloc(mm, pmd);
>
> I fail to understand why this differs from the is_huge_zero_page() case above.
split_huge_pmd() handles is_huge_zero_page() differently. In this case, we
cannot use the pmd_trans_unstable() check.
>
> Anyway, ret = pte_alloc(mm, pmd) can't be correct. If __pte_alloc() fails pte_alloc()
> will return 1. This will fool the IS_ERR(page) check in __get_user_pages().
Great catch! Let me fix it.
Thanks,
Song
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD
2019-07-30 17:42 ` Song Liu
@ 2019-07-31 15:18 ` Oleg Nesterov
2019-07-31 17:10 ` Song Liu
0 siblings, 1 reply; 11+ messages in thread
From: Oleg Nesterov @ 2019-07-31 15:18 UTC (permalink / raw)
To: Song Liu
Cc: lkml, linux-mm, akpm, matthew.wilcox, kirill.shutemov,
Kernel Team, william.kucharski, srikar
On 07/30, Song Liu wrote:
>
>
> > On Jul 30, 2019, at 9:11 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > So after the next patch we have a single user of FOLL_SPLIT_PMD (uprobes)
> > and a single user of FOLL_SPLIT: arch/s390/mm/gmap.c:thp_split_mm().
> >
> > Hmm.
>
> I think this is what we want. :)
We? I don't ;)
> FOLL_SPLIT is the fallback solution for users who cannot handle THP.
and again, we have a single user: thp_split_mm(). I do not know if it
can use FOLL_SPLIT_PMD or not, may be you can take a look...
> With
> more THP aware code, there will be fewer users of FOLL_SPLIT.
Fewer than 1? Good ;)
> >> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> >> spin_unlock(ptl);
> >> return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> >> }
> >> - if (flags & FOLL_SPLIT) {
> >> + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
> >> int ret;
> >> page = pmd_page(*pmd);
> >> if (is_huge_zero_page(page)) {
> >> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> >> split_huge_pmd(vma, pmd, address);
> >> if (pmd_trans_unstable(pmd))
> >> ret = -EBUSY;
> >> - } else {
> >> + } else if (flags & FOLL_SPLIT) {
> >> if (unlikely(!try_get_page(page))) {
> >> spin_unlock(ptl);
> >> return ERR_PTR(-ENOMEM);
> >> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> >> put_page(page);
> >> if (pmd_none(*pmd))
> >> return no_page_table(vma, flags);
> >> + } else { /* flags & FOLL_SPLIT_PMD */
> >> + spin_unlock(ptl);
> >> + split_huge_pmd(vma, pmd, address);
> >> + ret = pte_alloc(mm, pmd);
> >
> > I fail to understand why this differs from the is_huge_zero_page() case above.
>
> split_huge_pmd() handles is_huge_zero_page() differently. In this case, we
> cannot use the pmd_trans_unstable() check.
Please correct me, but iiuc the problem is not that split_huge_pmd() handles
is_huge_zero_page() differently, the problem is that __split_huge_pmd_locked()
handles the !vma_is_anonymous(vma) differently and returns with pmd_none() = T
after pmdp_huge_clear_flush_notify(). This means that pmd_trans_unstable() will
fail.
Now, I don't understand why do we need pmd_trans_unstable() after
split_huge_pmd(huge-zero-pmd), but whatever reason we have, why can't we
unify both cases?
IOW, could you explain why the path below is wrong?
Oleg.
--- x/mm/gup.c
+++ x/mm/gup.c
@@ -399,14 +399,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT) {
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
+ if ((flags & FOLL_SPLIT_PMD) || is_huge_zero_page(page)) {
spin_unlock(ptl);
- ret = 0;
split_huge_pmd(vma, pmd, address);
- if (pmd_trans_unstable(pmd))
+ ret = 0;
+ if (pte_alloc(mm, pmd))
+ ret = -ENOMEM;
+ else if (pmd_trans_unstable(pmd))
ret = -EBUSY;
} else {
if (unlikely(!try_get_page(page))) {
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD
2019-07-31 15:18 ` Oleg Nesterov
@ 2019-07-31 17:10 ` Song Liu
2019-08-01 15:04 ` Oleg Nesterov
0 siblings, 1 reply; 11+ messages in thread
From: Song Liu @ 2019-07-31 17:10 UTC (permalink / raw)
To: Oleg Nesterov
Cc: lkml, linux-mm, akpm, matthew.wilcox, kirill.shutemov,
Kernel Team, william.kucharski, srikar
> On Jul 31, 2019, at 8:18 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 07/30, Song Liu wrote:
>>
>>
>>> On Jul 30, 2019, at 9:11 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>>>
>>> So after the next patch we have a single user of FOLL_SPLIT_PMD (uprobes)
>>> and a single user of FOLL_SPLIT: arch/s390/mm/gmap.c:thp_split_mm().
>>>
>>> Hmm.
>>
>> I think this is what we want. :)
>
> We? I don't ;)
>
>> FOLL_SPLIT is the fallback solution for users who cannot handle THP.
>
> and again, we have a single user: thp_split_mm(). I do not know if it
> can use FOLL_SPLIT_PMD or not, may be you can take a look...
I haven't played with s390, so it gonna take me some time to ramp up.
I will add it to my to-do list.
>
>> With
>> more THP aware code, there will be fewer users of FOLL_SPLIT.
>
> Fewer than 1? Good ;)
Yes! It will be great if thp_split_mm() can use FOLL_SPLIT_PMD
instead.
>
>>>> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>>>> spin_unlock(ptl);
>>>> return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>>>> }
>>>> - if (flags & FOLL_SPLIT) {
>>>> + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>>>> int ret;
>>>> page = pmd_page(*pmd);
>>>> if (is_huge_zero_page(page)) {
>>>> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>>>> split_huge_pmd(vma, pmd, address);
>>>> if (pmd_trans_unstable(pmd))
>>>> ret = -EBUSY;
>>>> - } else {
>>>> + } else if (flags & FOLL_SPLIT) {
>>>> if (unlikely(!try_get_page(page))) {
>>>> spin_unlock(ptl);
>>>> return ERR_PTR(-ENOMEM);
>>>> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>>>> put_page(page);
>>>> if (pmd_none(*pmd))
>>>> return no_page_table(vma, flags);
>>>> + } else { /* flags & FOLL_SPLIT_PMD */
>>>> + spin_unlock(ptl);
>>>> + split_huge_pmd(vma, pmd, address);
>>>> + ret = pte_alloc(mm, pmd);
>>>
>>> I fail to understand why this differs from the is_huge_zero_page() case above.
>>
>> split_huge_pmd() handles is_huge_zero_page() differently. In this case, we
>> cannot use the pmd_trans_unstable() check.
>
> Please correct me, but iiuc the problem is not that split_huge_pmd() handles
> is_huge_zero_page() differently, the problem is that __split_huge_pmd_locked()
> handles the !vma_is_anonymous(vma) differently and returns with pmd_none() = T
> after pmdp_huge_clear_flush_notify(). This means that pmd_trans_unstable() will
> fail.
Agreed.
>
> Now, I don't understand why do we need pmd_trans_unstable() after
> split_huge_pmd(huge-zero-pmd), but whatever reason we have, why can't we
> unify both cases?
>
> IOW, could you explain why the path below is wrong?
I _think_ the following patch works (haven't fully tested yet). But I am not
sure whether this is the best. By separating the two cases, we don't duplicate
much code. And it is clear that the two cases are handled differently.
Therefore, I would prefer to keep these separate for now.
Thanks,
Song
>
>
> --- x/mm/gup.c
> +++ x/mm/gup.c
> @@ -399,14 +399,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> spin_unlock(ptl);
> return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> }
> - if (flags & FOLL_SPLIT) {
> + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
> int ret;
> page = pmd_page(*pmd);
> - if (is_huge_zero_page(page)) {
> + if ((flags & FOLL_SPLIT_PMD) || is_huge_zero_page(page)) {
> spin_unlock(ptl);
> - ret = 0;
> split_huge_pmd(vma, pmd, address);
> - if (pmd_trans_unstable(pmd))
> + ret = 0;
> + if (pte_alloc(mm, pmd))
> + ret = -ENOMEM;
> + else if (pmd_trans_unstable(pmd))
> ret = -EBUSY;
> } else {
> if (unlikely(!try_get_page(page))) {
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD
2019-07-31 17:10 ` Song Liu
@ 2019-08-01 15:04 ` Oleg Nesterov
0 siblings, 0 replies; 11+ messages in thread
From: Oleg Nesterov @ 2019-08-01 15:04 UTC (permalink / raw)
To: Song Liu
Cc: lkml, linux-mm, akpm, matthew.wilcox, kirill.shutemov,
Kernel Team, william.kucharski, srikar
On 07/31, Song Liu wrote:
>
> > On Jul 31, 2019, at 8:18 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > Now, I don't understand why do we need pmd_trans_unstable() after
> > split_huge_pmd(huge-zero-pmd), but whatever reason we have, why can't we
> > unify both cases?
> >
> > IOW, could you explain why the path below is wrong?
>
> I _think_ the following patch works (haven't fully tested yet). But I am not
> sure whether this is the best. By separating the two cases, we don't duplicate
> much code. And it is clear that the two cases are handled differently.
> Therefore, I would prefer to keep these separate for now.
I disagree. I think this separation makes the code less readable/understandable.
Exactly because it handles two cases differently and it is absolutely not clear
why.
But I can't argue, please forget.
Oleg.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v10 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT
2019-07-30 5:23 [PATCH v10 0/4] THP aware uprobe Song Liu
` (2 preceding siblings ...)
2019-07-30 5:23 ` [PATCH v10 3/4] mm, thp: introduce FOLL_SPLIT_PMD Song Liu
@ 2019-07-30 5:23 ` Song Liu
3 siblings, 0 replies; 11+ messages in thread
From: Song Liu @ 2019-07-30 5:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, akpm
Cc: matthew.wilcox, kirill.shutemov, oleg, kernel-team,
william.kucharski, srikar, Song Liu
This patch uses newly added FOLL_SPLIT_PMD in uprobe. This preserves the
huge page when the uprobe is enabled. When the uprobe is disabled, newer
instances of the same application could still benefit from huge page.
For the next step, we will enable khugepaged to regroup the pmd, so that
existing instances of the application could also benefit from huge page
after the uprobe is disabled.
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
kernel/events/uprobes.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 648f47553bff..27b596f14463 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -155,7 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
- .page = old_page,
+ .page = compound_head(old_page),
.vma = vma,
.address = addr,
};
@@ -166,8 +166,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
- VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
if (new_page) {
err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
&memcg, false);
@@ -481,7 +479,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1,
- FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+ FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);
if (ret <= 0)
return ret;
--
2.17.1
^ permalink raw reply related [flat|nested] 11+ messages in thread