* [RFC PATCH 1/6] mm/memory: add copy_huge_page_from_user for hugetlb userfaultfd support
2016-06-06 17:45 [RFC PATCH 0/6] hugetlb support for userfaultfd Mike Kravetz
@ 2016-06-06 17:45 ` Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 2/6] mm/hugetlb: add hugetlb_mcopy_atomic_pte for " Mike Kravetz
` (4 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Mike Kravetz @ 2016-06-06 17:45 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: Andrea Arcangeli, Hugh Dickins, Dave Hansen, Kirill A. Shutemov,
Naoya Horiguchi, Hillf Danton, Michal Hocko, Andrew Morton,
Mike Kravetz
userfaultfd UFFDIO_COPY allows user level code to copy data to a page
at fault time. The data is copied from user space to a newly allocated
huge page. The new routine copy_huge_page_from_user performs this copy.
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
include/linux/mm.h | 3 +++
mm/memory.c | 22 ++++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0510282..7ecc7e7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2355,6 +2355,9 @@ extern void clear_huge_page(struct page *page,
extern void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma,
unsigned int pages_per_huge_page);
+extern long copy_huge_page_from_user(const void __user *usr_src,
+ struct page *dst_page,
+ unsigned int pages_per_huge_page);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
extern struct page_ext_operations debug_guardpage_ops;
diff --git a/mm/memory.c b/mm/memory.c
index 19584b9..c44ddad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3970,6 +3970,28 @@ void copy_user_huge_page(struct page *dst, struct page *src,
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
+
+long copy_huge_page_from_user(const void __user *usr_src,
+ struct page *dst_page,
+ unsigned int pages_per_huge_page)
+{
+ void *src = (void *)usr_src;
+ void *page_kaddr;
+ long i, rc = 0;
+
+ for (i = 0; i < pages_per_huge_page; i++) {
+ page_kaddr = kmap_atomic(dst_page + i);
+ rc = copy_from_user(page_kaddr,
+ (const void __user *)(src + i * PAGE_SIZE),
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+ if (rc)
+ break;
+
+ cond_resched();
+ }
+ return rc;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
--
2.4.11
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC PATCH 2/6] mm/hugetlb: add hugetlb_mcopy_atomic_pte for userfaultfd support
2016-06-06 17:45 [RFC PATCH 0/6] hugetlb support for userfaultfd Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 1/6] mm/memory: add copy_huge_page_from_user for hugetlb userfaultfd support Mike Kravetz
@ 2016-06-06 17:45 ` Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 3/6] mm/userfaultfd: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY Mike Kravetz
` (3 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Mike Kravetz @ 2016-06-06 17:45 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: Andrea Arcangeli, Hugh Dickins, Dave Hansen, Kirill A. Shutemov,
Naoya Horiguchi, Hillf Danton, Michal Hocko, Andrew Morton,
Mike Kravetz
hugetlb_mcopy_atomic_pte is the low level routine that implements
the userfaultfd UFFDIO_COPY command. It is based on the existing
mcopy_atomic_pte routine with modifications for huge pages.
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
include/linux/hugetlb.h | 8 ++++-
mm/hugetlb.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 87 insertions(+), 1 deletion(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c26d463..35697b2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -81,6 +81,11 @@ void hugetlb_show_meminfo(void);
unsigned long hugetlb_total_pages(void);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep);
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags);
@@ -149,6 +154,8 @@ static inline void hugetlb_show_meminfo(void)
#define is_hugepage_only_range(mm, addr, len) 0
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
+#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
+ src_addr, pagep) ({ BUG(); 0; })
#define huge_pte_offset(mm, address) 0
static inline int dequeue_hwpoisoned_huge_page(struct page *page)
{
@@ -272,7 +279,6 @@ static inline bool is_file_hugepages(struct file *file)
return is_file_shm_hugepages(file);
}
-
#else /* !CONFIG_HUGETLBFS */
#define is_file_hugepages(file) false
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0949d0d..4943d8b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3837,6 +3837,86 @@ out_mutex:
return ret;
}
+/*
+ * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
+ * modifications for huge pages.
+ */
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ struct hstate *h = hstate_vma(dst_vma);
+ pte_t _dst_pte;
+ spinlock_t *ptl;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (!page)
+ goto out;
+
+ ret = copy_huge_page_from_user((const void __user *) src_addr,
+ page, pages_per_huge_page(h));
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ ret = -EFAULT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+ set_page_huge_active(page);
+
+ ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+ spin_lock(ptl);
+
+ ret = -EEXIST;
+ if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ goto out_release_unlock;
+
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+
+ _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
+ _dst_pte = pte_mkyoung(_dst_pte);
+
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
+ dst_vma->vm_flags & VM_WRITE);
+ hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ spin_unlock(ptl);
+ ret = 0;
+out:
+ return ret;
+out_release_unlock:
+ spin_unlock(ptl);
+ put_page(page);
+ goto out;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
--
2.4.11
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC PATCH 3/6] mm/userfaultfd: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY
2016-06-06 17:45 [RFC PATCH 0/6] hugetlb support for userfaultfd Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 1/6] mm/memory: add copy_huge_page_from_user for hugetlb userfaultfd support Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 2/6] mm/hugetlb: add hugetlb_mcopy_atomic_pte for " Mike Kravetz
@ 2016-06-06 17:45 ` Mike Kravetz
2016-06-07 6:27 ` Hillf Danton
2016-06-06 17:45 ` [RFC PATCH 4/6] mm/hugetlb: add userfaultfd hugetlb hook Mike Kravetz
` (2 subsequent siblings)
5 siblings, 1 reply; 9+ messages in thread
From: Mike Kravetz @ 2016-06-06 17:45 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: Andrea Arcangeli, Hugh Dickins, Dave Hansen, Kirill A. Shutemov,
Naoya Horiguchi, Hillf Danton, Michal Hocko, Andrew Morton,
Mike Kravetz
__mcopy_atomic_hugetlb performs the UFFDIO_COPY operation for huge
pages. It is based on the existing __mcopy_atomic routine for normal
pages. Unlike normal pages, there is no huge page support for the
UFFDIO_ZEROPAGE operation.
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
mm/userfaultfd.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 179 insertions(+)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af817e5..c006f46 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,6 +14,8 @@
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -139,6 +141,176 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
return pmd;
}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * called with mmap_sem held, it will release mmap_sem before returning.
+ */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ ssize_t err;
+ pte_t *dst_pte;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+ struct hstate *h;
+ unsigned long vma_hpagesize;
+ pgoff_t idx;
+ u32 hash;
+ struct address_space *mapping;
+
+ /*
+ * There is no default zero huge page for all huge page sizes as
+ * supported by hugetlb. A PMD_SIZE huge page may exist as used
+ * by THP. Since we can not reliably insert a zero page, this
+ * feature is not supported.
+ */
+ if (zeropage)
+ return -EINVAL;
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+ vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+retry:
+ /*
+ * On routine entry dst_vma is set. If we had to drop mmap_sem and
+ * retry, dst_vma will be set to NULL and we must lookup again.
+ */
+ err = -EINVAL;
+ if (!dst_vma) {
+ dst_vma = find_vma(dst_mm, dst_start);
+ vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ if (dst_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
+ }
+
+ /*
+ * Validate alignment based on huge page size
+ */
+ if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+ goto out_unlock;
+
+ /*
+ * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ goto out_unlock;
+
+ /*
+ * Ensure the dst_vma has an anon_vma.
+ */
+ err = -ENOMEM;
+ if (unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+
+ h = hstate_vma(dst_vma);
+
+ while (src_addr < src_start + len) {
+ pte_t dst_pteval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+ dst_addr &= huge_page_mask(h);
+
+ /*
+ * Serialize via hugetlb_fault_mutex
+ */
+ idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
+ hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
+ idx, dst_addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ err = -ENOMEM;
+ dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+ if (!dst_pte) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
+
+ err = -EEXIST;
+ dst_pteval = huge_ptep_get(dst_pte);
+ if (!huge_pte_none(dst_pteval)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
+
+ err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+ dst_addr, src_addr, &page);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ cond_resched();
+
+ if (unlikely(err == -EFAULT)) {
+ up_read(&dst_mm->mmap_sem);
+ BUG_ON(!page);
+
+ err = copy_huge_page_from_user(
+ (const void __user *)src_addr,
+ page, pages_per_huge_page(h));
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ down_read(&dst_mm->mmap_sem);
+
+ dst_vma = NULL;
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += vma_hpagesize;
+ src_addr += vma_hpagesize;
+ copied += vma_hpagesize;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+out:
+ if (page)
+ put_page(page);
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ up_read(&dst_mm->mmap_sem); /* HUGETLB not configured */
+ BUG();
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
@@ -182,6 +354,13 @@ retry:
goto out_unlock;
/*
+ * If this is a HUGETLB vma, pass off to appropriate routine
+ */
+ if (dst_vma->vm_flags & VM_HUGETLB)
+ return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+ src_start, len, false);
+
+ /*
* Be strict and only allow __mcopy_atomic on userfaultfd
* registered ranges to prevent userland errors going
* unnoticed. As far as the VM consistency is concerned, it
--
2.4.11
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [RFC PATCH 3/6] mm/userfaultfd: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY
2016-06-06 17:45 ` [RFC PATCH 3/6] mm/userfaultfd: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY Mike Kravetz
@ 2016-06-07 6:27 ` Hillf Danton
2016-06-07 16:35 ` Mike Kravetz
0 siblings, 1 reply; 9+ messages in thread
From: Hillf Danton @ 2016-06-07 6:27 UTC (permalink / raw)
To: 'Mike Kravetz', linux-mm, linux-kernel
Cc: 'Andrea Arcangeli', 'Hugh Dickins',
'Dave Hansen', 'Kirill A. Shutemov',
'Naoya Horiguchi', 'Michal Hocko',
'Andrew Morton'
> @@ -182,6 +354,13 @@ retry:
> goto out_unlock;
>
> /*
> + * If this is a HUGETLB vma, pass off to appropriate routine
> + */
> + if (dst_vma->vm_flags & VM_HUGETLB)
Use is_vm_hugetlb_page()?
And in cases in subsequent patches?
Hillf
> + return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
> + src_start, len, false);
> +
> + /*
> * Be strict and only allow __mcopy_atomic on userfaultfd
> * registered ranges to prevent userland errors going
> * unnoticed. As far as the VM consistency is concerned, it
> --
> 2.4.11
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH 3/6] mm/userfaultfd: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY
2016-06-07 6:27 ` Hillf Danton
@ 2016-06-07 16:35 ` Mike Kravetz
0 siblings, 0 replies; 9+ messages in thread
From: Mike Kravetz @ 2016-06-07 16:35 UTC (permalink / raw)
To: Hillf Danton, linux-mm, linux-kernel
Cc: 'Andrea Arcangeli', 'Hugh Dickins',
'Dave Hansen', 'Kirill A. Shutemov',
'Naoya Horiguchi', 'Michal Hocko',
'Andrew Morton'
On 06/06/2016 11:27 PM, Hillf Danton wrote:
>> @@ -182,6 +354,13 @@ retry:
>> goto out_unlock;
>>
>> /*
>> + * If this is a HUGETLB vma, pass off to appropriate routine
>> + */
>> + if (dst_vma->vm_flags & VM_HUGETLB)
>
> Use is_vm_hugetlb_page()?
> And in cases in subsequent patches?
>
> Hillf
Yes, that would be better. Thanks.
--
Mike Kravetz
^ permalink raw reply [flat|nested] 9+ messages in thread
* [RFC PATCH 4/6] mm/hugetlb: add userfaultfd hugetlb hook
2016-06-06 17:45 [RFC PATCH 0/6] hugetlb support for userfaultfd Mike Kravetz
` (2 preceding siblings ...)
2016-06-06 17:45 ` [RFC PATCH 3/6] mm/userfaultfd: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY Mike Kravetz
@ 2016-06-06 17:45 ` Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 5/6] fs/userfaultfd: allow registration of ranges containing huge pages Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 6/6] selftests/userfaultfd: add userfaultfd_hugetlb test Mike Kravetz
5 siblings, 0 replies; 9+ messages in thread
From: Mike Kravetz @ 2016-06-06 17:45 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: Andrea Arcangeli, Hugh Dickins, Dave Hansen, Kirill A. Shutemov,
Naoya Horiguchi, Hillf Danton, Michal Hocko, Andrew Morton,
Mike Kravetz
When processing a hugetlb fault for no page present, check the vma to
determine if faults are to be handled via userfaultfd. If so, drop the
hugetlb_fault_mutex and call handle_userfault().
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
mm/hugetlb.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4943d8b..a2814e7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -32,6 +32,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
+#include <linux/userfaultfd_k.h>
#include "internal.h"
int hugepages_treat_as_movable;
@@ -3569,6 +3570,27 @@ retry:
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
+
+ /*
+ * Check for page in userfault range
+ */
+ if (userfaultfd_missing(vma)) {
+ u32 hash;
+
+ /*
+ * hugetlb_fault_mutex must be dropped before
+ * handling userfault. Reacquire after handling
+ * fault to make calling code simpler.
+ */
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
+ idx, address);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ ret = handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
--
2.4.11
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC PATCH 5/6] fs/userfaultfd: allow registration of ranges containing huge pages
2016-06-06 17:45 [RFC PATCH 0/6] hugetlb support for userfaultfd Mike Kravetz
` (3 preceding siblings ...)
2016-06-06 17:45 ` [RFC PATCH 4/6] mm/hugetlb: add userfaultfd hugetlb hook Mike Kravetz
@ 2016-06-06 17:45 ` Mike Kravetz
2016-06-06 17:45 ` [RFC PATCH 6/6] selftests/userfaultfd: add userfaultfd_hugetlb test Mike Kravetz
5 siblings, 0 replies; 9+ messages in thread
From: Mike Kravetz @ 2016-06-06 17:45 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: Andrea Arcangeli, Hugh Dickins, Dave Hansen, Kirill A. Shutemov,
Naoya Horiguchi, Hillf Danton, Michal Hocko, Andrew Morton,
Mike Kravetz
Expand the userfaultfd_register/unregister routines to allow VM_HUGETLB
vmas. huge page alignment checking is performed after a VM_HUGETLB
vma is encountered.
Also, since there is no UFFDIO_ZEROPAGE support for huge pages do not
return that as a valid ioctl method for huge page ranges.
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
fs/userfaultfd.c | 69 +++++++++++++++++++++++++++++++++++++---
include/uapi/linux/userfaultfd.h | 3 ++
2 files changed, 67 insertions(+), 5 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2d97952..7a1a345 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -26,6 +26,7 @@
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
+#include <linux/hugetlb.h>
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
@@ -728,6 +729,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
struct uffdio_register __user *user_uffdio_register;
unsigned long vm_flags, new_flags;
bool found;
+ bool huge_pages;
unsigned long start, end, vma_end;
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -779,6 +781,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out_unlock;
/*
+ * If the first vma contains huge pages, make sure start address
+ * is aligned to huge page size.
+ */
+ if (vma->vm_flags & VM_HUGETLB) {
+ unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+ if (start & (vma_hpagesize - 1))
+ goto out_unlock;
+ }
+
+ /*
* Search for not compatible vmas.
*
* FIXME: this shall be relaxed later so that it doesn't fail
@@ -786,6 +799,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* on anonymous vmas).
*/
found = false;
+ huge_pages = false;
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
cond_resched();
@@ -794,7 +808,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (cur->vm_ops)
+ if (cur->vm_ops && !(cur->vm_flags & VM_HUGETLB))
goto out_unlock;
/*
@@ -808,6 +822,25 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
cur->vm_userfaultfd_ctx.ctx != ctx)
goto out_unlock;
+ /*
+ * Note vmas containing huge pages
+ */
+ if (cur->vm_flags & VM_HUGETLB) {
+ huge_pages = true;
+
+ /*
+ * If vma contains end address, check alignment
+ */
+ ret = -EINVAL;
+ if (end <= cur->vm_end && end > cur->vm_start) {
+ unsigned long vma_hpagesize =
+ vma_kernel_pagesize(cur);
+
+ if (end & (vma_hpagesize - 1))
+ goto out_unlock;
+ }
+ }
+
found = true;
}
BUG_ON(!found);
@@ -819,7 +852,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(vma->vm_ops);
+ BUG_ON(vma->vm_ops && !(vma->vm_flags & VM_HUGETLB));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
@@ -877,7 +910,8 @@ out_unlock:
* userland which ioctls methods are guaranteed to
* succeed on this range.
*/
- if (put_user(UFFD_API_RANGE_IOCTLS,
+ if (put_user(huge_pages ? UFFD_API_RANGE_IOCTLS_HPAGE :
+ UFFD_API_RANGE_IOCTLS,
&user_uffdio_register->ioctls))
ret = -EFAULT;
}
@@ -924,6 +958,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out_unlock;
/*
+ * If the first vma contains huge pages, make sure start address
+ * is aligned to huge page size.
+ */
+ if (vma->vm_flags & VM_HUGETLB) {
+ unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+ if (start & (vma_hpagesize - 1))
+ goto out_unlock;
+ }
+
+ /*
* Search for not compatible vmas.
*
* FIXME: this shall be relaxed later so that it doesn't fail
@@ -945,9 +990,23 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (cur->vm_ops)
+ if (cur->vm_ops && !(cur->vm_flags & VM_HUGETLB))
goto out_unlock;
+ /*
+ * If this vma contains ending address, and huge pages
+ * check alignment.
+ */
+ if (cur->vm_flags & VM_HUGETLB && end <= cur->vm_end &&
+ end > cur->vm_start) {
+ unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
+
+ ret = -EINVAL;
+
+ if (end & (vma_hpagesize - 1))
+ goto out_unlock;
+ }
+
found = true;
}
BUG_ON(!found);
@@ -959,7 +1018,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(vma->vm_ops);
+ BUG_ON(vma->vm_ops && !(vma->vm_flags & VM_HUGETLB));
/*
* Nothing to do: this vma is already registered into this
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 9057d7a..751d814 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -26,6 +26,9 @@
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_ZEROPAGE)
+#define UFFD_API_RANGE_IOCTLS_HPAGE \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY)
/*
* Valid ioctl command number range with this API is from 0x00 to
--
2.4.11
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC PATCH 6/6] selftests/userfaultfd: add userfaultfd_hugetlb test
2016-06-06 17:45 [RFC PATCH 0/6] hugetlb support for userfaultfd Mike Kravetz
` (4 preceding siblings ...)
2016-06-06 17:45 ` [RFC PATCH 5/6] fs/userfaultfd: allow registration of ranges containing huge pages Mike Kravetz
@ 2016-06-06 17:45 ` Mike Kravetz
5 siblings, 0 replies; 9+ messages in thread
From: Mike Kravetz @ 2016-06-06 17:45 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: Andrea Arcangeli, Hugh Dickins, Dave Hansen, Kirill A. Shutemov,
Naoya Horiguchi, Hillf Danton, Michal Hocko, Andrew Morton,
Mike Kravetz
Test userfaultfd hugetlb functionality by using the existing testing
method (in userfaultfd.c). Instead of anonymous memory, a
hugetlbfs file is mmap'ed private. In this way fallocate hole punch
can be used to release pages. This is because madvise(MADV_DONTNEED)
is not supported for huge pages.
Use the same file, but create wrappers for allocating ranges and
releasing pages. Compile userfaultfd.c with HUGETLB_TEST defined to
produce an executable to test userfaultfd hugetlb functionality.
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
tools/testing/selftests/vm/Makefile | 3 +
tools/testing/selftests/vm/run_vmtests | 13 +++
tools/testing/selftests/vm/userfaultfd.c | 161 +++++++++++++++++++++++++++----
3 files changed, 160 insertions(+), 17 deletions(-)
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index e4bb1de..aaa4225 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -10,12 +10,15 @@ BINARIES += on-fault-limit
BINARIES += thuge-gen
BINARIES += transhuge-stress
BINARIES += userfaultfd
+BINARIES += userfaultfd_hugetlb
all: $(BINARIES)
%: %.c
$(CC) $(CFLAGS) -o $@ $^ -lrt
userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h
$(CC) $(CFLAGS) -O2 -o $@ $< -lpthread
+userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h
+ $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread
../../../../usr/include/linux/kernel.h:
make -C ../../../.. headers_install
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index e11968b..14d697e 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -103,6 +103,19 @@ else
echo "[PASS]"
fi
+echo "----------------------------"
+echo "running userfaultfd_hugetlb"
+echo "----------------------------"
+# 256MB total huge pages == 128MB src and 128MB dst
+./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+rm -f $mnt/ufd_test_file
+
#cleanup
umount $mnt
rm -rf $mnt
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index d77ed41..3011711 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -76,6 +76,10 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
#define BOUNCE_POLL (1<<3)
static int bounces;
+#ifdef HUGETLB_TEST
+static int huge_fd;
+static char *huge_fd_off0;
+#endif
static unsigned long long *count_verify;
static int uffd, finished, *pipefd;
static char *area_src, *area_dst;
@@ -97,6 +101,69 @@ pthread_attr_t attr;
~(unsigned long)(sizeof(unsigned long long) \
- 1)))
+#ifndef HUGETLB_TEST
+
+#define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
+ (1 << _UFFDIO_COPY) | \
+ (1 << _UFFDIO_ZEROPAGE))
+
+static int release_pages(char *rel_area)
+{
+ int ret = 0;
+
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static void allocate_area(void **alloc_area)
+{
+ if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
+ fprintf(stderr, "out of memory\n");
+ *alloc_area = NULL;
+ }
+}
+
+#else /* HUGETLB_TEST */
+
+#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_HPAGE
+
+static int release_pages(char *rel_area)
+{
+ int ret = 0;
+
+ if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ rel_area == huge_fd_off0 ? 0 :
+ nr_pages * page_size,
+ nr_pages * page_size)) {
+ perror("fallocate");
+ ret = 1;
+ }
+
+ return ret;
+}
+
+
+static void allocate_area(void **alloc_area)
+{
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_HUGETLB, huge_fd,
+ *alloc_area == area_src ? 0 :
+ nr_pages * page_size);
+ if (*alloc_area == MAP_FAILED) {
+ fprintf(stderr, "mmap of hugetlbfs file failed\n");
+ *alloc_area = NULL;
+ }
+
+ if (*alloc_area == area_src)
+ huge_fd_off0 = *alloc_area;
+}
+
+#endif /* HUGETLB_TEST */
+
static int my_bcmp(char *str1, char *str2, size_t n)
{
unsigned long i;
@@ -384,10 +451,8 @@ static int stress(unsigned long *userfaults)
* UFFDIO_COPY without writing zero pages into area_dst
* because the background threads already completed).
*/
- if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
- perror("madvise");
+ if (release_pages(area_src))
return 1;
- }
for (cpu = 0; cpu < nr_cpus; cpu++) {
char c;
@@ -425,16 +490,12 @@ static int userfaultfd_stress(void)
int uffd_flags, err;
unsigned long userfaults[nr_cpus];
- if (posix_memalign(&area, page_size, nr_pages * page_size)) {
- fprintf(stderr, "out of memory\n");
+ allocate_area((void **)&area_src);
+ if (!area_src)
return 1;
- }
- area_src = area;
- if (posix_memalign(&area, page_size, nr_pages * page_size)) {
- fprintf(stderr, "out of memory\n");
+ allocate_area((void **)&area_dst);
+ if (!area_dst)
return 1;
- }
- area_dst = area;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd < 0) {
@@ -528,9 +589,7 @@ static int userfaultfd_stress(void)
fprintf(stderr, "register failure\n");
return 1;
}
- expected_ioctls = (1 << _UFFDIO_WAKE) |
- (1 << _UFFDIO_COPY) |
- (1 << _UFFDIO_ZEROPAGE);
+ expected_ioctls = EXPECTED_IOCTLS;
if ((uffdio_register.ioctls & expected_ioctls) !=
expected_ioctls) {
fprintf(stderr,
@@ -562,10 +621,8 @@ static int userfaultfd_stress(void)
* MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
* required to MADV_DONTNEED here.
*/
- if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
- perror("madvise 2");
+ if (release_pages(area_dst))
return 1;
- }
/* bounce pass */
if (stress(userfaults))
@@ -606,6 +663,8 @@ static int userfaultfd_stress(void)
return err;
}
+#ifndef HUGETLB_TEST
+
int main(int argc, char **argv)
{
if (argc < 3)
@@ -632,6 +691,74 @@ int main(int argc, char **argv)
return userfaultfd_stress();
}
+#else /* HUGETLB_TEST */
+
+/*
+ * Copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+ return hps;
+}
+
+int main(int argc, char **argv)
+{
+ if (argc < 4)
+ fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
+ exit(1);
+ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ page_size = default_huge_page_size();
+ if (!page_size)
+ fprintf(stderr, "Unable to determine huge page size\n"),
+ exit(2);
+ if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+ > page_size)
+ fprintf(stderr, "Impossible to run this test\n"), exit(2);
+ nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
+ nr_cpus;
+ if (!nr_pages_per_cpu) {
+ fprintf(stderr, "invalid MiB\n");
+ fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ }
+ bounces = atoi(argv[2]);
+ if (bounces <= 0) {
+ fprintf(stderr, "invalid bounces\n");
+ fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ }
+ nr_pages = nr_pages_per_cpu * nr_cpus;
+ huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
+ if (huge_fd < 0) {
+ fprintf(stderr, "Open of %s failed", argv[3]);
+ perror("open");
+ exit(1);
+ }
+ if (ftruncate(huge_fd, 0)) {
+ fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
+ perror("ftruncate");
+ exit(1);
+ }
+ printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+ nr_pages, nr_pages_per_cpu);
+ return userfaultfd_stress();
+}
+
+#endif
#else /* __NR_userfaultfd */
#warning "missing __NR_userfaultfd definition"
--
2.4.11
^ permalink raw reply related [flat|nested] 9+ messages in thread