All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-11 17:31 ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-11 17:31 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann, linux-nvdimm,
	linux-api, Dave Hansen, linux-kernel, Andrew Morton,
	Kirill A. Shutemov

As evidenced by this bug report [1], userspace libraries are interested
in whether a mapping is DAX mapped, i.e. no intervening page cache.
Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
explicit "is dax" indication as a new flag in the page vector populated
by mincore.

There are also cases, particularly when testing and validating a
configuration, where it is useful to know the hardware mapping geometry
of the pages in a given process address range.  Consider filesystem-dax where a
configuration needs to take care to align partitions and block
allocations before huge page mappings might be used, or
anonymous-transparent-huge-pages where a process is opportunistically
assigned large pages.  mincore2() allows these configurations to be
surveyed and validated.

The implementation takes advantage of the unused bits in the per-page
byte returned for each PAGE_SIZE extent of a given address range.  The
new format of each vector byte is:

(TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present

[1]: https://lkml.org/lkml/2016/9/7/61

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/syscalls.h               |    2 +
 include/uapi/asm-generic/mman-common.h |    3 +
 kernel/sys_ni.c                        |    1 
 mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
 4 files changed, 104 insertions(+), 28 deletions(-)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d02239022bd0..4aa2ee7e359a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 				unsigned char __user * vec);
+asmlinkage long sys_mincore2(unsigned long start, size_t len,
+				unsigned char __user * vec, int flags);
 
 asmlinkage long sys_pivot_root(const char __user *new_root,
 				const char __user *put_old);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 58274382a616..05037343f0da 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -72,4 +72,7 @@
 #define MAP_HUGE_SHIFT	26
 #define MAP_HUGE_MASK	0x3f
 
+#define MINCORE_DAX	1		/* indicate pages that are dax-mapped */
+#define MINCORE_ORDER	2		/* retrieve hardware mapping-size-order */
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c5e3a8e00d7..e14b87834054 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -197,6 +197,7 @@ cond_syscall(sys_mlockall);
 cond_syscall(sys_munlockall);
 cond_syscall(sys_mlock2);
 cond_syscall(sys_mincore);
+cond_syscall(sys_mincore2);
 cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
diff --git a/mm/mincore.c b/mm/mincore.c
index c0b5ba965200..15f9eb5de65b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -15,25 +15,62 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
+#include <linux/dax.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#define MINCORE_DAX_MASK 2
+#define MINCORE_DAX_SHIFT 1
+
+#define MINCORE_ORDER_MASK 0x7c
+#define MINCORE_ORDER_SHIFT 2
+
+struct mincore_params {
+	unsigned char *vec;
+	int flags;
+};
+
+static void mincore_set(unsigned char *vec, struct vm_area_struct *vma, int nr,
+		int flags)
+{
+	unsigned char mincore = 1;
+
+	if (!nr) {
+		*vec = 0;
+		return;
+	}
+
+	if ((flags & MINCORE_DAX) && vma_is_dax(vma))
+		mincore |= 1 << MINCORE_DAX_SHIFT;
+	if (flags & MINCORE_ORDER) {
+		unsigned char order = ilog2(nr);
+
+		WARN_ON((order << MINCORE_ORDER_SHIFT) & ~MINCORE_ORDER_MASK);
+		mincore |= order << MINCORE_ORDER_SHIFT;
+	}
+	memset(vec, mincore, nr);
+}
+
 static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 			unsigned long end, struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
+	struct mincore_params *p = walk->private;
+	int nr = (end - addr) >> PAGE_SHIFT;
+	unsigned char *vec = p->vec;
 	unsigned char present;
-	unsigned char *vec = walk->private;
 
 	/*
 	 * Hugepages under user process are always in RAM and never
 	 * swapped out, but theoretically it needs to be checked.
 	 */
 	present = pte && !huge_pte_none(huge_ptep_get(pte));
-	for (; addr != end; vec++, addr += PAGE_SIZE)
-		*vec = present;
-	walk->private = vec;
+	if (!present)
+		memset(vec, 0, nr);
+	else
+		mincore_set(vec, walk->vma, nr, p->flags);
+	p->vec = vec + nr;
 #else
 	BUG();
 #endif
@@ -82,20 +119,24 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 }
 
 static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
-				struct vm_area_struct *vma, unsigned char *vec)
+				struct vm_area_struct *vma, unsigned char *vec,
+				int flags)
 {
 	unsigned long nr = (end - addr) >> PAGE_SHIFT;
+	unsigned char present;
 	int i;
 
 	if (vma->vm_file) {
 		pgoff_t pgoff;
 
 		pgoff = linear_page_index(vma, addr);
-		for (i = 0; i < nr; i++, pgoff++)
-			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+		for (i = 0; i < nr; i++, pgoff++) {
+			present = mincore_page(vma->vm_file->f_mapping, pgoff);
+			mincore_set(vec + i, vma, present, flags);
+		}
 	} else {
 		for (i = 0; i < nr; i++)
-			vec[i] = 0;
+			mincore_set(vec + i, vma, 0, flags);
 	}
 	return nr;
 }
@@ -103,8 +144,11 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
 static int mincore_unmapped_range(unsigned long addr, unsigned long end,
 				   struct mm_walk *walk)
 {
-	walk->private += __mincore_unmapped_range(addr, end,
-						  walk->vma, walk->private);
+	struct mincore_params *p = walk->private;
+	int nr = __mincore_unmapped_range(addr, end, walk->vma, p->vec,
+			p->flags);
+
+	p->vec += nr;
 	return 0;
 }
 
@@ -114,18 +158,20 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	spinlock_t *ptl;
 	struct vm_area_struct *vma = walk->vma;
 	pte_t *ptep;
-	unsigned char *vec = walk->private;
+	struct mincore_params *p = walk->private;
+	unsigned char *vec = p->vec;
 	int nr = (end - addr) >> PAGE_SHIFT;
+	int flags = p->flags;
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		memset(vec, 1, nr);
+		mincore_set(vec, vma, nr, flags);
 		spin_unlock(ptl);
 		goto out;
 	}
 
 	if (pmd_trans_unstable(pmd)) {
-		__mincore_unmapped_range(addr, end, vma, vec);
+		__mincore_unmapped_range(addr, end, vma, vec, flags);
 		goto out;
 	}
 
@@ -135,9 +181,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 		if (pte_none(pte))
 			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
-						 vma, vec);
+						 vma, vec, flags);
 		else if (pte_present(pte))
-			*vec = 1;
+			mincore_set(vec, vma, 1, flags);
 		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
@@ -146,14 +192,17 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 				 * migration or hwpoison entries are always
 				 * uptodate
 				 */
-				*vec = 1;
+				mincore_set(vec, vma, 1, flags);
 			} else {
 #ifdef CONFIG_SWAP
-				*vec = mincore_page(swap_address_space(entry),
-					entry.val);
+				unsigned char present;
+
+				present = mincore_page(swap_address_space(entry),
+						entry.val);
+				mincore_set(vec, vma, present, flags);
 #else
 				WARN_ON(1);
-				*vec = 1;
+				mincore_set(vec, vma, 1, flags);
 #endif
 			}
 		}
@@ -161,7 +210,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	}
 	pte_unmap_unlock(ptep - 1, ptl);
 out:
-	walk->private += nr;
+	p->vec = vec + nr;
 	cond_resched();
 	return 0;
 }
@@ -171,16 +220,21 @@ out:
  * all the arguments, we hold the mmap semaphore: we should
  * just return the amount of info we're asked for.
  */
-static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+static long do_mincore(unsigned long addr, unsigned long pages,
+		unsigned char *vec, int flags)
 {
 	struct vm_area_struct *vma;
 	unsigned long end;
 	int err;
+	struct mincore_params p = {
+		.vec = vec,
+		.flags = flags,
+	};
 	struct mm_walk mincore_walk = {
 		.pmd_entry = mincore_pte_range,
 		.pte_hole = mincore_unmapped_range,
 		.hugetlb_entry = mincore_hugetlb,
-		.private = vec,
+		.private = &p,
 	};
 
 	vma = find_vma(current->mm, addr);
@@ -195,13 +249,19 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 }
 
 /*
- * The mincore(2) system call.
+ * The mincore2(2) system call.
  *
- * mincore() returns the memory residency status of the pages in the
+ * mincore2() returns the memory residency status of the pages in the
  * current process's address space specified by [addr, addr + len).
  * The status is returned in a vector of bytes.  The least significant
  * bit of each byte is 1 if the referenced page is in memory, otherwise
- * it is zero.
+ * it is zero.  When 'flags' is non-zero each byte additionally contains
+ * an indication of whether the referenced page in memory is a DAX
+ * mapping (bit 1, mask 0x02), and/or the order of the mapping (bits 2
+ * through 6, mask 0x7c), counting the least significant bit as bit 0.
+ * The order is log2 of the number of PAGE_SIZE pages in the hardware
+ * mapping backing the given logical page.  For example, a 2MB dax-mapped
+ * huge page would correspond to 512 vector entries with the value 0x27.
  *
  * Because the status of a page can change after mincore() checks it
  * but before it returns to the application, the returned vector may
@@ -218,8 +278,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
  *		mapped
  *  -EAGAIN - A kernel resource was temporarily unavailable.
  */
-SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
-		unsigned char __user *, vec)
+SYSCALL_DEFINE4(mincore2, unsigned long, start, size_t, len,
+		unsigned char __user *, vec, int, flags)
 {
 	long retval;
 	unsigned long pages;
@@ -229,6 +289,10 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
 
+	/* Check that undefined flags are zero */
+	if (flags & ~(MINCORE_DAX | MINCORE_ORDER))
+		return -EINVAL;
+
 	/* ..and we need to be passed a valid user-space range */
 	if (!access_ok(VERIFY_READ, (void __user *) start, len))
 		return -ENOMEM;
@@ -251,7 +315,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 		 * the temporary buffer size.
 		 */
 		down_read(&current->mm->mmap_sem);
-		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
+		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp, flags);
 		up_read(&current->mm->mmap_sem);
 
 		if (retval <= 0)
@@ -268,3 +332,9 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	free_page((unsigned long) tmp);
 	return retval;
 }
+
+SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
+		unsigned char __user *, vec)
+{
+	return sys_mincore2(start, len, vec, 0);
+}

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-11 17:31 ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-11 17:31 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann, linux-nvdimm,
	linux-api, Dave Hansen, linux-kernel, Andrew Morton,
	Kirill A. Shutemov

As evidenced by this bug report [1], userspace libraries are interested
in whether a mapping is DAX mapped, i.e. no intervening page cache.
Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
explicit "is dax" indication as a new flag in the page vector populated
by mincore.

There are also cases, particularly when testing and validating a
configuration, where it is useful to know the hardware mapping geometry
of the pages in a given process address range.  Consider filesystem-dax where a
configuration needs to take care to align partitions and block
allocations before huge page mappings might be used, or
anonymous-transparent-huge-pages where a process is opportunistically
assigned large pages.  mincore2() allows these configurations to be
surveyed and validated.

The implementation takes advantage of the unused bits in the per-page
byte returned for each PAGE_SIZE extent of a given address range.  The
new format of each vector byte is:

(TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present

[1]: https://lkml.org/lkml/2016/9/7/61

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/syscalls.h               |    2 +
 include/uapi/asm-generic/mman-common.h |    3 +
 kernel/sys_ni.c                        |    1 
 mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
 4 files changed, 104 insertions(+), 28 deletions(-)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d02239022bd0..4aa2ee7e359a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 				unsigned char __user * vec);
+asmlinkage long sys_mincore2(unsigned long start, size_t len,
+				unsigned char __user * vec, int flags);
 
 asmlinkage long sys_pivot_root(const char __user *new_root,
 				const char __user *put_old);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 58274382a616..05037343f0da 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -72,4 +72,7 @@
 #define MAP_HUGE_SHIFT	26
 #define MAP_HUGE_MASK	0x3f
 
+#define MINCORE_DAX	1		/* indicate pages that are dax-mapped */
+#define MINCORE_ORDER	2		/* retrieve hardware mapping-size-order */
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c5e3a8e00d7..e14b87834054 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -197,6 +197,7 @@ cond_syscall(sys_mlockall);
 cond_syscall(sys_munlockall);
 cond_syscall(sys_mlock2);
 cond_syscall(sys_mincore);
+cond_syscall(sys_mincore2);
 cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
diff --git a/mm/mincore.c b/mm/mincore.c
index c0b5ba965200..15f9eb5de65b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -15,25 +15,62 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
+#include <linux/dax.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#define MINCORE_DAX_MASK 2
+#define MINCORE_DAX_SHIFT 1
+
+#define MINCORE_ORDER_MASK 0x7c
+#define MINCORE_ORDER_SHIFT 2
+
+struct mincore_params {
+	unsigned char *vec;
+	int flags;
+};
+
+static void mincore_set(unsigned char *vec, struct vm_area_struct *vma, int nr,
+		int flags)
+{
+	unsigned char mincore = 1;
+
+	if (!nr) {
+		*vec = 0;
+		return;
+	}
+
+	if ((flags & MINCORE_DAX) && vma_is_dax(vma))
+		mincore |= 1 << MINCORE_DAX_SHIFT;
+	if (flags & MINCORE_ORDER) {
+		unsigned char order = ilog2(nr);
+
+		WARN_ON((order << MINCORE_ORDER_SHIFT) & ~MINCORE_ORDER_MASK);
+		mincore |= order << MINCORE_ORDER_SHIFT;
+	}
+	memset(vec, mincore, nr);
+}
+
 static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 			unsigned long end, struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
+	struct mincore_params *p = walk->private;
+	int nr = (end - addr) >> PAGE_SHIFT;
+	unsigned char *vec = p->vec;
 	unsigned char present;
-	unsigned char *vec = walk->private;
 
 	/*
 	 * Hugepages under user process are always in RAM and never
 	 * swapped out, but theoretically it needs to be checked.
 	 */
 	present = pte && !huge_pte_none(huge_ptep_get(pte));
-	for (; addr != end; vec++, addr += PAGE_SIZE)
-		*vec = present;
-	walk->private = vec;
+	if (!present)
+		memset(vec, 0, nr);
+	else
+		mincore_set(vec, walk->vma, nr, p->flags);
+	p->vec = vec + nr;
 #else
 	BUG();
 #endif
@@ -82,20 +119,24 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 }
 
 static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
-				struct vm_area_struct *vma, unsigned char *vec)
+				struct vm_area_struct *vma, unsigned char *vec,
+				int flags)
 {
 	unsigned long nr = (end - addr) >> PAGE_SHIFT;
+	unsigned char present;
 	int i;
 
 	if (vma->vm_file) {
 		pgoff_t pgoff;
 
 		pgoff = linear_page_index(vma, addr);
-		for (i = 0; i < nr; i++, pgoff++)
-			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+		for (i = 0; i < nr; i++, pgoff++) {
+			present = mincore_page(vma->vm_file->f_mapping, pgoff);
+			mincore_set(vec + i, vma, present, flags);
+		}
 	} else {
 		for (i = 0; i < nr; i++)
-			vec[i] = 0;
+			mincore_set(vec + i, vma, 0, flags);
 	}
 	return nr;
 }
@@ -103,8 +144,11 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
 static int mincore_unmapped_range(unsigned long addr, unsigned long end,
 				   struct mm_walk *walk)
 {
-	walk->private += __mincore_unmapped_range(addr, end,
-						  walk->vma, walk->private);
+	struct mincore_params *p = walk->private;
+	int nr = __mincore_unmapped_range(addr, end, walk->vma, p->vec,
+			p->flags);
+
+	p->vec += nr;
 	return 0;
 }
 
@@ -114,18 +158,20 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	spinlock_t *ptl;
 	struct vm_area_struct *vma = walk->vma;
 	pte_t *ptep;
-	unsigned char *vec = walk->private;
+	struct mincore_params *p = walk->private;
+	unsigned char *vec = p->vec;
 	int nr = (end - addr) >> PAGE_SHIFT;
+	int flags = p->flags;
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		memset(vec, 1, nr);
+		mincore_set(vec, vma, nr, flags);
 		spin_unlock(ptl);
 		goto out;
 	}
 
 	if (pmd_trans_unstable(pmd)) {
-		__mincore_unmapped_range(addr, end, vma, vec);
+		__mincore_unmapped_range(addr, end, vma, vec, flags);
 		goto out;
 	}
 
@@ -135,9 +181,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 		if (pte_none(pte))
 			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
-						 vma, vec);
+						 vma, vec, flags);
 		else if (pte_present(pte))
-			*vec = 1;
+			mincore_set(vec, vma, 1, flags);
 		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
@@ -146,14 +192,17 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 				 * migration or hwpoison entries are always
 				 * uptodate
 				 */
-				*vec = 1;
+				mincore_set(vec, vma, 1, flags);
 			} else {
 #ifdef CONFIG_SWAP
-				*vec = mincore_page(swap_address_space(entry),
-					entry.val);
+				unsigned char present;
+
+				present = mincore_page(swap_address_space(entry),
+						entry.val);
+				mincore_set(vec, vma, present, flags);
 #else
 				WARN_ON(1);
-				*vec = 1;
+				mincore_set(vec, vma, 1, flags);
 #endif
 			}
 		}
@@ -161,7 +210,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	}
 	pte_unmap_unlock(ptep - 1, ptl);
 out:
-	walk->private += nr;
+	p->vec = vec + nr;
 	cond_resched();
 	return 0;
 }
@@ -171,16 +220,21 @@ out:
  * all the arguments, we hold the mmap semaphore: we should
  * just return the amount of info we're asked for.
  */
-static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+static long do_mincore(unsigned long addr, unsigned long pages,
+		unsigned char *vec, int flags)
 {
 	struct vm_area_struct *vma;
 	unsigned long end;
 	int err;
+	struct mincore_params p = {
+		.vec = vec,
+		.flags = flags,
+	};
 	struct mm_walk mincore_walk = {
 		.pmd_entry = mincore_pte_range,
 		.pte_hole = mincore_unmapped_range,
 		.hugetlb_entry = mincore_hugetlb,
-		.private = vec,
+		.private = &p,
 	};
 
 	vma = find_vma(current->mm, addr);
@@ -195,13 +249,19 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 }
 
 /*
- * The mincore(2) system call.
+ * The mincore2(2) system call.
  *
- * mincore() returns the memory residency status of the pages in the
+ * mincore2() returns the memory residency status of the pages in the
  * current process's address space specified by [addr, addr + len).
  * The status is returned in a vector of bytes.  The least significant
  * bit of each byte is 1 if the referenced page is in memory, otherwise
- * it is zero.
+ * it is zero.  When 'flags' is non-zero each byte additionally contains
+ * an indication of whether the referenced page in memory is a DAX
+ * mapping (bit 1, mask 0x02), and/or the order of the mapping (bits 2
+ * through 6, mask 0x7c), counting the least significant bit as bit 0.
+ * The order is log2 of the number of PAGE_SIZE pages in the hardware
+ * mapping backing the given logical page.  For example, a 2MB dax-mapped
+ * huge page would correspond to 512 vector entries with the value 0x27.
  *
  * Because the status of a page can change after mincore() checks it
  * but before it returns to the application, the returned vector may
@@ -218,8 +278,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
  *		mapped
  *  -EAGAIN - A kernel resource was temporarily unavailable.
  */
-SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
-		unsigned char __user *, vec)
+SYSCALL_DEFINE4(mincore2, unsigned long, start, size_t, len,
+		unsigned char __user *, vec, int, flags)
 {
 	long retval;
 	unsigned long pages;
@@ -229,6 +289,10 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
 
+	/* Check that undefined flags are zero */
+	if (flags & ~(MINCORE_DAX | MINCORE_ORDER))
+		return -EINVAL;
+
 	/* ..and we need to be passed a valid user-space range */
 	if (!access_ok(VERIFY_READ, (void __user *) start, len))
 		return -ENOMEM;
@@ -251,7 +315,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 		 * the temporary buffer size.
 		 */
 		down_read(&current->mm->mmap_sem);
-		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
+		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp, flags);
 		up_read(&current->mm->mmap_sem);
 
 		if (retval <= 0)
@@ -268,3 +332,9 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	free_page((unsigned long) tmp);
 	return retval;
 }
+
+SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
+		unsigned char __user *, vec)
+{
+	return sys_mincore2(start, len, vec, 0);
+}

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-11 17:31 ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-11 17:31 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann, linux-nvdimm,
	linux-api, Dave Hansen, linux-kernel, Andrew Morton,
	Kirill A. Shutemov

As evidenced by this bug report [1], userspace libraries are interested
in whether a mapping is DAX mapped, i.e. no intervening page cache.
Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
explicit "is dax" indication as a new flag in the page vector populated
by mincore.

There are also cases, particularly when testing and validating a
configuration, where it is useful to know the hardware mapping geometry
of the pages in a given process address range.  Consider filesystem-dax where a
configuration needs to take care to align partitions and block
allocations before huge page mappings might be used, or
anonymous-transparent-huge-pages where a process is opportunistically
assigned large pages.  mincore2() allows these configurations to be
surveyed and validated.

The implementation takes advantage of the unused bits in the per-page
byte returned for each PAGE_SIZE extent of a given address range.  The
new format of each vector byte is:

(TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present

[1]: https://lkml.org/lkml/2016/9/7/61

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/syscalls.h               |    2 +
 include/uapi/asm-generic/mman-common.h |    3 +
 kernel/sys_ni.c                        |    1 
 mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
 4 files changed, 104 insertions(+), 28 deletions(-)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d02239022bd0..4aa2ee7e359a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 				unsigned char __user * vec);
+asmlinkage long sys_mincore2(unsigned long start, size_t len,
+				unsigned char __user * vec, int flags);
 
 asmlinkage long sys_pivot_root(const char __user *new_root,
 				const char __user *put_old);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 58274382a616..05037343f0da 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -72,4 +72,7 @@
 #define MAP_HUGE_SHIFT	26
 #define MAP_HUGE_MASK	0x3f
 
+#define MINCORE_DAX	1		/* indicate pages that are dax-mapped */
+#define MINCORE_ORDER	2		/* retrieve hardware mapping-size-order */
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c5e3a8e00d7..e14b87834054 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -197,6 +197,7 @@ cond_syscall(sys_mlockall);
 cond_syscall(sys_munlockall);
 cond_syscall(sys_mlock2);
 cond_syscall(sys_mincore);
+cond_syscall(sys_mincore2);
 cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
diff --git a/mm/mincore.c b/mm/mincore.c
index c0b5ba965200..15f9eb5de65b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -15,25 +15,62 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
+#include <linux/dax.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#define MINCORE_DAX_MASK 2
+#define MINCORE_DAX_SHIFT 1
+
+#define MINCORE_ORDER_MASK 0x7c
+#define MINCORE_ORDER_SHIFT 2
+
+struct mincore_params {
+	unsigned char *vec;
+	int flags;
+};
+
+static void mincore_set(unsigned char *vec, struct vm_area_struct *vma, int nr,
+		int flags)
+{
+	unsigned char mincore = 1;
+
+	if (!nr) {
+		*vec = 0;
+		return;
+	}
+
+	if ((flags & MINCORE_DAX) && vma_is_dax(vma))
+		mincore |= 1 << MINCORE_DAX_SHIFT;
+	if (flags & MINCORE_ORDER) {
+		unsigned char order = ilog2(nr);
+
+		WARN_ON((order << MINCORE_ORDER_SHIFT) & ~MINCORE_ORDER_MASK);
+		mincore |= order << MINCORE_ORDER_SHIFT;
+	}
+	memset(vec, mincore, nr);
+}
+
 static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 			unsigned long end, struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
+	struct mincore_params *p = walk->private;
+	int nr = (end - addr) >> PAGE_SHIFT;
+	unsigned char *vec = p->vec;
 	unsigned char present;
-	unsigned char *vec = walk->private;
 
 	/*
 	 * Hugepages under user process are always in RAM and never
 	 * swapped out, but theoretically it needs to be checked.
 	 */
 	present = pte && !huge_pte_none(huge_ptep_get(pte));
-	for (; addr != end; vec++, addr += PAGE_SIZE)
-		*vec = present;
-	walk->private = vec;
+	if (!present)
+		memset(vec, 0, nr);
+	else
+		mincore_set(vec, walk->vma, nr, p->flags);
+	p->vec = vec + nr;
 #else
 	BUG();
 #endif
@@ -82,20 +119,24 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 }
 
 static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
-				struct vm_area_struct *vma, unsigned char *vec)
+				struct vm_area_struct *vma, unsigned char *vec,
+				int flags)
 {
 	unsigned long nr = (end - addr) >> PAGE_SHIFT;
+	unsigned char present;
 	int i;
 
 	if (vma->vm_file) {
 		pgoff_t pgoff;
 
 		pgoff = linear_page_index(vma, addr);
-		for (i = 0; i < nr; i++, pgoff++)
-			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+		for (i = 0; i < nr; i++, pgoff++) {
+			present = mincore_page(vma->vm_file->f_mapping, pgoff);
+			mincore_set(vec + i, vma, present, flags);
+		}
 	} else {
 		for (i = 0; i < nr; i++)
-			vec[i] = 0;
+			mincore_set(vec + i, vma, 0, flags);
 	}
 	return nr;
 }
@@ -103,8 +144,11 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
 static int mincore_unmapped_range(unsigned long addr, unsigned long end,
 				   struct mm_walk *walk)
 {
-	walk->private += __mincore_unmapped_range(addr, end,
-						  walk->vma, walk->private);
+	struct mincore_params *p = walk->private;
+	int nr = __mincore_unmapped_range(addr, end, walk->vma, p->vec,
+			p->flags);
+
+	p->vec += nr;
 	return 0;
 }
 
@@ -114,18 +158,20 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	spinlock_t *ptl;
 	struct vm_area_struct *vma = walk->vma;
 	pte_t *ptep;
-	unsigned char *vec = walk->private;
+	struct mincore_params *p = walk->private;
+	unsigned char *vec = p->vec;
 	int nr = (end - addr) >> PAGE_SHIFT;
+	int flags = p->flags;
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		memset(vec, 1, nr);
+		mincore_set(vec, vma, nr, flags);
 		spin_unlock(ptl);
 		goto out;
 	}
 
 	if (pmd_trans_unstable(pmd)) {
-		__mincore_unmapped_range(addr, end, vma, vec);
+		__mincore_unmapped_range(addr, end, vma, vec, flags);
 		goto out;
 	}
 
@@ -135,9 +181,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 		if (pte_none(pte))
 			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
-						 vma, vec);
+						 vma, vec, flags);
 		else if (pte_present(pte))
-			*vec = 1;
+			mincore_set(vec, vma, 1, flags);
 		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
@@ -146,14 +192,17 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 				 * migration or hwpoison entries are always
 				 * uptodate
 				 */
-				*vec = 1;
+				mincore_set(vec, vma, 1, flags);
 			} else {
 #ifdef CONFIG_SWAP
-				*vec = mincore_page(swap_address_space(entry),
-					entry.val);
+				unsigned char present;
+
+				present = mincore_page(swap_address_space(entry),
+						entry.val);
+				mincore_set(vec, vma, present, flags);
 #else
 				WARN_ON(1);
-				*vec = 1;
+				mincore_set(vec, vma, 1, flags);
 #endif
 			}
 		}
@@ -161,7 +210,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	}
 	pte_unmap_unlock(ptep - 1, ptl);
 out:
-	walk->private += nr;
+	p->vec = vec + nr;
 	cond_resched();
 	return 0;
 }
@@ -171,16 +220,21 @@ out:
  * all the arguments, we hold the mmap semaphore: we should
  * just return the amount of info we're asked for.
  */
-static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+static long do_mincore(unsigned long addr, unsigned long pages,
+		unsigned char *vec, int flags)
 {
 	struct vm_area_struct *vma;
 	unsigned long end;
 	int err;
+	struct mincore_params p = {
+		.vec = vec,
+		.flags = flags,
+	};
 	struct mm_walk mincore_walk = {
 		.pmd_entry = mincore_pte_range,
 		.pte_hole = mincore_unmapped_range,
 		.hugetlb_entry = mincore_hugetlb,
-		.private = vec,
+		.private = &p,
 	};
 
 	vma = find_vma(current->mm, addr);
@@ -195,13 +249,19 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 }
 
 /*
- * The mincore(2) system call.
+ * The mincore2(2) system call.
  *
- * mincore() returns the memory residency status of the pages in the
+ * mincore2() returns the memory residency status of the pages in the
  * current process's address space specified by [addr, addr + len).
  * The status is returned in a vector of bytes.  The least significant
  * bit of each byte is 1 if the referenced page is in memory, otherwise
- * it is zero.
+ * it is zero.  When 'flags' is non-zero each byte additionally contains
+ * an indication of whether the referenced page in memory is a DAX
+ * mapping (bit 1 of each vector byte), and/or the order of the mapping
+ * (bits 2 through 7 of each vector byte), where the order relates to
+ * the hardware mapping size backing the given logical-page.  For
+ * example, a 2MB-dax-mapped-huge-page would correspond to 512 vector
+ * entries with the value 0x27.
  *
  * Because the status of a page can change after mincore() checks it
  * but before it returns to the application, the returned vector may
@@ -218,8 +278,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
  *		mapped
  *  -EAGAIN - A kernel resource was temporarily unavailable.
  */
-SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
-		unsigned char __user *, vec)
+SYSCALL_DEFINE4(mincore2, unsigned long, start, size_t, len,
+		unsigned char __user *, vec, int, flags)
 {
 	long retval;
 	unsigned long pages;
@@ -229,6 +289,10 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
 
+	/* Check that undefined flags are zero */
+	if (flags & ~(MINCORE_DAX | MINCORE_ORDER))
+		return -EINVAL;
+
 	/* ..and we need to be passed a valid user-space range */
 	if (!access_ok(VERIFY_READ, (void __user *) start, len))
 		return -ENOMEM;
@@ -251,7 +315,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 		 * the temporary buffer size.
 		 */
 		down_read(&current->mm->mmap_sem);
-		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
+		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp, flags);
 		up_read(&current->mm->mmap_sem);
 
 		if (retval <= 0)
@@ -268,3 +332,9 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	free_page((unsigned long) tmp);
 	return retval;
 }
+
+SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
+		unsigned char __user *, vec)
+{
+	return sys_mincore2(start, len, vec, 0);
+}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [RFC PATCH 2/2] x86: wire up mincore2()
  2016-09-11 17:31 ` Dan Williams
  (?)
  (?)
@ 2016-09-11 17:31   ` Dan Williams
  -1 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-11 17:31 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-nvdimm, linux-api, x86, linux-kernel, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner

Add the new mincore2() symbol to the x86 syscall tables.

Cc: x86@kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f848572169ea..71957671d06b 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -386,3 +386,4 @@
 377	i386	copy_file_range		sys_copy_file_range
 378	i386	preadv2			sys_preadv2			compat_sys_preadv2
 379	i386	pwritev2		sys_pwritev2			compat_sys_pwritev2
+380	i386	mincore2		sys_mincore2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index e9ce9c7c39b4..bf2a2f6b5c49 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -335,6 +335,7 @@
 326	common	copy_file_range		sys_copy_file_range
 327	64	preadv2			sys_preadv2
 328	64	pwritev2		sys_pwritev2
+329	common	mincore2		sys_mincore2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [RFC PATCH 2/2] x86: wire up mincore2()
@ 2016-09-11 17:31   ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-11 17:31 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-nvdimm, linux-api, x86, linux-kernel, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner

Add the new mincore2() symbol to the x86 syscall tables.

Cc: x86@kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f848572169ea..71957671d06b 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -386,3 +386,4 @@
 377	i386	copy_file_range		sys_copy_file_range
 378	i386	preadv2			sys_preadv2			compat_sys_preadv2
 379	i386	pwritev2		sys_pwritev2			compat_sys_pwritev2
+380	i386	mincore2		sys_mincore2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index e9ce9c7c39b4..bf2a2f6b5c49 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -335,6 +335,7 @@
 326	common	copy_file_range		sys_copy_file_range
 327	64	preadv2			sys_preadv2
 328	64	pwritev2		sys_pwritev2
+329	common	mincore2		sys_mincore2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [RFC PATCH 2/2] x86: wire up mincore2()
@ 2016-09-11 17:31   ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-11 17:31 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-nvdimm, linux-api, x86, linux-kernel, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner

Add the new mincore2() symbol to the x86 syscall tables.

Cc: x86@kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f848572169ea..71957671d06b 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -386,3 +386,4 @@
 377	i386	copy_file_range		sys_copy_file_range
 378	i386	preadv2			sys_preadv2			compat_sys_preadv2
 379	i386	pwritev2		sys_pwritev2			compat_sys_pwritev2
+380	i386	mincore2		sys_mincore2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index e9ce9c7c39b4..bf2a2f6b5c49 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -335,6 +335,7 @@
 326	common	copy_file_range		sys_copy_file_range
 327	64	preadv2			sys_preadv2
 328	64	pwritev2		sys_pwritev2
+329	common	mincore2		sys_mincore2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [RFC PATCH 2/2] x86: wire up mincore2()
@ 2016-09-11 17:31   ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-11 17:31 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-nvdimm, linux-api, x86, linux-kernel, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner

Add the new mincore2() symbol to the x86 syscall tables.

Cc: x86@kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f848572169ea..71957671d06b 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -386,3 +386,4 @@
 377	i386	copy_file_range		sys_copy_file_range
 378	i386	preadv2			sys_preadv2			compat_sys_preadv2
 379	i386	pwritev2		sys_pwritev2			compat_sys_pwritev2
+380	i386	mincore2		sys_mincore2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index e9ce9c7c39b4..bf2a2f6b5c49 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -335,6 +335,7 @@
 326	common	copy_file_range		sys_copy_file_range
 327	64	preadv2			sys_preadv2
 328	64	pwritev2		sys_pwritev2
+329	common	mincore2		sys_mincore2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-11 17:31 ` Dan Williams
                     ` (2 preceding siblings ...)
  (?)
@ 2016-09-12  3:35   ` Nicholas Piggin
  -1 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-12  3:35 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andrea Arcangeli, linux-arch, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel, linux-mm,
	Andrew Morton, Kirill A. Shutemov

On Sun, 11 Sep 2016 10:31:35 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

Can you cc linux-arch when adding new syscalls (or other such things that
need arch enablement).

I wonder if the changelog for a new syscall should have a bit more grandeur.
Without seeing patch 2, you might not know this was a new syscall just by
reading the subject and changelog.

mincore() defines other bits to be reserved, but I guess it probably breaks
things if you suddenly started using them.

It's a bit sad to introduce a new syscall for this and immediately use up
all bits that can be returned. Would it be a serious problem to return a
larger mask per page?

Thanks,
Nick
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12  3:35   ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-12  3:35 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Sun, 11 Sep 2016 10:31:35 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

Can you cc linux-arch when adding new syscalls (or other such things that
need arch enablement).

I wonder if the changelog for a new syscall should have a bit more grandeur.
Without seeing patch 2, you might not know this was a new syscall just by
reading the subject and changelog.

mincore() defines other bits to be reserved, but I guess it probably breaks
things if you suddenly started using them.

It's a bit sad to introduce a new syscall for this and immediately use up
all bits that can be returned. Would it be a serious problem to return a
larger mask per page?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12  3:35   ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-12  3:35 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andrea Arcangeli, linux-arch-u79uwXL29TY76Z2rM5mHXA,
	Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Hansen,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Andrew Morton,
	Kirill A. Shutemov

On Sun, 11 Sep 2016 10:31:35 -0700
Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

Can you cc linux-arch when adding new syscalls (or other such things that
need arch enablement).

I wonder if the changelog for a new syscall should have a bit more grandeur.
Without seeing patch 2, you might not know this was a new syscall just by
reading the subject and changelog.

mincore() defines other bits to be reserved, but I guess it probably breaks
things if you suddenly started using them.

It's a bit sad to introduce a new syscall for this and immediately use up
all bits that can be returned. Would it be a serious problem to return a
larger mask per page?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12  3:35   ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-12  3:35 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Sun, 11 Sep 2016 10:31:35 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

Can you cc linux-arch when adding new syscalls (or other such things that
need arch enablement).

I wonder if the changelog for a new syscall should have a bit more grandeur.
Without seeing patch 2, you might not know this was a new syscall just by
reading the subject and changelog.

mincore() defines other bits to be reserved, but I guess it probably breaks
things if you suddenly started using them.

It's a bit sad to introduce a new syscall for this and immediately use up
all bits that can be returned. Would it be a serious problem to return a
larger mask per page?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12  3:35   ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-12  3:35 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Sun, 11 Sep 2016 10:31:35 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

Can you cc linux-arch when adding new syscalls (or other such things that
need arch enablement).

I wonder if the changelog for a new syscall should have a bit more grandeur.
Without seeing patch 2, you might not know this was a new syscall just by
reading the subject and changelog.

mincore() defines other bits to be reserved, but I guess it probably breaks
things if you suddenly started using them.

It's a bit sad to introduce a new syscall for this and immediately use up
all bits that can be returned. Would it be a serious problem to return a
larger mask per page?

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-11 17:31 ` Dan Williams
  (?)
@ 2016-09-12  6:29   ` Oliver O'Halloran
  -1 siblings, 0 replies; 46+ messages in thread
From: Oliver O'Halloran @ 2016-09-12  6:29 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Mon, Sep 12, 2016 at 3:31 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.
>
> There are also cases, particularly for testing and validating a
> configuration to know the hardware mapping geometry of the pages in a
> given process address range.  Consider filesystem-dax where a
> configuration needs to take care to align partitions and block
> allocations before huge page mappings might be used, or
> anonymous-transparent-huge-pages where a process is opportunistically
> assigned large pages.  mincore2() allows these configurations to be
> surveyed and validated.
>
> The implementation takes advantage of the unused bits in the per-page
> byte returned for each PAGE_SIZE extent of a given address range.  The
> new format of each vector byte is:
>
> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present

What is userspace expected to do with the information in vec? Whether
PMD or THP mappings can be used is going to depend more on the block
allocations done by the filesystem rather than anything an
application can directly influence. Returning a vector for each page
makes some sense in the mincore() case since the application can touch
each page to fault them in, but I don't see what they can do here.

Why not just get rid of vec entirely and make mincore2() a yes/no
check over the range for whatever is supplied in flags? That would
work for NVML's use case and it should be easier to extend if needed.

Oliver

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12  6:29   ` Oliver O'Halloran
  0 siblings, 0 replies; 46+ messages in thread
From: Oliver O'Halloran @ 2016-09-12  6:29 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Mon, Sep 12, 2016 at 3:31 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.
>
> There are also cases, particularly for testing and validating a
> configuration to know the hardware mapping geometry of the pages in a
> given process address range.  Consider filesystem-dax where a
> configuration needs to take care to align partitions and block
> allocations before huge page mappings might be used, or
> anonymous-transparent-huge-pages where a process is opportunistically
> assigned large pages.  mincore2() allows these configurations to be
> surveyed and validated.
>
> The implementation takes advantage of the unused bits in the per-page
> byte returned for each PAGE_SIZE extent of a given address range.  The
> new format of each vector byte is:
>
> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present

What is userspace expected to do with the information in vec? Whether
PMD or THP mappings can be used is going to depend more on the block
allocations done by the filesystem rather than anything an
application can directly influence. Returning a vector for each page
makes some sense in the mincore() case since the application can touch
each page to fault them in, but I don't see what they can do here.

Why not just get rid of vec entirely and make mincore2() a yes/no
check over the range for whatever is supplied in flags? That would
work for NVML's use case and it should be easier to extend if needed.

Oliver

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12  6:29   ` Oliver O'Halloran
  0 siblings, 0 replies; 46+ messages in thread
From: Oliver O'Halloran @ 2016-09-12  6:29 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Andrea Arcangeli,
	Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Hansen,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton,
	Kirill A. Shutemov

On Mon, Sep 12, 2016 at 3:31 AM, Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.
>
> There are also cases, particularly for testing and validating a
> configuration to know the hardware mapping geometry of the pages in a
> given process address range.  Consider filesystem-dax where a
> configuration needs to take care to align partitions and block
> allocations before huge page mappings might be used, or
> anonymous-transparent-huge-pages where a process is opportunistically
> assigned large pages.  mincore2() allows these configurations to be
> surveyed and validated.
>
> The implementation takes advantage of the unused bits in the per-page
> byte returned for each PAGE_SIZE extent of a given address range.  The
> new format of each vector byte is:
>
> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present

What is userspace expected to do with the information in vec? Whether
PMD or THP mappings can be used is going to depend more on the block
allocations done by the filesystem rather than anything an
application can directly influence. Returning a vector for each page
makes some sense in the mincore() case since the application can touch
each page to fault them in, but I don't see what they can do here.

Why not just get rid of vec entirely and make mincore2() a yes/no
check over the range for whatever is supplied in flags? That would
work for NVML's use case and it should be easier to extend if needed.

Oliver

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-11 17:31 ` Dan Williams
  (?)
  (?)
@ 2016-09-12 10:09   ` Kirill A. Shutemov
  -1 siblings, 0 replies; 46+ messages in thread
From: Kirill A. Shutemov @ 2016-09-12 10:09 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann, linux-nvdimm,
	linux-api, Dave Hansen, linux-kernel, linux-mm, Andrew Morton,
	Kirill A. Shutemov

On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.
> 
> There are also cases, particularly for testing and validating a
> configuration to know the hardware mapping geometry of the pages in a
> given process address range.  Consider filesystem-dax where a
> configuration needs to take care to align partitions and block
> allocations before huge page mappings might be used, or
> anonymous-transparent-huge-pages where a process is opportunistically
> assigned large pages.  mincore2() allows these configurations to be
> surveyed and validated.
> 
> The implementation takes advantage of the unused bits in the per-page
> byte returned for each PAGE_SIZE extent of a given address range.  The
> new format of each vector byte is:
> 
> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
> 
> [1]: https://lkml.org/lkml/2016/9/7/61
> 
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  include/linux/syscalls.h               |    2 +
>  include/uapi/asm-generic/mman-common.h |    3 +
>  kernel/sys_ni.c                        |    1 
>  mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
>  4 files changed, 104 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index d02239022bd0..4aa2ee7e359a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>  				unsigned char __user * vec);
> +asmlinkage long sys_mincore2(unsigned long start, size_t len,
> +				unsigned char __user * vec, int flags);

We have had a few attempts to extend the mincore(2) interface/functionality
before. None of them ended up upstream.

How does this attempt compare to the previous ones?

-- 
 Kirill A. Shutemov
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 10:09   ` Kirill A. Shutemov
  0 siblings, 0 replies; 46+ messages in thread
From: Kirill A. Shutemov @ 2016-09-12 10:09 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.
> 
> There are also cases, particularly for testing and validating a
> configuration to know the hardware mapping geometry of the pages in a
> given process address range.  Consider filesystem-dax where a
> configuration needs to take care to align partitions and block
> allocations before huge page mappings might be used, or
> anonymous-transparent-huge-pages where a process is opportunistically
> assigned large pages.  mincore2() allows these configurations to be
> surveyed and validated.
> 
> The implementation takes advantage of the unused bits in the per-page
> byte returned for each PAGE_SIZE extent of a given address range.  The
> new format of each vector byte is:
> 
> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
> 
> [1]: https://lkml.org/lkml/2016/9/7/61
> 
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  include/linux/syscalls.h               |    2 +
>  include/uapi/asm-generic/mman-common.h |    3 +
>  kernel/sys_ni.c                        |    1 
>  mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
>  4 files changed, 104 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index d02239022bd0..4aa2ee7e359a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>  				unsigned char __user * vec);
> +asmlinkage long sys_mincore2(unsigned long start, size_t len,
> +				unsigned char __user * vec, int flags);

We have had a few attempts to extend the mincore(2) interface/functionality
before. None of them ended up upstream.

How does this attempt compare to the previous ones?

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 10:09   ` Kirill A. Shutemov
  0 siblings, 0 replies; 46+ messages in thread
From: Kirill A. Shutemov @ 2016-09-12 10:09 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Hansen,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Andrew Morton,
	Kirill A. Shutemov

On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.
> 
> There are also cases, particularly for testing and validating a
> configuration to know the hardware mapping geometry of the pages in a
> given process address range.  Consider filesystem-dax where a
> configuration needs to take care to align partitions and block
> allocations before huge page mappings might be used, or
> anonymous-transparent-huge-pages where a process is opportunistically
> assigned large pages.  mincore2() allows these configurations to be
> surveyed and validated.
> 
> The implementation takes advantage of the unused bits in the per-page
> byte returned for each PAGE_SIZE extent of a given address range.  The
> new format of each vector byte is:
> 
> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
> 
> [1]: https://lkml.org/lkml/2016/9/7/61
> 
> Cc: Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>
> Cc: Andrea Arcangeli <aarcange-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> Cc: Dave Hansen <dave.hansen-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Cc: Xiao Guangrong <guangrong.xiao-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Cc: Kirill A. Shutemov <kirill.shutemov-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Signed-off-by: Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> ---
>  include/linux/syscalls.h               |    2 +
>  include/uapi/asm-generic/mman-common.h |    3 +
>  kernel/sys_ni.c                        |    1 
>  mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
>  4 files changed, 104 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index d02239022bd0..4aa2ee7e359a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>  				unsigned char __user * vec);
> +asmlinkage long sys_mincore2(unsigned long start, size_t len,
> +				unsigned char __user * vec, int flags);

We have had a few attempts to extend the mincore(2) interface/functionality before.
None of them ended up upstream.

How does this attempt compare to the previous ones?

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 10:09   ` Kirill A. Shutemov
  0 siblings, 0 replies; 46+ messages in thread
From: Kirill A. Shutemov @ 2016-09-12 10:09 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.
> 
> There are also cases, particularly for testing and validating a
> configuration to know the hardware mapping geometry of the pages in a
> given process address range.  Consider filesystem-dax where a
> configuration needs to take care to align partitions and block
> allocations before huge page mappings might be used, or
> anonymous-transparent-huge-pages where a process is opportunistically
> assigned large pages.  mincore2() allows these configurations to be
> surveyed and validated.
> 
> The implementation takes advantage of the unused bits in the per-page
> byte returned for each PAGE_SIZE extent of a given address range.  The
> new format of each vector byte is:
> 
> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
> 
> [1]: https://lkml.org/lkml/2016/9/7/61
> 
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  include/linux/syscalls.h               |    2 +
>  include/uapi/asm-generic/mman-common.h |    3 +
>  kernel/sys_ni.c                        |    1 
>  mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
>  4 files changed, 104 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index d02239022bd0..4aa2ee7e359a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>  				unsigned char __user * vec);
> +asmlinkage long sys_mincore2(unsigned long start, size_t len,
> +				unsigned char __user * vec, int flags);

We have had a few attempts to extend the mincore(2) interface/functionality before.
None of them ended up upstream.

How does this attempt compare to the previous ones?

-- 
 Kirill A. Shutemov

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-12 10:09   ` Kirill A. Shutemov
  (?)
@ 2016-09-12 17:15     ` Dan Williams
  -1 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:15 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann, linux-nvdimm,
	linux-api, Dave Hansen, linux-kernel, Linux MM, Andrew Morton,
	Kirill A. Shutemov

On Mon, Sep 12, 2016 at 3:09 AM, Kirill A. Shutemov
<kirill@shutemov.name> wrote:
> On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>>
>> There are also cases, particularly for testing and validating a
>> configuration to know the hardware mapping geometry of the pages in a
>> given process address range.  Consider filesystem-dax where a
>> configuration needs to take care to align partitions and block
>> allocations before huge page mappings might be used, or
>> anonymous-transparent-huge-pages where a process is opportunistically
>> assigned large pages.  mincore2() allows these configurations to be
>> surveyed and validated.
>>
>> The implementation takes advantage of the unused bits in the per-page
>> byte returned for each PAGE_SIZE extent of a given address range.  The
>> new format of each vector byte is:
>>
>> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
>>
>> [1]: https://lkml.org/lkml/2016/9/7/61
>>
>> Cc: Arnd Bergmann <arnd@arndb.de>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
>> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  include/linux/syscalls.h               |    2 +
>>  include/uapi/asm-generic/mman-common.h |    3 +
>>  kernel/sys_ni.c                        |    1
>>  mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
>>  4 files changed, 104 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index d02239022bd0..4aa2ee7e359a 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
>>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
>>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>>                               unsigned char __user * vec);
>> +asmlinkage long sys_mincore2(unsigned long start, size_t len,
>> +                             unsigned char __user * vec, int flags);
>
> We have had a few attempts to extend the mincore(2) interface/functionality before.
> None of them ended up upstream.
>
> How does this attempt compare to the previous ones?

Not sure, I'm wading into this cold trying to get my pet problem
solved, hence the RFC.
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:15     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:15 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm@lists.01.org, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Mon, Sep 12, 2016 at 3:09 AM, Kirill A. Shutemov
<kirill@shutemov.name> wrote:
> On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>>
>> There are also cases, particularly for testing and validating a
>> configuration to know the hardware mapping geometry of the pages in a
>> given process address range.  Consider filesystem-dax where a
>> configuration needs to take care to align partitions and block
>> allocations before huge page mappings might be used, or
>> anonymous-transparent-huge-pages where a process is opportunistically
>> assigned large pages.  mincore2() allows these configurations to be
>> surveyed and validated.
>>
>> The implementation takes advantage of the unused bits in the per-page
>> byte returned for each PAGE_SIZE extent of a given address range.  The
>> new format of each vector byte is:
>>
>> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
>>
>> [1]: https://lkml.org/lkml/2016/9/7/61
>>
>> Cc: Arnd Bergmann <arnd@arndb.de>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
>> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  include/linux/syscalls.h               |    2 +
>>  include/uapi/asm-generic/mman-common.h |    3 +
>>  kernel/sys_ni.c                        |    1
>>  mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
>>  4 files changed, 104 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index d02239022bd0..4aa2ee7e359a 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
>>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
>>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>>                               unsigned char __user * vec);
>> +asmlinkage long sys_mincore2(unsigned long start, size_t len,
>> +                             unsigned char __user * vec, int flags);
>
> We have had a few attempts to extend the mincore(2) interface/functionality before.
> None of them ended up upstream.
>
> How does this attempt compare to the previous ones?

Not sure, I'm wading into this cold trying to get my pet problem
solved, hence the RFC.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:15     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:15 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Mon, Sep 12, 2016 at 3:09 AM, Kirill A. Shutemov
<kirill@shutemov.name> wrote:
> On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>>
>> There are also cases, particularly for testing and validating a
>> configuration to know the hardware mapping geometry of the pages in a
>> given process address range.  Consider filesystem-dax where a
>> configuration needs to take care to align partitions and block
>> allocations before huge page mappings might be used, or
>> anonymous-transparent-huge-pages where a process is opportunistically
>> assigned large pages.  mincore2() allows these configurations to be
>> surveyed and validated.
>>
>> The implementation takes advantage of the unused bits in the per-page
>> byte returned for each PAGE_SIZE extent of a given address range.  The
>> new format of each vector byte is:
>>
>> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
>>
>> [1]: https://lkml.org/lkml/2016/9/7/61
>>
>> Cc: Arnd Bergmann <arnd@arndb.de>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
>> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  include/linux/syscalls.h               |    2 +
>>  include/uapi/asm-generic/mman-common.h |    3 +
>>  kernel/sys_ni.c                        |    1
>>  mm/mincore.c                           |  126 +++++++++++++++++++++++++-------
>>  4 files changed, 104 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index d02239022bd0..4aa2ee7e359a 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
>>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
>>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>>                               unsigned char __user * vec);
>> +asmlinkage long sys_mincore2(unsigned long start, size_t len,
>> +                             unsigned char __user * vec, int flags);
>
> We have had a few attempts to extend the mincore(2) interface/functionality before.
> None of them ended up upstream.
>
> How does this attempt compare to the previous ones?

Not sure, I'm wading into this cold trying to get my pet problem
solved, hence the RFC.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-12  6:29   ` Oliver O'Halloran
  (?)
  (?)
@ 2016-09-12 17:25     ` Dan Williams
  -1 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:25 UTC (permalink / raw)
  To: Oliver O'Halloran
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann, linux-nvdimm,
	linux-api, Dave Hansen, linux-kernel, Linux MM, Andrew Morton,
	Kirill A. Shutemov

On Sun, Sep 11, 2016 at 11:29 PM, Oliver O'Halloran <oohall@gmail.com> wrote:
> On Mon, Sep 12, 2016 at 3:31 AM, Dan Williams <dan.j.williams@intel.com> wrote:
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>>
>> There are also cases, particularly for testing and validating a
>> configuration to know the hardware mapping geometry of the pages in a
>> given process address range.  Consider filesystem-dax where a
>> configuration needs to take care to align partitions and block
>> allocations before huge page mappings might be used, or
>> anonymous-transparent-huge-pages where a process is opportunistically
>> assigned large pages.  mincore2() allows these configurations to be
>> surveyed and validated.
>>
>> The implementation takes advantage of the unused bits in the per-page
>> byte returned for each PAGE_SIZE extent of a given address range.  The
>> new format of each vector byte is:
>>
>> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
>
> What is userspace expected to do with the information in vec? Whether
> PMD or THP mappings can be used is going to depend more on the block
> allocations done by the filesystem rather than anything an
> application can directly influence. Returning a vector for each page
> makes some sense in the mincore() case since the application can touch
> each page to fault them in, but I don't see what they can do here.

It's not a "can huge pages be used?" question; it's interrogating the
mapping that got established after the fact.  If an
application/environment expects huge mappings, but pte mappings are
getting established, this interface lets it detect the mismatch.

> Why not just get rid of vec entirely and make mincore2() a yes/no
> check over the range for whatever is supplied in flags? That would
> work for NVML's use case and it should be easier to extend if needed.

I think having a way to ask the kernel if an address range satisfies a
certain set of input attributes is a useful interface.  Perhaps a
"MINCORE_CHECK" flag can indicate that the input vector contains a
single character that it wants the kernel to validate during the page
table walk, and return zero or the offset of the first mismatch.
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:25     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:25 UTC (permalink / raw)
  To: Oliver O'Halloran
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm@lists.01.org, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Sun, Sep 11, 2016 at 11:29 PM, Oliver O'Halloran <oohall@gmail.com> wrote:
> On Mon, Sep 12, 2016 at 3:31 AM, Dan Williams <dan.j.williams@intel.com> wrote:
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>>
>> There are also cases, particularly for testing and validating a
>> configuration to know the hardware mapping geometry of the pages in a
>> given process address range.  Consider filesystem-dax where a
>> configuration needs to take care to align partitions and block
>> allocations before huge page mappings might be used, or
>> anonymous-transparent-huge-pages where a process is opportunistically
>> assigned large pages.  mincore2() allows these configurations to be
>> surveyed and validated.
>>
>> The implementation takes advantage of the unused bits in the per-page
>> byte returned for each PAGE_SIZE extent of a given address range.  The
>> new format of each vector byte is:
>>
>> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
>
> What is userspace expected to do with the information in vec? Whether
> PMD or THP mappings can be used is going to depend more on the block
> allocations done by the filesystem rather than anything an
> application can directly influence. Returning a vector for each page
> makes some sense in the mincore() case since the application can touch
> each page to fault them in, but I don't see what they can do here.

It's not a "can huge pages be used?" question; it's interrogating the
mapping that got established after the fact.  If an
application/environment expects huge mappings, but pte mappings are
getting established, this interface lets it detect the mismatch.

> Why not just get rid of vec entirely and make mincore2() a yes/no
> check over the range for whatever is supplied in flags? That would
> work for NVML's use case and it should be easier to extend if needed.

I think having a way to ask the kernel if an address range satisfies a
certain set of input attributes is a useful interface.  Perhaps a
"MINCORE_CHECK" flag can indicate that the input vector contains a
single character that it wants the kernel to validate during the page
table walk, and return zero or the offset of the first mismatch.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:25     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:25 UTC (permalink / raw)
  To: Oliver O'Halloran
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Hansen,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux MM, Andrew Morton,
	Kirill A. Shutemov

On Sun, Sep 11, 2016 at 11:29 PM, Oliver O'Halloran <oohall-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> On Mon, Sep 12, 2016 at 3:31 AM, Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>>
>> There are also cases, particularly for testing and validating a
>> configuration to know the hardware mapping geometry of the pages in a
>> given process address range.  Consider filesystem-dax where a
>> configuration needs to take care to align partitions and block
>> allocations before huge page mappings might be used, or
>> anonymous-transparent-huge-pages where a process is opportunistically
>> assigned large pages.  mincore2() allows these configurations to be
>> surveyed and validated.
>>
>> The implementation takes advantage of the unused bits in the per-page
>> byte returned for each PAGE_SIZE extent of a given address range.  The
>> new format of each vector byte is:
>>
>> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
>
> What is userspace expected to do with the information in vec? Whether
> PMD or THP mappings can be used is going to depend more on the block
> allocations done by the filesystem rather than anything an
> application can directly influence. Returning a vector for each page
> makes some sense in the mincore() case since the application can touch
> each page to fault them in, but I don't see what they can do here.

It's not a "can huge pages be used?" question; it's interrogating the
mapping that got established after the fact.  If an
application/environment expects huge mappings, but pte mappings are
getting established, this interface lets it detect the mismatch.

> Why not just get rid of vec entirely and make mincore2() a yes/no
> check over the range for whatever is supplied in flags? That would
> work for NVML's use case and it should be easier to extend if needed.

I think having a way to ask the kernel if an address range satisfies a
certain set of input attributes is a useful interface.  Perhaps a
"MINCORE_CHECK" flag can indicate that the input vector contains a
single character that it wants the kernel to validate during the page
table walk, and return zero or the offset of the first mismatch.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:25     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:25 UTC (permalink / raw)
  To: Oliver O'Halloran
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Sun, Sep 11, 2016 at 11:29 PM, Oliver O'Halloran <oohall@gmail.com> wrote:
> On Mon, Sep 12, 2016 at 3:31 AM, Dan Williams <dan.j.williams@intel.com> wrote:
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>>
>> There are also cases, particularly for testing and validating a
>> configuration to know the hardware mapping geometry of the pages in a
>> given process address range.  Consider filesystem-dax where a
>> configuration needs to take care to align partitions and block
>> allocations before huge page mappings might be used, or
>> anonymous-transparent-huge-pages where a process is opportunistically
>> assigned large pages.  mincore2() allows these configurations to be
>> surveyed and validated.
>>
>> The implementation takes advantage of the unused bits in the per-page
>> byte returned for each PAGE_SIZE extent of a given address range.  The
>> new format of each vector byte is:
>>
>> (TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
>
> What is userspace expected to do with the information in vec? Whether
> PMD or THP mappings can be used is going to depend more on the block
> allocations done by the filesystem rather than anything an
> application can directly influence. Returning a vector for each page
> makes some sense in the mincore() case since the application can touch
> each page to fault them in, but I don't see what they can do here.

It's not a "can huge pages be used?" question; it's interrogating the
mapping that got established after the fact.  If an
application/environment expects huge mappings, but pte mappings are
getting established, this interface lets it detect the mismatch.

> Why not just get rid of vec entirely and make mincore2() a yes/no
> check over the range for whatever is supplied in flags? That would
> work for NVML's use case and it should be easier to extend if needed.

I think having a way to ask the kernel if an address range satisfies a
certain set of input attributes is a useful interface.  Perhaps a
"MINCORE_CHECK" flag can indicate that the input vector contains a
single character that it wants the kernel to validate during the page
table walk, and return zero or the offset of the first mismatch.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-12  3:35   ` Nicholas Piggin
  (?)
  (?)
@ 2016-09-12 17:29     ` Dan Williams
  -1 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:29 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andrea Arcangeli, linux-arch, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel, Linux MM,
	Andrew Morton, Kirill A. Shutemov

On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Sun, 11 Sep 2016 10:31:35 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>
> Can you cc linux-arch when adding new syscalls (or other such things that
> need arch enablement).
>
> I wonder if the changelog for a new syscall should have a bit more grandeur.
> Without seeing patch 2, you might not know this was a new syscall just by
> reading the subject and changelog.

Fair point, I'll beef up the documentation if this moves past an RFC.

> mincore() defines other bits to be reserved, but I guess it probably breaks
> things if you suddenly started using them.

The new bits are left as zero unless an application explicitly asks
for them, so an existing mincore() user shouldn't break.

> It's a bit sad to introduce a new syscall for this and immediately use up
> all bits that can be returned. Would it be a serious problem to return a
> larger mask per page?

Certainly one of the new request flags can indicate that the vector is
made up of larger entries.
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:29     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:29 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm@lists.01.org, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Sun, 11 Sep 2016 10:31:35 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>
> Can you cc linux-arch when adding new syscalls (or other such things that
> need arch enablement).
>
> I wonder if the changelog for a new syscall should have a bit more grandeur.
> Without seeing patch 2, you might not know this was a new syscall just by
> reading the subject and changelog.

Fair point, I'll beef up the documentation if this moves past an RFC.

> mincore() defines other bits to be reserved, but I guess it probably breaks
> things if you suddenly started using them.

The new bits are left as zero unless an application explicitly asks
for them, so an existing mincore() user shouldn't break.

> It's a bit sad to introduce a new syscall for this and immediately use up
> all bits that can be returned. Would it be a serious problem to return a
> larger mask per page?

Certainly one of the new request flags can indicate that the vector is
made up of larger entries.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:29     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:29 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Sun, 11 Sep 2016 10:31:35 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>
> Can you cc linux-arch when adding new syscalls (or other such things that
> need arch enablement).
>
> I wonder if the changelog for a new syscall should have a bit more grandeur.
> Without seeing patch 2, you might not know this was a new syscall just by
> reading the subject and changelog.

Fair point, I'll beef up the documentation if this moves past an RFC.

> mincore() defines other bits to be reserved, but I guess it probably breaks
> things if you suddenly started using them.

The new bits are left as zero unless an application explicitly asks
for them, so an existing mincore() user shouldn't break.

> It's a bit sad to introduce a new syscall for this and immediately use up
> all bits that can be returned. Would it be a serious problem to return a
> larger mask per page?

Certainly one of the new request flags can indicate that the vector is
made up of larger entries.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-12 17:29     ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-12 17:29 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Sun, 11 Sep 2016 10:31:35 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
>> As evidenced by this bug report [1], userspace libraries are interested
>> in whether a mapping is DAX mapped, i.e. no intervening page cache.
>> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
>> explicit "is dax" indication as a new flag in the page vector populated
>> by mincore.
>
> Can you cc linux-arch when adding new syscalls (or other such things that
> need arch enablement).
>
> I wonder if the changelog for a new syscall should have a bit more grandeur.
> Without seeing patch 2, you might not know this was a new syscall just by
> reading the subject and changelog.

Fair point, I'll beef up the documentation if this moves past an RFC.

> mincore() defines other bits to be reserved, but I guess it probably breaks
> things if you suddenly started using them.

The new bits are left as zero unless an application explicitly asks
for them, so an existing mincore() user shouldn't break.

> It's a bit sad to introduce a new syscall for this and immediately use up
> all bits that can be returned. Would it be a serious problem to return a
> larger mask per page?

Certainly one of the new request flags can indicate that the vector is
made up of larger entries.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-12 17:29     ` Dan Williams
                         ` (2 preceding siblings ...)
  (?)
@ 2016-09-13  2:16       ` Nicholas Piggin
  -1 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-13  2:16 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andrea Arcangeli, linux-arch, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel, Linux MM,
	Andrew Morton, Kirill A. Shutemov

On Mon, 12 Sep 2016 10:29:17 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> > On Sun, 11 Sep 2016 10:31:35 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> >> As evidenced by this bug report [1], userspace libraries are interested
> >> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> >> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> >> explicit "is dax" indication as a new flag in the page vector populated
> >> by mincore.  
> >
> > Can you cc linux-arch when adding new syscalls (or other such things that
> > need arch enablement).
> >
> > I wonder if the changelog for a new syscall should have a bit more grandeur.
> > Without seeing patch 2, you might not know this was a new syscall just by
> > reading the subject and changelog.  
> 
> Fair point, I'll beef up the documentation if this moves past an RFC.

Okay. Also, it would be good to summarise some of the justification
directly in the changelog rather than external link. Performance
numbers, etc.


> > mincore() defines other bits to be reserved, but I guess it probably breaks
> > things if you suddenly started using them.  
> 
> The new bits are left as zero unless an application explicitly asks
> for them, so an existing mincore() user shouldn't break.

Oh yeah, I was just musing that we can't really use the old syscall
despite its claim to have some reserved bits for future use.


> > It's a bit sad to introduce a new syscall for this and immediately use up
> > all bits that can be returned. Would it be a serious problem to return a
> > larger mask per page?  
> 
> Certainly one of the new request flags can indicate that the vector is
> made up of larger entries.

Hmm. Changing prototype depending on flags. I thought I was having
a nightmare about ioctls for a minute there :)

In general, is this what we want for a new API? Should we be thinking
about an extent API?

Thanks,
Nick
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  2:16       ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-13  2:16 UTC (permalink / raw)
  To: Dan Williams
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm@lists.01.org, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Mon, 12 Sep 2016 10:29:17 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> > On Sun, 11 Sep 2016 10:31:35 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> >> As evidenced by this bug report [1], userspace libraries are interested
> >> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> >> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> >> explicit "is dax" indication as a new flag in the page vector populated
> >> by mincore.  
> >
> > Can you cc linux-arch when adding new syscalls (or other such things that
> > need arch enablement).
> >
> > I wonder if the changelog for a new syscall should have a bit more grandeur.
> > Without seeing patch 2, you might not know this was a new syscall just by
> > reading the subject and changelog.  
> 
> Fair point, I'll beef up the documentation if this moves past an RFC.

Okay. Also, it would be good to summarise some of the justification
directly in the changelog rather than external link. Performance
numbers, etc.


> > mincore() defines other bits to be reserved, but I guess it probably breaks
> > things if you suddenly started using them.  
> 
> The new bits are left as zero unless an application explicitly asks
> for them, so an existing mincore() user shouldn't break.

Oh yeah, I was just musing that we can't really use the old syscall
despite its claim to have some reserved bits for future use.


> > It's a bit sad to introduce a new syscall for this and immediately use up
> > all bits that can be returned. Would it be a serious problem to return a
> > larger mask per page?  
> 
> Certainly one of the new request flags can indicate that the vector is
> made up of larger entries.

Hmm. Changing prototype depending on flags. I thought I was having
a nightmare about ioctls for a minute there :)

In general, is this what we want for a new API? Should we be thinking
about an extent API?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  2:16       ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-13  2:16 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andrea Arcangeli, linux-arch-u79uwXL29TY76Z2rM5mHXA,
	Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Hansen,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux MM, Andrew Morton,
	Kirill A. Shutemov

On Mon, 12 Sep 2016 10:29:17 -0700
Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> > On Sun, 11 Sep 2016 10:31:35 -0700
> > Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> >  
> >> As evidenced by this bug report [1], userspace libraries are interested
> >> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> >> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> >> explicit "is dax" indication as a new flag in the page vector populated
> >> by mincore.  
> >
> > Can you cc linux-arch when adding new syscalls (or other such things that
> > need arch enablement).
> >
> > I wonder if the changelog for a new syscall should have a bit more grandeur.
> > Without seeing patch 2, you might not know this was a new syscall just by
> > reading the subject and changelog.  
> 
> Fair point, I'll beef up the documentation if this moves past an RFC.

Okay. Also, it would be good to summarise some of the justification
directly in the changelog rather than external link. Performance
numbers, etc.


> > mincore() defines other bits to be reserved, but I guess it probably breaks
> > things if you suddenly started using them.  
> 
> The new bits are left as zero unless an application explicitly asks
> for them, so an existing mincore() user shouldn't break.

Oh yeah, I was just musing that we can't really use the old syscall
despite its claim to have some reserved bits for future use.


> > It's a bit sad to introduce a new syscall for this and immediately use up
> > all bits that can be returned. Would it be a serious problem to return a
> > larger mask per page?  
> 
> Certainly one of the new request flags can indicate that the vector is
> made up of larger entries.

Hmm. Changing prototype depending on flags. I thought I was having
a nightmare about ioctls for a minute there :)

In general, is this what we want for a new API? Should we be thinking
about an extent API?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  2:16       ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-13  2:16 UTC (permalink / raw)
  To: Dan Williams
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Mon, 12 Sep 2016 10:29:17 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> > On Sun, 11 Sep 2016 10:31:35 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> >> As evidenced by this bug report [1], userspace libraries are interested
> >> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> >> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> >> explicit "is dax" indication as a new flag in the page vector populated
> >> by mincore.  
> >
> > Can you cc linux-arch when adding new syscalls (or other such things that
> > need arch enablement).
> >
> > I wonder if the changelog for a new syscall should have a bit more grandeur.
> > Without seeing patch 2, you might not know this was a new syscall just by
> > reading the subject and changelog.  
> 
> Fair point, I'll beef up the documentation if this moves past an RFC.

Okay. Also, it would be good to summarise some of the justification
directly in the changelog rather than external link. Performance
numbers, etc.


> > mincore() defines other bits to be reserved, but I guess it probably breaks
> > things if you suddenly started using them.  
> 
> The new bits are left as zero unless an application explicitly asks
> for them, so an existing mincore() user shouldn't break.

Oh yeah, I was just musing that we can't really use the old syscall
despite its claim to have some reserved bits for future use.


> > It's a bit sad to introduce a new syscall for this and immediately use up
> > all bits that can be returned. Would it be a serious problem to return a
> > larger mask per page?  
> 
> Certainly one of the new request flags can indicate that the vector is
> made up of larger entries.

Hmm. Changing prototype depending on flags. I thought I was having
a nightmare about ioctls for a minute there :)

In general, is this what we want for a new API? Should we be thinking
about an extent API?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  2:16       ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2016-09-13  2:16 UTC (permalink / raw)
  To: Dan Williams
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Mon, 12 Sep 2016 10:29:17 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> On Sun, Sep 11, 2016 at 8:35 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> > On Sun, 11 Sep 2016 10:31:35 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> >> As evidenced by this bug report [1], userspace libraries are interested
> >> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> >> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> >> explicit "is dax" indication as a new flag in the page vector populated
> >> by mincore.  
> >
> > Can you cc linux-arch when adding new syscalls (or other such things that
> > need arch enablement).
> >
> > I wonder if the changelog for a new syscall should have a bit more grandeur.
> > Without seeing patch 2, you might not know this was a new syscall just by
> > reading the subject and changelog.  
> 
> Fair point, I'll beef up the documentation if this moves past an RFC.

Okay. Also, it would be good to summarise some of the justification
directly in the changelog rather than external link. Performance
numbers, etc.


> > mincore() defines other bits to be reserved, but I guess it probably breaks
> > things if you suddenly started using them.  
> 
> The new bits are left as zero unless an application explicitly asks
> for them, so an existing mincore() user shouldn't break.

Oh yeah, I was just musing that we can't really use the old syscall
despite its claim to have some reserved bits for future use.


> > It's a bit sad to introduce a new syscall for this and immediately use up
> > all bits that can be returned. Would it be a serious problem to return a
> > larger mask per page?  
> 
> Certainly one of the new request flags can indicate that the vector is
> made up of larger entries.

Hmm. Changing prototype depending on flags. I thought I was having
a nightmare about ioctls for a minute there :)

In general, is this what we want for a new API? Should we be thinking
about an extent API?

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-13  2:16       ` Nicholas Piggin
  (?)
  (?)
@ 2016-09-13  3:49         ` Dan Williams
  -1 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-13  3:49 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Mon, Sep 12, 2016 at 7:16 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Mon, 12 Sep 2016 10:29:17 -0700
[..]
>> Certainly one of the new request flags can indicate that the vector is
>> made up of larger entries.
>
> Hmm. Changing prototype depending on flags. I thought I was having
> a nightmare about ioctls for a minute there :)

Heh :)

> In general, is this what we want for a new API? Should we be thinking
> about an extent API?

This probably fits better with the use cases I know that want to
consume this information, something like fiemap (mextmap, maybe?) for
memory.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  3:49         ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-13  3:49 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm@lists.01.org, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Mon, Sep 12, 2016 at 7:16 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Mon, 12 Sep 2016 10:29:17 -0700
[..]
>> Certainly one of the new request flags can indicate that the vector is
>> made up of larger entries.
>
> Hmm. Changing prototype depending on flags. I thought I was having
> a nightmare about ioctls for a minute there :)

Heh :)

> In general, is this what we want for a new API? Should we be thinking
> about an extent API?

This probably fits better with the use cases I know that want to
consume this information, something like fiemap (mextmap, maybe?) for
memory.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  3:49         ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-13  3:49 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andrea Arcangeli, linux-arch-u79uwXL29TY76Z2rM5mHXA,
	Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Hansen,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux MM, Andrew Morton,
	Kirill A. Shutemov

On Mon, Sep 12, 2016 at 7:16 PM, Nicholas Piggin <npiggin-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> On Mon, 12 Sep 2016 10:29:17 -0700
[..]
>> Certainly one of the new request flags can indicate that the vector is
>> made up of larger entries.
>
> Hmm. Changing prototype depending on flags. I thought I was having
> a nightmare about ioctls for a minute there :)

Heh :)

> In general, is this what we want for a new API? Should we be thinking
> about an extent API?

This probably fits better with the use cases I know that want to
consume this information, something like fiemap (mextmap, maybe?) for
memory.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  3:49         ` Dan Williams
  0 siblings, 0 replies; 46+ messages in thread
From: Dan Williams @ 2016-09-13  3:49 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Linux MM, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov, linux-arch

On Mon, Sep 12, 2016 at 7:16 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Mon, 12 Sep 2016 10:29:17 -0700
[..]
>> Certainly one of the new request flags can indicate that the vector is
>> made up of larger entries.
>
> Hmm. Changing prototype depending on flags. I thought I was having
> a nightmare about ioctls for a minute there :)

Heh :)

> In general, is this what we want for a new API? Should we be thinking
> about an extent API?

This probably fits better with the use cases I know that want to
consume this information, something like fiemap (mextmap, maybe?) for
memory.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
  2016-09-11 17:31 ` Dan Williams
  (?)
@ 2016-09-13  6:44     ` Christoph Hellwig
  -1 siblings, 0 replies; 46+ messages in thread
From: Christoph Hellwig @ 2016-09-13  6:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Hansen,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-nvdimm-y27Ovi1pjclAfugRpC6u6w, Andrew Morton,
	Kirill A. Shutemov

On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

And how exactly does an implementation detail like DAX matter for an
application?  The only thing that might matter is the atomicity boundary,
but mincore is not the right interface for that.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  6:44     ` Christoph Hellwig
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Hellwig @ 2016-09-13  6:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

And how exactly does an implementation detail like DAX matter for an
application?  The only thing that might matter is the atomicity boundary,
but mincore is not the right interface for that.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
@ 2016-09-13  6:44     ` Christoph Hellwig
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Hellwig @ 2016-09-13  6:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, Andrea Arcangeli, Xiao Guangrong, Arnd Bergmann,
	linux-nvdimm, linux-api, Dave Hansen, linux-kernel,
	Andrew Morton, Kirill A. Shutemov

On Sun, Sep 11, 2016 at 10:31:35AM -0700, Dan Williams wrote:
> As evidenced by this bug report [1], userspace libraries are interested
> in whether a mapping is DAX mapped, i.e. no intervening page cache.
> Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
> explicit "is dax" indication as a new flag in the page vector populated
> by mincore.

And how exactly does an implementation detail like DAX matter for an
application?  The only thing that might matter is the atomicity boundary,
but mincore is not the right interface for that.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 2/2] x86: wire up mincore2()
  2016-09-11 17:31   ` Dan Williams
  (?)
  (?)
@ 2016-09-13 18:44     ` Ingo Molnar
  -1 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2016-09-13 18:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-nvdimm, linux-api, x86, linux-kernel, linux-mm,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner


* Dan Williams <dan.j.williams@intel.com> wrote:

> Add the new mincore2() symbol to the x86 syscall tables.

Could you please send the patch against -tip? We have this (new) commit in the x86 
tree:

  f9afc6197e9b x86: Wire up protection keys system calls

... which created a new conflict.

Thanks,

	Ingo
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 2/2] x86: wire up mincore2()
@ 2016-09-13 18:44     ` Ingo Molnar
  0 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2016-09-13 18:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, linux-nvdimm, linux-api, x86, linux-kernel,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner


* Dan Williams <dan.j.williams@intel.com> wrote:

> Add the new mincore2() symbol to the x86 syscall tables.

Could you please send the patch against -tip? We have this (new) commit in the x86 
tree:

  f9afc6197e9b x86: Wire up protection keys system calls

... which created a new conflict.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 2/2] x86: wire up mincore2()
@ 2016-09-13 18:44     ` Ingo Molnar
  0 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2016-09-13 18:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-api-u79uwXL29TY76Z2rM5mHXA, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Ingo Molnar, H. Peter Anvin,
	Thomas Gleixner


* Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> Add the new mincore2() symbol to the x86 syscall tables.

Could you please send the patch against -tip? We have this (new) commit in the x86 
tree:

  f9afc6197e9b x86: Wire up protection keys system calls

... which created a new conflict.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 2/2] x86: wire up mincore2()
@ 2016-09-13 18:44     ` Ingo Molnar
  0 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2016-09-13 18:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-mm, linux-nvdimm, linux-api, x86, linux-kernel,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner


* Dan Williams <dan.j.williams@intel.com> wrote:

> Add the new mincore2() symbol to the x86 syscall tables.

Could you please send the patch against -tip? We have this (new) commit in the x86 
tree:

  f9afc6197e9b x86: Wire up protection keys system calls

... which created a new conflict.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2016-09-13 18:44 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-11 17:31 [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range Dan Williams
2016-09-11 17:31 ` Dan Williams
2016-09-11 17:31 ` Dan Williams
2016-09-11 17:31 ` [RFC PATCH 2/2] x86: wire up mincore2() Dan Williams
2016-09-11 17:31   ` Dan Williams
2016-09-11 17:31   ` Dan Williams
2016-09-11 17:31   ` Dan Williams
2016-09-13 18:44   ` Ingo Molnar
2016-09-13 18:44     ` Ingo Molnar
2016-09-13 18:44     ` Ingo Molnar
2016-09-13 18:44     ` Ingo Molnar
2016-09-12  3:35 ` [RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range Nicholas Piggin
2016-09-12  3:35   ` Nicholas Piggin
2016-09-12  3:35   ` Nicholas Piggin
2016-09-12  3:35   ` Nicholas Piggin
2016-09-12  3:35   ` Nicholas Piggin
2016-09-12 17:29   ` Dan Williams
2016-09-12 17:29     ` Dan Williams
2016-09-12 17:29     ` Dan Williams
2016-09-12 17:29     ` Dan Williams
2016-09-13  2:16     ` Nicholas Piggin
2016-09-13  2:16       ` Nicholas Piggin
2016-09-13  2:16       ` Nicholas Piggin
2016-09-13  2:16       ` Nicholas Piggin
2016-09-13  2:16       ` Nicholas Piggin
2016-09-13  3:49       ` Dan Williams
2016-09-13  3:49         ` Dan Williams
2016-09-13  3:49         ` Dan Williams
2016-09-13  3:49         ` Dan Williams
2016-09-12  6:29 ` Oliver O'Halloran
2016-09-12  6:29   ` Oliver O'Halloran
2016-09-12  6:29   ` Oliver O'Halloran
2016-09-12 17:25   ` Dan Williams
2016-09-12 17:25     ` Dan Williams
2016-09-12 17:25     ` Dan Williams
2016-09-12 17:25     ` Dan Williams
2016-09-12 10:09 ` Kirill A. Shutemov
2016-09-12 10:09   ` Kirill A. Shutemov
2016-09-12 10:09   ` Kirill A. Shutemov
2016-09-12 10:09   ` Kirill A. Shutemov
2016-09-12 17:15   ` Dan Williams
2016-09-12 17:15     ` Dan Williams
2016-09-12 17:15     ` Dan Williams
     [not found] ` <147361509579.17004.5258725187329709824.stgit-p8uTFz9XbKj2zm6wflaqv1nYeNYlB/vhral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2016-09-13  6:44   ` Christoph Hellwig
2016-09-13  6:44     ` Christoph Hellwig
2016-09-13  6:44     ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.