linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* RFC: mincore: add a bit to indicate a page is dirty.
@ 2013-02-11  3:13 Rusty Russell
  2013-02-11 16:27 ` Johannes Weiner
  0 siblings, 1 reply; 22+ messages in thread
From: Rusty Russell @ 2013-02-11  3:13 UTC (permalink / raw)
  To: LKML; +Cc: Andrew Morton, Johannes Weiner, Nick Piggin, Stewart Smith

I am writing an app which really wants to know if a file is on the
disk or not (ie. do I need to sync?).

mincore() bits other than 0 are undefined (as documented in the man
page); in fact my Ubuntu 12.10 i386 system seems to write 129 in some
bytes, so it really shouldn't break anyone.

Is PG_dirty the right choice?  Is that right for huge pages?  Should I
assume is_migration_entry(entry) means it's not dirty, or is there some
other check here?

Thanks,
Rusty

diff --git a/mm/mincore.c b/mm/mincore.c
index 936b4ce..e1e8f03 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,6 +19,9 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#define MINCORE_INCORE 1
+#define MINCORE_DIRTY  2
+
 static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
 				unsigned long addr, unsigned long end,
 				unsigned char *vec)
@@ -28,7 +31,7 @@ static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
 
 	h = hstate_vma(vma);
 	while (1) {
-		unsigned char present;
+		unsigned char flags = 0;
 		pte_t *ptep;
 		/*
 		 * Huge pages are always in RAM for now, but
@@ -36,7 +39,15 @@ static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
 		 */
 		ptep = huge_pte_offset(current->mm,
 				       addr & huge_page_mask(h));
-		present = ptep && !huge_pte_none(huge_ptep_get(ptep));
+		if (ptep) {
+			pte_t pte = huge_ptep_get(ptep);
+
+			if (!huge_pte_none(pte)) {
+				flags = MINCORE_INCORE;
+				if (pte_dirty(pte))
+				    flags |= MINCORE_DIRTY;
+			}
+		}
 		while (1) {
 			*vec = present;
 			vec++;
@@ -61,7 +72,7 @@ static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
  */
 static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 {
-	unsigned char present = 0;
+	unsigned char flags = 0;
 	struct page *page;
 
 	/*
@@ -79,11 +90,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 	}
 #endif
 	if (page) {
-		present = PageUptodate(page);
+		if (PageUptodate(page)) {
+			flags = MINCORE_INCORE;
+			if (PageDirty(page))
+				flags |= MINCORE_DIRTY;
+		}
 		page_cache_release(page);
 	}
 
-	return present;
+	return flags;
 }
 
 static void mincore_unmapped_range(struct vm_area_struct *vma,
@@ -121,9 +136,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		next = addr + PAGE_SIZE;
 		if (pte_none(pte))
 			mincore_unmapped_range(vma, addr, next, vec);
-		else if (pte_present(pte))
-			*vec = 1;
-		else if (pte_file(pte)) {
+		else if (pte_present(pte)) {
+			*vec = MINCORE_INCORE;
+			if (pte_dirty(pte))
+				*vec |= MINCORE_DIRTY;
+		} else if (pte_file(pte)) {
 			pgoff = pte_to_pgoff(pte);
 			*vec = mincore_page(vma->vm_file->f_mapping, pgoff);
 		} else { /* pte is a swap entry */
@@ -131,14 +148,15 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			if (is_migration_entry(entry)) {
 				/* migration entries are always uptodate */
-				*vec = 1;
+				*vec = MINCORE_INCORE;
+				/* FIXME: Can they be dirty? */
 			} else {
 #ifdef CONFIG_SWAP
 				pgoff = entry.val;
 				*vec = mincore_page(&swapper_space, pgoff);
 #else
 				WARN_ON(1);
-				*vec = 1;
+				*vec = MINCORE_INCORE|MINCORE_DIRTY;
 #endif
 			}
 		}
@@ -246,7 +264,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
  * current process's address space specified by [addr, addr + len).
  * The status is returned in a vector of bytes.  The least significant
  * bit of each byte is 1 if the referenced page is in memory, otherwise
- * it is zero.
+ * it is zero.  The second bit indicates if page (may be) dirty.
  *
  * Because the status of a page can change after mincore() checks it
  * but before it returns to the application, the returned vector may

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: RFC: mincore: add a bit to indicate a page is dirty.
  2013-02-11  3:13 RFC: mincore: add a bit to indicate a page is dirty Rusty Russell
@ 2013-02-11 16:27 ` Johannes Weiner
  2013-02-11 22:12   ` Andrew Morton
  2013-02-12  5:49   ` RFC: mincore: add a bit to indicate a page is dirty Rusty Russell
  0 siblings, 2 replies; 22+ messages in thread
From: Johannes Weiner @ 2013-02-11 16:27 UTC (permalink / raw)
  To: Rusty Russell; +Cc: LKML, Andrew Morton, Nick Piggin, Stewart Smith

On Mon, Feb 11, 2013 at 01:43:03PM +1030, Rusty Russell wrote:
> I am writing an app which really wants to know if a file is on the
> disk or not (ie. do I need to sync?).

When the page is under writeback, it's not necessarily on disk yet,
but you also don't need to sync.  Which semantics make more sense?

I'm leaning toward checking both PG_dirty and PG_writeback.

> mincore() bits other than 0 are undefined (as documented in the man
> page); in fact my Ubuntu 12.10 i386 system seems to write 129 in some
> bytes, so it really shouldn't break anyone.
> 
> Is PG_dirty the right choice?  Is that right for huge pages?  Should I
> assume is_migration_entry(entry) means it's not dirty, or is there some
> other check here?

If your only consequence of finding dirty pages is to sync, would you
be better off using fsync/fdatasync maybe?

This should work even if you only access the file through mmap, due to
the way we trap dirtying with write-protected ptes to accurately
account for dirty pages and update the status in the page cache.

> @@ -36,7 +39,15 @@ static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
>  		 */
>  		ptep = huge_pte_offset(current->mm,
>  				       addr & huge_page_mask(h));
> -		present = ptep && !huge_pte_none(huge_ptep_get(ptep));
> +		if (ptep) {
> +			pte_t pte = huge_ptep_get(ptep);
> +
> +			if (!huge_pte_none(pte)) {
> +				flags = MINCORE_INCORE;
> +				if (pte_dirty(pte))
> +				    flags |= MINCORE_DIRTY;
> +			}
> +		}

This looks good to me.  However, this only covers the hugetlb page
implementation, you also have to annotate mincore_huge_pmd to cover
transparent huge pages:

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6001ee6..c632517 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1403,12 +1403,17 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	int ret = 0;
 
 	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+		struct page *page = pmd_page(pmd);
+		unsigned char flags;
 		/*
 		 * All logical pages in the range are present
 		 * if backed by a huge page.
 		 */
+		flags = MINCORE_INCORE;
+		if (PageDirty(page))
+			flags |= MINCORE_DIRTY;
 		spin_unlock(&vma->vm_mm->page_table_lock);
-		memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+		memset(vec, flags, (end - addr) >> PAGE_SHIFT);
 		ret = 1;
 	}
 
> @@ -131,14 +148,15 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  
>  			if (is_migration_entry(entry)) {
>  				/* migration entries are always uptodate */
> -				*vec = 1;
> +				*vec = MINCORE_INCORE;
> +				/* FIXME: Can they be dirty? */

Yes, they can.  Use migration_entry_to_page() [safe with pte lock] and
test PageDirty() on it.

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: RFC: mincore: add a bit to indicate a page is dirty.
  2013-02-11 16:27 ` Johannes Weiner
@ 2013-02-11 22:12   ` Andrew Morton
  2013-02-12  5:44     ` Rusty Russell
                       ` (2 more replies)
  2013-02-12  5:49   ` RFC: mincore: add a bit to indicate a page is dirty Rusty Russell
  1 sibling, 3 replies; 22+ messages in thread
From: Andrew Morton @ 2013-02-11 22:12 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith

On Mon, 11 Feb 2013 11:27:01 -0500
Johannes Weiner <hannes@cmpxchg.org> wrote:

> > Is PG_dirty the right choice?  Is that right for huge pages?  Should I
> > assume is_migration_entry(entry) means it's not dirty, or is there some
> > other check here?
> 
> If your only consequence of finding dirty pages is to sync, would you
> be better off using fsync/fdatasync maybe?

Yes, if the data is all on disk then an fsync() will be a no-op.  IOW,

	if (I need to fsync)
		fsync();

is equivalent to

	fsync();


Methinks we need to understand the requirement better.


Also, having to mmap the file to be able to query pagecache state is a
hack.  Whatever happened to the fincore() patch?


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: RFC: mincore: add a bit to indicate a page is dirty.
  2013-02-11 22:12   ` Andrew Morton
@ 2013-02-12  5:44     ` Rusty Russell
  2013-02-15  6:34     ` [patch 1/2] mm: fincore() Johannes Weiner
  2013-02-15  6:35     ` [patch 2/2] x86-64: hook up fincore() syscall Johannes Weiner
  2 siblings, 0 replies; 22+ messages in thread
From: Rusty Russell @ 2013-02-12  5:44 UTC (permalink / raw)
  To: Andrew Morton, Johannes Weiner; +Cc: LKML, Nick Piggin, Stewart Smith

Andrew Morton <akpm@linux-foundation.org> writes:
> On Mon, 11 Feb 2013 11:27:01 -0500
> Johannes Weiner <hannes@cmpxchg.org> wrote:
>
>> > Is PG_dirty the right choice?  Is that right for huge pages?  Should I
>> > assume is_migration_entry(entry) means it's not dirty, or is there some
>> > other check here?
>> 
>> If your only consequence of finding dirty pages is to sync, would you
>> be better off using fsync/fdatasync maybe?
>
> Yes, if the data is all on disk then an fsync() will be a no-op.  IOW,
>
> 	if (I need to fsync)
> 		fsync();
>
> is equivalent to
>
> 	fsync();
>
>
> Methinks we need to understand the requirement better.

I have a simple journalling system in userspace, to avoid sync
(ie. consistency, not necessarily durability).  It just records all the
write() calls.  See prototype code here (in ccan/softsync dir):

        https://github.com/rustyrussell/ccan/tree/softsync

The question is, when to do check/recovery.  I currently do it on every
open (yech).  One way is to only do that if the file is older than the
mount it's on (see attached patch, which has its own issues).  Or I can
delete the journal altogether any time the file is on disk, to indicate
no recovery is needed.

> Also, having to mmap the file to be able to query pagecache state is a
> hack.  Whatever happened to the fincore() patch?

Yes.  That would be great for non-thrashing backup programs, too.

Cheers,
Rusty.
diff --git a/fs/mount.h b/fs/mount.h
index cd50079..57e0113 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -49,6 +49,9 @@ struct mount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
+#ifdef CONFIG_PROC_FS
+	union ktime mnt_time;		/* time created. */
+#endif
 };
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index 55605c5..19b5f1b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -198,6 +198,9 @@ static struct mount *alloc_vfsmnt(const char *name)
 #ifdef CONFIG_FSNOTIFY
 		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
+#ifdef CONFIG_PROC_FS
+		mnt->mnt_time = ktime_get();
+#endif
 	}
 	return mnt;
 
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 5fe34c3..0341c34 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -75,6 +75,15 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 	}
 }
 
+static void show_mount_age(struct seq_file *m, struct mount *r)
+{
+	struct timeval age;
+
+	/* Age wearies us, but it's independent of time changes since boot. */
+	age = ktime_to_timeval(ktime_sub(ktime_get(), r->mnt_time));
+	seq_printf(m, ",age=%lu.%06lu", age.tv_sec, age.tv_usec);
+}
+
 static inline void mangle(struct seq_file *m, const char *s)
 {
 	seq_escape(m, s, " \t\n\\");
@@ -112,6 +121,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
 	if (err)
 		goto out;
 	show_mnt_opts(m, mnt);
+	show_mount_age(m, r);
 	if (sb->s_op->show_options)
 		err = sb->s_op->show_options(m, mnt_path.dentry);
 	seq_puts(m, " 0 0\n");
@@ -145,6 +155,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 
 	seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
 	show_mnt_opts(m, mnt);
+	show_mount_age(m, r);
 
 	/* Tagged fields ("foo:X" or "bar") */
 	if (IS_MNT_SHARED(r))

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: RFC: mincore: add a bit to indicate a page is dirty.
  2013-02-11 16:27 ` Johannes Weiner
  2013-02-11 22:12   ` Andrew Morton
@ 2013-02-12  5:49   ` Rusty Russell
  1 sibling, 0 replies; 22+ messages in thread
From: Rusty Russell @ 2013-02-12  5:49 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: LKML, Andrew Morton, Nick Piggin, Stewart Smith

Johannes Weiner <hannes@cmpxchg.org> writes:

> On Mon, Feb 11, 2013 at 01:43:03PM +1030, Rusty Russell wrote:
>> I am writing an app which really wants to know if a file is on the
>> disk or not (ie. do I need to sync?).
>
> When the page is under writeback, it's not necessarily on disk yet,
> but you also don't need to sync.  Which semantics make more sense?
>
> I'm leaning toward checking both PG_dirty and PG_writeback.

If it's already under PG_writeback, you probably still need to wait for
it to finish if you're trying to ensure write ordering. ie. sync.

I've updated my patch by stealing your code.

Thanks!
Rusty.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [patch 1/2] mm: fincore()
  2013-02-11 22:12   ` Andrew Morton
  2013-02-12  5:44     ` Rusty Russell
@ 2013-02-15  6:34     ` Johannes Weiner
  2013-02-15 20:39       ` David Miller
                         ` (3 more replies)
  2013-02-15  6:35     ` [patch 2/2] x86-64: hook up fincore() syscall Johannes Weiner
  2 siblings, 4 replies; 22+ messages in thread
From: Johannes Weiner @ 2013-02-15  6:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Mon, Feb 11, 2013 at 02:12:39PM -0800, Andrew Morton wrote:
> Also, having to mmap the file to be able to query pagecache state is a
> hack.  Whatever happened to the fincore() patch?

I don't know, but how about this one:

---
From: Johannes Weiner <hannes@cmpxchg.org>
Subject: [patch 1/2] mm: fincore()

Provide a syscall to determine whether a given file's pages are cached
in memory.  This is more elegant than mmapping the file for the sole
purpose of using mincore(), and also works on NOMMU.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/syscalls.h |   2 +
 mm/Makefile              |   2 +-
 mm/fincore.c             | 128 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100644 mm/fincore.c

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 313a8e0..3ceab2a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -897,4 +897,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 			 unsigned long idx1, unsigned long idx2);
 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_fincore(unsigned int fd, loff_t start, loff_t len,
+			    unsigned char __user * vec);
 #endif
diff --git a/mm/Makefile b/mm/Makefile
index 185a22b..221cdae 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o balloon_compaction.o \
-			   interval_tree.o $(mmu-y)
+			   interval_tree.o fincore.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/fincore.c b/mm/fincore.c
new file mode 100644
index 0000000..d504611
--- /dev/null
+++ b/mm/fincore.c
@@ -0,0 +1,128 @@
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+static long do_fincore(struct address_space *mapping, pgoff_t pgstart,
+		       unsigned long nr_pages, unsigned char *vec)
+{
+	pgoff_t pgend = pgstart + nr_pages;
+	struct radix_tree_iter iter;
+	void **slot;
+	long nr = 0;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, pgstart) {
+		unsigned char present;
+		struct page *page;
+
+		/* Handle holes */
+		if (iter.index != pgstart + nr) {
+			if (iter.index < pgend)
+				nr_pages = iter.index - pgstart;
+			break;
+		}
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			continue;
+
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				WARN_ON(iter.index);
+				goto restart;
+			}
+			present = 0;
+		} else {
+			if (!page_cache_get_speculative(page))
+				goto repeat;
+
+			/* Has the page moved? */
+			if (unlikely(page != *slot)) {
+				page_cache_release(page);
+				goto repeat;
+			}
+
+			present = PageUptodate(page);
+			page_cache_release(page);
+		}
+		vec[nr] = present;
+
+		if (++nr == nr_pages)
+			break;
+	}
+	rcu_read_unlock();
+
+	if (nr < nr_pages)
+		memset(vec + nr, 0, nr_pages - nr);
+
+	return nr_pages;
+}
+
+/*
+ * The fincore(2) system call.
+ *
+ * fincore() returns the memory residency status of the given file's
+ * pages, in the range [start, start + len].
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after fincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.
+ *
+ * return values:
+ *  zero    - success
+ *  -EBADF  - fd isn't a valid open file descriptor
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - start is not a multiple of PAGE_CACHE_SIZE
+ */
+SYSCALL_DEFINE4(fincore, unsigned int, fd, loff_t, start, loff_t, len,
+		unsigned char __user *, vec)
+{
+	unsigned long nr_pages;
+	pgoff_t pgstart;
+	struct fd f;
+	long ret;
+
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	pgstart = start >> PAGE_CACHE_SHIFT;
+	nr_pages = DIV_ROUND_UP(len, PAGE_CACHE_SIZE);
+
+	while (nr_pages) {
+		unsigned char tmp[64];
+
+		ret = do_fincore(f.file->f_mapping, pgstart,
+				 min(nr_pages, sizeof(tmp)), tmp);
+		if (ret <= 0)
+			break;
+
+		if (copy_to_user(vec, tmp, ret)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		nr_pages -= ret;
+		pgstart += ret;
+		vec += ret;
+		ret = 0;
+	}
+
+	fdput(f);
+
+	return ret;
+}
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [patch 2/2] x86-64: hook up fincore() syscall
  2013-02-11 22:12   ` Andrew Morton
  2013-02-12  5:44     ` Rusty Russell
  2013-02-15  6:34     ` [patch 1/2] mm: fincore() Johannes Weiner
@ 2013-02-15  6:35     ` Johannes Weiner
  2 siblings, 0 replies; 22+ messages in thread
From: Johannes Weiner @ 2013-02-15  6:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 arch/x86/syscalls/syscall_64.tbl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65d..c9ac047 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,7 @@
 311	64	process_vm_writev	sys_process_vm_writev
 312	common	kcmp			sys_kcmp
 313	common	finit_module		sys_finit_module
+314	common	fincore			sys_fincore
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15  6:34     ` [patch 1/2] mm: fincore() Johannes Weiner
@ 2013-02-15 20:39       ` David Miller
  2013-02-15 21:14       ` Andrew Morton
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 22+ messages in thread
From: David Miller @ 2013-02-15 20:39 UTC (permalink / raw)
  To: hannes; +Cc: akpm, rusty, linux-kernel, npiggin, stewart, linux-mm, linux-arch

From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 15 Feb 2013 01:34:50 -0500

> +	nr_pages = DIV_ROUND_UP(len, PAGE_CACHE_SIZE);

A small nit, maybe use PAGE_CACHE_ALIGN() here.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15  6:34     ` [patch 1/2] mm: fincore() Johannes Weiner
  2013-02-15 20:39       ` David Miller
@ 2013-02-15 21:14       ` Andrew Morton
  2013-02-15 22:28         ` Johannes Weiner
  2013-02-15 21:27       ` Andrew Morton
  2013-02-19 10:25       ` Simon Jeons
  3 siblings, 1 reply; 22+ messages in thread
From: Andrew Morton @ 2013-02-15 21:14 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Fri, 15 Feb 2013 01:34:50 -0500
Johannes Weiner <hannes@cmpxchg.org> wrote:

> On Mon, Feb 11, 2013 at 02:12:39PM -0800, Andrew Morton wrote:
> > Also, having to mmap the file to be able to query pagecache state is a
> > hack.  Whatever happened to the fincore() patch?
> 
> I don't know, but how about this one:

This appears to be remotely derived from Chris's original
(http://lwn.net/Articles/371540/).  The comments, at least ;) Some
mention in the changelog would be appropriate.

> Provide a syscall to determine whether a given file's pages are cached
> in memory.  This is more elegant than mmapping the file for the sole
> purpose of using mincore(), and also works on NOMMU.
> 

Obviously we'll be needing more than this at the appropriate time so
Michael can write the manpage.

Please provide a nice tools/testing/selftests/fincore/ along with this
code?

> --- /dev/null
> +++ b/mm/fincore.c
> @@ -0,0 +1,128 @@
> +#include <linux/syscalls.h>
> +#include <linux/pagemap.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +
> +static long do_fincore(struct address_space *mapping, pgoff_t pgstart,
> +		       unsigned long nr_pages, unsigned char *vec)
> +{
> +	pgoff_t pgend = pgstart + nr_pages;
> +	struct radix_tree_iter iter;
> +	void **slot;
> +	long nr = 0;
> +
> +	rcu_read_lock();
> +restart:
> +	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, pgstart) {
> +		unsigned char present;
> +		struct page *page;
> +
> +		/* Handle holes */
> +		if (iter.index != pgstart + nr) {
> +			if (iter.index < pgend)
> +				nr_pages = iter.index - pgstart;
> +			break;

This break looks odd - it terminates the entire function.  Am too lazy
to work out why ;)

> +		}
> +repeat:
> +		page = radix_tree_deref_slot(slot);
> +		if (unlikely(!page))
> +			continue;

Is a bug, isn't it?  Need to zero vec[nr].

> +		if (radix_tree_exception(page)) {
> +			if (radix_tree_deref_retry(page)) {
> +				/*
> +				 * Transient condition which can only trigger
> +				 * when entry at index 0 moves out of or back
> +				 * to root: none yet gotten, safe to restart.
> +				 */
> +				WARN_ON(iter.index);
> +				goto restart;
> +			}
> +			present = 0;
> +		} else {
> +			if (!page_cache_get_speculative(page))
> +				goto repeat;
> +
> +			/* Has the page moved? */
> +			if (unlikely(page != *slot)) {
> +				page_cache_release(page);
> +				goto repeat;
> +			}
> +
> +			present = PageUptodate(page);

hm, OK, so we assume that test_bit() returns 1 or 0 and not just
"true".  That's OK, iirc.

Why does it have to be uptodate?  It could be present and under read()
IO.  That's "in core"?

> +			page_cache_release(page);
> +		}
> +		vec[nr] = present;
> +
> +		if (++nr == nr_pages)
> +			break;
> +	}
> +	rcu_read_unlock();
> +
> +	if (nr < nr_pages)
> +		memset(vec + nr, 0, nr_pages - nr);
> +
> +	return nr_pages;
> +}
> +
> +/*
> + * The fincore(2) system call.
> + *
> + * fincore() returns the memory residency status of the given file's
> + * pages, in the range [start, start + len].
> + * The status is returned in a vector of bytes.  The least significant
> + * bit of each byte is 1 if the referenced page is in memory, otherwise
> + * it is zero.

Yes, and there will be immediate calmour to add more goodies to the
other seven bits.  PageDirty, referenced state, etc.  We should think
about this now, at the design stage rather than grafting things on
later.

> + * Because the status of a page can change after fincore() checks it
> + * but before it returns to the application, the returned vector may
> + * contain stale information.
> + *
> + * return values:
> + *  zero    - success
> + *  -EBADF  - fd isn't a valid open file descriptor
> + *  -EFAULT - vec points to an illegal address
> + *  -EINVAL - start is not a multiple of PAGE_CACHE_SIZE
> + */
> +SYSCALL_DEFINE4(fincore, unsigned int, fd, loff_t, start, loff_t, len,
> +		unsigned char __user *, vec)
> +{
> +	unsigned long nr_pages;
> +	pgoff_t pgstart;
> +	struct fd f;
> +	long ret;
> +
> +	if (start & ~PAGE_CACHE_MASK)
> +		return -EINVAL;

This restriction appears to be unnecessary?

> +	f = fdget(fd);
> +	if (!f.file)
> +		return -EBADF;

I fear what happens if we run this syscall against a random fd from
/dev/some-gizmo.  Suggest adding tests for S_ISREG and non-null ->mapping.

> +	pgstart = start >> PAGE_CACHE_SHIFT;
> +	nr_pages = DIV_ROUND_UP(len, PAGE_CACHE_SIZE);
> +
> +	while (nr_pages) {
> +		unsigned char tmp[64];
> +
> +		ret = do_fincore(f.file->f_mapping, pgstart,
> +				 min(nr_pages, sizeof(tmp)), tmp);
> +		if (ret <= 0)
> +			break;
> +
> +		if (copy_to_user(vec, tmp, ret)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		nr_pages -= ret;
> +		pgstart += ret;
> +		vec += ret;
> +		ret = 0;
> +	}
> +
> +	fdput(f);
> +
> +	return ret;
> +}


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15  6:34     ` [patch 1/2] mm: fincore() Johannes Weiner
  2013-02-15 20:39       ` David Miller
  2013-02-15 21:14       ` Andrew Morton
@ 2013-02-15 21:27       ` Andrew Morton
  2013-02-15 23:13         ` Johannes Weiner
  2013-02-19 10:25       ` Simon Jeons
  3 siblings, 1 reply; 22+ messages in thread
From: Andrew Morton @ 2013-02-15 21:27 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Fri, 15 Feb 2013 01:34:50 -0500
Johannes Weiner <hannes@cmpxchg.org> wrote:

> + * The status is returned in a vector of bytes.  The least significant
> + * bit of each byte is 1 if the referenced page is in memory, otherwise
> + * it is zero.

Also, this is going to be dreadfully inefficient for some obvious cases.

We could address that by returning the info in some more efficient
representation.  That will be run-length encoded in some fashion.

The obvious way would be to populate an array of

struct page_status {
	u32 present:1;
	u32 count:31;
};

or whatever.

Another way would be to define the syscall so it returns "number of
pages present/absent starting at offset `start'".  In other words, one
call to fincore() will return a single `struct page_status'.  Userspace
can then walk through the file and generate the full picture, if needed.


This also gets inefficient in obvious cases, but it's not as obviously
bad?


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15 21:14       ` Andrew Morton
@ 2013-02-15 22:28         ` Johannes Weiner
  2013-02-15 22:34           ` Andrew Morton
  0 siblings, 1 reply; 22+ messages in thread
From: Johannes Weiner @ 2013-02-15 22:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Fri, Feb 15, 2013 at 01:14:51PM -0800, Andrew Morton wrote:
> On Fri, 15 Feb 2013 01:34:50 -0500
> Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> > On Mon, Feb 11, 2013 at 02:12:39PM -0800, Andrew Morton wrote:
> > > Also, having to mmap the file to be able to query pagecache state is a
> > > hack.  Whatever happened to the fincore() patch?
> > 
> > I don't know, but how about this one:
> 
> This appears to be remotely derived from Chris's original
> (http://lwn.net/Articles/371540/).  The comments, at least ;) Some
> mention in the changelog would be appropriate.

It's actually copy-pasted from mm/mincore.c, but we both had the same
idea of sorting the error codes by numeric value.  Funny.  I'll
mention him.

> > Provide a syscall to determine whether a given file's pages are cached
> > in memory.  This is more elegant than mmapping the file for the sole
> > purpose of using mincore(), and also works on NOMMU.
> > 
> 
> Obviously we'll be needing more than this at the appropriate time so
> Michael can write the manpage.
> 
> Please provide a nice tools/testing/selftests/fincore/ along with this
> code?

Will do.

> > --- /dev/null
> > +++ b/mm/fincore.c
> > @@ -0,0 +1,128 @@
> > +#include <linux/syscalls.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/file.h>
> > +#include <linux/fs.h>
> > +#include <linux/mm.h>
> > +
> > +static long do_fincore(struct address_space *mapping, pgoff_t pgstart,
> > +		       unsigned long nr_pages, unsigned char *vec)
> > +{
> > +	pgoff_t pgend = pgstart + nr_pages;
> > +	struct radix_tree_iter iter;
> > +	void **slot;
> > +	long nr = 0;
> > +
> > +	rcu_read_lock();
> > +restart:
> > +	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, pgstart) {
> > +		unsigned char present;
> > +		struct page *page;
> > +
> > +		/* Handle holes */
> > +		if (iter.index != pgstart + nr) {
> > +			if (iter.index < pgend)
> > +				nr_pages = iter.index - pgstart;
> > +			break;
> 
> This break looks odd - it terminates the entire function.  Am too lazy
> to work out why ;)

"Hole" and "no more pages in that file" share the same code to zero
out the vector.

> > +		}
> > +repeat:
> > +		page = radix_tree_deref_slot(slot);
> > +		if (unlikely(!page))
> > +			continue;
> 
> Is a bug, isn't it?  Need to zero vec[nr].

It works but I'll make it less awkward.  The continue will not
increase nr, so the next page lookup will trigger the hole detection
above and zero vec[nr].

> > +		if (radix_tree_exception(page)) {
> > +			if (radix_tree_deref_retry(page)) {
> > +				/*
> > +				 * Transient condition which can only trigger
> > +				 * when entry at index 0 moves out of or back
> > +				 * to root: none yet gotten, safe to restart.
> > +				 */
> > +				WARN_ON(iter.index);
> > +				goto restart;
> > +			}
> > +			present = 0;
> > +		} else {
> > +			if (!page_cache_get_speculative(page))
> > +				goto repeat;
> > +
> > +			/* Has the page moved? */
> > +			if (unlikely(page != *slot)) {
> > +				page_cache_release(page);
> > +				goto repeat;
> > +			}
> > +
> > +			present = PageUptodate(page);
> 
> hm, OK, so we assume that test_bit() returns 1 or 0 and not just
> "true".  That's OK, iirc.

Metoo.

> Why does it have to be uptodate?  It could be present and under read()
> IO.  That's "in core"?

I always thought of this as data-residency, i.e. the presence of
pages, not page frames, so I wouldn't consider pages in-core when they
are still in-flight.  mincore has the same notion.

> > +			page_cache_release(page);
> > +		}
> > +		vec[nr] = present;
> > +
> > +		if (++nr == nr_pages)
> > +			break;
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	if (nr < nr_pages)
> > +		memset(vec + nr, 0, nr_pages - nr);
> > +
> > +	return nr_pages;
> > +}
> > +
> > +/*
> > + * The fincore(2) system call.
> > + *
> > + * fincore() returns the memory residency status of the given file's
> > + * pages, in the range [start, start + len].
> > + * The status is returned in a vector of bytes.  The least significant
> > + * bit of each byte is 1 if the referenced page is in memory, otherwise
> > + * it is zero.
> 
> Yes, and there will be immediate calmour to add more goodies to the
> other seven bits.  PageDirty, referenced state, etc.  We should think
> about this now, at the design stage rather than grafting things on
> later.

I'm interested in your "etc.".  PG_error, PG_active, PG_writeback,
page huge?

> > + * Because the status of a page can change after fincore() checks it
> > + * but before it returns to the application, the returned vector may
> > + * contain stale information.
> > + *
> > + * return values:
> > + *  zero    - success
> > + *  -EBADF  - fd isn't a valid open file descriptor
> > + *  -EFAULT - vec points to an illegal address
> > + *  -EINVAL - start is not a multiple of PAGE_CACHE_SIZE
> > + */
> > +SYSCALL_DEFINE4(fincore, unsigned int, fd, loff_t, start, loff_t, len,
> > +		unsigned char __user *, vec)
> > +{
> > +	unsigned long nr_pages;
> > +	pgoff_t pgstart;
> > +	struct fd f;
> > +	long ret;
> > +
> > +	if (start & ~PAGE_CACHE_MASK)
> > +		return -EINVAL;
> 
> This restriction appears to be unnecessary?

I thought about it too, but whether the kernel rounds or not, you need
to know the page size and do the rounding in userspace in order to
supply an appropriately sized vector.  And then I just copied mincore
semantics for consistency ;-)

> > +	f = fdget(fd);
> > +	if (!f.file)
> > +		return -EBADF;
> 
> I fear what happens if we run this syscall against a random fd from
> /dev/some-gizmo.  Suggest adding tests for S_ISREG and non-null ->mapping.

Good idea, I'll add this.

Thanks!

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15 22:28         ` Johannes Weiner
@ 2013-02-15 22:34           ` Andrew Morton
  0 siblings, 0 replies; 22+ messages in thread
From: Andrew Morton @ 2013-02-15 22:34 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Fri, 15 Feb 2013 17:28:03 -0500
Johannes Weiner <hannes@cmpxchg.org> wrote:

> > Yes, and there will be immediate calmour to add more goodies to the
> > other seven bits.  PageDirty, referenced state, etc.  We should think
> > about this now, at the design stage rather than grafting things on
> > later.
> 
> I'm interested in your "etc.".  PG_error, PG_active, PG_writeback,
> page huge?

Gawd knows.  How many crazy people are there out there?

If we adopt my use-runlength-encoding suggestion then things get
easier.  We add an extra arg to the syscall which selects which
particular per-page boolean we're looking for and can gather up to 4
billion different PageFoo()s.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15 21:27       ` Andrew Morton
@ 2013-02-15 23:13         ` Johannes Weiner
  2013-02-15 23:42           ` Andrew Morton
  0 siblings, 1 reply; 22+ messages in thread
From: Johannes Weiner @ 2013-02-15 23:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Fri, Feb 15, 2013 at 01:27:38PM -0800, Andrew Morton wrote:
> On Fri, 15 Feb 2013 01:34:50 -0500
> Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> > + * The status is returned in a vector of bytes.  The least significant
> > + * bit of each byte is 1 if the referenced page is in memory, otherwise
> > + * it is zero.
> 
> Also, this is going to be dreadfully inefficient for some obvious cases.
> 
> We could address that by returning the info in some more efficient
> representation.  That will be run-length encoded in some fashion.
> 
> The obvious way would be to populate an array of
> 
> struct page_status {
> 	u32 present:1;
> 	u32 count:31;
> };
> 
> or whatever.

I'm having a hard time seeing how this could be extended to more
status bits without stifling the optimization too much.  If we just
add more status bits to one page_status, the likelihood of long runs
where all bits are in agreement decreases.  But as the optimization
becomes less and less effective, we are stuck with an interface that
is more PITA than just using mmap and mincore again.

The user has to supply a worst-case-sized vector with one struct
page_status per page in the range, but the per-page item will be
bigger than with the byte vector because of the additional run length
variable.

> Another way would be to define the syscall so it returns "number of
> pages present/absent starting at offset `start'".  In other words, one
> call to fincore() will return a single `struct page_status'.  Userspace
> can then walk through the file and generate the full picture, if needed.
> 
> This also gets inefficient in obvious cases, but it's not as obviously
> bad?

Any run-length encoding will have a problem with multiple status bits,
I guess.

Maybe with a mask of bits the user is interested in?

struct page_status {
	unsigned long states;
	unsigned long count;
};

int fincore(int fd, loff_t start, loff_t len,
            unsigned long states_mask,
            struct page_status *status)

However, one struct page_status per run leaves you with a worst case
of one syscall per page in the range.

I dunno.  The byte vector might not be optimal but its worst cases
seem more attractive, is just as extensible, and dead simple to use.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15 23:13         ` Johannes Weiner
@ 2013-02-15 23:42           ` Andrew Morton
  2013-02-16  4:23             ` Rusty Russell
  2013-02-18  5:41             ` Rusty Russell
  0 siblings, 2 replies; 22+ messages in thread
From: Andrew Morton @ 2013-02-15 23:42 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Rusty Russell, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Fri, 15 Feb 2013 18:13:04 -0500
Johannes Weiner <hannes@cmpxchg.org> wrote:

> On Fri, Feb 15, 2013 at 01:27:38PM -0800, Andrew Morton wrote:
> > On Fri, 15 Feb 2013 01:34:50 -0500
> > Johannes Weiner <hannes@cmpxchg.org> wrote:
> > 
> > > + * The status is returned in a vector of bytes.  The least significant
> > > + * bit of each byte is 1 if the referenced page is in memory, otherwise
> > > + * it is zero.
> > 
> > Also, this is going to be dreadfully inefficient for some obvious cases.
> > 
> > We could address that by returning the info in some more efficient
> > representation.  That will be run-length encoded in some fashion.
> > 
> > The obvious way would be to populate an array of
> > 
> > struct page_status {
> > 	u32 present:1;
> > 	u32 count:31;
> > };
> > 
> > or whatever.
> 
> I'm having a hard time seeing how this could be extended to more
> status bits without stifling the optimization too much.

See other email: add a syscall arg which specifies the boolean status
which we're searching for.

>  If we just
> add more status bits to one page_status, the likelihood of long runs
> where all bits are in agreement decreases.  But as the optimization
> becomes less and less effective, we are stuck with an interface that
> is more PITA than just using mmap and mincore again.
> 
> The user has to supply a worst-case-sized vector with one struct
> page_status per page in the range, but the per-page item will be
> bigger than with the byte vector because of the additional run length
> variable.

Yes, we'd need to tell the kernel how much storage is available for the
structures.

> However, one struct page_status per run leaves you with a worst case
> of one syscall per page in the range.

Yes.

> I dunno.  The byte vector might not be optimal but its worst cases
> seem more attractive, is just as extensible, and dead simple to use.

But I think "which pages from this 4TB file are in core" will not be an
uncommon usage, and writing a gig of memory to find three pages is just
awful.

I wonder what the most common usage would be (one should know this
before merging the syscall :)).  I guess "is this relatively-small
range of the file in core" and/or "which pages from this
relatively-small range of the file will I need to read", etc.

The syscall should handle the common usages very well.  But it
shouldn't handle uncommon usages very badly!

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15 23:42           ` Andrew Morton
@ 2013-02-16  4:23             ` Rusty Russell
  2013-02-17 22:51               ` Johannes Weiner
                                 ` (2 more replies)
  2013-02-18  5:41             ` Rusty Russell
  1 sibling, 3 replies; 22+ messages in thread
From: Rusty Russell @ 2013-02-16  4:23 UTC (permalink / raw)
  To: Andrew Morton, Johannes Weiner
  Cc: LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

Andrew Morton <akpm@linux-foundation.org> writes:
> On Fri, 15 Feb 2013 18:13:04 -0500
> Johannes Weiner <hannes@cmpxchg.org> wrote:
>> I dunno.  The byte vector might not be optimal but its worst cases
>> seem more attractive, is just as extensible, and dead simple to use.
>
> But I think "which pages from this 4TB file are in core" will not be an
> uncommon usage, and writing a gig of memory to find three pages is just
> awful.

Actually, I don't know of any usage for this call.

I'd really like to use it for backup programs, so they stop pulling
random crap into memory (but leave things already resident).  But that
needs to madvise(MADV_DONTNEED) on the page, so need mmap.

So why not just use mincore?

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-16  4:23             ` Rusty Russell
@ 2013-02-17 22:51               ` Johannes Weiner
  2013-02-17 22:54               ` Andrew Morton
  2013-05-29 14:53               ` Andres Freund
  2 siblings, 0 replies; 22+ messages in thread
From: Johannes Weiner @ 2013-02-17 22:51 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Sat, Feb 16, 2013 at 02:53:43PM +1030, Rusty Russell wrote:
> Andrew Morton <akpm@linux-foundation.org> writes:
> > On Fri, 15 Feb 2013 18:13:04 -0500
> > Johannes Weiner <hannes@cmpxchg.org> wrote:
> >> I dunno.  The byte vector might not be optimal but its worst cases
> >> seem more attractive, is just as extensible, and dead simple to use.
> >
> > But I think "which pages from this 4TB file are in core" will not be an
> > uncommon usage, and writing a gig of memory to find three pages is just
> > awful.
> 
> Actually, I don't know of any usage for this call.
> 
> I'd really like to use it for backup programs, so they stop pulling
> random crap into memory (but leave things already resident).  But that
> needs to madvise(MADV_DONTNEED) on the page, so need mmap.

We do actually have fadvise() (posix_fadvise()).

Btw, why are we not invalidating page cache from MADV_DONTNEED?  I
just see a page table teardown in there, so mmap for madvise alone
won't do any good.  fadvise() OTOTH /does/ invalidate page cache.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-16  4:23             ` Rusty Russell
  2013-02-17 22:51               ` Johannes Weiner
@ 2013-02-17 22:54               ` Andrew Morton
  2013-05-29 14:53               ` Andres Freund
  2 siblings, 0 replies; 22+ messages in thread
From: Andrew Morton @ 2013-02-17 22:54 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Johannes Weiner, LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

On Sat, 16 Feb 2013 14:53:43 +1030 Rusty Russell <rusty@rustcorp.com.au> wrote:

> Andrew Morton <akpm@linux-foundation.org> writes:
> > On Fri, 15 Feb 2013 18:13:04 -0500
> > Johannes Weiner <hannes@cmpxchg.org> wrote:
> >> I dunno.  The byte vector might not be optimal but its worst cases
> >> seem more attractive, is just as extensible, and dead simple to use.
> >
> > But I think "which pages from this 4TB file are in core" will not be an
> > uncommon usage, and writing a gig of memory to find three pages is just
> > awful.
> 
> Actually, I don't know of any usage for this call.

That's good news ;)

We shouldn't add it unless there's damn good reason.

> I'd really like to use it for backup programs, so they stop pulling
> random crap into memory (but leave things already resident).  But that
> needs to madvise(MADV_DONTNEED) on the page, so need mmap.
>
> So why not just use mincore?

One can use fadvise(POSIX_FADV_DONTNEED) to drop the pages.

Or toss your backup app into a small memcg so it reclaims its own
stuff.  See recent thread "mm: fadvise: Drain all pagevecs if
POSIX_FADV_DONTNEED fails to discard all pages"

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15 23:42           ` Andrew Morton
  2013-02-16  4:23             ` Rusty Russell
@ 2013-02-18  5:41             ` Rusty Russell
  1 sibling, 0 replies; 22+ messages in thread
From: Rusty Russell @ 2013-02-18  5:41 UTC (permalink / raw)
  To: Andrew Morton, Johannes Weiner
  Cc: LKML, Nick Piggin, Stewart Smith, linux-mm, linux-arch

Andrew Morton <akpm@linux-foundation.org> writes:
> The syscall should handle the common usages very well.  But it
> shouldn't handle uncommon usages very badly!

If the user is actually dealing with the contents of the file, following
the established mincore is preferred, since it's in the noise anyway.

Which comes back to needing a user; I'll see what I can come up with.

Cheers,
Rusty.



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-15  6:34     ` [patch 1/2] mm: fincore() Johannes Weiner
                         ` (2 preceding siblings ...)
  2013-02-15 21:27       ` Andrew Morton
@ 2013-02-19 10:25       ` Simon Jeons
  3 siblings, 0 replies; 22+ messages in thread
From: Simon Jeons @ 2013-02-19 10:25 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Rusty Russell, LKML, Nick Piggin, Stewart Smith,
	linux-mm, linux-arch

Hi Johannes,
On 02/15/2013 02:34 PM, Johannes Weiner wrote:
> On Mon, Feb 11, 2013 at 02:12:39PM -0800, Andrew Morton wrote:
>> Also, having to mmap the file to be able to query pagecache state is a
>> hack.  Whatever happened to the fincore() patch?
> I don't know, but how about this one:
>
> ---
> From: Johannes Weiner <hannes@cmpxchg.org>
> Subject: [patch 1/2] mm: fincore()
>
> Provide a syscall to determine whether a given file's pages are cached
> in memory.  This is more elegant than mmapping the file for the sole
> purpose of using mincore(), and also works on NOMMU.

Who is the user of mincore()/fincore()? In which scenario user processes 
need to know their pages are resident in memory or not?

> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>   include/linux/syscalls.h |   2 +
>   mm/Makefile              |   2 +-
>   mm/fincore.c             | 128 +++++++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 131 insertions(+), 1 deletion(-)
>   create mode 100644 mm/fincore.c
>
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 313a8e0..3ceab2a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -897,4 +897,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>   asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>   			 unsigned long idx1, unsigned long idx2);
>   asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
> +asmlinkage long sys_fincore(unsigned int fd, loff_t start, loff_t len,
> +			    unsigned char __user * vec);
>   #endif
> diff --git a/mm/Makefile b/mm/Makefile
> index 185a22b..221cdae 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -17,7 +17,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
>   			   util.o mmzone.o vmstat.o backing-dev.o \
>   			   mm_init.o mmu_context.o percpu.o slab_common.o \
>   			   compaction.o balloon_compaction.o \
> -			   interval_tree.o $(mmu-y)
> +			   interval_tree.o fincore.o $(mmu-y)
>   
>   obj-y += init-mm.o
>   
> diff --git a/mm/fincore.c b/mm/fincore.c
> new file mode 100644
> index 0000000..d504611
> --- /dev/null
> +++ b/mm/fincore.c
> @@ -0,0 +1,128 @@
> +#include <linux/syscalls.h>
> +#include <linux/pagemap.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +
> +static long do_fincore(struct address_space *mapping, pgoff_t pgstart,
> +		       unsigned long nr_pages, unsigned char *vec)
> +{
> +	pgoff_t pgend = pgstart + nr_pages;
> +	struct radix_tree_iter iter;
> +	void **slot;
> +	long nr = 0;
> +
> +	rcu_read_lock();
> +restart:
> +	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, pgstart) {
> +		unsigned char present;
> +		struct page *page;
> +
> +		/* Handle holes */
> +		if (iter.index != pgstart + nr) {
> +			if (iter.index < pgend)
> +				nr_pages = iter.index - pgstart;
> +			break;
> +		}
> +repeat:
> +		page = radix_tree_deref_slot(slot);
> +		if (unlikely(!page))
> +			continue;
> +
> +		if (radix_tree_exception(page)) {
> +			if (radix_tree_deref_retry(page)) {
> +				/*
> +				 * Transient condition which can only trigger
> +				 * when entry at index 0 moves out of or back
> +				 * to root: none yet gotten, safe to restart.
> +				 */
> +				WARN_ON(iter.index);
> +				goto restart;
> +			}
> +			present = 0;
> +		} else {
> +			if (!page_cache_get_speculative(page))
> +				goto repeat;
> +
> +			/* Has the page moved? */
> +			if (unlikely(page != *slot)) {
> +				page_cache_release(page);
> +				goto repeat;
> +			}
> +
> +			present = PageUptodate(page);
> +			page_cache_release(page);
> +		}
> +		vec[nr] = present;
> +
> +		if (++nr == nr_pages)
> +			break;
> +	}
> +	rcu_read_unlock();
> +
> +	if (nr < nr_pages)
> +		memset(vec + nr, 0, nr_pages - nr);
> +
> +	return nr_pages;
> +}
> +
> +/*
> + * The fincore(2) system call.
> + *
> + * fincore() returns the memory residency status of the given file's
> + * pages, in the range [start, start + len].
> + * The status is returned in a vector of bytes.  The least significant
> + * bit of each byte is 1 if the referenced page is in memory, otherwise
> + * it is zero.
> + *
> + * Because the status of a page can change after fincore() checks it
> + * but before it returns to the application, the returned vector may
> + * contain stale information.
> + *
> + * return values:
> + *  zero    - success
> + *  -EBADF  - fd isn't a valid open file descriptor
> + *  -EFAULT - vec points to an illegal address
> + *  -EINVAL - start is not a multiple of PAGE_CACHE_SIZE
> + */
> +SYSCALL_DEFINE4(fincore, unsigned int, fd, loff_t, start, loff_t, len,
> +		unsigned char __user *, vec)
> +{
> +	unsigned long nr_pages;
> +	pgoff_t pgstart;
> +	struct fd f;
> +	long ret;
> +
> +	if (start & ~PAGE_CACHE_MASK)
> +		return -EINVAL;
> +
> +	f = fdget(fd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	pgstart = start >> PAGE_CACHE_SHIFT;
> +	nr_pages = DIV_ROUND_UP(len, PAGE_CACHE_SIZE);
> +
> +	while (nr_pages) {
> +		unsigned char tmp[64];
> +
> +		ret = do_fincore(f.file->f_mapping, pgstart,
> +				 min(nr_pages, sizeof(tmp)), tmp);
> +		if (ret <= 0)
> +			break;
> +
> +		if (copy_to_user(vec, tmp, ret)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		nr_pages -= ret;
> +		pgstart += ret;
> +		vec += ret;
> +		ret = 0;
> +	}
> +
> +	fdput(f);
> +
> +	return ret;
> +}


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-02-16  4:23             ` Rusty Russell
  2013-02-17 22:51               ` Johannes Weiner
  2013-02-17 22:54               ` Andrew Morton
@ 2013-05-29 14:53               ` Andres Freund
  2013-05-29 17:32                 ` Johannes Weiner
  2 siblings, 1 reply; 22+ messages in thread
From: Andres Freund @ 2013-05-29 14:53 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Johannes Weiner, LKML, Nick Piggin, Stewart Smith,
	linux-mm, linux-arch

On 2013-02-16 14:53:43 +1030, Rusty Russell wrote:
> Andrew Morton <akpm@linux-foundation.org> writes:
> > On Fri, 15 Feb 2013 18:13:04 -0500
> > Johannes Weiner <hannes@cmpxchg.org> wrote:
> >> I dunno.  The byte vector might not be optimal but its worst cases
> >> seem more attractive, is just as extensible, and dead simple to use.
> >
> > But I think "which pages from this 4TB file are in core" will not be an
> > uncommon usage, and writing a gig of memory to find three pages is just
> > awful.
> 
> Actually, I don't know of any usage for this call.

[months later, catching up]

I do. Postgres' could really use something like that for making saner
assumptions about the cost of doing an index/heap scan. postgres doesn't
use mmap() and mmaping larger files into memory isn't all that cheap
(32bit...) so having fincore would be nice.

Greetings,

Andres Freund

-- 
 Andres Freund	                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-05-29 14:53               ` Andres Freund
@ 2013-05-29 17:32                 ` Johannes Weiner
  2013-05-29 17:52                   ` Andres Freund
  0 siblings, 1 reply; 22+ messages in thread
From: Johannes Weiner @ 2013-05-29 17:32 UTC (permalink / raw)
  To: Andres Freund
  Cc: Rusty Russell, Andrew Morton, LKML, Nick Piggin, Stewart Smith,
	linux-mm, linux-arch

On Wed, May 29, 2013 at 04:53:12PM +0200, Andres Freund wrote:
> On 2013-02-16 14:53:43 +1030, Rusty Russell wrote:
> > Andrew Morton <akpm@linux-foundation.org> writes:
> > > On Fri, 15 Feb 2013 18:13:04 -0500
> > > Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >> I dunno.  The byte vector might not be optimal but its worst cases
> > >> seem more attractive, is just as extensible, and dead simple to use.
> > >
> > > But I think "which pages from this 4TB file are in core" will not be an
> > > uncommon usage, and writing a gig of memory to find three pages is just
> > > awful.
> > 
> > Actually, I don't know of any usage for this call.
> 
> [months later, catching up]
> 
> I do. Postgres' could really use something like that for making saner
> assumptions about the cost of doing an index/heap scan. postgres doesn't
> use mmap() and mmaping larger files into memory isn't all that cheap
> (32bit...) so having fincore would be nice.

How much of the areas you want to use it against is usually cached?
I.e. are those 4TB files with 3 cached pages?

I do wonder if we should just have two separate interfaces.  Ugly, but
I don't really see how the two requirements (dense but many holes
vs. huge sparse areas) could be acceptably met with one interface.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/2] mm: fincore()
  2013-05-29 17:32                 ` Johannes Weiner
@ 2013-05-29 17:52                   ` Andres Freund
  0 siblings, 0 replies; 22+ messages in thread
From: Andres Freund @ 2013-05-29 17:52 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Rusty Russell, Andrew Morton, LKML, Nick Piggin, Stewart Smith,
	linux-mm, linux-arch

On 2013-05-29 13:32:23 -0400, Johannes Weiner wrote:
> On Wed, May 29, 2013 at 04:53:12PM +0200, Andres Freund wrote:
> > On 2013-02-16 14:53:43 +1030, Rusty Russell wrote:
> > > Andrew Morton <akpm@linux-foundation.org> writes:
> > > > On Fri, 15 Feb 2013 18:13:04 -0500
> > > > Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >> I dunno.  The byte vector might not be optimal but its worst cases
> > > >> seem more attractive, is just as extensible, and dead simple to use.
> > > >
> > > > But I think "which pages from this 4TB file are in core" will not be an
> > > > uncommon usage, and writing a gig of memory to find three pages is just
> > > > awful.
> > > 
> > > Actually, I don't know of any usage for this call.
> > 
> > [months later, catching up]
> > 
> > I do. Postgres' could really use something like that for making saner
> > assumptions about the cost of doing an index/heap scan. postgres doesn't
> > use mmap() and mmaping larger files into memory isn't all that cheap
> > (32bit...) so having fincore would be nice.

> How much of the areas you want to use it against is usually cached?
> I.e. are those 4TB files with 3 cached pages?

Hard to say in general. The point is exactly that we don't know. If
there's nothing of a large index in memory and we estimate that we want
20% of a table we sure won't do an indexscan. If its all in memory?
Different story.
For that usecase its not actually important that we get a 100% accurate
result although I, from my limited understanding, don't really see that
helping much.

(Yes, there are some problems with cache warming here)

> I do wonder if we should just have two separate interfaces.  Ugly, but
> I don't really see how the two requirements (dense but many holes
> vs. huge sparse areas) could be acceptably met with one interface.

The difference would be how the information would be encoded, right? Not
sure how the passed in memory could be sized in some run length encoded
scheme. What I could imagine is specifying the granularity we want
information about, but thats probably too specific.

Greetings,

Andres Freund

-- 
 Andres Freund	                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2013-05-29 17:52 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-11  3:13 RFC: mincore: add a bit to indicate a page is dirty Rusty Russell
2013-02-11 16:27 ` Johannes Weiner
2013-02-11 22:12   ` Andrew Morton
2013-02-12  5:44     ` Rusty Russell
2013-02-15  6:34     ` [patch 1/2] mm: fincore() Johannes Weiner
2013-02-15 20:39       ` David Miller
2013-02-15 21:14       ` Andrew Morton
2013-02-15 22:28         ` Johannes Weiner
2013-02-15 22:34           ` Andrew Morton
2013-02-15 21:27       ` Andrew Morton
2013-02-15 23:13         ` Johannes Weiner
2013-02-15 23:42           ` Andrew Morton
2013-02-16  4:23             ` Rusty Russell
2013-02-17 22:51               ` Johannes Weiner
2013-02-17 22:54               ` Andrew Morton
2013-05-29 14:53               ` Andres Freund
2013-05-29 17:32                 ` Johannes Weiner
2013-05-29 17:52                   ` Andres Freund
2013-02-18  5:41             ` Rusty Russell
2013-02-19 10:25       ` Simon Jeons
2013-02-15  6:35     ` [patch 2/2] x86-64: hook up fincore() syscall Johannes Weiner
2013-02-12  5:49   ` RFC: mincore: add a bit to indicate a page is dirty Rusty Russell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).