* [PATCH] Fix vmtruncate race and distributed filesystem race
@ 2003-06-12 20:16 Dave McCracken
  2003-06-12 20:49   ` Andrew Morton
  0 siblings, 1 reply; 28+ messages in thread
From: Dave McCracken @ 2003-06-12 20:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Memory Management, Linux Kernel

[-- Attachment #1: Type: text/plain, Size: 706 bytes --]


Paul McKenney and I sat down today and hashed out just what the races are
for both  vmtruncate and the distributed filesystems.  We took Andrea's
idea of using seqlocks and came up with a simple solution that definitely
fixes the race in vmtruncate, as well as most likely the invalidate race in
distributed filesystems.  Paul is going to discuss it with the DFS folks to
verify that it's a complete fix for them, but neither of us can see a hole.

Anyway, here's the patch.

Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059

[-- Attachment #2: trunc-2.5.70-mm8-1.diff --]
[-- Type: text/plain, Size: 4176 bytes --]

--- 2.5.70-mm8/./drivers/mtd/devices/blkmtd.c	2003-05-26 20:00:21.000000000 -0500
+++ 2.5.70-mm8-trunc/./drivers/mtd/devices/blkmtd.c	2003-06-12 13:45:48.000000000 -0500
@@ -1189,6 +1189,7 @@ static int __init init_blkmtd(void)
   INIT_LIST_HEAD(&mtd_rawdevice->as.locked_pages);
   mtd_rawdevice->as.host = NULL;
   init_MUTEX(&(mtd_rawdevice->as.i_shared_sem));
+  seqlock_init(&(mtd_rawdevice->as.truncate_lock));
 
   mtd_rawdevice->as.a_ops = &blkmtd_aops;
   INIT_LIST_HEAD(&mtd_rawdevice->as.i_mmap);
--- 2.5.70-mm8/./include/linux/fs.h	2003-06-12 13:37:28.000000000 -0500
+++ 2.5.70-mm8-trunc/./include/linux/fs.h	2003-06-12 13:42:51.000000000 -0500
@@ -19,6 +19,7 @@
 #include <linux/cache.h>
 #include <linux/radix-tree.h>
 #include <linux/kobject.h>
+#include <linux/seqlock.h>
 #include <asm/atomic.h>
 
 struct iovec;
@@ -323,6 +324,7 @@ struct address_space {
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
+	seqlock_t		truncate_lock;	/* Cover race condition with truncate */
 	unsigned long		dirtied_when;	/* jiffies of first page dirtying */
 	int			gfp_mask;	/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
--- 2.5.70-mm8/./fs/inode.c	2003-06-12 13:37:22.000000000 -0500
+++ 2.5.70-mm8-trunc/./fs/inode.c	2003-06-12 13:43:29.000000000 -0500
@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
+	seqlock_init(&inode->i_data.truncate_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
--- 2.5.70-mm8/./mm/swap_state.c	2003-05-26 20:00:39.000000000 -0500
+++ 2.5.70-mm8-trunc/./mm/swap_state.c	2003-06-12 13:40:54.000000000 -0500
@@ -44,6 +44,7 @@ struct address_space swapper_space = {
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
 	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
+	.truncate_lock  = SEQLOCK_UNLOCKED,
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };
--- 2.5.70-mm8/./mm/memory.c	2003-06-12 13:37:31.000000000 -0500
+++ 2.5.70-mm8-trunc/./mm/memory.c	2003-06-12 13:40:54.000000000 -0500
@@ -1138,6 +1138,9 @@ invalidate_mmap_range(struct address_spa
 			hlen = ULONG_MAX - hba + 1;
 	}
 	down(&mapping->i_shared_sem);
+	/* Protect against page fault */
+	write_seqlock(&mapping->truncate_lock);
+	write_sequnlock(&mapping->truncate_lock);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
 		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
@@ -1390,8 +1393,11 @@ do_no_page(struct mm_struct *mm, struct 
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct inode *inode;
+	struct address_space *mapping;
 	pte_t entry;
 	struct pte_chain *pte_chain;
+	unsigned sequence;
 	int ret;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -1400,6 +1406,10 @@ do_no_page(struct mm_struct *mm, struct 
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
+	inode = vma->vm_file->f_dentry->d_inode;
+	mapping = inode->i_mapping;
+retry:
+	sequence = read_seqbegin(&mapping->truncate_lock);
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 
 	/* no page was available -- either SIGBUS or OOM */
@@ -1428,6 +1438,17 @@ do_no_page(struct mm_struct *mm, struct 
 	}
 
 	spin_lock(&mm->page_table_lock);
+	/*
+	 * If someone invalidated the page, serialize against the inode,
+	 * then go try again.
+	 */
+	if (unlikely(read_seqretry(&mapping->truncate_lock, sequence))) {
+		spin_unlock(&mm->page_table_lock);
+		down(&inode->i_sem);
+		up(&inode->i_sem);
+		goto retry;
+	}
+
 	page_table = pte_offset_map(pmd, address);
 
 	/*

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-12 20:16 [PATCH] Fix vmtruncate race and distributed filesystem race Dave McCracken
@ 2003-06-12 20:49   ` Andrew Morton
  0 siblings, 0 replies; 28+ messages in thread
From: Andrew Morton @ 2003-06-12 20:49 UTC (permalink / raw)
  To: Dave McCracken; +Cc: linux-mm, linux-kernel

Dave McCracken <dmccr@us.ibm.com> wrote:
>
> 
> Paul McKenney and I sat down today and hashed out just what the races are
> for both  vmtruncate and the distributed filesystems.  We took Andrea's
> idea of using seqlocks and came up with a simple solution that definitely
> fixes the race in vmtruncate, as well as most likely the invalidate race in
> distributed filesystems.  Paul is going to discuss it with the DFS folks to
> verify that it's a complete fix for them, but neither of us can see a hole.
> 

> +  seqlock_init(&(mtd_rawdevice->as.truncate_lock));

Why cannot this just be an atomic_t?

> +	/* Protect against page fault */
> +	write_seqlock(&mapping->truncate_lock);
> +	write_sequnlock(&mapping->truncate_lock);

See, this just does foo++.
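
(For reference, the classic seqlock write side is essentially the
following -- a sketch based on the 2.5-era include/linux/seqlock.h,
which may differ in detail:

static inline void write_seqlock(seqlock_t *sl)
{
	spin_lock(&sl->lock);
	++sl->sequence;		/* count goes odd: readers will retry */
	smp_wmb();
}

static inline void write_sequnlock(seqlock_t *sl)
{
	smp_wmb();
	sl->sequence++;		/* back to even, but different */
	spin_unlock(&sl->lock);
}

so a back-to-back write_seqlock()/write_sequnlock() with an empty
critical section is just "sequence += 2" under a spinlock.)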

> +	/*
> +	 * If someone invalidated the page, serialize against the inode,
> +	 * then go try again.
> +	 */

This comment is inaccurate.  "If this vma is file-backed and someone has
truncated that file, this page may have been invalidated".

> +	if (unlikely(read_seqretry(&mapping->truncate_lock, sequence))) {
> +		spin_unlock(&mm->page_table_lock);
> +		down(&inode->i_sem);
> +		up(&inode->i_sem);
> +		goto retry;
> +	}
> +

mm/memory.c shouldn't know about inodes (ok, vmtruncate() should be in
filemap.c).

How about you do this:

do_no_page()
{
	int sequence = 0;
	...

retry:
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &sequence);
	....
	if (vma->vm_ops->revalidate && vma->vm_ops->revalidate(vma, sequence))
		goto retry;
}


filemap_nopage(..., int *sequence)
{
	...
	*sequence = atomic_read(&mapping->truncate_sequence);
	...
}

int filemap_revalidate(vma, sequence)
{
	struct address_space *mapping;

	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
	return sequence - atomic_read(&mapping->truncate_sequence);
}



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-12 20:49   ` Andrew Morton
@ 2003-06-12 21:00     ` Andrew Morton
  -1 siblings, 0 replies; 28+ messages in thread
From: Andrew Morton @ 2003-06-12 21:00 UTC (permalink / raw)
  To: dmccr, linux-mm, linux-kernel

Andrew Morton <akpm@digeo.com> wrote:
>
> do_no_page()
> {
> 	int sequence = 0;
> 	...
> 
> retry:
> 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &sequence);
> 	....
> 	if (vma->vm_ops->revalidate && vma->vm_ops->revalidate(vma, sequence))
> 		goto retry;
> }

And this does require that ->nopage be entered with page_table_lock held,
and that it drop it.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-12 21:00     ` Andrew Morton
@ 2003-06-12 21:08       ` Dave McCracken
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave McCracken @ 2003-06-12 21:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, linux-kernel


--On Thursday, June 12, 2003 14:00:14 -0700 Andrew Morton <akpm@digeo.com>
wrote:

> And this does require that ->nopage be entered with page_table_lock held,
> and that it drop it.

I think that's a worse layer violation than referencing inode in
do_no_page.  We shouldn't require that the filesystem layer mess with the
page_table_lock.

Dave

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-12 21:08       ` Dave McCracken
@ 2003-06-12 21:44         ` Andrew Morton
  -1 siblings, 0 replies; 28+ messages in thread
From: Andrew Morton @ 2003-06-12 21:44 UTC (permalink / raw)
  To: Dave McCracken; +Cc: linux-mm, linux-kernel

Dave McCracken <dmccr@us.ibm.com> wrote:
>
> 
> --On Thursday, June 12, 2003 14:00:14 -0700 Andrew Morton <akpm@digeo.com>
> wrote:
> 
> > And this does require that ->nopage be entered with page_table_lock held,
> > and that it drop it.
> 
> I think that's a worse layer violation than referencing inode in
> do_no_page.  We shouldn't require that the filesystem layer mess with the
> page_table_lock.

Well it is not "worse".  Futzing with i_sem in do_no_page() is pretty gross.
You could add vm_ops->prevalidate() or something if it worries you.

btw, it should be synchronising with
file->f_dentry->d_inode->i_mapping->host->i_sem, not
file->f_dentry->d_inode->i_sem.  do_truncate() also seems to be taking the
(potentially) wrong semaphore.



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-12 21:44         ` Andrew Morton
@ 2003-06-12 22:56         ` Dave McCracken
  2003-06-12 23:07             ` Andrew Morton
  2003-06-20  0:17             ` Andrea Arcangeli
  -1 siblings, 2 replies; 28+ messages in thread
From: Dave McCracken @ 2003-06-12 22:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1124 bytes --]


--On Thursday, June 12, 2003 14:44:18 -0700 Andrew Morton <akpm@digeo.com>
wrote:

> Well it is not "worse".  Futzing with i_sem in do_no_page() is pretty
> gross. You could add vm_ops->prevalidate() or something if it worries you.

Actually I've been studying the logic.  I don't think we need to serialize
with i_sem at all.  i_size has already been changed, so just doing a retry
immediately will be safe.  Distributed filesystems should also be safe as
long as they mark the page invalid before they call invalidate_mmap_range().

I like your idea of doing an atomic_t instead of a seqlock.  The original
idea from Andrea implied there was a range of time it was unstable, but
with this scheme a single increment is sufficient.

I also think if we can solve both the vmtruncate and the distributed file
system races without adding any vm_ops, we should.

Here's a new patch.  Does this look better?

Dave

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059

[-- Attachment #2: trunc-2.5.70-mm8-2.diff --]
[-- Type: text/plain, Size: 3966 bytes --]

--- 2.5.70-mm8/./drivers/mtd/devices/blkmtd.c	2003-05-26 20:00:21.000000000 -0500
+++ 2.5.70-mm8-trunc/./drivers/mtd/devices/blkmtd.c	2003-06-12 16:57:52.000000000 -0500
@@ -1189,6 +1189,7 @@ static int __init init_blkmtd(void)
   INIT_LIST_HEAD(&mtd_rawdevice->as.locked_pages);
   mtd_rawdevice->as.host = NULL;
   init_MUTEX(&(mtd_rawdevice->as.i_shared_sem));
+  atomic_set(&(mtd_rawdevice->as.truncate_count), 0);
 
   mtd_rawdevice->as.a_ops = &blkmtd_aops;
   INIT_LIST_HEAD(&mtd_rawdevice->as.i_mmap);
--- 2.5.70-mm8/./include/linux/fs.h	2003-06-12 13:37:28.000000000 -0500
+++ 2.5.70-mm8-trunc/./include/linux/fs.h	2003-06-12 17:00:04.000000000 -0500
@@ -323,6 +323,7 @@ struct address_space {
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
+	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		dirtied_when;	/* jiffies of first page dirtying */
 	int			gfp_mask;	/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
--- 2.5.70-mm8/./fs/inode.c	2003-06-12 13:37:22.000000000 -0500
+++ 2.5.70-mm8-trunc/./fs/inode.c	2003-06-12 16:59:30.000000000 -0500
@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
+	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
--- 2.5.70-mm8/./mm/swap_state.c	2003-05-26 20:00:39.000000000 -0500
+++ 2.5.70-mm8-trunc/./mm/swap_state.c	2003-06-12 16:59:53.000000000 -0500
@@ -44,6 +44,7 @@ struct address_space swapper_space = {
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
 	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
+	.truncate_count  = ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };
--- 2.5.70-mm8/./mm/memory.c	2003-06-12 13:37:31.000000000 -0500
+++ 2.5.70-mm8-trunc/./mm/memory.c	2003-06-12 17:51:55.000000000 -0500
@@ -1138,6 +1138,8 @@ invalidate_mmap_range(struct address_spa
 			hlen = ULONG_MAX - hba + 1;
 	}
 	down(&mapping->i_shared_sem);
+	/* Protect against page fault */
+	atomic_inc(&mapping->truncate_count);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
 		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
@@ -1390,8 +1392,10 @@ do_no_page(struct mm_struct *mm, struct 
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct address_space *mapping;
 	pte_t entry;
 	struct pte_chain *pte_chain;
+	unsigned sequence;
 	int ret;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -1400,6 +1404,9 @@ do_no_page(struct mm_struct *mm, struct 
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+retry:
+	sequence = atomic_read(&mapping->truncate_count);
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 
 	/* no page was available -- either SIGBUS or OOM */
@@ -1428,6 +1435,16 @@ do_no_page(struct mm_struct *mm, struct 
 	}
 
 	spin_lock(&mm->page_table_lock);
+	/*
+	 * For a file-backed vma, someone could have truncated or otherwise
+	 * invalidated this page.  If invalidate_mmap_range got called,
+	 * retry getting the page.
+	 */
+	if (unlikely(sequence != atomic_read(&mapping->truncate_count))) {
+		spin_unlock(&mm->page_table_lock);
+		page_cache_release(new_page);
+		goto retry;
+	}
 	page_table = pte_offset_map(pmd, address);
 
 	/*

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-12 22:56         ` Dave McCracken
@ 2003-06-12 23:07             ` Andrew Morton
  2003-06-20  0:17             ` Andrea Arcangeli
  1 sibling, 0 replies; 28+ messages in thread
From: Andrew Morton @ 2003-06-12 23:07 UTC (permalink / raw)
  To: Dave McCracken; +Cc: linux-mm, linux-kernel

Dave McCracken <dmccr@us.ibm.com> wrote:
>
> I also think if we can solve both the vmtruncate and the distributed file
> system races without adding any vm_ops, we should.
> 
> Here's a new patch.  Does this look better?

grumble, mutter.  It's certainly simple enough.

+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;

I'm not so sure about this one now.  write() alters dentry->d_inode but
truncate alters dentry->d_inode->i_mapping->host.  Unless truncate is
changed we have the wrong mapping here.

I'll put it back to the original while I try to work out why truncate isn't
wrong...


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-12 22:56         ` Dave McCracken
@ 2003-06-20  0:17             ` Andrea Arcangeli
  2003-06-20  0:17             ` Andrea Arcangeli
  1 sibling, 0 replies; 28+ messages in thread
From: Andrea Arcangeli @ 2003-06-20  0:17 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Andrew Morton, linux-mm, linux-kernel

Hi,

On Thu, Jun 12, 2003 at 05:56:50PM -0500, Dave McCracken wrote:
> --- 2.5.70-mm8/./mm/memory.c	2003-06-12 13:37:31.000000000 -0500
> +++ 2.5.70-mm8-trunc/./mm/memory.c	2003-06-12 17:51:55.000000000 -0500
> @@ -1138,6 +1138,8 @@ invalidate_mmap_range(struct address_spa
>  			hlen = ULONG_MAX - hba + 1;
>  	}
>  	down(&mapping->i_shared_sem);
> +	/* Protect against page fault */
> +	atomic_inc(&mapping->truncate_count);
>  	if (unlikely(!list_empty(&mapping->i_mmap)))
>  		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
>  	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
> @@ -1390,8 +1392,10 @@ do_no_page(struct mm_struct *mm, struct 
>  	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
>  {
>  	struct page * new_page;
> +	struct address_space *mapping;
>  	pte_t entry;
>  	struct pte_chain *pte_chain;
> +	unsigned sequence;
>  	int ret;
>  
>  	if (!vma->vm_ops || !vma->vm_ops->nopage)
> @@ -1400,6 +1404,9 @@ do_no_page(struct mm_struct *mm, struct 
>  	pte_unmap(page_table);
>  	spin_unlock(&mm->page_table_lock);
>  
> +	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
> +retry:
> +	sequence = atomic_read(&mapping->truncate_count);
>  	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
>  
>  	/* no page was available -- either SIGBUS or OOM */
> @@ -1428,6 +1435,16 @@ do_no_page(struct mm_struct *mm, struct 
>  	}
>  
>  	spin_lock(&mm->page_table_lock);
> +	/*
> +	 * For a file-backed vma, someone could have truncated or otherwise
> +	 * invalidated this page.  If invalidate_mmap_range got called,
> +	 * retry getting the page.
> +	 */
> +	if (unlikely(sequence != atomic_read(&mapping->truncate_count))) {
> +		spin_unlock(&mm->page_table_lock);
> +		page_cache_release(new_page);
> +		goto retry;
> +	}
>  	page_table = pte_offset_map(pmd, address);

maybe I'm missing something silly, but this fixes nothing IMHO. It's not
a coincidence I used the seq_lock (aka frlock in 2.4-aa) in my fix; a
single counter increment isn't nearly enough. You definitely need _both_
an entry and an exit point in do_truncate, or you'll never know if
vmtruncate has been running under you. The first increment is like the
down_read, the second increment is the up_read. Both are necessary to
trap any vmtruncate during do_no_page.

Your patch traps this timing case:

	CPU 0			CPU 1
	----------		-----------
				do_no_page
	truncate
				read counter

	increment counter
	vmtruncate
				->nopage
				read counter again -> different so retry


but you can't trap this with a single counter increment in do_truncate:

	CPU 0			CPU 1
	----------		-----------
				do_no_page
	truncate
	increment counter
				read counter
				->nopage
	vmtruncate
				read counter again -> different so retry

thanks to the second counter increment after vmtruncate in my fix, the
above race couldn't happen.
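
(A sketch of the shape being described -- assuming, hypothetically, a
bare truncate_sequence counter; the real 2.4-aa fix uses the
seqlock/frlock primitives, and i_sem already serializes truncates, so
the writer needs no extra lock of its own:

	/* truncate side, under i_sem */
	mapping->truncate_sequence++;	/* entry: the down_read analogue */
	smp_wmb();
	vmtruncate(inode, offset);	/* updates i_size, unmaps ptes,
					 * runs truncate_inode_pages() */
	smp_wmb();
	mapping->truncate_sequence++;	/* exit: the up_read analogue */

A do_no_page() whose read of the counter overlaps this window at any
point sees either an odd value (truncate in progress) or a changed
value (truncate passed through), and retries; with a single increment
the "odd" state doesn't exist.)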

About the down(&inode->i_sem); up() that you dropped under Andrew's
suggestion: while that may be ugly, it has a chance to save cpu, and
since such a goto is a slow path anyway, it's definitely worthwhile to
keep it IMHO. Otherwise one cpu will keep scheduling in a loop until
truncate returns, and that can take time since truncate may have to do
I/O or wait on some I/O semaphore. It wouldn't be DoSable, because the
ret-from-exception will check need_resched, but it's still bad for cpu
utilization, and such a waste can be avoided trivially as in my fix.

I was chatting with Daniel about those hooks a few minutes ago, and he
suggested to make do_no_page a callback itself (instead of having
do_no_page call into a ->nopage further callback). And to provide by
default a generic implementation that would be equivalent to the current
do_no_page. As far as I can tell that will save both the new pointer to
function for the DSM hook (that IMHO has to be taken twice, both before
->nopage and after ->nopage, not only before the ->nopage, for the
reason explained above) and the ->nopage hook itself. So maybe that
could be a cleaner solution to avoid the DSM hooks entirely, so we don't
have more hooks but less, and a library call. This sounds the best for
performance and flexibility. (talking only about 2.5 of course, 2.4 I
think is just fine with my ""production"" 8) fix here:

	http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.4/2.4.21rc8aa1/2.4.21rc8aa1/9999_truncate-nopage-race-1

)

Andrea

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-20  0:17             ` Andrea Arcangeli
@ 2003-06-23  3:28               ` Paul E. McKenney
  -1 siblings, 0 replies; 28+ messages in thread
From: Paul E. McKenney @ 2003-06-23  3:28 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Dave McCracken, Andrew Morton, linux-mm, linux-kernel

On Fri, Jun 20, 2003 at 02:17:43AM +0200, Andrea Arcangeli wrote:
> maybe I'm missing something silly, but this fixes nothing IMHO. It's not
> a coincidence I used the seq_lock (aka frlock in 2.4-aa) in my fix; a
> single counter increment isn't nearly enough. You definitely need _both_
> an entry and an exit point in do_truncate, or you'll never know if
> vmtruncate has been running under you. The first increment is like the
> down_read, the second increment is the up_read. Both are necessary to
> trap any vmtruncate during do_no_page.
> 
> Your patch traps this timing case:
> 
> 	CPU 0			CPU 1
> 	----------		-----------
> 				do_no_page
> 	truncate
> 				read counter
> 
> 	increment counter
> 	vmtruncate
> 				->nopage
> 				read counter again -> different so retry

Yep!

> but you can't trap this with a single counter increment in do_truncate:
> 
> 	CPU 0			CPU 1
> 	----------		-----------
> 				do_no_page
> 	truncate
> 	increment counter
> 				read counter
> 				->nopage
> 	vmtruncate
> 				read counter again -> different so retry
> 
> thanks to the second counter increment after vmtruncate in my fix, the
> above race couldn't happen.

The trick is that CPU 0 is expected to have updated the filesystem's
idea of what pages are available before calling vmtruncate,
invalidate_mmap_range() or whichever.  Thus, once the counter
has been incremented, ->nopage() will see the results of the
truncation and/or invalidation, and will either block waiting
for the page to come back from some other node, or hand back
an error, as appropriate.

Or am I missing something here?

> About the down(&inode->i_sem); up() that you dropped under Andrew's
> suggestion: while that may be ugly, it has a chance to save cpu, and
> since such a goto is a slow path anyway, it's definitely worthwhile to
> keep it IMHO. Otherwise one cpu will keep scheduling in a loop until
> truncate returns, and that can take time since truncate may have to do
> I/O or wait on some I/O semaphore. It wouldn't be DoSable, because the
> ret-from-exception will check need_resched, but it's still bad for cpu
> utilization, and such a waste can be avoided trivially as in my fix.

If we really can get away with a single increment, then it wouldn't
loop unless there were several vmtruncate()s or invalidate_mmap_range()s
going on concurrently.

> I was chatting with Daniel about those hooks a few minutes ago, and he
> suggested to make do_no_page a callback itself (instead of having
> do_no_page call into a ->nopage further callback). And to provide by
> default a generic implementation that would be equivalent to the current
> do_no_page. As far as I can tell that will save both the new pointer to
> function for the DSM hook (that IMHO has to be taken twice, both before
> ->nopage and after ->nopage, not only before the ->nopage, for the
> reason explained above) and the ->nopage hook itself. So maybe that
> could be a cleaner solution to avoid the DSM hooks entirely, so we don't
> have more hooks but less, and a library call. This sounds the best for
> performance and flexibility. (talking only about 2.5 of course, 2.4 I
> think is just fine with my ""production"" 8) fix here:

Yep, I was chatting with Daniel earlier, and that chatting was what led
to my earlier -- and more complex -- patch.

Of course, I would be happy to rediff it onto current kernels, but Dave's
patch is much simpler, and has passed some fairly severe torture tests.
(As has my earlier one.)  At this point, however, Dave's looks better
to me.  Yes, the Linux tradition of simplicity is rubbing off even on
an old-timer like me.  ;-)

						Thanx, Paul

> 	http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.4/2.4.21rc8aa1/2.4.21rc8aa1/9999_truncate-nopage-race-1
> 
> )
> 
> Andrea

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-23  3:28               ` Paul E. McKenney
@ 2003-06-23  6:29                 ` Andrea Arcangeli
  -1 siblings, 0 replies; 28+ messages in thread
From: Andrea Arcangeli @ 2003-06-23  6:29 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Dave McCracken, Andrew Morton, linux-mm, linux-kernel

Hi Paul,

On Sun, Jun 22, 2003 at 08:28:42PM -0700, Paul E. McKenney wrote:
> On Fri, Jun 20, 2003 at 02:17:43AM +0200, Andrea Arcangeli wrote:
> > but you can't trap this with a single counter increment in do_truncate:
> > 
> > 	CPU 0			CPU 1
> > 	----------		-----------
> > 				do_no_page
> > 	truncate
> > 	increment counter
> > 				read counter
> > 				->nopage
> > 	vmtruncate
> > 				read counter again -> different so retry
> > 
> > thanks to the second counter increment after vmtruncate in my fix, the
> > above race couldn't happen.
> 
> The trick is that CPU 0 is expected to have updated the filesystem's
> idea of what pages are available before calling vmtruncate,
> invalidate_mmap_range() or whichever.  Thus, once the counter
> has been incremented, ->nopage() will see the results of the
> truncation and/or invalidation, and will either block waiting
> for the page to come back from some other node, or hand back
> an error, as appropriate.
> 
> Or am I missing something here?

Let me add the truncate_inode_pages too (it was implicit for me, but
it's more correct to make it explicit to better explain what is going on
and why the single increment in truncate can't fix the race). 

 	CPU 0			CPU 1
 	----------		-----------
 				do_no_page
 	truncate
 	increment counter
 				read counter
 				->nopage returns "page"
			
 	vmtruncate
	truncate_inode_pages()
	"page" becomes anonymous
	
 				read counter again -> still the same
				install "page" in the address space

IMHO the race you're missing (and the reason I had to use the seq_lock
and not just a single counter increment in truncate) is the above one.
There's no serialization at all between ->nopage and truncate. Also,
doing the single counter increment inside the semaphore isn't needed
(it could be outside the semaphore too). I very much considered the
above scenario while designing the seq_lock fix.

The issue is that while ->nopage will make sure to do the I/O on a page
inside the pagecache, there's no guarantee at all about what ->nopage
will return after the I/O has completed. Even if ->nopage returns a
page inside the pagecache, the parallel truncate can still nuke it and
make it anonymous at any time under do_no_page (after ->nopage has
returned); hence the race above. truncate_inode_pages will convert the
page to anonymous, and do_no_page has no way to trap that without the
proper locking in my patch, no way to know if any truncate is racing on
the same inode. do_no_page simply can't check page->mapping: even if it
loops for 10 minutes on page->mapping it will never know whether the
other cpu has a truncate running and got stuck into an irq for 10
minutes too ;).

This is why I had to use the seq_lock to know if any truncate is running
in parallel; a single increment simply can't provide that info and is
in turn still buggy.
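
(Continuing the sketch from earlier: the matching fault-side check,
paralleling Dave's first seqlock patch but spelled with the
hypothetical bare counter:

	unsigned seq;
retry:
	seq = mapping->truncate_sequence;
	smp_rmb();
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
	...
	spin_lock(&mm->page_table_lock);
	smp_rmb();
	if ((seq & 1) || seq != mapping->truncate_sequence) {
		/* a truncate was in flight or completed meanwhile;
		 * "new_page" may already have been made anonymous by
		 * truncate_inode_pages(), so drop it and refault */
		spin_unlock(&mm->page_table_lock);
		page_cache_release(new_page);
		goto retry;
	}

The "seq & 1" test is exactly what the single-increment scheme loses:
with one bump per truncate there is no odd state, so a fault that
starts after the bump but before truncate_inode_pages() finishes
passes the equality check and installs a dead page.)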

If I'm missing some new locking in 2.5 that would prevent
truncate_inode_pages from running after ->nopage has returned, let me
know of course; I just can't see it.

Also note, here I'm focusing first on the local filesystem case, not on
the DSM yet. This is the bug in 2.4/2.2 etc. that Dave was looking into
fixing (and I agree it needs fixing to provide proper VM semantics to
userspace).

> > About the down(&inode->i_sem); up() that you dropped under Andrew's
> > suggestion: while that may be ugly, it has a chance to save cpu, and
> > since such a goto is a slow path anyway, it's definitely worthwhile to
> > keep it IMHO. Otherwise one cpu will keep scheduling in a loop until
> > truncate returns, and that can take time since truncate may have to do
> > I/O or wait on some I/O semaphore. It wouldn't be DoSable, because the
> > ret-from-exception will check need_resched, but it's still bad for cpu
> > utilization, and such a waste can be avoided trivially as in my fix.
> 
> If we really can get away with a single increment, then it wouldn't
> loop unless there were several vmtruncate()s or invalidate_mmap_range()s
> going on concurrently.

Same goes for my "real" fix: it would never loop in practice, but since
the "retry" path is a slow path in the first place, IMHO we should do a
dummy down/up on the i_sem semaphore too, to avoid wasting cpu in the
unlikely case. I agree on the layering violation, but that's just a
beauty issue; in practice the down()/up() makes perfect sense and in
turn I prefer it. Dropping it makes no technical sense to me.

Oh, in 2.4.22pre (and in 2.5 too, after i_alloc_sem gets forward
ported) I really need to change the down(i_sem) to a
down_read(i_alloc_sem), since I only need to serialize against truncate,
not anything else (not write etc.). (Noted in my todo list; trivial
change, just must not forget it ;)
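
(I.e. in the do_no_page() retry path, hypothetically -- assuming the
2.4.22pre rw_semaphore i_alloc_sem, which truncate takes for writing:

	down_read(&inode->i_alloc_sem);	/* wait out any running truncate */
	up_read(&inode->i_alloc_sem);

in place of the down(&inode->i_sem); up(&inode->i_sem); pair, so the
retry blocks only behind truncate and not behind plain writers.)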

> 
> > I was chatting with Daniel about those hooks a few minutes ago, and he
> > suggested to make do_no_page a callback itself (instead of having
> > do_no_page call into a ->nopage further callback). And to provide by
> > default a generic implementation that would be equivalent to the current
> > do_no_page. As far as I can tell that will save both the new pointer to
> > function for the DSM hook (that IMHO has to be taken twice, both before
> > ->nopage and after ->nopage, not only before the ->nopage, for the
> > reason explained above) and the ->nopage hook itself. So maybe that
> > could be a cleaner solution to avoid the DSM hooks entirely, so we don't
> > have more hooks but less, and a library call. This sounds the best for
> > performance and flexibility. (talking only about 2.5 of course, 2.4 I
> > think is just fine with my ""production"" 8) fix here:
> 
> Yep, I was chatting with Daniel earlier, and that chatting was what led
> to my earlier -- and more complex -- patch.
> 
> Of course, I would be happy to rediff it onto current kernels, but Dave's
> patch is much simpler, and has passed some fairly severe torture tests.

Did the single counter increment pass any torture tests? Maybe they're
all focused on the DSM and not at all on the local race that Dave
pointed out originally in l-k?

> (As has my earlier one.)  At this point, however, Dave's looks better
> to me.  Yes, the Linux tradition of simplicity is rubbing off even on
> an old-timer like me.  ;-)

;)

Andrea

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-23  3:28               ` Paul E. McKenney
@ 2003-06-23  6:32                 ` Andrew Morton
  -1 siblings, 0 replies; 28+ messages in thread
From: Andrew Morton @ 2003-06-23  6:32 UTC (permalink / raw)
  To: paulmck; +Cc: andrea, dmccr, linux-mm, linux-kernel

"Paul E. McKenney" <paulmck@us.ibm.com> wrote:
>
> > but you can't trap this with a single counter increment in do_truncate:
>  > 
>  > 	CPU 0			CPU 1
>  > 	----------		-----------
>  > 				do_no_page
>  > 	truncate

        i_size = new_i_size;

>  > 	increment counter
>  > 				read counter
>  > 				->nopage

                                check i_size

>  > 	vmtruncate
>  > 				read counter again -> different so retry
>  > 
>  > thanks to the second counter increment after vmtruncate in my fix, the
>  > above race couldn't happen.
> 
>  The trick is that CPU 0 is expected to have updated the filesystem's
>  idea of what pages are available before calling vmtruncate,
>  invalidate_mmap_range() or whichever.

i_size has been updated, and filemap_nopage() will return NULL.
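
The check in question, roughly as filemap_nopage() does it (paraphrased
from the current code, not an exact quote):

        size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (pgoff >= size)
                return NULL;    /* past the new EOF: the fault gets SIGBUS */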



* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-23  6:32                 ` Andrew Morton
@ 2003-06-23  7:43                   ` Andrea Arcangeli
  -1 siblings, 0 replies; 28+ messages in thread
From: Andrea Arcangeli @ 2003-06-23  7:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: paulmck, dmccr, linux-mm, linux-kernel

On Sun, Jun 22, 2003 at 11:32:35PM -0700, Andrew Morton wrote:
> "Paul E. McKenney" <paulmck@us.ibm.com> wrote:
> >
> > > but you can't trap this with a single counter increment in do_truncate:
> >  > 
> >  > 	CPU 0			CPU 1
> >  > 	----------		-----------
> >  > 				do_no_page
> >  > 	truncate
> 
>         i_size = new_i_size;
> 
> >  > 	increment counter
> >  > 				read counter
> >  > 				->nopage
> 
>                                 check i_size
> 
> >  > 	vmtruncate
> >  > 				read counter again -> different so retry
> >  > 
> >  > thanks to the second counter increment after vmtruncate in my fix, the
> >  > above race couldn't happen.
> > 
> >  The trick is that CPU 0 is expected to have updated the filesystem's
> >  idea of what pages are available before calling vmtruncate,
> >  invalidate_mmap_range() or whichever.
> 
> i_size has been updated, and filemap_nopage() will return NULL.

ok, I finally see your point now: the i_size was in theory supposed to
act as the "second" increment on the truncate side, the one do_no_page
serializes against.

I was searching for some additional SMP locking, and in fact there is
none.  As written, there is no way the current code can serialize
against the i_size.

The problem is that the last patch posted on l-k doesn't guarantee
anything like what you wrote above.  What can happen is that the
i_size is still read well before the counter:


        CPU 0                           CPU 1
        ----------                      -----------
                                        do_no_page
        truncate

                                        speculative read of i_size into L2 cache
        i_size = new_i_size;

        increment counter
                                        read counter
                                        ->nopage
                                        check i_size from L2 cache

        vmtruncate
                                        read counter again -> different, so retry


For the single counter increment patch to close the race using the
implicit i_size update as the second increment, you need to add an
absolutely necessary smp_rmb() here:

        sequence = atomic_read(&mapping->truncate_count);
+	smp_rmb();	
        new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);

And for non-x86 you need an smp_wmb() between the i_size = new_i_size
and the atomic_inc on the truncate side too, because advanced archs
with fine-grained locking can implement a one-way barrier, where the
counter increment could pass the down(&mapping->i_shared_sem). Last
but not least, it was IMHO misleading and dirty to do the counter
increment in truncate after taking a completely unrelated semaphore
just to take advantage of the implicit barrier on x86. So doing it
with a proper smp_wmb(), besides fixing the advanced archs, will make
the code readable enough that you can see the i_size update is the
second increment (actually a decrement, since it's the i_size).
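
Putting both halves together, the ordering I'm asking for looks
roughly like this (a sketch of the single-counter patch with the
barriers added, not a tested diff):

        /* truncate side */
        inode->i_size = new_i_size;
        smp_wmb();      /* publish the new i_size before the counter bump */
        atomic_inc(&mapping->truncate_count);

        /* fault side, in do_no_page() */
        sequence = atomic_read(&mapping->truncate_count);
        smp_rmb();      /* read the counter before ->nopage reads i_size */
        new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);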

Now that I see what the second increment was supposed to be, it's a
cute idea to use the i_size for that, and I can appreciate its beauty
(in purist terms, I mean; in practice IMHO the seqlock was more
obviously correct code, with all the SMP reordering hidden inside the
lock semantics, and I'm 100% sure you couldn't measure any performance
difference in truncate due to the second increment ;). But if you
prefer to keep the more fine-grained approach of using the i_size with
explicit memory barriers, I'm of course not against it, once you add
the needed fixes I outlined above that will finally close the race.

thanks,

Andrea


* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-23  7:43                   ` Andrea Arcangeli
@ 2003-06-23  7:56                     ` Andrew Morton
  -1 siblings, 0 replies; 28+ messages in thread
From: Andrew Morton @ 2003-06-23  7:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: paulmck, dmccr, linux-mm, linux-kernel

Andrea Arcangeli <andrea@suse.de> wrote:
>
> that will finally close the race

Could someone please convince me that we really _need_ to close it?

The VM handles the whacky pages OK (on slowpaths), and when this first came
up two years ago it was argued that the application was racy/buggy
anyway.  So as long as we're secure and stable, we don't care.  Certainly
not to the point of adding more atomic ops on the fastpath.

So...   what bug are we actually fixing here?


(I'd also like to see a clearer description of the distributed fs problem,
and how this fixes it).

Thanks.


* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-23  7:56                     ` Andrew Morton
@ 2003-06-23  8:10                       ` Andrea Arcangeli
  -1 siblings, 0 replies; 28+ messages in thread
From: Andrea Arcangeli @ 2003-06-23  8:10 UTC (permalink / raw)
  To: Andrew Morton; +Cc: paulmck, dmccr, linux-mm, linux-kernel

On Mon, Jun 23, 2003 at 12:56:23AM -0700, Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> > that will finally close the race
> 
> Could someone please convince me that we really _need_ to close it?
> 
> The VM handles the whacky pages OK (on slowpaths), and when this first came
> up two years ago it was argued that the application was racy/buggy
> anyway.  So as long as we're secure and stable, we don't care.  Certainly
> not to the point of adding more atomic ops on the fastpath.
> 
> So...   what bug are we actually fixing here?

we're fixing userspace data corruption with an app trapping SIGBUS.
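
To make the failure mode concrete, such an app looks roughly like this
(purely illustrative code, not taken from any real report): it writes
through a MAP_SHARED mapping and traps SIGBUS to detect a concurrent
truncate. With the race, do_no_page() can install a stale page after
the truncate already ran, so the handler never fires and the store is
silently lost instead of being caught:

        #include <setjmp.h>
        #include <signal.h>

        static sigjmp_buf hit_eof;

        static void on_sigbus(int sig)
        {
                siglongjmp(hit_eof, 1);
        }

        /* 'map' is a MAP_SHARED mapping of a file that another task
         * may truncate at any moment */
        int checked_store(volatile char *map, long off, char val)
        {
                signal(SIGBUS, on_sigbus);
                if (sigsetjmp(hit_eof, 1))
                        return -1;      /* truncate caught via SIGBUS: ok */
                map[off] = val;         /* with the race: silently lost */
                return 0;
        }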

> 
> 
> (I'd also like to see a clearer description of the distributed fs problem,
> and how this fixes it).

I certainly would like to see that discussed too.

Andrea


* Re: [PATCH] Fix vmtruncate race and distributed filesystem race
  2003-06-23  8:10                       ` Andrea Arcangeli
@ 2003-06-24  1:37                         ` Paul E. McKenney
  -1 siblings, 0 replies; 28+ messages in thread
From: Paul E. McKenney @ 2003-06-24  1:37 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, dmccr, linux-mm, linux-kernel

On Mon, Jun 23, 2003 at 10:10:16AM +0200, Andrea Arcangeli wrote:
> On Mon, Jun 23, 2003 at 12:56:23AM -0700, Andrew Morton wrote:
> > Andrea Arcangeli <andrea@suse.de> wrote:
> > >
> > > that will finally close the race
> > 
> > Could someone please convince me that we really _need_ to close it?
> > 
> > The VM handles the whacky pages OK (on slowpaths), and when this first came
> > up two years ago it was argued that the application was racy/buggy
> > anyway.  So as long as we're secure and stable, we don't care.  Certainly
> > not to the point of adding more atomic ops on the fastpath.
> > 
> > So...   what bug are we actually fixing here?
> 
> we're fixing userspace data corruption with an app trapping SIGBUS.

This handling of wacky pages apparently does not carry over into some
of the rmap optimizations, but I will let dmccr speak to that.

> > (I'd also like to see a clearer description of the distributed fs problem,
> > and how this fixes it).
> 
> I certainly would like discussions about it too.

The race is as follows:

	Node 0				Node 1

	mmap()s file f1
					writes to f1
					sends msg to Node 0 requesting data
	pgflt on mmap
	do_no_page() invokes ->nopage
	->nopage hands page up
	<some interrupt or other delay>

	receives message from Node 1
	invokes invalidate_mmap_range()
	returns data to Node 1

					receives data from Node 0
					does write

	<return from interrupt or whatever>
	do_no_page() installs now-stale mapping
	return from page fault
	application scribbles on page, violating DFS's consistency!!!

Now Node 0 and Node 1 have inconsistent versions of the page being
written to.  Note that this problem occurs regardless of the mechanism
Node 1 uses to accomplish the write -- mmap(), write(), whatever.

This race is a problem only in distributed filesystems that provide
the same consistency guarantees normally found on local filesystems.
Things like NFS or AFS would not be bothered, since they do not offer
such consistency guarantees.  For example, with AFS, the last guy
to close the file wins.
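
For reference, the seqlock in Dave's patch is intended to close this
window by wrapping the ->nopage call in a read-side retry loop against
the same lock the invalidate path takes for writing -- a sketch of the
idea, not the exact diff:

        /* fault side, in do_no_page() */
        do {
                sequence = read_seqbegin(&mapping->truncate_lock);
                new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
        } while (read_seqretry(&mapping->truncate_lock, sequence));

        /* invalidate side: invalidate_mmap_range() holds the write lock
         * across the pte teardown, so a racing fault sees the sequence
         * change and retries instead of installing the stale mapping */
        write_seqlock(&mapping->truncate_lock);
        /* ... tear down the ptes covering the invalidated range ... */
        write_sequnlock(&mapping->truncate_lock);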

						Thanx, Paul


end of thread, other threads:[~2003-06-24  1:37 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-06-12 20:16 [PATCH] Fix vmtruncate race and distributed filesystem race Dave McCracken
2003-06-12 20:49 ` Andrew Morton
2003-06-12 21:00   ` Andrew Morton
2003-06-12 21:08     ` Dave McCracken
2003-06-12 21:44       ` Andrew Morton
2003-06-12 22:56         ` Dave McCracken
2003-06-12 23:07           ` Andrew Morton
2003-06-20  0:17           ` Andrea Arcangeli
2003-06-23  3:28             ` Paul E. McKenney
2003-06-23  6:29               ` Andrea Arcangeli
2003-06-23  6:32               ` Andrew Morton
2003-06-23  7:43                 ` Andrea Arcangeli
2003-06-23  7:56                   ` Andrew Morton
2003-06-23  8:10                     ` Andrea Arcangeli
2003-06-24  1:37                       ` Paul E. McKenney
