All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-01-20 12:30 ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-01-20 12:30 UTC (permalink / raw)
  To: akpm; +Cc: hughd, gurudas.pai, lkml20101129, linux-kernel, linux-mm

From: Miklos Szeredi <mszeredi@suse.cz>

Running a fuse filesystem with multiple open()'s in parallel can
trigger a "kernel BUG at mm/truncate.c:475"

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.  For example:

  thread1: going through a big range, stops in the middle of a vma and
     stores the restart address in vm_truncate_count.

  thread2: comes in with a small (e.g. single page) unmap request on
     the same vma, somewhere before restart_address, finds that the
     vma was already unmapped up to the restart address and happily
     returns without doing anything.

Another scenario would be two big unmap requests, both having to
restart the unmapping and each one setting vm_truncate_count to its
own value.  This could go on forever without any of them being able to
finish.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to 'struct address_space' to prevent
running multiple concurrent unmap_mapping_range() on the same mapping.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Tested-by: Gurudas Pai <gurudas.pai@oracle.com>
---
 fs/gfs2/main.c     |    9 +--------
 fs/inode.c         |   22 +++++++++++++++-------
 fs/nilfs2/btnode.c |    5 -----
 fs/nilfs2/btnode.h |    1 -
 fs/nilfs2/mdt.c    |    4 ++--
 fs/nilfs2/page.c   |   13 -------------
 fs/nilfs2/page.h   |    1 -
 fs/nilfs2/super.c  |    2 +-
 include/linux/fs.h |    2 ++
 mm/memory.c        |    2 ++
 10 files changed, 23 insertions(+), 38 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2011-01-17 09:33:44.000000000 +0100
+++ linux-2.6/mm/memory.c	2011-01-20 13:03:29.000000000 +0100
@@ -2650,6 +2650,7 @@ void unmap_mapping_range(struct address_
 		details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2666,6 +2667,7 @@ void unmap_mapping_range(struct address_
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
Index: linux-2.6/fs/gfs2/main.c
===================================================================
--- linux-2.6.orig/fs/gfs2/main.c	2011-01-12 12:27:59.000000000 +0100
+++ linux-2.6/fs/gfs2/main.c	2011-01-20 13:03:29.000000000 +0100
@@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(voi
 	struct address_space *mapping = (struct address_space *)(gl + 1);
 
 	gfs2_init_glock_once(gl);
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	address_space_init_once(mapping);
 }
 
 /**
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2011-01-12 12:27:59.000000000 +0100
+++ linux-2.6/fs/inode.c	2011-01-20 13:03:29.000000000 +0100
@@ -295,6 +295,20 @@ static void destroy_inode(struct inode *
 		call_rcu(&inode->i_rcu, i_callback);
 }
 
+void address_space_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	mutex_init(&mapping->unmap_mutex);
+}
+EXPORT_SYMBOL(address_space_init_once);
+
 /*
  * These are initializations that only need to be done
  * once, because the fields are idempotent across use
@@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_wb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
-	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
-	INIT_LIST_HEAD(&inode->i_data.private_list);
-	spin_lock_init(&inode->i_data.private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
+	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
Index: linux-2.6/fs/nilfs2/btnode.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.c	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.c	2011-01-20 13:03:29.000000000 +0100
@@ -35,11 +35,6 @@
 #include "btnode.h"
 
 
-void nilfs_btnode_cache_init_once(struct address_space *btnc)
-{
-	nilfs_mapping_init_once(btnc);
-}
-
 static const struct address_space_operations def_btnode_aops = {
 	.sync_page		= block_sync_page,
 };
Index: linux-2.6/fs/nilfs2/btnode.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.h	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.h	2011-01-20 13:03:29.000000000 +0100
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
 	struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
Index: linux-2.6/fs/nilfs2/mdt.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/mdt.c	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/mdt.c	2011-01-20 13:03:29.000000000 +0100
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct in
 	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
-	nilfs_mapping_init_once(&shadow->frozen_data);
+	address_space_init_once(&shadow->frozen_data);
 	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
-	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	address_space_init_once(&shadow->frozen_btnodes);
 	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
 	mi->mi_shadow = shadow;
 	return 0;
Index: linux-2.6/fs/nilfs2/page.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.c	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.c	2011-01-20 13:03:29.000000000 +0100
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(
 	return nc;
 }
 
-void nilfs_mapping_init_once(struct address_space *mapping)
-{
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-}
-
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops)
Index: linux-2.6/fs/nilfs2/page.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.h	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.h	2011-01-20 13:03:29.000000000 +0100
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops);
Index: linux-2.6/fs/nilfs2/super.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/super.c	2011-01-17 09:33:44.000000000 +0100
+++ linux-2.6/fs/nilfs2/super.c	2011-01-20 13:03:29.000000000 +0100
@@ -1278,7 +1278,7 @@ static void nilfs_inode_init_once(void *
 #ifdef CONFIG_NILFS_XATTR
 	init_rwsem(&ii->xattr_sem);
 #endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	address_space_init_once(&ii->i_btnode_cache);
 	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2011-01-20 13:03:13.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2011-01-20 13:03:29.000000000 +0100
@@ -649,6 +649,7 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct mutex		unmap_mutex;    /* to protect unmapping */
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *fi
 
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void address_space_init_once(struct address_space *mapping);
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-01-20 12:30 ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-01-20 12:30 UTC (permalink / raw)
  To: akpm; +Cc: hughd, gurudas.pai, lkml20101129, linux-kernel, linux-mm

From: Miklos Szeredi <mszeredi@suse.cz>

Running a fuse filesystem with multiple open()'s in parallel can
trigger a "kernel BUG at mm/truncate.c:475"

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.  For example:

  thread1: going through a big range, stops in the middle of a vma and
     stores the restart address in vm_truncate_count.

  thread2: comes in with a small (e.g. single page) unmap request on
     the same vma, somewhere before restart_address, finds that the
     vma was already unmapped up to the restart address and happily
     returns without doing anything.

Another scenario would be two big unmap requests, both having to
restart the unmapping and each one setting vm_truncate_count to its
own value.  This could go on forever without any of them being able to
finish.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to 'struct address_space' to prevent
running multiple concurrent unmap_mapping_range() on the same mapping.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Tested-by: Gurudas Pai <gurudas.pai@oracle.com>
---
 fs/gfs2/main.c     |    9 +--------
 fs/inode.c         |   22 +++++++++++++++-------
 fs/nilfs2/btnode.c |    5 -----
 fs/nilfs2/btnode.h |    1 -
 fs/nilfs2/mdt.c    |    4 ++--
 fs/nilfs2/page.c   |   13 -------------
 fs/nilfs2/page.h   |    1 -
 fs/nilfs2/super.c  |    2 +-
 include/linux/fs.h |    2 ++
 mm/memory.c        |    2 ++
 10 files changed, 23 insertions(+), 38 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2011-01-17 09:33:44.000000000 +0100
+++ linux-2.6/mm/memory.c	2011-01-20 13:03:29.000000000 +0100
@@ -2650,6 +2650,7 @@ void unmap_mapping_range(struct address_
 		details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2666,6 +2667,7 @@ void unmap_mapping_range(struct address_
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
Index: linux-2.6/fs/gfs2/main.c
===================================================================
--- linux-2.6.orig/fs/gfs2/main.c	2011-01-12 12:27:59.000000000 +0100
+++ linux-2.6/fs/gfs2/main.c	2011-01-20 13:03:29.000000000 +0100
@@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(voi
 	struct address_space *mapping = (struct address_space *)(gl + 1);
 
 	gfs2_init_glock_once(gl);
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	address_space_init_once(mapping);
 }
 
 /**
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2011-01-12 12:27:59.000000000 +0100
+++ linux-2.6/fs/inode.c	2011-01-20 13:03:29.000000000 +0100
@@ -295,6 +295,20 @@ static void destroy_inode(struct inode *
 		call_rcu(&inode->i_rcu, i_callback);
 }
 
+void address_space_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	mutex_init(&mapping->unmap_mutex);
+}
+EXPORT_SYMBOL(address_space_init_once);
+
 /*
  * These are initializations that only need to be done
  * once, because the fields are idempotent across use
@@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_wb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
-	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
-	INIT_LIST_HEAD(&inode->i_data.private_list);
-	spin_lock_init(&inode->i_data.private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
+	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
Index: linux-2.6/fs/nilfs2/btnode.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.c	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.c	2011-01-20 13:03:29.000000000 +0100
@@ -35,11 +35,6 @@
 #include "btnode.h"
 
 
-void nilfs_btnode_cache_init_once(struct address_space *btnc)
-{
-	nilfs_mapping_init_once(btnc);
-}
-
 static const struct address_space_operations def_btnode_aops = {
 	.sync_page		= block_sync_page,
 };
Index: linux-2.6/fs/nilfs2/btnode.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.h	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.h	2011-01-20 13:03:29.000000000 +0100
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
 	struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
Index: linux-2.6/fs/nilfs2/mdt.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/mdt.c	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/mdt.c	2011-01-20 13:03:29.000000000 +0100
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct in
 	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
-	nilfs_mapping_init_once(&shadow->frozen_data);
+	address_space_init_once(&shadow->frozen_data);
 	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
-	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	address_space_init_once(&shadow->frozen_btnodes);
 	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
 	mi->mi_shadow = shadow;
 	return 0;
Index: linux-2.6/fs/nilfs2/page.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.c	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.c	2011-01-20 13:03:29.000000000 +0100
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(
 	return nc;
 }
 
-void nilfs_mapping_init_once(struct address_space *mapping)
-{
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-}
-
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops)
Index: linux-2.6/fs/nilfs2/page.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.h	2011-01-12 12:28:00.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.h	2011-01-20 13:03:29.000000000 +0100
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops);
Index: linux-2.6/fs/nilfs2/super.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/super.c	2011-01-17 09:33:44.000000000 +0100
+++ linux-2.6/fs/nilfs2/super.c	2011-01-20 13:03:29.000000000 +0100
@@ -1278,7 +1278,7 @@ static void nilfs_inode_init_once(void *
 #ifdef CONFIG_NILFS_XATTR
 	init_rwsem(&ii->xattr_sem);
 #endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	address_space_init_once(&ii->i_btnode_cache);
 	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2011-01-20 13:03:13.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2011-01-20 13:03:29.000000000 +0100
@@ -649,6 +649,7 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct mutex		unmap_mutex;    /* to protect unmapping */
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *fi
 
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void address_space_init_once(struct address_space *mapping);
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-01-20 12:30 ` Miklos Szeredi
@ 2011-01-20 12:40   ` Christoph Hellwig
  -1 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2011-01-20 12:40 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, hughd, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Running a fuse filesystem with multiple open()'s in parallel can
> trigger a "kernel BUG at mm/truncate.c:475"
> 
> The reason is, unmap_mapping_range() is not prepared for more than
> one concurrent invocation per inode.  For example:
> 
>   thread1: going through a big range, stops in the middle of a vma and
>      stores the restart address in vm_truncate_count.
> 
>   thread2: comes in with a small (e.g. single page) unmap request on
>      the same vma, somewhere before restart_address, finds that the
>      vma was already unmapped up to the restart address and happily
>      returns without doing anything.
> 
> Another scenario would be two big unmap requests, both having to
> restart the unmapping and each one setting vm_truncate_count to its
> own value.  This could go on forever without any of them being able to
> finish.
> 
> Truncate and hole punching already serialize with i_mutex.  Other
> callers of unmap_mapping_range() do not, and it's difficult to get
> i_mutex protection for all callers.  In particular ->d_revalidate(),
> which calls invalidate_inode_pages2_range() in fuse, may be called
> with or without i_mutex.


Which I think is mostly a fuse problem.  I really hate bloating the
generic inode (into which the address_space is embedded) with another
mutex for deficits in rather special case filesystems. 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-01-20 12:40   ` Christoph Hellwig
  0 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2011-01-20 12:40 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, hughd, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Running a fuse filesystem with multiple open()'s in parallel can
> trigger a "kernel BUG at mm/truncate.c:475"
> 
> The reason is, unmap_mapping_range() is not prepared for more than
> one concurrent invocation per inode.  For example:
> 
>   thread1: going through a big range, stops in the middle of a vma and
>      stores the restart address in vm_truncate_count.
> 
>   thread2: comes in with a small (e.g. single page) unmap request on
>      the same vma, somewhere before restart_address, finds that the
>      vma was already unmapped up to the restart address and happily
>      returns without doing anything.
> 
> Another scenario would be two big unmap requests, both having to
> restart the unmapping and each one setting vm_truncate_count to its
> own value.  This could go on forever without any of them being able to
> finish.
> 
> Truncate and hole punching already serialize with i_mutex.  Other
> callers of unmap_mapping_range() do not, and it's difficult to get
> i_mutex protection for all callers.  In particular ->d_revalidate(),
> which calls invalidate_inode_pages2_range() in fuse, may be called
> with or without i_mutex.


Which I think is mostly a fuse problem.  I really hate bloating the
generic inode (into which the address_space is embedded) with another
mutex for deficits in rather special case filesystems. 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-01-20 12:40   ` Christoph Hellwig
@ 2011-01-20 14:13     ` Miklos Szeredi
  -1 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-01-20 14:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: miklos, akpm, hughd, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Thu, 20 Jan 2011, Christoph Hellwig wrote:
> On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> > From: Miklos Szeredi <mszeredi@suse.cz>
> > 
> > Running a fuse filesystem with multiple open()'s in parallel can
> > trigger a "kernel BUG at mm/truncate.c:475"
> > 
> > The reason is, unmap_mapping_range() is not prepared for more than
> > one concurrent invocation per inode.  For example:
> > 
> >   thread1: going through a big range, stops in the middle of a vma and
> >      stores the restart address in vm_truncate_count.
> > 
> >   thread2: comes in with a small (e.g. single page) unmap request on
> >      the same vma, somewhere before restart_address, finds that the
> >      vma was already unmapped up to the restart address and happily
> >      returns without doing anything.
> > 
> > Another scenario would be two big unmap requests, both having to
> > restart the unmapping and each one setting vm_truncate_count to its
> > own value.  This could go on forever without any of them being able to
> > finish.
> > 
> > Truncate and hole punching already serialize with i_mutex.  Other
> > callers of unmap_mapping_range() do not, and it's difficult to get
> > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > which calls invalidate_inode_pages2_range() in fuse, may be called
> > with or without i_mutex.
> 
> 
> Which I think is mostly a fuse problem.  I really hate bloating the
> generic inode (into which the address_space is embedded) with another
> mutex for deficits in rather special case filesystems. 

As Hugh pointed out unmap_mapping_range() has grown a varied set of
callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
example.

I don't like the bloat either, but this is the best I could come up
with for fixing this problem generally.  If you have a better idea,
please share it.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-01-20 14:13     ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-01-20 14:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: miklos, akpm, hughd, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Thu, 20 Jan 2011, Christoph Hellwig wrote:
> On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> > From: Miklos Szeredi <mszeredi@suse.cz>
> > 
> > Running a fuse filesystem with multiple open()'s in parallel can
> > trigger a "kernel BUG at mm/truncate.c:475"
> > 
> > The reason is, unmap_mapping_range() is not prepared for more than
> > one concurrent invocation per inode.  For example:
> > 
> >   thread1: going through a big range, stops in the middle of a vma and
> >      stores the restart address in vm_truncate_count.
> > 
> >   thread2: comes in with a small (e.g. single page) unmap request on
> >      the same vma, somewhere before restart_address, finds that the
> >      vma was already unmapped up to the restart address and happily
> >      returns without doing anything.
> > 
> > Another scenario would be two big unmap requests, both having to
> > restart the unmapping and each one setting vm_truncate_count to its
> > own value.  This could go on forever without any of them being able to
> > finish.
> > 
> > Truncate and hole punching already serialize with i_mutex.  Other
> > callers of unmap_mapping_range() do not, and it's difficult to get
> > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > which calls invalidate_inode_pages2_range() in fuse, may be called
> > with or without i_mutex.
> 
> 
> Which I think is mostly a fuse problem.  I really hate bloating the
> generic inode (into which the address_space is embedded) with another
> mutex for deficits in rather special case filesystems. 

As Hugh pointed out unmap_mapping_range() has grown a varied set of
callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
example.

I don't like the bloat either, but this is the best I could come up
with for fixing this problem generally.  If you have a better idea,
please share it.

Thanks,
Miklos

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-01-20 14:13     ` Miklos Szeredi
@ 2011-01-22  4:46       ` Hugh Dickins
  -1 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-01-22  4:46 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Christoph Hellwig, akpm, gurudas.pai, lkml20101129, linux-kernel,
	linux-mm

On Thu, 20 Jan 2011, Miklos Szeredi wrote:
> On Thu, 20 Jan 2011, Christoph Hellwig wrote:
> > On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> > > 
> > > Truncate and hole punching already serialize with i_mutex.  Other
> > > callers of unmap_mapping_range() do not, and it's difficult to get
> > > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > > which calls invalidate_inode_pages2_range() in fuse, may be called
> > > with or without i_mutex.
> > 
> > 
> > Which I think is mostly a fuse problem.  I really hate bloating the
> > generic inode (into which the address_space is embedded) with another
> > mutex for deficits in rather special case filesystems. 
> 
> As Hugh pointed out unmap_mapping_range() has grown a varied set of
> callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
> example.
> 
> I don't like the bloat either, but this is the best I could come up
> with for fixing this problem generally.  If you have a better idea,
> please share it.

If we start from the point that this is mostly a fuse problem (I expect
that a thorough audit will show up a few other filesystems too, but
let's start from this point): you cite ->d_revalidate as a particular
problem, but can we fix up its call sites so that it is always called
either with, or much preferably without, i_mutex held?  Though actually
I couldn't find where ->d_revalidate() is called while holding i_mutex.

Failing that, can fuse down_write i_alloc_sem before calling
invalidate_inode_pages2(_range), to achieve the same exclusion?
The setattr truncation path takes i_alloc_sem as well as i_mutex,
though I'm not certain of its full coverage.

I did already consider holding and dropping i_alloc_sem inside
invalidate_inode_pages2_range(); but direct-io.c very much wants
to take mmap_sem (when get_user_pages_fast goes slow) after taking
i_alloc_sem, whereas fuse_direct_mmap() very much wants to call
invalidate_inode_pages2() while mmap_sem is held.

Hugh

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-01-22  4:46       ` Hugh Dickins
  0 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-01-22  4:46 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Christoph Hellwig, akpm, gurudas.pai, lkml20101129, linux-kernel,
	linux-mm

On Thu, 20 Jan 2011, Miklos Szeredi wrote:
> On Thu, 20 Jan 2011, Christoph Hellwig wrote:
> > On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> > > 
> > > Truncate and hole punching already serialize with i_mutex.  Other
> > > callers of unmap_mapping_range() do not, and it's difficult to get
> > > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > > which calls invalidate_inode_pages2_range() in fuse, may be called
> > > with or without i_mutex.
> > 
> > 
> > Which I think is mostly a fuse problem.  I really hate bloating the
> > generic inode (into which the address_space is embedded) with another
> > mutex for deficits in rather special case filesystems. 
> 
> As Hugh pointed out unmap_mapping_range() has grown a varied set of
> callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
> example.
> 
> I don't like the bloat either, but this is the best I could come up
> with for fixing this problem generally.  If you have a better idea,
> please share it.

If we start from the point that this is mostly a fuse problem (I expect
that a thorough audit will show up a few other filesystems too, but
let's start from this point): you cite ->d_revalidate as a particular
problem, but can we fix up its call sites so that it is always called
either with, or much preferably without, i_mutex held?  Though actually
I couldn't find where ->d_revalidate() is called while holding i_mutex.

Failing that, can fuse down_write i_alloc_sem before calling
invalidate_inode_pages2(_range), to achieve the same exclusion?
The setattr truncation path takes i_alloc_sem as well as i_mutex,
though I'm not certain of its full coverage.

I did already consider holding and dropping i_alloc_sem inside
invalidate_inode_pages2_range(); but direct-io.c very much wants
to take mmap_sem (when get_user_pages_fast goes slow) after taking
i_alloc_sem, whereas fuse_direct_mmap() very much wants to call
invalidate_inode_pages2() while mmap_sem is held.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-01-22  4:46       ` Hugh Dickins
@ 2011-01-24 19:47         ` Miklos Szeredi
  -1 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-01-24 19:47 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: miklos, hch, akpm, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Fri, 21 Jan 2011, Hugh Dickins wrote:
> On Thu, 20 Jan 2011, Miklos Szeredi wrote:
> > On Thu, 20 Jan 2011, Christoph Hellwig wrote:
> > > On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> > > > 
> > > > Truncate and hole punching already serialize with i_mutex.  Other
> > > > callers of unmap_mapping_range() do not, and it's difficult to get
> > > > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > > > which calls invalidate_inode_pages2_range() in fuse, may be called
> > > > with or without i_mutex.
> > > 
> > > 
> > > Which I think is mostly a fuse problem.  I really hate bloating the
> > > generic inode (into which the address_space is embedded) with another
> > > mutex for deficits in rather special case filesystems. 
> > 
> > As Hugh pointed out unmap_mapping_range() has grown a varied set of
> > callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
> > example.
> > 
> > I don't like the bloat either, but this is the best I could come up
> > with for fixing this problem generally.  If you have a better idea,
> > please share it.
> 
> If we start from the point that this is mostly a fuse problem (I expect
> that a thorough audit will show up a few other filesystems too, but
> let's start from this point): you cite ->d_revalidate as a particular
> problem, but can we fix up its call sites so that it is always called
> either with, or much preferably without, i_mutex held?  Though actually
> I couldn't find where ->d_revalidate() is called while holding i_mutex.

lookup_one_len
lookup_hash
  __lookup_hash
    do_revalidate
      d_revalidate

I don't see an easy way to get rid of i_mutex for lookup_one_len() and
lookup_hash().

> Failing that, can fuse down_write i_alloc_sem before calling
> invalidate_inode_pages2(_range), to achieve the same exclusion?
> The setattr truncation path takes i_alloc_sem as well as i_mutex,
> though I'm not certain of its full coverage.

Yeah, fuse could use i_alloc_sem or a private mutex, but that would
leave the other uses of unmap_mapping_range() to sort this out for
themselves.

Thanks,
Miklos



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-01-24 19:47         ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-01-24 19:47 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: miklos, hch, akpm, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Fri, 21 Jan 2011, Hugh Dickins wrote:
> On Thu, 20 Jan 2011, Miklos Szeredi wrote:
> > On Thu, 20 Jan 2011, Christoph Hellwig wrote:
> > > On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
> > > > 
> > > > Truncate and hole punching already serialize with i_mutex.  Other
> > > > callers of unmap_mapping_range() do not, and it's difficult to get
> > > > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > > > which calls invalidate_inode_pages2_range() in fuse, may be called
> > > > with or without i_mutex.
> > > 
> > > 
> > > Which I think is mostly a fuse problem.  I really hate bloating the
> > > generic inode (into which the address_space is embedded) with another
> > > mutex for deficits in rather special case filesystems. 
> > 
> > As Hugh pointed out unmap_mapping_range() has grown a varied set of
> > callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
> > example.
> > 
> > I don't like the bloat either, but this is the best I could come up
> > with for fixing this problem generally.  If you have a better idea,
> > please share it.
> 
> If we start from the point that this is mostly a fuse problem (I expect
> that a thorough audit will show up a few other filesystems too, but
> let's start from this point): you cite ->d_revalidate as a particular
> problem, but can we fix up its call sites so that it is always called
> either with, or much preferably without, i_mutex held?  Though actually
> I couldn't find where ->d_revalidate() is called while holding i_mutex.

lookup_one_len
lookup_hash
  __lookup_hash
    do_revalidate
      d_revalidate

I don't see an easy way to get rid of i_mutex for lookup_one_len() and
lookup_hash().

> Failing that, can fuse down_write i_alloc_sem before calling
> invalidate_inode_pages2(_range), to achieve the same exclusion?
> The setattr truncation path takes i_alloc_sem as well as i_mutex,
> though I'm not certain of its full coverage.

Yeah, fuse could use i_alloc_sem or a private mutex, but that would
leave the other uses of unmap_mapping_range() to sort this out for
themselves.

Thanks,
Miklos


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-01-24 19:47         ` Miklos Szeredi
@ 2011-01-27  4:19           ` Hugh Dickins
  -1 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-01-27  4:19 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: hch, akpm, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Mon, Jan 24, 2011 at 11:47 AM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Fri, 21 Jan 2011, Hugh Dickins wrote:
>> On Thu, 20 Jan 2011, Miklos Szeredi wrote:
>> > On Thu, 20 Jan 2011, Christoph Hellwig wrote:
>> > > On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
>> > > >
>> > > > Truncate and hole punching already serialize with i_mutex.  Other
>> > > > callers of unmap_mapping_range() do not, and it's difficult to get
>> > > > i_mutex protection for all callers.  In particular ->d_revalidate(),
>> > > > which calls invalidate_inode_pages2_range() in fuse, may be called
>> > > > with or without i_mutex.
>> > >
>> > >
>> > > Which I think is mostly a fuse problem.  I really hate bloating the
>> > > generic inode (into which the address_space is embedded) with another
>> > > mutex for deficits in rather special case filesystems.
>> >
>> > As Hugh pointed out unmap_mapping_range() has grown a varied set of
>> > callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
>> > example.
>> >
>> > I don't like the bloat either, but this is the best I could come up
>> > with for fixing this problem generally.  If you have a better idea,
>> > please share it.
>>
>> If we start from the point that this is mostly a fuse problem (I expect
>> that a thorough audit will show up a few other filesystems too, but
>> let's start from this point): you cite ->d_revalidate as a particular
>> problem, but can we fix up its call sites so that it is always called
>> either with, or much preferably without, i_mutex held?  Though actually
>> I couldn't find where ->d_revalidate() is called while holding i_mutex.
>
> lookup_one_len
> lookup_hash
>  __lookup_hash
>    do_revalidate
>     d_revalidate

Right, thanks.

>
> I don't see an easy way to get rid of i_mutex for lookup_one_len() and
> lookup_hash().
>
>> Failing that, can fuse down_write i_alloc_sem before calling
>> invalidate_inode_pages2(_range), to achieve the same exclusion?
>> The setattr truncation path takes i_alloc_sem as well as i_mutex,
>> though I'm not certain of its full coverage.
>
> Yeah, fuse could use i_alloc_sem or a private mutex, but that would
> leave the other uses of unmap_mapping_range() to sort this out for
> themsevels.

I had wanted to propose that for now you modify just fuse to use
i_alloc_sem for serialization there, and I provide a patch to
unmap_mapping_range() to give safety to whatever other cases there are
(I'm now sure there are other cases, but also sure that I cannot
safely identify them all and fix them correctly at source myself -
even if I found time to do the patches, they'd need at least a release
cycle to bed in with BUG_ONs).

I've spent quite a while on it, but not succeeded: even if I could get
around the restart_addr issue, we're stuck with the deadly embrace
when two are in unmap_mapping_range(), each repeatedly yielding to the
other, each having to start over again.  Anything I came up with was
inferior to the two alternatives you have proposed: your original
wait_on_bit patch, or your current unmap_mutex patch.

Your wait_on_bit patch doesn't bloat (and may be attractive to
enterprise distros seeking binary compatibility), but several of us
agreed with Andrew's comments:

> I do think this was premature optimisation.  The open-coded lock is
> hidden from lockdep so we won't find out if this introduces potential
> deadlocks.  It would be better to add a new mutex at least temporarily,
> then look at replacing it with a MiklosLock later on, when the code is
> bedded in.
>
> At which time, replacing mutexes with MiklosLocks becomes part of a
> general "shrink the address_space" exercise in which there's no reason
> to exclusively concentrate on that new mutex!

It really does seem a mutex too far; but we may let Peter do away with
all that lock breaking when/if his preemptibility patches go in, and
could cut it out at that time.  I don't see a good alternative.

Hugh

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-01-27  4:19           ` Hugh Dickins
  0 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-01-27  4:19 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: hch, akpm, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Mon, Jan 24, 2011 at 11:47 AM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Fri, 21 Jan 2011, Hugh Dickins wrote:
>> On Thu, 20 Jan 2011, Miklos Szeredi wrote:
>> > On Thu, 20 Jan 2011, Christoph Hellwig wrote:
>> > > On Thu, Jan 20, 2011 at 01:30:58PM +0100, Miklos Szeredi wrote:
>> > > >
>> > > > Truncate and hole punching already serialize with i_mutex.  Other
>> > > > callers of unmap_mapping_range() do not, and it's difficult to get
>> > > > i_mutex protection for all callers.  In particular ->d_revalidate(),
>> > > > which calls invalidate_inode_pages2_range() in fuse, may be called
>> > > > with or without i_mutex.
>> > >
>> > >
>> > > Which I think is mostly a fuse problem.  I really hate bloating the
>> > > generic inode (into which the address_space is embedded) with another
>> > > mutex for deficits in rather special case filesystems.
>> >
>> > As Hugh pointed out unmap_mapping_range() has grown a varied set of
>> > callers, which are difficult to fix up wrt i_mutex.  Fuse was just an
>> > example.
>> >
>> > I don't like the bloat either, but this is the best I could come up
>> > with for fixing this problem generally.  If you have a better idea,
>> > please share it.
>>
>> If we start from the point that this is mostly a fuse problem (I expect
>> that a thorough audit will show up a few other filesystems too, but
>> let's start from this point): you cite ->d_revalidate as a particular
>> problem, but can we fix up its call sites so that it is always called
>> either with, or much preferably without, i_mutex held?  Though actually
>> I couldn't find where ->d_revalidate() is called while holding i_mutex.
>
> lookup_one_len
> lookup_hash
>  __lookup_hash
>    do_revalidate
>     d_revalidate

Right, thanks.

>
> I don't see an easy way to get rid of i_mutex for lookup_one_len() and
> lookup_hash().
>
>> Failing that, can fuse down_write i_alloc_sem before calling
>> invalidate_inode_pages2(_range), to achieve the same exclusion?
>> The setattr truncation path takes i_alloc_sem as well as i_mutex,
>> though I'm not certain of its full coverage.
>
> Yeah, fuse could use i_alloc_sem or a private mutex, but that would
> leave the other uses of unmap_mapping_range() to sort this out for
> themsevels.

I had wanted to propose that for now you modify just fuse to use
i_alloc_sem for serialization there, and I provide a patch to
unmap_mapping_range() to give safety to whatever other cases there are
(I'm now sure there are other cases, but also sure that I cannot
safely identify them all and fix them correctly at source myself -
even if I found time to do the patches, they'd need at least a release
cycle to bed in with BUG_ONs).

I've spent quite a while on it, but not succeeded: even if I could get
around the restart_addr issue, we're stuck with the deadly embrace
when two are in unmap_mapping_range(), each repeatedly yielding to the
other, each having to start over again.  Anything I came up with was
inferior to the two alternatives you have proposed: your original
wait_on_bit patch, or your current unmap_mutex patch.

Your wait_on_bit patch doesn't bloat (and may be attractive to
enterprise distros seeking binary compatibility), but several of us
agreed with Andrew's comments:

> I do think this was premature optimisation.  The open-coded lock is
> hidden from lockdep so we won't find out if this introduces potential
> deadlocks.  It would be better to add a new mutex at least temporarily,
> then look at replacing it with a MiklosLock later on, when the code is
> bedded in.
>
> At which time, replacing mutexes with MiklosLocks becomes part of a
> general "shrink the address_space" exercise in which there's no reason
> to exclusively concentrate on that new mutex!

It really does seem a mutex too far; but we may let Peter do away with
all that lock breaking when/if his preemptibility patches go in, and
could cut it out at that time.  I don't see a good alternative.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-01-27  4:19           ` Hugh Dickins
@ 2011-02-08 10:30             ` Miklos Szeredi
  -1 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-02-08 10:30 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: miklos, hch, akpm, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Wed, 26 Jan 2011, Hugh Dickins wrote:
> I had wanted to propose that for now you modify just fuse to use
> i_alloc_sem for serialization there, and I provide a patch to
> unmap_mapping_range() to give safety to whatever other cases there are
> (I'm now sure there are other cases, but also sure that I cannot
> safely identify them all and fix them correctly at source myself -
> even if I found time to do the patches, they'd need at least a release
> cycle to bed in with BUG_ONs).

Since fuse is the only one where the BUG has actually been triggered,
and since there are problems with all the proposed generic approaches,
I concur.  I didn't want to use i_alloc_sem here as it's more
confusing than a new mutex.

Gurudas, could you please give this patch a go in your testcase?

Thanks,
Miklos
---

From: Miklos Szeredi <mszeredi@suse.cz>
Subject: fuse: prevent concurrent unmap on the same inode

Running a fuse filesystem with multiple open()'s in parallel can
trigger a "kernel BUG at mm/truncate.c:475"

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to fuse_inode to prevent running multiple
concurrent unmap_mapping_range() on the same mapping.

Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Gurudas Pai <gurudas.pai@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c    |    8 +++-----
 fs/fuse/file.c   |   10 +++++++++-
 fs/fuse/fuse_i.h |    3 +++
 fs/fuse/inode.c  |    6 ++++++
 4 files changed, 21 insertions(+), 6 deletions(-)

Index: linux-2.6/fs/fuse/dir.c
===================================================================
--- linux-2.6.orig/fs/fuse/dir.c	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/dir.c	2011-02-07 17:52:35.000000000 +0100
@@ -1255,16 +1255,12 @@ void fuse_release_nowrite(struct inode *
 
 /*
  * Set attributes, and at the same time refresh them.
- *
- * Truncation is slightly complicated, because the 'truncate' request
- * may fail, in which case we don't want to touch the mapping.
- * vmtruncate() doesn't allow for this case, so do the rlimit checking
- * and the actual truncation by hand.
  */
 static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 			   struct file *file)
 {
 	struct inode *inode = entry->d_inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
 	struct fuse_setattr_in inarg;
@@ -1352,8 +1348,10 @@ static int fuse_do_setattr(struct dentry
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
 	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+		mutex_lock(&fi->unmap_mutex);
 		truncate_pagecache(inode, oldsize, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
+		mutex_unlock(&fi->unmap_mutex);
 	}
 
 	return 0;
Index: linux-2.6/fs/fuse/file.c
===================================================================
--- linux-2.6.orig/fs/fuse/file.c	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/file.c	2011-02-07 17:52:35.000000000 +0100
@@ -170,11 +170,15 @@ void fuse_finish_open(struct inode *inod
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	if (ff->open_flags & FOPEN_DIRECT_IO)
 		file->f_op = &fuse_direct_io_file_operations;
-	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+	if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+		mutex_lock(&fi->unmap_mutex);
 		invalidate_inode_pages2(inode->i_mapping);
+		mutex_unlock(&fi->unmap_mutex);
+	}
 	if (ff->open_flags & FOPEN_NONSEEKABLE)
 		nonseekable_open(inode, file);
 	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
@@ -1403,11 +1407,15 @@ static int fuse_file_mmap(struct file *f
 
 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct fuse_inode *fi = get_fuse_inode(file->f_mapping->host);
+
 	/* Can't provide the coherency needed for MAP_SHARED */
 	if (vma->vm_flags & VM_MAYSHARE)
 		return -ENODEV;
 
+	mutex_lock(&fi->unmap_mutex);
 	invalidate_inode_pages2(file->f_mapping);
+	mutex_unlock(&fi->unmap_mutex);
 
 	return generic_file_mmap(file, vma);
 }
Index: linux-2.6/fs/fuse/fuse_i.h
===================================================================
--- linux-2.6.orig/fs/fuse/fuse_i.h	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/fuse_i.h	2011-02-07 17:52:35.000000000 +0100
@@ -100,6 +100,9 @@ struct fuse_inode {
 
 	/** List of writepage requestst (pending or sent) */
 	struct list_head writepages;
+
+	/** to protect unmapping */
+	struct mutex unmap_mutex;
 };
 
 struct fuse_conn;
Index: linux-2.6/fs/fuse/inode.c
===================================================================
--- linux-2.6.orig/fs/fuse/inode.c	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/inode.c	2011-02-07 17:54:03.000000000 +0100
@@ -95,6 +95,7 @@ static struct inode *fuse_alloc_inode(st
 	INIT_LIST_HEAD(&fi->queued_writes);
 	INIT_LIST_HEAD(&fi->writepages);
 	init_waitqueue_head(&fi->page_waitq);
+	mutex_init(&fi->unmap_mutex);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget) {
 		kmem_cache_free(fuse_inode_cachep, inode);
@@ -197,8 +198,10 @@ void fuse_change_attributes(struct inode
 	spin_unlock(&fc->lock);
 
 	if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
+		mutex_lock(&fi->unmap_mutex);
 		truncate_pagecache(inode, oldsize, attr->size);
 		invalidate_inode_pages2(inode->i_mapping);
+		mutex_unlock(&fi->unmap_mutex);
 	}
 }
 
@@ -286,13 +289,16 @@ int fuse_reverse_inval_inode(struct supe
 
 	fuse_invalidate_attr(inode);
 	if (offset >= 0) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
 		pg_start = offset >> PAGE_CACHE_SHIFT;
 		if (len <= 0)
 			pg_end = -1;
 		else
 			pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+		mutex_lock(&fi->unmap_mutex);
 		invalidate_inode_pages2_range(inode->i_mapping,
 					      pg_start, pg_end);
+		mutex_unlock(&fi->unmap_mutex);
 	}
 	iput(inode);
 	return 0;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-08 10:30             ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-02-08 10:30 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: miklos, hch, akpm, gurudas.pai, lkml20101129, linux-kernel, linux-mm

On Wed, 26 Jan 2011, Hugh Dickins wrote:
> I had wanted to propose that for now you modify just fuse to use
> i_alloc_sem for serialization there, and I provide a patch to
> unmap_mapping_range() to give safety to whatever other cases there are
> (I'm now sure there are other cases, but also sure that I cannot
> safely identify them all and fix them correctly at source myself -
> even if I found time to do the patches, they'd need at least a release
> cycle to bed in with BUG_ONs).

Since fuse is the only one where the BUG has actually been triggered,
and since there are problems with all the proposed generic approaches,
I concur.  I didn't want to use i_alloc_sem here as it's more
confusing than a new mutex.

Gurudas, could you please give this patch a go in your testcase?

Thanks,
Miklos
---

From: Miklos Szeredi <mszeredi@suse.cz>
Subject: fuse: prevent concurrent unmap on the same inode

Running a fuse filesystem with multiple open()'s in parallel can
trigger a "kernel BUG at mm/truncate.c:475"

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to fuse_inode to prevent running multiple
concurrent unmap_mapping_range() on the same mapping.

Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Gurudas Pai <gurudas.pai@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c    |    8 +++-----
 fs/fuse/file.c   |   10 +++++++++-
 fs/fuse/fuse_i.h |    3 +++
 fs/fuse/inode.c  |    6 ++++++
 4 files changed, 21 insertions(+), 6 deletions(-)

Index: linux-2.6/fs/fuse/dir.c
===================================================================
--- linux-2.6.orig/fs/fuse/dir.c	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/dir.c	2011-02-07 17:52:35.000000000 +0100
@@ -1255,16 +1255,12 @@ void fuse_release_nowrite(struct inode *
 
 /*
  * Set attributes, and at the same time refresh them.
- *
- * Truncation is slightly complicated, because the 'truncate' request
- * may fail, in which case we don't want to touch the mapping.
- * vmtruncate() doesn't allow for this case, so do the rlimit checking
- * and the actual truncation by hand.
  */
 static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 			   struct file *file)
 {
 	struct inode *inode = entry->d_inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
 	struct fuse_setattr_in inarg;
@@ -1352,8 +1348,10 @@ static int fuse_do_setattr(struct dentry
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
 	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+		mutex_lock(&fi->unmap_mutex);
 		truncate_pagecache(inode, oldsize, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
+		mutex_unlock(&fi->unmap_mutex);
 	}
 
 	return 0;
Index: linux-2.6/fs/fuse/file.c
===================================================================
--- linux-2.6.orig/fs/fuse/file.c	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/file.c	2011-02-07 17:52:35.000000000 +0100
@@ -170,11 +170,15 @@ void fuse_finish_open(struct inode *inod
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	if (ff->open_flags & FOPEN_DIRECT_IO)
 		file->f_op = &fuse_direct_io_file_operations;
-	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+	if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+		mutex_lock(&fi->unmap_mutex);
 		invalidate_inode_pages2(inode->i_mapping);
+		mutex_unlock(&fi->unmap_mutex);
+	}
 	if (ff->open_flags & FOPEN_NONSEEKABLE)
 		nonseekable_open(inode, file);
 	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
@@ -1403,11 +1407,15 @@ static int fuse_file_mmap(struct file *f
 
 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct fuse_inode *fi = get_fuse_inode(file->f_mapping->host);
+
 	/* Can't provide the coherency needed for MAP_SHARED */
 	if (vma->vm_flags & VM_MAYSHARE)
 		return -ENODEV;
 
+	mutex_lock(&fi->unmap_mutex);
 	invalidate_inode_pages2(file->f_mapping);
+	mutex_unlock(&fi->unmap_mutex);
 
 	return generic_file_mmap(file, vma);
 }
Index: linux-2.6/fs/fuse/fuse_i.h
===================================================================
--- linux-2.6.orig/fs/fuse/fuse_i.h	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/fuse_i.h	2011-02-07 17:52:35.000000000 +0100
@@ -100,6 +100,9 @@ struct fuse_inode {
 
 	/** List of writepage requestst (pending or sent) */
 	struct list_head writepages;
+
+	/** to protect unmapping */
+	struct mutex unmap_mutex;
 };
 
 struct fuse_conn;
Index: linux-2.6/fs/fuse/inode.c
===================================================================
--- linux-2.6.orig/fs/fuse/inode.c	2011-02-07 17:52:34.000000000 +0100
+++ linux-2.6/fs/fuse/inode.c	2011-02-07 17:54:03.000000000 +0100
@@ -95,6 +95,7 @@ static struct inode *fuse_alloc_inode(st
 	INIT_LIST_HEAD(&fi->queued_writes);
 	INIT_LIST_HEAD(&fi->writepages);
 	init_waitqueue_head(&fi->page_waitq);
+	mutex_init(&fi->unmap_mutex);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget) {
 		kmem_cache_free(fuse_inode_cachep, inode);
@@ -197,8 +198,10 @@ void fuse_change_attributes(struct inode
 	spin_unlock(&fc->lock);
 
 	if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
+		mutex_lock(&fi->unmap_mutex);
 		truncate_pagecache(inode, oldsize, attr->size);
 		invalidate_inode_pages2(inode->i_mapping);
+		mutex_unlock(&fi->unmap_mutex);
 	}
 }
 
@@ -286,13 +289,16 @@ int fuse_reverse_inval_inode(struct supe
 
 	fuse_invalidate_attr(inode);
 	if (offset >= 0) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
 		pg_start = offset >> PAGE_CACHE_SHIFT;
 		if (len <= 0)
 			pg_end = -1;
 		else
 			pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+		mutex_lock(&fi->unmap_mutex);
 		invalidate_inode_pages2_range(inode->i_mapping,
 					      pg_start, pg_end);
+		mutex_unlock(&fi->unmap_mutex);
 	}
 	iput(inode);
 	return 0;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-02-08 10:30             ` Miklos Szeredi
@ 2011-02-08 11:52               ` Gurudas Pai
  -1 siblings, 0 replies; 28+ messages in thread
From: Gurudas Pai @ 2011-02-08 11:52 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Hugh Dickins, hch, akpm, lkml20101129, linux-kernel, linux-mm

> On Wed, 26 Jan 2011, Hugh Dickins wrote:
>> I had wanted to propose that for now you modify just fuse to use
>> i_alloc_sem for serialization there, and I provide a patch to
>> unmap_mapping_range() to give safety to whatever other cases there are
>> (I'm now sure there are other cases, but also sure that I cannot
>> safely identify them all and fix them correctly at source myself -
>> even if I found time to do the patches, they'd need at least a release
>> cycle to bed in with BUG_ONs).
> 
> Since fuse is the only one where the BUG has actually been triggered,
> and since there are problems with all the proposed generic approaches,
> I concur.  I didn't want to use i_alloc_sem here as it's more
> confusing than a new mutex.
> 
> Gurudas, could you please give this patch a go in your testcase?
I found this BUG with nfs, so trying with current patch may not help.
https://lkml.org/lkml/2010/12/29/9

Let me know if I have to run this
> 
> From: Miklos Szeredi <mszeredi@suse.cz>
> Subject: fuse: prevent concurrent unmap on the same inode
> 
> Running a fuse filesystem with multiple open()'s in parallel can
> trigger a "kernel BUG at mm/truncate.c:475"
> 
> The reason is, unmap_mapping_range() is not prepared for more than
> one concurrent invocation per inode.
> 
> Truncate and hole punching already serialize with i_mutex.  Other
> callers of unmap_mapping_range() do not, and it's difficult to get
> i_mutex protection for all callers.  In particular ->d_revalidate(),
> which calls invalidate_inode_pages2_range() in fuse, may be called
> with or without i_mutex.
> 
> This patch adds a new mutex to fuse_inode to prevent running multiple
> concurrent unmap_mapping_range() on the same mapping.

Thanks,
-Guru




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-08 11:52               ` Gurudas Pai
  0 siblings, 0 replies; 28+ messages in thread
From: Gurudas Pai @ 2011-02-08 11:52 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Hugh Dickins, hch, akpm, lkml20101129, linux-kernel, linux-mm

> On Wed, 26 Jan 2011, Hugh Dickins wrote:
>> I had wanted to propose that for now you modify just fuse to use
>> i_alloc_sem for serialization there, and I provide a patch to
>> unmap_mapping_range() to give safety to whatever other cases there are
>> (I'm now sure there are other cases, but also sure that I cannot
>> safely identify them all and fix them correctly at source myself -
>> even if I found time to do the patches, they'd need at least a release
>> cycle to bed in with BUG_ONs).
> 
> Since fuse is the only one where the BUG has actually been triggered,
> and since there are problems with all the proposed generic approaches,
> I concur.  I didn't want to use i_alloc_sem here as it's more
> confusing than a new mutex.
> 
> Gurudas, could you please give this patch a go in your testcase?
I found this BUG with nfs, so trying with current patch may not help.
https://lkml.org/lkml/2010/12/29/9

Let me know if I have to run this
> 
> From: Miklos Szeredi <mszeredi@suse.cz>
> Subject: fuse: prevent concurrent unmap on the same inode
> 
> Running a fuse filesystem with multiple open()'s in parallel can
> trigger a "kernel BUG at mm/truncate.c:475"
> 
> The reason is, unmap_mapping_range() is not prepared for more than
> one concurrent invocation per inode.
> 
> Truncate and hole punching already serialize with i_mutex.  Other
> callers of unmap_mapping_range() do not, and it's difficult to get
> i_mutex protection for all callers.  In particular ->d_revalidate(),
> which calls invalidate_inode_pages2_range() in fuse, may be called
> with or without i_mutex.
> 
> This patch adds a new mutex to fuse_inode to prevent running multiple
> concurrent unmap_mapping_range() on the same mapping.

Thanks,
-Guru



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-02-08 11:52               ` Gurudas Pai
@ 2011-02-08 11:59                 ` Miklos Szeredi
  -1 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-02-08 11:59 UTC (permalink / raw)
  To: Gurudas Pai
  Cc: Trond.Myklebust, miklos, hughd, hch, akpm, lkml20101129,
	linux-kernel, linux-mm

On Tue, 08 Feb 2011, Gurudas Pai wrote:
> > On Wed, 26 Jan 2011, Hugh Dickins wrote:
> >> I had wanted to propose that for now you modify just fuse to use
> >> i_alloc_sem for serialization there, and I provide a patch to
> >> unmap_mapping_range() to give safety to whatever other cases there are
> >> (I'm now sure there are other cases, but also sure that I cannot
> >> safely identify them all and fix them correctly at source myself -
> >> even if I found time to do the patches, they'd need at least a release
> >> cycle to bed in with BUG_ONs).
> > 
> > Since fuse is the only one where the BUG has actually been triggered,
> > and since there are problems with all the proposed generic approaches,
> > I concur.  I didn't want to use i_alloc_sem here as it's more
> > confusing than a new mutex.
> > 
> > Gurudas, could you please give this patch a go in your testcase?
> I found this BUG with nfs, so trying with current patch may not help.
> https://lkml.org/lkml/2010/12/29/9
> 
> Let me know if I have to run this

Ahh, I was not aware of that.  No, in that case there's not much point
in trying this patch for you as it only fixes the issue in fuse. I
haven't looked at the NFS side of it yet.

Added Trond to the Cc.

Thanks,
Miklos


> > 
> > From: Miklos Szeredi <mszeredi@suse.cz>
> > Subject: fuse: prevent concurrent unmap on the same inode
> > 
> > Running a fuse filesystem with multiple open()'s in parallel can
> > trigger a "kernel BUG at mm/truncate.c:475"
> > 
> > The reason is, unmap_mapping_range() is not prepared for more than
> > one concurrent invocation per inode.
> > 
> > Truncate and hole punching already serialize with i_mutex.  Other
> > callers of unmap_mapping_range() do not, and it's difficult to get
> > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > which calls invalidate_inode_pages2_range() in fuse, may be called
> > with or without i_mutex.
> > 
> > This patch adds a new mutex to fuse_inode to prevent running multiple
> > concurrent unmap_mapping_range() on the same mapping.
> 
> Thanks,
> -Guru
> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-08 11:59                 ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-02-08 11:59 UTC (permalink / raw)
  To: Gurudas Pai
  Cc: Trond.Myklebust, miklos, hughd, hch, akpm, lkml20101129,
	linux-kernel, linux-mm

On Tue, 08 Feb 2011, Gurudas Pai wrote:
> > On Wed, 26 Jan 2011, Hugh Dickins wrote:
> >> I had wanted to propose that for now you modify just fuse to use
> >> i_alloc_sem for serialization there, and I provide a patch to
> >> unmap_mapping_range() to give safety to whatever other cases there are
> >> (I'm now sure there are other cases, but also sure that I cannot
> >> safely identify them all and fix them correctly at source myself -
> >> even if I found time to do the patches, they'd need at least a release
> >> cycle to bed in with BUG_ONs).
> > 
> > Since fuse is the only one where the BUG has actually been triggered,
> > and since there are problems with all the proposed generic approaches,
> > I concur.  I didn't want to use i_alloc_sem here as it's more
> > confusing than a new mutex.
> > 
> > Gurudas, could you please give this patch a go in your testcase?
> I found this BUG with nfs, so trying with current patch may not help.
> https://lkml.org/lkml/2010/12/29/9
> 
> Let me know if I have to run this

Ahh, I was not aware of that.  No, in that case there's not much point
in trying this patch for you as it only fixes the issue in fuse. I
haven't looked at the NFS side of it yet.

Added Trond to the Cc.

Thanks,
Miklos


> > 
> > From: Miklos Szeredi <mszeredi@suse.cz>
> > Subject: fuse: prevent concurrent unmap on the same inode
> > 
> > Running a fuse filesystem with multiple open()'s in parallel can
> > trigger a "kernel BUG at mm/truncate.c:475"
> > 
> > The reason is, unmap_mapping_range() is not prepared for more than
> > one concurrent invocation per inode.
> > 
> > Truncate and hole punching already serialize with i_mutex.  Other
> > callers of unmap_mapping_range() do not, and it's difficult to get
> > i_mutex protection for all callers.  In particular ->d_revalidate(),
> > which calls invalidate_inode_pages2_range() in fuse, may be called
> > with or without i_mutex.
> > 
> > This patch adds a new mutex to fuse_inode to prevent running multiple
> > concurrent unmap_mapping_range() on the same mapping.
> 
> Thanks,
> -Guru
> 
> 
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-02-23 23:12     ` Hugh Dickins
@ 2011-03-02  9:48       ` Peter Zijlstra
  -1 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-03-02  9:48 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Miklos Szeredi, akpm, hch, gurudas.pai,
	lkml20101129, rjw, florian, trond.myklebust, maciej.rutecki,
	linux-kernel, linux-mm

On Wed, 2011-02-23 at 15:12 -0800, Hugh Dickins wrote:
> 
> In his [2/8] mm: remove i_mmap_mutex lockbreak patch, Peter says
> "shouldn't hold up reclaim more than lock_page() would".  But (apart
> from a write error case) we always use trylock_page() in reclaim, we
> never dare hold it up on a lock_page(). 

D'0h! I so missed that, ok fixed up the changelog.

>  So page reclaim would get
> held up on truncation more than at present - though he's right to
> point out that truncation will usually be freeing pages much faster.

*phew* :-)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-03-02  9:48       ` Peter Zijlstra
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2011-03-02  9:48 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Miklos Szeredi, akpm, hch, gurudas.pai,
	lkml20101129, rjw, florian, trond.myklebust, maciej.rutecki,
	linux-kernel, linux-mm

On Wed, 2011-02-23 at 15:12 -0800, Hugh Dickins wrote:
> 
> In his [2/8] mm: remove i_mmap_mutex lockbreak patch, Peter says
> "shouldn't hold up reclaim more than lock_page() would".  But (apart
> from a write error case) we always use trylock_page() in reclaim, we
> never dare hold it up on a lock_page(). 

D'0h! I so missed that, ok fixed up the changelog.

>  So page reclaim would get
> held up on truncation more than at present - though he's right to
> point out that truncation will usually be freeing pages much faster.

*phew* :-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-02-23 22:33   ` Linus Torvalds
@ 2011-02-23 23:12     ` Hugh Dickins
  -1 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-02-23 23:12 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Miklos Szeredi, akpm, hch, a.p.zijlstra, gurudas.pai,
	lkml20101129, rjw, florian, trond.myklebust, maciej.rutecki,
	linux-kernel, linux-mm

On Wed, 23 Feb 2011, Linus Torvalds wrote:
> On Wed, Feb 23, 2011 at 4:49 AM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > This resolves Bug 25822 listed in the regressions since 2.6.36 (though
> > it's a bug much older than that, for some reason it only started
> > triggering for people recently).
> 
> Gaah. I hate this patch. It is, in fact, a patch that makes me finally
> think that the mm preemptibility is actually worth it, because then
> i_mmap_lock turns into a mutex and makes the whole "drop the lock"
> thing hopefully a thing of the past (see the patch "mm: Remove
> i_mmap_mutex lockbreak").
> 
> Because as far as I can see, the only thing that makes this thing
> needed in the first place is that horribly ugly "we drop i_mmap_lock
> in the middle of random operations that really still need it".
> 
> That said, I don't really see any alternatives - I guess we can't
> really just say "remove that crazy lock dropping". Even though I
> really really really would like to.

Those feelings understood and shared.

> 
> Of course, we could also just decide that we should apply the mm
> preemptibility series instead. Can people confirm that that fixes the
> bug too?

It would fix it, but there's a but.

In his [2/8] mm: remove i_mmap_mutex lockbreak patch, Peter says
"shouldn't hold up reclaim more than lock_page() would".  But (apart
from a write error case) we always use trylock_page() in reclaim, we
never dare hold it up on a lock_page().  So page reclaim would get
held up on truncation more than at present - though he's right to
point out that truncation will usually be freeing pages much faster.

I'm not sure whether it will prove good enough to abandon the lock
breaking if we move to a mutex there.  And besides, this unmapping
BUG does need a fix in stable, well before we want to try out the
preemptible mmu gathering.

I'd rather hold out Peter's series as a hope that we can
eliminate this extra unmapping mutex in a few months time.

Hugh

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-23 23:12     ` Hugh Dickins
  0 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-02-23 23:12 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Miklos Szeredi, akpm, hch, a.p.zijlstra, gurudas.pai,
	lkml20101129, rjw, florian, trond.myklebust, maciej.rutecki,
	linux-kernel, linux-mm

On Wed, 23 Feb 2011, Linus Torvalds wrote:
> On Wed, Feb 23, 2011 at 4:49 AM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > This resolves Bug 25822 listed in the regressions since 2.6.36 (though
> > it's a bug much older than that, for some reason it only started
> > triggering for people recently).
> 
> Gaah. I hate this patch. It is, in fact, a patch that makes me finally
> think that the mm preemptibility is actually worth it, because then
> i_mmap_lock turns into a mutex and makes the whole "drop the lock"
> thing hopefully a thing of the past (see the patch "mm: Remove
> i_mmap_mutex lockbreak").
> 
> Because as far as I can see, the only thing that makes this thing
> needed in the first place is that horribly ugly "we drop i_mmap_lock
> in the middle of random operations that really still need it".
> 
> That said, I don't really see any alternatives - I guess we can't
> really just say "remove that crazy lock dropping". Even though I
> really really really would like to.

Those feelings understood and shared.

> 
> Of course, we could also just decide that we should apply the mm
> preemptibility series instead. Can people confirm that that fixes the
> bug too?

It would fix it, but there's a but.

In his [2/8] mm: remove i_mmap_mutex lockbreak patch, Peter says
"shouldn't hold up reclaim more than lock_page() would".  But (apart
from a write error case) we always use trylock_page() in reclaim, we
never dare hold it up on a lock_page().  So page reclaim would get
held up on truncation more than at present - though he's right to
point out that truncation will usually be freeing pages much faster.

I'm not sure whether it will prove good enough to abandon the lock
breaking if we move to a mutex there.  And besides, this unmapping
BUG does need a fix in stable, well before we want to try out the
preemptible mmu gathering.

I'd rather hold out Peter's series as a hope that we can
eliminate this extra unmapping mutex in a few months time.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-02-23 12:49 ` Miklos Szeredi
@ 2011-02-23 22:33   ` Linus Torvalds
  -1 siblings, 0 replies; 28+ messages in thread
From: Linus Torvalds @ 2011-02-23 22:33 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, hughd, gurudas.pai, lkml20101129, rjw, florian,
	trond.myklebust, maciej.rutecki, linux-kernel, linux-mm

On Wed, Feb 23, 2011 at 4:49 AM, Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> This resolves Bug 25822 listed in the regressions since 2.6.36 (though
> it's a bug much older than that, for some reason it only started
> triggering for people recently).

Gaah. I hate this patch. It is, in fact, a patch that makes me finally
think that the mm preemptibility is actually worth it, because then
i_mmap_lock turns into a mutex and makes the whole "drop the lock"
thing hopefully a thing of the past (see the patch "mm: Remove
i_mmap_mutex lockbreak").

Because as far as I can see, the only thing that makes this thing
needed in the first place is that horribly ugly "we drop i_mmap_lock
in the middle of random operations that really still need it".

That said, I don't really see any alternatives - I guess we can't
really just say "remove that crazy lock dropping". Even though I
really really really would like to.

Of course, we could also just decide that we should apply the mm
preemptibility series instead. Can people confirm that that fixes the
bug too?

                         Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-23 22:33   ` Linus Torvalds
  0 siblings, 0 replies; 28+ messages in thread
From: Linus Torvalds @ 2011-02-23 22:33 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, hughd, gurudas.pai, lkml20101129, rjw, florian,
	trond.myklebust, maciej.rutecki, linux-kernel, linux-mm

On Wed, Feb 23, 2011 at 4:49 AM, Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> This resolves Bug 25822 listed in the regressions since 2.6.36 (though
> it's a bug much older than that, for some reason it only started
> triggering for people recently).

Gaah. I hate this patch. It is, in fact, a patch that makes me finally
think that the mm preemptibility is actually worth it, because then
i_mmap_lock turns into a mutex and makes the whole "drop the lock"
thing hopefully a thing of the past (see the patch "mm: Remove
i_mmap_mutex lockbreak").

Because as far as I can see, the only thing that makes this thing
needed in the first place is that horribly ugly "we drop i_mmap_lock
in the middle of random operations that really still need it".

That said, I don't really see any alternatives - I guess we can't
really just say "remove that crazy lock dropping". Even though I
really really really would like to.

Of course, we could also just decide that we should apply the mm
preemptibility series instead. Can people confirm that that fixes the
bug too?

                         Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH]  mm: prevent concurrent unmap_mapping_range() on the same inode
  2011-02-23 12:49 ` Miklos Szeredi
@ 2011-02-23 22:20   ` Hugh Dickins
  -1 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-02-23 22:20 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, torvalds, hch, gurudas.pai, lkml20101129, rjw, florian,
	trond.myklebust, maciej.rutecki, linux-kernel, linux-mm

On Wed, 23 Feb 2011, Miklos Szeredi wrote:

> Linus, Andrew,
> 
> This resolves Bug 25822 listed in the regressions since 2.6.36 (though
> it's a bug much older than that, for some reason it only started
> triggering for people recently).
> 
> Summary of discussion of below patch by Hugh:
> 
>    An executive summary of the thread that followed would be that hch
>    dislikes the bloat, as we all do, but Miklos and I don't see a
>    decent alternative (and hch has not proposed one).
> 
> Please consider this for 2.6.38.
> 
> Thanks,
> Miklos
> 
> ----
> Subject: mm: prevent concurrent unmap_mapping_range() on the same inode
> 
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Michael Leun reported that running parallel opens on a fuse filesystem
> can trigger a "kernel BUG at mm/truncate.c:475"
> 
> Gurudas Pai reported the same bug on NFS.
> 
> The reason is, unmap_mapping_range() is not prepared for more than
> one concurrent invocation per inode.  For example:

Yes, at the time I did that preemptible restart stuff 6 years ago,
i_mutex was always held by callers of unmap_mapping_range(); and
I built that in as an assumption, without ever enforcing it with
a BUG to check.

From that very time exceptions have been added, some with their own
serialization, some with none, so that now it's all too messy to fix
without a lead-in time for weeding out and trying (with uncertain
success) to rework its usage in miscellaneous filesystems (including
fuse, nfs, spufs, others and gpu/drm/i915 use of shmobjects).

> 
>   thread1: going through a big range, stops in the middle of a vma and
>      stores the restart address in vm_truncate_count.
> 
>   thread2: comes in with a small (e.g. single page) unmap request on
>      the same vma, somewhere before restart_address, finds that the
>      vma was already unmapped up to the restart address and happily
>      returns without doing anything.

We could probably hack something in cheaply to fix that part of it.

> 
> Another scenario would be two big unmap requests, both having to
> restart the unmapping and each one setting vm_truncate_count to its
> own value.  This could go on forever without any of them being able to
> finish.

But I don't know how to fix this part without proper serialization.

> 
> Truncate and hole punching already serialize with i_mutex.  Other
> callers of unmap_mapping_range() do not, and it's difficult to get
> i_mutex protection for all callers.  In particular ->d_revalidate(),
> which calls invalidate_inode_pages2_range() in fuse, may be called
> with or without i_mutex.
> 
> This patch adds a new mutex to 'struct address_space' to prevent
> running multiple concurrent unmap_mapping_range() on the same mapping.

Yes, I once had hopes to reuse i_alloc_sem for this purpose; but
it's taken outside of mmap_sem and this needs to be taken inside.

> 
> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
> Reported-by: Michael Leun <lkml20101129@newton.leun.net>
> Reported-by: Gurudas Pai <gurudas.pai@oracle.com>
> Tested-by: Gurudas Pai <gurudas.pai@oracle.com>

Acked-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org

I just tried again, and failed again, to come up with a better answer:
thanks for persisting, Miklos.

> ---
>  fs/gfs2/main.c     |    9 +--------
>  fs/inode.c         |   22 +++++++++++++++-------
>  fs/nilfs2/btnode.c |    5 -----
>  fs/nilfs2/btnode.h |    1 -
>  fs/nilfs2/mdt.c    |    4 ++--
>  fs/nilfs2/page.c   |   13 -------------
>  fs/nilfs2/page.h   |    1 -
>  fs/nilfs2/super.c  |    2 +-
>  include/linux/fs.h |    2 ++
>  mm/memory.c        |    2 ++
>  10 files changed, 23 insertions(+), 38 deletions(-)
> 
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c	2011-02-22 11:05:15.000000000 +0100
> +++ linux-2.6/mm/memory.c	2011-02-23 13:35:30.000000000 +0100
> @@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_
>  		details.last_index = ULONG_MAX;
>  	details.i_mmap_lock = &mapping->i_mmap_lock;
>  
> +	mutex_lock(&mapping->unmap_mutex);
>  	spin_lock(&mapping->i_mmap_lock);
>  
>  	/* Protect against endless unmapping loops */
> @@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_
>  	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
>  		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
>  	spin_unlock(&mapping->i_mmap_lock);
> +	mutex_unlock(&mapping->unmap_mutex);
>  }
>  EXPORT_SYMBOL(unmap_mapping_range);
>  
> Index: linux-2.6/fs/gfs2/main.c
> ===================================================================
> --- linux-2.6.orig/fs/gfs2/main.c	2011-02-22 11:05:15.000000000 +0100
> +++ linux-2.6/fs/gfs2/main.c	2011-02-23 13:35:30.000000000 +0100
> @@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(voi
>  	struct address_space *mapping = (struct address_space *)(gl + 1);
>  
>  	gfs2_init_glock_once(gl);
> -	memset(mapping, 0, sizeof(*mapping));
> -	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
> -	spin_lock_init(&mapping->tree_lock);
> -	spin_lock_init(&mapping->i_mmap_lock);
> -	INIT_LIST_HEAD(&mapping->private_list);
> -	spin_lock_init(&mapping->private_lock);
> -	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
> -	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
> +	address_space_init_once(mapping);
>  }
>  
>  /**
> Index: linux-2.6/fs/inode.c
> ===================================================================
> --- linux-2.6.orig/fs/inode.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/inode.c	2011-02-23 13:35:30.000000000 +0100
> @@ -295,6 +295,20 @@ static void destroy_inode(struct inode *
>  		call_rcu(&inode->i_rcu, i_callback);
>  }
>  
> +void address_space_init_once(struct address_space *mapping)
> +{
> +	memset(mapping, 0, sizeof(*mapping));
> +	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
> +	spin_lock_init(&mapping->tree_lock);
> +	spin_lock_init(&mapping->i_mmap_lock);
> +	INIT_LIST_HEAD(&mapping->private_list);
> +	spin_lock_init(&mapping->private_lock);
> +	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
> +	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
> +	mutex_init(&mapping->unmap_mutex);
> +}
> +EXPORT_SYMBOL(address_space_init_once);
> +
>  /*
>   * These are initializations that only need to be done
>   * once, because the fields are idempotent across use
> @@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode
>  	INIT_LIST_HEAD(&inode->i_devices);
>  	INIT_LIST_HEAD(&inode->i_wb_list);
>  	INIT_LIST_HEAD(&inode->i_lru);
> -	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
> -	spin_lock_init(&inode->i_data.tree_lock);
> -	spin_lock_init(&inode->i_data.i_mmap_lock);
> -	INIT_LIST_HEAD(&inode->i_data.private_list);
> -	spin_lock_init(&inode->i_data.private_lock);
> -	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
> -	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
> +	address_space_init_once(&inode->i_data);
>  	i_size_ordered_init(inode);
>  #ifdef CONFIG_FSNOTIFY
>  	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
> Index: linux-2.6/fs/nilfs2/btnode.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/btnode.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/btnode.c	2011-02-23 13:35:30.000000000 +0100
> @@ -35,11 +35,6 @@
>  #include "btnode.h"
>  
>  
> -void nilfs_btnode_cache_init_once(struct address_space *btnc)
> -{
> -	nilfs_mapping_init_once(btnc);
> -}
> -
>  static const struct address_space_operations def_btnode_aops = {
>  	.sync_page		= block_sync_page,
>  };
> Index: linux-2.6/fs/nilfs2/btnode.h
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/btnode.h	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/btnode.h	2011-02-23 13:35:30.000000000 +0100
> @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
>  	struct buffer_head *newbh;
>  };
>  
> -void nilfs_btnode_cache_init_once(struct address_space *);
>  void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
>  void nilfs_btnode_cache_clear(struct address_space *);
>  struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
> Index: linux-2.6/fs/nilfs2/mdt.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/mdt.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/mdt.c	2011-02-23 13:35:30.000000000 +0100
> @@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct in
>  	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
>  
>  	INIT_LIST_HEAD(&shadow->frozen_buffers);
> -	nilfs_mapping_init_once(&shadow->frozen_data);
> +	address_space_init_once(&shadow->frozen_data);
>  	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
> -	nilfs_mapping_init_once(&shadow->frozen_btnodes);
> +	address_space_init_once(&shadow->frozen_btnodes);
>  	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
>  	mi->mi_shadow = shadow;
>  	return 0;
> Index: linux-2.6/fs/nilfs2/page.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/page.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/page.c	2011-02-23 13:35:30.000000000 +0100
> @@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(
>  	return nc;
>  }
>  
> -void nilfs_mapping_init_once(struct address_space *mapping)
> -{
> -	memset(mapping, 0, sizeof(*mapping));
> -	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
> -	spin_lock_init(&mapping->tree_lock);
> -	INIT_LIST_HEAD(&mapping->private_list);
> -	spin_lock_init(&mapping->private_lock);
> -
> -	spin_lock_init(&mapping->i_mmap_lock);
> -	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
> -	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
> -}
> -
>  void nilfs_mapping_init(struct address_space *mapping,
>  			struct backing_dev_info *bdi,
>  			const struct address_space_operations *aops)
> Index: linux-2.6/fs/nilfs2/page.h
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/page.h	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/page.h	2011-02-23 13:35:30.000000000 +0100
> @@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page
>  int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
>  void nilfs_copy_back_pages(struct address_space *, struct address_space *);
>  void nilfs_clear_dirty_pages(struct address_space *);
> -void nilfs_mapping_init_once(struct address_space *mapping);
>  void nilfs_mapping_init(struct address_space *mapping,
>  			struct backing_dev_info *bdi,
>  			const struct address_space_operations *aops);
> Index: linux-2.6/fs/nilfs2/super.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/super.c	2011-02-07 17:05:19.000000000 +0100
> +++ linux-2.6/fs/nilfs2/super.c	2011-02-23 13:35:30.000000000 +0100
> @@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *
>  #ifdef CONFIG_NILFS_XATTR
>  	init_rwsem(&ii->xattr_sem);
>  #endif
> -	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
> +	address_space_init_once(&ii->i_btnode_cache);
>  	ii->i_bmap = &ii->i_bmap_data;
>  	inode_init_once(&ii->vfs_inode);
>  }
> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h	2011-02-22 11:04:39.000000000 +0100
> +++ linux-2.6/include/linux/fs.h	2011-02-23 13:35:30.000000000 +0100
> @@ -649,6 +649,7 @@ struct address_space {
>  	spinlock_t		private_lock;	/* for use by the address_space */
>  	struct list_head	private_list;	/* ditto */
>  	struct address_space	*assoc_mapping;	/* ditto */
> +	struct mutex		unmap_mutex;    /* to protect unmapping */
>  } __attribute__((aligned(sizeof(long))));
>  	/*
>  	 * On most architectures that alignment is already the case; but
> @@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *fi
>  
>  extern int inode_init_always(struct super_block *, struct inode *);
>  extern void inode_init_once(struct inode *);
> +extern void address_space_init_once(struct address_space *mapping);
>  extern void ihold(struct inode * inode);
>  extern void iput(struct inode *);
>  extern struct inode * igrab(struct inode *);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH]  mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-23 22:20   ` Hugh Dickins
  0 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2011-02-23 22:20 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, torvalds, hch, gurudas.pai, lkml20101129, rjw, florian,
	trond.myklebust, maciej.rutecki, linux-kernel, linux-mm

On Wed, 23 Feb 2011, Miklos Szeredi wrote:

> Linus, Andrew,
> 
> This resolves Bug 25822 listed in the regressions since 2.6.36 (though
> it's a bug much older than that, for some reason it only started
> triggering for people recently).
> 
> Summary of discussion of below patch by Hugh:
> 
>    An executive summary of the thread that followed would be that hch
>    dislikes the bloat, as we all do, but Miklos and I don't see a
>    decent alternative (and hch has not proposed one).
> 
> Please consider this for 2.6.38.
> 
> Thanks,
> Miklos
> 
> ----
> Subject: mm: prevent concurrent unmap_mapping_range() on the same inode
> 
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Michael Leun reported that running parallel opens on a fuse filesystem
> can trigger a "kernel BUG at mm/truncate.c:475"
> 
> Gurudas Pai reported the same bug on NFS.
> 
> The reason is, unmap_mapping_range() is not prepared for more than
> one concurrent invocation per inode.  For example:

Yes, at the time I did that preemptible restart stuff 6 years ago,
i_mutex was always held by callers of unmap_mapping_range(); and
I built that in as an assumption, without ever enforcing it with
a BUG to check.

From that very time exceptions have been added, some with their own
serialization, some with none, so that now it's all too messy to fix
without a lead-in time for weeding out and trying (with uncertain
success) to rework its usage in miscellaneous filesystems (including
fuse, nfs, spufs, others and gpu/drm/i915 use of shm objects).

> 
>   thread1: going through a big range, stops in the middle of a vma and
>      stores the restart address in vm_truncate_count.
> 
>   thread2: comes in with a small (e.g. single page) unmap request on
>      the same vma, somewhere before restart_address, finds that the
>      vma was already unmapped up to the restart address and happily
>      returns without doing anything.

We could probably hack something in cheaply to fix that part of it.

> 
> Another scenario would be two big unmap requests, both having to
> restart the unmapping and each one setting vm_truncate_count to its
> own value.  This could go on forever without any of them being able to
> finish.

But I don't know how to fix this part without proper serialization.

> 
> Truncate and hole punching already serialize with i_mutex.  Other
> callers of unmap_mapping_range() do not, and it's difficult to get
> i_mutex protection for all callers.  In particular ->d_revalidate(),
> which calls invalidate_inode_pages2_range() in fuse, may be called
> with or without i_mutex.
> 
> This patch adds a new mutex to 'struct address_space' to prevent
> running multiple concurrent unmap_mapping_range() on the same mapping.

Yes, I once had hopes to reuse i_alloc_sem for this purpose; but
it's taken outside of mmap_sem and this needs to be taken inside.

> 
> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
> Reported-by: Michael Leun <lkml20101129@newton.leun.net>
> Reported-by: Gurudas Pai <gurudas.pai@oracle.com>
> Tested-by: Gurudas Pai <gurudas.pai@oracle.com>

Acked-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org

I just tried again, and failed again, to come up with a better answer:
thanks for persisting, Miklos.

> ---
>  fs/gfs2/main.c     |    9 +--------
>  fs/inode.c         |   22 +++++++++++++++-------
>  fs/nilfs2/btnode.c |    5 -----
>  fs/nilfs2/btnode.h |    1 -
>  fs/nilfs2/mdt.c    |    4 ++--
>  fs/nilfs2/page.c   |   13 -------------
>  fs/nilfs2/page.h   |    1 -
>  fs/nilfs2/super.c  |    2 +-
>  include/linux/fs.h |    2 ++
>  mm/memory.c        |    2 ++
>  10 files changed, 23 insertions(+), 38 deletions(-)
> 
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c	2011-02-22 11:05:15.000000000 +0100
> +++ linux-2.6/mm/memory.c	2011-02-23 13:35:30.000000000 +0100
> @@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_
>  		details.last_index = ULONG_MAX;
>  	details.i_mmap_lock = &mapping->i_mmap_lock;
>  
> +	mutex_lock(&mapping->unmap_mutex);
>  	spin_lock(&mapping->i_mmap_lock);
>  
>  	/* Protect against endless unmapping loops */
> @@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_
>  	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
>  		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
>  	spin_unlock(&mapping->i_mmap_lock);
> +	mutex_unlock(&mapping->unmap_mutex);
>  }
>  EXPORT_SYMBOL(unmap_mapping_range);
>  
> Index: linux-2.6/fs/gfs2/main.c
> ===================================================================
> --- linux-2.6.orig/fs/gfs2/main.c	2011-02-22 11:05:15.000000000 +0100
> +++ linux-2.6/fs/gfs2/main.c	2011-02-23 13:35:30.000000000 +0100
> @@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(voi
>  	struct address_space *mapping = (struct address_space *)(gl + 1);
>  
>  	gfs2_init_glock_once(gl);
> -	memset(mapping, 0, sizeof(*mapping));
> -	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
> -	spin_lock_init(&mapping->tree_lock);
> -	spin_lock_init(&mapping->i_mmap_lock);
> -	INIT_LIST_HEAD(&mapping->private_list);
> -	spin_lock_init(&mapping->private_lock);
> -	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
> -	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
> +	address_space_init_once(mapping);
>  }
>  
>  /**
> Index: linux-2.6/fs/inode.c
> ===================================================================
> --- linux-2.6.orig/fs/inode.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/inode.c	2011-02-23 13:35:30.000000000 +0100
> @@ -295,6 +295,20 @@ static void destroy_inode(struct inode *
>  		call_rcu(&inode->i_rcu, i_callback);
>  }
>  
> +void address_space_init_once(struct address_space *mapping)
> +{
> +	memset(mapping, 0, sizeof(*mapping));
> +	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
> +	spin_lock_init(&mapping->tree_lock);
> +	spin_lock_init(&mapping->i_mmap_lock);
> +	INIT_LIST_HEAD(&mapping->private_list);
> +	spin_lock_init(&mapping->private_lock);
> +	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
> +	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
> +	mutex_init(&mapping->unmap_mutex);
> +}
> +EXPORT_SYMBOL(address_space_init_once);
> +
>  /*
>   * These are initializations that only need to be done
>   * once, because the fields are idempotent across use
> @@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode
>  	INIT_LIST_HEAD(&inode->i_devices);
>  	INIT_LIST_HEAD(&inode->i_wb_list);
>  	INIT_LIST_HEAD(&inode->i_lru);
> -	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
> -	spin_lock_init(&inode->i_data.tree_lock);
> -	spin_lock_init(&inode->i_data.i_mmap_lock);
> -	INIT_LIST_HEAD(&inode->i_data.private_list);
> -	spin_lock_init(&inode->i_data.private_lock);
> -	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
> -	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
> +	address_space_init_once(&inode->i_data);
>  	i_size_ordered_init(inode);
>  #ifdef CONFIG_FSNOTIFY
>  	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
> Index: linux-2.6/fs/nilfs2/btnode.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/btnode.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/btnode.c	2011-02-23 13:35:30.000000000 +0100
> @@ -35,11 +35,6 @@
>  #include "btnode.h"
>  
>  
> -void nilfs_btnode_cache_init_once(struct address_space *btnc)
> -{
> -	nilfs_mapping_init_once(btnc);
> -}
> -
>  static const struct address_space_operations def_btnode_aops = {
>  	.sync_page		= block_sync_page,
>  };
> Index: linux-2.6/fs/nilfs2/btnode.h
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/btnode.h	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/btnode.h	2011-02-23 13:35:30.000000000 +0100
> @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
>  	struct buffer_head *newbh;
>  };
>  
> -void nilfs_btnode_cache_init_once(struct address_space *);
>  void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
>  void nilfs_btnode_cache_clear(struct address_space *);
>  struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
> Index: linux-2.6/fs/nilfs2/mdt.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/mdt.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/mdt.c	2011-02-23 13:35:30.000000000 +0100
> @@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct in
>  	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
>  
>  	INIT_LIST_HEAD(&shadow->frozen_buffers);
> -	nilfs_mapping_init_once(&shadow->frozen_data);
> +	address_space_init_once(&shadow->frozen_data);
>  	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
> -	nilfs_mapping_init_once(&shadow->frozen_btnodes);
> +	address_space_init_once(&shadow->frozen_btnodes);
>  	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
>  	mi->mi_shadow = shadow;
>  	return 0;
> Index: linux-2.6/fs/nilfs2/page.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/page.c	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/page.c	2011-02-23 13:35:30.000000000 +0100
> @@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(
>  	return nc;
>  }
>  
> -void nilfs_mapping_init_once(struct address_space *mapping)
> -{
> -	memset(mapping, 0, sizeof(*mapping));
> -	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
> -	spin_lock_init(&mapping->tree_lock);
> -	INIT_LIST_HEAD(&mapping->private_list);
> -	spin_lock_init(&mapping->private_lock);
> -
> -	spin_lock_init(&mapping->i_mmap_lock);
> -	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
> -	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
> -}
> -
>  void nilfs_mapping_init(struct address_space *mapping,
>  			struct backing_dev_info *bdi,
>  			const struct address_space_operations *aops)
> Index: linux-2.6/fs/nilfs2/page.h
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/page.h	2011-01-20 13:28:34.000000000 +0100
> +++ linux-2.6/fs/nilfs2/page.h	2011-02-23 13:35:30.000000000 +0100
> @@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page
>  int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
>  void nilfs_copy_back_pages(struct address_space *, struct address_space *);
>  void nilfs_clear_dirty_pages(struct address_space *);
> -void nilfs_mapping_init_once(struct address_space *mapping);
>  void nilfs_mapping_init(struct address_space *mapping,
>  			struct backing_dev_info *bdi,
>  			const struct address_space_operations *aops);
> Index: linux-2.6/fs/nilfs2/super.c
> ===================================================================
> --- linux-2.6.orig/fs/nilfs2/super.c	2011-02-07 17:05:19.000000000 +0100
> +++ linux-2.6/fs/nilfs2/super.c	2011-02-23 13:35:30.000000000 +0100
> @@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *
>  #ifdef CONFIG_NILFS_XATTR
>  	init_rwsem(&ii->xattr_sem);
>  #endif
> -	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
> +	address_space_init_once(&ii->i_btnode_cache);
>  	ii->i_bmap = &ii->i_bmap_data;
>  	inode_init_once(&ii->vfs_inode);
>  }
> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h	2011-02-22 11:04:39.000000000 +0100
> +++ linux-2.6/include/linux/fs.h	2011-02-23 13:35:30.000000000 +0100
> @@ -649,6 +649,7 @@ struct address_space {
>  	spinlock_t		private_lock;	/* for use by the address_space */
>  	struct list_head	private_list;	/* ditto */
>  	struct address_space	*assoc_mapping;	/* ditto */
> +	struct mutex		unmap_mutex;    /* to protect unmapping */
>  } __attribute__((aligned(sizeof(long))));
>  	/*
>  	 * On most architectures that alignment is already the case; but
> @@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *fi
>  
>  extern int inode_init_always(struct super_block *, struct inode *);
>  extern void inode_init_once(struct inode *);
> +extern void address_space_init_once(struct address_space *mapping);
>  extern void ihold(struct inode * inode);
>  extern void iput(struct inode *);
>  extern struct inode * igrab(struct inode *);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH]  mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-23 12:49 ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-02-23 12:49 UTC (permalink / raw)
  To: akpm, torvalds
  Cc: hughd, gurudas.pai, lkml20101129, rjw, florian, trond.myklebust,
	maciej.rutecki, linux-kernel, linux-mm

Linus, Andrew,

This resolves Bug 25822 listed in the regressions since 2.6.36 (though
it's a bug much older than that, for some reason it only started
triggering for people recently).

Summary of discussion of below patch by Hugh:

   An executive summary of the thread that followed would be that hch
   dislikes the bloat, as we all do, but Miklos and I don't see a
   decent alternative (and hch has not proposed one).

Please consider this for 2.6.38.

Thanks,
Miklos

----
Subject: mm: prevent concurrent unmap_mapping_range() on the same inode

From: Miklos Szeredi <mszeredi@suse.cz>

Michael Leun reported that running parallel opens on a fuse filesystem
can trigger a "kernel BUG at mm/truncate.c:475"

Gurudas Pai reported the same bug on NFS.

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.  For example:

  thread1: going through a big range, stops in the middle of a vma and
     stores the restart address in vm_truncate_count.

  thread2: comes in with a small (e.g. single page) unmap request on
     the same vma, somewhere before restart_address, finds that the
     vma was already unmapped up to the restart address and happily
     returns without doing anything.

Another scenario would be two big unmap requests, both having to
restart the unmapping and each one setting vm_truncate_count to its
own value.  This could go on forever without any of them being able to
finish.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to 'struct address_space' to prevent
running multiple concurrent unmap_mapping_range() on the same mapping.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Reported-by: Gurudas Pai <gurudas.pai@oracle.com>
Tested-by: Gurudas Pai <gurudas.pai@oracle.com>
---
 fs/gfs2/main.c     |    9 +--------
 fs/inode.c         |   22 +++++++++++++++-------
 fs/nilfs2/btnode.c |    5 -----
 fs/nilfs2/btnode.h |    1 -
 fs/nilfs2/mdt.c    |    4 ++--
 fs/nilfs2/page.c   |   13 -------------
 fs/nilfs2/page.h   |    1 -
 fs/nilfs2/super.c  |    2 +-
 include/linux/fs.h |    2 ++
 mm/memory.c        |    2 ++
 10 files changed, 23 insertions(+), 38 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2011-02-22 11:05:15.000000000 +0100
+++ linux-2.6/mm/memory.c	2011-02-23 13:35:30.000000000 +0100
@@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_
 		details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
Index: linux-2.6/fs/gfs2/main.c
===================================================================
--- linux-2.6.orig/fs/gfs2/main.c	2011-02-22 11:05:15.000000000 +0100
+++ linux-2.6/fs/gfs2/main.c	2011-02-23 13:35:30.000000000 +0100
@@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(voi
 	struct address_space *mapping = (struct address_space *)(gl + 1);
 
 	gfs2_init_glock_once(gl);
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	address_space_init_once(mapping);
 }
 
 /**
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/inode.c	2011-02-23 13:35:30.000000000 +0100
@@ -295,6 +295,20 @@ static void destroy_inode(struct inode *
 		call_rcu(&inode->i_rcu, i_callback);
 }
 
+void address_space_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	mutex_init(&mapping->unmap_mutex);
+}
+EXPORT_SYMBOL(address_space_init_once);
+
 /*
  * These are initializations that only need to be done
  * once, because the fields are idempotent across use
@@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_wb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
-	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
-	INIT_LIST_HEAD(&inode->i_data.private_list);
-	spin_lock_init(&inode->i_data.private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
+	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
Index: linux-2.6/fs/nilfs2/btnode.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.c	2011-02-23 13:35:30.000000000 +0100
@@ -35,11 +35,6 @@
 #include "btnode.h"
 
 
-void nilfs_btnode_cache_init_once(struct address_space *btnc)
-{
-	nilfs_mapping_init_once(btnc);
-}
-
 static const struct address_space_operations def_btnode_aops = {
 	.sync_page		= block_sync_page,
 };
Index: linux-2.6/fs/nilfs2/btnode.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.h	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.h	2011-02-23 13:35:30.000000000 +0100
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
 	struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
Index: linux-2.6/fs/nilfs2/mdt.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/mdt.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/mdt.c	2011-02-23 13:35:30.000000000 +0100
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct in
 	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
-	nilfs_mapping_init_once(&shadow->frozen_data);
+	address_space_init_once(&shadow->frozen_data);
 	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
-	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	address_space_init_once(&shadow->frozen_btnodes);
 	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
 	mi->mi_shadow = shadow;
 	return 0;
Index: linux-2.6/fs/nilfs2/page.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.c	2011-02-23 13:35:30.000000000 +0100
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(
 	return nc;
 }
 
-void nilfs_mapping_init_once(struct address_space *mapping)
-{
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-}
-
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops)
Index: linux-2.6/fs/nilfs2/page.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.h	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.h	2011-02-23 13:35:30.000000000 +0100
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops);
Index: linux-2.6/fs/nilfs2/super.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/super.c	2011-02-07 17:05:19.000000000 +0100
+++ linux-2.6/fs/nilfs2/super.c	2011-02-23 13:35:30.000000000 +0100
@@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *
 #ifdef CONFIG_NILFS_XATTR
 	init_rwsem(&ii->xattr_sem);
 #endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	address_space_init_once(&ii->i_btnode_cache);
 	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2011-02-22 11:04:39.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2011-02-23 13:35:30.000000000 +0100
@@ -649,6 +649,7 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct mutex		unmap_mutex;    /* to protect unmapping */
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *fi
 
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void address_space_init_once(struct address_space *mapping);
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH]  mm: prevent concurrent unmap_mapping_range() on the same inode
@ 2011-02-23 12:49 ` Miklos Szeredi
  0 siblings, 0 replies; 28+ messages in thread
From: Miklos Szeredi @ 2011-02-23 12:49 UTC (permalink / raw)
  To: akpm, torvalds
  Cc: hughd, gurudas.pai, lkml20101129, rjw, florian, trond.myklebust,
	maciej.rutecki, linux-kernel, linux-mm

Linus, Andrew,

This resolves Bug 25822 listed in the regressions since 2.6.36 (though
it's a bug much older than that, for some reason it only started
triggering for people recently).

Summary of discussion of below patch by Hugh:

   An executive summary of the thread that followed would be that hch
   dislikes the bloat, as we all do, but Miklos and I don't see a
   decent alternative (and hch has not proposed one).

Please consider this for 2.6.38.

Thanks,
Miklos

----
Subject: mm: prevent concurrent unmap_mapping_range() on the same inode

From: Miklos Szeredi <mszeredi@suse.cz>

Michael Leun reported that running parallel opens on a fuse filesystem
can trigger a "kernel BUG at mm/truncate.c:475"

Gurudas Pai reported the same bug on NFS.

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.  For example:

  thread1: going through a big range, stops in the middle of a vma and
     stores the restart address in vm_truncate_count.

  thread2: comes in with a small (e.g. single page) unmap request on
     the same vma, somewhere before restart_address, finds that the
     vma was already unmapped up to the restart address and happily
     returns without doing anything.

Another scenario would be two big unmap requests, both having to
restart the unmapping and each one setting vm_truncate_count to its
own value.  This could go on forever without any of them being able to
finish.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to 'struct address_space' to prevent
running multiple concurrent unmap_mapping_range() on the same mapping.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Reported-by: Gurudas Pai <gurudas.pai@oracle.com>
Tested-by: Gurudas Pai <gurudas.pai@oracle.com>
---
 fs/gfs2/main.c     |    9 +--------
 fs/inode.c         |   22 +++++++++++++++-------
 fs/nilfs2/btnode.c |    5 -----
 fs/nilfs2/btnode.h |    1 -
 fs/nilfs2/mdt.c    |    4 ++--
 fs/nilfs2/page.c   |   13 -------------
 fs/nilfs2/page.h   |    1 -
 fs/nilfs2/super.c  |    2 +-
 include/linux/fs.h |    2 ++
 mm/memory.c        |    2 ++
 10 files changed, 23 insertions(+), 38 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2011-02-22 11:05:15.000000000 +0100
+++ linux-2.6/mm/memory.c	2011-02-23 13:35:30.000000000 +0100
@@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_
 		details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
Index: linux-2.6/fs/gfs2/main.c
===================================================================
--- linux-2.6.orig/fs/gfs2/main.c	2011-02-22 11:05:15.000000000 +0100
+++ linux-2.6/fs/gfs2/main.c	2011-02-23 13:35:30.000000000 +0100
@@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(voi
 	struct address_space *mapping = (struct address_space *)(gl + 1);
 
 	gfs2_init_glock_once(gl);
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	address_space_init_once(mapping);
 }
 
 /**
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/inode.c	2011-02-23 13:35:30.000000000 +0100
@@ -295,6 +295,20 @@ static void destroy_inode(struct inode *
 		call_rcu(&inode->i_rcu, i_callback);
 }
 
+void address_space_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	mutex_init(&mapping->unmap_mutex);
+}
+EXPORT_SYMBOL(address_space_init_once);
+
 /*
  * These are initializations that only need to be done
  * once, because the fields are idempotent across use
@@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_wb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
-	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
-	INIT_LIST_HEAD(&inode->i_data.private_list);
-	spin_lock_init(&inode->i_data.private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
+	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
Index: linux-2.6/fs/nilfs2/btnode.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.c	2011-02-23 13:35:30.000000000 +0100
@@ -35,11 +35,6 @@
 #include "btnode.h"
 
 
-void nilfs_btnode_cache_init_once(struct address_space *btnc)
-{
-	nilfs_mapping_init_once(btnc);
-}
-
 static const struct address_space_operations def_btnode_aops = {
 	.sync_page		= block_sync_page,
 };
Index: linux-2.6/fs/nilfs2/btnode.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/btnode.h	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/btnode.h	2011-02-23 13:35:30.000000000 +0100
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
 	struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
Index: linux-2.6/fs/nilfs2/mdt.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/mdt.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/mdt.c	2011-02-23 13:35:30.000000000 +0100
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct in
 	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
-	nilfs_mapping_init_once(&shadow->frozen_data);
+	address_space_init_once(&shadow->frozen_data);
 	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
-	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	address_space_init_once(&shadow->frozen_btnodes);
 	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
 	mi->mi_shadow = shadow;
 	return 0;
Index: linux-2.6/fs/nilfs2/page.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.c	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.c	2011-02-23 13:35:30.000000000 +0100
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(
 	return nc;
 }
 
-void nilfs_mapping_init_once(struct address_space *mapping)
-{
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-}
-
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops)
Index: linux-2.6/fs/nilfs2/page.h
===================================================================
--- linux-2.6.orig/fs/nilfs2/page.h	2011-01-20 13:28:34.000000000 +0100
+++ linux-2.6/fs/nilfs2/page.h	2011-02-23 13:35:30.000000000 +0100
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops);
Index: linux-2.6/fs/nilfs2/super.c
===================================================================
--- linux-2.6.orig/fs/nilfs2/super.c	2011-02-07 17:05:19.000000000 +0100
+++ linux-2.6/fs/nilfs2/super.c	2011-02-23 13:35:30.000000000 +0100
@@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *
 #ifdef CONFIG_NILFS_XATTR
 	init_rwsem(&ii->xattr_sem);
 #endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	address_space_init_once(&ii->i_btnode_cache);
 	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2011-02-22 11:04:39.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2011-02-23 13:35:30.000000000 +0100
@@ -649,6 +649,7 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct mutex		unmap_mutex;    /* to protect unmapping */
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *fi
 
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void address_space_init_once(struct address_space *mapping);
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2011-03-02  9:48 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-20 12:30 [PATCH] mm: prevent concurrent unmap_mapping_range() on the same inode Miklos Szeredi
2011-01-20 12:30 ` Miklos Szeredi
2011-01-20 12:40 ` Christoph Hellwig
2011-01-20 12:40   ` Christoph Hellwig
2011-01-20 14:13   ` Miklos Szeredi
2011-01-20 14:13     ` Miklos Szeredi
2011-01-22  4:46     ` Hugh Dickins
2011-01-22  4:46       ` Hugh Dickins
2011-01-24 19:47       ` Miklos Szeredi
2011-01-24 19:47         ` Miklos Szeredi
2011-01-27  4:19         ` Hugh Dickins
2011-01-27  4:19           ` Hugh Dickins
2011-02-08 10:30           ` Miklos Szeredi
2011-02-08 10:30             ` Miklos Szeredi
2011-02-08 11:52             ` Gurudas Pai
2011-02-08 11:52               ` Gurudas Pai
2011-02-08 11:59               ` Miklos Szeredi
2011-02-08 11:59                 ` Miklos Szeredi
2011-02-23 12:49 Miklos Szeredi
2011-02-23 12:49 ` Miklos Szeredi
2011-02-23 22:20 ` Hugh Dickins
2011-02-23 22:20   ` Hugh Dickins
2011-02-23 22:33 ` Linus Torvalds
2011-02-23 22:33   ` Linus Torvalds
2011-02-23 23:12   ` Hugh Dickins
2011-02-23 23:12     ` Hugh Dickins
2011-03-02  9:48     ` Peter Zijlstra
2011-03-02  9:48       ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.