linux-kernel.vger.kernel.org archive mirror
* [PATCH, RFC 0/2] Recover some scalability for DAX
@ 2015-08-10 15:14 Kirill A. Shutemov
  2015-08-10 15:14 ` [PATCH, RFC 1/2] lib: Implement range locks Kirill A. Shutemov
  2015-08-10 15:14 ` [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock Kirill A. Shutemov
  0 siblings, 2 replies; 18+ messages in thread
From: Kirill A. Shutemov @ 2015-08-10 15:14 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox
  Cc: linux-mm, linux-fsdevel, linux-kernel, Davidlohr Bueso, Jan Kara,
	Kirill A. Shutemov

Hi all,

Currently, i_mmap_lock is a huge bottleneck for DAX scalability, as we use it
in place of lock_page().

This patchset tries to recover some scalability by introducing a per-mapping
range lock. The range lock itself was implemented by Jan Kara on top of an
interval tree. It is not exactly cheap, but it should scale better than the
exclusive i_mmap_lock.
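
For illustration, a caller is expected to use it roughly like this (a sketch
based on the API in patch 1 and the per-mapping tree added in patch 2; 'first'
and 'last' are placeholder page offsets and 'mapping' is the file's
address_space):

	struct range_lock rl;

	/* lock page offsets [first, last] of the mapping, inclusive */
	range_lock_init(&rl, first, last);
	range_lock(&mapping->mapping_lock, &rl);	/* may sleep until older
							   conflicting ranges
							   are unlocked */

	/* ... fault or truncate work against that range ... */

	range_unlock(&mapping->mapping_lock, &rl);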

Any comments?

Jan Kara (1):
  lib: Implement range locks

Kirill A. Shutemov (1):
  dax: use range_lock instead of i_mmap_lock

 drivers/gpu/drm/Kconfig      |  1 -
 drivers/gpu/drm/i915/Kconfig |  1 -
 fs/dax.c                     | 30 +++++++++--------
 fs/inode.c                   |  1 +
 include/linux/fs.h           |  2 ++
 include/linux/range_lock.h   | 51 +++++++++++++++++++++++++++++
 lib/Kconfig                  | 14 --------
 lib/Kconfig.debug            |  1 -
 lib/Makefile                 |  3 +-
 lib/range_lock.c             | 78 ++++++++++++++++++++++++++++++++++++++++++++
 mm/memory.c                  | 35 +++++++++++++-------
 mm/rmap.c                    |  1 +
 12 files changed, 174 insertions(+), 44 deletions(-)
 create mode 100644 include/linux/range_lock.h
 create mode 100644 lib/range_lock.c

-- 
2.5.0



* [PATCH, RFC 1/2] lib: Implement range locks
  2015-08-10 15:14 [PATCH, RFC 0/2] Recover some scalability for DAX Kirill A. Shutemov
@ 2015-08-10 15:14 ` Kirill A. Shutemov
  2015-08-10 15:14 ` [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock Kirill A. Shutemov
  1 sibling, 0 replies; 18+ messages in thread
From: Kirill A. Shutemov @ 2015-08-10 15:14 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox
  Cc: linux-mm, linux-fsdevel, linux-kernel, Davidlohr Bueso, Jan Kara,
	Kirill A. Shutemov

From: Jan Kara <jack@suse.cz>

Implement range locking using interval tree.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 drivers/gpu/drm/Kconfig      |  1 -
 drivers/gpu/drm/i915/Kconfig |  1 -
 include/linux/range_lock.h   | 51 +++++++++++++++++++++++++++++
 lib/Kconfig                  | 14 --------
 lib/Kconfig.debug            |  1 -
 lib/Makefile                 |  3 +-
 lib/range_lock.c             | 78 ++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 130 insertions(+), 19 deletions(-)
 create mode 100644 include/linux/range_lock.h
 create mode 100644 lib/range_lock.c

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 06ae5008c5ed..1e02f73891fd 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -130,7 +130,6 @@ config DRM_RADEON
 	select POWER_SUPPLY
 	select HWMON
 	select BACKLIGHT_CLASS_DEVICE
-	select INTERVAL_TREE
 	help
 	  Choose this option if you have an ATI Radeon graphics card.  There
 	  are both PCI and AGP versions.  You don't need to choose this to
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index eb87e2538861..c76525f8cbe9 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -5,7 +5,6 @@ config DRM_I915
 	depends on (AGP || AGP=n)
 	select INTEL_GTT
 	select AGP_INTEL if AGP
-	select INTERVAL_TREE
 	# we need shmfs for the swappable backing store, and in particular
 	# the shmem_readpage() which depends upon tmpfs
 	select SHMEM
diff --git a/include/linux/range_lock.h b/include/linux/range_lock.h
new file mode 100644
index 000000000000..fe258a599676
--- /dev/null
+++ b/include/linux/range_lock.h
@@ -0,0 +1,51 @@
+/*
+ * Range locking
+ *
+ * We allow exclusive locking of arbitrary ranges. We guarantee that each
+ * range is locked only after all conflicting range locks requested previously
+ * have been unlocked. Thus we achieve fairness and avoid livelocks.
+ *
+ * The cost of lock and unlock of a range is O(log(R_all)+R_int) where R_all is
+ * total number of ranges and R_int is the number of ranges intersecting the
+ * operated range.
+ */
+#ifndef _LINUX_RANGE_LOCK_H
+#define _LINUX_RANGE_LOCK_H
+
+#include <linux/rbtree.h>
+#include <linux/interval_tree.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+
+struct task_struct;
+
+struct range_lock {
+	struct interval_tree_node node;
+	struct task_struct *task;
+	/* Number of ranges which are blocking acquisition of the lock */
+	unsigned int blocking_ranges;
+};
+
+struct range_lock_tree {
+	struct rb_root root;
+	spinlock_t lock;
+};
+
+#define RANGE_LOCK_INITIALIZER(start, end) {\
+	.node = {\
+		.start = (start),\
+		.last = (end)\
+	}\
+}
+
+static inline void range_lock_tree_init(struct range_lock_tree *tree)
+{
+	tree->root = RB_ROOT;
+	spin_lock_init(&tree->lock);
+}
+void range_lock_init(struct range_lock *lock, unsigned long start,
+		     unsigned long end);
+void range_lock(struct range_lock_tree *tree, struct range_lock *lock);
+void range_unlock(struct range_lock_tree *tree, struct range_lock *lock);
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index a4766fee0017..29802dfd51de 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -355,20 +355,6 @@ config TEXTSEARCH_FSM
 config BTREE
 	bool
 
-config INTERVAL_TREE
-	bool
-	help
-	  Simple, embeddable, interval-tree. Can find the start of an
-	  overlapping range in log(n) time and then iterate over all
-	  overlapping nodes. The algorithm is implemented as an
-	  augmented rbtree.
-
-	  See:
-
-		Documentation/rbtree.txt
-
-	  for more information.
-
 config ASSOCIATIVE_ARRAY
 	bool
 	help
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f7dd8f1d4075..deb14201b3c1 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1643,7 +1643,6 @@ config RBTREE_TEST
 config INTERVAL_TREE_TEST
 	tristate "Interval tree test"
 	depends on m && DEBUG_KERNEL
-	select INTERVAL_TREE
 	help
 	  A benchmark measuring the performance of the interval tree library
 
diff --git a/lib/Makefile b/lib/Makefile
index 51e1d761f0b9..7eafc7567306 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o argv_split.o \
 	 proportions.o flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o
+	 earlycpio.o seq_buf.o nmi_backtrace.o interval_tree.o range_lock.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
@@ -63,7 +63,6 @@ CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
-obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
 obj-$(CONFIG_ASSOCIATIVE_ARRAY) += assoc_array.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 obj-$(CONFIG_DEBUG_LIST) += list_debug.o
diff --git a/lib/range_lock.c b/lib/range_lock.c
new file mode 100644
index 000000000000..1cb119ba6d1a
--- /dev/null
+++ b/lib/range_lock.c
@@ -0,0 +1,78 @@
+/*
+ * Implementation of range locks.
+ *
+ * We keep interval tree of locked and to-be-locked ranges. When new range lock
+ * is requested, we add its interval to the tree and store number of intervals
+ * intersecting it to 'blocking_ranges'.
+ *
+ * When a range is unlocked, we again walk intervals that intersect with the
+ * unlocked one and decrement their 'blocking_ranges'.  We wake up owner of any
+ * range lock whose 'blocking_ranges' drops to 0.
+ */
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/interval_tree.h>
+#include <linux/spinlock.h>
+#include <linux/range_lock.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+
+void range_lock_init(struct range_lock *lock, unsigned long start,
+		     unsigned long end)
+{
+	lock->node.start = start;
+	lock->node.last = end;
+	RB_CLEAR_NODE(&lock->node.rb);
+	lock->blocking_ranges = 0;
+}
+EXPORT_SYMBOL(range_lock_init);
+
+void range_lock(struct range_lock_tree *tree, struct range_lock *lock)
+{
+	struct interval_tree_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tree->lock, flags);
+	node = interval_tree_iter_first(&tree->root, lock->node.start,
+					lock->node.last);
+	while (node) {
+		lock->blocking_ranges++;
+		node = interval_tree_iter_next(node, lock->node.start,
+					       lock->node.last);
+	}
+	interval_tree_insert(&lock->node, &tree->root);
+	/* Do we need to go to sleep? */
+	while (lock->blocking_ranges) {
+		lock->task = current;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock_irqrestore(&tree->lock, flags);
+		schedule();
+		spin_lock_irqsave(&tree->lock, flags);
+	}
+	spin_unlock_irqrestore(&tree->lock, flags);
+}
+EXPORT_SYMBOL(range_lock);
+
+static void range_lock_unblock(struct range_lock *lock)
+{
+	if (!--lock->blocking_ranges)
+		wake_up_process(lock->task);
+}
+
+void range_unlock(struct range_lock_tree *tree, struct range_lock *lock)
+{
+	struct interval_tree_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tree->lock, flags);
+	interval_tree_remove(&lock->node, &tree->root);
+	node = interval_tree_iter_first(&tree->root, lock->node.start,
+					lock->node.last);
+	while (node) {
+		range_lock_unblock((struct range_lock *)node);
+		node = interval_tree_iter_next(node, lock->node.start,
+					       lock->node.last);
+	}
+	spin_unlock_irqrestore(&tree->lock, flags);
+}
+EXPORT_SYMBOL(range_unlock);
-- 
2.5.0



* [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-10 15:14 [PATCH, RFC 0/2] Recover some scalability for DAX Kirill A. Shutemov
  2015-08-10 15:14 ` [PATCH, RFC 1/2] lib: Implement range locks Kirill A. Shutemov
@ 2015-08-10 15:14 ` Kirill A. Shutemov
  2015-08-11  8:19   ` Jan Kara
  1 sibling, 1 reply; 18+ messages in thread
From: Kirill A. Shutemov @ 2015-08-10 15:14 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox
  Cc: linux-mm, linux-fsdevel, linux-kernel, Davidlohr Bueso, Jan Kara,
	Kirill A. Shutemov

As we don't have struct pages for DAX memory, Matthew had to find a
replacement for lock_page() to avoid fault vs. truncate races;
i_mmap_lock was used for that.

Recently, Matthew had to convert the locking to exclusive mode to address
fault vs. fault races, and this kills scalability completely.

The patch below tries to recover some scalability for DAX by introducing a
per-mapping range lock.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 fs/dax.c           | 30 +++++++++++++++++-------------
 fs/inode.c         |  1 +
 include/linux/fs.h |  2 ++
 mm/memory.c        | 35 +++++++++++++++++++++++------------
 mm/rmap.c          |  1 +
 5 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ed54efedade6..27a68eca698e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -333,6 +333,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head bh;
+	struct range_lock mapping_lock;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	unsigned blkbits = inode->i_blkbits;
 	sector_t block;
@@ -348,6 +349,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
 	bh.b_size = PAGE_SIZE;
 
+	/* do_cow_fault() takes the lock */
+	if (!vmf->cow_page) {
+		range_lock_init(&mapping_lock, vmf->pgoff, vmf->pgoff);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
  repeat:
 	page = find_get_page(mapping, vmf->pgoff);
 	if (page) {
@@ -369,8 +375,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			error = -EIO;
 			goto unlock;
 		}
-	} else {
-		i_mmap_lock_write(mapping);
 	}
 
 	error = get_block(inode, block, &bh, 0);
@@ -390,8 +394,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			if (error)
 				goto unlock;
 		} else {
-			i_mmap_unlock_write(mapping);
-			return dax_load_hole(mapping, page, vmf);
+			error =  dax_load_hole(mapping, page, vmf);
+			range_unlock(&mapping->mapping_lock, &mapping_lock);
+			return error;
 		}
 	}
 
@@ -446,9 +451,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
 
-	if (!page)
-		i_mmap_unlock_write(mapping);
  out:
+	if (!vmf->cow_page)
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
@@ -460,10 +465,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
-	} else {
-		i_mmap_unlock_write(mapping);
 	}
-
 	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -510,6 +512,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	struct buffer_head bh;
+	struct range_lock mapping_lock;
 	unsigned blkbits = inode->i_blkbits;
 	unsigned long pmd_addr = address & PMD_MASK;
 	bool write = flags & FAULT_FLAG_WRITE;
@@ -541,7 +544,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	i_mmap_lock_write(mapping);
+	range_lock_init(&mapping_lock, pgoff, pgoff + HPAGE_PMD_NR - 1);
+	range_lock(&mapping->mapping_lock, &mapping_lock);
 	length = get_block(inode, block, &bh, write);
 	if (length)
 		return VM_FAULT_SIGBUS;
@@ -568,9 +572,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 * zero pages covering this hole
 	 */
 	if (buffer_new(&bh)) {
-		i_mmap_unlock_write(mapping);
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_write(mapping);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
 	}
 
 	/*
@@ -624,7 +628,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (buffer_unwritten(&bh))
 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
-	i_mmap_unlock_write(mapping);
+	range_unlock(&mapping->mapping_lock, &mapping_lock);
 
 	return result;
 
diff --git a/fs/inode.c b/fs/inode.c
index e560535706ff..6a24144d679f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -343,6 +343,7 @@ void address_space_init_once(struct address_space *mapping)
 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 	spin_lock_init(&mapping->tree_lock);
 	init_rwsem(&mapping->i_mmap_rwsem);
+	range_lock_tree_init(&mapping->mapping_lock);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b6361e2e2a26..368e7208d4f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -30,6 +30,7 @@
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
+#include <linux/range_lock.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -429,6 +430,7 @@ struct address_space {
 	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
+	struct range_lock_tree	mapping_lock;	/* lock_page() replacement for DAX */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
 	unsigned long		nrshadows;	/* number of shadow entries */
diff --git a/mm/memory.c b/mm/memory.c
index 7f6a9563d5a6..b4898db7e4c4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2409,6 +2409,7 @@ void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
 	struct zap_details details;
+	struct range_lock mapping_lock;
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
 	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
@@ -2426,10 +2427,17 @@ void unmap_mapping_range(struct address_space *mapping,
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
+	if (IS_DAX(mapping->host)) {
+		/* Exclude fault under us */
+		range_lock_init(&mapping_lock, hba, hba + hlen - 1);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
 	i_mmap_lock_write(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	i_mmap_unlock_write(mapping);
+	if (IS_DAX(mapping->host))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -2978,6 +2986,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct range_lock mapping_lock;
 	struct page *fault_page, *new_page;
 	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
@@ -2996,6 +3006,15 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	}
 
+	if (IS_DAX(file_inode(vma->vm_file))) {
+		/*
+		 * The fault handler has no page to lock, so it holds
+		 * mapping->mapping_lock to protect against truncate.
+		 */
+		range_lock_init(&mapping_lock, pgoff, pgoff);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
+
 	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		goto uncharge_out;
@@ -3010,12 +3029,6 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (fault_page) {
 			unlock_page(fault_page);
 			page_cache_release(fault_page);
-		} else {
-			/*
-			 * The fault handler has no page to lock, so it holds
-			 * i_mmap_lock for write to protect against truncate.
-			 */
-			i_mmap_unlock_write(vma->vm_file->f_mapping);
 		}
 		goto uncharge_out;
 	}
@@ -3026,15 +3039,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (fault_page) {
 		unlock_page(fault_page);
 		page_cache_release(fault_page);
-	} else {
-		/*
-		 * The fault handler has no page to lock, so it holds
-		 * i_mmap_lock for write to protect against truncate.
-		 */
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
 	}
+	if (IS_DAX(file_inode(vma->vm_file)))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	return ret;
 uncharge_out:
+	if (IS_DAX(file_inode(vma->vm_file)))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	mem_cgroup_cancel_charge(new_page, memcg);
 	page_cache_release(new_page);
 	return ret;
diff --git a/mm/rmap.c b/mm/rmap.c
index dcaad464aab0..3d509326d354 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -22,6 +22,7 @@
  *
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
  *   mm->mmap_sem
+ *    mapping->mapping_lock
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_rwsem
  *         anon_vma->rwsem
-- 
2.5.0



* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-10 15:14 ` [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock Kirill A. Shutemov
@ 2015-08-11  8:19   ` Jan Kara
  2015-08-11  9:37     ` Dave Chinner
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2015-08-11  8:19 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Matthew Wilcox, linux-mm, linux-fsdevel,
	linux-kernel, Davidlohr Bueso, Jan Kara

On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
> As we don't have struct pages for DAX memory, Matthew had to find an
> replacement for lock_page() to avoid fault vs. truncate races.
> i_mmap_lock was used for that.
> 
> Recently, Matthew had to convert locking to exclusive to address fault
> vs. fault races. And this kills scalability completely.
> 
> The patch below tries to recover some scalability for DAX by introducing
> per-mapping range lock.

So this noticeably grows struct address_space (by 3 longs, if I'm right) and
thus struct inode, just for DAX. That looks like a waste but I don't see an
easy solution.

OTOH filesystems in normal mode might want to use the range lock as well to
provide truncate / punch hole vs page fault exclusion (XFS already has a
private rwsem for this and ext4 needs something as well) and at that point
growing generic struct inode would be acceptable for me.

My grand plan was to use the range lock to also simplify locking rules for
read, write and esp. direct IO, but that has issues with mmap_sem ordering
because filesystems get called under mmap_sem in the page fault path. So
probably just fixing the worst issue, punch hole vs page fault, would be
good for now.

Also for a patch set like this, it would be good to show some numbers - how
big a hit do you take in the single-threaded case (the lock is more
expensive) and how much scalability do you get in the multithreaded case?

								Honza

> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  fs/dax.c           | 30 +++++++++++++++++-------------
>  fs/inode.c         |  1 +
>  include/linux/fs.h |  2 ++
>  mm/memory.c        | 35 +++++++++++++++++++++++------------
>  mm/rmap.c          |  1 +
>  5 files changed, 44 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index ed54efedade6..27a68eca698e 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -333,6 +333,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	struct inode *inode = mapping->host;
>  	struct page *page;
>  	struct buffer_head bh;
> +	struct range_lock mapping_lock;
>  	unsigned long vaddr = (unsigned long)vmf->virtual_address;
>  	unsigned blkbits = inode->i_blkbits;
>  	sector_t block;
> @@ -348,6 +349,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
>  	bh.b_size = PAGE_SIZE;
>  
> +	/* do_cow_fault() takes the lock */
> +	if (!vmf->cow_page) {
> +		range_lock_init(&mapping_lock, vmf->pgoff, vmf->pgoff);
> +		range_lock(&mapping->mapping_lock, &mapping_lock);
> +	}
>   repeat:
>  	page = find_get_page(mapping, vmf->pgoff);
>  	if (page) {
> @@ -369,8 +375,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			error = -EIO;
>  			goto unlock;
>  		}
> -	} else {
> -		i_mmap_lock_write(mapping);
>  	}
>  
>  	error = get_block(inode, block, &bh, 0);
> @@ -390,8 +394,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			if (error)
>  				goto unlock;
>  		} else {
> -			i_mmap_unlock_write(mapping);
> -			return dax_load_hole(mapping, page, vmf);
> +			error =  dax_load_hole(mapping, page, vmf);
> +			range_unlock(&mapping->mapping_lock, &mapping_lock);
> +			return error;
>  		}
>  	}
>  
> @@ -446,9 +451,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
>  	}
>  
> -	if (!page)
> -		i_mmap_unlock_write(mapping);
>   out:
> +	if (!vmf->cow_page)
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  	if (error == -ENOMEM)
>  		return VM_FAULT_OOM | major;
>  	/* -EBUSY is fine, somebody else faulted on the same PTE */
> @@ -460,10 +465,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	if (page) {
>  		unlock_page(page);
>  		page_cache_release(page);
> -	} else {
> -		i_mmap_unlock_write(mapping);
>  	}
> -
>  	goto out;
>  }
>  EXPORT_SYMBOL(__dax_fault);
> @@ -510,6 +512,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	struct address_space *mapping = file->f_mapping;
>  	struct inode *inode = mapping->host;
>  	struct buffer_head bh;
> +	struct range_lock mapping_lock;
>  	unsigned blkbits = inode->i_blkbits;
>  	unsigned long pmd_addr = address & PMD_MASK;
>  	bool write = flags & FAULT_FLAG_WRITE;
> @@ -541,7 +544,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
>  
>  	bh.b_size = PMD_SIZE;
> -	i_mmap_lock_write(mapping);
> +	range_lock_init(&mapping_lock, pgoff, pgoff + HPAGE_PMD_NR - 1);
> +	range_lock(&mapping->mapping_lock, &mapping_lock);
>  	length = get_block(inode, block, &bh, write);
>  	if (length)
>  		return VM_FAULT_SIGBUS;
> @@ -568,9 +572,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	 * zero pages covering this hole
>  	 */
>  	if (buffer_new(&bh)) {
> -		i_mmap_unlock_write(mapping);
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
> -		i_mmap_lock_write(mapping);
> +		range_lock(&mapping->mapping_lock, &mapping_lock);
>  	}
>  
>  	/*
> @@ -624,7 +628,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	if (buffer_unwritten(&bh))
>  		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
>  
> -	i_mmap_unlock_write(mapping);
> +	range_unlock(&mapping->mapping_lock, &mapping_lock);
>  
>  	return result;
>  
> diff --git a/fs/inode.c b/fs/inode.c
> index e560535706ff..6a24144d679f 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -343,6 +343,7 @@ void address_space_init_once(struct address_space *mapping)
>  	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
>  	spin_lock_init(&mapping->tree_lock);
>  	init_rwsem(&mapping->i_mmap_rwsem);
> +	range_lock_tree_init(&mapping->mapping_lock);
>  	INIT_LIST_HEAD(&mapping->private_list);
>  	spin_lock_init(&mapping->private_lock);
>  	mapping->i_mmap = RB_ROOT;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b6361e2e2a26..368e7208d4f2 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -30,6 +30,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/percpu-rwsem.h>
>  #include <linux/blk_types.h>
> +#include <linux/range_lock.h>
>  
>  #include <asm/byteorder.h>
>  #include <uapi/linux/fs.h>
> @@ -429,6 +430,7 @@ struct address_space {
>  	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
>  	struct rb_root		i_mmap;		/* tree of private and shared mappings */
>  	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
> +	struct range_lock_tree	mapping_lock;	/* lock_page() replacement for DAX */
>  	/* Protected by tree_lock together with the radix tree */
>  	unsigned long		nrpages;	/* number of total pages */
>  	unsigned long		nrshadows;	/* number of shadow entries */
> diff --git a/mm/memory.c b/mm/memory.c
> index 7f6a9563d5a6..b4898db7e4c4 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2409,6 +2409,7 @@ void unmap_mapping_range(struct address_space *mapping,
>  		loff_t const holebegin, loff_t const holelen, int even_cows)
>  {
>  	struct zap_details details;
> +	struct range_lock mapping_lock;
>  	pgoff_t hba = holebegin >> PAGE_SHIFT;
>  	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
>  
> @@ -2426,10 +2427,17 @@ void unmap_mapping_range(struct address_space *mapping,
>  	if (details.last_index < details.first_index)
>  		details.last_index = ULONG_MAX;
>  
> +	if (IS_DAX(mapping->host)) {
> +		/* Exclude fault under us */
> +		range_lock_init(&mapping_lock, hba, hba + hlen - 1);
> +		range_lock(&mapping->mapping_lock, &mapping_lock);
> +	}
>  	i_mmap_lock_write(mapping);
>  	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
>  		unmap_mapping_range_tree(&mapping->i_mmap, &details);
>  	i_mmap_unlock_write(mapping);
> +	if (IS_DAX(mapping->host))
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  }
>  EXPORT_SYMBOL(unmap_mapping_range);
>  
> @@ -2978,6 +2986,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		unsigned long address, pmd_t *pmd,
>  		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
>  {
> +	struct address_space *mapping = vma->vm_file->f_mapping;
> +	struct range_lock mapping_lock;
>  	struct page *fault_page, *new_page;
>  	struct mem_cgroup *memcg;
>  	spinlock_t *ptl;
> @@ -2996,6 +3006,15 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		return VM_FAULT_OOM;
>  	}
>  
> +	if (IS_DAX(file_inode(vma->vm_file))) {
> +		/*
> +		 * The fault handler has no page to lock, so it holds
> +		 * mapping->mapping_lock to protect against truncate.
> +		 */
> +		range_lock_init(&mapping_lock, pgoff, pgoff);
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
> +	}
> +
>  	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
>  		goto uncharge_out;
> @@ -3010,12 +3029,6 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		if (fault_page) {
>  			unlock_page(fault_page);
>  			page_cache_release(fault_page);
> -		} else {
> -			/*
> -			 * The fault handler has no page to lock, so it holds
> -			 * i_mmap_lock for write to protect against truncate.
> -			 */
> -			i_mmap_unlock_write(vma->vm_file->f_mapping);
>  		}
>  		goto uncharge_out;
>  	}
> @@ -3026,15 +3039,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	if (fault_page) {
>  		unlock_page(fault_page);
>  		page_cache_release(fault_page);
> -	} else {
> -		/*
> -		 * The fault handler has no page to lock, so it holds
> -		 * i_mmap_lock for write to protect against truncate.
> -		 */
> -		i_mmap_unlock_write(vma->vm_file->f_mapping);
>  	}
> +	if (IS_DAX(file_inode(vma->vm_file)))
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  	return ret;
>  uncharge_out:
> +	if (IS_DAX(file_inode(vma->vm_file)))
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  	mem_cgroup_cancel_charge(new_page, memcg);
>  	page_cache_release(new_page);
>  	return ret;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index dcaad464aab0..3d509326d354 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -22,6 +22,7 @@
>   *
>   * inode->i_mutex	(while writing or truncating, not reading or faulting)
>   *   mm->mmap_sem
> + *    mapping->mapping_lock
>   *     page->flags PG_locked (lock_page)
>   *       mapping->i_mmap_rwsem
>   *         anon_vma->rwsem
> -- 
> 2.5.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11  8:19   ` Jan Kara
@ 2015-08-11  9:37     ` Dave Chinner
  2015-08-11 11:09       ` Boaz Harrosh
                         ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Dave Chinner @ 2015-08-11  9:37 UTC (permalink / raw)
  To: Jan Kara
  Cc: Kirill A. Shutemov, Andrew Morton, Matthew Wilcox, linux-mm,
	linux-fsdevel, linux-kernel, Davidlohr Bueso

On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote:
> On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
> > As we don't have struct pages for DAX memory, Matthew had to find an
> > replacement for lock_page() to avoid fault vs. truncate races.
> > i_mmap_lock was used for that.
> > 
> > Recently, Matthew had to convert locking to exclusive to address fault
> > vs. fault races. And this kills scalability completely.

I'm assuming this locking change is in a for-next git tree branch
somewhere as there isn't anything that I can see in a 4.2-rc6
tree. Can you point me to the git tree that has these changes in it?

> > The patch below tries to recover some scalability for DAX by introducing
> > per-mapping range lock.
> 
> So this grows noticeably (3 longs if I'm right) struct address_space and
> thus struct inode just for DAX. That looks like a waste but I don't see an
> easy solution.
> 
> OTOH filesystems in normal mode might want to use the range lock as well to
> provide truncate / punch hole vs page fault exclusion (XFS already has a
> private rwsem for this and ext4 needs something as well) and at that point
> growing generic struct inode would be acceptable for me.

It sounds to me like the way DAX has tried to solve this race is the
wrong direction. We really need to drive the truncate/page fault
serialisation higher up the stack towards the filesystem, not deeper
into the mm subsystem where locking is greatly limited.

As Jan mentions, we already have this serialisation in XFS, and I
think it would be a better first step to replicate that locking model
in each filesystem that supports DAX. I think this is a better
direction because it moves towards solving a whole class of problems
filesystems face with page fault serialisation, not just for DAX.

> My grand plan was to use the range lock to also simplify locking
> rules for read, write and esp. direct IO but that has issues with
> mmap_sem ordering because filesystems get called under mmap_sem in
> page fault path. So probably just fixing the worst issue with
> punch hole vs page fault would be good for now.

Right, I think adding a rwsem to the ext4 inode to handle the
fault/truncate serialisation similar to XFS would be sufficient to
allow DAX to remove the i_mmap_lock serialisation...

> Also for a patch set like this, it would be good to show some numbers - how
> big hit do you take in the single-threaded case (the lock is more
> expensive) and how much scalability do you get in the multithreaded case?

And also, if you remove the serialisation and run the test on XFS,
what do you get in terms of performance and correctness?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11  9:37     ` Dave Chinner
@ 2015-08-11 11:09       ` Boaz Harrosh
  2015-08-11 12:03       ` Kirill A. Shutemov
  2015-08-11 13:50       ` Jan Kara
  2 siblings, 0 replies; 18+ messages in thread
From: Boaz Harrosh @ 2015-08-11 11:09 UTC (permalink / raw)
  To: Dave Chinner, Jan Kara
  Cc: Kirill A. Shutemov, Andrew Morton, Matthew Wilcox, linux-mm,
	linux-fsdevel, linux-kernel, Davidlohr Bueso

On 08/11/2015 12:37 PM, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote:
>> On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
>>> As we don't have struct pages for DAX memory, Matthew had to find an
>>> replacement for lock_page() to avoid fault vs. truncate races.
>>> i_mmap_lock was used for that.
>>>
>>> Recently, Matthew had to convert locking to exclusive to address fault
>>> vs. fault races. And this kills scalability completely.
> 
> I'm assuming this locking change is in a for-next git tree branch
> somewhere as there isn't anything that I can see in a 4.2-rc6
> tree. Can you point me to the git tree that has these changes in it?
> 
>>> The patch below tries to recover some scalability for DAX by introducing
>>> per-mapping range lock.
>>
>> So this grows noticeably (3 longs if I'm right) struct address_space and
>> thus struct inode just for DAX. That looks like a waste but I don't see an
>> easy solution.
>>
>> OTOH filesystems in normal mode might want to use the range lock as well to
>> provide truncate / punch hole vs page fault exclusion (XFS already has a
>> private rwsem for this and ext4 needs something as well) and at that point
>> growing generic struct inode would be acceptable for me.
> 
> It sounds to me like the way DAX has tried to solve this race is the
> wrong direction. We really need to drive the truncate/page fault
> serialisation higher up the stack towards the filesystem, not deeper
> into the mm subsystem where locking is greatly limited.
> 
> As Jan mentions, we already have this serialisation in XFS, and I
> think it would be better first step to replicate that locking model
> in each filesystem that is supports DAX. I think this is a better
> direction because it moves towards solving a whole class of problems
> fileystem face with page fault serialisation, not just for DAX.
> 
>> My grand plan was to use the range lock to also simplify locking
>> rules for read, write and esp. direct IO but that has issues with
>> mmap_sem ordering because filesystems get called under mmap_sem in
>> page fault path. So probably just fixing the worst issue with
>> punch hole vs page fault would be good for now.
> 
> Right, I think adding a rwsem to the ext4 inode to handle the
> fault/truncate serialisation similar to XFS would be sufficient to
> allow DAX to remove the i_mmap_lock serialisation...
> 
>> Also for a patch set like this, it would be good to show some numbers - how
>> big hit do you take in the single-threaded case (the lock is more
>> expensive) and how much scalability do you get in the multithreaded case?
> 
> And also, if you remove the serialisation and run the test on XFS,
> what do you get in terms of performance and correctness?
> 
> Cheers,
> 

Cheers indeed

It is very easy to serialise at the FS level, which has control over the
truncate path as well. It is much harder and uglier on the mm side.

Currently there is DAX for xfs, ext2 and ext4. With xfs's implementation
I think removing the lock completely will just work, if I read the code
correctly.

With ext2/4 you can at worst add the struct range_lock_tree to their private
inode structures, and again only if DAX is configured, and use that in the
fault wrappers. But I suspect there should be an easier way once at the
ext2/4 code level, with no need for generalization.

I think the xfs case, of "no locks needed at all", calls for removing the
locks at the mm/dax level and moving them up to the FS.

> Dave.
> 

Thanks
Boaz



* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11  9:37     ` Dave Chinner
  2015-08-11 11:09       ` Boaz Harrosh
@ 2015-08-11 12:03       ` Kirill A. Shutemov
  2015-08-11 13:50       ` Jan Kara
  2 siblings, 0 replies; 18+ messages in thread
From: Kirill A. Shutemov @ 2015-08-11 12:03 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Kirill A. Shutemov, Andrew Morton, Matthew Wilcox,
	linux-mm, linux-fsdevel, linux-kernel, Davidlohr Bueso

On Tue, Aug 11, 2015 at 07:37:08PM +1000, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote:
> > On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
> > > As we don't have struct pages for DAX memory, Matthew had to find an
> > > replacement for lock_page() to avoid fault vs. truncate races.
> > > i_mmap_lock was used for that.
> > > 
> > > Recently, Matthew had to convert locking to exclusive to address fault
> > > vs. fault races. And this kills scalability completely.
> 
> I'm assuming this locking change is in a for-next git tree branch
> somewhere as there isn't anything that I can see in a 4.2-rc6
> tree. Can you point me to the git tree that has these changes in it?

It's in the -mm tree. See e4261a3ed000 in mhocko/mm.git[1]. There are also two
fixups for that commit[2][3].

[1] git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
[2] http://lkml.kernel.org/g/1438948423-128882-1-git-send-email-kirill.shutemov@linux.intel.com
[3] http://lkml.kernel.org/g/1438948482-129043-1-git-send-email-kirill.shutemov@linux.intel.com

> > > The patch below tries to recover some scalability for DAX by introducing
> > > per-mapping range lock.
> > 
> > So this grows noticeably (3 longs if I'm right) struct address_space and
> > thus struct inode just for DAX. That looks like a waste but I don't see an
> > easy solution.

We can try to convert it to a pointer instead of embedding it into
struct address_space, and make filesystems allocate it when they set S_DAX.
Would that be better?
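
Roughly something like this (just a sketch; freeing on inode eviction and the
non-DAX case are omitted):

	/* in struct address_space, instead of the embedded tree */
	struct range_lock_tree *mapping_lock;	/* allocated only for DAX inodes */

	/* in the filesystem, when it sets S_DAX on the inode */
	mapping->mapping_lock = kzalloc(sizeof(*mapping->mapping_lock), GFP_KERNEL);
	if (!mapping->mapping_lock)
		return -ENOMEM;
	range_lock_tree_init(mapping->mapping_lock);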

> > OTOH filesystems in normal mode might want to use the range lock as well to
> > provide truncate / punch hole vs page fault exclusion (XFS already has a
> > private rwsem for this and ext4 needs something as well) and at that point
> > growing generic struct inode would be acceptable for me.
> 
> It sounds to me like the way DAX has tried to solve this race is the
> wrong direction. We really need to drive the truncate/page fault
> serialisation higher up the stack towards the filesystem, not deeper
> into the mm subsystem where locking is greatly limited.

My understanding of fs locking is very limited, but I think we have a
problem with this approach in the fault path: to dispatch a page fault
properly we need to take mmap_sem and find the VMA. Only after that do we
have a chance to obtain any fs lock. And that's the place where we take the
page lock which this range_lock intends to replace.
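
Roughly, the call chain looks like this (simplified, x86 side shown; other
architectures differ only in the arch part):

	__do_page_fault()		/* arch code: takes mmap_sem, find_vma() */
	  handle_mm_fault()
	    __handle_mm_fault()
	      handle_pte_fault()
	        do_fault()		/* do_read_fault() / do_cow_fault() / ... */
	          __do_fault()
	            vma->vm_ops->fault()	/* first point where the fs sees the fault */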

I don't see how we can move any lock to serialize truncate vs. fault much
higher.

> As Jan mentions, we already have this serialisation in XFS, and I
> think it would be better first step to replicate that locking model
> in each filesystem that is supports DAX. I think this is a better
> direction because it moves towards solving a whole class of problems
> fileystem face with page fault serialisation, not just for DAX.

Could you point me to that lock and the locking rules for it?
 
> > My grand plan was to use the range lock to also simplify locking
> > rules for read, write and esp. direct IO but that has issues with
> > mmap_sem ordering because filesystems get called under mmap_sem in
> > page fault path. So probably just fixing the worst issue with
> > punch hole vs page fault would be good for now.
> 
> Right, I think adding a rwsem to the ext4 inode to handle the
> fault/truncate serialisation similar to XFS would be sufficient to
> allow DAX to remove the i_mmap_lock serialisation...
> 
> > Also for a patch set like this, it would be good to show some numbers - how
> > big hit do you take in the single-threaded case (the lock is more
> > expensive) and how much scalability do you get in the multithreaded case?
> 
> And also, if you remove the serialisation and run the test on XFS,
> what do you get in terms of performance and correctness?

I'll talk with Matthew on numbers.

As we don't have any HW yet, the only numbers we can possibly provide are
for DAX over DRAM.

-- 
 Kirill A. Shutemov


* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11  9:37     ` Dave Chinner
  2015-08-11 11:09       ` Boaz Harrosh
  2015-08-11 12:03       ` Kirill A. Shutemov
@ 2015-08-11 13:50       ` Jan Kara
  2015-08-11 14:31         ` Boaz Harrosh
  2 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2015-08-11 13:50 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Kirill A. Shutemov, Andrew Morton, Matthew Wilcox,
	linux-mm, linux-fsdevel, linux-kernel, Davidlohr Bueso

On Tue 11-08-15 19:37:08, Dave Chinner wrote:
> > > The patch below tries to recover some scalability for DAX by introducing
> > > per-mapping range lock.
> > 
> > So this grows noticeably (3 longs if I'm right) struct address_space and
> > thus struct inode just for DAX. That looks like a waste but I don't see an
> > easy solution.
> > 
> > OTOH filesystems in normal mode might want to use the range lock as well to
> > provide truncate / punch hole vs page fault exclusion (XFS already has a
> > private rwsem for this and ext4 needs something as well) and at that point
> > growing generic struct inode would be acceptable for me.
> 
> It sounds to me like the way DAX has tried to solve this race is the
> wrong direction. We really need to drive the truncate/page fault
> serialisation higher up the stack towards the filesystem, not deeper
> into the mm subsystem where locking is greatly limited.
> 
> As Jan mentions, we already have this serialisation in XFS, and I
> think it would be better first step to replicate that locking model
> in each filesystem that is supports DAX. I think this is a better
> direction because it moves towards solving a whole class of problems
> fileystem face with page fault serialisation, not just for DAX.

Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
fault / page_mkwrite callback so it doesn't provide the exclusion necessary
for DAX which needs exclusive access to the page given range in the page
cache. And replacing i_mmap_lock with an fs-private mmap rwsem is a moot
exercise (at least from the DAX POV).
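
For reference, a simplified sketch of what XFS does today (from memory, so
check the code): the fault side takes the lock shared, truncate / punch hole
take it exclusive:

	/* fs/xfs/xfs_file.c: ->fault / ->page_mkwrite, simplified */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = filemap_fault(vma, vmf);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	/* xfs_setattr_size() / hole punching, simplified */
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	truncate_setsize(inode, newsize);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);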

So regardless whether the lock will be a fs-private one or in
address_space, DAX needs something like the range lock Kirill suggested.
Having the range lock in fs-private part of inode has the advantage that
only filesystems supporting DAX / punch hole will pay the memory overhead.
OTOH most major filesystems need it so the savings would be IMO noticeable
only for tiny systems using special fs etc. So I'm undecided whether
putting the lock in address_space and doing the locking in generic
pagefault / truncate helpers is a better choice or not.
 
								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 13:50       ` Jan Kara
@ 2015-08-11 14:31         ` Boaz Harrosh
  2015-08-11 15:28           ` Kirill A. Shutemov
  2015-08-11 16:51           ` Wilcox, Matthew R
  0 siblings, 2 replies; 18+ messages in thread
From: Boaz Harrosh @ 2015-08-11 14:31 UTC (permalink / raw)
  To: Jan Kara, Dave Chinner
  Cc: Kirill A. Shutemov, Andrew Morton, Matthew Wilcox, linux-mm,
	linux-fsdevel, linux-kernel, Davidlohr Bueso

On 08/11/2015 04:50 PM, Jan Kara wrote:
> On Tue 11-08-15 19:37:08, Dave Chinner wrote:
>>>> The patch below tries to recover some scalability for DAX by introducing
>>>> per-mapping range lock.
>>>
>>> So this grows noticeably (3 longs if I'm right) struct address_space and
>>> thus struct inode just for DAX. That looks like a waste but I don't see an
>>> easy solution.
>>>
>>> OTOH filesystems in normal mode might want to use the range lock as well to
>>> provide truncate / punch hole vs page fault exclusion (XFS already has a
>>> private rwsem for this and ext4 needs something as well) and at that point
>>> growing generic struct inode would be acceptable for me.
>>
>> It sounds to me like the way DAX has tried to solve this race is the
>> wrong direction. We really need to drive the truncate/page fault
>> serialisation higher up the stack towards the filesystem, not deeper
>> into the mm subsystem where locking is greatly limited.
>>
>> As Jan mentions, we already have this serialisation in XFS, and I
>> think it would be better first step to replicate that locking model
>> in each filesystem that is supports DAX. I think this is a better
>> direction because it moves towards solving a whole class of problems
>> fileystem face with page fault serialisation, not just for DAX.
> 
> Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
> fault / page_mkwrite callback so it doesn't provide the exclusion necessary
> for DAX which needs exclusive access to the page given range in the page
> cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot
> excercise (at least from DAX POV).
> 

Hi Jan. So you got me confused above. You say:
	"DAX which needs exclusive access to the page given range in the page cache"

but DAX and the page cache are mutually exclusive. I guess you meant the VMA
range, or the inode->mapping range (which one is it?)

Actually I do not understand this race you guys found at all. (Please bear
with me, sorry for being slow.)

If two threads of the same VMA fault on the same pte
(I'm not sure what you call it; I mean a single 4k entry in each VMA's page table),
then the mm knows how to handle this just fine.

If two processes, i.e. two VMAs, fault on the same inode->mapping, then an
inode-wide lock like XFS's to protect against i_size changes / truncate is
more than enough.
Because with DAX there is no inode->mapping "mapping" at all. You have the call
into the FS with get_block() to replace "holes" (zero pages) with real allocated
blocks on WRITE faults, but this conversion should already be protected inside
the FS. Then there is the atomic exchange of the PTE, which is fine.
(And vice versa with hole mappings and writes.)

There is no global "mapping" radix-tree shared between VMAs like we are
used to.

Please explain to me the races you are seeing. I would love to also see them
with xfs. I think they should not happen there.

> So regardless whether the lock will be a fs-private one or in
> address_space, DAX needs something like the range lock Kirill suggested.
> Having the range lock in fs-private part of inode has the advantage that
> only filesystems supporting DAX / punch hole will pay the memory overhead.
> OTOH most major filesystems need it so the savings would be IMO noticeable

punch-hole is truncate for me. With the xfs model of read-write lock where
truncate takes write, any fault taking read before executing the fault looks
good for the FS side of things. I guess you mean the optimization of the
radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.

Please explain. Do you have a reproducer of this race? I would need to understand
this.

Thanks
Boaz

> only for tiny systems using special fs etc. So I'm undecided whether
> putting the lock in address_space and doing the locking in generic
> pagefault / truncate helpers is a better choice or not.
>  
> 								Honza
> 



* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 14:31         ` Boaz Harrosh
@ 2015-08-11 15:28           ` Kirill A. Shutemov
  2015-08-11 16:17             ` Boaz Harrosh
  2015-08-11 16:51           ` Wilcox, Matthew R
  1 sibling, 1 reply; 18+ messages in thread
From: Kirill A. Shutemov @ 2015-08-11 15:28 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Jan Kara, Dave Chinner, Kirill A. Shutemov, Andrew Morton,
	Matthew Wilcox, linux-mm, linux-fsdevel, linux-kernel,
	Davidlohr Bueso, Theodore Ts'o

On Tue, Aug 11, 2015 at 05:31:04PM +0300, Boaz Harrosh wrote:
> On 08/11/2015 04:50 PM, Jan Kara wrote:
> > On Tue 11-08-15 19:37:08, Dave Chinner wrote:
> >>>> The patch below tries to recover some scalability for DAX by introducing
> >>>> per-mapping range lock.
> >>>
> >>> So this grows noticeably (3 longs if I'm right) struct address_space and
> >>> thus struct inode just for DAX. That looks like a waste but I don't see an
> >>> easy solution.
> >>>
> >>> OTOH filesystems in normal mode might want to use the range lock as well to
> >>> provide truncate / punch hole vs page fault exclusion (XFS already has a
> >>> private rwsem for this and ext4 needs something as well) and at that point
> >>> growing generic struct inode would be acceptable for me.
> >>
> >> It sounds to me like the way DAX has tried to solve this race is the
> >> wrong direction. We really need to drive the truncate/page fault
> >> serialisation higher up the stack towards the filesystem, not deeper
> >> into the mm subsystem where locking is greatly limited.
> >>
> >> As Jan mentions, we already have this serialisation in XFS, and I
> >> think it would be better first step to replicate that locking model
> >> in each filesystem that is supports DAX. I think this is a better
> >> direction because it moves towards solving a whole class of problems
> >> fileystem face with page fault serialisation, not just for DAX.
> > 
> > Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
> > fault / page_mkwrite callback so it doesn't provide the exclusion necessary
> > for DAX which needs exclusive access to the page given range in the page
> > cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot
> > excercise (at least from DAX POV).
> > 
> 
> Hi Jan. So you got me confused above. You say:
> 	"DAX which needs exclusive access to the page given range in the page cache"
> 
> but DAX and page-cache are mutually exclusive. I guess you meant the VMA
> range, or the inode->mapping range (which one is it)

The second -- pgoff range within the inode->mapping.

> Actually I do not understand this race you guys found at all. (Please bear with
> me sorry for being slow)
> 
> If two threads of the same VMA fault on the same pte
> (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
> then the mm knows how to handle this just fine.

It does. But only if we have struct page. See lock_page_or_retry() in
filemap_fault(). Without lock_page() it's problematic.

> If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
> wide lock like XFS's to protect against i_size-change / truncate is more than
> enough.

We also used lock_page() to make sure we shoot out all pages as we don't
exclude page faults during truncate. Consider this race:

	<fault>			<truncate>
	get_block
	check i_size
    				update i_size
				unmap
	setup pte

With a normal page cache we make sure that all pages beyond i_size are
dropped, using lock_page() in truncate_inode_pages_range().

For DAX we need a way to stop all page faults to the pgoff range before
doing unmap.
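
With the per-mapping range lock the same interleaving would be serialized
roughly like this (a sketch; both sides lock the affected pgoff range as in
this patch):

	<fault>			<truncate>
	range_lock(pgoff)
	get_block
	check i_size
				range_lock(pgoff)	/* blocked */
	setup pte
	range_unlock(pgoff)
				update i_size
				unmap
				range_unlock(pgoff)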

> Because with DAX there is no inode->mapping "mapping" at all. You have the call
> into the FS with get_block() to replace "holes" (zero pages) with real allocated
> blocks, on WRITE faults, but this conversion should be protected inside the FS
> already. Then there is the atomic exchange of the PTE which is fine.
> (And vis versa with holes mapping and writes)

Having unmap_mapping_range() in PMD fault handling is very unfortunate.
Going to rmap just to solve a page fault is very wrong.
BTW, we need to do it in the write path too.

I'm not convinced that all this "let's avoid backing storage allocation"
in the DAX code is not a layering violation. I think the right place to solve
this is the filesystem, and we have almost all the required handles for this
in place. We only need to change the vm_ops->page_mkwrite() interface to be
able to return a different page than what was given on input.

> > So regardless whether the lock will be a fs-private one or in
> > address_space, DAX needs something like the range lock Kirill suggested.
> > Having the range lock in fs-private part of inode has the advantage that
> > only filesystems supporting DAX / punch hole will pay the memory overhead.
> > OTOH most major filesystems need it so the savings would be IMO noticeable
> 
> punch-hole is truncate for me. With the xfs model of read-write lock where
> truncate takes write, any fault taking read before executing the fault looks
> good for the FS side of things. I guess you mean the optimization of the
> radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.

Hm. Where does XFS take this read-write lock in the fault path?

IIUC, truncation vs. page fault serialization relies on i_size being
updated before doing truncate_pagecache() and on checking i_size under
the page lock on the fault side. We don't have an i_size fence for punch hole.

BTW, how can things like ext4_collapse_range() be safe wrt a parallel page
fault? Ted?

-- 
 Kirill A. Shutemov


* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 15:28           ` Kirill A. Shutemov
@ 2015-08-11 16:17             ` Boaz Harrosh
  2015-08-11 20:26               ` Kirill A. Shutemov
  0 siblings, 1 reply; 18+ messages in thread
From: Boaz Harrosh @ 2015-08-11 16:17 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Jan Kara, Dave Chinner, Kirill A. Shutemov, Andrew Morton,
	Matthew Wilcox, linux-mm, linux-fsdevel, linux-kernel,
	Davidlohr Bueso, Theodore Ts'o

On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote:
<>
>> Hi Jan. So you got me confused above. You say:
>> 	"DAX which needs exclusive access to the page given range in the page cache"
>>
>> but DAX and page-cache are mutually exclusive. I guess you meant the VMA
>> range, or the inode->mapping range (which one is it)
> 
> The second -- pgoff range within the inode->mapping.
> 

So yes, this is what I do not understand: with DAX the inode->mapping
radix-tree is empty.

>> Actually I do not understand this race you guys found at all. (Please bear with
>> me sorry for being slow)
>>
>> If two threads of the same VMA fault on the same pte
>> (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
>> then the mm knows how to handle this just fine.
> 
> It does. But only if we have struct page. See lock_page_or_retry() in
> filemap_fault(). Without lock_page() it's problematic.
> 
>> If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
>> wide lock like XFS's to protect against i_size-change / truncate is more than
>> enough.
> 
> We also used lock_page() to make sure we shoot out all pages as we don't
> exclude page faults during truncate. Consider this race:
> 
> 	<fault>			<truncate>
> 	get_block
> 	check i_size
>     				update i_size
> 				unmap
> 	setup pte
> 

Please consider this scenario then:

 	<fault>			<truncate>
	read_lock(inode)

 	get_block
 	check i_size
	
	read_unlock(inode)

				write_lock(inode)

     				update i_size
				* remove allocated blocks
 				unmap

				write_unlock(inode)

 	setup pte

This is what XFS is supposed to do.

> With normal page cache we make sure that all pages beyond i_size is
> dropped using lock_page() in truncate_inode_pages_range().
> 

Yes, there is no truncate_inode_pages_range() in DAX; again, the radix tree
is empty.

Do you have a reproducer? I would like to see this race and also
experiment with xfs (I guess you saw it in ext4).

> For DAX we need a way to stop all page faults to the pgoff range before
> doing unmap.
> 

Why?

>> Because with DAX there is no inode->mapping "mapping" at all. You have the call
>> into the FS with get_block() to replace "holes" (zero pages) with real allocated
>> blocks, on WRITE faults, but this conversion should be protected inside the FS
>> already. Then there is the atomic exchange of the PTE which is fine.
>> (And vis versa with holes mapping and writes)
> 
> Having unmap_mapping_range() in PMD fault handling is very unfortunate.
> Go to rmap just to solve page fault is very wrong.
> BTW, we need to do it in write path too.
> 

Only the write path, and only when we exchange a zero page (hole) with
a newly allocated (written-to) page. Both the write fault and/or the write path.

> I'm not convinced that all these "let's avoid backing storage allocation"
> in DAX code is not layering violation. I think the right place to solve
> this is filesystem. And we have almost all required handles for this in
> place.  We only need to change vm_ops->page_mkwrite() interface to be able
> to return different page than what was given on input.
> 

What? There is no page returned for DAX page_mkwrite(); it is all insert_mixed
with a direct pmd.

Ah, I think I see what you are stumbling on. Maybe it is the zero pages used
when read-mapping holes.

A solution I have (and it has been working for a year now) is to have only a
single zero page per inode->mapping, inserted at all the holes; again, the
radix-tree is always kept clean. This both saves memory and avoids the race
on the (always empty) radix tree.

Tell me if you want me to send a patch; there is a small trick I do
at vm_ops->page_mkwrite():

	/* our zero page doesn't really hold the correct offset into the file
	 * in page->index, so vmf->pgoff is incorrect; let's fix that */
	vmf->pgoff = vma->vm_pgoff + (((unsigned long)vmf->virtual_address -
			vma->vm_start) >> PAGE_SHIFT);

	/* call the fault handler to get a real page for writing */
	return __page_fault(vma, vmf);

Again, the return from page_mkwrite() && __page_fault(WRITE_CASE) is always
VM_FAULT_NOPAGE, right?

>>> So regardless whether the lock will be a fs-private one or in
>>> address_space, DAX needs something like the range lock Kirill suggested.
>>> Having the range lock in fs-private part of inode has the advantage that
>>> only filesystems supporting DAX / punch hole will pay the memory overhead.
>>> OTOH most major filesystems need it so the savings would be IMO noticeable
>>
>> punch-hole is truncate for me. With the xfs model of read-write lock where
>> truncate takes write, any fault taking read before executing the fault looks
>> good for the FS side of things. I guess you mean the optimization of the
>> radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.
> 
> Hm. Where does XFS take this read-write lock in fault path?
> 
> IIUC, truncation vs. page fault serialization relies on i_size being
> updated before doing truncate_pagecache() and checking i_size under
> page_lock() on fault side. We don't have i_size fence for punch hole.
> 

Again, there is no truncate_pagecache() for DAX.
And yes, the read-write locking will protect punch-hole just as it protects
truncate; see my locking scenario above.

> BTW, how things like ext4_collapse_range() can be safe wrt parallel page
> fault? Ted? 
> 

Thanks
Boaz


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 14:31         ` Boaz Harrosh
  2015-08-11 15:28           ` Kirill A. Shutemov
@ 2015-08-11 16:51           ` Wilcox, Matthew R
  2015-08-11 18:46             ` Boaz Harrosh
  2015-08-11 21:48             ` Dave Chinner
  1 sibling, 2 replies; 18+ messages in thread
From: Wilcox, Matthew R @ 2015-08-11 16:51 UTC (permalink / raw)
  To: Boaz Harrosh, Jan Kara, Dave Chinner
  Cc: Kirill A. Shutemov, Andrew Morton, linux-mm, linux-fsdevel,
	linux-kernel, Davidlohr Bueso


The race that you're not seeing is page fault vs page fault.  Two threads
each attempt to store a byte to different locations on the same page.  With
a read-mutex to exclude truncates, each thread calls ->get_block.  One of
the threads gets back a buffer marked as BH_New and calls memset() to clear
the page.  The other thread gets back a buffer which isn't marked as BH_New
and simply inserts the mapping, returning to userspace, which stores the
byte ... just in time for the other thread's memset() to write a zero over
the top of it.

Oh, and if it's a load racing against a store, you could read data that
used to be in this block the last time it was used; maybe the contents of
/etc/shadow.

So if we want to be able to handle more than one page fault at a time per
file (... and I think we do ...), we need to have exclusive access to a
range of the file while we're calling ->get_block(), and for a certain
amount of time afterwards.  With non-DAX files, this is the page lock.  My
original proposal for solving this was to enhance the page cache radix tree
to be able to lock something other than a page, but that also has
complexities.
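
To make the window concrete, here is a rough sketch of the shape of the DAX
fault path (simplified and illustrative only, not the actual fs/dax.c code;
addr stands for the direct-mapped address of the block):

	/* two threads run this concurrently for the same block, holding only
	 * a shared (read) lock that excludes truncate, not other faults */
	error = get_block(inode, block, &bh, 1 /* create */);

	if (buffer_new(&bh))
		memset(addr, 0, PAGE_SIZE);	/* thread A: zero the new block */

	/* thread B sees the buffer without BH_New and maps it right away */
	vm_insert_mixed(vma, vaddr, pfn);
	/* thread B's userspace store can now land just before thread A's
	 * memset() wipes it */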

-----Original Message-----
From: Boaz Harrosh [mailto:boaz@plexistor.com] 
Sent: Tuesday, August 11, 2015 7:31 AM
To: Jan Kara; Dave Chinner
Cc: Kirill A. Shutemov; Andrew Morton; Wilcox, Matthew R; linux-mm@kvack.org; linux-fsdevel@vger.kernel.org; linux-kernel@vger.kernel.org; Davidlohr Bueso
Subject: Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock

On 08/11/2015 04:50 PM, Jan Kara wrote:
> On Tue 11-08-15 19:37:08, Dave Chinner wrote:
>>>> The patch below tries to recover some scalability for DAX by introducing
>>>> per-mapping range lock.
>>>
>>> So this grows noticeably (3 longs if I'm right) struct address_space and
>>> thus struct inode just for DAX. That looks like a waste but I don't see an
>>> easy solution.
>>>
>>> OTOH filesystems in normal mode might want to use the range lock as well to
>>> provide truncate / punch hole vs page fault exclusion (XFS already has a
>>> private rwsem for this and ext4 needs something as well) and at that point
>>> growing generic struct inode would be acceptable for me.
>>
>> It sounds to me like the way DAX has tried to solve this race is the
>> wrong direction. We really need to drive the truncate/page fault
>> serialisation higher up the stack towards the filesystem, not deeper
>> into the mm subsystem where locking is greatly limited.
>>
>> As Jan mentions, we already have this serialisation in XFS, and I
>> think it would be better first step to replicate that locking model
>> in each filesystem that is supports DAX. I think this is a better
>> direction because it moves towards solving a whole class of problems
>> fileystem face with page fault serialisation, not just for DAX.
> 
> Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
> fault / page_mkwrite callback so it doesn't provide the exclusion necessary
> for DAX which needs exclusive access to the page given range in the page
> cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot
> excercise (at least from DAX POV).
> 

Hi Jan. So you got me confused above. You say:
	"DAX which needs exclusive access to the page given range in the page cache"

but DAX and page-cache are mutually exclusive. I guess you meant the VMA
range, or the inode->mapping range (which one is it)

Actually I do not understand this race you guys found at all. (Please bear with
me sorry for being slow)

If two threads of the same VMA fault on the same pte
(I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
then the mm knows how to handle this just fine.

If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
wide lock like XFS's to protect against i_size-change / truncate is more than
enough.
Because with DAX there is no inode->mapping "mapping" at all. You have the call
into the FS with get_block() to replace "holes" (zero pages) with real allocated
blocks, on WRITE faults, but this conversion should be protected inside the FS
already. Then there is the atomic exchange of the PTE which is fine.
(And vis versa with holes mapping and writes)

There is no global "mapping" radix-tree shared between VMAs like we are
used to.

Please explain to me the races you are seeing. I would love to also see them
with xfs. I think there they should not happen.

> So regardless whether the lock will be a fs-private one or in
> address_space, DAX needs something like the range lock Kirill suggested.
> Having the range lock in fs-private part of inode has the advantage that
> only filesystems supporting DAX / punch hole will pay the memory overhead.
> OTOH most major filesystems need it so the savings would be IMO noticeable

punch-hole is truncate for me. With the xfs model of read-write lock where
truncate takes write, any fault taking read before executing the fault looks
good for the FS side of things. I guess you mean the optimization of the
radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.

Please explain. Do you have a reproducer of this race. I would need to understand
this.

Thanks
Boaz

> only for tiny systems using special fs etc. So I'm undecided whether
> putting the lock in address_space and doing the locking in generic
> pagefault / truncate helpers is a better choice or not.
>  
> 								Honza
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 16:51           ` Wilcox, Matthew R
@ 2015-08-11 18:46             ` Boaz Harrosh
  2015-08-11 21:48             ` Dave Chinner
  1 sibling, 0 replies; 18+ messages in thread
From: Boaz Harrosh @ 2015-08-11 18:46 UTC (permalink / raw)
  To: Wilcox, Matthew R, Jan Kara, Dave Chinner
  Cc: Kirill A. Shutemov, Andrew Morton, linux-mm, linux-fsdevel,
	linux-kernel, Davidlohr Bueso

On 08/11/2015 07:51 PM, Wilcox, Matthew R wrote:
> The race that you're not seeing is page fault vs page fault. Two
> threads each attempt to store a byte to different locations on the
> same page. With a read-mutex to exclude truncates, each thread calls
> ->get_block. One of the threads gets back a buffer marked as BH_New
> and calls memset() to clear the page. The other thread gets back a
> buffer which isn't marked as BH_New and simply inserts the mapping,
> returning to userspace, which stores the byte ... just in time for
> the other thread's memset() to write a zero over the top of it.
> 

So I guess all you need is to zero it at the FS allocator, like a __GFP_ZERO.
Perhaps you invent a new flag to pass to ->get_block(), say __GB_FOR_MMAP,
that will make the FS zero the block in the case it was newly allocated.
Sure, there is some locking going on inside the FS so as not to allocate two
blocks for the same inode offset; then do the memset_nt() inside or before
that lock.

I.e. in the presence of __GB_FOR_MMAP the FS will internally convert any
BH_New into a memset_nt() and a regular return.
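
Something like this, roughly (__GB_FOR_MMAP, memset_nt() and the address
helper are all made-up names here, just to illustrate the proposed contract):

	/* inside a filesystem's ->get_block(), still under its per-inode
	 * allocation lock */
	if (create && block_was_newly_allocated) {
		if (flags & __GB_FOR_MMAP) {
			/* zero the pmem block before anyone can map it, so a
			 * racing fault can never see stale contents */
			memset_nt(fs_dax_block_address(inode, pblock), 0,
				  bh_result->b_size);
			/* do NOT set BH_New: the block is already initialised */
		} else {
			set_buffer_new(bh_result);	/* caller zeroes, as today */
		}
	}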

> Oh, and if it's a load racing against a store, you could read data
> that used to be in this block the last time it was used; maybe the
> contents of /etc/shadow.
> 

Do you have any kind of test like this? I wish we could exercise this
and see that we actually solved it.

> So if we want to be able to handle more than one page fault at a time
> per file (... and I think we do ...), we need to have exclusive
> access to a range of the file while we're calling ->get_block(), and
> for a certain amount of time afterwards. With non-DAX files, this is
> the page lock. My original proposal for solving this was to enhance
> the page cache radix tree to be able to lock something other than a
> page, but that also has complexities.
> 

Yes, this could be done with a bit-lock in the RADIX_TREE_EXCEPTIONAL_ENTRY bits
of a radix_tree_exception() entry. These are only used by shmem radix-trees and
will never conflict with DAX.

But it means you will need to populate the radix-tree with radix_tree_exception()
entries for every block accessed, which would be a great pity.

I still think the above extra flag to ->get_block() taps into the filesystem's
already existing infrastructure, and will cost much less pain.

(And it is much more efficient, say, in a hot path where the application has
already done fallocate for performance.)

Thanks
Boaz

> -----Original Message-----
> From: Boaz Harrosh [mailto:boaz@plexistor.com] 
> Sent: Tuesday, August 11, 2015 7:31 AM
> To: Jan Kara; Dave Chinner
> Cc: Kirill A. Shutemov; Andrew Morton; Wilcox, Matthew R; linux-mm@kvack.org; linux-fsdevel@vger.kernel.org; linux-kernel@vger.kernel.org; Davidlohr Bueso
> Subject: Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
> 
> On 08/11/2015 04:50 PM, Jan Kara wrote:
>> On Tue 11-08-15 19:37:08, Dave Chinner wrote:
>>>>> The patch below tries to recover some scalability for DAX by introducing
>>>>> per-mapping range lock.
>>>>
>>>> So this grows noticeably (3 longs if I'm right) struct address_space and
>>>> thus struct inode just for DAX. That looks like a waste but I don't see an
>>>> easy solution.
>>>>
>>>> OTOH filesystems in normal mode might want to use the range lock as well to
>>>> provide truncate / punch hole vs page fault exclusion (XFS already has a
>>>> private rwsem for this and ext4 needs something as well) and at that point
>>>> growing generic struct inode would be acceptable for me.
>>>
>>> It sounds to me like the way DAX has tried to solve this race is the
>>> wrong direction. We really need to drive the truncate/page fault
>>> serialisation higher up the stack towards the filesystem, not deeper
>>> into the mm subsystem where locking is greatly limited.
>>>
>>> As Jan mentions, we already have this serialisation in XFS, and I
>>> think it would be better first step to replicate that locking model
>>> in each filesystem that is supports DAX. I think this is a better
>>> direction because it moves towards solving a whole class of problems
>>> fileystem face with page fault serialisation, not just for DAX.
>>
>> Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
>> fault / page_mkwrite callback so it doesn't provide the exclusion necessary
>> for DAX which needs exclusive access to the page given range in the page
>> cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot
>> excercise (at least from DAX POV).
>>
> 
> Hi Jan. So you got me confused above. You say:
> 	"DAX which needs exclusive access to the page given range in the page cache"
> 
> but DAX and page-cache are mutually exclusive. I guess you meant the VMA
> range, or the inode->mapping range (which one is it)
> 
> Actually I do not understand this race you guys found at all. (Please bear with
> me sorry for being slow)
> 
> If two threads of the same VMA fault on the same pte
> (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
> then the mm knows how to handle this just fine.
> 
> If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
> wide lock like XFS's to protect against i_size-change / truncate is more than
> enough.
> Because with DAX there is no inode->mapping "mapping" at all. You have the call
> into the FS with get_block() to replace "holes" (zero pages) with real allocated
> blocks, on WRITE faults, but this conversion should be protected inside the FS
> already. Then there is the atomic exchange of the PTE which is fine.
> (And vis versa with holes mapping and writes)
> 
> There is no global "mapping" radix-tree shared between VMAs like we are
> used to.
> 
> Please explain to me the races you are seeing. I would love to also see them
> with xfs. I think there they should not happen.
> 
>> So regardless whether the lock will be a fs-private one or in
>> address_space, DAX needs something like the range lock Kirill suggested.
>> Having the range lock in fs-private part of inode has the advantage that
>> only filesystems supporting DAX / punch hole will pay the memory overhead.
>> OTOH most major filesystems need it so the savings would be IMO noticeable
> 
> punch-hole is truncate for me. With the xfs model of read-write lock where
> truncate takes write, any fault taking read before executing the fault looks
> good for the FS side of things. I guess you mean the optimization of the
> radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.
> 
> Please explain. Do you have a reproducer of this race. I would need to understand
> this.
> 
> Thanks
> Boaz
> 
>> only for tiny systems using special fs etc. So I'm undecided whether
>> putting the lock in address_space and doing the locking in generic
>> pagefault / truncate helpers is a better choice or not.
>>  
>> 								Honza
>>
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 16:17             ` Boaz Harrosh
@ 2015-08-11 20:26               ` Kirill A. Shutemov
  2015-08-12  7:54                 ` Boaz Harrosh
  0 siblings, 1 reply; 18+ messages in thread
From: Kirill A. Shutemov @ 2015-08-11 20:26 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Jan Kara, Dave Chinner, Kirill A. Shutemov, Andrew Morton,
	Matthew Wilcox, linux-mm, linux-fsdevel, linux-kernel,
	Davidlohr Bueso, Theodore Ts'o

On Tue, Aug 11, 2015 at 07:17:12PM +0300, Boaz Harrosh wrote:
> On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote:
> > We also used lock_page() to make sure we shoot out all pages as we don't
> > exclude page faults during truncate. Consider this race:
> > 
> > 	<fault>			<truncate>
> > 	get_block
> > 	check i_size
> >     				update i_size
> > 				unmap
> > 	setup pte
> > 
> 
> Please consider this senario then:
> 
>  	<fault>			<truncate>
> 	read_lock(inode)
> 
>  	get_block
>  	check i_size
> 	
> 	read_unlock(inode)
> 
> 				write_lock(inode)
> 
>      				update i_size
> 				* remove allocated blocks
>  				unmap
> 
> 				write_unlock(inode)
> 
>  	setup pte
> 
> IS what you suppose to do in xfs

Do you realize that you describe a race? :-P

Exactly: in this scenario the pfn your pte points to no longer belongs to the
file. Have fun.

> > With normal page cache we make sure that all pages beyond i_size is
> > dropped using lock_page() in truncate_inode_pages_range().
> > 
> 
> Yes there is no truncate_inode_pages_range() in DAX again radix tree is
> empty.
> 
> Please do you have a reproducer I would like to see this race and also
> experiment with xfs (I guess you saw it in ext4)

I don't. And I don't see how a race like the above can be FS-specific. All
the critical points are in generic code.
 
> > For DAX we need a way to stop all page faults to the pgoff range before
> > doing unmap.
> > 
> 
> Why ?

Because you can end up with ptes pointing to pfns which the fs considers to
no longer be part of the file.

	<truncate>		<fault>
	unmap..
				fault in pfn which unmap already unmapped
	..continue unmap

> >> Because with DAX there is no inode->mapping "mapping" at all. You have the call
> >> into the FS with get_block() to replace "holes" (zero pages) with real allocated
> >> blocks, on WRITE faults, but this conversion should be protected inside the FS
> >> already. Then there is the atomic exchange of the PTE which is fine.
> >> (And vis versa with holes mapping and writes)
> > 
> > Having unmap_mapping_range() in PMD fault handling is very unfortunate.
> > Go to rmap just to solve page fault is very wrong.
> > BTW, we need to do it in write path too.
> > 
> 
> Only the write path and only when we exchange a zero-page (hole) with
> a new allocated (written to) page. Both write fault and/or write-path

No. Always on a new BH. We don't have anything (except rmap) to find out
whether any other process has a zero page for the pgoff.
 
> > I'm not convinced that all these "let's avoid backing storage allocation"
> > in DAX code is not layering violation. I think the right place to solve
> > this is filesystem. And we have almost all required handles for this in
> > place.  We only need to change vm_ops->page_mkwrite() interface to be able
> > to return different page than what was given on input.
> > 
> 
> What? there is no page returned for DAX page_mkwrite(), it is all insert_mixed
> with direct pmd.

That was a bogus idea, please ignore.

> > Hm. Where does XFS take this read-write lock in fault path?
> > 
> > IIUC, truncation vs. page fault serialization relies on i_size being
> > updated before doing truncate_pagecache() and checking i_size under
> > page_lock() on fault side. We don't have i_size fence for punch hole.
> > 
> 
> again truncate_pagecache() is NONE.
> And yes the read-write locking will protect punch-hole just as truncate
> see my locking senario above.

Do you mean as racy as your truncate scenario? ;-P

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 16:51           ` Wilcox, Matthew R
  2015-08-11 18:46             ` Boaz Harrosh
@ 2015-08-11 21:48             ` Dave Chinner
  2015-08-12  8:51               ` Boaz Harrosh
  2015-08-13 11:30               ` Jan Kara
  1 sibling, 2 replies; 18+ messages in thread
From: Dave Chinner @ 2015-08-11 21:48 UTC (permalink / raw)
  To: Wilcox, Matthew R
  Cc: Boaz Harrosh, Jan Kara, Kirill A. Shutemov, Andrew Morton,
	linux-mm, linux-fsdevel, linux-kernel, Davidlohr Bueso

On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote:
> The race that you're not seeing is page fault vs page fault.  Two
> threads each attempt to store a byte to different locations on the
> same page.  With a read-mutex to exclude truncates, each thread
> calls ->get_block.  One of the threads gets back a buffer marked
> as BH_New and calls memset() to clear the page.  The other thread
> gets back a buffer which isn't marked as BH_New and simply inserts
> the mapping, returning to userspace, which stores the byte ...
> just in time for the other thread's memset() to write a zero over
> the top of it.

So, this is not a truncate race that the XFS MMAPLOCK solves.

However, that doesn't mean that the DAX code needs to add locking to
solve it. The race here is caused by block initialisation being
unserialised after a ->get_block call allocates the block (which the
filesystem serialises via internal locking). Hence two simultaneous
->get_block calls to the same block are guaranteed to hit the DAX
block initialisation race, with the second ->get_block call saying that
the block is already allocated.

IOWs, the way to handle this is to have the ->get_block call handle
the block zeroing for new blocks instead of doing it after the fact
in the generic DAX code where there is no fine-grained serialisation
object available. By calling dax_clear_blocks() in the ->get_block
callback, the filesystem can ensure that the second racing call will
only make progress once the block has been fully initialised by the
first call.

IMO the fix is - again - to move the functionality into the
filesystem where we already have the necessary exclusion in place to
avoid this race condition entirely.
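
For example, a rough sketch of what the fs side could look like (helper names
other than dax_clear_blocks() are made up, argument details are glossed over,
and the real code would sit inside each filesystem's own allocation locking):

	/* in the fs block-mapping path, still under the fs allocation lock */
	err = fs_alloc_block(inode, iblock, &pblock, &new);	/* hypothetical */
	if (!err && new)
		/* zero the pmem before publishing the mapping, so a racing
		 * second ->get_block() only ever sees initialised data */
		err = dax_clear_blocks(inode, pblock, 1 << inode->i_blkbits);
	if (!err) {
		map_bh(bh_result, inode->i_sb, pblock);
		/* BH_New is deliberately not set: DAX must not zero it again */
	}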

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 20:26               ` Kirill A. Shutemov
@ 2015-08-12  7:54                 ` Boaz Harrosh
  0 siblings, 0 replies; 18+ messages in thread
From: Boaz Harrosh @ 2015-08-12  7:54 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Jan Kara, Dave Chinner, Kirill A. Shutemov, Andrew Morton,
	Matthew Wilcox, linux-mm, linux-fsdevel, linux-kernel,
	Davidlohr Bueso, Theodore Ts'o

On 08/11/2015 11:26 PM, Kirill A. Shutemov wrote:
> On Tue, Aug 11, 2015 at 07:17:12PM +0300, Boaz Harrosh wrote:
>> On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote:
>>> We also used lock_page() to make sure we shoot out all pages as we don't
>>> exclude page faults during truncate. Consider this race:
>>>
>>> 	<fault>			<truncate>
>>> 	get_block
>>> 	check i_size
>>>     				update i_size
>>> 				unmap
>>> 	setup pte
>>>
>>
>> Please consider this senario then:
>>
>>  	<fault>			<truncate>
>> 	read_lock(inode)
>>
>>  	get_block
>>  	check i_size
>> 	
>> 	read_unlock(inode)
>>
>> 				write_lock(inode)
>>
>>      				update i_size
>> 				* remove allocated blocks
>>  				unmap
>>
>> 				write_unlock(inode)
>>
>>  	setup pte
>>
>> IS what you suppose to do in xfs
> 
> Do you realize that you describe a race? :-P
> 
> Exactly in this scenario pfn your pte point to is not belong to the file
> anymore. Have fun.
> 

Sorry, yes, I wrote it wrong. I have now gone back to read the actual code,
and the pte setup is also done under the read lock inside the fault handler,
before the read lock is released.
Of course it is: it is the page-fault handler that does the
vm_insert_mixed(vma, ..., pfn), and in the case of concurrent faults the second
call to vm_insert_mixed() will return -EBUSY, which means all is well.
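
I.e. roughly something like this (a sketch only; the lock name is made up):

	/* fault handler, everything under the shared (read) lock */
	down_read(&fs_mmap_rwsem);
	err = get_block(inode, iblock, &bh, write);
	if (!err)
		err = vm_insert_mixed(vma, vaddr, pfn);
	if (err == -EBUSY)
		err = 0;	/* a concurrent fault already inserted the pte */
	up_read(&fs_mmap_rwsem);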

So the only thing left is the fault-vs-fault zero-the-page race as Matthew
described, and, as Dave and I think, we can make this part of the FS's
get_block(), where it is more natural.

Thanks
Boaz


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 21:48             ` Dave Chinner
@ 2015-08-12  8:51               ` Boaz Harrosh
  2015-08-13 11:30               ` Jan Kara
  1 sibling, 0 replies; 18+ messages in thread
From: Boaz Harrosh @ 2015-08-12  8:51 UTC (permalink / raw)
  To: Dave Chinner, Wilcox, Matthew R
  Cc: Boaz Harrosh, Jan Kara, Kirill A. Shutemov, Andrew Morton,
	linux-mm, linux-fsdevel, linux-kernel, Davidlohr Bueso

On 08/12/2015 12:48 AM, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote:
>> The race that you're not seeing is page fault vs page fault.  Two
>> threads each attempt to store a byte to different locations on the
>> same page.  With a read-mutex to exclude truncates, each thread
>> calls ->get_block.  One of the threads gets back a buffer marked
>> as BH_New and calls memset() to clear the page.  The other thread
>> gets back a buffer which isn't marked as BH_New and simply inserts
>> the mapping, returning to userspace, which stores the byte ...
>> just in time for the other thread's memset() to write a zero over
>> the top of it.
> 
> So, this is not a truncate race that the XFS MMAPLOCK solves.
> 
> However, that doesn't mean that the DAX code needs to add locking to
> solve it. The race here is caused by block initialisation being
> unserialised after a ->get_block call allocates the block (which the
> filesystem serialises via internal locking). Hence two simultaneous
> ->get_block calls to the same block is guaranteed to have the DAX
> block initialisation race with the second ->get_block call that says
> the block is already allocated.
> 
> IOWs, the way to handle this is to have the ->get_block call handle
> the block zeroing for new blocks instead of doing it after the fact
> in the generic DAX code where there is no fine-grained serialisation
> object available. By calling dax_clear_blocks() in the ->get_block
> callback, the filesystem can ensure that the second racing call will
> only make progress once the block has been fully initialised by the
> first call.
> 
> IMO the fix is - again - to move the functionality into the
> filesystem where we already have the necessary exclusion in place to
> avoid this race condition entirely.
> 

Exactly, thanks

> Cheers,
> 
> Dave.
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
  2015-08-11 21:48             ` Dave Chinner
  2015-08-12  8:51               ` Boaz Harrosh
@ 2015-08-13 11:30               ` Jan Kara
  1 sibling, 0 replies; 18+ messages in thread
From: Jan Kara @ 2015-08-13 11:30 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Wilcox, Matthew R, Boaz Harrosh, Jan Kara, Kirill A. Shutemov,
	Andrew Morton, linux-mm, linux-fsdevel, linux-kernel,
	Davidlohr Bueso

On Wed 12-08-15 07:48:22, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote:
> > The race that you're not seeing is page fault vs page fault.  Two
> > threads each attempt to store a byte to different locations on the
> > same page.  With a read-mutex to exclude truncates, each thread
> > calls ->get_block.  One of the threads gets back a buffer marked
> > as BH_New and calls memset() to clear the page.  The other thread
> > gets back a buffer which isn't marked as BH_New and simply inserts
> > the mapping, returning to userspace, which stores the byte ...
> > just in time for the other thread's memset() to write a zero over
> > the top of it.
> 
> So, this is not a truncate race that the XFS MMAPLOCK solves.
> 
> However, that doesn't mean that the DAX code needs to add locking to
> solve it. The race here is caused by block initialisation being
> unserialised after a ->get_block call allocates the block (which the
> filesystem serialises via internal locking). Hence two simultaneous
> ->get_block calls to the same block is guaranteed to have the DAX
> block initialisation race with the second ->get_block call that says
> the block is already allocated.
> 
> IOWs, the way to handle this is to have the ->get_block call handle
> the block zeroing for new blocks instead of doing it after the fact
> in the generic DAX code where there is no fine-grained serialisation
> object available. By calling dax_clear_blocks() in the ->get_block
> callback, the filesystem can ensure that the second racing call will
> only make progress once the block has been fully initialised by the
> first call.
> 
> IMO the fix is - again - to move the functionality into the
> filesystem where we already have the necessary exclusion in place to
> avoid this race condition entirely.

I'm somewhat sad to add even more functionality into the already loaded
block mapping interface - we can already allocate delalloc blocks, unwritten
blocks, uninitialized blocks, and now also pre-zeroed blocks. But I agree the
fs already synchronizes block allocation for a given inode, so adding the
pre-zeroing there is pretty easy. Also, getting rid of unwritten extent
handling from the DAX code is a nice bonus, so all in all I'm for this
approach.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2015-08-13 11:30 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-10 15:14 [PATCH, RFC 0/2] Recover some scalability for DAX Kirill A. Shutemov
2015-08-10 15:14 ` [PATCH, RFC 1/2] lib: Implement range locks Kirill A. Shutemov
2015-08-10 15:14 ` [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock Kirill A. Shutemov
2015-08-11  8:19   ` Jan Kara
2015-08-11  9:37     ` Dave Chinner
2015-08-11 11:09       ` Boaz Harrosh
2015-08-11 12:03       ` Kirill A. Shutemov
2015-08-11 13:50       ` Jan Kara
2015-08-11 14:31         ` Boaz Harrosh
2015-08-11 15:28           ` Kirill A. Shutemov
2015-08-11 16:17             ` Boaz Harrosh
2015-08-11 20:26               ` Kirill A. Shutemov
2015-08-12  7:54                 ` Boaz Harrosh
2015-08-11 16:51           ` Wilcox, Matthew R
2015-08-11 18:46             ` Boaz Harrosh
2015-08-11 21:48             ` Dave Chinner
2015-08-12  8:51               ` Boaz Harrosh
2015-08-13 11:30               ` Jan Kara

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).