All of lore.kernel.org
 help / color / mirror / Atom feed
* [rfc][patch 0/3] a faster buffered write deadlock fix?
@ 2007-02-08 13:07 Nick Piggin
  2007-02-08 13:07 ` [patch 1/3] fs: add an iovec iterator Nick Piggin
                   ` (4 more replies)
  0 siblings, 5 replies; 34+ messages in thread
From: Nick Piggin @ 2007-02-08 13:07 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Nick Piggin, Andrew Morton, Linus Torvalds

In my last set of numbers for my buffered-write deadlock fix using 2 copies
per page, I realised there is no real performance hit for !uptodate pages
as opposed to uptodate ones. This is unexpected because the uptodate pages
only require a single copy...

The problem turns out to be operator error. I forgot tmpfs won't use this
prepare_write path, so sorry about that.

On ext2, copy 64MB of data from /dev/zero (IO isn't involved), using
4K and 64K block sizes, and conv=notrunc for testing overwriting of
uptodate pages. Numbers is elapsed time in seconds, lower is better.

		2.6.20		bufferd write fix
4K		0.0742		0.1208 (1.63x)
4K-uptodate	0.0493		0.0479 (0.97x)
64K		0.0671		0.1068 (1.59x)
64K-uptodate	0.0357		0.0362 (1.01x)

So we get about a 60% performance hit, which is more expected. I guess if
0.5% doesn't fly, then 60% is right out ;)

If there were any misconceptions, the problem is not that the code is
incredibly tricky or impossible to fix with good performance. The problem
is that the existing aops interface is crap. "correct, fast, compatible
-- choose any 2"

So I have finally finished a first slightly-working draft of my new aops
op (perform_write) proposal. I would be interested to hear comments about
it.  Most of my issues and concerns are in the patch headers themselves,
so reply to them.

The patches are against my latest buffered-write-fix patchset. This
means filesystems not implementing the new aop, will remain safe, if slow.
Here's some numbers after converting ext2 to the new aop:

		2.6.20		perform_write aop
4K		0.0742		0.0769 (1.04x)
4K-uptodate	0.0493		0.0475 (0.96x)
64K		0.0671		0.0613 (0.91x)
64K-uptodate	0.0357		0.0343 (0.96x)

Thanks,
Nick

--
SuSE Labs


^ permalink raw reply	[flat|nested] 34+ messages in thread

* [patch 1/3] fs: add an iovec iterator
  2007-02-08 13:07 [rfc][patch 0/3] a faster buffered write deadlock fix? Nick Piggin
@ 2007-02-08 13:07 ` Nick Piggin
  2007-02-08 19:49   ` Christoph Hellwig
  2007-02-08 23:04   ` Mark Fasheh
  2007-02-08 13:07 ` [patch 2/3] fs: introduce perform_write aop Nick Piggin
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 34+ messages in thread
From: Nick Piggin @ 2007-02-08 13:07 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Nick Piggin, Andrew Morton, Linus Torvalds

Add an iterator data structure to operate over an iovec. Add usercopy
operators needed by generic_file_buffered_write, and convert that function
over.

 include/linux/fs.h |   32 ++++++++++++
 mm/filemap.c       |  132 ++++++++++++++++++++++++++++++++++++++++++-----------
 mm/filemap.h       |  103 -----------------------------------------
 3 files changed, 137 insertions(+), 130 deletions(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -395,6 +395,38 @@ struct page;
 struct address_space;
 struct writeback_control;
 
+struct iovec_iterator {
+	const struct iovec *iov;
+	unsigned long nr_segs;
+	size_t iov_offset;
+	size_t count;
+};
+
+size_t iovec_iterator_copy_from_user_atomic(struct page *page,
+		struct iovec_iterator *i, unsigned long offset, size_t bytes);
+size_t iovec_iterator_copy_from_user(struct page *page,
+		struct iovec_iterator *i, unsigned long offset, size_t bytes);
+void iovec_iterator_advance(struct iovec_iterator *i, size_t bytes);
+int iovec_iterator_fault_in_readable(struct iovec_iterator *i);
+
+static inline void iovec_iterator_init(struct iovec_iterator *i,
+			const struct iovec *iov, unsigned long nr_segs,
+			size_t count, size_t written)
+{
+	i->iov = iov;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iovec_iterator_advance(i, written);
+}
+
+static inline size_t iovec_iterator_count(struct iovec_iterator *i)
+{
+	return i->count;
+}
+
+
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -30,7 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
-#include "filemap.h"
+#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include "internal.h"
 
 /*
@@ -1679,8 +1679,7 @@ int remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(remove_suid);
 
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
 	size_t copied = 0, left = 0;
@@ -1703,6 +1702,98 @@ __filemap_copy_from_user_iovec_inatomic(
 }
 
 /*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were sucessfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iovec_iterator_copy_from_user_atomic(struct page *page,
+		struct iovec_iterator *i, unsigned long offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = i->iov->iov_base + i->iov_offset;
+		left = __copy_from_user_inatomic_nocache(kaddr + offset,
+							buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+
+	return copied;
+}
+
+/*
+ * This has the same sideeffects and return value as
+ * iovec_iterator_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iovec_iterator_copy_from_user(struct page *page,
+		struct iovec_iterator *i, unsigned long offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = i->iov->iov_base + i->iov_offset;
+		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+static void __iovec_iterator_advance_iov(struct iovec_iterator *i, size_t bytes)
+{
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+	} else {
+		const struct iovec *iov = i->iov;
+		size_t base = i->iov_offset;
+
+		while (bytes) {
+			int copy = min(bytes, iov->iov_len - base);
+
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				iov++;
+				base = 0;
+			}
+		}
+		i->iov = iov;
+		i->iov_offset = base;
+	}
+}
+
+void iovec_iterator_advance(struct iovec_iterator *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	__iovec_iterator_advance_iov(i, bytes);
+	i->count -= bytes;
+}
+
+int iovec_iterator_fault_in_readable(struct iovec_iterator *i)
+{
+	size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count);
+	char __user *buf = i->iov->iov_base + i->iov_offset;
+	return fault_in_pages_readable(buf, seglen);
+}
+
+/*
  * Performs necessary checks before doing a write
  *
  * Can adjust writing position or amount of bytes to write.
@@ -1862,30 +1953,22 @@ generic_file_buffered_write(struct kiocb
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
-	const struct iovec *cur_iov = iov; /* current iovec */
-	size_t		iov_offset = 0;	   /* offset in the current iovec */
-	char __user	*buf;
+	struct iovec_iterator i;
 
-	/*
-	 * handle partial DIO write.  Adjust cur_iov if needed.
-	 */
-	filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, written);
+	iovec_iterator_init(&i, iov, nr_segs, count, written);
 
 	do {
 		struct page *src_page;
 		struct page *page;
 		pgoff_t index;		/* Pagecache index for current page */
 		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long seglen;	/* Bytes remaining in current iovec */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
 
-		buf = cur_iov->iov_base + iov_offset;
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iovec_iterator_count(&i));
 
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@@ -1893,10 +1976,6 @@ generic_file_buffered_write(struct kiocb
 		 */
 		src_page = NULL;
 
-		seglen = cur_iov->iov_len - iov_offset;
-		if (seglen > bytes)
-			seglen = bytes;
-
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
@@ -1907,7 +1986,7 @@ generic_file_buffered_write(struct kiocb
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(fault_in_pages_readable(buf, seglen))) {
+		if (unlikely(iovec_iterator_fault_in_readable(&i))) {
 			status = -EFAULT;
 			break;
 		}
@@ -1938,8 +2017,8 @@ generic_file_buffered_write(struct kiocb
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = filemap_copy_from_user(src_page, offset,
-					cur_iov, nr_segs, iov_offset, bytes);
+			copied = iovec_iterator_copy_from_user(src_page, &i,
+								offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
 				page_cache_release(page);
@@ -1977,8 +2056,8 @@ generic_file_buffered_write(struct kiocb
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = filemap_copy_from_user_atomic(page, offset,
-					cur_iov, nr_segs, iov_offset, bytes);
+			copied = iovec_iterator_copy_from_user_atomic(page, &i,
+								offset, bytes);
 			pagefault_enable();
 		} else {
 			void *src, *dst;
@@ -2003,10 +2082,9 @@ generic_file_buffered_write(struct kiocb
 		if (src_page)
 			page_cache_release(src_page);
 
+		iovec_iterator_advance(&i, copied);
 		written += copied;
-		count -= copied;
 		pos += copied;
-		filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, copied);
 
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@@ -2030,7 +2108,7 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (count);
+	} while (iovec_iterator_count(&i));
 	*ppos = pos;
 
 	/*
Index: linux-2.6/mm/filemap.h
===================================================================
--- linux-2.6.orig/mm/filemap.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- *	linux/mm/filemap.h
- *
- * Copyright (C) 1994-1999  Linus Torvalds
- */
-
-#ifndef __FILEMAP_H
-#define __FILEMAP_H
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/uio.h>
-#include <linux/uaccess.h>
-
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
-					const struct iovec *iov,
-					size_t base,
-					size_t bytes);
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-static inline size_t
-filemap_copy_from_user_atomic(struct page *page, unsigned long offset,
-			const struct iovec *iov, unsigned long nr_segs,
-			size_t base, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	if (likely(nr_segs == 1)) {
-		int left;
-		char __user *buf = iov->iov_base + base;
-		left = __copy_from_user_inatomic_nocache(kaddr + offset,
-							buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset,
-							iov, base, bytes);
-	}
-	kunmap_atomic(kaddr, KM_USER0);
-
-	return copied;
-}
-
-/*
- * This has the same sideeffects and return value as
- * filemap_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
-			const struct iovec *iov, unsigned long nr_segs,
-			 size_t base, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap(page);
-	if (likely(nr_segs == 1)) {
-		int left;
-		char __user *buf = iov->iov_base + base;
-		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset,
-							iov, base, bytes);
-	}
-	kunmap(page);
-	return copied;
-}
-
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, unsigned long nr_segs,
-						 size_t *basep, size_t bytes)
-{
-	if (likely(nr_segs == 1)) {
-		*basep += bytes;
-	} else {
-		const struct iovec *iov = *iovp;
-		size_t base = *basep;
-
-		while (bytes) {
-			int copy = min(bytes, iov->iov_len - base);
-
-			bytes -= copy;
-			base += copy;
-			if (iov->iov_len == base) {
-				iov++;
-				base = 0;
-			}
-		}
-		*iovp = iov;
-		*basep = base;
-	}
-}
-#endif

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [patch 2/3] fs: introduce perform_write aop
  2007-02-08 13:07 [rfc][patch 0/3] a faster buffered write deadlock fix? Nick Piggin
  2007-02-08 13:07 ` [patch 1/3] fs: add an iovec iterator Nick Piggin
@ 2007-02-08 13:07 ` Nick Piggin
  2007-03-09 10:39   ` Christoph Hellwig
  2007-02-08 13:07 ` [patch 3/3] ext2: use " Nick Piggin
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-02-08 13:07 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Nick Piggin, Andrew Morton, Linus Torvalds

Add a new "perform_write" aop, which replaces prepare_write and commit_write
as a single call to copy a given amount of userdata at the given offset. This
is more flexible, because the implementation can determine how to best handle
errors, or multi-page ranges (eg. it may use a gang lookup), and only requires
one call into the fs.

One problem with this interface is that it cannot be used to write into the
filesystem by any means other than already-initialised buffers via iovecs. So
prepare/commit have to stay around for non-user data... 

Another thing is that it seems to be less able to be implemented in generic,
reusable code. It should be possible to introduce a new 2-op interface (or
maybe just a new error handler op) which can be used correctly in generic code.

I avoided that way in my first attempt because this seemed more elegant from
a logical POV. But after seeing the implementation, I'm having second
thoughts.

 include/linux/fs.h |    6 ++++
 mm/filemap.c       |   64 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 47 insertions(+), 23 deletions(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -447,6 +447,12 @@ struct address_space_operations {
 	 */
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+
+	/*
+	 * perform_write replaces prepare and commit_write callbacks.
+	 */
+	int (*perform_write)(struct file *, struct iovec_iterator *, loff_t);
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -1920,8 +1920,7 @@ EXPORT_SYMBOL(generic_file_direct_write)
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-static struct page *__grab_cache_page(struct address_space *mapping,
-							pgoff_t index)
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
 {
 	int status;
 	struct page *page;
@@ -1943,19 +1942,14 @@ repeat:
 	return page;
 }
 
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+static ssize_t generic_perform_write(struct file *file,
+				struct iovec_iterator *i, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode 	*inode = mapping->host;
-	long		status = 0;
-	struct iovec_iterator i;
-
-	iovec_iterator_init(&i, iov, nr_segs, count, written);
+	struct inode *inode = mapping->host;
+	long status = 0;
+	ssize_t written = 0;
 
 	do {
 		struct page *src_page;
@@ -1968,7 +1962,7 @@ generic_file_buffered_write(struct kiocb
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iovec_iterator_count(&i));
+						iovec_iterator_count(i));
 
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@@ -1986,7 +1980,7 @@ generic_file_buffered_write(struct kiocb
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(iovec_iterator_fault_in_readable(&i))) {
+		if (unlikely(iovec_iterator_fault_in_readable(i))) {
 			status = -EFAULT;
 			break;
 		}
@@ -2017,7 +2011,7 @@ generic_file_buffered_write(struct kiocb
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = iovec_iterator_copy_from_user(src_page, &i,
+			copied = iovec_iterator_copy_from_user(src_page, i,
 								offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
@@ -2056,7 +2050,7 @@ generic_file_buffered_write(struct kiocb
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = iovec_iterator_copy_from_user_atomic(page, &i,
+			copied = iovec_iterator_copy_from_user_atomic(page, i,
 								offset, bytes);
 			pagefault_enable();
 		} else {
@@ -2082,9 +2076,9 @@ generic_file_buffered_write(struct kiocb
 		if (src_page)
 			page_cache_release(src_page);
 
-		iovec_iterator_advance(&i, copied);
-		written += copied;
+		iovec_iterator_advance(i, copied);
 		pos += copied;
+		written += copied;
 
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@@ -2108,13 +2102,37 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (iovec_iterator_count(&i));
-	*ppos = pos;
+	} while (iovec_iterator_count(i));
+
+	return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	struct iovec_iterator i;
+
+	iovec_iterator_init(&i, iov, nr_segs, count, written);
+	if (!a_ops->perform_write)
+		status = generic_perform_write(file, &i, pos);
+	else
+		status = a_ops->perform_write(file, &i, pos);
 
-	/*
-	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
-	 */
 	if (likely(status >= 0)) {
+		written += status;
+		*ppos = pos + status;
+
+		/*
+		 * For now, when the user asks for O_SYNC, we'll actually give
+		 * O_DSYNC
+		 */
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
 				status = generic_osync_inode(inode, mapping,

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [patch 3/3] ext2: use perform_write aop
  2007-02-08 13:07 [rfc][patch 0/3] a faster buffered write deadlock fix? Nick Piggin
  2007-02-08 13:07 ` [patch 1/3] fs: add an iovec iterator Nick Piggin
  2007-02-08 13:07 ` [patch 2/3] fs: introduce perform_write aop Nick Piggin
@ 2007-02-08 13:07 ` Nick Piggin
  2007-02-08 14:47   ` Dmitriy Monakhov
  2007-02-09 19:14   ` Andrew Morton
  2007-02-09  0:38 ` [rfc][patch 0/3] a faster buffered write deadlock fix? Mark Fasheh
  2007-02-09  8:41 ` Andrew Morton
  4 siblings, 2 replies; 34+ messages in thread
From: Nick Piggin @ 2007-02-08 13:07 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Nick Piggin, Andrew Morton, Linus Torvalds

Convert ext2 to use ->perform_write. This uses the main loop out of
generic_perform_write, but when encountering a short usercopy, it
zeroes out new uninitialised blocks, and passes in a short-length commit
to __block_commit_write, which does the right thing (in terms of not
setting things uptodate).

 fs/buffer.c                 |  143 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/inode.c             |    7 ++
 include/linux/buffer_head.h |    1 
 include/linux/pagemap.h     |    2 
 4 files changed, 153 insertions(+)

Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -1866,6 +1866,50 @@ next_bh:
 	return err;
 }
 
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, end;
+					void *kaddr;
+
+					start = max(from, block_start);
+					end = min(to, block_end);
+
+					kaddr = kmap_atomic(page, KM_USER0);
+					memset(kaddr+start, 0, block_end-end);
+					flush_dcache_page(page);
+					kunmap_atomic(kaddr, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+
+				/*
+				 * XXX: make buffer_new behaviour more
+				 * consistent.
+				 * clear_buffer_new(bh);
+				 */
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
 static int __block_commit_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to)
 {
@@ -1900,6 +1944,105 @@ static int __block_commit_write(struct i
 	return 0;
 }
 
+ssize_t block_perform_write(struct file *file, struct iovec_iterator *i,
+					loff_t pos, get_block_t *get_block)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page;
+		pgoff_t index;		/* Pagecache index for current page */
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iovec_iterator_count(i));
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iovec_iterator_fault_in_readable(i))) {
+			status = -EFAULT;
+			break;
+		}
+
+		page = __grab_cache_page(mapping, index);
+		if (!page) {
+			status = -ENOMEM;
+			break;
+		}
+
+		status = __block_prepare_write(inode, page, offset,
+						offset+bytes, get_block);
+		if (unlikely(status)) {
+			ClearPageUptodate(page);
+
+			page_cache_release(page);
+
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size.  Trim these off again. Don't need
+			 * i_size_read because we hold i_mutex.
+			 */
+			if (pos + bytes > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+			break;
+		}
+
+		/*
+		 * Must not enter the pagefault handler here, because
+		 * we hold the page lock. See mm/filemap.c for more
+		 * details.
+		 */
+		pagefault_disable();
+		copied = iovec_iterator_copy_from_user_atomic(page, i,
+							offset, bytes);
+		pagefault_enable();
+		if (unlikely(copied < bytes))
+			page_zero_new_buffers(page, offset+copied, offset+bytes);
+		flush_dcache_page(page);
+
+		/* This could be a short (even 0-length) commit */
+		__block_commit_write(inode, page, offset, offset+copied);
+
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+
+		iovec_iterator_advance(i, copied);
+		pos += copied;
+		written += copied;
+
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+
+	} while (iovec_iterator_count(i));
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 */
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+
+	return written ? written : status;
+}
+
 /*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c
+++ linux-2.6/fs/ext2/inode.c
@@ -642,6 +642,12 @@ ext2_readpages(struct file *file, struct
 	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
 }
 
+static ssize_t
+ext2_perform_write(struct file *file, struct iovec_iterator *i, loff_t pos)
+{
+	return block_perform_write(file, i, pos, ext2_get_block);
+}
+
 static int
 ext2_prepare_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
@@ -689,6 +695,7 @@ const struct address_space_operations ex
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
 	.sync_page		= block_sync_page,
+	.perform_write		= ext2_perform_write,
 	.prepare_write		= ext2_prepare_write,
 	.commit_write		= generic_commit_write,
 	.bmap			= ext2_bmap,
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h
+++ linux-2.6/include/linux/buffer_head.h
@@ -198,6 +198,7 @@ void block_invalidatepage(struct page *p
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
+ssize_t block_perform_write(struct file *, struct iovec_iterator*, loff_t, get_block_t*);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 				loff_t *);
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -87,6 +87,8 @@ unsigned find_get_pages_contig(struct ad
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
+
 /*
  * Returns locked page at given index in given cache, creating it if needed.
  */

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 3/3] ext2: use perform_write aop
  2007-02-08 13:07 ` [patch 3/3] ext2: use " Nick Piggin
@ 2007-02-08 14:47   ` Dmitriy Monakhov
  2007-02-09 19:14   ` Andrew Morton
  1 sibling, 0 replies; 34+ messages in thread
From: Dmitriy Monakhov @ 2007-02-08 14:47 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Andrew Morton

Nick Piggin <npiggin@suse.de> writes:

> Convert ext2 to use ->perform_write. This uses the main loop out of
> generic_perform_write, but when encountering a short usercopy, it
> zeroes out new uninitialised blocks, and passes in a short-length commit
> to __block_commit_write, which does the right thing (in terms of not
> setting things uptodate).
>
>  fs/buffer.c                 |  143 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext2/inode.c             |    7 ++
>  include/linux/buffer_head.h |    1 
>  include/linux/pagemap.h     |    2 
>  4 files changed, 153 insertions(+)
>
> Index: linux-2.6/fs/buffer.c
> ===================================================================
> --- linux-2.6.orig/fs/buffer.c
> +++ linux-2.6/fs/buffer.c
> @@ -1866,6 +1866,50 @@ next_bh:
>  	return err;
>  }
>  
> +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
> +{
> +	unsigned int block_start, block_end;
> +	struct buffer_head *head, *bh;
> +
> +	BUG_ON(!PageLocked(page));
> +	if (!page_has_buffers(page))
> +		return;
> +
> +	bh = head = page_buffers(page);
> +	block_start = 0;
> +	do {
> +		block_end = block_start + bh->b_size;
> +
> +		if (buffer_new(bh)) {
> +			if (block_end > from && block_start < to) {
> +				if (!PageUptodate(page)) {
> +					unsigned start, end;
> +					void *kaddr;
> +
> +					start = max(from, block_start);
> +					end = min(to, block_end);
> +
> +					kaddr = kmap_atomic(page, KM_USER0);
> +					memset(kaddr+start, 0, block_end-end);
> +					flush_dcache_page(page);
> +					kunmap_atomic(kaddr, KM_USER0);
> +					set_buffer_uptodate(bh);
> +				}
> +
> +				/*
> +				 * XXX: make buffer_new behaviour more
> +				 * consistent.
> +				 * clear_buffer_new(bh);
> +				 */
> +				mark_buffer_dirty(bh);
> +			}
> +		}
> +
> +		block_start = block_end;
> +		bh = bh->b_this_page;
> +	} while (bh != head);
> +}
> +
>  static int __block_commit_write(struct inode *inode, struct page *page,
>  		unsigned from, unsigned to)
>  {
> @@ -1900,6 +1944,105 @@ static int __block_commit_write(struct i
>  	return 0;
>  }
>  
> +ssize_t block_perform_write(struct file *file, struct iovec_iterator *i,
> +					loff_t pos, get_block_t *get_block)
> +{
> +	struct address_space *mapping = file->f_mapping;
> +	struct inode *inode = mapping->host;
> +	long status = 0;
> +	ssize_t written = 0;
> +
> +	do {
> +		struct page *page;
> +		pgoff_t index;		/* Pagecache index for current page */
> +		unsigned long offset;	/* Offset into pagecache page */
> +		unsigned long bytes;	/* Bytes to write to page */
> +		size_t copied;		/* Bytes copied from user */
> +
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +		index = pos >> PAGE_CACHE_SHIFT;
> +		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
> +						iovec_iterator_count(i));
> +
> +		/*
> +		 * Bring in the user page that we will copy from _first_.
> +		 * Otherwise there's a nasty deadlock on copying from the
> +		 * same page as we're writing to, without it being marked
> +		 * up-to-date.
> +		 *
> +		 * Not only is this an optimisation, but it is also required
> +		 * to check that the address is actually valid, when atomic
> +		 * usercopies are used, below.
> +		 */
> +		if (unlikely(iovec_iterator_fault_in_readable(i))) {
> +			status = -EFAULT;
> +			break;
> +		}
> +
> +		page = __grab_cache_page(mapping, index);
> +		if (!page) {
> +			status = -ENOMEM;
> +			break;
> +		}
> +
> +		status = __block_prepare_write(inode, page, offset,
> +						offset+bytes, get_block);
> +		if (unlikely(status)) {
> +			ClearPageUptodate(page);
> +
> +			page_cache_release(page);
> +
> +			/*
> +			 * prepare_write() may have instantiated a few blocks
> +			 * outside i_size.  Trim these off again. Don't need
> +			 * i_size_read because we hold i_mutex.
> +			 */
> +			if (pos + bytes > inode->i_size)
> +				vmtruncate(inode, inode->i_size);
> +			break;
> +		}
> +
> +		/*
> +		 * Must not enter the pagefault handler here, because
> +		 * we hold the page lock. See mm/filemap.c for more
> +		 * details.
> +		 */
> +		pagefault_disable();
> +		copied = iovec_iterator_copy_from_user_atomic(page, i,
> +							offset, bytes);
> +		pagefault_enable();
> +		if (unlikely(copied < bytes))
> +			page_zero_new_buffers(page, offset+copied, offset+bytes);
> +		flush_dcache_page(page);
> +
<<<<<<<<<<< here fs cat do some fs-specific stuff without making
            internal state visiable.  cool.
> +		/* This could be a short (even 0-length) commit */
> +		__block_commit_write(inode, page, offset, offset+copied);
> +
> +		unlock_page(page);
> +		mark_page_accessed(page);
> +		page_cache_release(page);
> +
> +		iovec_iterator_advance(i, copied);
> +		pos += copied;
> +		written += copied;
> +
> +		balance_dirty_pages_ratelimited(mapping);
> +		cond_resched();
> +
> +	} while (iovec_iterator_count(i));
> +
<<<<<<<<<<<  If i've understand correctly folowing scenario possible:
         iteration 1: ->iovec_iterator_fault_in_readable(...)  = 0           
         iteration 1: __block_prepare_write  = {blocks allocated}
         iteration 1: iovec_iterator_copy_from_user_atomic(...) = 0 
         iteration 1: while(iovec_iterator_count(i))  == goto next loop

         iteration 2: ->iovec_iterator_fault_in_readable(...)  = -EFAULT
                      Than breack loop .
         At this point prepare_write() may have instantiated a few blocks
         outside i_size on iteration(1) So we have to trim these off again.

> +	/*
> +	 * No need to use i_size_read() here, the i_size
> +	 * cannot change under us because we hold i_mutex.
> +	 */
> +	if (pos > inode->i_size) {
> +		i_size_write(inode, pos);
> +		mark_inode_dirty(inode);
> +	}
> +
> +	return written ? written : status;
> +}
> +
>  /*
>   * Generic "read page" function for block devices that have the normal
>   * get_block functionality. This is most of the block device filesystems.
> Index: linux-2.6/fs/ext2/inode.c
> ===================================================================
> --- linux-2.6.orig/fs/ext2/inode.c
> +++ linux-2.6/fs/ext2/inode.c
> @@ -642,6 +642,12 @@ ext2_readpages(struct file *file, struct
>  	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
>  }
>  
> +static ssize_t
> +ext2_perform_write(struct file *file, struct iovec_iterator *i, loff_t pos)
> +{
> +	return block_perform_write(file, i, pos, ext2_get_block);
> +}
> +
>  static int
>  ext2_prepare_write(struct file *file, struct page *page,
>  			unsigned from, unsigned to)
> @@ -689,6 +695,7 @@ const struct address_space_operations ex
>  	.readpages		= ext2_readpages,
>  	.writepage		= ext2_writepage,
>  	.sync_page		= block_sync_page,
> +	.perform_write		= ext2_perform_write,
>  	.prepare_write		= ext2_prepare_write,
>  	.commit_write		= generic_commit_write,
>  	.bmap			= ext2_bmap,
> Index: linux-2.6/include/linux/buffer_head.h
> ===================================================================
> --- linux-2.6.orig/include/linux/buffer_head.h
> +++ linux-2.6/include/linux/buffer_head.h
> @@ -198,6 +198,7 @@ void block_invalidatepage(struct page *p
>  int block_write_full_page(struct page *page, get_block_t *get_block,
>  				struct writeback_control *wbc);
>  int block_read_full_page(struct page*, get_block_t*);
> +ssize_t block_perform_write(struct file *, struct iovec_iterator*, loff_t, get_block_t*);
>  int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
>  int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
>  				loff_t *);
> Index: linux-2.6/include/linux/pagemap.h
> ===================================================================
> --- linux-2.6.orig/include/linux/pagemap.h
> +++ linux-2.6/include/linux/pagemap.h
> @@ -87,6 +87,8 @@ unsigned find_get_pages_contig(struct ad
>  unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
>  			int tag, unsigned int nr_pages, struct page **pages);
>  
> +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
> +
>  /*
>   * Returns locked page at given index in given cache, creating it if needed.
>   */
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 1/3] fs: add an iovec iterator
  2007-02-08 13:07 ` [patch 1/3] fs: add an iovec iterator Nick Piggin
@ 2007-02-08 19:49   ` Christoph Hellwig
  2007-02-09  1:46     ` Nick Piggin
  2007-02-08 23:04   ` Mark Fasheh
  1 sibling, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2007-02-08 19:49 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Andrew Morton

On Thu, Feb 08, 2007 at 02:07:24PM +0100, Nick Piggin wrote:
> Add an iterator data structure to operate over an iovec. Add usercopy
> operators needed by generic_file_buffered_write, and convert that function
> over.

iovec_iterator is an awfully long and not very descriptive name.
In past discussions we named this thingy iodesc and wanted to pass it
down all the I/O path, including the file operations.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 1/3] fs: add an iovec iterator
  2007-02-08 13:07 ` [patch 1/3] fs: add an iovec iterator Nick Piggin
  2007-02-08 19:49   ` Christoph Hellwig
@ 2007-02-08 23:04   ` Mark Fasheh
  1 sibling, 0 replies; 34+ messages in thread
From: Mark Fasheh @ 2007-02-08 23:04 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Andrew Morton

Hi Nick,

On Thu, Feb 08, 2007 at 02:07:24PM +0100, Nick Piggin wrote:
> Add an iterator data structure to operate over an iovec. Add usercopy
> operators needed by generic_file_buffered_write, and convert that function
> over.

I really like this iterator structure - it brings together some fields that
seem to get passed around seperately, even though they're linked.

Also IMHO, it makes things in filemap.c easer to read...
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-08 13:07 [rfc][patch 0/3] a faster buffered write deadlock fix? Nick Piggin
                   ` (2 preceding siblings ...)
  2007-02-08 13:07 ` [patch 3/3] ext2: use " Nick Piggin
@ 2007-02-09  0:38 ` Mark Fasheh
  2007-02-09  2:04   ` Nick Piggin
  2007-02-09  8:41 ` Andrew Morton
  4 siblings, 1 reply; 34+ messages in thread
From: Mark Fasheh @ 2007-02-09  0:38 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Andrew Morton

On Thu, Feb 08, 2007 at 02:07:15PM +0100, Nick Piggin wrote:
> The problem is that the existing aops interface is crap. "correct, fast,
> compatible -- choose any 2"

Agreed. There's lots of problems with the interface (imho), but my biggest
two issues are the page lock being held on ->prepare_write() /
->commit_write() and the fact that the file system only sees the write one
page at a time


> So I have finally finished a first slightly-working draft of my new aops
> op (perform_write) proposal. I would be interested to hear comments about
> it.  Most of my issues and concerns are in the patch headers themselves,
> so reply to them.

I like ->perform_write(). It allows the file system to make re-use of the
checks in the generic write path, but gives a maximum amount of information
about the overall operation to be done. There's an added advantage in that
some file systems (ocfs2 is one of these) want to be more careful about
ordering page locks, which should be much easier with it.

If this goes in, it could probably be helpful to me in some of the code I'm
currently writing which needs to be better about page lock / cluster lock
ordering and wants to see as much of the (allocating) writes as possible.
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 1/3] fs: add an iovec iterator
  2007-02-08 19:49   ` Christoph Hellwig
@ 2007-02-09  1:46     ` Nick Piggin
  2007-02-09  2:03       ` Nate Diller
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-02-09  1:46 UTC (permalink / raw)
  To: Christoph Hellwig, Linux Filesystems, Linux Kernel, Andrew Morton

On Thu, Feb 08, 2007 at 07:49:53PM +0000, Christoph Hellwig wrote:
> On Thu, Feb 08, 2007 at 02:07:24PM +0100, Nick Piggin wrote:
> > Add an iterator data structure to operate over an iovec. Add usercopy
> > operators needed by generic_file_buffered_write, and convert that function
> > over.
> 
> iovec_iterator is an awfully long and not very descriptive name.
> In past discussions we named this thingy iodesc and wanted to pass it
> down all the I/O path, including the file operations.

Hi Christoph,

Sure I think it would be a good idea to shorten the name. And yes, although
I just construct the iterator to pass into perform_write, I think it should
make sense to go much further up the call stack instead of passing all those
args around. iodesc seems like a fine name, so I'll use that unless
anyone objects.


Thanks,
Nick

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 1/3] fs: add an iovec iterator
  2007-02-09  1:46     ` Nick Piggin
@ 2007-02-09  2:03       ` Nate Diller
  2007-02-09  3:31         ` Nick Piggin
  2007-03-09 10:40         ` Christoph Hellwig
  0 siblings, 2 replies; 34+ messages in thread
From: Nate Diller @ 2007-02-09  2:03 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Hellwig, Linux Filesystems, Linux Kernel, Andrew Morton

On 2/8/07, Nick Piggin <npiggin@suse.de> wrote:
> On Thu, Feb 08, 2007 at 07:49:53PM +0000, Christoph Hellwig wrote:
> > On Thu, Feb 08, 2007 at 02:07:24PM +0100, Nick Piggin wrote:
> > > Add an iterator data structure to operate over an iovec. Add usercopy
> > > operators needed by generic_file_buffered_write, and convert that function
> > > over.
> >
> > iovec_iterator is an awfully long and not very descriptive name.
> > In past discussions we named this thingy iodesc and wanted to pass it
> > down all the I/O path, including the file operations.
>
> Hi Christoph,
>
> Sure I think it would be a good idea to shorten the name. And yes, although
> I just construct the iterator to pass into perform_write, I think it should
> make sense to go much further up the call stack instead of passing all those
> args around. iodesc seems like a fine name, so I'll use that unless
> anyone objects.

i had a patch integrating the iodesc idea, but after some thought, had
decided to call it struct file_io.  That name reflects the fact that
it's doing I/O in arbitrary lengths with byte offsets, and struct
file_io *fio contrasts well with struct bio (block_io).  I also had
used the field ->nbytes instead of ->count, to clarify the difference
between segment iterators, segment offsets, and absolute bytecount.

FYI, the patch is still in the works and would convert the whole file
I/O stack to use the new structure.  I would like to base it off of
this work as well if this makes it into -mm (as I think it should)

NATE

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09  0:38 ` [rfc][patch 0/3] a faster buffered write deadlock fix? Mark Fasheh
@ 2007-02-09  2:04   ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-02-09  2:04 UTC (permalink / raw)
  To: Mark Fasheh; +Cc: Linux Filesystems, Linux Kernel, Andrew Morton

On Thu, Feb 08, 2007 at 04:38:01PM -0800, Mark Fasheh wrote:
> On Thu, Feb 08, 2007 at 02:07:15PM +0100, Nick Piggin wrote:
> > The problem is that the existing aops interface is crap. "correct, fast,
> > compatible -- choose any 2"
> 
> Agreed. There's lots of problems with the interface (imho), but my biggest
> two issues are the page lock being held on ->prepare_write() /
> ->commit_write() and the fact that the file system only sees the write one
> page at a time
> 
> 
> > So I have finally finished a first slightly-working draft of my new aops
> > op (perform_write) proposal. I would be interested to hear comments about
> > it.  Most of my issues and concerns are in the patch headers themselves,
> > so reply to them.
> 
> I like ->perform_write(). It allows the file system to make re-use of the
> checks in the generic write path, but gives a maximum amount of information
> about the overall operation to be done. There's an added advantage in that
> some file systems (ocfs2 is one of these) want to be more careful about
> ordering page locks, which should be much easier with it.

Yeah, if possible I like a range based interface rather than page
based. As you say it gives the most information with the least constraints.

> If this goes in, it could probably be helpful to me in some of the code I'm
> currently writing which needs to be better about page lock / cluster lock
> ordering and wants to see as much of the (allocating) writes as possible.

I think it would be important to have a non trivial user of this new API
before it goes into mainline. It would be great if you could look at
using it, after it passes some review.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 1/3] fs: add an iovec iterator
  2007-02-09  2:03       ` Nate Diller
@ 2007-02-09  3:31         ` Nick Piggin
  2007-02-09 17:28           ` Zach Brown
  2007-03-09 10:40         ` Christoph Hellwig
  1 sibling, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-02-09  3:31 UTC (permalink / raw)
  To: Nate Diller
  Cc: Christoph Hellwig, Linux Filesystems, Linux Kernel, Andrew Morton

On Thu, Feb 08, 2007 at 06:03:50PM -0800, Nate Diller wrote:
> On 2/8/07, Nick Piggin <npiggin@suse.de> wrote:
> >On Thu, Feb 08, 2007 at 07:49:53PM +0000, Christoph Hellwig wrote:
> >> On Thu, Feb 08, 2007 at 02:07:24PM +0100, Nick Piggin wrote:
> >> > Add an iterator data structure to operate over an iovec. Add usercopy
> >> > operators needed by generic_file_buffered_write, and convert that 
> >function
> >> > over.
> >>
> >> iovec_iterator is an awfully long and not very descriptive name.
> >> In past discussions we named this thingy iodesc and wanted to pass it
> >> down all the I/O path, including the file operations.
> >
> >Hi Christoph,
> >
> >Sure I think it would be a good idea to shorten the name. And yes, although
> >I just construct the iterator to pass into perform_write, I think it should
> >make sense to go much further up the call stack instead of passing all 
> >those
> >args around. iodesc seems like a fine name, so I'll use that unless
> >anyone objects.
> 
> i had a patch integrating the iodesc idea, but after some thought, had
> decided to call it struct file_io.  That name reflects the fact that
> it's doing I/O in arbitrary lengths with byte offsets, and struct
> file_io *fio contrasts well with struct bio (block_io).  I also had
> used the field ->nbytes instead of ->count, to clarify the difference
> between segment iterators, segment offsets, and absolute bytecount.

The field name is a good suggestion.

What I have there is not actually a full-blown file io descriptor, because
there is no file or offset. It is just an iovec iterator (so maybe I should
rename it to iov_iter, rather than iodesc). 

I think it might be a nice idea to keep this iov_iter as a standalone
structure, and it could be embedded into a struct file_io?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-08 13:07 [rfc][patch 0/3] a faster buffered write deadlock fix? Nick Piggin
                   ` (3 preceding siblings ...)
  2007-02-09  0:38 ` [rfc][patch 0/3] a faster buffered write deadlock fix? Mark Fasheh
@ 2007-02-09  8:41 ` Andrew Morton
  2007-02-09  9:54   ` Nick Piggin
  4 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-02-09  8:41 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Thu,  8 Feb 2007 14:07:15 +0100 (CET) Nick Piggin <npiggin@suse.de> wrote:

> So I have finally finished a first slightly-working draft of my new aops
> op (perform_write) proposal. I would be interested to hear comments about
> it.  Most of my issues and concerns are in the patch headers themselves,
> so reply to them.
> 
> The patches are against my latest buffered-write-fix patchset.

What happened with Linus's proposal to instantiate the page as pinned,
non-uptodate, unlocked and in pagecache while we poke the user address?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09  8:41 ` Andrew Morton
@ 2007-02-09  9:54   ` Nick Piggin
  2007-02-09 10:09     ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-02-09  9:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Fri, Feb 09, 2007 at 12:41:01AM -0800, Andrew Morton wrote:
> On Thu,  8 Feb 2007 14:07:15 +0100 (CET) Nick Piggin <npiggin@suse.de> wrote:
> 
> > So I have finally finished a first slightly-working draft of my new aops
> > op (perform_write) proposal. I would be interested to hear comments about
> > it.  Most of my issues and concerns are in the patch headers themselves,
> > so reply to them.
> > 
> > The patches are against my latest buffered-write-fix patchset.
> 
> What happened with Linus's proposal to instantiate the page as pinned,
> non-uptodate, unlocked and in pagecache while we poke the user address?

That's still got a deadlock, and also it doesn't work if we want to lock
the page when performing a minor fault (which I want to fix fault vs
invalidate), and also assumes nobody's ->nopage locks the page or
requires any resources that are held by prepare_write (something not
immediately clear to me with the cluster filesystems, at least).

But that all becomes legacy path, so do we really care? Supposing fs
maintainers like perform_write, then after the main ones have implementations
we could switch over to the slow-but-correct prepare_write legacy path.
Or we could leave it, or we could use Linus's slightly-less-buggy scheme...
by that point I expect I'd be sick of arguing about it ;)


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09  9:54   ` Nick Piggin
@ 2007-02-09 10:09     ` Andrew Morton
  2007-02-09 10:32       ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-02-09 10:09 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Fri, 9 Feb 2007 10:54:05 +0100 Nick Piggin <npiggin@suse.de> wrote:

> On Fri, Feb 09, 2007 at 12:41:01AM -0800, Andrew Morton wrote:
> > On Thu,  8 Feb 2007 14:07:15 +0100 (CET) Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > So I have finally finished a first slightly-working draft of my new aops
> > > op (perform_write) proposal. I would be interested to hear comments about
> > > it.  Most of my issues and concerns are in the patch headers themselves,
> > > so reply to them.
> > > 
> > > The patches are against my latest buffered-write-fix patchset.
> > 
> > What happened with Linus's proposal to instantiate the page as pinned,
> > non-uptodate, unlocked and in pagecache while we poke the user address?
> 
> That's still got a deadlock,

It does?

>  and also it doesn't work if we want to lock
> the page when performing a minor fault (which I want to fix fault vs
> invalidate),

It's hard to discuss this without a description of what you want to fix
there, and a description of how you plan to fix it.

> and also assumes nobody's ->nopage locks the page or
> requires any resources that are held by prepare_write (something not
> immediately clear to me with the cluster filesystems, at least).

The nopage handler is filemap_nopage().  Are there any exceptions to that?

> But that all becomes legacy path, so do we really care? Supposing fs
> maintainers like perform_write, then after the main ones have implementations
> we could switch over to the slow-but-correct prepare_write legacy path.
> Or we could leave it, or we could use Linus's slightly-less-buggy scheme...
> by that point I expect I'd be sick of arguing about it ;)

It's worth "arguing" about.  This is write().  What matters more??

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09 10:09     ` Andrew Morton
@ 2007-02-09 10:32       ` Nick Piggin
  2007-02-09 10:52         ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-02-09 10:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Fri, Feb 09, 2007 at 02:09:54AM -0800, Andrew Morton wrote:
> On Fri, 9 Feb 2007 10:54:05 +0100 Nick Piggin <npiggin@suse.de> wrote:
> 
> > 
> > That's still got a deadlock,
> 
> It does?

Yes, PG_lock vs mm->mmap_sem.

> >  and also it doesn't work if we want to lock
> > the page when performing a minor fault (which I want to fix fault vs
> > invalidate),
> 
> It's hard to discuss this without a description of what you want to fix
> there, and a description of how you plan to fix it.

http://marc.theaimsgroup.com/?l=linux-mm&m=116865911432667&w=2

> > and also assumes nobody's ->nopage locks the page or
> > requires any resources that are held by prepare_write (something not
> > immediately clear to me with the cluster filesystems, at least).
> 
> The nopage handler is filemap_nopage().  Are there any exceptions to that?

OCFS2 and GFS2.

> > But that all becomes legacy path, so do we really care? Supposing fs
> > maintainers like perform_write, then after the main ones have implementations
> > we could switch over to the slow-but-correct prepare_write legacy path.
> > Or we could leave it, or we could use Linus's slightly-less-buggy scheme...
> > by that point I expect I'd be sick of arguing about it ;)
> 
> It's worth "arguing" about.  This is write().  What matters more??

That's the legacy path that uses prepare/commit (ie. supposing that all
major filesystems did get converted to perform_write).

Of course I would still want my correct-but-slow version in that case,
but I just wouldn't care to argue if you still wanted to keep it fast.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09 10:32       ` Nick Piggin
@ 2007-02-09 10:52         ` Andrew Morton
  2007-02-09 11:31           ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-02-09 10:52 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Fri, 9 Feb 2007 11:32:58 +0100 Nick Piggin <npiggin@suse.de> wrote:

> On Fri, Feb 09, 2007 at 02:09:54AM -0800, Andrew Morton wrote:
> > On Fri, 9 Feb 2007 10:54:05 +0100 Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > 
> > > That's still got a deadlock,
> > 
> > It does?
> 
> Yes, PG_lock vs mm->mmap_sem.

Where?  It requires that someone hold mmap_sem for writing as well as a
page lock (in an order which would require some thought).  Do we ever do
that?

> > >  and also it doesn't work if we want to lock
> > > the page when performing a minor fault (which I want to fix fault vs
> > > invalidate),
> > 
> > It's hard to discuss this without a description of what you want to fix
> > there, and a description of how you plan to fix it.
> 
> http://marc.theaimsgroup.com/?l=linux-mm&m=116865911432667&w=2

mutter.

Could perhaps fix that by running ClearPageUptodate in invalidate, thus
forcing the pagefault code to take the page lock (which we already hold).

That does mean that we'll fleetingly have a non-uptodate page in pagetables
which is a bit nasty.

Or, probably better, we could add a new page flag (heh) telling nopage that
it needs to lock the page even if it's uptodate.

> > > and also assumes nobody's ->nopage locks the page or
> > > requires any resources that are held by prepare_write (something not
> > > immediately clear to me with the cluster filesystems, at least).
> > 
> > The nopage handler is filemap_nopage().  Are there any exceptions to that?
> 
> OCFS2 and GFS2.

So the rule is that ->nopage handlers against pagecache mustn't lock the
page if it's already uptodate.  That's OK.  But it might conflict with the
above invalidate proposal.

Gad.  ocfs2_nopage() diddles with signals.


> > > But that all becomes legacy path, so do we really care? Supposing fs
> > > maintainers like perform_write, then after the main ones have implementations
> > > we could switch over to the slow-but-correct prepare_write legacy path.
> > > Or we could leave it, or we could use Linus's slightly-less-buggy scheme...
> > > by that point I expect I'd be sick of arguing about it ;)
> > 
> > It's worth "arguing" about.  This is write().  What matters more??
> 
> That's the legacy path that uses prepare/commit (ie. supposing that all
> major filesystems did get converted to perform_write).

We'll never, ever, ever update and test all filesytems.  What you're
calling "legacy" code will be there for all time.

I haven't had time to look at the perform_write stuff yet.

> Of course I would still want my correct-but-slow version in that case,
> but I just wouldn't care to argue if you still wanted to keep it fast.

This is write().  We just cannot go and double-copy all the memory or take
mmap_sem and do a full pagetable walk in there.  It just means that we
haven't found a suitable solution yet.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09 10:52         ` Andrew Morton
@ 2007-02-09 11:31           ` Nick Piggin
  2007-02-09 11:46             ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-02-09 11:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Fri, Feb 09, 2007 at 02:52:49AM -0800, Andrew Morton wrote:
> On Fri, 9 Feb 2007 11:32:58 +0100 Nick Piggin <npiggin@suse.de> wrote:
> 
> > On Fri, Feb 09, 2007 at 02:09:54AM -0800, Andrew Morton wrote:
> > > On Fri, 9 Feb 2007 10:54:05 +0100 Nick Piggin <npiggin@suse.de> wrote:
> > > 
> > > > 
> > > > That's still got a deadlock,
> > > 
> > > It does?
> > 
> > Yes, PG_lock vs mm->mmap_sem.
> 
> Where?  It requires that someone hold mmap_sem for writing as well as a
> page lock (in an order which would require some thought).  Do we ever do
> that?

task1			task2			task3
lock_page(page)		down_read(mmap_sem)
						down_write(mmap_sem)
down_read(mmap_sem)
			lock_page(page)

t1 is blocked by t3
t2 is blocked by t1
t3 is blocked by t2

OK, it isn't quite ABBA, but that's shorthand if we remember that
that an rwsem read must obey normal lock nesting rules.

> > > >  and also it doesn't work if we want to lock
> > > > the page when performing a minor fault (which I want to fix fault vs
> > > > invalidate),
> > > 
> > > It's hard to discuss this without a description of what you want to fix
> > > there, and a description of how you plan to fix it.
> > 
> > http://marc.theaimsgroup.com/?l=linux-mm&m=116865911432667&w=2
> 
> mutter.
> 
> Could perhaps fix that by running ClearPageUptodate in invalidate, thus
> forcing the pagefault code to take the page lock (which we already hold).
> 
> That does mean that we'll fleetingly have a non-uptodate page in pagetables
> which is a bit nasty.
> 
> Or, probably better, we could add a new page flag (heh) telling nopage that
> it needs to lock the page even if it's uptodate.

I don't see how either of those suggestions would fix this bug.

> > > > and also assumes nobody's ->nopage locks the page or
> > > > requires any resources that are held by prepare_write (something not
> > > > immediately clear to me with the cluster filesystems, at least).
> > > 
> > > The nopage handler is filemap_nopage().  Are there any exceptions to that?
> > 
> > OCFS2 and GFS2.
> 
> So the rule is that ->nopage handlers against pagecache mustn't lock the
> page if it's already uptodate.  That's OK.  But it might conflict with the
> above invalidate proposal.

Well that _will_ be the rule if you want to do Linus's half-fix. It will
also be the rule that they're not to deadlock against prepare_write.

> > > > But that all becomes legacy path, so do we really care? Supposing fs
> > > > maintainers like perform_write, then after the main ones have implementations
> > > > we could switch over to the slow-but-correct prepare_write legacy path.
> > > > Or we could leave it, or we could use Linus's slightly-less-buggy scheme...
> > > > by that point I expect I'd be sick of arguing about it ;)
> > > 
> > > It's worth "arguing" about.  This is write().  What matters more??
> > 
> > That's the legacy path that uses prepare/commit (ie. supposing that all
> > major filesystems did get converted to perform_write).
> 
> We'll never, ever, ever update and test all filesytems.  What you're
> calling "legacy" code will be there for all time.

I didn't say all; I still prefer correct than fast; you are still free
to keep the fast-and-buggy code in the legacy path.

> 
> I haven't had time to look at the perform_write stuff yet.
> 
> > Of course I would still want my correct-but-slow version in that case,
> > but I just wouldn't care to argue if you still wanted to keep it fast.
> 
> This is write().  We just cannot go and double-copy all the memory or take
> mmap_sem and do a full pagetable walk in there.  It just means that we
> haven't found a suitable solution yet.

You prefer speed over correctness even for little used filessytems, which
is fine because I'm sick of arguing about it. The main thing for me is that
important filesystems can be correct and fast.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09 11:31           ` Nick Piggin
@ 2007-02-09 11:46             ` Andrew Morton
  2007-02-09 12:11               ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-02-09 11:46 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Fri, 9 Feb 2007 12:31:16 +0100 Nick Piggin <npiggin@suse.de> wrote:

> > > > > But that all becomes legacy path, so do we really care? Supposing fs
> > > > > maintainers like perform_write, then after the main ones have implementations
> > > > > we could switch over to the slow-but-correct prepare_write legacy path.
> > > > > Or we could leave it, or we could use Linus's slightly-less-buggy scheme...
> > > > > by that point I expect I'd be sick of arguing about it ;)
> > > > 
> > > > It's worth "arguing" about.  This is write().  What matters more??
> > > 
> > > That's the legacy path that uses prepare/commit (ie. supposing that all
> > > major filesystems did get converted to perform_write).
> > 
> > We'll never, ever, ever update and test all filesytems.  What you're
> > calling "legacy" code will be there for all time.
> 
> I didn't say all; I still prefer correct than fast;

For gawd's sake.  You can make the kernel less buggy by removing SMP
support.

Guess what?  Tradeoffs exist.

> you are still free
> to keep the fast-and-buggy code in the legacy path.

You make it sound like this is a choice.  It isn't.  Nobody is going to go
in and convert all those filesystems.

> > 
> > I haven't had time to look at the perform_write stuff yet.
> > 
> > > Of course I would still want my correct-but-slow version in that case,
> > > but I just wouldn't care to argue if you still wanted to keep it fast.
> > 
> > This is write().  We just cannot go and double-copy all the memory or take
> > mmap_sem and do a full pagetable walk in there.  It just means that we
> > haven't found a suitable solution yet.
> 
> You prefer speed over correctness even for little used filessytems, which
> is fine because I'm sick of arguing about it. The main thing for me is that
> important filesystems can be correct and fast.

I wouldn't characterise it as "arguing".  It's development.  Going and
sticking enormous slowdowns into write() to fix some bug which nobody is
hitting is insane.

We need to find a better fix, that's all.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc][patch 0/3] a faster buffered write deadlock fix?
  2007-02-09 11:46             ` Andrew Morton
@ 2007-02-09 12:11               ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-02-09 12:11 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Filesystems, Linux Kernel, Linus Torvalds

On Fri, Feb 09, 2007 at 03:46:44AM -0800, Andrew Morton wrote:
> On Fri, 9 Feb 2007 12:31:16 +0100 Nick Piggin <npiggin@suse.de> wrote:
> 
> > > 
> > > We'll never, ever, ever update and test all filesytems.  What you're
> > > calling "legacy" code will be there for all time.
> > 
> > I didn't say all; I still prefer correct than fast;
> 
> For gawd's sake.  You can make the kernel less buggy by removing SMP
> support.

I'm talking about known bugs.

> Guess what?  Tradeoffs exist.

I agree that 60% is much too big of a hit for all filesystems. Which is
why I propose this new aop.

> > you are still free
> > to keep the fast-and-buggy code in the legacy path.
> 
> You make it sound like this is a choice.  It isn't.  Nobody is going to go
> in and convert all those filesystems.

IMO, once all the maintained filesystems are converted then it would be
a good choice to make. You think otherwise and I won't argue.

> > > 
> > > I haven't had time to look at the perform_write stuff yet.
> > > 
> > > > Of course I would still want my correct-but-slow version in that case,
> > > > but I just wouldn't care to argue if you still wanted to keep it fast.
> > > 
> > > This is write().  We just cannot go and double-copy all the memory or take
> > > mmap_sem and do a full pagetable walk in there.  It just means that we
> > > haven't found a suitable solution yet.
> > 
> > You prefer speed over correctness even for little used filessytems, which
> > is fine because I'm sick of arguing about it. The main thing for me is that
> > important filesystems can be correct and fast.
> 
> I wouldn't characterise it as "arguing".  It's development.  Going and
> sticking enormous slowdowns into write() to fix some bug which nobody is
> hitting is insane.

Actually I'm doing this because I try to fix real data corruption problems
which people are hitting. You told me I can't get those fixes in until I
fix this problem.

> We need to find a better fix, that's all.

I actually found perform_write to be a speedup. And if perform_write is
merged then I would be happy to not fix the prepare_write path, or wait for
someone to come up with a better fix.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 1/3] fs: add an iovec iterator
  2007-02-09  3:31         ` Nick Piggin
@ 2007-02-09 17:28           ` Zach Brown
  0 siblings, 0 replies; 34+ messages in thread
From: Zach Brown @ 2007-02-09 17:28 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Nate Diller, Christoph Hellwig, Linux Filesystems, Linux Kernel,
	Andrew Morton

> What I have there is not actually a full-blown file io descriptor,  
> because
> there is no file or offset. It is just an iovec iterator (so maybe  
> I should
> rename it to iov_iter, rather than iodesc).
>
> I think it might be a nice idea to keep this iov_iter as a standalone
> structure, and it could be embedded into a struct file_io?

Yeah, maybe.

I'm not sure we need something as generic as a "file_io" struct.

To recap, I've hoped for the expression of the state of an iovec  
array with a richer structure to avoid the multiple walks of the  
array at different parts of the kernel that previously only had  
access to the raw iovec * and size_t count.

Stuff like the alignment checks in __blockdev_direct_IO() and the  
pages_in_io accounting in direct_io_worker().

I imagined building up the state in this 'iodesc' structure as we  
first copied and verified the structure from userspace.  (say in  
rw_copy_check_uvector()).

If as we copied we, say, stored in the bits of the buffer and length  
fields then by the time we got to __blockdev_direct_IO() we'd just  
test the bits for misalignment instead of iterating over the array  
again.

It starts to get gross as some paths currently modify the kernel copy  
of the array as they process it :/.

- z

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 3/3] ext2: use perform_write aop
  2007-02-08 13:07 ` [patch 3/3] ext2: use " Nick Piggin
  2007-02-08 14:47   ` Dmitriy Monakhov
@ 2007-02-09 19:14   ` Andrew Morton
  2007-02-09 19:45     ` Andrew Morton
  1 sibling, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-02-09 19:14 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel

On Thu,  8 Feb 2007 14:07:46 +0100 (CET) Nick Piggin <npiggin@suse.de> wrote:

> +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
> +{
> +	unsigned int block_start, block_end;
> +	struct buffer_head *head, *bh;
> +
> +	BUG_ON(!PageLocked(page));
> +	if (!page_has_buffers(page))
> +		return;
> +
> +	bh = head = page_buffers(page);
> +	block_start = 0;
> +	do {
> +		block_end = block_start + bh->b_size;
> +
> +		if (buffer_new(bh)) {
> +			if (block_end > from && block_start < to) {
> +				if (!PageUptodate(page)) {
> +					unsigned start, end;
> +					void *kaddr;
> +
> +					start = max(from, block_start);
> +					end = min(to, block_end);
> +
> +					kaddr = kmap_atomic(page, KM_USER0);
> +					memset(kaddr+start, 0, block_end-end);
> +					flush_dcache_page(page);
> +					kunmap_atomic(kaddr, KM_USER0);
> +					set_buffer_uptodate(bh);
> +				}

I don't see how this differs from the previous attempts to solve the
deadlock via atomic copt_from_user().  Here we temporarily zero out the
pagecache page then block_perform_write() unlocks the page.  So another
thread can come in, read the page and see the temporary zeroes?  

If so, that might be preventable by leaving the buffer nonuptodate.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 3/3] ext2: use perform_write aop
  2007-02-09 19:14   ` Andrew Morton
@ 2007-02-09 19:45     ` Andrew Morton
  2007-02-10  1:34       ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-02-09 19:45 UTC (permalink / raw)
  To: Nick Piggin, Linux Filesystems, Linux Kernel

On Fri, 9 Feb 2007 11:14:55 -0800 Andrew Morton <akpm@linux-foundation.org> wrote:

> On Thu,  8 Feb 2007 14:07:46 +0100 (CET) Nick Piggin <npiggin@suse.de> wrote:
> 
> > +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
> > +{
> > +	unsigned int block_start, block_end;
> > +	struct buffer_head *head, *bh;
> > +
> > +	BUG_ON(!PageLocked(page));
> > +	if (!page_has_buffers(page))
> > +		return;
> > +
> > +	bh = head = page_buffers(page);
> > +	block_start = 0;
> > +	do {
> > +		block_end = block_start + bh->b_size;
> > +
> > +		if (buffer_new(bh)) {
> > +			if (block_end > from && block_start < to) {
> > +				if (!PageUptodate(page)) {
> > +					unsigned start, end;
> > +					void *kaddr;
> > +
> > +					start = max(from, block_start);
> > +					end = min(to, block_end);
> > +
> > +					kaddr = kmap_atomic(page, KM_USER0);
> > +					memset(kaddr+start, 0, block_end-end);
> > +					flush_dcache_page(page);
> > +					kunmap_atomic(kaddr, KM_USER0);
> > +					set_buffer_uptodate(bh);
> > +				}
> 
> I don't see how this differs from the previous attempts to solve the
> deadlock via atomic copt_from_user().  Here we temporarily zero out the
> pagecache page then block_perform_write() unlocks the page.  So another
> thread can come in, read the page and see the temporary zeroes?  
> 
> If so, that might be preventable by leaving the buffer nonuptodate.

oh, OK, it was buffer_new(), so zeroes are the right thing for a reader to
see.

But if it wasn't buffer_new() then the appropriate thing for the reader to
see is what's on the disk.  But __block_prepare_write() won't read a buffer
which is fully-inside the write area from disk.

And that's seemingly OK, because if a reader gets in there after the short
copy, that reader will see the non-uptodate buffer and will populate it
from disk.

But doing that will overwrite the data which the write() caller managed to
copy into the page before it took a fault.  And that's not OK because
block_perform_write() does iovec_iterator_advance(i, copied) in this case
and hence will not rerun the copy after acquiring the page lock?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 3/3] ext2: use perform_write aop
  2007-02-09 19:45     ` Andrew Morton
@ 2007-02-10  1:34       ` Nick Piggin
  2007-02-10  1:50         ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-02-10  1:34 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Filesystems, Linux Kernel

On Fri, Feb 09, 2007 at 11:45:39AM -0800, Andrew Morton wrote:
> On Fri, 9 Feb 2007 11:14:55 -0800 Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > If so, that might be preventable by leaving the buffer nonuptodate.
> 
> oh, OK, it was buffer_new(), so zeroes are the right thing for a reader to
> see.
> 
> But if it wasn't buffer_new() then the appropriate thing for the reader to
> see is what's on the disk.  But __block_prepare_write() won't read a buffer
> which is fully-inside the write area from disk.
> 
> And that's seemingly OK, because if a reader gets in there after the short
> copy, that reader will see the non-uptodate buffer and will populate it
> from disk.
> 
> But doing that will overwrite the data which the write() caller managed to
> copy into the page before it took a fault.  And that's not OK because
> block_perform_write() does iovec_iterator_advance(i, copied) in this case
> and hence will not rerun the copy after acquiring the page lock?

Hmm, yeah. This can be handled by not advancing partially into a !uptodate
buffer.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 3/3] ext2: use perform_write aop
  2007-02-10  1:34       ` Nick Piggin
@ 2007-02-10  1:50         ` Andrew Morton
  0 siblings, 0 replies; 34+ messages in thread
From: Andrew Morton @ 2007-02-10  1:50 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel

On Sat, 10 Feb 2007 02:34:07 +0100
Nick Piggin <npiggin@suse.de> wrote:

> On Fri, Feb 09, 2007 at 11:45:39AM -0800, Andrew Morton wrote:
> > On Fri, 9 Feb 2007 11:14:55 -0800 Andrew Morton <akpm@linux-foundation.org> wrote:
> > 
> > > If so, that might be preventable by leaving the buffer nonuptodate.
> > 
> > oh, OK, it was buffer_new(), so zeroes are the right thing for a reader to
> > see.
> > 
> > But if it wasn't buffer_new() then the appropriate thing for the reader to
> > see is what's on the disk.  But __block_prepare_write() won't read a buffer
> > which is fully-inside the write area from disk.
> > 
> > And that's seemingly OK, because if a reader gets in there after the short
> > copy, that reader will see the non-uptodate buffer and will populate it
> > from disk.
> > 
> > But doing that will overwrite the data which the write() caller managed to
> > copy into the page before it took a fault.  And that's not OK because
> > block_perform_write() does iovec_iterator_advance(i, copied) in this case
> > and hence will not rerun the copy after acquiring the page lock?
> 
> Hmm, yeah. This can be handled by not advancing partially into a !uptodate
> buffer.

Think so, yeah.

Overall, the implementation you have there seems reasonable to me. 
Basically it's passing the responsibility for preventing the deadlock and
the exposure-of-zeroes problem down into the filesystem itself, where we
have visibility of the state of the various subsections of the page and can
take appropriate actions in response to that.

It's got conceptually harder to follow as a result, which is a shame.  But
still no magic bullet is on offer.

I pity the poor schmuck who has to write ext3_journalled_perform_write(),
ext3_ordered_perform_write(), ext3_writeback_perform_write(),
ext3_writeback_nobh_perform_write() and all that other stuff.  But I think
we need to do that pretty soon to validate the whole approach.  Also xfs
and reiser3.

NTFS will be interesting from the can-this-be-made-to-work POV.

Is NFS vulnerable to the deadlock?  It looks to be.  Shudder.

We'd need to find a way of communicating all this to the poor old
fs maintainers.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-02-08 13:07 ` [patch 2/3] fs: introduce perform_write aop Nick Piggin
@ 2007-03-09 10:39   ` Christoph Hellwig
  2007-03-09 12:52     ` Nick Piggin
  2007-03-09 23:33     ` Mark Fasheh
  0 siblings, 2 replies; 34+ messages in thread
From: Christoph Hellwig @ 2007-03-09 10:39 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Filesystems, Linux Kernel, Andrew Morton

Hi Nick,

sorry for my later reply, this has been on my to answer list for the last
month and I only managed to get back to it now.

On Thu, Feb 08, 2007 at 02:07:36PM +0100, Nick Piggin wrote:
> Add a new "perform_write" aop, which replaces prepare_write and commit_write
> as a single call to copy a given amount of userdata at the given offset. This
> is more flexible, because the implementation can determine how to best handle
> errors, or multi-page ranges (eg. it may use a gang lookup), and only requires
> one call into the fs.

I really like this idea, especially for avoiding to call into the allocator
for every block.  Have you contacted the reiser4 folks whether this would
superceed their batch_write op completely?

> One problem with this interface is that it cannot be used to write into the
> filesystem by any means other than already-initialised buffers via iovecs. So
> prepare/commit have to stay around for non-user data... 

Actually I think that's a a good thing to a certain extent.  It reminds
us that all other users are horrible abuse of the interface.  I'd even
go so far as to make batch_write a callback that the filesystem passes
to generic_file_aio_write to make clear it's not a generic thing but
a helper.  (It's not a generic thing because it's the upper layer writing
into the pagecache, not a pagecache to fs below operation).

The still leaves open on how to get rid of ->prepare_write and ->commit_write
compltely, and for that we'll probably need ->kernel_read and ->kernel_write
file operations.  But that's a step you shouldn't consider yet when doing
this work.

> Another thing is that it seems to be less able to be implemented in generic,
> reusable code. It should be possible to introduce a new 2-op interface (or
> maybe just a new error handler op) which can be used correctly in generic code.

We should be able to find a nice abstraction for this, see my next mails.

> +	/*
> +	 * perform_write replaces prepare and commit_write callbacks.
> +	 */

This is a rather useless comment :)  Better remove it and add a proper
descriptions to Documentation/filesystems/vfs.txt and
Documentation/filesystems/Locking


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 1/3] fs: add an iovec iterator
  2007-02-09  2:03       ` Nate Diller
  2007-02-09  3:31         ` Nick Piggin
@ 2007-03-09 10:40         ` Christoph Hellwig
  1 sibling, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2007-03-09 10:40 UTC (permalink / raw)
  To: Nate Diller
  Cc: Nick Piggin, Christoph Hellwig, Linux Filesystems, Linux Kernel,
	Andrew Morton

On Thu, Feb 08, 2007 at 06:03:50PM -0800, Nate Diller wrote:
> i had a patch integrating the iodesc idea, but after some thought, had
> decided to call it struct file_io.  That name reflects the fact that
> it's doing I/O in arbitrary lengths with byte offsets, and struct
> file_io *fio contrasts well with struct bio (block_io).  I also had
> used the field ->nbytes instead of ->count, to clarify the difference
> between segment iterators, segment offsets, and absolute bytecount.

struct file_io sounds rather ugly to me, I don't know why.  And it's
really user I/O so we could call it struct uio (historical punt intended) :)


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-03-09 10:39   ` Christoph Hellwig
@ 2007-03-09 12:52     ` Nick Piggin
  2007-03-09 22:01       ` Anton Altaparmakov
  2007-03-09 23:33     ` Mark Fasheh
  1 sibling, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-03-09 12:52 UTC (permalink / raw)
  To: Christoph Hellwig, Linux Filesystems, Linux Kernel, Andrew Morton

Hi Christoph,

On Fri, Mar 09, 2007 at 10:39:13AM +0000, Christoph Hellwig wrote:
> Hi Nick,
> 
> sorry for my later reply, this has been on my to answer list for the last
> month and I only managed to get back to it now.

No worries, I haven't had much time to work on it since then anyway.
Thanks for taking a look.

> On Thu, Feb 08, 2007 at 02:07:36PM +0100, Nick Piggin wrote:
> > as a single call to copy a given amount of userdata at the given offset. This
> > is more flexible, because the implementation can determine how to best handle
> > errors, or multi-page ranges (eg. it may use a gang lookup), and only requires
> > one call into the fs.
> 
> I really like this idea, especially for avoiding to call into the allocator
> for every block.  Have you contacted the reiser4 folks whether this would
> superceed their batch_write op completely?

I haven't yet, although that's been on my todo list when I get the API
into a more final state.

batch_write seems quite similar, however theirs is still page based, and
a bit crufty, IMO. I found it to be really clean to just pass down offsets,
but that may be a matter for debate.

What they _do_ have is a write actor function that will do the data copy.
This could be one possible way to get rid of ->prepare_write and
->commit_write, but I haven't tried that yet, because I don't like adding
more redirection and complexity if possible...

> > One problem with this interface is that it cannot be used to write into the
> > filesystem by any means other than already-initialised buffers via iovecs. So
> > prepare/commit have to stay around for non-user data... 
> 
> Actually I think that's a a good thing to a certain extent.  It reminds
> us that all other users are horrible abuse of the interface.  I'd even
> go so far as to make batch_write a callback that the filesystem passes
> to generic_file_aio_write to make clear it's not a generic thing but
> a helper.  (It's not a generic thing because it's the upper layer writing
> into the pagecache, not a pagecache to fs below operation).

OK, if you think that's reasonable, then that is one hurdle out of the way ;)

> The still leaves open on how to get rid of ->prepare_write and ->commit_write
> compltely, and for that we'll probably need ->kernel_read and ->kernel_write
> file operations.  But that's a step you shouldn't consider yet when doing
> this work.

I had a couple of possibilities for that. First is passing in a write actor
(eg. defaulting to the normal iovec usercopy), but as I said I consider this
more like fixing the problem with brute force (ie. just making the interface
more complex). Maybe as a last resort, though.

Another thing that would be much nicer from _my_ point of view would be to
just make all kernel users set up their data in an iovec, and use the normal
call with KERNEL_DS. Unfortunately, this is not the expected way for a lot
of code to work, and it might require extra copying of the data.


> > Another thing is that it seems to be less able to be implemented in generic,
> > reusable code. It should be possible to introduce a new 2-op interface (or
> > maybe just a new error handler op) which can be used correctly in generic code.
> 
> We should be able to find a nice abstraction for this, see my next mails.
> 
> > +	/*
> > +	 * perform_write replaces prepare and commit_write callbacks.
> > +	 */
> 
> This is a rather useless comment :)  Better remove it and add a proper
> descriptions to Documentation/filesystems/vfs.txt and
> Documentation/filesystems/Locking

Will do. Thanks!


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-03-09 12:52     ` Nick Piggin
@ 2007-03-09 22:01       ` Anton Altaparmakov
  0 siblings, 0 replies; 34+ messages in thread
From: Anton Altaparmakov @ 2007-03-09 22:01 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Hellwig, Linux Filesystems, Linux Kernel, Andrew Morton


On 9 Mar 2007, at 12:52, Nick Piggin wrote:

> Hi Christoph,
>
> On Fri, Mar 09, 2007 at 10:39:13AM +0000, Christoph Hellwig wrote:
>> Hi Nick,
>>
>> sorry for my later reply, this has been on my to answer list for  
>> the last
>> month and I only managed to get back to it now.
>
> No worries, I haven't had much time to work on it since then anyway.
> Thanks for taking a look.
>
>> On Thu, Feb 08, 2007 at 02:07:36PM +0100, Nick Piggin wrote:
>>> as a single call to copy a given amount of userdata at the given  
>>> offset. This
>>> is more flexible, because the implementation can determine how to  
>>> best handle
>>> errors, or multi-page ranges (eg. it may use a gang lookup), and  
>>> only requires
>>> one call into the fs.
>>
>> I really like this idea, especially for avoiding to call into the  
>> allocator
>> for every block.

Indeed.  FWIW my NTFS driver does not use the generic file write  
helper function and instead has its own code that does the allocation  
first and then does the writing a page at a time much the same as  
generic file write does so depending on what this new interface ends  
up looking like exactly I may well be able to switch the NTFS driver  
to use it instead of doing everything by myself and duplicating a ton  
of code from the VFS...

Best regards,

	Anton

>> Have you contacted the reiser4 folks whether this would
>> superceed their batch_write op completely?
>
> I haven't yet, although that's been on my todo list when I get the API
> into a more final state.
>
> batch_write seems quite similar, however theirs is still page  
> based, and
> a bit crufty, IMO. I found it to be really clean to just pass down  
> offsets,
> but that may be a matter for debate.
>
> What they _do_ have is a write actor function that will do the data  
> copy.
> This could be one possible way to get rid of ->prepare_write and
> ->commit_write, but I haven't tried that yet, because I don't like  
> adding
> more redirection and complexity if possible...
>
>>> One problem with this interface is that it cannot be used to  
>>> write into the
>>> filesystem by any means other than already-initialised buffers  
>>> via iovecs. So
>>> prepare/commit have to stay around for non-user data...
>>
>> Actually I think that's a a good thing to a certain extent.  It  
>> reminds
>> us that all other users are horrible abuse of the interface.  I'd  
>> even
>> go so far as to make batch_write a callback that the filesystem  
>> passes
>> to generic_file_aio_write to make clear it's not a generic thing but
>> a helper.  (It's not a generic thing because it's the upper layer  
>> writing
>> into the pagecache, not a pagecache to fs below operation).
>
> OK, if you think that's reasonable, then that is one hurdle out of  
> the way ;)
>
>> The still leaves open on how to get rid of ->prepare_write and - 
>> >commit_write
>> compltely, and for that we'll probably need ->kernel_read and - 
>> >kernel_write
>> file operations.  But that's a step you shouldn't consider yet  
>> when doing
>> this work.
>
> I had a couple of possibilities for that. First is passing in a  
> write actor
> (eg. defaulting to the normal iovec usercopy), but as I said I  
> consider this
> more like fixing the problem with brute force (ie. just making the  
> interface
> more complex). Maybe as a last resort, though.
>
> Another thing that would be much nicer from _my_ point of view  
> would be to
> just make all kernel users set up their data in an iovec, and use  
> the normal
> call with KERNEL_DS. Unfortunately, this is not the expected way  
> for a lot
> of code to work, and it might require extra copying of the data.
>
>
>>> Another thing is that it seems to be less able to be implemented  
>>> in generic,
>>> reusable code. It should be possible to introduce a new 2-op  
>>> interface (or
>>> maybe just a new error handler op) which can be used correctly in  
>>> generic code.
>>
>> We should be able to find a nice abstraction for this, see my next  
>> mails.
>>
>>> +	/*
>>> +	 * perform_write replaces prepare and commit_write callbacks.
>>> +	 */
>>
>> This is a rather useless comment :)  Better remove it and add a  
>> proper
>> descriptions to Documentation/filesystems/vfs.txt and
>> Documentation/filesystems/Locking
>
> Will do. Thanks!

-- 
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer, http://www.linux-ntfs.org/



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-03-09 10:39   ` Christoph Hellwig
  2007-03-09 12:52     ` Nick Piggin
@ 2007-03-09 23:33     ` Mark Fasheh
  2007-03-10  9:25       ` Christoph Hellwig
  1 sibling, 1 reply; 34+ messages in thread
From: Mark Fasheh @ 2007-03-09 23:33 UTC (permalink / raw)
  To: Christoph Hellwig, Nick Piggin, Linux Filesystems, Linux Kernel,
	Andrew Morton

On Fri, Mar 09, 2007 at 10:39:13AM +0000, Christoph Hellwig wrote:
> > One problem with this interface is that it cannot be used to write into the
> > filesystem by any means other than already-initialised buffers via iovecs. So
> > prepare/commit have to stay around for non-user data... 
> 
> Actually I think that's a a good thing to a certain extent.  It reminds
> us that all other users are horrible abuse of the interface.  I'd even
> go so far as to make batch_write a callback that the filesystem passes
> to generic_file_aio_write to make clear it's not a generic thing but
> a helper.  (It's not a generic thing because it's the upper layer writing
> into the pagecache, not a pagecache to fs below operation).
> 
> The still leaves open on how to get rid of ->prepare_write and ->commit_write
> compltely, and for that we'll probably need ->kernel_read and ->kernel_write
> file operations.  But that's a step you shouldn't consider yet when doing
> this work.

->kernel_write() as opposed to genericizing ->perform_write() would be fine
with me. Just so long as we get rid of ->prepare_write and ->commit_write in
that other kernel code doesn't call them directly. That interface just
doesn't work for Ocfs2. There, we have the triple whammy of having to order
cluster locks with page locks, avoiding nesting cluster locks in the case
that the user data has to be paged in (causing a lock in ->readpage()) and
grabbing / zeroing adjacent pages to fill holes.

So, a combination of ->perform_write and ->kernel_write() could really help
me solve my write woes.

Right now I've got Ocfs2 implementing it's own lowest-level buffered write
code - think generic_file_buffered_write() replacement for Ocfs2. With some
duplicated code above that layer. What's nice is that I can abstract away
the "copy data into some target pages" bits such that the majority of that
code is re-usable for ocfs2's splice write operation. I'm not sure we could
have that low a level of abstraction for anyhing above individual the file
system though which also has to deal with non-kernel writes though. That's
where a ->kernel_write() might come in handy.
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-03-09 23:33     ` Mark Fasheh
@ 2007-03-10  9:25       ` Christoph Hellwig
  2007-03-12  2:13         ` Mark Fasheh
  2007-03-14 13:30         ` Nick Piggin
  0 siblings, 2 replies; 34+ messages in thread
From: Christoph Hellwig @ 2007-03-10  9:25 UTC (permalink / raw)
  To: Mark Fasheh
  Cc: Christoph Hellwig, Nick Piggin, Linux Filesystems, Linux Kernel,
	Andrew Morton

On Fri, Mar 09, 2007 at 03:33:01PM -0800, Mark Fasheh wrote:
> ->kernel_write() as opposed to genericizing ->perform_write() would be fine
> with me. Just so long as we get rid of ->prepare_write and ->commit_write in
> that other kernel code doesn't call them directly. That interface just
> doesn't work for Ocfs2.

It doesn't work for any filesystem that needs slightly fancy locking.
That and the reason that's an interface that doesn't fit into our
layering is why I want to get rid of it.  Note that fops->kernel_write
might in fact use ->perform_write with an actor as Nick suggested.
I'm not quite sure how it'll look like - I'd rather take care of the
buffered write path first and then handle this issue once the first
changes have stabilized.

> Right now I've got Ocfs2 implementing it's own lowest-level buffered write
> code - think generic_file_buffered_write() replacement for Ocfs2. With some
> duplicated code above that layer. What's nice is that I can abstract away
> the "copy data into some target pages" bits such that the majority of that
> code is re-usable for ocfs2's splice write operation. I'm not sure we could
> have that low a level of abstraction for anyhing above individual the file
> system though which also has to deal with non-kernel writes though. That's
> where a ->kernel_write() might come in handy.

Why do you need your own low level buffered write functionality?  As in
past times when filesystems want to come up I'd like to have a very
good exaplanation on why you think it's needed and whether we shouldn't
improve the generic buffered write code instead.  This codepath is so nasty
that any duplication will be a maintaince horror.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-03-10  9:25       ` Christoph Hellwig
@ 2007-03-12  2:13         ` Mark Fasheh
  2007-03-14 13:30         ` Nick Piggin
  1 sibling, 0 replies; 34+ messages in thread
From: Mark Fasheh @ 2007-03-12  2:13 UTC (permalink / raw)
  To: Christoph Hellwig, Nick Piggin, Linux Filesystems, Linux Kernel,
	Andrew Morton

On Sat, Mar 10, 2007 at 09:25:41AM +0000, Christoph Hellwig wrote:
> On Fri, Mar 09, 2007 at 03:33:01PM -0800, Mark Fasheh wrote:
> > ->kernel_write() as opposed to genericizing ->perform_write() would be fine
> > with me. Just so long as we get rid of ->prepare_write and ->commit_write in
> > that other kernel code doesn't call them directly. That interface just
> > doesn't work for Ocfs2.
> 
> It doesn't work for any filesystem that needs slightly fancy locking.
> That and the reason that's an interface that doesn't fit into our
> layering is why I want to get rid of it.  Note that fops->kernel_write
> might in fact use ->perform_write with an actor as Nick suggested.
> I'm not quite sure how it'll look like - I'd rather take care of the
> buffered write path first and then handle this issue once the first
> changes have stabilized.
> 
> > Right now I've got Ocfs2 implementing it's own lowest-level buffered write
> > code - think generic_file_buffered_write() replacement for Ocfs2. With some
> > duplicated code above that layer. What's nice is that I can abstract away
> > the "copy data into some target pages" bits such that the majority of that
> > code is re-usable for ocfs2's splice write operation. I'm not sure we could
> > have that low a level of abstraction for anyhing above individual the file
> > system though which also has to deal with non-kernel writes though. That's
> > where a ->kernel_write() might come in handy.
> 
> Why do you need your own low level buffered write functionality?  As in
> past times when filesystems want to come up I'd like to have a very
> good exaplanation on why you think it's needed and whether we shouldn't
> improve the generic buffered write code instead.

Fair enough - I personally tried everything I could before coming to the
conclusion that for the time being, Ocfs2 should have a seperate write path.

As you know, I've been adding sparse file support for Ocfs2. Putting aside
all the reasons to have real support for sparse files (as opposed to zeroing
allocated regions), the tree code changes alone has gotten us 90% the way to
supporting unwritten extents (much like xfs does).

Ocfs2 supports atomic data allocation units ('clusters', to use an
overloaded term) which can range in size from 4k to 1 meg. This means that
for allocating writes on page size < cluster size file systems, we have to
zero pages adjacent to the one being written so that a re-read doesn't
return dirty data. This alone requires page locking which we can't
realistically achieve via ->prepare_write() and ->commit_write(). I believe
NTFS has a similar restriction, which has lead to their own file write.

So, page locking was definitely the straw that broke the camels back. Some
other things which were akward or slightly less critically broken than the
page locking:

Since ocfs2 has a rather large (compared to a local file system) context to
build up during an allocating write, it became uncomfortable to pass that
around ->prepare_write() and ->commit_write() without putting that context
on our struct inode and protecting it with a lock. And since the existing
interfaces were so rigid, it actually required a lot more context to be
passed around than in my current code.

There's also the cluster lock / page lock inversion which we have to deal
with (it gets even worse if we fault in pages in the middle of the user copy
for a write). Granted, we fixed a lot of that before merging, but allocating
in write means taking even more cluster locks and I don't really feel
comfortable nesting so many of those within the page locks.

Finally, we get to the optimization problem - writing stuff one page at a
time. To be fair, my current stuff doesn't do a very good job of optimizing
the amount of data written in a given pass, but the groundwork is there to
easily write at least one clusters worth of user data at a time. My priority
has been mostly to stabilize it as opposed to performance tuning.

So, quite possibly, I overstated what Ocfs2 was doing earlier - we still
make use of as much generic code as we can. The O_DIRECT path for instance
wasn't touched. Ocfs2 still makes use of block_commit_write(), the standard
jbd mechanisms for ordered data mode, and though we got rid of
block_prepare_write() (for zeroing reasons), what we do is a much simpler
version.

By the way, the code in question can be found in the sparse_files branch of
ocfs2.git:

http://git.kernel.org/?p=linux/kernel/git/mfasheh/ocfs2.git;a=log;h=sparse_files

Your review has been extremely useful in the past, so I welcome any comments
you might have.

Though it's getting close to being put in ALL (for a spin in -mm), it's
definitely a work in progress branch. There's 3 patches to generic code
which I need to push out for review (it's pretty much just exporting symbols
which we'd need in any case). Also, some of the bug fixes and feature
adjustments need to get folded back into their respective patches.

> This codepath is so nasty that any duplication will be a maintaince
> horror.

All that said above, I agree that duplication sucks - it's what I've had to
do for the time being, but I'm 100% keen on re-merging things into a generic
path once we sort it out. Hence my interest in this thread. The Ocfs2 write
changes are almost complete and are testing well, so I think I'm in a good
position to start working on some of this stuff now.

I'm hoping that ->perform_write() can become the interface that Ocfs2 plugs
into. Just to list things out, here's the requirements I could come up with
off the top of my head:
  - allow locking of pages for zeroing
  - fault in user data before taking cluster locks and target inode page
    locks
  - give us enough information to write larger than one page worth of data
  - avoid forcing fancier file systems to stick write context information on
    their global inode structure.

Thanks,
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-03-10  9:25       ` Christoph Hellwig
  2007-03-12  2:13         ` Mark Fasheh
@ 2007-03-14 13:30         ` Nick Piggin
  2007-03-14 15:17           ` Christoph Hellwig
  1 sibling, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-03-14 13:30 UTC (permalink / raw)
  To: Christoph Hellwig, Mark Fasheh, Linux Filesystems, Linux Kernel,
	Andrew Morton

On Sat, Mar 10, 2007 at 09:25:41AM +0000, Christoph Hellwig wrote:
> On Fri, Mar 09, 2007 at 03:33:01PM -0800, Mark Fasheh wrote:
> > ->kernel_write() as opposed to genericizing ->perform_write() would be fine
> > with me. Just so long as we get rid of ->prepare_write and ->commit_write in
> > that other kernel code doesn't call them directly. That interface just
> > doesn't work for Ocfs2.
> 
> It doesn't work for any filesystem that needs slightly fancy locking.
> That and the reason that's an interface that doesn't fit into our
> layering is why I want to get rid of it.  Note that fops->kernel_write
> might in fact use ->perform_write with an actor as Nick suggested.
> I'm not quite sure how it'll look like - I'd rather take care of the
> buffered write path first and then handle this issue once the first
> changes have stabilized.

So I've tried a different approach - the 2-op API rather than an actor.

perform_write stays around as a higher performance API, but it isn't
required if the filesystem implements the 2-op API. I've called them
write_begin/write_end for now.

There are a few upshots to doing this rather than the actor approach.
First of all, this is what callers expect, they want to write into the
page directly rather than making an actor.

More importantly, it allows us to implement generic block versions of
the API which is much more reusable than block_perform_write (which was
basically useless for anything more than ext2).

The API calls for the filesystem to find and lock the page itself, and
pass that up to the caller, as well as handle short-writes properly, so
we can solve this deadlock properly.

The nice thing about this is that write_begin is basically a single
page case of the first half (before the iovec copy) of perform_write,
and write_end is the single page case of the second half (after the
copy). So any filesystem that implements perform_write should be able
to reuse write_begin/write_end components.

Anyway, I'm attaching the top patches of my stack (underneath are the
initial patches to solve prepare_write deadlock -- I'll repost the
complete set once I get some more feedback). Sorry it is a bit under
commented and not stress tested. However it does boot and run with
ext2/3/shmem and other simplefses.

Mark has been providing some helpful advice about the new 2-op interface,
but it is still pretty much up in the air.

Also note that a _lot_ of crud is there to support prepare_write (eg.
pagecache_write_begin/pagecache_write_end basically become single liners
once we get rid of the old API).

Anyway, I think I should throw this out for comments before investing too
much more time, in case everyone hates it ;)

Anyway, comments much appreciated...


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch 2/3] fs: introduce perform_write aop
  2007-03-14 13:30         ` Nick Piggin
@ 2007-03-14 15:17           ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2007-03-14 15:17 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Hellwig, Mark Fasheh, Linux Filesystems, Linux Kernel,
	Andrew Morton

On Wed, Mar 14, 2007 at 02:30:24PM +0100, Nick Piggin wrote:
> So I've tried a different approach - the 2-op API rather than an actor.
> 
> perform_write stays around as a higher performance API, but it isn't
> required if the filesystem implements the 2-op API. I've called them
> write_begin/write_end for now.
> 
> There are a few upshots to doing this rather than the actor approach.
> First of all, this is what callers expect, they want to write into the
> page directly rather than making an actor.
> 
> More importantly, it allows us to implement generic block versions of
> the API which is much more reusable than block_perform_write (which was
> basically useless for anything more than ext2).

Generally thiis look pretty cool.  But even if we go with perform_write
as aop for now (which I think is a bad idea aswell, but moving it out
would better be done after all filesystems are converted) these should
just stay callbacks passed to generic_perform_write.

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2007-03-14 15:17 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-08 13:07 [rfc][patch 0/3] a faster buffered write deadlock fix? Nick Piggin
2007-02-08 13:07 ` [patch 1/3] fs: add an iovec iterator Nick Piggin
2007-02-08 19:49   ` Christoph Hellwig
2007-02-09  1:46     ` Nick Piggin
2007-02-09  2:03       ` Nate Diller
2007-02-09  3:31         ` Nick Piggin
2007-02-09 17:28           ` Zach Brown
2007-03-09 10:40         ` Christoph Hellwig
2007-02-08 23:04   ` Mark Fasheh
2007-02-08 13:07 ` [patch 2/3] fs: introduce perform_write aop Nick Piggin
2007-03-09 10:39   ` Christoph Hellwig
2007-03-09 12:52     ` Nick Piggin
2007-03-09 22:01       ` Anton Altaparmakov
2007-03-09 23:33     ` Mark Fasheh
2007-03-10  9:25       ` Christoph Hellwig
2007-03-12  2:13         ` Mark Fasheh
2007-03-14 13:30         ` Nick Piggin
2007-03-14 15:17           ` Christoph Hellwig
2007-02-08 13:07 ` [patch 3/3] ext2: use " Nick Piggin
2007-02-08 14:47   ` Dmitriy Monakhov
2007-02-09 19:14   ` Andrew Morton
2007-02-09 19:45     ` Andrew Morton
2007-02-10  1:34       ` Nick Piggin
2007-02-10  1:50         ` Andrew Morton
2007-02-09  0:38 ` [rfc][patch 0/3] a faster buffered write deadlock fix? Mark Fasheh
2007-02-09  2:04   ` Nick Piggin
2007-02-09  8:41 ` Andrew Morton
2007-02-09  9:54   ` Nick Piggin
2007-02-09 10:09     ` Andrew Morton
2007-02-09 10:32       ` Nick Piggin
2007-02-09 10:52         ` Andrew Morton
2007-02-09 11:31           ` Nick Piggin
2007-02-09 11:46             ` Andrew Morton
2007-02-09 12:11               ` Nick Piggin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.