Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h +++ linux-2.6/include/linux/fs.h @@ -578,10 +578,12 @@ struct block_device { /* * Radix-tree tags, for tagging dirty and writeback pages within the pagecache - * radix trees + * radix trees. Also, to snapshot all pages required to be fsync'ed in order + * to obey data integrity semantics. */ #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 +#define PAGECACHE_TAG_FSYNC 2 int mapping_tagged(struct address_space *mapping, int tag); Index: linux-2.6/include/linux/radix-tree.h =================================================================== --- linux-2.6.orig/include/linux/radix-tree.h +++ linux-2.6/include/linux/radix-tree.h @@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect /*** radix-tree API starts here ***/ -#define RADIX_TREE_MAX_TAGS 2 +#define RADIX_TREE_MAX_TAGS 3 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { Index: linux-2.6/mm/page-writeback.c =================================================================== --- linux-2.6.orig/mm/page-writeback.c +++ linux-2.6/mm/page-writeback.c @@ -848,22 +848,7 @@ void __init page_writeback_init(void) prop_descriptor_init(&vm_dirties, shift); } -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). 
fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. - */ -int write_cache_pages(struct address_space *mapping, +static int __write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { @@ -947,10 +932,7 @@ done_release: goto continue_unlock; } else { /* hmm, but it has been dirtied again */ - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; + goto continue_unlock; } } @@ -970,11 +952,9 @@ done_release: } goto done; } - if (wbc->sync_mode == WB_SYNC_NONE) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) - goto done_release; - } + wbc->nr_to_write--; + if (wbc->nr_to_write <= 0) + goto done_release; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; goto done_release; @@ -1002,6 +982,183 @@ done: return ret; } + +static int __write_cache_pages_sync(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + int ret = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + + pagevec_init(&pvec, 0); + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + do { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + if (page->index > end) { + unlock_page(page); + break; + } + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_FSYNC); + 
spin_unlock_irq(&mapping->tree_lock); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } while (index <= end); + + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + do { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + if (page->index > end) { + unlock_page(page); + break; + } + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_FSYNC); + spin_unlock_irq(&mapping->tree_lock); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } while (index <= end); + + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + do { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_FSYNC, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ +again: + lock_page(page); + + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data integrity operation + * even if there is now a new, dirty page at the same + * pagecache address. 
+ */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (page->index > end) { + unlock_page(page); + break; + } + + if (PageWriteback(page)) { + /* someone else wrote it for us */ + if (!PageDirty(page)) { + goto continue_unlock; + } else { + /* hmm, but it has been dirtied again */ + wait_on_page_writeback(page); + } + } + + BUG_ON(PageWriteback(page)); + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret)) { + /* Must retry the write, esp. for integrity */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + goto again; + } + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } while (index <= end); + + if (wbc->range_cont) + wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT; + + return ret; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. 
+ */ +int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + if (wbc->sync_mode == WB_SYNC_NONE) + return __write_cache_pages(mapping, wbc, writepage, data); + else + return __write_cache_pages_sync(mapping, wbc, writepage, data); +} EXPORT_SYMBOL(write_cache_pages); /* Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c +++ linux-2.6/mm/filemap.c @@ -305,6 +305,53 @@ int wait_on_page_writeback_range(struct return ret; } +int wait_on_page_writeback_range_fsync(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_FSYNC, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(&mapping->page_tree, page->index, PAGECACHE_TAG_FSYNC); + spin_unlock_irq(&mapping->tree_lock); + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} + /** * sync_page_range - write and wait on all pages in the passed range * @inode: target inode @@ -388,6 +435,17 @@ int filemap_fdatawait(struct address_spa } EXPORT_SYMBOL(filemap_fdatawait); +int filemap_fdatawait_fsync(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return 0; + + 
return wait_on_page_writeback_range_fsync(mapping, 0, + (i_size - 1) >> PAGE_CACHE_SHIFT); +} + int filemap_write_and_wait(struct address_space *mapping) { int err = 0; @@ -401,7 +459,7 @@ int filemap_write_and_wait(struct addres * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) { - int err2 = filemap_fdatawait(mapping); + int err2 = filemap_fdatawait_fsync(mapping); if (!err) err = err2; }