Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h +++ linux-2.6/include/linux/fs.h @@ -578,10 +578,12 @@ struct block_device { /* * Radix-tree tags, for tagging dirty and writeback pages within the pagecache - * radix trees + * radix trees. Also, to snapshot all pages required to be fsync'ed in order + * to obey data integrity semantics. */ #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 +#define PAGECACHE_TAG_FSYNC 2 int mapping_tagged(struct address_space *mapping, int tag); Index: linux-2.6/include/linux/radix-tree.h =================================================================== --- linux-2.6.orig/include/linux/radix-tree.h +++ linux-2.6/include/linux/radix-tree.h @@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect /*** radix-tree API starts here ***/ -#define RADIX_TREE_MAX_TAGS 2 +#define RADIX_TREE_MAX_TAGS 3 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { Index: linux-2.6/mm/page-writeback.c =================================================================== --- linux-2.6.orig/mm/page-writeback.c +++ linux-2.6/mm/page-writeback.c @@ -848,22 +848,7 @@ void __init page_writeback_init(void) prop_descriptor_init(&vm_dirties, shift); } -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). 
fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. - */ -int write_cache_pages(struct address_space *mapping, +static int __write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { @@ -947,10 +932,7 @@ done_release: goto continue_unlock; } else { /* hmm, but it has been dirtied again */ - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; + goto continue_unlock; } } @@ -970,11 +952,9 @@ done_release: } goto done; } - if (wbc->sync_mode == WB_SYNC_NONE) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) - goto done_release; - } + wbc->nr_to_write--; + if (wbc->nr_to_write <= 0) + goto done_release; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; goto done_release; @@ -1002,6 +982,183 @@ done: return ret; } + +static int __write_cache_pages_sync(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + int ret = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + + pagevec_init(&pvec, 0); + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + do { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + if (page->index > end) { + unlock_page(page); + break; + } + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_FSYNC); + 
spin_unlock_irq(&mapping->tree_lock); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } while (index <= end); + + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + do { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + if (page->index > end) { + unlock_page(page); + break; + } + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_FSYNC); + spin_unlock_irq(&mapping->tree_lock); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } while (index <= end); + + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + do { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_FSYNC, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ +again: + lock_page(page); + + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data integrity operation + * even if there is now a new, dirty page at the same + * pagecache address. 
+ */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (page->index > end) { + unlock_page(page); + break; + } + + if (PageWriteback(page)) { + /* someone else wrote it for us */ + if (!PageDirty(page)) { + goto continue_unlock; + } else { + /* hmm, but it has been dirtied again */ + wait_on_page_writeback(page); + } + } + + BUG_ON(PageWriteback(page)); + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret)) { + /* Must retry the write, esp. for integrity */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + goto again; + } + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } while (index <= end); + + if (wbc->range_cont) + wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT; + + return ret; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. 
+ */ +int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + if (wbc->sync_mode == WB_SYNC_NONE) + return __write_cache_pages(mapping, wbc, writepage, data); + else + return __write_cache_pages_sync(mapping, wbc, writepage, data); +} EXPORT_SYMBOL(write_cache_pages); /* Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c +++ linux-2.6/mm/filemap.c @@ -305,6 +305,53 @@ int wait_on_page_writeback_range(struct return ret; } +int wait_on_page_writeback_range_fsync(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_FSYNC, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(&mapping->page_tree, page->index, PAGECACHE_TAG_FSYNC); + spin_unlock_irq(&mapping->tree_lock); + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} + /** * sync_page_range - write and wait on all pages in the passed range * @inode: target inode @@ -388,6 +435,17 @@ int filemap_fdatawait(struct address_spa } EXPORT_SYMBOL(filemap_fdatawait); +int filemap_fdatawait_fsync(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return 0; + + 
return wait_on_page_writeback_range_fsync(mapping, 0, + (i_size - 1) >> PAGE_CACHE_SHIFT); +} + int filemap_write_and_wait(struct address_space *mapping) { int err = 0; @@ -401,7 +459,7 @@ int filemap_write_and_wait(struct addres * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) { - int err2 = filemap_fdatawait(mapping); + int err2 = filemap_fdatawait_fsync(mapping); if (!err) err = err2; }