* [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c"
@ 2012-07-13 13:19 Lukas Czerner
  2012-07-13 13:19 ` [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure" Lukas Czerner
                   ` (10 more replies)
  0 siblings, 11 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

This reverts commit ccb4d7af914e0fe9b2f1022f8ea6c300463fd5e6.

This commit reintroduces the functions ext4_block_truncate_page() and
ext4_block_zero_page_range() which were previously removed in favour
of ext4_discard_partial_page_buffers().

In the following commits we want to start using those functions again and
remove ext4_discard_partial_page_buffers(), since it duplicates some code
and also partially duplicates the work of truncate_pagecache_range();
moreover, the old implementation was much clearer.
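
For illustration, this is how a truncate path is expected to use the
reintroduced helper once the later patches in this series are applied (a
sketch based on patch 02/12 below; 'mapping' is the inode's address space
and error handling is omitted):

	if (inode->i_size & (sb->s_blocksize - 1))
		ext4_block_truncate_page(handle, mapping, inode->i_size);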

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/ext4.h  |    4 ++
 fs/ext4/inode.c |  120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cfc4e01..439af1e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1995,6 +1995,10 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_discard_partial_page_buffers(handle_t *handle,
 		struct address_space *mapping, loff_t from,
 		loff_t length, int flags);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 02bc8cb..588f9fa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3381,6 +3381,126 @@ next:
 	return err;
 }
 
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
+	struct inode *inode = mapping->host;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zero'd must
+ * be contained within one block.  If the specified range exceeds
+ * the end of the block it will be shortened to the end of the block
+ * that corresponds to 'from'.
+ */
+int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
+{
+	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize, max, pos;
+	ext4_lblk_t iblock;
+	struct inode *inode = mapping->host;
+	struct buffer_head *bh;
+	struct page *page;
+	int err = 0;
+
+	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+				   mapping_gfp_mask(mapping) & ~__GFP_FS);
+	if (!page)
+		return -ENOMEM;
+
+	blocksize = inode->i_sb->s_blocksize;
+	max = blocksize - (offset & (blocksize - 1));
+
+	/*
+	 * correct length if it does not fall between
+	 * 'from' and the end of the block
+	 */
+	if (length > max || length < 0)
+		length = max;
+
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	err = 0;
+	if (buffer_freed(bh)) {
+		BUFFER_TRACE(bh, "freed: skip");
+		goto unlock;
+	}
+
+	if (!buffer_mapped(bh)) {
+		BUFFER_TRACE(bh, "unmapped");
+		ext4_get_block(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
+		if (!buffer_mapped(bh)) {
+			BUFFER_TRACE(bh, "still unmapped");
+			goto unlock;
+		}
+	}
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+	}
+
+	if (ext4_should_journal_data(inode)) {
+		BUFFER_TRACE(bh, "get write access");
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			goto unlock;
+	}
+
+	zero_user(page, offset, length);
+
+	BUFFER_TRACE(bh, "zeroed end of block");
+
+	err = 0;
+	if (ext4_should_journal_data(inode)) {
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+	} else
+		mark_buffer_dirty(bh);
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
 int ext4_can_truncate(struct inode *inode)
 {
 	if (S_ISREG(inode->i_mode))
-- 
1.7.7.6



* [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-13 17:42   ` Theodore Ts'o
  2012-07-18 19:34   ` Eric Sandeen
  2012-07-13 13:19 ` [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range Lukas Czerner
                   ` (9 subsequent siblings)
  10 siblings, 2 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

This reverts commit 189e868fa8fdca702eb9db9d8afc46b5cb9144c9.

This commit reintroduces the use of ext4_block_truncate_page() in the ext4
truncate operation instead of ext4_discard_partial_page_buffers().

The statement in the description of the reverted commit that the truncate
operation only zeroes the block-unaligned portion of the last page is not
exactly right, since truncate_pagecache_range() also zeroes and invalidates
the unaligned portion of the page. There is thus no need to zero and unmap
it once more; all we still need is to update the buffer head containing the
last block, which is exactly what ext4_block_truncate_page() does.

This was tested on a ppc64 machine with a block size of 1024 bytes without
any problems.
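
As a concrete illustration (assuming 4k pages and a 1k block size; the
numbers below are only an example and are not part of the patch), for an
i_size of 3000 bytes:

	truncate_pagecache_range()   zeroes page cache bytes 3000..4095
	ext4_block_truncate_page()   zeroes bytes 3000..3071 (the tail of
	                             the last 1k block) and dirties the
	                             buffer head so the zeroes reach disk

so zeroing the rest of the page once more inside ext4 buys us nothing.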

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/extents.c  |   13 ++-----------
 fs/ext4/indirect.c |   13 ++-----------
 2 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 91341ec..ceab5f5 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4222,7 +4222,6 @@ void ext4_ext_truncate(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t last_block;
 	handle_t *handle;
-	loff_t page_len;
 	int err = 0;
 
 	/*
@@ -4239,16 +4238,8 @@ void ext4_ext_truncate(struct inode *inode)
 	if (IS_ERR(handle))
 		return;
 
-	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, inode->i_size, page_len, 0);
-
-		if (err)
-			goto out_stop;
-	}
+	if (inode->i_size & (sb->s_blocksize - 1))
+		ext4_block_truncate_page(handle, mapping, inode->i_size);
 
 	if (ext4_orphan_add(handle, inode))
 		goto out_stop;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..a082b30 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1349,9 +1349,7 @@ void ext4_ind_truncate(struct inode *inode)
 	__le32 nr = 0;
 	int n = 0;
 	ext4_lblk_t last_block, max_block;
-	loff_t page_len;
 	unsigned blocksize = inode->i_sb->s_blocksize;
-	int err;
 
 	handle = start_transaction(inode);
 	if (IS_ERR(handle))
@@ -1362,16 +1360,9 @@ void ext4_ind_truncate(struct inode *inode)
 	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
-	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, inode->i_size, page_len, 0);


* [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
  2012-07-13 13:19 ` [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure" Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-18 19:54   ` Eric Sandeen
  2012-07-13 13:19 ` [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range Lukas Czerner
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner, Hugh Dickins

Currently we're passing -1 to shmem_truncate_range(), which can then call
truncate_inode_pages_range(). This is really confusing, since the argument
is signed, so we do not get a "huge" number as one would expect, but rather
just -1. To make things clearer and easier for truncate_inode_pages_range(),
just pass LLONG_MAX, since that is what was intended anyway.

It also makes things easier for allowing truncate_inode_pages_range() to
handle non page aligned regions. Moreover, letting the lend argument be
negative might actually hide some bugs.
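
A minimal standalone sketch of the signedness issue (illustration only,
not kernel code; 'mapping' and 'lstart' stand for whatever the caller
already has at hand):

	loff_t lend = -1;	/* loff_t is signed, so this really is -1 */

	/*
	 * Any comparison or arithmetic sees -1, not a huge offset; e.g.
	 * "lend >= lstart" is false for every positive lstart.  Passing
	 * the intent explicitly avoids the confusion:
	 */
	truncate_inode_pages_range(mapping, lstart, LLONG_MAX);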

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/shmem.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 4ce02e0..3199733 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2961,7 +2961,8 @@ void shmem_unlock_mapping(struct address_space *mapping)
 
 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
-	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
+	truncate_inode_pages_range(inode->i_mapping, lstart,
+				   lend == -1 ? LLONG_MAX : lend);
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-- 
1.7.7.6



* [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
  2012-07-13 13:19 ` [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure" Lukas Czerner
  2012-07-13 13:19 ` [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-15 23:11   ` Dave Chinner
  2012-07-13 13:19 ` [PATCH 05/12 v2] mm: " Lukas Czerner
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner, Dave Chinner

Currently we're passing -1 to truncate_inode_pages_range(), which is
really confusing, since the argument is signed, so we do not get a "huge"
number as one would expect, but rather just -1. To make things clearer and
easier for truncate_inode_pages_range(), just pass LLONG_MAX, since that is
what was intended anyway.

It also makes things easier for allowing truncate_inode_pages_range() to
handle non page aligned regions. Moreover, letting the lend argument be
negative might actually hide some bugs.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_fs_subr.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index 652b875..6e9b052 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -34,7 +34,8 @@ xfs_tosspages(
 {
 	/* can't toss partial tail pages, so mask them out */
 	last &= ~(PAGE_SIZE - 1);
-	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
+	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first,
+				   last == -1 ? LLONG_MAX : last);
 }
 
 int
@@ -53,7 +54,8 @@ xfs_flushinval_pages(
 	ret = filemap_write_and_wait_range(mapping, first,
 				last == -1 ? LLONG_MAX : last);
 	if (!ret)
-		truncate_inode_pages_range(mapping, first, last);
+		truncate_inode_pages_range(mapping, first,
+					   last == -1 ? LLONG_MAX : last);
 	return -ret;
 }
 
-- 
1.7.7.6



* [PATCH 05/12 v2] mm: pass LLONG_MAX to truncate_inode_pages_range
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (2 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-13 13:19 ` [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges Lukas Czerner
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner, Hugh Dickins

Currently we're passing -1 to truncate_inode_pages_range(), which is
really confusing, since the argument is signed, so we do not get a "huge"
number as one would expect, but rather just -1. To make things clearer and
easier for truncate_inode_pages_range(), just pass LLONG_MAX, since that is
what was intended anyway.

It also makes things easier for allowing truncate_inode_pages_range() to
handle non page aligned regions. Moreover, letting the lend argument be
negative might actually hide some bugs.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/truncate.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index 75801ac..77a693e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -310,7 +310,7 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
  */
 void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 {
-	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+	truncate_inode_pages_range(mapping, lstart, (loff_t)LLONG_MAX);
 }
 EXPORT_SYMBOL(truncate_inode_pages);
 
-- 
1.7.7.6



* [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (3 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 05/12 v2] mm: " Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-17  8:28   ` Hugh Dickins
  2012-07-13 13:19 ` [PATCH 07/12 v2] ext4: use ext4_zero_partial_blocks in punch_hole Lukas Czerner
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner, Hugh Dickins

This commit changes truncate_inode_pages_range() so it can handle non
page aligned regions of the truncate. Currently we can hit a BUG_ON when
the end of the range is not page aligned, even though the function can
already handle an unaligned start of the range.

Being able to handle non page aligned regions of the range can help file
system punch_hole implementations and save some work, because once we're
holding the page we might as well deal with it right away.
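
A worked example of what the reworked function then has to do (assuming
4k pages; the byte offsets are purely illustrative):

	truncate range: bytes 1000..9000 inclusive

	page 0 (bytes    0..4095):  zero bytes 1000..4095 in place
	                            (partial start)
	page 1 (bytes 4096..8191):  truncated completely
	page 2 (bytes 8192..12287): zero bytes 8192..9000 in place
	                            (partial end)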

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/truncate.c |   61 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index 77a693e..92aa4ad 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -49,6 +49,16 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 		(*invalidatepage)(page, offset);
 }
 
+static inline void punch_hole_into_page(struct page *page, unsigned start,
+					unsigned end)
+{
+	BUG_ON(end > PAGE_CACHE_SIZE);
+	zero_user_segment(page, start, end);
+	cleancache_invalidate_page(page->mapping, page);
+	if (page_has_private(page))
+		do_invalidatepage(page, start);
+}
+
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
@@ -206,24 +216,30 @@ int invalidate_inode_page(struct page *page)
 void truncate_inode_pages_range(struct address_space *mapping,
 				loff_t lstart, loff_t lend)
 {
-	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+	const unsigned partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	const unsigned partial_end = lend & (PAGE_CACHE_SIZE - 1);
+	loff_t start = lstart >> PAGE_CACHE_SHIFT;
+	loff_t end = lend >> PAGE_CACHE_SHIFT;
 	struct pagevec pvec;
-	pgoff_t index;
-	pgoff_t end;
+	loff_t index;
 	int i;
 
+	BUG_ON(lend < start || lend < 0);
+
 	cleancache_invalidate_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
-	end = (lend >> PAGE_CACHE_SHIFT);
+	/* Adjust start and end so we cover only full pages */
+	if (partial_start)
+		start++;
+	if (partial_end < PAGE_CACHE_SIZE - 1)
+		end--;
 
 	pagevec_init(&pvec, 0);
 	index = start;
 	while (index <= end && pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+			min(end - index, (loff_t)PAGEVEC_SIZE - 1) + 1)) {
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
@@ -249,21 +265,44 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		index++;
 	}
 
-	if (partial) {
+	/* truncate happened within the page - punch hole */
+	if ((start > end) && (start - end > 1)) {
 		struct page *page = find_lock_page(mapping, start - 1);
 		if (page) {
 			wait_on_page_writeback(page);
-			truncate_partial_page(page, partial);
+			punch_hole_into_page(page, partial_start,
+					     partial_end + 1);
 			unlock_page(page);
 			page_cache_release(page);
 		}
+	} else {
+		/* Partial page truncate at the start of the region */
+		if (partial_start) {
+			struct page *page = find_lock_page(mapping, start - 1);
+			if (page) {
+				wait_on_page_writeback(page);
+				truncate_partial_page(page, partial_start);
+				unlock_page(page);
+				page_cache_release(page);
+			}
+		}
+		/* Partial page truncate at the end of the region */
+		if (partial_end < PAGE_CACHE_SIZE - 1) {
+			struct page *page = find_lock_page(mapping, end + 1);
+			if (page) {
+				wait_on_page_writeback(page);
+				punch_hole_into_page(page, 0, partial_end + 1);
+				unlock_page(page);
+				page_cache_release(page);
+			}
+		}
 	}
 
 	index = start;
-	for ( ; ; ) {
+	while (index <= end) {
 		cond_resched();
 		if (!pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+			min(end - index, (loff_t)PAGEVEC_SIZE - 1) + 1)) {
 			if (index == start)
 				break;
 			index = start;
-- 
1.7.7.6



* [PATCH 07/12 v2] ext4: use ext4_zero_partial_blocks in punch_hole
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (4 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-13 13:19 ` [PATCH 08/12 v2] ext4: remove unused discard_partial_page_buffers Lukas Czerner
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

We're going to get rid of ext4_discard_partial_page_buffers(), since it
duplicates some code and also partially duplicates the work of
truncate_pagecache_range(); moreover, the old implementation was much
clearer.

Now that truncate_inode_pages_range() can handle truncating non page
aligned regions, we can use it to invalidate and zero out the block aligned
part of the punched out range, and then use ext4_block_zero_page_range()
(via the new ext4_zero_partial_blocks() helper) to zero the unaligned
blocks at the start and end of the range. This greatly simplifies the punch
hole code. Moreover, after this commit we can get rid of
ext4_discard_partial_page_buffers() completely.
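
A worked example of how the range is split up after this patch (assuming
a 1k block size; the offsets are purely illustrative):

	punch hole: offset = 500, length = 2000  ->  bytes 500..2499

	truncate_pagecache_range()   bytes 1024..2047 (the block aligned
	                             part, zeroed and dropped from the
	                             page cache)
	ext4_zero_partial_blocks()   bytes 500..1023 and 2048..2499 (the
	                             unaligned partial blocks, zeroed in
	                             place)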

This has been tested on ppc64 with a 1k block size using fsx and xfstests,
without any problems.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/ext4.h    |    2 +
 fs/ext4/extents.c |   81 ++++++----------------------------------------------
 fs/ext4/inode.c   |   28 ++++++++++++++++++
 3 files changed, 40 insertions(+), 71 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 439af1e..704ceab 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1999,6 +1999,8 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_discard_partial_page_buffers(handle_t *handle,
 		struct address_space *mapping, loff_t from,
 		loff_t length, int flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ceab5f5..b3300eb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4742,7 +4742,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
 	handle_t *handle;
-	loff_t first_page, last_page, page_len;
 	loff_t first_page_offset, last_page_offset;
 	int credits, err = 0;
 
@@ -4760,12 +4759,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		   offset;
 	}
 
-	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
-	first_page_offset = first_page << PAGE_CACHE_SHIFT;
-	last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
 	/*
 	 * Write out all dirty pages to avoid race conditions
 	 * Then release them.
@@ -4778,11 +4771,13 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 			return err;
 	}
 
-	/* Now release the pages */
-	if (last_page_offset > first_page_offset) {
+	first_page_offset = round_up(offset, sb->s_blocksize);
+	last_page_offset = round_down((offset + length), sb->s_blocksize) - 1;
+
+	/* Now release the pages and zero block aligned part of pages*/
+	if (last_page_offset > first_page_offset)
 		truncate_pagecache_range(inode, first_page_offset,
-					 last_page_offset - 1);
-	}
+					 last_page_offset);
 
 	/* finish any pending end_io work */
 	ext4_flush_completed_IO(inode);
@@ -4796,66 +4791,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	if (err)
 		goto out;
 
-	/*
-	 * Now we need to zero out the non-page-aligned data in the
-	 * pages at the start and tail of the hole, and unmap the buffer
-	 * heads for the block aligned regions of the page that were
-	 * completely zeroed.
-	 */
-	if (first_page > last_page) {
-		/*
-		 * If the file space being truncated is contained within a page
-		 * just zero out and unmap the middle of that page
-		 */
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, offset, length, 0);
-
-		if (err)
-			goto out;
-	} else {
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the start of the hole
-		 */
-		page_len  = first_page_offset - offset;
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle, mapping,
-						   offset, page_len, 0);
-			if (err)
-				goto out;
-		}
-
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the end of the hole
-		 */
-		page_len = offset + length - last_page_offset;
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle, mapping,
-					last_page_offset, page_len, 0);
-			if (err)
-				goto out;
-		}
-	}
-
-	/*
-	 * If i_size is contained in the last page, we need to
-	 * unmap and zero the partial page after i_size
-	 */
-	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-	   inode->i_size % PAGE_CACHE_SIZE != 0) {
-
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle,
-			  mapping, inode->i_size, page_len, 0);
-
-			if (err)
-				goto out;
-		}
-	}
+	err = ext4_zero_partial_blocks(handle, inode, offset,
+				       offset + length - 1);
+	if (err)
+		goto out;
 
 	first_block = (offset + sb->s_blocksize - 1) >>
 		EXT4_BLOCK_SIZE_BITS(sb);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 588f9fa..28df4f6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3501,6 +3501,34 @@ unlock:
 	return err;
 }
 
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned partial = lstart & (sb->s_blocksize - 1);
+	ext4_fsblk_t start = lstart >> sb->s_blocksize_bits;
+	ext4_fsblk_t end = lend >> sb->s_blocksize_bits;
+	int err = 0;
+
+	if (start == end) {
+		err = ext4_block_zero_page_range(handle, inode->i_mapping,
+						 lstart, lend - lstart + 1);
+		return err;
+	}
+	/* Handle partial zero out on the start of the range */
+	if (partial) {
+		err = ext4_block_zero_page_range(handle, inode->i_mapping,
+						 lstart, sb->s_blocksize);
+		if (err)
+			return err;
+	}
+	partial = lend & (sb->s_blocksize - 1);
+	if (partial != sb->s_blocksize - 1)
+		err = ext4_block_zero_page_range(handle, inode->i_mapping,
+						 lend - partial, partial + 1);
+	return err;
+}
+
 int ext4_can_truncate(struct inode *inode)
 {
 	if (S_ISREG(inode->i_mode))
-- 
1.7.7.6



* [PATCH 08/12 v2] ext4: remove unused discard_partial_page_buffers
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (5 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 07/12 v2] ext4: use ext4_zero_partial_blocks in punch_hole Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-13 13:19 ` [PATCH 09/12 v2] ext4: remove unused code from ext4_remove_blocks() Lukas Czerner
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

ext4_discard_partial_page_buffers() is no longer used anywhere, so we can
simply remove it, including the *_no_lock variant and the
EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED flag.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/ext4.h  |    8 --
 fs/ext4/inode.c |  206 -------------------------------------------------------
 2 files changed, 0 insertions(+), 214 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 704ceab..72d75ff 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -583,11 +583,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -2001,9 +1996,6 @@ extern int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28df4f6..9e3b279 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -138,9 +138,6 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -3178,209 +3175,6 @@ void ext4_set_aops(struct inode *inode)
 	}
 }
 
-
-/*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
- */
-int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags)
-{
-	struct inode *inode = mapping->host;
-	struct page *page;
-	int err = 0;
-
-	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-				   mapping_gfp_mask(mapping) & ~__GFP_FS);
-	if (!page)
-		return -ENOMEM;
-
-	err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
-		from, length, flags);
-
-	unlock_page(page);
-	page_cache_release(page);
-	return err;
-}
-
-/*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped.  Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been  locked.  The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'.  This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode:  The files inode
- * page:   A locked page that contains the offset "from"
- * from:   The starting byte offset (from the begining of the file)
- *         to begin discarding
- * len:    The length of bytes to discard
- * flags:  Optional flags that may be used:
- *
- *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- *         Only zero the regions of the page whose buffer heads
- *         have already been unmapped.  This flag is appropriate
- *         for updateing the contents of a page whose blocks may
- *         have already been released, and we only want to zero
- *         out the regions that correspond to those released blocks.
- *
- * Returns zero on sucess or negative on failure.
- */
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags)
-{
-	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned int offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned int blocksize, max, pos;
-	ext4_lblk_t iblock;
-	struct buffer_head *bh;
-	int err = 0;
-
-	blocksize = inode->i_sb->s_blocksize;
-	max = PAGE_CACHE_SIZE - offset;
-
-	if (index != page->index)
-		return -EINVAL;
-
-	/*
-	 * correct length if it does not fall between
-	 * 'from' and the end of the page
-	 */
-	if (length > max || length < 0)
-		length = max;
-
-	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-
-	/* Find the buffer that contains "offset" */
-	bh = page_buffers(page);
-	pos = blocksize;
-	while (offset >= pos) {
-		bh = bh->b_this_page;
-		iblock++;
-		pos += blocksize;
-	}
-
-	pos = offset;
-	while (pos < offset + length) {
-		unsigned int end_of_block, range_to_discard;
-
-		err = 0;
-
-		/* The length of space left to zero and unmap */
-		range_to_discard = offset + length - pos;
-
-		/* The length of space until the end of the block */
-		end_of_block = blocksize - (pos & (blocksize-1));
-
-		/*
-		 * Do not unmap or zero past end of block
-		 * for this buffer head
-		 */
-		if (range_to_discard > end_of_block)
-			range_to_discard = end_of_block;
-
-
-		/*
-		 * Skip this buffer head if we are only zeroing unampped
-		 * regions of the page
-		 */
-		if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
-			buffer_mapped(bh))
-				goto next;
-
-		/* If the range is block aligned, unmap */
-		if (range_to_discard == blocksize) {
-			clear_buffer_dirty(bh);
-			bh->b_bdev = NULL;
-			clear_buffer_mapped(bh);
-			clear_buffer_req(bh);
-			clear_buffer_new(bh);
-			clear_buffer_delay(bh);
-			clear_buffer_unwritten(bh);
-			clear_buffer_uptodate(bh);
-			zero_user(page, pos, range_to_discard);
-			BUFFER_TRACE(bh, "Buffer discarded");
-			goto next;
-		}
-
-		/*
-		 * If this block is not completely contained in the range
-		 * to be discarded, then it is not going to be released. Because
-		 * we need to keep this block, we need to make sure this part
-		 * of the page is uptodate before we modify it by writeing
-		 * partial zeros on it.
-		 */
-		if (!buffer_mapped(bh)) {
-			/*
-			 * Buffer head must be mapped before we can read
-			 * from the block
-			 */
-			BUFFER_TRACE(bh, "unmapped");
-			ext4_get_block(inode, iblock, bh, 0);
-			/* unmapped? It's a hole - nothing to do */
-			if (!buffer_mapped(bh)) {
-				BUFFER_TRACE(bh, "still unmapped");
-				goto next;
-			}
-		}
-
-		/* Ok, it's mapped. Make sure it's up-to-date */
-		if (PageUptodate(page))
-			set_buffer_uptodate(bh);
-
-		if (!buffer_uptodate(bh)) {
-			err = -EIO;
-			ll_rw_block(READ, 1, &bh);
-			wait_on_buffer(bh);
-			/* Uhhuh. Read error. Complain and punt.*/
-			if (!buffer_uptodate(bh))
-				goto next;
-		}
-
-		if (ext4_should_journal_data(inode)) {
-			BUFFER_TRACE(bh, "get write access");
-			err = ext4_journal_get_write_access(handle, bh);
-			if (err)
-				goto next;
-		}
-
-		zero_user(page, pos, range_to_discard);
-
-		err = 0;
-		if (ext4_should_journal_data(inode)) {
-			err = ext4_handle_dirty_metadata(handle, inode, bh);
-		} else
-			mark_buffer_dirty(bh);
-
-		BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
-		bh = bh->b_this_page;
-		iblock++;
-		pos += range_to_discard;
-	}
-
-	return err;
-}


* [PATCH 09/12 v2] ext4: remove unused code from ext4_remove_blocks()
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (6 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 08/12 v2] ext4: remove unused discard_partial_page_buffers Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-13 13:19 ` [PATCH 10/12 v2] ext4: update ext4_ext_remove_space trace point Lukas Czerner
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

The "head removal" branch in the condition is never used in any code
path in ext4 since the function only caller ext4_ext_rm_leaf() will make
sure that the extent is properly split before removing blocks. Note that
there is a bug in this branch anyway.

This commit removes the unused code completely and makes use of
ext4_error() instead of printk if dubious range is provided.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/extents.c |   21 ++++-----------------
 1 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b3300eb..455257f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2339,23 +2339,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			*partial_cluster = EXT4_B2C(sbi, pblk);
 		else
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);


* [PATCH 10/12 v2] ext4: update ext4_ext_remove_space trace point
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (7 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 09/12 v2] ext4: remove unused code from ext4_remove_blocks() Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-13 13:19 ` [PATCH 11/12 v2] ext4: make punch hole code path work with bigalloc Lukas Czerner
  2012-07-13 13:19 ` [PATCH 12/12 v2] ext4: Allow punch hole with bigalloc enabled Lukas Czerner
  10 siblings, 0 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

Add "end" variable.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/extents.c           |    6 +++---
 include/trace/events/ext4.h |   21 ++++++++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 455257f..3706738 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2572,7 +2572,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 again:
 	ext4_ext_invalidate_cache(inode);
 
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2725,8 +2725,8 @@ cont:
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 69d8a69..f7faeba 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -1996,14 +1996,16 @@ TRACE_EVENT(ext4_ext_rm_idx,
 );
 
 TRACE_EVENT(ext4_ext_remove_space,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start,
+		 ext4_lblk_t end, int depth),
 
-	TP_ARGS(inode, start, depth),
+	TP_ARGS(inode, start, end, depth),
 
 	TP_STRUCT__entry(
 		__field(	ino_t,		ino	)
 		__field(	dev_t,		dev	)
 		__field(	ext4_lblk_t,	start	)
+		__field(	ext4_lblk_t,	end	)
 		__field(	int,		depth	)
 	),
 
@@ -2011,26 +2013,29 @@ TRACE_EVENT(ext4_ext_remove_space,
 		__entry->ino	= inode->i_ino;
 		__entry->dev	= inode->i_sb->s_dev;
 		__entry->start	= start;
+		__entry->end	= end;
 		__entry->depth	= depth;
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d",
+	TP_printk("dev %d,%d ino %lu start %u end %u depth %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth)
 );
 
 TRACE_EVENT(ext4_ext_remove_space_done,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
-		ext4_lblk_t partial, unsigned short eh_entries),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
+		 int depth, ext4_lblk_t partial, unsigned short eh_entries),
 
-	TP_ARGS(inode, start, depth, partial, eh_entries),
+	TP_ARGS(inode, start, end, depth, partial, eh_entries),
 
 	TP_STRUCT__entry(
 		__field(	ino_t,		ino		)
 		__field(	dev_t,		dev		)
 		__field(	ext4_lblk_t,	start		)
+		__field(	ext4_lblk_t,	end		)
 		__field(	int,		depth		)
 		__field(	ext4_lblk_t,	partial		)
 		__field(	unsigned short,	eh_entries	)
@@ -2040,16 +2045,18 @@ TRACE_EVENT(ext4_ext_remove_space_done,
 		__entry->ino		= inode->i_ino;
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->start		= start;
+		__entry->end		= end;
 		__entry->depth		= depth;
 		__entry->partial	= partial;
 		__entry->eh_entries	= eh_entries;
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
+	TP_printk("dev %d,%d ino %lu start %u end %u depth %d partial %u "
 		  "remaining_entries %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth,
 		  (unsigned) __entry->partial,
 		  (unsigned short) __entry->eh_entries)
-- 
1.7.7.6



* [PATCH 11/12 v2] ext4: make punch hole code path work with bigalloc
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (8 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 10/12 v2] ext4: update ext4_ext_remove_space trace point Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  2012-07-13 13:19 ` [PATCH 12/12 v2] ext4: Allow punch hole with bigalloc enabled Lukas Czerner
  10 siblings, 0 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

Currently punch hole is disabled on file systems with the bigalloc
feature enabled. However, the recent changes in the punch hole patches
should make it easier to support punching holes on bigalloc enabled file
systems.

This commit changes the partial_cluster handling in ext4_remove_blocks(),
ext4_ext_rm_leaf() and ext4_ext_remove_space(). Currently partial_cluster
is of unsigned long long type and it makes sure that we will free the
partial cluster if all extents have been released from that cluster.
However, it has been specifically designed only for truncate.

With punch hole we can be freeing just some extents in the cluster,
leaving the rest untouched. So we have to make sure that we notice a
cluster which still has some extents. To do this I've changed
partial_cluster to be of signed long long type. The only scenario where
this could be a problem is when cluster_size == block_size, however in
that case there would not be any partial clusters, so we're safe. For
bigger clusters the signed type is enough. Now we use a negative value
in partial_cluster to mark such a cluster used, hence we know that we
must not free it even if all other extents have been freed from that
cluster.

This scenario can be described with a simple diagram:

|FFF...FF..FF.UUU|
 ^----------^
  punch hole

. - free space
| - cluster boundary
F - freed extent
U - used extent

Also update respective tracepoints to use signed long long type for
partial_cluster.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/extents.c           |   69 +++++++++++++++++++++++++++++++-----------
 include/trace/events/ext4.h |   25 ++++++++-------
 2 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3706738..82cb64c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2269,7 +2269,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      signed long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2295,7 +2295,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2321,23 +2322,41 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 	} else
 		ext4_error(sbi->s_sb, "strange request: removal(2) "
@@ -2355,12 +2374,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   has been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:   The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 signed long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2373,6 +2396,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2411,6 +2435,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2486,7 +2520,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2504,11 +2538,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
 		int flags = EXT4_FREE_BLOCKS_FORGET;
@@ -2558,7 +2591,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path;
-	ext4_fsblk_t partial_cluster = 0;
+	signed long long partial_cluster = 0;
 	handle_t *handle;
 	int i, err;
 
@@ -2731,7 +2764,7 @@ cont:
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
 		int flags = EXT4_FREE_BLOCKS_FORGET;
 
 		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f7faeba..2bc3d20 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -1897,7 +1897,7 @@ TRACE_EVENT(ext4_ext_show_extent,
 TRACE_EVENT(ext4_remove_blocks,
 	    TP_PROTO(struct inode *inode, struct ext4_extent *ex,
 		ext4_lblk_t from, ext4_fsblk_t to,
-		ext4_fsblk_t partial_cluster),
+		long long int partial_cluster),
 
 	TP_ARGS(inode, ex, from, to, partial_cluster),
 
@@ -1909,7 +1909,7 @@ TRACE_EVENT(ext4_remove_blocks,
 		__field(	unsigned short,	ee_len	)
 		__field(	ext4_lblk_t,	from	)
 		__field(	ext4_lblk_t,	to	)
-		__field(	ext4_fsblk_t,	partial	)
+		__field(	long long int,	partial	)
 	),
 
 	TP_fast_assign(
@@ -1924,7 +1924,7 @@ TRACE_EVENT(ext4_remove_blocks,
 	),
 
 	TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
-		  "from %u to %u partial_cluster %u",
+		  "from %u to %u partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->ee_lblk,
@@ -1932,12 +1932,13 @@ TRACE_EVENT(ext4_remove_blocks,
 		  (unsigned short) __entry->ee_len,
 		  (unsigned) __entry->from,
 		  (unsigned) __entry->to,
-		  (unsigned) __entry->partial)
+		  (long long int) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_leaf,
 	TP_PROTO(struct inode *inode, ext4_lblk_t start,
-		 struct ext4_extent *ex, ext4_fsblk_t partial_cluster),
+		 struct ext4_extent *ex,
+		 long long int partial_cluster),
 
 	TP_ARGS(inode, start, ex, partial_cluster),
 
@@ -1948,7 +1949,7 @@ TRACE_EVENT(ext4_ext_rm_leaf,
 		__field(	ext4_lblk_t,	ee_lblk	)
 		__field(	ext4_fsblk_t,	ee_pblk	)
 		__field(	short,		ee_len	)
-		__field(	ext4_fsblk_t,	partial	)
+		__field(	long long int,	partial	)
 	),
 
 	TP_fast_assign(
@@ -1962,14 +1963,14 @@ TRACE_EVENT(ext4_ext_rm_leaf,
 	),
 
 	TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
-		  "partial_cluster %u",
+		  "partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
 		  (unsigned) __entry->ee_lblk,
 		  (unsigned long long) __entry->ee_pblk,
 		  (unsigned short) __entry->ee_len,
-		  (unsigned) __entry->partial)
+		  (long long int) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_idx,
@@ -2027,7 +2028,7 @@ TRACE_EVENT(ext4_ext_remove_space,
 
 TRACE_EVENT(ext4_ext_remove_space_done,
 	TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
-		 int depth, ext4_lblk_t partial, unsigned short eh_entries),
+		 int depth, long long int partial, unsigned short eh_entries),
 
 	TP_ARGS(inode, start, end, depth, partial, eh_entries),
 
@@ -2037,7 +2038,7 @@ TRACE_EVENT(ext4_ext_remove_space_done,
 		__field(	ext4_lblk_t,	start		)
 		__field(	ext4_lblk_t,	end		)
 		__field(	int,		depth		)
-		__field(	ext4_lblk_t,	partial		)
+		__field(	long long int,	partial		)
 		__field(	unsigned short,	eh_entries	)
 	),
 
@@ -2051,14 +2052,14 @@ TRACE_EVENT(ext4_ext_remove_space_done,
 		__entry->eh_entries	= eh_entries;
 	),
 
-	TP_printk("dev %d,%d ino %lu start %u end %u depth %d partial %u "
+	TP_printk("dev %d,%d ino %lu start %u end %u depth %d partial %lld "
 		  "remaining_entries %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
 		  (unsigned) __entry->end,
 		  __entry->depth,
-		  (unsigned) __entry->partial,
+		  (long long int) __entry->partial,
 		  (unsigned short) __entry->eh_entries)
 );
 
-- 
1.7.7.6



* [PATCH 12/12 v2] ext4: Allow punch hole with bigalloc enabled
  2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
                   ` (9 preceding siblings ...)
  2012-07-13 13:19 ` [PATCH 11/12 v2] ext4: make punch hole code path work with bigalloc Lukas Czerner
@ 2012-07-13 13:19 ` Lukas Czerner
  10 siblings, 0 replies; 31+ messages in thread
From: Lukas Czerner @ 2012-07-13 13:19 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-fsdevel, tytso, achender, Lukas Czerner

In commits 5f95d21fb6f2aaa52830e5b7fb405f6c71d3ab85 and
30bc2ec9598a1b156ad75217f2e7d4560efdeeab we've reworked the punch hole
implementation, and there is nothing holding us back from using punch hole
on file systems with the bigalloc feature enabled.

This has been tested with fsx and xfstests.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/inode.c |    5 -----
 1 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9e3b279..eb3565d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3356,11 +3356,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 		return -EOPNOTSUPP;
 	}
 
-	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
-		/* TODO: Add support for bigalloc file systems */
-		return -EOPNOTSUPP;
-	}
-
 	return ext4_ext_punch_hole(file, offset, length);
 }
 
-- 
1.7.7.6



* Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-13 13:19 ` [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure" Lukas Czerner
@ 2012-07-13 17:42   ` Theodore Ts'o
  2012-07-14  7:45     ` Christoph Hellwig
  2012-07-16  7:35     ` Lukáš Czerner
  2012-07-18 19:34   ` Eric Sandeen
  1 sibling, 2 replies; 31+ messages in thread
From: Theodore Ts'o @ 2012-07-13 17:42 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: linux-ext4, linux-fsdevel, achender

Hi Lukas,

This patch set has changes to the VFS, XFS, and ext4, and there are
cross dependencies between them.  Is there any way you can disentangle
the dependencies between the patches so we don't need to worry about
how these get pulled in during the merge window?

I suppose I could try to get the XFS folks to sign off on my carrying
this patch in the ext4 tree, since the bulk of the changes are
ext4-related, but it is a bit of a complication.

	      	     	      	 - Ted


* Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-13 17:42   ` Theodore Ts'o
@ 2012-07-14  7:45     ` Christoph Hellwig
  2012-07-16  7:35     ` Lukáš Czerner
  1 sibling, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2012-07-14  7:45 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Lukas Czerner, linux-ext4, linux-fsdevel, achender

On Fri, Jul 13, 2012 at 01:42:09PM -0400, Theodore Ts'o wrote:
> Hi Lukas,
> 
> This patch set has changes to the VFS, XFS, and ext4, and there are
> cross dependencies between them.  Is there any way you can disentagle
> the dependencies between the patches so we don't need to worry about
> how these get pulled in during the merge window?
> 
> I suppose could try to get the XFS folks to sign off with my carrying
> this patch in the ext4 tree, since the bulk of the changes are
> ext4-related, but it is a bit of a complication.

There are no real XFS changes, just changing the calling conventions for
a generic function.  Adding this through another tree if properly
reviewed seems perfectly fine.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range
  2012-07-13 13:19 ` [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range Lukas Czerner
@ 2012-07-15 23:11   ` Dave Chinner
  2012-07-16  7:13     ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Dave Chinner @ 2012-07-15 23:11 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: linux-ext4, linux-fsdevel, tytso, achender

On Fri, Jul 13, 2012 at 03:19:07PM +0200, Lukas Czerner wrote:
> Currently we're passing -1 to truncate_inode_pages_range() which is
> actually really confusing since the argument is signed so we do not get
> "huge" number as one would expect, but rather just -1. To make things
> clearer and easier for truncate_inode_pages_range() just pass LLONG_MAX
> since it is actually what was intended anyway.
> 
> It also makes thing easier for allowing truncate_inode_pages_range() to
> handle non page aligned regions. Moreover letting the lend argument to
> be negative might actually hide some bugs.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_fs_subr.c |    6 ++++--
>  1 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
> index 652b875..6e9b052 100644
> --- a/fs/xfs/xfs_fs_subr.c
> +++ b/fs/xfs/xfs_fs_subr.c
> @@ -34,7 +34,8 @@ xfs_tosspages(
>  {
>  	/* can't toss partial tail pages, so mask them out */
>  	last &= ~(PAGE_SIZE - 1);
> -	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
> +	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first,
> +				   last == -1 ? LLONG_MAX : last);

The last parameter changed from (last - 1) to last, so if we pass in
last = 16384, we now truncate to 16384 (first byte of page index 5)
instead of 16383 (last byte of page index 4). That's a change of
behaviour and a potential off-by-one error, right?

> @@ -53,7 +54,8 @@ xfs_flushinval_pages(
>  	ret = filemap_write_and_wait_range(mapping, first,
>  				last == -1 ? LLONG_MAX : last);
>  	if (!ret)
> -		truncate_inode_pages_range(mapping, first, last);
> +		truncate_inode_pages_range(mapping, first,
> +					   last == -1 ? LLONG_MAX : last);

Given this is also done immediately above in the function, perhaps
this should be done before anything:

	if (last == -1)
		last = LLONG_MAX;

and the parameter simply passed to the two functions without the
conditional logic?
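
i.e. something like this for the function body (just a sketch, with the
unrelated XFS bits left out):

	if (last == -1)
		last = LLONG_MAX;

	ret = filemap_write_and_wait_range(mapping, first, last);
	if (!ret)
		truncate_inode_pages_range(mapping, first, last);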

Cheers,

Dave.
-- 
Dave Chinner
dchinner@redhat.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range
  2012-07-15 23:11   ` Dave Chinner
@ 2012-07-16  7:13     ` Lukáš Czerner
  2012-07-16 11:52       ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-16  7:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Lukas Czerner, linux-ext4, linux-fsdevel, tytso, achender

On Mon, 16 Jul 2012, Dave Chinner wrote:

> Date: Mon, 16 Jul 2012 09:11:17 +1000
> From: Dave Chinner <dchinner@redhat.com>
> To: Lukas Czerner <lczerner@redhat.com>
> Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org, tytso@mit.edu,
>     achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 04/12 v2] xfs: pass LLONG_MAX to
>     truncate_inode_pages_range
> 
> On Fri, Jul 13, 2012 at 03:19:07PM +0200, Lukas Czerner wrote:
> > Currently we're passing -1 to truncate_inode_pages_range() which is
> > actually really confusing since the argument is signed so we do not get
> > "huge" number as one would expect, but rather just -1. To make things
> > clearer and easier for truncate_inode_pages_range() just pass LLONG_MAX
> > since it is actually what was intended anyway.
> > 
> > It also makes thing easier for allowing truncate_inode_pages_range() to
> > handle non page aligned regions. Moreover letting the lend argument to
> > be negative might actually hide some bugs.
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > Cc: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_fs_subr.c |    6 ++++--
> >  1 files changed, 4 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
> > index 652b875..6e9b052 100644
> > --- a/fs/xfs/xfs_fs_subr.c
> > +++ b/fs/xfs/xfs_fs_subr.c
> > @@ -34,7 +34,8 @@ xfs_tosspages(
> >  {
> >  	/* can't toss partial tail pages, so mask them out */
> >  	last &= ~(PAGE_SIZE - 1);
> > -	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
> > +	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first,
> > +				   last == -1 ? LLONG_MAX : last);
> 
> The last paramter changed from (last -1) to last. so if we pass in
> last = 16384, we now truncate to 16384 (first byte of page index 5)
> instead of 16383 (last byte of page index 4). That's a change of
> behaviour and a potential off-by one error, right?

Right, this could potentially cause off-by-one errors, but as it is
now I do not think it can happen. The only place where it is
used with a proper range is XFS_IOC_ZERO_RANGE and you're going to
convert the whole range to unwritten anyway. But it was unintended
and I'll fix it.


> 
> > @@ -53,7 +54,8 @@ xfs_flushinval_pages(
> >  	ret = filemap_write_and_wait_range(mapping, first,
> >  				last == -1 ? LLONG_MAX : last);
> >  	if (!ret)
> > -		truncate_inode_pages_range(mapping, first, last);
> > +		truncate_inode_pages_range(mapping, first,
> > +					   last == -1 ? LLONG_MAX : last);
> 
> Given this is also done immediately above in the function, perhaps
> this should be done before anything:
> 
> 	if (last == -1)
> 		last = LLONG_MAX;
> 
> and the parameter simply passed to the two functions without the
> conditional logic?

Yes, it makes sense to do this, I'll change it in the next
iteration.

Thanks for the review Dave.
-Lukas

> 
> Cheers,
> 
> Dave.
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-13 17:42   ` Theodore Ts'o
  2012-07-14  7:45     ` Christoph Hellwig
@ 2012-07-16  7:35     ` Lukáš Czerner
  2012-07-16 21:41       ` Hugh Dickins
  1 sibling, 1 reply; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-16  7:35 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Lukas Czerner, linux-ext4, linux-fsdevel, achender, Hugh Dickins

On Fri, 13 Jul 2012, Theodore Ts'o wrote:

> Date: Fri, 13 Jul 2012 13:42:09 -0400
> From: Theodore Ts'o <tytso@mit.edu>
> To: Lukas Czerner <lczerner@redhat.com>
> Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
>     achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
> 
> Hi Lukas,
> 
> This patch set has changes to the VFS, XFS, and ext4, and there are
> cross dependencies between them.  Is there any way you can disentagle
> the dependencies between the patches so we don't need to worry about
> how these get pulled in during the merge window?
> 
> I suppose could try to get the XFS folks to sign off with my carrying
> this patch in the ext4 tree, since the bulk of the changes are
> ext4-related, but it is a bit of a complication.
> 
> 	      	     	      	 - Ted

Hi Ted,

there is no VFS change; MM is probably what you had in mind? So
the only reason why there are xfs, mm and tmpfs changes is that I am
changing truncate_partial_page(), and a small side effect of that is
the changed calling conventions.

We can push patches 3, 4, 5 and 6 through the mm tree. But we'll
have to make sure that they land before the ext4 changes since I am
using the new truncate_partial_page() behaviour in ext4. Will that
be possible?

Hugh ?

-Lukas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range
  2012-07-16  7:13     ` Lukáš Czerner
@ 2012-07-16 11:52       ` Lukáš Czerner
  0 siblings, 0 replies; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-16 11:52 UTC (permalink / raw)
  To: Lukáš Czerner
  Cc: Dave Chinner, linux-ext4, linux-fsdevel, linux-xfs, hch

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3869 bytes --]

On Mon, 16 Jul 2012, Lukáš Czerner wrote:

> Date: Mon, 16 Jul 2012 09:13:44 +0200 (CEST)
> From: Lukáš Czerner <lczerner@redhat.com>
> To: Dave Chinner <dchinner@redhat.com>
> Cc: Lukas Czerner <lczerner@redhat.com>, linux-ext4@vger.kernel.org,
>     linux-fsdevel@vger.kernel.org, tytso@mit.edu, achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 04/12 v2] xfs: pass LLONG_MAX to
>     truncate_inode_pages_range
> 
> On Mon, 16 Jul 2012, Dave Chinner wrote:
> 
> > Date: Mon, 16 Jul 2012 09:11:17 +1000
> > From: Dave Chinner <dchinner@redhat.com>
> > To: Lukas Czerner <lczerner@redhat.com>
> > Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org, tytso@mit.edu,
> >     achender@linux.vnet.ibm.com
> > Subject: Re: [PATCH 04/12 v2] xfs: pass LLONG_MAX to
> >     truncate_inode_pages_range
> > 
> > On Fri, Jul 13, 2012 at 03:19:07PM +0200, Lukas Czerner wrote:
> > > Currently we're passing -1 to truncate_inode_pages_range() which is
> > > actually really confusing since the argument is signed so we do not get
> > > "huge" number as one would expect, but rather just -1. To make things
> > > clearer and easier for truncate_inode_pages_range() just pass LLONG_MAX
> > > since it is actually what was intended anyway.
> > > 
> > > It also makes thing easier for allowing truncate_inode_pages_range() to
> > > handle non page aligned regions. Moreover letting the lend argument to
> > > be negative might actually hide some bugs.
> > > 
> > > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > > Cc: Dave Chinner <dchinner@redhat.com>
> > > ---
> > >  fs/xfs/xfs_fs_subr.c |    6 ++++--
> > >  1 files changed, 4 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
> > > index 652b875..6e9b052 100644
> > > --- a/fs/xfs/xfs_fs_subr.c
> > > +++ b/fs/xfs/xfs_fs_subr.c
> > > @@ -34,7 +34,8 @@ xfs_tosspages(
> > >  {
> > >  	/* can't toss partial tail pages, so mask them out */
> > >  	last &= ~(PAGE_SIZE - 1);
> > > -	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
> > > +	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first,
> > > +				   last == -1 ? LLONG_MAX : last);
> > 
> > The last paramter changed from (last -1) to last. so if we pass in
> > last = 16384, we now truncate to 16384 (first byte of page index 5)
> > instead of 16383 (last byte of page index 4). That's a change of
> > behaviour and a potential off-by one error, right?
> 
> Right, this could potentially cause off-by-one errors, but as it is
> now I do not think this could happen. The only place where it is
> used with a proper range is XFS_IOC_ZERO_RANGE and you're going to
> convert the whole range to unwritten anyway. But it was unintended
> and I\ll fix it.

Hi Dave,

Is there a reason for aligning the last page in xfs_tosspages()
other than that truncate_inode_pages_range() does not handle unaligned
regions? Because with my patch it does now, so it seems to me that
we can easily get rid of xfs_tosspages() and just use
truncate_inode_pages_range() directly in xfs_change_file_space() and
xfs_swap_extents().
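
I.e. roughly this at those call sites (just a sketch; the exact
first/last values used there would need checking):

	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first,
				   last == -1 ? LLONG_MAX : last);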

Thanks!
-Lukas

> 
> 
> > 
> > > @@ -53,7 +54,8 @@ xfs_flushinval_pages(
> > >  	ret = filemap_write_and_wait_range(mapping, first,
> > >  				last == -1 ? LLONG_MAX : last);
> > >  	if (!ret)
> > > -		truncate_inode_pages_range(mapping, first, last);
> > > +		truncate_inode_pages_range(mapping, first,
> > > +					   last == -1 ? LLONG_MAX : last);
> > 
> > Given this is also done immediately above in the function, perhaps
> > this should be done before anything:
> > 
> > 	if (last == -1)
> > 		last = LLONG_MAX;
> > 
> > and the parameter simply passed to the two functions without the
> > conditional logic?
> 
> Yes, it makes sense to do this, I'll change it in the next
> iteration.
> 
> Thanks for the review Dave.
> -Lukas
> 
> > 
> > Cheers,
> > 
> > Dave.
> > 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-16  7:35     ` Lukáš Czerner
@ 2012-07-16 21:41       ` Hugh Dickins
  2012-07-17  7:53         ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Hugh Dickins @ 2012-07-16 21:41 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Andrew Morton, Theodore Ts'o, Dave Chinner, linux-ext4,
	linux-fsdevel, achender

On Mon, 16 Jul 2012, Lukas Czerner wrote:
> On Fri, 13 Jul 2012, Theodore Ts'o wrote:
> > Date: Fri, 13 Jul 2012 13:42:09 -0400
> > From: Theodore Ts'o <tytso@mit.edu>
> > To: Lukas Czerner <lczerner@redhat.com>
> > Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
> >     achender@linux.vnet.ibm.com
> > Subject: Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
> > 
> > Hi Lukas,
> > 
> > This patch set has changes to the VFS, XFS, and ext4, and there are
> > cross dependencies between them.  Is there any way you can disentagle
> > the dependencies between the patches so we don't need to worry about
> > how these get pulled in during the merge window?
> > 
> > I suppose could try to get the XFS folks to sign off with my carrying
> > this patch in the ext4 tree, since the bulk of the changes are
> > ext4-related, but it is a bit of a complication.
> > 
> > 	      	     	      	 - Ted
> 
> Hi Ted,
> 
> there is no VFS change, MM is probably what you've had in mind ? So
> the only reason why there are xfs, mm and tmpfs changes is that I am
> changing truncate_partial_page() and a small side effect is changed
> calling conventions.

truncate_partial_page() is static inline in mm/truncate.c:
was that a typo, or are you changing that too and I missed it?

Sorry, but I find your changes of the infinity convention from -1 to
LLONG_MAX just gratuitous and confusing (and error prone if we ever
have to backport portions to earlier stable releases).

It grieves me enough that unmap_mapping_range() has always had quite
a different convention from truncate_inode_pages_range().  You're now
proposing to give truncate_inode_pages_range() yet another convention
different from invalidate_inode_pages2_range() and
truncate_pagecache_range()?
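
For reference, the conventions being juggled here (as I read the
current tree, so double-check me):

	unmap_mapping_range(mapping, holebegin, holelen, even_cows);
		/* byte start + byte length; holelen == 0 means "to EOF" */
	truncate_inode_pages_range(mapping, lstart, lend);
		/* byte offsets, lend inclusive; lend == -1 means "to EOF" */
	invalidate_inode_pages2_range(mapping, start, end);
		/* page indices, end inclusive */
	truncate_pagecache_range(inode, lstart, lend);
		/* byte offsets, lend inclusive, like truncate_inode_pages_range */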

At cost of having to make xfs and tmpfs (well, actually it's the
tiny !SHMEM ramfs case you're changing there in 03/12) dependent
on synchronizing with your other changes?

I can see that when I converted shmem_truncate_range() (later extended
to shmem_undo_range()) to partial_start and partial_end, I did put in
an ugly couple of lines
	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */
but I think it's better there than "lend == -1 ? LLONG_MAX : lend;"
ugliness spread around in the filesystems.

I don't see any upside: please, could you just drop those changes,
so that then it comes down to synchronizing ext4 with mm/truncate.c?

> 
> We can push patches 3, 4, 5 and 6 through the mm tree. But we'll
> have to make sure that it will land before ext4 changes since I am
> using the new truncate_partial_page() behaviour in ext4. Will that
> be possible ?
> 
> Hugh ?

I'd like patches 3, 4 and 5 to vanish.  As to 6 (perhaps it won't be
6 after that!), I want to work through that one myself (I'll try this
evening): it looked over-complicated to me, and I'd rather go back to
what I worked out for partial_end in shmem_truncate_range().

Then yes, if we're all happy with the result of that, it will be best
for Ted to take even the mm/truncate.c patch into his ext4 branch,
made visible in linux-next before the merge window.

We're running out of time: I'll make every effort to get back to you
on 6 after I've tested late today.

Hugh

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-16 21:41       ` Hugh Dickins
@ 2012-07-17  7:53         ` Lukáš Czerner
  0 siblings, 0 replies; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-17  7:53 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Lukas Czerner, Andrew Morton, Theodore Ts'o, Dave Chinner,
	linux-ext4, linux-fsdevel, achender

On Mon, 16 Jul 2012, Hugh Dickins wrote:

> Date: Mon, 16 Jul 2012 14:41:41 -0700 (PDT)
> From: Hugh Dickins <hughd@google.com>
> To: Lukas Czerner <lczerner@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
>     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
>     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
> 
> On Mon, 16 Jul 2012, Lukas Czerner wrote:
> > On Fri, 13 Jul 2012, Theodore Ts'o wrote:
> > > Date: Fri, 13 Jul 2012 13:42:09 -0400
> > > From: Theodore Ts'o <tytso@mit.edu>
> > > To: Lukas Czerner <lczerner@redhat.com>
> > > Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
> > >     achender@linux.vnet.ibm.com
> > > Subject: Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
> > > 
> > > Hi Lukas,
> > > 
> > > This patch set has changes to the VFS, XFS, and ext4, and there are
> > > cross dependencies between them.  Is there any way you can disentagle
> > > the dependencies between the patches so we don't need to worry about
> > > how these get pulled in during the merge window?
> > > 
> > > I suppose could try to get the XFS folks to sign off with my carrying
> > > this patch in the ext4 tree, since the bulk of the changes are
> > > ext4-related, but it is a bit of a complication.
> > > 
> > > 	      	     	      	 - Ted
> > 
> > Hi Ted,
> > 
> > there is no VFS change, MM is probably what you've had in mind ? So
> > the only reason why there are xfs, mm and tmpfs changes is that I am
> > changing truncate_partial_page() and a small side effect is changed
> > calling conventions.
> 
> truncate_partial_page() is static inline to mm/truncate.c:
> was that a typo, or are you changing that too and I missed it?

Sorry, I meant truncate_inode_pages_range().

> 
> Sorry, but I find your changes of the infinity convention from -1 to
> LLONG_MAX just gratuitous and confusing (and error prone if we ever
> have to backport portions to earlier stable releases).
> 
> It grieves me enough that unmap_mapping_range() has always had quite
> a different convention from truncate_inode_pages_range().  You're now
> proposing to give truncate_inode_pages_range() yet another convention
> different from invalidate_inode_pages2_range() and
> truncate_pagecache_range()?
> 
> At cost of having to make xfs and tmpfs (well, actually it's the
> tiny !SHMEM ramfs case you're changing there in 03/12) dependent
> on synchronizing with your other changes?
> 
> I can see that when I converted shmem_truncate_range() (later extended
> to shmem_undo_range()) to partial_start and partial_end, I did put in
> an ugly couple of lines
> 	if (lend == -1)
> 		end = -1;	/* unsigned, so actually very big */
> but I think it's better there than "lend == -1 ? LLONG_MAX : lend;"
> ugliness spread around in the filesystems.

I find the -1 convention rather confusing and the above, as you said,
quite ugly. It seemed much cleaner to me to just pass what we
actually want, which is LLONG_MAX. You're right that "lend == -1 ?
LLONG_MAX : lend;" is not pretty, but it at least gives the file
systems a reason to fix that and stop using -1.

It is bad enough that zero_user_segment() and friends have the
rather weird convention where instead of "end" they actually expect
"end + 1", whereas zero_user() uses start + size instead. Things
like that, plus the -1, make it easy to get confused (at least for me).
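
For instance, zeroing the last 100 bytes of a 4096-byte page looks like
this with each of them (numbers purely for illustration):

	zero_user_segment(page, 3996, 4096);	/* start, end -- end exclusive */
	zero_user(page, 3996, 100);		/* start, size */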

> 
> I don't see any upside: please, could you just drop those changes,
> so that then it comes down to synchronizing ext4 with mm/truncate.c?

The reason for this change is to teach truncate_inode_pages_range() to
handle non page aligned ranges, which we can then make use of in
ext4.
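
For example (illustrative numbers only, keeping your inclusive-lend
convention), punching a 100 byte hole at byte offset 5000 could then
become a single call:

	truncate_inode_pages_range(mapping, 5000, 5099);

with the partial page zeroed by the generic code instead of hitting
the old BUG_ON on the unaligned end.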

> 
> > 
> > We can push patches 3, 4, 5 and 6 through the mm tree. But we'll
> > have to make sure that it will land before ext4 changes since I am
> > using the new truncate_partial_page() behaviour in ext4. Will that
> > be possible ?
> > 
> > Hugh ?
> 
> I'd like patchs 3, 4 and 5 to vanish.  As to 6 (perhaps it won't be
> 6 after that!), I want to work through that one myself (I'll try this
> evening): it looked over-complicated to me, and I'd rather go back to
> what I worked out for partial_end in shmem_truncate_range().

The code in shmem_undo_range() is really similar, except that it is
not able to handle an offset of LLONG_MAX, or actually even the last
page at that offset, whereas truncate_inode_pages_range() obviously
does. But again, there is the confusion with the "end" being "end + 1",
which seems odd.

> 
> Then yes, if we're all happy with the result of that, it will be best
> for Ted to take even the mm/truncate.c patch into his ext4 branch,
> made visible in linux-next before the merge window.
> 
> We're running out of time: I'll make every effort to get back to you
> on 6 after I've tested late today.

Great, thanks Hugh.

-Lukas

> 
> Hugh
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-13 13:19 ` [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges Lukas Czerner
@ 2012-07-17  8:28   ` Hugh Dickins
  2012-07-17 11:57     ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Hugh Dickins @ 2012-07-17  8:28 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Andrew Morton, Theodore Ts'o, Dave Chinner, linux-ext4,
	linux-fsdevel, achender

On Fri, 13 Jul 2012, Lukas Czerner wrote:
> This commit changes truncate_inode_pages_range() so it can handle non
> page aligned regions of the truncate. Currently we can hit BUG_ON when
> the end of the range is not page aligned, but he can handle unaligned
> start of the range.
> 
> Being able to handle non page aligned regions of the page can help file
> system punch_hole implementations and save some work, because once we're
> holding the page we might as well deal with it right away.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Hugh Dickins <hughd@google.com>

As I said under 02/12, I'd much rather not change from the existing -1
convention: I don't think it's wonderful, but I do think it's confusing
and a waste of effort to change from it; and I'd rather keep the code
in truncate.c close to what's doing the same job in shmem.c.

Here's what I came up with (and hacked tmpfs to use it without swap
temporarily, so I could run fsx for an hour to validate it).  But you
can see I've a couple of questions; and probably ought to reduce the
partial page code duplication once we're sure what should go in there.

Hugh

[PATCH]...

Apply to truncate_inode_pages_range() the changes 83e4fa9c16e4 ("tmpfs:
support fallocate FALLOC_FL_PUNCH_HOLE") made to shmem_truncate_range():
so the generic function can handle partial end offset for hole-punching.

In doing tmpfs, I became convinced that it needed a set_page_dirty() on
the partial pages, and I'm doing that here: but perhaps it should be the
responsibility of the calling filesystem?  I don't know.

And I'm doubtful whether this code can be correct (on a filesystem with
blocksize less than pagesize) without adding an end offset argument to
address_space_operations invalidatepage(page, offset): convince me!

Not-yet-signed-off-by: Hugh Dickins <hughd@google.com>
---

 mm/truncate.c |   69 +++++++++++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 27 deletions(-)

--- 3.5-rc7/mm/truncate.c	2012-06-03 06:42:11.249787128 -0700
+++ linux/mm/truncate.c	2012-07-16 22:54:16.903821549 -0700
@@ -49,14 +49,6 @@ void do_invalidatepage(struct page *page
 		(*invalidatepage)(page, offset);
 }
 
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
-	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-	cleancache_invalidate_page(page->mapping, page);
-	if (page_has_private(page))
-		do_invalidatepage(page, partial);
-}
-
 /*
  * This cancels just the dirty bit on the kernel page itself, it
  * does NOT actually remove dirty bits on any mmap's that may be
@@ -190,8 +182,8 @@ int invalidate_inode_page(struct page *p
  * @lend: offset to which to truncate
  *
  * Truncate the page cache, removing the pages that are between
- * specified offsets (and zeroing out partial page
- * (if lstart is not page aligned)).
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -206,31 +198,32 @@ int invalidate_inode_page(struct page *p
 void truncate_inode_pages_range(struct address_space *mapping,
 				loff_t lstart, loff_t lend)
 {
-	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t index;
-	pgoff_t end;
 	int i;
 
 	cleancache_invalidate_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
-	end = (lend >> PAGE_CACHE_SHIFT);
+	if (lend == -1)
+		end = -1;	/* unsigned, so actually very big */
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end && pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+	while (index < end && pagevec_lookup(&pvec, mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = page->index;
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (!trylock_page(page))
@@ -249,27 +242,51 @@ void truncate_inode_pages_range(struct a
 		index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = find_lock_page(mapping, start - 1);
 		if (page) {
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				top = partial_end;
+				partial_end = 0;
+			}
 			wait_on_page_writeback(page);
-			truncate_partial_page(page, partial);
+			zero_user_segment(page, partial_start, top);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, partial_start);
+			set_page_dirty(page);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (partial_end) {
+		struct page *page = find_lock_page(mapping, end);
+		if (page) {
+			wait_on_page_writeback(page);
+			zero_user_segment(page, 0, partial_end);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, 0);
+			set_page_dirty(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 			if (index == start)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && pvec.pages[0]->index > end) {
+		if (index == start && pvec.pages[0]->index >= end) {
 			pagevec_release(&pvec);
 			break;
 		}
@@ -279,7 +296,7 @@ void truncate_inode_pages_range(struct a
 
 			/* We rely upon deletion not changing page->index */
 			index = page->index;
-			if (index > end)
+			if (index >= end)
 				break;
 
 			lock_page(page);
@@ -624,10 +641,8 @@ void truncate_pagecache_range(struct ino
 	 * This rounding is currently just for example: unmap_mapping_range
 	 * expands its hole outwards, whereas we want it to contract the hole
 	 * inwards.  However, existing callers of truncate_pagecache_range are
-	 * doing their own page rounding first; and truncate_inode_pages_range
-	 * currently BUGs if lend is not pagealigned-1 (it handles partial
-	 * page at start of hole, but not partial page at end of hole).  Note
-	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+	 * doing their own page rounding first.  Note that unmap_mapping_range
+	 * allows holelen 0 for all, and we allow lend -1 for end of file.
 	 */
 
 	/*

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-17  8:28   ` Hugh Dickins
@ 2012-07-17 11:57     ` Lukáš Czerner
  2012-07-17 12:16       ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-17 11:57 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Lukas Czerner, Andrew Morton, Theodore Ts'o, Dave Chinner,
	linux-ext4, linux-fsdevel, achender

On Tue, 17 Jul 2012, Hugh Dickins wrote:

> Date: Tue, 17 Jul 2012 01:28:08 -0700 (PDT)
> From: Hugh Dickins <hughd@google.com>
> To: Lukas Czerner <lczerner@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
>     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
>     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to hadnle
>      non page aligned ranges
> 
> On Fri, 13 Jul 2012, Lukas Czerner wrote:
> > This commit changes truncate_inode_pages_range() so it can handle non
> > page aligned regions of the truncate. Currently we can hit BUG_ON when
> > the end of the range is not page aligned, but he can handle unaligned
> > start of the range.
> > 
> > Being able to handle non page aligned regions of the page can help file
> > system punch_hole implementations and save some work, because once we're
> > holding the page we might as well deal with it right away.
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > Cc: Hugh Dickins <hughd@google.com>
> 
> As I said under 02/12, I'd much rather not change from the existing -1
> convention: I don't think it's wonderful, but I do think it's confusing
> and a waste of effort to change from it; and I'd rather keep the code
> in truncate.c close to what's doing the same job in shmem.c.
> 
> Here's what I came up with (and hacked tmpfs to use it without swap
> temporarily, so I could run fsx for an hour to validate it).  But you
> can see I've a couple of questions; and probably ought to reduce the
> partial page code duplication once we're sure what should go in there.
> 
> Hugh

Ok.

> 
> [PATCH]...
> 
> Apply to truncate_inode_pages_range() the changes 83e4fa9c16e4 ("tmpfs:
> support fallocate FALLOC_FL_PUNCH_HOLE") made to shmem_truncate_range():
> so the generic function can handle partial end offset for hole-punching.
> 
> In doing tmpfs, I became convinced that it needed a set_page_dirty() on
> the partial pages, and I'm doing that here: but perhaps it should be the
> responsibility of the calling filesystem?  I don't know.

In the file system, if the range is block aligned we do not need the
page to be dirtied. However if it is not block aligned (at least in
ext4) we're going to handle it ourselves and possibly mark the page
buffer dirty (hence the page would be dirty anyway). Also in the case
of data journalling, we'll have to take care of the last block in the
hole ourselves. So I think file systems should take care of dirtying
the partial page if needed.

> 
> And I'm doubtful whether this code can be correct (on a filesystem with
> blocksize less than pagesize) without adding an end offset argument to
> address_space_operations invalidatepage(page, offset): convince me!

Well, I can't. It really seems that on file systems with block size <
page size we could potentially discard dirty buffers beyond the hole
we're punching if it is not page aligned. We would probably need to
add an end offset argument to the invalidatepage() aop. However I do
not seem to be able to trigger the problem yet, so maybe I'm still
missing something.

-Lukas

> 
> Not-yet-signed-off-by: Hugh Dickins <hughd@google.com>
> ---
> 
>  mm/truncate.c |   69 +++++++++++++++++++++++++++++-------------------
>  1 file changed, 42 insertions(+), 27 deletions(-)
> 
> --- 3.5-rc7/mm/truncate.c	2012-06-03 06:42:11.249787128 -0700
> +++ linux/mm/truncate.c	2012-07-16 22:54:16.903821549 -0700
> @@ -49,14 +49,6 @@ void do_invalidatepage(struct page *page
>  		(*invalidatepage)(page, offset);
>  }
>  
> -static inline void truncate_partial_page(struct page *page, unsigned partial)
> -{
> -	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
> -	cleancache_invalidate_page(page->mapping, page);
> -	if (page_has_private(page))
> -		do_invalidatepage(page, partial);
> -}
> -
>  /*
>   * This cancels just the dirty bit on the kernel page itself, it
>   * does NOT actually remove dirty bits on any mmap's that may be
> @@ -190,8 +182,8 @@ int invalidate_inode_page(struct page *p
>   * @lend: offset to which to truncate
>   *
>   * Truncate the page cache, removing the pages that are between
> - * specified offsets (and zeroing out partial page
> - * (if lstart is not page aligned)).
> + * specified offsets (and zeroing out partial pages
> + * if lstart or lend + 1 is not page aligned).
>   *
>   * Truncate takes two passes - the first pass is nonblocking.  It will not
>   * block on page locks and it will not block on writeback.  The second pass
> @@ -206,31 +198,32 @@ int invalidate_inode_page(struct page *p
>  void truncate_inode_pages_range(struct address_space *mapping,
>  				loff_t lstart, loff_t lend)
>  {
> -	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
> -	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
> +	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> +	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
> +	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
> +	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
>  	struct pagevec pvec;
>  	pgoff_t index;
> -	pgoff_t end;
>  	int i;
>  
>  	cleancache_invalidate_inode(mapping);
>  	if (mapping->nrpages == 0)
>  		return;
>  
> -	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
> -	end = (lend >> PAGE_CACHE_SHIFT);
> +	if (lend == -1)
> +		end = -1;	/* unsigned, so actually very big */
>  
>  	pagevec_init(&pvec, 0);
>  	index = start;
> -	while (index <= end && pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +	while (index < end && pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
>  		mem_cgroup_uncharge_start();
>  		for (i = 0; i < pagevec_count(&pvec); i++) {
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
>  			index = page->index;
> -			if (index > end)
> +			if (index >= end)
>  				break;
>  
>  			if (!trylock_page(page))
> @@ -249,27 +242,51 @@ void truncate_inode_pages_range(struct a
>  		index++;
>  	}
>  
> -	if (partial) {
> +	if (partial_start) {
>  		struct page *page = find_lock_page(mapping, start - 1);
>  		if (page) {
> +			unsigned int top = PAGE_CACHE_SIZE;
> +			if (start > end) {
> +				top = partial_end;
> +				partial_end = 0;
> +			}
>  			wait_on_page_writeback(page);
> -			truncate_partial_page(page, partial);
> +			zero_user_segment(page, partial_start, top);
> +			cleancache_invalidate_page(mapping, page);
> +			if (page_has_private(page))
> +				do_invalidatepage(page, partial_start);
> +			set_page_dirty(page);
>  			unlock_page(page);
>  			page_cache_release(page);
>  		}
>  	}
> +	if (partial_end) {
> +		struct page *page = find_lock_page(mapping, end);
> +		if (page) {
> +			wait_on_page_writeback(page);
> +			zero_user_segment(page, 0, partial_end);
> +			cleancache_invalidate_page(mapping, page);
> +			if (page_has_private(page))
> +				do_invalidatepage(page, 0);
> +			set_page_dirty(page);
> +			unlock_page(page);
> +			page_cache_release(page);
> +		}
> +	}
> +	if (start >= end)
> +		return;
>  
>  	index = start;
>  	for ( ; ; ) {
>  		cond_resched();
>  		if (!pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
>  			if (index == start)
>  				break;
>  			index = start;
>  			continue;
>  		}
> -		if (index == start && pvec.pages[0]->index > end) {
> +		if (index == start && pvec.pages[0]->index >= end) {
>  			pagevec_release(&pvec);
>  			break;
>  		}
> @@ -279,7 +296,7 @@ void truncate_inode_pages_range(struct a
>  
>  			/* We rely upon deletion not changing page->index */
>  			index = page->index;
> -			if (index > end)
> +			if (index >= end)
>  				break;
>  
>  			lock_page(page);
> @@ -624,10 +641,8 @@ void truncate_pagecache_range(struct ino
>  	 * This rounding is currently just for example: unmap_mapping_range
>  	 * expands its hole outwards, whereas we want it to contract the hole
>  	 * inwards.  However, existing callers of truncate_pagecache_range are
> -	 * doing their own page rounding first; and truncate_inode_pages_range
> -	 * currently BUGs if lend is not pagealigned-1 (it handles partial
> -	 * page at start of hole, but not partial page at end of hole).  Note
> -	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
> +	 * doing their own page rounding first.  Note that unmap_mapping_range
> +	 * allows holelen 0 for all, and we allow lend -1 for end of file.
>  	 */
>  
>  	/*
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-17 11:57     ` Lukáš Czerner
@ 2012-07-17 12:16       ` Lukáš Czerner
  2012-07-18  8:18         ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-17 12:16 UTC (permalink / raw)
  To: Lukáš Czerner
  Cc: Hugh Dickins, Andrew Morton, Theodore Ts'o, Dave Chinner,
	linux-ext4, linux-fsdevel, achender

[-- Attachment #1: Type: TEXT/PLAIN, Size: 9743 bytes --]

On Tue, 17 Jul 2012, Lukáš Czerner wrote:

> Date: Tue, 17 Jul 2012 13:57:42 +0200 (CEST)
> From: Lukáš Czerner <lczerner@redhat.com>
> To: Hugh Dickins <hughd@google.com>
> Cc: Lukas Czerner <lczerner@redhat.com>,
>     Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
>     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
>     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to hadnle
>      non page aligned ranges
> 
> On Tue, 17 Jul 2012, Hugh Dickins wrote:
> 
> > Date: Tue, 17 Jul 2012 01:28:08 -0700 (PDT)
> > From: Hugh Dickins <hughd@google.com>
> > To: Lukas Czerner <lczerner@redhat.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
> >     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
> >     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> > Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to hadnle
> >      non page aligned ranges
> > 
> > On Fri, 13 Jul 2012, Lukas Czerner wrote:
> > > This commit changes truncate_inode_pages_range() so it can handle non
> > > page aligned regions of the truncate. Currently we can hit BUG_ON when
> > > the end of the range is not page aligned, but he can handle unaligned
> > > start of the range.
> > > 
> > > Being able to handle non page aligned regions of the page can help file
> > > system punch_hole implementations and save some work, because once we're
> > > holding the page we might as well deal with it right away.
> > > 
> > > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > > Cc: Hugh Dickins <hughd@google.com>
> > 
> > As I said under 02/12, I'd much rather not change from the existing -1
> > convention: I don't think it's wonderful, but I do think it's confusing
> > and a waste of effort to change from it; and I'd rather keep the code
> > in truncate.c close to what's doing the same job in shmem.c.
> > 
> > Here's what I came up with (and hacked tmpfs to use it without swap
> > temporarily, so I could run fsx for an hour to validate it).  But you
> > can see I've a couple of questions; and probably ought to reduce the
> > partial page code duplication once we're sure what should go in there.
> > 
> > Hugh
> 
> Ok.
> 
> > 
> > [PATCH]...
> > 
> > Apply to truncate_inode_pages_range() the changes 83e4fa9c16e4 ("tmpfs:
> > support fallocate FALLOC_FL_PUNCH_HOLE") made to shmem_truncate_range():
> > so the generic function can handle partial end offset for hole-punching.
> > 
> > In doing tmpfs, I became convinced that it needed a set_page_dirty() on
> > the partial pages, and I'm doing that here: but perhaps it should be the
> > responsibility of the calling filesystem?  I don't know.
> 
> In file system, if the range is block aligned we do not need the page to
> be dirtied. However if it is not block aligned (at least in ext4)
> we're going to handle it ourselves and possibly mark the page buffer
> dirty (hence the page would be dirty). Also in case of data
> journalling, we'll have to take care of the last block in the hole
> ourselves. So I think file systems should take care of dirtying the
> partial page if needed.
> 
> > 
> > And I'm doubtful whether this code can be correct (on a filesystem with
> > blocksize less than pagesize) without adding an end offset argument to
> > address_space_operations invalidatepage(page, offset): convince me!
> 
> Well, I can't. It really seems that on block size < page size file
> systems we could potentially discard dirty buffers beyond the hole
> we're punching if it is not page aligned. We would probably need to
> add end offset argument to the invalidatepage() aop. However I do not
> seem to be able to trigger the problem yet so maybe I'm still
> missing something.

My bad, it definitely is not safe without the end offset argument in
invalidatepage() aops ..sigh..

> 
> -Lukas
> 
> > 
> > Not-yet-signed-off-by: Hugh Dickins <hughd@google.com>
> > ---
> > 
> >  mm/truncate.c |   69 +++++++++++++++++++++++++++++-------------------
> >  1 file changed, 42 insertions(+), 27 deletions(-)
> > 
> > --- 3.5-rc7/mm/truncate.c	2012-06-03 06:42:11.249787128 -0700
> > +++ linux/mm/truncate.c	2012-07-16 22:54:16.903821549 -0700
> > @@ -49,14 +49,6 @@ void do_invalidatepage(struct page *page
> >  		(*invalidatepage)(page, offset);
> >  }
> >  
> > -static inline void truncate_partial_page(struct page *page, unsigned partial)
> > -{
> > -	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
> > -	cleancache_invalidate_page(page->mapping, page);
> > -	if (page_has_private(page))
> > -		do_invalidatepage(page, partial);
> > -}
> > -
> >  /*
> >   * This cancels just the dirty bit on the kernel page itself, it
> >   * does NOT actually remove dirty bits on any mmap's that may be
> > @@ -190,8 +182,8 @@ int invalidate_inode_page(struct page *p
> >   * @lend: offset to which to truncate
> >   *
> >   * Truncate the page cache, removing the pages that are between
> > - * specified offsets (and zeroing out partial page
> > - * (if lstart is not page aligned)).
> > + * specified offsets (and zeroing out partial pages
> > + * if lstart or lend + 1 is not page aligned).
> >   *
> >   * Truncate takes two passes - the first pass is nonblocking.  It will not
> >   * block on page locks and it will not block on writeback.  The second pass
> > @@ -206,31 +198,32 @@ int invalidate_inode_page(struct page *p
> >  void truncate_inode_pages_range(struct address_space *mapping,
> >  				loff_t lstart, loff_t lend)
> >  {
> > -	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
> > -	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
> > +	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> > +	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
> > +	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
> > +	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
> >  	struct pagevec pvec;
> >  	pgoff_t index;
> > -	pgoff_t end;
> >  	int i;
> >  
> >  	cleancache_invalidate_inode(mapping);
> >  	if (mapping->nrpages == 0)
> >  		return;
> >  
> > -	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
> > -	end = (lend >> PAGE_CACHE_SHIFT);
> > +	if (lend == -1)
> > +		end = -1;	/* unsigned, so actually very big */
> >  
> >  	pagevec_init(&pvec, 0);
> >  	index = start;
> > -	while (index <= end && pagevec_lookup(&pvec, mapping, index,
> > -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> > +	while (index < end && pagevec_lookup(&pvec, mapping, index,
> > +			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> >  		mem_cgroup_uncharge_start();
> >  		for (i = 0; i < pagevec_count(&pvec); i++) {
> >  			struct page *page = pvec.pages[i];
> >  
> >  			/* We rely upon deletion not changing page->index */
> >  			index = page->index;
> > -			if (index > end)
> > +			if (index >= end)
> >  				break;
> >  
> >  			if (!trylock_page(page))
> > @@ -249,27 +242,51 @@ void truncate_inode_pages_range(struct a
> >  		index++;
> >  	}
> >  
> > -	if (partial) {
> > +	if (partial_start) {
> >  		struct page *page = find_lock_page(mapping, start - 1);
> >  		if (page) {
> > +			unsigned int top = PAGE_CACHE_SIZE;
> > +			if (start > end) {
> > +				top = partial_end;
> > +				partial_end = 0;
> > +			}
> >  			wait_on_page_writeback(page);
> > -			truncate_partial_page(page, partial);
> > +			zero_user_segment(page, partial_start, top);
> > +			cleancache_invalidate_page(mapping, page);
> > +			if (page_has_private(page))
> > +				do_invalidatepage(page, partial_start);
> > +			set_page_dirty(page);
> >  			unlock_page(page);
> >  			page_cache_release(page);
> >  		}
> >  	}
> > +	if (partial_end) {
> > +		struct page *page = find_lock_page(mapping, end);
> > +		if (page) {
> > +			wait_on_page_writeback(page);
> > +			zero_user_segment(page, 0, partial_end);
> > +			cleancache_invalidate_page(mapping, page);
> > +			if (page_has_private(page))
> > +				do_invalidatepage(page, 0);
> > +			set_page_dirty(page);
> > +			unlock_page(page);
> > +			page_cache_release(page);
> > +		}
> > +	}
> > +	if (start >= end)
> > +		return;
> >  
> >  	index = start;
> >  	for ( ; ; ) {
> >  		cond_resched();
> >  		if (!pagevec_lookup(&pvec, mapping, index,
> > -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> > +			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> >  			if (index == start)
> >  				break;
> >  			index = start;
> >  			continue;
> >  		}
> > -		if (index == start && pvec.pages[0]->index > end) {
> > +		if (index == start && pvec.pages[0]->index >= end) {
> >  			pagevec_release(&pvec);
> >  			break;
> >  		}
> > @@ -279,7 +296,7 @@ void truncate_inode_pages_range(struct a
> >  
> >  			/* We rely upon deletion not changing page->index */
> >  			index = page->index;
> > -			if (index > end)
> > +			if (index >= end)
> >  				break;
> >  
> >  			lock_page(page);
> > @@ -624,10 +641,8 @@ void truncate_pagecache_range(struct ino
> >  	 * This rounding is currently just for example: unmap_mapping_range
> >  	 * expands its hole outwards, whereas we want it to contract the hole
> >  	 * inwards.  However, existing callers of truncate_pagecache_range are
> > -	 * doing their own page rounding first; and truncate_inode_pages_range
> > -	 * currently BUGs if lend is not pagealigned-1 (it handles partial
> > -	 * page at start of hole, but not partial page at end of hole).  Note
> > -	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
> > +	 * doing their own page rounding first.  Note that unmap_mapping_range
> > +	 * allows holelen 0 for all, and we allow lend -1 for end of file.
> >  	 */
> >  
> >  	/*
> > 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-17 12:16       ` Lukáš Czerner
@ 2012-07-18  8:18         ` Lukáš Czerner
  2012-07-18 19:36           ` Hugh Dickins
  0 siblings, 1 reply; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-18  8:18 UTC (permalink / raw)
  To: Lukáš Czerner
  Cc: Hugh Dickins, Andrew Morton, Theodore Ts'o, Dave Chinner,
	linux-ext4, linux-fsdevel, achender

[-- Attachment #1: Type: TEXT/PLAIN, Size: 11517 bytes --]

On Tue, 17 Jul 2012, Lukáš Czerner wrote:

> Date: Tue, 17 Jul 2012 14:16:48 +0200 (CEST)
> From: Lukáš Czerner <lczerner@redhat.com>
> To: Lukáš Czerner <lczerner@redhat.com>
> Cc: Hugh Dickins <hughd@google.com>,
>     Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
>     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
>     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to hadnle
>      non page aligned ranges
> 
> On Tue, 17 Jul 2012, Lukáš Czerner wrote:
> 
> > Date: Tue, 17 Jul 2012 13:57:42 +0200 (CEST)
> > From: Lukáš Czerner <lczerner@redhat.com>
> > To: Hugh Dickins <hughd@google.com>
> > Cc: Lukas Czerner <lczerner@redhat.com>,
> >     Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
> >     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
> >     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> > Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to hadnle
> >      non page aligned ranges
> > 
> > On Tue, 17 Jul 2012, Hugh Dickins wrote:
> > 
> > > Date: Tue, 17 Jul 2012 01:28:08 -0700 (PDT)
> > > From: Hugh Dickins <hughd@google.com>
> > > To: Lukas Czerner <lczerner@redhat.com>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
> > >     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
> > >     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> > > Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to hadnle
> > >      non page aligned ranges
> > > 
> > > On Fri, 13 Jul 2012, Lukas Czerner wrote:
> > > > This commit changes truncate_inode_pages_range() so it can handle non
> > > > page aligned regions of the truncate. Currently we can hit BUG_ON when
> > > > the end of the range is not page aligned, but he can handle unaligned
> > > > start of the range.
> > > > 
> > > > Being able to handle non page aligned regions of the page can help file
> > > > system punch_hole implementations and save some work, because once we're
> > > > holding the page we might as well deal with it right away.
> > > > 
> > > > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > > > Cc: Hugh Dickins <hughd@google.com>
> > > 
> > > As I said under 02/12, I'd much rather not change from the existing -1
> > > convention: I don't think it's wonderful, but I do think it's confusing
> > > and a waste of effort to change from it; and I'd rather keep the code
> > > in truncate.c close to what's doing the same job in shmem.c.
> > > 
> > > Here's what I came up with (and hacked tmpfs to use it without swap
> > > temporarily, so I could run fsx for an hour to validate it).  But you
> > > can see I've a couple of questions; and probably ought to reduce the
> > > partial page code duplication once we're sure what should go in there.
> > > 
> > > Hugh
> > 
> > Ok.
> > 
> > > 
> > > [PATCH]...
> > > 
> > > Apply to truncate_inode_pages_range() the changes 83e4fa9c16e4 ("tmpfs:
> > > support fallocate FALLOC_FL_PUNCH_HOLE") made to shmem_truncate_range():
> > > so the generic function can handle partial end offset for hole-punching.
> > > 
> > > In doing tmpfs, I became convinced that it needed a set_page_dirty() on
> > > the partial pages, and I'm doing that here: but perhaps it should be the
> > > responsibility of the calling filesystem?  I don't know.
> > 
> > In file system, if the range is block aligned we do not need the page to
> > be dirtied. However if it is not block aligned (at least in ext4)
> > we're going to handle it ourselves and possibly mark the page buffer
> > dirty (hence the page would be dirty). Also in case of data
> > journalling, we'll have to take care of the last block in the hole
> > ourselves. So I think file systems should take care of dirtying the
> > partial page if needed.
> > 
> > > 
> > > And I'm doubtful whether this code can be correct (on a filesystem with
> > > blocksize less than pagesize) without adding an end offset argument to
> > > address_space_operations invalidatepage(page, offset): convince me!
> > 
> > Well, I can't. It really seems that on block size < page size file
> > systems we could potentially discard dirty buffers beyond the hole
> > we're punching if it is not page aligned. We would probably need to
> > add end offset argument to the invalidatepage() aop. However I do not
> > seem to be able to trigger the problem yet so maybe I'm still
> > missing something.
> 
> My bad, it definitely is not safe without the end offset argument in
> invalidatepage() aops ..sigh..

So what about having a new aop, invalidatepage_range, and using that in
truncate_inode_pages_range()? We can still BUG_ON if the file
system registers invalidatepage, but not invalidatepage_range,
while the range to truncate is not page aligned at the end.
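
Roughly what I have in mind - just a sketch, not the final interface
(names and types here are hypothetical):

	/* new, optional aop next to ->invalidatepage() */
	void (*invalidatepage_range)(struct page *, unsigned int offset,
				     unsigned int length);

	/* and a hypothetical do_invalidatepage_range(page, offset, length) */
	if (page->mapping->a_ops->invalidatepage_range)
		page->mapping->a_ops->invalidatepage_range(page, offset, length);
	else {
		/* the old hook can only invalidate up to the end of the page */
		BUG_ON(offset + length != PAGE_CACHE_SIZE);
		do_invalidatepage(page, offset);
	}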

I am sure more file systems than just ext4 can take advantage of
this. Currently only ext4, xfs and ocfs2 support punch hole and I
think that all of them could use a truncate_inode_pages_range() which
handles unaligned ranges.

Currently ext4 has its own overcomplicated method of freeing and
zeroing unaligned ranges, xfs seems to just truncate the whole file,
and there seems to be a bug in ocfs2 where we can hit a BUG_ON when
the cluster size < page size.

What do you reckon ?

-Lukas

> 
> > 
> > -Lukas
> > 
> > > 
> > > Not-yet-signed-off-by: Hugh Dickins <hughd@google.com>
> > > ---
> > > 
> > >  mm/truncate.c |   69 +++++++++++++++++++++++++++++-------------------
> > >  1 file changed, 42 insertions(+), 27 deletions(-)
> > > 
> > > --- 3.5-rc7/mm/truncate.c	2012-06-03 06:42:11.249787128 -0700
> > > +++ linux/mm/truncate.c	2012-07-16 22:54:16.903821549 -0700
> > > @@ -49,14 +49,6 @@ void do_invalidatepage(struct page *page
> > >  		(*invalidatepage)(page, offset);
> > >  }
> > >  
> > > -static inline void truncate_partial_page(struct page *page, unsigned partial)
> > > -{
> > > -	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
> > > -	cleancache_invalidate_page(page->mapping, page);
> > > -	if (page_has_private(page))
> > > -		do_invalidatepage(page, partial);
> > > -}
> > > -
> > >  /*
> > >   * This cancels just the dirty bit on the kernel page itself, it
> > >   * does NOT actually remove dirty bits on any mmap's that may be
> > > @@ -190,8 +182,8 @@ int invalidate_inode_page(struct page *p
> > >   * @lend: offset to which to truncate
> > >   *
> > >   * Truncate the page cache, removing the pages that are between
> > > - * specified offsets (and zeroing out partial page
> > > - * (if lstart is not page aligned)).
> > > + * specified offsets (and zeroing out partial pages
> > > + * if lstart or lend + 1 is not page aligned).
> > >   *
> > >   * Truncate takes two passes - the first pass is nonblocking.  It will not
> > >   * block on page locks and it will not block on writeback.  The second pass
> > > @@ -206,31 +198,32 @@ int invalidate_inode_page(struct page *p
> > >  void truncate_inode_pages_range(struct address_space *mapping,
> > >  				loff_t lstart, loff_t lend)
> > >  {
> > > -	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
> > > -	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
> > > +	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> > > +	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
> > > +	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
> > > +	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
> > >  	struct pagevec pvec;
> > >  	pgoff_t index;
> > > -	pgoff_t end;
> > >  	int i;
> > >  
> > >  	cleancache_invalidate_inode(mapping);
> > >  	if (mapping->nrpages == 0)
> > >  		return;
> > >  
> > > -	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
> > > -	end = (lend >> PAGE_CACHE_SHIFT);
> > > +	if (lend == -1)
> > > +		end = -1;	/* unsigned, so actually very big */
> > >  
> > >  	pagevec_init(&pvec, 0);
> > >  	index = start;
> > > -	while (index <= end && pagevec_lookup(&pvec, mapping, index,
> > > -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> > > +	while (index < end && pagevec_lookup(&pvec, mapping, index,
> > > +			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> > >  		mem_cgroup_uncharge_start();
> > >  		for (i = 0; i < pagevec_count(&pvec); i++) {
> > >  			struct page *page = pvec.pages[i];
> > >  
> > >  			/* We rely upon deletion not changing page->index */
> > >  			index = page->index;
> > > -			if (index > end)
> > > +			if (index >= end)
> > >  				break;
> > >  
> > >  			if (!trylock_page(page))
> > > @@ -249,27 +242,51 @@ void truncate_inode_pages_range(struct a
> > >  		index++;
> > >  	}
> > >  
> > > -	if (partial) {
> > > +	if (partial_start) {
> > >  		struct page *page = find_lock_page(mapping, start - 1);
> > >  		if (page) {
> > > +			unsigned int top = PAGE_CACHE_SIZE;
> > > +			if (start > end) {
> > > +				top = partial_end;
> > > +				partial_end = 0;
> > > +			}
> > >  			wait_on_page_writeback(page);
> > > -			truncate_partial_page(page, partial);
> > > +			zero_user_segment(page, partial_start, top);
> > > +			cleancache_invalidate_page(mapping, page);
> > > +			if (page_has_private(page))
> > > +				do_invalidatepage(page, partial_start);
> > > +			set_page_dirty(page);
> > >  			unlock_page(page);
> > >  			page_cache_release(page);
> > >  		}
> > >  	}
> > > +	if (partial_end) {
> > > +		struct page *page = find_lock_page(mapping, end);
> > > +		if (page) {
> > > +			wait_on_page_writeback(page);
> > > +			zero_user_segment(page, 0, partial_end);
> > > +			cleancache_invalidate_page(mapping, page);
> > > +			if (page_has_private(page))
> > > +				do_invalidatepage(page, 0);
> > > +			set_page_dirty(page);
> > > +			unlock_page(page);
> > > +			page_cache_release(page);
> > > +		}
> > > +	}
> > > +	if (start >= end)
> > > +		return;
> > >  
> > >  	index = start;
> > >  	for ( ; ; ) {
> > >  		cond_resched();
> > >  		if (!pagevec_lookup(&pvec, mapping, index,
> > > -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> > > +			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> > >  			if (index == start)
> > >  				break;
> > >  			index = start;
> > >  			continue;
> > >  		}
> > > -		if (index == start && pvec.pages[0]->index > end) {
> > > +		if (index == start && pvec.pages[0]->index >= end) {
> > >  			pagevec_release(&pvec);
> > >  			break;
> > >  		}
> > > @@ -279,7 +296,7 @@ void truncate_inode_pages_range(struct a
> > >  
> > >  			/* We rely upon deletion not changing page->index */
> > >  			index = page->index;
> > > -			if (index > end)
> > > +			if (index >= end)
> > >  				break;
> > >  
> > >  			lock_page(page);
> > > @@ -624,10 +641,8 @@ void truncate_pagecache_range(struct ino
> > >  	 * This rounding is currently just for example: unmap_mapping_range
> > >  	 * expands its hole outwards, whereas we want it to contract the hole
> > >  	 * inwards.  However, existing callers of truncate_pagecache_range are
> > > -	 * doing their own page rounding first; and truncate_inode_pages_range
> > > -	 * currently BUGs if lend is not pagealigned-1 (it handles partial
> > > -	 * page at start of hole, but not partial page at end of hole).  Note
> > > -	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
> > > +	 * doing their own page rounding first.  Note that unmap_mapping_range
> > > +	 * allows holelen 0 for all, and we allow lend -1 for end of file.
> > >  	 */
> > >  
> > >  	/*
> > > 
> > 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-13 13:19 ` [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure" Lukas Czerner
  2012-07-13 17:42   ` Theodore Ts'o
@ 2012-07-18 19:34   ` Eric Sandeen
  2012-07-19  6:45     ` Lukáš Czerner
  1 sibling, 1 reply; 31+ messages in thread
From: Eric Sandeen @ 2012-07-18 19:34 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: linux-ext4, linux-fsdevel, tytso, achender

On 7/13/12 8:19 AM, Lukas Czerner wrote:
> This reverts commit 189e868fa8fdca702eb9db9d8afc46b5cb9144c9.
> 
> This commit reintroduces the use of ext4_block_truncate_page() in ext4
> truncate operation instead of ext4_discard_partial_page_buffers().

And I think commit 15291164b22a357cb211b618adfef4fa82fc0de3
jbd2: clear BH_Delay & BH_Unwritten in journal_unmap_buffer

fixed the fsx problem referenced in that commit more properly.

> The statement in the commit description that the truncate operation only
> zero block unaligned portion of the last page is not exactly right,
> since truncate_pagecache_range() also zeroes and invalidate the unaligned
> portion of the page. Then there is no need to zero and unmap it once more
> and ext4_block_truncate_page() was doing the right job, although we
> still need to update the buffer head containing the last block, which is
> exactly what ext4_block_truncate_page() is doing.
> 
> This was tested on ppc64 machine with block size of 1024 bytes without
> any problems.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> ---
>  fs/ext4/extents.c  |   13 ++-----------
>  fs/ext4/indirect.c |   13 ++-----------
>  2 files changed, 4 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 91341ec..ceab5f5 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4222,7 +4222,6 @@ void ext4_ext_truncate(struct inode *inode)
>  	struct super_block *sb = inode->i_sb;
>  	ext4_lblk_t last_block;
>  	handle_t *handle;
> -	loff_t page_len;
>  	int err = 0;
>  
>  	/*
> @@ -4239,16 +4238,8 @@ void ext4_ext_truncate(struct inode *inode)
>  	if (IS_ERR(handle))
>  		return;
>  
> -	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
> -		page_len = PAGE_CACHE_SIZE -
> -			(inode->i_size & (PAGE_CACHE_SIZE - 1));
> -
> -		err = ext4_discard_partial_page_buffers(handle,
> -			mapping, inode->i_size, page_len, 0);
> -
> -		if (err)
> -			goto out_stop;
> -	}
> +	if (inode->i_size & (sb->s_blocksize - 1))
> +		ext4_block_truncate_page(handle, mapping, inode->i_size);
>  
>  	if (ext4_orphan_add(handle, inode))
>  		goto out_stop;
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 830e1b2..a082b30 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -1349,9 +1349,7 @@ void ext4_ind_truncate(struct inode *inode)
>  	__le32 nr = 0;
>  	int n = 0;
>  	ext4_lblk_t last_block, max_block;
> -	loff_t page_len;
>  	unsigned blocksize = inode->i_sb->s_blocksize;
> -	int err;
>  
>  	handle = start_transaction(inode);
>  	if (IS_ERR(handle))
> @@ -1362,16 +1360,9 @@ void ext4_ind_truncate(struct inode *inode)
>  	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
>  					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
>  
> -	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
> -		page_len = PAGE_CACHE_SIZE -
> -			(inode->i_size & (PAGE_CACHE_SIZE - 1));
> -
> -		err = ext4_discard_partial_page_buffers(handle,
> -			mapping, inode->i_size, page_len, 0);
> -
> -		if (err)
> +	if (inode->i_size & (blocksize - 1))
> +		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
>  			goto out_stop;
> -	}
>  
>  	if (last_block != max_block) {
>  		n = ext4_block_to_path(inode, last_block, offsets, NULL);
> 



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-18  8:18         ` Lukáš Czerner
@ 2012-07-18 19:36           ` Hugh Dickins
  2012-07-19  7:15             ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Hugh Dickins @ 2012-07-18 19:36 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Christoph Hellwig, Andrew Morton, Theodore Ts'o,
	Dave Chinner, linux-ext4, linux-fsdevel, achender

On Wed, 18 Jul 2012, Lukas Czerner wrote:
> On Tue, 17 Jul 2012, Lukas Czerner wrote:
> > 
> > My bad, it definitely is not safe without the end offset argument in
> > invalidatepage() aops ..sigh..
> 
> So what about having new aop invalidatepage_range and using that in
> the truncate_inode_pages_range(). We can still BUG_ON if the file
> system register invalidatepage, but not invalidatepage_range,
> when the range to truncate is not page aligned at the end.

I had some trouble parsing what you wrote, and have slightly adjusted
it (mainly adding a comma) to fit my understanding: shout at me if I'm
misrepresenting you!

Yes, I think that's what has to be done.  It's irritating to have two
methods doing the same job, but not nearly so irritating as having to
change core and all filesystems at the same time.  Then at some future
date there can be a cleanup to remove the old invalidatepage method.
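
(Purely as an illustration of the transition I have in mind -- the
range variant is hypothetical here, only ->invalidatepage exists
today -- a converted filesystem would simply register both methods
for a while:

	static const struct address_space_operations ext4_aops = {
		/* ...all the existing methods stay as they are... */
		.invalidatepage		= ext4_invalidatepage,
		.invalidatepage_range	= ext4_invalidatepage_range,
	};

and core code would prefer ->invalidatepage_range whenever it is set.)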

> 
> I am sure more file system than just ext4 can take advantage of
> this. Currently only ext4, xfs and ocfs2 support punch hole and I
> think that all of them can use truncate_inode_pages_range() which
> handles unaligned ranges.

I expect that they can, but I'm far from sure of it: each filesystem
will have its own needs and difficulties, which might delay them from
a quick switchover to invalidatepage_range.

> 
> Currently ext4 has it's own overcomplicated method of freeing and
> zeroing unaligned ranges.

You're best placed to judge if it's overcomplicated; I've not looked.

> Xfs seems just truncate the whole file and

I doubt that can be the case: how would it ever pass testing with
the hole-punching fsx if so?  But it is the case that xfs unmaps
all the pages from the hole onwards, in the exceptional case where the
punched file is currently mmap'ed into userspace; and that is wrong,
and will get fixed, but it's not a huge deal meanwhile.  (But it
does suggest that hole-punching is more difficult to get completely
right than people think at first.)

> there seems to be a bug in ocfs2 where we can hit BUG_ON when the
> cluster size < page size.
> 
> What do you reckon ?

I agree that you need invalidatepage_range for truncate_inode_pages_range
to drop its end alignment restriction.  But now that we have to add a
method, I think it would be a more convincing justification to have two
filesystems converted to make use of it, than just the one ext4.

Hugh

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range
  2012-07-13 13:19 ` [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range Lukas Czerner
@ 2012-07-18 19:54   ` Eric Sandeen
  2012-07-19  6:40     ` Lukáš Czerner
  0 siblings, 1 reply; 31+ messages in thread
From: Eric Sandeen @ 2012-07-18 19:54 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: linux-ext4, linux-fsdevel, tytso, achender, Hugh Dickins

On 7/13/12 8:19 AM, Lukas Czerner wrote:
> Currently we're passing -1 to shmem_truncate_range which can then call
> truncate_inode_pages_range() which is actually really confusing since the
> argument is signed so we do not get "huge" number as one would expect,
> but rather just -1. To make things clearer and easier for
> truncate_inode_pages_range() just pass LLONG_MAX since it is actually what
> was intended anyway.
> 
> It also makes thing easier for allowing truncate_inode_pages_range() to
> handle non page aligned regions. Moreover letting the lend argument to
> be negative might actually hide some bugs.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Hugh Dickins <hughd@google.com>
> ---
>  mm/shmem.c |    3 ++-
>  1 files changed, 2 insertions(+), 1 deletions(-)
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 4ce02e0..3199733 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2961,7 +2961,8 @@ void shmem_unlock_mapping(struct address_space *mapping)
>  
>  void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
>  {
> -	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
> +	truncate_inode_pages_range(inode->i_mapping, lstart,
> +				   lend == -1 ? LLONG_MAX : last);

Where'd "last" come from?  That won't build; should be "lend" right?

(code only goes this way if !CONFIG_SHMEM so you might have missed it)
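
Presumably the intent was simply this (untested, just the obvious fix):

	truncate_inode_pages_range(inode->i_mapping, lstart,
				   lend == -1 ? LLONG_MAX : lend);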

-Eric

>  }
>  EXPORT_SYMBOL_GPL(shmem_truncate_range);
>  
> 



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range
  2012-07-18 19:54   ` Eric Sandeen
@ 2012-07-19  6:40     ` Lukáš Czerner
  0 siblings, 0 replies; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-19  6:40 UTC (permalink / raw)
  To: Eric Sandeen
  Cc: Lukas Czerner, linux-ext4, linux-fsdevel, tytso, achender, Hugh Dickins

On Wed, 18 Jul 2012, Eric Sandeen wrote:

> Date: Wed, 18 Jul 2012 14:54:04 -0500
> From: Eric Sandeen <sandeen@redhat.com>
> To: Lukas Czerner <lczerner@redhat.com>
> Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org, tytso@mit.edu,
>     achender@linux.vnet.ibm.com, Hugh Dickins <hughd@google.com>
> Subject: Re: [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range
> 
> On 7/13/12 8:19 AM, Lukas Czerner wrote:
> > Currently we're passing -1 to shmem_truncate_range which can then call
> > truncate_inode_pages_range() which is actually really confusing since the
> > argument is signed so we do not get "huge" number as one would expect,
> > but rather just -1. To make things clearer and easier for
> > truncate_inode_pages_range() just pass LLONG_MAX since it is actually what
> > was intended anyway.
> > 
> > It also makes thing easier for allowing truncate_inode_pages_range() to
> > handle non page aligned regions. Moreover letting the lend argument to
> > be negative might actually hide some bugs.
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > Cc: Hugh Dickins <hughd@google.com>
> > ---
> >  mm/shmem.c |    3 ++-
> >  1 files changed, 2 insertions(+), 1 deletions(-)
> > 
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 4ce02e0..3199733 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -2961,7 +2961,8 @@ void shmem_unlock_mapping(struct address_space *mapping)
> >  
> >  void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
> >  {
> > -	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
> > +	truncate_inode_pages_range(inode->i_mapping, lstart,
> > +				   lend == -1 ? LLONG_MAX : last);
> 
> Where'd "last" come from?  That won't build; should be "lend" right?
> 
> (code only goes this way if !CONFIG_SHMEM so you might have missed it)

Ah, I definitely missed it. Thanks Eric. But from the discussion
with Hugh, we're going to do this a bit differently anyway.

-Lukas

> 
> -Eric
> 
> >  }
> >  EXPORT_SYMBOL_GPL(shmem_truncate_range);
> >  
> > 
> 
> 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
  2012-07-18 19:34   ` Eric Sandeen
@ 2012-07-19  6:45     ` Lukáš Czerner
  0 siblings, 0 replies; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-19  6:45 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Lukas Czerner, linux-ext4, linux-fsdevel, tytso, achender

On Wed, 18 Jul 2012, Eric Sandeen wrote:

> Date: Wed, 18 Jul 2012 14:34:47 -0500
> From: Eric Sandeen <sandeen@redhat.com>
> To: Lukas Czerner <lczerner@redhat.com>
> Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org, tytso@mit.edu,
>     achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure"
> 
> On 7/13/12 8:19 AM, Lukas Czerner wrote:
> > This reverts commit 189e868fa8fdca702eb9db9d8afc46b5cb9144c9.
> > 
> > This commit reintroduces the use of ext4_block_truncate_page() in ext4
> > truncate operation instead of ext4_discard_partial_page_buffers().
> 
> And I think commit 15291164b22a357cb211b618adfef4fa82fc0de3
> jbd2: clear BH_Delay & BH_Unwritten in journal_unmap_buffer
> 
> fixed the fsx problem referenced in that commit more properly.

You're probably right, I'll test that with your test case and add it
to the description.

Thanks!
-Lukas

> 
> > The statement in the commit description that the truncate operation only
> > zero block unaligned portion of the last page is not exactly right,
> > since truncate_pagecache_range() also zeroes and invalidate the unaligned
> > portion of the page. Then there is no need to zero and unmap it once more
> > and ext4_block_truncate_page() was doing the right job, although we
> > still need to update the buffer head containing the last block, which is
> > exactly what ext4_block_truncate_page() is doing.
> > 
> > This was tested on ppc64 machine with block size of 1024 bytes without
> > any problems.
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > ---
> >  fs/ext4/extents.c  |   13 ++-----------
> >  fs/ext4/indirect.c |   13 ++-----------
> >  2 files changed, 4 insertions(+), 22 deletions(-)
> > 
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index 91341ec..ceab5f5 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -4222,7 +4222,6 @@ void ext4_ext_truncate(struct inode *inode)
> >  	struct super_block *sb = inode->i_sb;
> >  	ext4_lblk_t last_block;
> >  	handle_t *handle;
> > -	loff_t page_len;
> >  	int err = 0;
> >  
> >  	/*
> > @@ -4239,16 +4238,8 @@ void ext4_ext_truncate(struct inode *inode)
> >  	if (IS_ERR(handle))
> >  		return;
> >  
> > -	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
> > -		page_len = PAGE_CACHE_SIZE -
> > -			(inode->i_size & (PAGE_CACHE_SIZE - 1));
> > -
> > -		err = ext4_discard_partial_page_buffers(handle,
> > -			mapping, inode->i_size, page_len, 0);
> > -
> > -		if (err)
> > -			goto out_stop;
> > -	}
> > +	if (inode->i_size & (sb->s_blocksize - 1))
> > +		ext4_block_truncate_page(handle, mapping, inode->i_size);
> >  
> >  	if (ext4_orphan_add(handle, inode))
> >  		goto out_stop;
> > diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> > index 830e1b2..a082b30 100644
> > --- a/fs/ext4/indirect.c
> > +++ b/fs/ext4/indirect.c
> > @@ -1349,9 +1349,7 @@ void ext4_ind_truncate(struct inode *inode)
> >  	__le32 nr = 0;
> >  	int n = 0;
> >  	ext4_lblk_t last_block, max_block;
> > -	loff_t page_len;
> >  	unsigned blocksize = inode->i_sb->s_blocksize;
> > -	int err;
> >  
> >  	handle = start_transaction(inode);
> >  	if (IS_ERR(handle))
> > @@ -1362,16 +1360,9 @@ void ext4_ind_truncate(struct inode *inode)
> >  	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
> >  					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
> >  
> > -	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
> > -		page_len = PAGE_CACHE_SIZE -
> > -			(inode->i_size & (PAGE_CACHE_SIZE - 1));
> > -
> > -		err = ext4_discard_partial_page_buffers(handle,
> > -			mapping, inode->i_size, page_len, 0);
> > -
> > -		if (err)
> > +	if (inode->i_size & (blocksize - 1))
> > +		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
> >  			goto out_stop;
> > -	}
> >  
> >  	if (last_block != max_block) {
> >  		n = ext4_block_to_path(inode, last_block, offsets, NULL);
> > 
> 
> 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-18 19:36           ` Hugh Dickins
@ 2012-07-19  7:15             ` Lukáš Czerner
  2012-07-19 23:07               ` Dave Chinner
  0 siblings, 1 reply; 31+ messages in thread
From: Lukáš Czerner @ 2012-07-19  7:15 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Lukas Czerner, Christoph Hellwig, Andrew Morton,
	Theodore Ts'o, Dave Chinner, linux-ext4, linux-fsdevel,
	achender

On Wed, 18 Jul 2012, Hugh Dickins wrote:

> Date: Wed, 18 Jul 2012 12:36:39 -0700 (PDT)
> From: Hugh Dickins <hughd@google.com>
> To: Lukas Czerner <lczerner@redhat.com>
> Cc: Christoph Hellwig <hch@infradead.org>,
>     Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
>     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
>     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle
>      non page aligned ranges
> 
> On Wed, 18 Jul 2012, Lukas Czerner wrote:
> > On Tue, 17 Jul 2012, Lukas Czerner wrote:
> > > 
> > > My bad, it definitely is not safe without the end offset argument in
> > > invalidatepage() aops ..sigh..
> > 
> > So what about having new aop invalidatepage_range and using that in
> > the truncate_inode_pages_range(). We can still BUG_ON if the file
> > system register invalidatepage, but not invalidatepage_range,
> > when the range to truncate is not page aligned at the end.
> 
> I had some trouble parsing what you wrote, and have slightly adjusted
> it (mainly adding a comma) to fit my understanding: shout at me if I'm
> misrepresenting you!
> 
> Yes, I think that's what has to be done.  It's irritating to have two
> methods doing the same job, but not nearly so irritating as having to
> change core and all filesystems at the same time.  Then at some future
> date there can be a cleanup to remove the old invalidatepage method.

Agreed!

> 
> > 
> > I am sure more file system than just ext4 can take advantage of
> > this. Currently only ext4, xfs and ocfs2 support punch hole and I
> > think that all of them can use truncate_inode_pages_range() which
> > handles unaligned ranges.
> 
> I expect that they can, but I'm far from sure of it: each filesystem
> will have its own needs and difficulties, which might delay them from
> a quick switchover to invalidatepage_range.
> 
> > 
> > Currently ext4 has it's own overcomplicated method of freeing and
> > zeroing unaligned ranges.
> 
> You're best placed to judge if its overcomplicated, I've not looked.
> 
> > Xfs seems just truncate the whole file and
> 
> I doubt that can be the case: how would it ever pass testing with
> the hole-punching fsx if so?  But it is the case that xfs unmaps
> all the pages from hole onwards, in the exceptional case where the
> punched file is currently mmap'ed into userspace; and that is wrong,
> and will get fixed, but it's not a huge big deal meanwhile.  (But it
> does suggest that hole-punching is more difficult to get completely
> right than people think at first.)

Ok, maybe I did not express myself very well, sorry. I meant to say
that xfs will unmap all mapped pages in the file from the start of the
hole to the end of the file.

> 
> > there seems to be a bug in ocfs2 where we can hit BUG_ON when the
> > cluster size < page size.
> > 
> > What do you reckon ?
> 
> I agree that you need invalidatepage_range for truncate_inode_page_range
> to drop its end alignment restriction.  But now that we have to add a
> method, I think it would be more convincing justification to have two
> filesystems converted to make use of it, than just the one ext4.

Ok, I'll do this and try to see what I can do with some other file
systems as well.

Thanks!
-Lukas

> 
> Hugh
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges
  2012-07-19  7:15             ` Lukáš Czerner
@ 2012-07-19 23:07               ` Dave Chinner
  0 siblings, 0 replies; 31+ messages in thread
From: Dave Chinner @ 2012-07-19 23:07 UTC (permalink / raw)
  To: Lukáš Czerner
  Cc: Hugh Dickins, Christoph Hellwig, Andrew Morton,
	Theodore Ts'o, linux-ext4, linux-fsdevel, achender

On Thu, Jul 19, 2012 at 09:15:09AM +0200, Lukáš Czerner wrote:
> On Wed, 18 Jul 2012, Hugh Dickins wrote:
> 
> > Date: Wed, 18 Jul 2012 12:36:39 -0700 (PDT)
> > From: Hugh Dickins <hughd@google.com>
> > To: Lukas Czerner <lczerner@redhat.com>
> > Cc: Christoph Hellwig <hch@infradead.org>,
> >     Andrew Morton <akpm@linux-foundation.org>, Theodore Ts'o <tytso@mit.edu>,
> >     Dave Chinner <dchinner@redhat.com>, linux-ext4@vger.kernel.org,
> >     linux-fsdevel@vger.kernel.org, achender@linux.vnet.ibm.com
> > Subject: Re: [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle
> >      non page aligned ranges
> > 
> > On Wed, 18 Jul 2012, Lukas Czerner wrote:
> > > On Tue, 17 Jul 2012, Lukas Czerner wrote:
> > > > 
> > > > My bad, it definitely is not safe without the end offset argument in
> > > > invalidatepage() aops ..sigh..
> > > 
> > > So what about having new aop invalidatepage_range and using that in
> > > the truncate_inode_pages_range(). We can still BUG_ON if the file
> > > system register invalidatepage, but not invalidatepage_range,
> > > when the range to truncate is not page aligned at the end.
> > 
> > I had some trouble parsing what you wrote, and have slightly adjusted
> > it (mainly adding a comma) to fit my understanding: shout at me if I'm
> > misrepresenting you!
> > 
> > Yes, I think that's what has to be done.  It's irritating to have two
> > methods doing the same job, but not nearly so irritating as having to
> > change core and all filesystems at the same time.  Then at some future
> > date there can be a cleanup to remove the old invalidatepage method.
> 
> Agreed!
> 
> > 
> > > 
> > > I am sure more file system than just ext4 can take advantage of
> > > this. Currently only ext4, xfs and ocfs2 support punch hole and I
> > > think that all of them can use truncate_inode_pages_range() which
> > > handles unaligned ranges.
> > 
> > I expect that they can, but I'm far from sure of it: each filesystem
> > will have its own needs and difficulties, which might delay them from
> > a quick switchover to invalidatepage_range.
> > 
> > > 
> > > Currently ext4 has it's own overcomplicated method of freeing and
> > > zeroing unaligned ranges.
> > 
> > You're best placed to judge if its overcomplicated, I've not looked.
> > 
> > > Xfs seems just truncate the whole file and
> > 
> > I doubt that can be the case: how would it ever pass testing with
> > the hole-punching fsx if so?  But it is the case that xfs unmaps
> > all the pages from hole onwards, in the exceptional case where the
> > punched file is currently mmap'ed into userspace; and that is wrong,
> > and will get fixed, but it's not a huge big deal meanwhile.  (But it
> > does suggest that hole-punching is more difficult to get completely
> > right than people think at first.)
> 
> Ok, maybe I did not express myself very well, sorry. I meant to say
> that xfs will unmap all mapped pages in the file from start of the
> hole to the end of the file.

It might do that right now, but that's no guarantee that we won't
change it in future. Indeed, we've been considering changing all the
toss/inval page calls to just the required range for a few years,
but never got around to doing it because we never really
understood how the VM would handle it....

Likewise, those wrappers in fs/xfs/xfs_fs_subr.c need to go away, and
we've been considering that for just as long. It's never happened
because of the above.

If the VM can handle ranged toss/inval regions correctly, then we
can make those changes without concern about introducing data
integrity regressions....
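
(To make that concrete -- this is only a sketch of the kind of call
site change I mean, not actual XFS code -- a ranged toss would be
something like:

	/*
	 * Toss only the pages covering [start, start + len - 1] instead
	 * of everything from 'start' to the end of the file.
	 */
	truncate_inode_pages_range(inode->i_mapping, start,
				   start + len - 1);

once the unaligned-end handling is in place.)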

Cheers,

Dave.
-- 
Dave Chinner
dchinner@redhat.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2012-07-19 23:07 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-13 13:19 [PATCH 01/12 v2] Revert "ext4: remove no longer used functions in inode.c" Lukas Czerner
2012-07-13 13:19 ` [PATCH 02/12 v2] Revert "ext4: fix fsx truncate failure" Lukas Czerner
2012-07-13 17:42   ` Theodore Ts'o
2012-07-14  7:45     ` Christoph Hellwig
2012-07-16  7:35     ` Lukáš Czerner
2012-07-16 21:41       ` Hugh Dickins
2012-07-17  7:53         ` Lukáš Czerner
2012-07-18 19:34   ` Eric Sandeen
2012-07-19  6:45     ` Lukáš Czerner
2012-07-13 13:19 ` [PATCH 03/12 v2] shmem: pass LLONG_MAX to shmem_truncate_range Lukas Czerner
2012-07-18 19:54   ` Eric Sandeen
2012-07-19  6:40     ` Lukáš Czerner
2012-07-13 13:19 ` [PATCH 04/12 v2] xfs: pass LLONG_MAX to truncate_inode_pages_range Lukas Czerner
2012-07-15 23:11   ` Dave Chinner
2012-07-16  7:13     ` Lukáš Czerner
2012-07-16 11:52       ` Lukáš Czerner
2012-07-13 13:19 ` [PATCH 05/12 v2] mm: " Lukas Czerner
2012-07-13 13:19 ` [PATCH 06/12 v2] mm: teach truncate_inode_pages_range() to handle non page aligned ranges Lukas Czerner
2012-07-17  8:28   ` Hugh Dickins
2012-07-17 11:57     ` Lukáš Czerner
2012-07-17 12:16       ` Lukáš Czerner
2012-07-18  8:18         ` Lukáš Czerner
2012-07-18 19:36           ` Hugh Dickins
2012-07-19  7:15             ` Lukáš Czerner
2012-07-19 23:07               ` Dave Chinner
2012-07-13 13:19 ` [PATCH 07/12 v2] ext4: use ext4_zero_partial_blocks in punch_hole Lukas Czerner
2012-07-13 13:19 ` [PATCH 08/12 v2] ext4: remove unused discard_partial_page_buffers Lukas Czerner
2012-07-13 13:19 ` [PATCH 09/12 v2] ext4: remove unused code from ext4_remove_blocks() Lukas Czerner
2012-07-13 13:19 ` [PATCH 10/12 v2] ext4: update ext4_ext_remove_space trace point Lukas Czerner
2012-07-13 13:19 ` [PATCH 11/12 v2] ext4: make punch hole code path work with bigalloc Lukas Czerner
2012-07-13 13:19 ` [PATCH 12/12 v2] ext4: Allow punch hole with bigalloc enabled Lukas Czerner
