* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-03 21:32 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-03 21:32 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
Here's the second version of my corruption fix. It fixes two
bugs:
1) i_size can obviously be at a place that is a hole, so don't BUG on
that.
2) Fix an off-by-one when checking whether the write position is within
the tail allocation.
This version passes my tail corruption test as well as the kernel
compile that exposed the two bugs above.
Joel
---------------------------------------------------------------
ocfs2's allocation unit is the cluster. This can be larger than a block
or even a memory page. This means that a file may have many blocks in
its last extent that are beyond the block containing i_size.
When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks. Unfortunately,
block_write_full_page() drops the pages past i_size. This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster.
We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size. If there is any existing
allocation between the block containing the current i_size and the
location of the write or truncate, zeros will be written to that
allocation.
This is only for sparse filesystems. Non-sparse filesystems already get
this via ocfs2_extend_no_holes().
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 22 ++++----
fs/ocfs2/file.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++------
fs/ocfs2/file.h | 2 +
3 files changed, 150 insertions(+), 28 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca2..96e6aeb 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
-
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
-
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
}
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
+
bail:
if (err < 0)
err = -EIO;
@@ -1625,11 +1624,9 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
@@ -1679,7 +1676,10 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea6..7fca78d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -848,6 +848,137 @@ out:
return ret;
}
+/*
+ * This function is a helper for ocfs2_zero_tail(). It calculates
+ * what blocks need zeroing and does any CoW necessary.
+ */
+static int ocfs2_zero_tail_prepare(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, u64 *start_blkno,
+ u64 *blocks)
+{
+ int rc = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u32 tail_cpos, pos_cpos, p_cpos;
+ u64 tail_blkno, pos_blkno, blocks_to_zero;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ /*
+ * The block containing i_size has already been zeroed, so our tail
+ * block is the first block after i_size. The block containing
+ * pos will be zeroed. So we only need to do anything if
+ * tail_blkno is before pos_blkno.
+ */
+ tail_blkno = (i_size_read(inode) >> inode->i_sb->s_blocksize_bits) + 1;
+ pos_blkno = pos >> inode->i_sb->s_blocksize_bits;
+ mlog(0, "tail_blkno = %llu, pos_blkno = %llu\n",
+ (unsigned long long)tail_blkno, (unsigned long long)pos_blkno);
+ if (pos_blkno <= tail_blkno)
+ goto out;
+ blocks_to_zero = pos_blkno - tail_blkno;
+
+ /*
+ * If tail_blkno is in the cluster past i_size, we don't need
+ * to touch the cluster containing i_size@all.
+ */
+ tail_cpos = i_size_read(inode) >> osb->s_clustersize_bits;
+ if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno) > tail_cpos)
+ tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ tail_blkno);
+
+ rc = ocfs2_get_clusters(inode, tail_cpos, &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ /* Is there a cluster to zero? */
+ if (!p_cpos)
+ goto out;
+
+ pos_cpos = pos >> osb->s_clustersize_bits;
+ mlog(0, "tail_cpos = %u, num_clusters = %u, pos_cpos = %u, tail_blkno = %llu, pos_blkno = %llu\n",
+ (unsigned int)tail_cpos, (unsigned int)num_clusters,
+ (unsigned int)pos_cpos, (unsigned long long)tail_blkno,
+ (unsigned long long)pos_blkno);
+ if ((tail_cpos + num_clusters) > pos_cpos) {
+ num_clusters = pos_cpos - tail_cpos;
+ if (pos_blkno >
+ ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
+ num_clusters += 1;
+ } else {
+ blocks_to_zero =
+ ocfs2_clusters_to_blocks(inode->i_sb,
+ tail_cpos + num_clusters);
+ blocks_to_zero -= tail_blkno;
+ }
+
+ /* Now CoW the clusters we're about to zero */
+ if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+ rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
+ num_clusters, UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ *start_blkno = tail_blkno;
+ *blocks = blocks_to_zero;
+ mlog(0, "start_blkno = %llu, blocks = %llu\n",
+ (unsigned long long)(*start_blkno),
+ (unsigned long long)(*blocks));
+
+out:
+ return rc;
+}
+
+/*
+ * This function only does work for sparse filesystems.
+ * ocfs2_extend_no_holes() will do the same work for non-sparse * files.
+ *
+ * If the last extent of the file has blocks beyond i_size, we must zero
+ * them before we can grow i_size to cover them. Specifically, any
+ * allocation between the block containing the current i_size and the block
+ * containing pos must be zeroed.
+ */
+int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int rc = 0;
+ u64 tail_blkno = 0, blocks_to_zero = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+
+ rc = ocfs2_zero_tail_prepare(inode, di_bh, pos, &tail_blkno,
+ &blocks_to_zero);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (!blocks_to_zero)
+ goto out;
+
+ mlog(0, "i_size = %llu, tail_blkno = %llu, blocks_to_zero = %llu, pos = %llu, zero_to = %llu\n",
+ (unsigned long long)i_size_read(inode),
+ (unsigned long long)tail_blkno,
+ (unsigned long long)blocks_to_zero,
+ (unsigned long long)pos,
+ (unsigned long long)((tail_blkno + blocks_to_zero) <<
+ inode->i_sb->s_blocksize_bits));
+ rc = ocfs2_zero_extend(inode,
+ (tail_blkno + blocks_to_zero) <<
+ inode->i_sb->s_blocksize_bits);
+ if (rc)
+ mlog_errno(rc);
+
+out:
+ return rc;
+}
+
static int ocfs2_extend_file(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size)
@@ -862,27 +993,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);
@@ -899,13 +1018,14 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, new_i_size);
+ else
ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
up_write(&oi->ip_alloc_sem);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f..7493d97 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -56,6 +56,8 @@ int ocfs2_simple_size_update(struct inode *inode,
u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
u64 zero_to);
+int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
--
1.7.1
--
"The lawgiver, of all beings, most owes the law allegiance. He of all
men should behave as though the law compelled him. But it is the
universal weakness of mankind that what we are given to administer we
presently imagine we own."
- H.G. Wells
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
2010-07-03 21:32 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-03 21:33 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-03 21:33 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
Here's the second patch, the one that keeps us from zeroing
pages past i_size. This should keep ocfs2 and Dave's writeback patch
happy.
Joel
-------------------------------------------------------
When ocfs2 fills a hole, it does so by allocating clusters. When a
cluster is larger than the write, ocfs2 must zero the portions of the
cluster outside of the write. If the clustersize is smaller than a
pagecache page, this is handled by the normal pagecache mechanisms, but
when the clustersize is larger than a page, ocfs2's write code will zero
the pages adjacent to the write. This makes sure the entire cluster is
zeroed correctly.
Currently ocfs2 behaves exactly the same when writing past i_size.
However, this means ocfs2 is writing zeroed pages for portions of a new
cluster that are beyond i_size. The page writeback code isn't expecting
this. It treats all pages past the one containing i_size as left behind
due to a previous truncate operation.
Thankfully, ocfs2 calculates the number of pages it will be working on
up front. The rest of the write code merely honors the original
calculation. We can simply trim the number of pages to only cover the
actual file data.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 15 +++++++++++----
1 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 96e6aeb..e90ad74 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1130,11 +1130,12 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
target_index = user_pos >> PAGE_CACHE_SHIFT;
@@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole clusters worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /* This is the index *past* the write */
+ end_index = ((user_pos + user_len) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
@@ -1789,7 +1796,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
--
1.7.1
--
"Vote early and vote often."
- Al Capone
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
@ 2010-07-03 21:33 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-03 21:33 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
Here's the second patch, the one that keeps us from zeroing
pages past i_size. This should keep ocfs2 and Dave's writeback patch
happy.
Joel
-------------------------------------------------------
When ocfs2 fills a hole, it does so by allocating clusters. When a
cluster is larger than the write, ocfs2 must zero the portions of the
cluster outside of the write. If the clustersize is smaller than a
pagecache page, this is handled by the normal pagecache mechanisms, but
when the clustersize is larger than a page, ocfs2's write code will zero
the pages adjacent to the write. This makes sure the entire cluster is
zeroed correctly.
Currently ocfs2 behaves exactly the same when writing past i_size.
However, this means ocfs2 is writing zeroed pages for portions of a new
cluster that are beyond i_size. The page writeback code isn't expecting
this. It treats all pages past the one containing i_size as left behind
due to a previous truncate operation.
Thankfully, ocfs2 calculates the number of pages it will be working on
up front. The rest of the write code merely honors the original
calculation. We can simply trim the number of pages to only cover the
actual file data.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 15 +++++++++++----
1 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 96e6aeb..e90ad74 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1130,11 +1130,12 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
target_index = user_pos >> PAGE_CACHE_SHIFT;
@@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole clusters worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /* This is the index *past* the write */
+ end_index = ((user_pos + user_len) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
@@ -1789,7 +1796,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
--
1.7.1
--
"Vote early and vote often."
- Al Capone
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
2010-07-03 21:33 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-04 15:13 ` Tao Ma
-1 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-04 15:13 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/04/2010 05:33 AM, Joel Becker wrote:
> Here's the second patch, the one that keeps us from zeroing
> pages past i_size. This should keep ocfs2 and Dave's writeback patch
> happy.
>
> Joel
>
> -------------------------------------------------------
>
> When ocfs2 fills a hole, it does so by allocating clusters. When a
> cluster is larger than the write, ocfs2 must zero the portions of the
> cluster outside of the write. If the clustersize is smaller than a
> pagecache page, this is handled by the normal pagecache mechanisms, but
> when the clustersize is larger than a page, ocfs2's write code will zero
> the pages adjacent to the write. This makes sure the entire cluster is
> zeroed correctly.
>
> Currently ocfs2 behaves exactly the same when writing past i_size.
> However, this means ocfs2 is writing zeroed pages for portions of a new
> cluster that are beyond i_size. The page writeback code isn't expecting
> this. It treats all pages past the one containing i_size as left behind
> due to a previous truncate operation.
>
> Thankfully, ocfs2 calculates the number of pages it will be working on
> up front. The rest of the write code merely honors the original
> calculation. We can simply trim the number of pages to only cover the
> actual file data.
>
> Signed-off-by: Joel Becker<joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 15 +++++++++++----
> 1 files changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index 96e6aeb..e90ad74 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
<snip>
> @@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
> /*
> * Figure out how many pages we'll be manipulating here. For
> * non allocating write, we just change the one
> - * page. Otherwise, we'll need a whole clusters worth.
> + * page. Otherwise, we'll need a whole clusters worth. If we're
> + * writing past i_size, we only need enough pages to cover the
> + * last page of the write.
The comments for the whole function before the function name also needs
this change accordingly?
> */
> if (new) {
> wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
> start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
> + /* This is the index *past* the write */
> + end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
should it be
end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
> + if ((start + wc->w_num_pages)> end_index)
> + wc->w_num_pages = end_index - start;
I just noticed that the below loop in ocfs2_grab_pages_for_write is
for (i = 0; i < wc->w_num_pages; i++)
I guess w_num_pages should be set to end_index -
start_page_of_the_cluster so that we can make sure we grab all the pages
in this cluster until i_size?
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
@ 2010-07-04 15:13 ` Tao Ma
0 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-04 15:13 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/04/2010 05:33 AM, Joel Becker wrote:
> Here's the second patch, the one that keeps us from zeroing
> pages past i_size. This should keep ocfs2 and Dave's writeback patch
> happy.
>
> Joel
>
> -------------------------------------------------------
>
> When ocfs2 fills a hole, it does so by allocating clusters. When a
> cluster is larger than the write, ocfs2 must zero the portions of the
> cluster outside of the write. If the clustersize is smaller than a
> pagecache page, this is handled by the normal pagecache mechanisms, but
> when the clustersize is larger than a page, ocfs2's write code will zero
> the pages adjacent to the write. This makes sure the entire cluster is
> zeroed correctly.
>
> Currently ocfs2 behaves exactly the same when writing past i_size.
> However, this means ocfs2 is writing zeroed pages for portions of a new
> cluster that are beyond i_size. The page writeback code isn't expecting
> this. It treats all pages past the one containing i_size as left behind
> due to a previous truncate operation.
>
> Thankfully, ocfs2 calculates the number of pages it will be working on
> up front. The rest of the write code merely honors the original
> calculation. We can simply trim the number of pages to only cover the
> actual file data.
>
> Signed-off-by: Joel Becker<joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 15 +++++++++++----
> 1 files changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index 96e6aeb..e90ad74 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
<snip>
> @@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
> /*
> * Figure out how many pages we'll be manipulating here. For
> * non allocating write, we just change the one
> - * page. Otherwise, we'll need a whole clusters worth.
> + * page. Otherwise, we'll need a whole clusters worth. If we're
> + * writing past i_size, we only need enough pages to cover the
> + * last page of the write.
The comments for the whole function before the function name also needs
this change accordingly?
> */
> if (new) {
> wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
> start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
> + /* This is the index *past* the write */
> + end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
should it be
end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
> + if ((start + wc->w_num_pages)> end_index)
> + wc->w_num_pages = end_index - start;
I just noticed that the below loop in ocfs2_grab_pages_for_write is
for (i = 0; i < wc->w_num_pages; i++)
I guess w_num_pages should be set to end_index -
start_page_of_the_cluster so that we can make sure we grab all the pages
in this cluster until i_size?
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
2010-07-04 15:13 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-05 1:38 ` Tao Ma
-1 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-05 1:38 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/04/2010 11:13 PM, Tao Ma wrote:
> Hi Joel,
>
> On 07/04/2010 05:33 AM, Joel Becker wrote:
>> Here's the second patch, the one that keeps us from zeroing
>> pages past i_size. This should keep ocfs2 and Dave's writeback patch
>> happy.
>>
>> Joel
>>
>> -------------------------------------------------------
>>
>> When ocfs2 fills a hole, it does so by allocating clusters. When a
>> cluster is larger than the write, ocfs2 must zero the portions of the
>> cluster outside of the write. If the clustersize is smaller than a
>> pagecache page, this is handled by the normal pagecache mechanisms, but
>> when the clustersize is larger than a page, ocfs2's write code will zero
>> the pages adjacent to the write. This makes sure the entire cluster is
>> zeroed correctly.
>>
>> Currently ocfs2 behaves exactly the same when writing past i_size.
>> However, this means ocfs2 is writing zeroed pages for portions of a new
>> cluster that are beyond i_size. The page writeback code isn't expecting
>> this. It treats all pages past the one containing i_size as left behind
>> due to a previous truncate operation.
>>
>> Thankfully, ocfs2 calculates the number of pages it will be working on
>> up front. The rest of the write code merely honors the original
>> calculation. We can simply trim the number of pages to only cover the
>> actual file data.
>>
>> Signed-off-by: Joel Becker<joel.becker@oracle.com>
>> ---
>> fs/ocfs2/aops.c | 15 +++++++++++----
>> 1 files changed, 11 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
>> index 96e6aeb..e90ad74 100644
>> --- a/fs/ocfs2/aops.c
>> +++ b/fs/ocfs2/aops.c
> <snip>
>> @@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct
>> address_space *mapping,
>> /*
>> * Figure out how many pages we'll be manipulating here. For
>> * non allocating write, we just change the one
>> - * page. Otherwise, we'll need a whole clusters worth.
>> + * page. Otherwise, we'll need a whole clusters worth. If we're
>> + * writing past i_size, we only need enough pages to cover the
>> + * last page of the write.
> The comments for the whole function before the function name also needs
> this change accordingly?
>> */
>> if (new) {
>> wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
>> start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
>> + /* This is the index *past* the write */
>> + end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
> should it be
> end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
>
>
>> + if ((start + wc->w_num_pages)> end_index)
>> + wc->w_num_pages = end_index - start;
> I just noticed that the below loop in ocfs2_grab_pages_for_write is
> for (i = 0; i < wc->w_num_pages; i++)
>
> I guess w_num_pages should be set to end_index -
> start_page_of_the_cluster so that we can make sure we grab all the pages
> in this cluster until i_size?
oh, start is set to that value, sorry for this bit.
btw, do we ever have a chance that start + wc->w_num_pages > end_index?
I can't find it.
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
@ 2010-07-05 1:38 ` Tao Ma
0 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-05 1:38 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/04/2010 11:13 PM, Tao Ma wrote:
> Hi Joel,
>
> On 07/04/2010 05:33 AM, Joel Becker wrote:
>> Here's the second patch, the one that keeps us from zeroing
>> pages past i_size. This should keep ocfs2 and Dave's writeback patch
>> happy.
>>
>> Joel
>>
>> -------------------------------------------------------
>>
>> When ocfs2 fills a hole, it does so by allocating clusters. When a
>> cluster is larger than the write, ocfs2 must zero the portions of the
>> cluster outside of the write. If the clustersize is smaller than a
>> pagecache page, this is handled by the normal pagecache mechanisms, but
>> when the clustersize is larger than a page, ocfs2's write code will zero
>> the pages adjacent to the write. This makes sure the entire cluster is
>> zeroed correctly.
>>
>> Currently ocfs2 behaves exactly the same when writing past i_size.
>> However, this means ocfs2 is writing zeroed pages for portions of a new
>> cluster that are beyond i_size. The page writeback code isn't expecting
>> this. It treats all pages past the one containing i_size as left behind
>> due to a previous truncate operation.
>>
>> Thankfully, ocfs2 calculates the number of pages it will be working on
>> up front. The rest of the write code merely honors the original
>> calculation. We can simply trim the number of pages to only cover the
>> actual file data.
>>
>> Signed-off-by: Joel Becker<joel.becker@oracle.com>
>> ---
>> fs/ocfs2/aops.c | 15 +++++++++++----
>> 1 files changed, 11 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
>> index 96e6aeb..e90ad74 100644
>> --- a/fs/ocfs2/aops.c
>> +++ b/fs/ocfs2/aops.c
> <snip>
>> @@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct
>> address_space *mapping,
>> /*
>> * Figure out how many pages we'll be manipulating here. For
>> * non allocating write, we just change the one
>> - * page. Otherwise, we'll need a whole clusters worth.
>> + * page. Otherwise, we'll need a whole clusters worth. If we're
>> + * writing past i_size, we only need enough pages to cover the
>> + * last page of the write.
> The comments for the whole function before the function name also needs
> this change accordingly?
>> */
>> if (new) {
>> wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
>> start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
>> + /* This is the index *past* the write */
>> + end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
> should it be
> end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
>
>
>> + if ((start + wc->w_num_pages)> end_index)
>> + wc->w_num_pages = end_index - start;
> I just noticed that the below loop in ocfs2_grab_pages_for_write is
> for (i = 0; i < wc->w_num_pages; i++)
>
> I guess w_num_pages should be set to end_index -
> start_page_of_the_cluster so that we can make sure we grab all the pages
> in this cluster until i_size?
oh, start is set to that value, sorry for this bit.
btw, do we ever have a chance that start + wc->w_num_pages > end_index?
I can't find it.
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
2010-07-05 1:38 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-06 7:10 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 7:10 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 09:38:42AM +0800, Tao Ma wrote:
> btw, do we ever have a chance that start + wc->w_num_pages >
> end_index? I can't find it.
Of course. If you have a 1MB clustersize, w_num_pages will be
256. But if you are only writing the first page of the cluster,
end_index is only 1.
Joel
--
"If you took all of the grains of sand in the world, and lined
them up end to end in a row, you'd be working for the government!"
- Mr. Interesting
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
@ 2010-07-06 7:10 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 7:10 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 09:38:42AM +0800, Tao Ma wrote:
> btw, do we ever have a chance that start + wc->w_num_pages >
> end_index? I can't find it.
Of course. If you have a 1MB clustersize, w_num_pages will be
256. But if you are only writing the first page of the cluster,
end_index is only 1.
Joel
--
"If you took all of the grains of sand in the world, and lined
them up end to end in a row, you'd be working for the government!"
- Mr. Interesting
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
2010-07-04 15:13 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-06 7:09 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 7:09 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Sun, Jul 04, 2010 at 11:13:01PM +0800, Tao Ma wrote:
> On 07/04/2010 05:33 AM, Joel Becker wrote:
> >@@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
> > /*
> > * Figure out how many pages we'll be manipulating here. For
> > * non allocating write, we just change the one
> >- * page. Otherwise, we'll need a whole clusters worth.
> >+ * page. Otherwise, we'll need a whole clusters worth. If we're
> >+ * writing past i_size, we only need enough pages to cover the
> >+ * last page of the write.
> The comments for the whole function before the function name also
> needs this change accordingly?
Not really. That comment set a limit, this comment is more
detailed.
> > if (new) {
> > wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
> > start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
> >+ /* This is the index *past* the write */
> >+ end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
> should it be
> end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
Maybe. Gotta think about it and test.
Joel
--
Life's Little Instruction Book #232
"Keep your promises."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
@ 2010-07-06 7:09 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 7:09 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Sun, Jul 04, 2010 at 11:13:01PM +0800, Tao Ma wrote:
> On 07/04/2010 05:33 AM, Joel Becker wrote:
> >@@ -1142,11 +1143,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
> > /*
> > * Figure out how many pages we'll be manipulating here. For
> > * non allocating write, we just change the one
> >- * page. Otherwise, we'll need a whole clusters worth.
> >+ * page. Otherwise, we'll need a whole clusters worth. If we're
> >+ * writing past i_size, we only need enough pages to cover the
> >+ * last page of the write.
> The comments for the whole function before the function name also
> needs this change accordingly?
Not really. That comment set a limit, this comment is more
detailed.
> > if (new) {
> > wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
> > start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
> >+ /* This is the index *past* the write */
> >+ end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
> should it be
> end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
Maybe. Gotta think about it and test.
Joel
--
Life's Little Instruction Book #232
"Keep your promises."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [Ocfs2-devel] [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
2010-07-06 7:09 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-06 18:39 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 18:39 UTC (permalink / raw)
To: Tao Ma, Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Tue, Jul 06, 2010 at 12:09:19AM -0700, Joel Becker wrote:
> On Sun, Jul 04, 2010 at 11:13:01PM +0800, Tao Ma wrote:
> > > if (new) {
> > > wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
> > > start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
> > >+ /* This is the index *past* the write */
> > >+ end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
> > should it be
> > end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
>
> Maybe. Gotta think about it and test.
I think you're right. Since there are other changes too, I'm
going to add this in and test it.
Joel
--
"I'm drifting and drifting
Just like a ship out on the sea.
Cause I ain't got nobody, baby,
In this world to care for me."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 2/2] ocfs2: No need to zero pages past i_size. i_size v2
@ 2010-07-06 18:39 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 18:39 UTC (permalink / raw)
To: Tao Ma, Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Tue, Jul 06, 2010 at 12:09:19AM -0700, Joel Becker wrote:
> On Sun, Jul 04, 2010 at 11:13:01PM +0800, Tao Ma wrote:
> > > if (new) {
> > > wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
> > > start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
> > >+ /* This is the index *past* the write */
> > >+ end_index = ((user_pos + user_len)>> PAGE_CACHE_SHIFT) + 1;
> > should it be
> > end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1?
>
> Maybe. Gotta think about it and test.
I think you're right. Since there are other changes too, I'm
going to add this in and test it.
Joel
--
"I'm drifting and drifting
Just like a ship out on the sea.
Cause I ain't got nobody, baby,
In this world to care for me."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-03 21:32 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-05 3:51 ` Tao Ma
-1 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-05 3:51 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/04/2010 05:32 AM, Joel Becker wrote:
> Here's the second version of my corruption fix. It fixes two
> bugs:
>
> 1) i_size can obviously be at a place that is a hole, so don't BUG on
> that.
> 2) Fix an off-by-one when checking whether the write position is within
> the tail allocation.
>
> This version passes my tail corruption test as well as the kernel
> compile that exposed the two bugs above.
>
> Joel
>
> ---------------------------------------------------------------
>
> ocfs2's allocation unit is the cluster. This can be larger than a block
> or even a memory page. This means that a file may have many blocks in
> its last extent that are beyond the block containing i_size.
>
> When ocfs2 grows a file, it zeros the entire cluster in order to ensure
> future i_size growth will see cleared blocks. Unfortunately,
> block_write_full_page() drops the pages past i_size. This means that
> ocfs2 is actually leaking garbage data into the tail end of that last
> cluster.
>
> We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
> when a write or truncate is past i_size. If there is any existing
> allocation between the block containing the current i_size and the
> location of the write or truncate, zeros will be written to that
> allocation.
>
> This is only for sparse filesystems. Non-sparse filesystems already get
> this via ocfs2_extend_no_holes().
>
> Signed-off-by: Joel Becker<joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 22 ++++----
> fs/ocfs2/file.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++------
> fs/ocfs2/file.h | 2 +
> 3 files changed, 150 insertions(+), 28 deletions(-)
>
<snip>
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 6a13ea6..7fca78d 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -848,6 +848,137 @@ out:
> return ret;
> }
>
> +/*
> + * This function is a helper for ocfs2_zero_tail(). It calculates
> + * what blocks need zeroing and does any CoW necessary.
> + */
> +static int ocfs2_zero_tail_prepare(struct inode *inode,
> + struct buffer_head *di_bh,
> + loff_t pos, u64 *start_blkno,
> + u64 *blocks)
> +{
> + int rc = 0;
> + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> + u32 tail_cpos, pos_cpos, p_cpos;
> + u64 tail_blkno, pos_blkno, blocks_to_zero;
> + unsigned int num_clusters = 0;
> + unsigned int ext_flags = 0;
> +
> + /*
> + * The block containing i_size has already been zeroed, so our tail
> + * block is the first block after i_size. The block containing
> + * pos will be zeroed. So we only need to do anything if
> + * tail_blkno is before pos_blkno.
> + */
> + tail_blkno = (i_size_read(inode)>> inode->i_sb->s_blocksize_bits) + 1;
> + pos_blkno = pos>> inode->i_sb->s_blocksize_bits;
> + mlog(0, "tail_blkno = %llu, pos_blkno = %llu\n",
> + (unsigned long long)tail_blkno, (unsigned long long)pos_blkno);
> + if (pos_blkno<= tail_blkno)
> + goto out;
> + blocks_to_zero = pos_blkno - tail_blkno;
> +
> + /*
> + * If tail_blkno is in the cluster past i_size, we don't need
> + * to touch the cluster containing i_size at all.
> + */
> + tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
> + if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
> + tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
> + tail_blkno);
Can we always set tail_cpos in one line?
tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
tail_cpos is either the same cluster as i_size or the next cluster and
both works for tail_blkno I guess?
> +
> + rc = ocfs2_get_clusters(inode, tail_cpos,&p_cpos,&num_clusters,
> + &ext_flags);
> + if (rc) {
> + mlog_errno(rc);
> + goto out;
> + }
> +
> + /* Is there a cluster to zero? */
> + if (!p_cpos)
> + goto out;
For unwritten extent, we also need to clear the pages? If yes, the
solution doesn't complete if we have 2 unwritten extent, one contains
i_size while one passes i_size. Here we only clear the pages for the 1st
unwritten extent and leave the 2nd one untouched.
> +
> + pos_cpos = pos>> osb->s_clustersize_bits;
> + mlog(0, "tail_cpos = %u, num_clusters = %u, pos_cpos = %u, tail_blkno = %llu, pos_blkno = %llu\n",
> + (unsigned int)tail_cpos, (unsigned int)num_clusters,
> + (unsigned int)pos_cpos, (unsigned long long)tail_blkno,
> + (unsigned long long)pos_blkno);
From here to the call of CoW is a bit hard to understand. In 'if',
num_clusters is set for CoW and in 'else', blocks_to_zero is set. So it
isn't easy for the reader to tell why these 2 clauses are setting
different values. So how about my code below? It looks more
straightforward I think.
> + if ((tail_cpos + num_clusters)> pos_cpos) {
> + num_clusters = pos_cpos - tail_cpos;
> + if (pos_blkno>
> + ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
> + num_clusters += 1;
> + } else {
> + blocks_to_zero =
> + ocfs2_clusters_to_blocks(inode->i_sb,
> + tail_cpos + num_clusters);
> + blocks_to_zero -= tail_blkno;
> + }
> +
> + /* Now CoW the clusters we're about to zero */
> + if (ext_flags& OCFS2_EXT_REFCOUNTED) {
> + rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
> + num_clusters, UINT_MAX);
> + if (rc) {
> + mlog_errno(rc);
> + goto out;
> + }
> + }
/* Decrease blocks_to_zero if there is some hole after extent */
if (tail_cpos + num_clusters <= pos_cpos) {
blocks_to_zero =
ocfs2_clusters_to_blocks(inode->i_sb,
tail_cpos + num_clusters);
blocks_to_zero -= tail_blkno;
}
/* Now CoW if we have some refcounted clusters. */
if (ext_flags & OCFS2_EXT_REFCOUNTED) {
/*
* We add one more cluster here since it will be
* written shortly and if the pos_blkno isn't aligned
* to the cluster size, we have to zero the blocks
* before it.
*/
if (tail_cpos + num_clusters > pos_cpos)
num_clusters = pos_cpos - tail_cpos + 1;
rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
num_clusters, UINT_MAX);
if (rc) {
mlog_errno(rc);
goto out;
}
}
> +
> + *start_blkno = tail_blkno;
> + *blocks = blocks_to_zero;
> + mlog(0, "start_blkno = %llu, blocks = %llu\n",
> + (unsigned long long)(*start_blkno),
> + (unsigned long long)(*blocks));
> +
> +out:
> + return rc;
> +}
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-05 3:51 ` Tao Ma
0 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-05 3:51 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/04/2010 05:32 AM, Joel Becker wrote:
> Here's the second version of my corruption fix. It fixes two
> bugs:
>
> 1) i_size can obviously be at a place that is a hole, so don't BUG on
> that.
> 2) Fix an off-by-one when checking whether the write position is within
> the tail allocation.
>
> This version passes my tail corruption test as well as the kernel
> compile that exposed the two bugs above.
>
> Joel
>
> ---------------------------------------------------------------
>
> ocfs2's allocation unit is the cluster. This can be larger than a block
> or even a memory page. This means that a file may have many blocks in
> its last extent that are beyond the block containing i_size.
>
> When ocfs2 grows a file, it zeros the entire cluster in order to ensure
> future i_size growth will see cleared blocks. Unfortunately,
> block_write_full_page() drops the pages past i_size. This means that
> ocfs2 is actually leaking garbage data into the tail end of that last
> cluster.
>
> We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
> when a write or truncate is past i_size. If there is any existing
> allocation between the block containing the current i_size and the
> location of the write or truncate, zeros will be written to that
> allocation.
>
> This is only for sparse filesystems. Non-sparse filesystems already get
> this via ocfs2_extend_no_holes().
>
> Signed-off-by: Joel Becker<joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 22 ++++----
> fs/ocfs2/file.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++------
> fs/ocfs2/file.h | 2 +
> 3 files changed, 150 insertions(+), 28 deletions(-)
>
<snip>
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 6a13ea6..7fca78d 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -848,6 +848,137 @@ out:
> return ret;
> }
>
> +/*
> + * This function is a helper for ocfs2_zero_tail(). It calculates
> + * what blocks need zeroing and does any CoW necessary.
> + */
> +static int ocfs2_zero_tail_prepare(struct inode *inode,
> + struct buffer_head *di_bh,
> + loff_t pos, u64 *start_blkno,
> + u64 *blocks)
> +{
> + int rc = 0;
> + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> + u32 tail_cpos, pos_cpos, p_cpos;
> + u64 tail_blkno, pos_blkno, blocks_to_zero;
> + unsigned int num_clusters = 0;
> + unsigned int ext_flags = 0;
> +
> + /*
> + * The block containing i_size has already been zeroed, so our tail
> + * block is the first block after i_size. The block containing
> + * pos will be zeroed. So we only need to do anything if
> + * tail_blkno is before pos_blkno.
> + */
> + tail_blkno = (i_size_read(inode)>> inode->i_sb->s_blocksize_bits) + 1;
> + pos_blkno = pos>> inode->i_sb->s_blocksize_bits;
> + mlog(0, "tail_blkno = %llu, pos_blkno = %llu\n",
> + (unsigned long long)tail_blkno, (unsigned long long)pos_blkno);
> + if (pos_blkno<= tail_blkno)
> + goto out;
> + blocks_to_zero = pos_blkno - tail_blkno;
> +
> + /*
> + * If tail_blkno is in the cluster past i_size, we don't need
> + * to touch the cluster containing i_size at all.
> + */
> + tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
> + if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
> + tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
> + tail_blkno);
Can we always set tail_cpos in one line?
tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
tail_cpos is either the same cluster as i_size or the next cluster and
both works for tail_blkno I guess?
> +
> + rc = ocfs2_get_clusters(inode, tail_cpos,&p_cpos,&num_clusters,
> + &ext_flags);
> + if (rc) {
> + mlog_errno(rc);
> + goto out;
> + }
> +
> + /* Is there a cluster to zero? */
> + if (!p_cpos)
> + goto out;
For unwritten extent, we also need to clear the pages? If yes, the
solution doesn't complete if we have 2 unwritten extent, one contains
i_size while one passes i_size. Here we only clear the pages for the 1st
unwritten extent and leave the 2nd one untouched.
> +
> + pos_cpos = pos>> osb->s_clustersize_bits;
> + mlog(0, "tail_cpos = %u, num_clusters = %u, pos_cpos = %u, tail_blkno = %llu, pos_blkno = %llu\n",
> + (unsigned int)tail_cpos, (unsigned int)num_clusters,
> + (unsigned int)pos_cpos, (unsigned long long)tail_blkno,
> + (unsigned long long)pos_blkno);
From here to the call of CoW is a bit hard to understand. In 'if',
num_clusters is set for CoW and in 'else', blocks_to_zero is set. So it
isn't easy for the reader to tell why these 2 clauses are setting
different values. So how about my code below? It looks more
straightforward I think.
> + if ((tail_cpos + num_clusters)> pos_cpos) {
> + num_clusters = pos_cpos - tail_cpos;
> + if (pos_blkno>
> + ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
> + num_clusters += 1;
> + } else {
> + blocks_to_zero =
> + ocfs2_clusters_to_blocks(inode->i_sb,
> + tail_cpos + num_clusters);
> + blocks_to_zero -= tail_blkno;
> + }
> +
> + /* Now CoW the clusters we're about to zero */
> + if (ext_flags& OCFS2_EXT_REFCOUNTED) {
> + rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
> + num_clusters, UINT_MAX);
> + if (rc) {
> + mlog_errno(rc);
> + goto out;
> + }
> + }
/* Decrease blocks_to_zero if there is some hole after extent */
if (tail_cpos + num_clusters <= pos_cpos) {
blocks_to_zero =
ocfs2_clusters_to_blocks(inode->i_sb,
tail_cpos + num_clusters);
blocks_to_zero -= tail_blkno;
}
/* Now CoW if we have some refcounted clusters. */
if (ext_flags & OCFS2_EXT_REFCOUNTED) {
/*
* We add one more cluster here since it will be
* written shortly and if the pos_blkno isn't aligned
* to the cluster size, we have to zero the blocks
* before it.
*/
if (tail_cpos + num_clusters > pos_cpos)
num_clusters = pos_cpos - tail_cpos + 1;
rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
num_clusters, UINT_MAX);
if (rc) {
mlog_errno(rc);
goto out;
}
}
> +
> + *start_blkno = tail_blkno;
> + *blocks = blocks_to_zero;
> + mlog(0, "start_blkno = %llu, blocks = %llu\n",
> + (unsigned long long)(*start_blkno),
> + (unsigned long long)(*blocks));
> +
> +out:
> + return rc;
> +}
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-05 3:51 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-06 7:17 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 7:17 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
> >+ /*
> >+ * If tail_blkno is in the cluster past i_size, we don't need
> >+ * to touch the cluster containing i_size at all.
> >+ */
> >+ tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
> >+ if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
> >+ tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
> >+ tail_blkno);
> Can we always set tail_cpos in one line?
> tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
> tail_cpos is either the same cluster as i_size or the next cluster
> and both works for tail_blkno I guess?
I had the same thought on Friday, but the current version passes
testing and I was wary of changing that.
> >+ /* Is there a cluster to zero? */
> >+ if (!p_cpos)
> >+ goto out;
> For unwritten extent, we also need to clear the pages? If yes, the
> solution doesn't complete if we have 2 unwritten extent, one
> contains i_size while one passes i_size. Here we only clear the
> pages for the 1st unwritten extent and leave the 2nd one untouched.
We probably don't need to zero unwritten extents. We cannot
have an extent past i_size, can we?
> From here to the call of CoW is a bit hard to understand. In 'if',
> num_clusters is set for CoW and in 'else', blocks_to_zero is set. So
> it isn't easy for the reader to tell why these 2 clauses are setting
> different values. So how about my code below? It looks more
> straightforward I think.
> >+ if ((tail_cpos + num_clusters)> pos_cpos) {
> >+ num_clusters = pos_cpos - tail_cpos;
> >+ if (pos_blkno>
> >+ ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
> >+ num_clusters += 1;
> >+ } else {
> >+ blocks_to_zero =
> >+ ocfs2_clusters_to_blocks(inode->i_sb,
> >+ tail_cpos + num_clusters);
> >+ blocks_to_zero -= tail_blkno;
> >+ }
> >+
> >+ /* Now CoW the clusters we're about to zero */
> >+ if (ext_flags& OCFS2_EXT_REFCOUNTED) {
> >+ rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
> >+ num_clusters, UINT_MAX);
> >+ if (rc) {
> >+ mlog_errno(rc);
> >+ goto out;
> >+ }
> >+ }
> /* Decrease blocks_to_zero if there is some hole after extent */
> if (tail_cpos + num_clusters <= pos_cpos) {
> blocks_to_zero =
> ocfs2_clusters_to_blocks(inode->i_sb,
> tail_cpos + num_clusters);
> blocks_to_zero -= tail_blkno;
> }
Not a bad split-out here.
> /* Now CoW if we have some refcounted clusters. */
> if (ext_flags & OCFS2_EXT_REFCOUNTED) {
> /*
> * We add one more cluster here since it will be
> * written shortly and if the pos_blkno isn't aligned
> * to the cluster size, we have to zero the blocks
> * before it.
> */
> if (tail_cpos + num_clusters > pos_cpos)
> num_clusters = pos_cpos - tail_cpos + 1;
But you dropped the check for pos_blkno alignment.
Unconditionally adding the +1 doesn't seem like a good idea.
Joel
--
"Where are my angels?
Where's my golden one?
And where is my hope
Now that my heroes are gone?"
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-06 7:17 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 7:17 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
> >+ /*
> >+ * If tail_blkno is in the cluster past i_size, we don't need
> >+ * to touch the cluster containing i_size at all.
> >+ */
> >+ tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
> >+ if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
> >+ tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
> >+ tail_blkno);
> Can we always set tail_cpos in one line?
> tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
> tail_cpos is either the same cluster as i_size or the next cluster
> and both works for tail_blkno I guess?
I had the same thought on Friday, but the current version passes
testing and I was wary of changing that.
> >+ /* Is there a cluster to zero? */
> >+ if (!p_cpos)
> >+ goto out;
> For unwritten extent, we also need to clear the pages? If yes, the
> solution doesn't complete if we have 2 unwritten extent, one
> contains i_size while one passes i_size. Here we only clear the
> pages for the 1st unwritten extent and leave the 2nd one untouched.
We probably don't need to zero unwritten extents. We cannot
have an extent past i_size, can we?
> From here to the call of CoW is a bit hard to understand. In 'if',
> num_clusters is set for CoW and in 'else', blocks_to_zero is set. So
> it isn't easy for the reader to tell why these 2 clauses are setting
> different values. So how about my code below? It looks more
> straightforward I think.
> >+ if ((tail_cpos + num_clusters)> pos_cpos) {
> >+ num_clusters = pos_cpos - tail_cpos;
> >+ if (pos_blkno>
> >+ ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
> >+ num_clusters += 1;
> >+ } else {
> >+ blocks_to_zero =
> >+ ocfs2_clusters_to_blocks(inode->i_sb,
> >+ tail_cpos + num_clusters);
> >+ blocks_to_zero -= tail_blkno;
> >+ }
> >+
> >+ /* Now CoW the clusters we're about to zero */
> >+ if (ext_flags& OCFS2_EXT_REFCOUNTED) {
> >+ rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
> >+ num_clusters, UINT_MAX);
> >+ if (rc) {
> >+ mlog_errno(rc);
> >+ goto out;
> >+ }
> >+ }
> /* Decrease blocks_to_zero if there is some hole after extent */
> if (tail_cpos + num_clusters <= pos_cpos) {
> blocks_to_zero =
> ocfs2_clusters_to_blocks(inode->i_sb,
> tail_cpos + num_clusters);
> blocks_to_zero -= tail_blkno;
> }
Not a bad split-out here.
> /* Now CoW if we have some refcounted clusters. */
> if (ext_flags & OCFS2_EXT_REFCOUNTED) {
> /*
> * We add one more cluster here since it will be
> * written shortly and if the pos_blkno isn't aligned
> * to the cluster size, we have to zero the blocks
> * before it.
> */
> if (tail_cpos + num_clusters > pos_cpos)
> num_clusters = pos_cpos - tail_cpos + 1;
But you dropped the check for pos_blkno alignment.
Unconditionally adding the +1 doesn't seem like a good idea.
Joel
--
"Where are my angels?
Where's my golden one?
And where is my hope
Now that my heroes are gone?"
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-06 7:17 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-06 7:54 ` Tao Ma
-1 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-06 7:54 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/06/2010 03:17 PM, Joel Becker wrote:
> On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
>>> + /*
>>> + * If tail_blkno is in the cluster past i_size, we don't need
>>> + * to touch the cluster containing i_size at all.
>>> + */
>>> + tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
>>> + if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
>>> + tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
>>> + tail_blkno);
>> Can we always set tail_cpos in one line?
>> tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
>> tail_cpos is either the same cluster as i_size or the next cluster
>> and both works for tail_blkno I guess?
>
> I had the same thought on Friday, but the current version passes
> testing and I was wary of changing that.
ok, so as you wish.
>
>>> + /* Is there a cluster to zero? */
>>> + if (!p_cpos)
>>> + goto out;
>> For unwritten extent, we also need to clear the pages? If yes, the
>> solution doesn't complete if we have 2 unwritten extent, one
>> contains i_size while one passes i_size. Here we only clear the
>> pages for the 1st unwritten extent and leave the 2nd one untouched.
>
> We probably don't need to zero unwritten extents. We cannot
> have an extent past i_size, can we?
we can. AFAICS, ocfs2_change_file_space will allocate unwritten extents
and does't change i_size.
>
>> From here to the call of CoW is a bit hard to understand. In 'if',
>> num_clusters is set for CoW and in 'else', blocks_to_zero is set. So
>> it isn't easy for the reader to tell why these 2 clauses are setting
>> different values. So how about my code below? It looks more
>> straightforward I think.
>>> + if ((tail_cpos + num_clusters)> pos_cpos) {
>>> + num_clusters = pos_cpos - tail_cpos;
>>> + if (pos_blkno>
>>> + ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
>>> + num_clusters += 1;
>>> + } else {
>>> + blocks_to_zero =
>>> + ocfs2_clusters_to_blocks(inode->i_sb,
>>> + tail_cpos + num_clusters);
>>> + blocks_to_zero -= tail_blkno;
>>> + }
>>> +
>>> + /* Now CoW the clusters we're about to zero */
>>> + if (ext_flags& OCFS2_EXT_REFCOUNTED) {
>>> + rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
>>> + num_clusters, UINT_MAX);
>>> + if (rc) {
>>> + mlog_errno(rc);
>>> + goto out;
>>> + }
>>> + }
>> /* Decrease blocks_to_zero if there is some hole after extent */
>> if (tail_cpos + num_clusters<= pos_cpos) {
>> blocks_to_zero =
>> ocfs2_clusters_to_blocks(inode->i_sb,
>> tail_cpos + num_clusters);
>> blocks_to_zero -= tail_blkno;
>> }
>
> Not a bad split-out here.
>
>> /* Now CoW if we have some refcounted clusters. */
>> if (ext_flags& OCFS2_EXT_REFCOUNTED) {
>> /*
>> * We add one more cluster here since it will be
>> * written shortly and if the pos_blkno isn't aligned
>> * to the cluster size, we have to zero the blocks
>> * before it.
>> */
>> if (tail_cpos + num_clusters> pos_cpos)
>> num_clusters = pos_cpos - tail_cpos + 1;
>
> But you dropped the check for pos_blkno alignment.
> Unconditionally adding the +1 doesn't seem like a good idea.
You can add it as you wish.
I just thought that you add one more extra cluster if pos_blkno isn't
aligned so as to zero blocks in [pos_cpos_start_block, pos_blkno).
But As I said in the comments, you will soon write pos_blkno(it also
needs to be CoW since it is within this refcounted extent), so if we can
CoW it out now, maybe we have a chance to not call ocfs2_refcount_cow later.
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-06 7:54 ` Tao Ma
0 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-06 7:54 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Hi Joel,
On 07/06/2010 03:17 PM, Joel Becker wrote:
> On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
>>> + /*
>>> + * If tail_blkno is in the cluster past i_size, we don't need
>>> + * to touch the cluster containing i_size at all.
>>> + */
>>> + tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
>>> + if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
>>> + tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
>>> + tail_blkno);
>> Can we always set tail_cpos in one line?
>> tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
>> tail_cpos is either the same cluster as i_size or the next cluster
>> and both works for tail_blkno I guess?
>
> I had the same thought on Friday, but the current version passes
> testing and I was wary of changing that.
ok, so as you wish.
>
>>> + /* Is there a cluster to zero? */
>>> + if (!p_cpos)
>>> + goto out;
>> For unwritten extent, we also need to clear the pages? If yes, the
>> solution doesn't complete if we have 2 unwritten extent, one
>> contains i_size while one passes i_size. Here we only clear the
>> pages for the 1st unwritten extent and leave the 2nd one untouched.
>
> We probably don't need to zero unwritten extents. We cannot
> have an extent past i_size, can we?
we can. AFAICS, ocfs2_change_file_space will allocate unwritten extents
and does't change i_size.
>
>> From here to the call of CoW is a bit hard to understand. In 'if',
>> num_clusters is set for CoW and in 'else', blocks_to_zero is set. So
>> it isn't easy for the reader to tell why these 2 clauses are setting
>> different values. So how about my code below? It looks more
>> straightforward I think.
>>> + if ((tail_cpos + num_clusters)> pos_cpos) {
>>> + num_clusters = pos_cpos - tail_cpos;
>>> + if (pos_blkno>
>>> + ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
>>> + num_clusters += 1;
>>> + } else {
>>> + blocks_to_zero =
>>> + ocfs2_clusters_to_blocks(inode->i_sb,
>>> + tail_cpos + num_clusters);
>>> + blocks_to_zero -= tail_blkno;
>>> + }
>>> +
>>> + /* Now CoW the clusters we're about to zero */
>>> + if (ext_flags& OCFS2_EXT_REFCOUNTED) {
>>> + rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
>>> + num_clusters, UINT_MAX);
>>> + if (rc) {
>>> + mlog_errno(rc);
>>> + goto out;
>>> + }
>>> + }
>> /* Decrease blocks_to_zero if there is some hole after extent */
>> if (tail_cpos + num_clusters<= pos_cpos) {
>> blocks_to_zero =
>> ocfs2_clusters_to_blocks(inode->i_sb,
>> tail_cpos + num_clusters);
>> blocks_to_zero -= tail_blkno;
>> }
>
> Not a bad split-out here.
>
>> /* Now CoW if we have some refcounted clusters. */
>> if (ext_flags& OCFS2_EXT_REFCOUNTED) {
>> /*
>> * We add one more cluster here since it will be
>> * written shortly and if the pos_blkno isn't aligned
>> * to the cluster size, we have to zero the blocks
>> * before it.
>> */
>> if (tail_cpos + num_clusters> pos_cpos)
>> num_clusters = pos_cpos - tail_cpos + 1;
>
> But you dropped the check for pos_blkno alignment.
> Unconditionally adding the +1 doesn't seem like a good idea.
You can add it as you wish.
I just thought that you add one more extra cluster if pos_blkno isn't
aligned so as to zero blocks in [pos_cpos_start_block, pos_blkno).
But As I said in the comments, you will soon write pos_blkno(it also
needs to be CoW since it is within this refcounted extent), so if we can
CoW it out now, maybe we have a chance to not call ocfs2_refcount_cow later.
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-06 7:54 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-06 11:58 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 11:58 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Tue, Jul 06, 2010 at 03:54:58PM +0800, Tao Ma wrote:
> On 07/06/2010 03:17 PM, Joel Becker wrote:
> >>>+ /* Is there a cluster to zero? */
> >>>+ if (!p_cpos)
> >>>+ goto out;
> >>For unwritten extent, we also need to clear the pages? If yes, the
> >>solution doesn't complete if we have 2 unwritten extent, one
> >>contains i_size while one passes i_size. Here we only clear the
> >>pages for the 1st unwritten extent and leave the 2nd one untouched.
> >
> > We probably don't need to zero unwritten extents. We cannot
> >have an extent past i_size, can we?
> we can. AFAICS, ocfs2_change_file_space will allocate unwritten
> extents and does't change i_size.
Oh, you're right. We need to walk the entire extent range
between i_size and pos and figure out what needs CoW. This needs to
happen no matter what.
> > But you dropped the check for pos_blkno alignment.
> >Unconditionally adding the +1 doesn't seem like a good idea.
> You can add it as you wish.
> I just thought that you add one more extra cluster if pos_blkno
> isn't aligned so as to zero blocks in [pos_cpos_start_block,
> pos_blkno).
> But As I said in the comments, you will soon write pos_blkno(it also
> needs to be CoW since it is within this refcounted extent), so if we
> can CoW it out now, maybe we have a chance to not call
> ocfs2_refcount_cow later.
I'd much rather let the write handle its own contiguousness. If
we get lucky, that CoW melds with our CoW. If we don't get lucky, isn't
it better to have the newly changed area be fully contiguous rather than
have the first extent of it not be and then the remaining extents be?
Joel
--
Life's Little Instruction Book #3
"Watch a sunrise at least once a year."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-06 11:58 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 11:58 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Tue, Jul 06, 2010 at 03:54:58PM +0800, Tao Ma wrote:
> On 07/06/2010 03:17 PM, Joel Becker wrote:
> >>>+ /* Is there a cluster to zero? */
> >>>+ if (!p_cpos)
> >>>+ goto out;
> >>For unwritten extent, we also need to clear the pages? If yes, the
> >>solution doesn't complete if we have 2 unwritten extent, one
> >>contains i_size while one passes i_size. Here we only clear the
> >>pages for the 1st unwritten extent and leave the 2nd one untouched.
> >
> > We probably don't need to zero unwritten extents. We cannot
> >have an extent past i_size, can we?
> we can. AFAICS, ocfs2_change_file_space will allocate unwritten
> extents and does't change i_size.
Oh, you're right. We need to walk the entire extent range
between i_size and pos and figure out what needs CoW. This needs to
happen no matter what.
> > But you dropped the check for pos_blkno alignment.
> >Unconditionally adding the +1 doesn't seem like a good idea.
> You can add it as you wish.
> I just thought that you add one more extra cluster if pos_blkno
> isn't aligned so as to zero blocks in [pos_cpos_start_block,
> pos_blkno).
> But As I said in the comments, you will soon write pos_blkno(it also
> needs to be CoW since it is within this refcounted extent), so if we
> can CoW it out now, maybe we have a chance to not call
> ocfs2_refcount_cow later.
I'd much rather let the write handle its own contiguousness. If
we get lucky, that CoW melds with our CoW. If we don't get lucky, isn't
it better to have the newly changed area be fully contiguous rather than
have the first extent of it not be and then the remaining extents be?
Joel
--
Life's Little Instruction Book #3
"Watch a sunrise at least once a year."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-06 11:58 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-07 0:42 ` Tao Ma
-1 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-07 0:42 UTC (permalink / raw)
To: Tao Ma, Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Joel Becker wrote:
> On Tue, Jul 06, 2010 at 03:54:58PM +0800, Tao Ma wrote:
>
>> On 07/06/2010 03:17 PM, Joel Becker wrote:
>>
>>>>> + /* Is there a cluster to zero? */
>>>>> + if (!p_cpos)
>>>>> + goto out;
>>>>>
>>>> For unwritten extent, we also need to clear the pages? If yes, the
>>>> solution doesn't complete if we have 2 unwritten extent, one
>>>> contains i_size while one passes i_size. Here we only clear the
>>>> pages for the 1st unwritten extent and leave the 2nd one untouched.
>>>>
>>> We probably don't need to zero unwritten extents. We cannot
>>> have an extent past i_size, can we?
>>>
>> we can. AFAICS, ocfs2_change_file_space will allocate unwritten
>> extents and does't change i_size.
>>
>
> Oh, you're right. We need to walk the entire extent range
> between i_size and pos and figure out what needs CoW. This needs to
> happen no matter what.
>
Actually we can only have unwritten extents after i_size and it
shouldn't hurt you in this case.
So do we really need to CoW all the unwritten extents?
All I want to say is that since they are unwritten, they should also
mean 'zero' for the user space.
So can we just need to skip clearing pages if i_size is in an unwritten
extent?
>
>>> But you dropped the check for pos_blkno alignment.
>>> Unconditionally adding the +1 doesn't seem like a good idea.
>>>
>> You can add it as you wish.
>> I just thought that you add one more extra cluster if pos_blkno
>> isn't aligned so as to zero blocks in [pos_cpos_start_block,
>> pos_blkno).
>> But As I said in the comments, you will soon write pos_blkno(it also
>> needs to be CoW since it is within this refcounted extent), so if we
>> can CoW it out now, maybe we have a chance to not call
>> ocfs2_refcount_cow later.
>>
>
> I'd much rather let the write handle its own contiguousness. If
> we get lucky, that CoW melds with our CoW. If we don't get lucky, isn't
> it better to have the newly changed area be fully contiguous rather than
> have the first extent of it not be and then the remaining extents be?
>
fair enough.
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-07 0:42 ` Tao Ma
0 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-07 0:42 UTC (permalink / raw)
To: Tao Ma, Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh, Joel Becker
Joel Becker wrote:
> On Tue, Jul 06, 2010 at 03:54:58PM +0800, Tao Ma wrote:
>
>> On 07/06/2010 03:17 PM, Joel Becker wrote:
>>
>>>>> + /* Is there a cluster to zero? */
>>>>> + if (!p_cpos)
>>>>> + goto out;
>>>>>
>>>> For unwritten extent, we also need to clear the pages? If yes, the
>>>> solution doesn't complete if we have 2 unwritten extent, one
>>>> contains i_size while one passes i_size. Here we only clear the
>>>> pages for the 1st unwritten extent and leave the 2nd one untouched.
>>>>
>>> We probably don't need to zero unwritten extents. We cannot
>>> have an extent past i_size, can we?
>>>
>> we can. AFAICS, ocfs2_change_file_space will allocate unwritten
>> extents and does't change i_size.
>>
>
> Oh, you're right. We need to walk the entire extent range
> between i_size and pos and figure out what needs CoW. This needs to
> happen no matter what.
>
Actually we can only have unwritten extents after i_size and it
shouldn't hurt you in this case.
So do we really need to CoW all the unwritten extents?
All I want to say is that since they are unwritten, they should also
mean 'zero' for the user space.
So can we just need to skip clearing pages if i_size is in an unwritten
extent?
>
>>> But you dropped the check for pos_blkno alignment.
>>> Unconditionally adding the +1 doesn't seem like a good idea.
>>>
>> You can add it as you wish.
>> I just thought that you add one more extra cluster if pos_blkno
>> isn't aligned so as to zero blocks in [pos_cpos_start_block,
>> pos_blkno).
>> But As I said in the comments, you will soon write pos_blkno(it also
>> needs to be CoW since it is within this refcounted extent), so if we
>> can CoW it out now, maybe we have a chance to not call
>> ocfs2_refcount_cow later.
>>
>
> I'd much rather let the write handle its own contiguousness. If
> we get lucky, that CoW melds with our CoW. If we don't get lucky, isn't
> it better to have the newly changed area be fully contiguous rather than
> have the first extent of it not be and then the remaining extents be?
>
fair enough.
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-07 0:42 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-07 2:03 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 2:03 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Wed, Jul 07, 2010 at 08:42:53AM +0800, Tao Ma wrote:
> > Oh, you're right. We need to walk the entire extent range
> >between i_size and pos and figure out what needs CoW. This needs to
> >happen no matter what.
> Actually we can only have unwritten extents after i_size and it
> shouldn't hurt you in this case.
> So do we really need to CoW all the unwritten extents?
> All I want to say is that since they are unwritten, they should also
> mean 'zero' for the user space.
> So can we just need to skip clearing pages if i_size is in an
> unwritten extent?
We can certainly have unwritten extents in the middle too ;-)
I've just reworked the entire ocfs2_zero_extend() logic to skip
unwritten extents and CoW refcounted ones. We have to CoW for nonsparse
anyway, so we needed this logic. We do need to walk the entire range,
just in case there are extents anywhere between i_size and pos.
Patches coming as soon as it stops breaking.
Joel
--
Life's Little Instruction Book #3
"Watch a sunrise at least once a year."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-07 2:03 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 2:03 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Wed, Jul 07, 2010 at 08:42:53AM +0800, Tao Ma wrote:
> > Oh, you're right. We need to walk the entire extent range
> >between i_size and pos and figure out what needs CoW. This needs to
> >happen no matter what.
> Actually we can only have unwritten extents after i_size and it
> shouldn't hurt you in this case.
> So do we really need to CoW all the unwritten extents?
> All I want to say is that since they are unwritten, they should also
> mean 'zero' for the user space.
> So can we just need to skip clearing pages if i_size is in an
> unwritten extent?
We can certainly have unwritten extents in the middle too ;-)
I've just reworked the entire ocfs2_zero_extend() logic to skip
unwritten extents and CoW refcounted ones. We have to CoW for nonsparse
anyway, so we needed this logic. We do need to walk the entire range,
just in case there are extents anywhere between i_size and pos.
Patches coming as soon as it stops breaking.
Joel
--
Life's Little Instruction Book #3
"Watch a sunrise at least once a year."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-05 3:51 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-06 18:48 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 18:48 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
> On 07/04/2010 05:32 AM, Joel Becker wrote:
> >+ /*
> >+ * If tail_blkno is in the cluster past i_size, we don't need
> >+ * to touch the cluster containing i_size at all.
> >+ */
> >+ tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
> >+ if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
> >+ tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
> >+ tail_blkno);
> Can we always set tail_cpos in one line?
> tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
> tail_cpos is either the same cluster as i_size or the next cluster
> and both works for tail_blkno I guess?
I'm taking this as well.
Joel
--
"Here's something to think about: How come you never see a headline
like ``Psychic Wins Lottery''?"
- Jay Leno
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-06 18:48 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 18:48 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
> On 07/04/2010 05:32 AM, Joel Becker wrote:
> >+ /*
> >+ * If tail_blkno is in the cluster past i_size, we don't need
> >+ * to touch the cluster containing i_size at all.
> >+ */
> >+ tail_cpos = i_size_read(inode)>> osb->s_clustersize_bits;
> >+ if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)> tail_cpos)
> >+ tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
> >+ tail_blkno);
> Can we always set tail_cpos in one line?
> tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno)?
> tail_cpos is either the same cluster as i_size or the next cluster
> and both works for tail_blkno I guess?
I'm taking this as well.
Joel
--
"Here's something to think about: How come you never see a headline
like ``Psychic Wins Lottery''?"
- Jay Leno
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
2010-07-05 3:51 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-06 18:57 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 18:57 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
> On 07/04/2010 05:32 AM, Joel Becker wrote:
> From here to the call of CoW is a bit hard to understand. In 'if',
> num_clusters is set for CoW and in 'else', blocks_to_zero is set. So
> it isn't easy for the reader to tell why these 2 clauses are setting
> different values. So how about my code below? It looks more
> straightforward I think.
I took your cleanup mostly.
Joel
--
"The cynics are right nine times out of ten."
- H. L. Mencken
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: Zero the tail cluster when extending past i_size v2
@ 2010-07-06 18:57 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-06 18:57 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Mon, Jul 05, 2010 at 11:51:44AM +0800, Tao Ma wrote:
> On 07/04/2010 05:32 AM, Joel Becker wrote:
> From here to the call of CoW is a bit hard to understand. In 'if',
> num_clusters is set for CoW and in 'else', blocks_to_zero is set. So
> it isn't easy for the reader to tell why these 2 clauses are setting
> different values. So how about my code below? It looks more
> straightforward I think.
I took your cleanup mostly.
Joel
--
"The cynics are right nine times out of ten."
- H. L. Mencken
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH 0/3] ocfs2: Tail zeroing fixes.
2010-07-03 21:32 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-07 11:16 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
This is version 3 of the ocfs2 tail zeroing fixes. This version
has some major changes. Tao correctly pointed out that we can have
multiple extents past i_size due to unwritten extents. I've reworked
the zeroing code to walk them all. Since I had to do that, and I had to
handle refcounted extents, I end up fixing a refcount bug with
non-sparse extentds.
There are now three patches. The first changes our zeroing
code to go page-by-page at the high level. The second actually changes
the zeroing code. The final patch, limiting zeroing to the end of a
write, is unchanged from v2.
Joel
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 0/3] ocfs2: Tail zeroing fixes.
@ 2010-07-07 11:16 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
This is version 3 of the ocfs2 tail zeroing fixes. This version
has some major changes. Tao correctly pointed out that we can have
multiple extents past i_size due to unwritten extents. I've reworked
the zeroing code to walk them all. Since I had to do that, and I had to
handle refcounted extents, I end up fixing a refcount bug with
non-sparse extentds.
There are now three patches. The first changes our zeroing
code to go page-by-page at the high level. The second actually changes
the zeroing code. The final patch, limiting zeroing to the end of a
write, is unchanged from v2.
Joel
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [Ocfs2-devel] [PATCH 0/3] ocfs2: Tail zeroing fixes.
2010-07-07 11:16 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-12 22:45 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-12 22:45 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Wed, Jul 07, 2010 at 04:16:04AM -0700, Joel Becker wrote:
> This is version 3 of the ocfs2 tail zeroing fixes. This version
> has some major changes. Tao correctly pointed out that we can have
> multiple extents past i_size due to unwritten extents. I've reworked
> the zeroing code to walk them all. Since I had to do that, and I had to
> handle refcounted extents, I end up fixing a refcount bug with
> non-sparse extentds.
> There are now three patches. The first changes our zeroing
> code to go page-by-page at the high level. The second actually changes
> the zeroing code. The final patch, limiting zeroing to the end of a
> write, is unchanged from v2.
Version 4 of these fixes are now in the fixes and linux-next
branches of ocfs2.git. I'm just appending the diff to this file rather
than resending all the patches. They've passed the first round of heavy
testing from our testers, and we're going to keep pounding on them as we
get towards 2.6.35.
Linus, I'm going to let all of the ocfs2 fixes stew in
linux-next for a few days before I send the pull request. Figure it by
the end of the week.
Joel
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9b3381a..356e976 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1107,6 +1107,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
int ret = 0, i;
unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
+ loff_t last_byte;
target_index = user_pos >> PAGE_CACHE_SHIFT;
@@ -1120,8 +1121,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
- /* This is the index *past* the write */
- end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1;
+ /*
+ * We need the index *past* the last page we could possibly
+ * touch. This is the page past the end of the write or
+ * i_size, whichever is greater.
+ */
+ last_byte = max(user_pos + user_len, i_size_read(inode));
+ BUG_ON(last_byte < 1);
+ end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
if ((start + wc->w_num_pages) > end_index)
wc->w_num_pages = end_index - start;
} else {
@@ -1619,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
return ret;
}
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int ret = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+ if (pos > i_size_read(inode))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+ return ret;
+}
+
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
@@ -1655,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_extend(inode, di_bh, pos);
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
else
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
wc);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1fdc45a..ac15911 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -734,7 +734,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
handle_t *handle = NULL;
int ret = 0;
- if (ocfs2_should_order_data(inode))
+ if (!ocfs2_should_order_data(inode))
goto out;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
@@ -771,7 +771,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
unsigned zero_from, zero_to, block_start, block_end;
BUG_ON(abs_from >= abs_to);
- BUG_ON(abs_to > ((index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
page = grab_cache_page(mapping, index);
@@ -793,8 +793,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
index, zero_from, zero_to);
/* We know that zero_from is block aligned */
- for (block_start = zero_from;
- (block_start < PAGE_CACHE_SIZE) && (block_start < zero_to);
+ for (block_start = zero_from; block_start < zero_to;
block_start = block_end) {
block_end = block_start + (1 << inode->i_blkbits);
@@ -966,6 +965,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
struct super_block *sb = inode->i_sb;
zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ mlog(0, "zero_start %llu for i_size %llu\n",
+ (unsigned long long)zero_start,
+ (unsigned long long)i_size_read(inode));
while (zero_start < zero_to_size) {
ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
zero_to_size,
--
"The only way to get rid of a temptation is to yield to it."
- Oscar Wilde
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 0/3] ocfs2: Tail zeroing fixes.
@ 2010-07-12 22:45 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-12 22:45 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Wed, Jul 07, 2010 at 04:16:04AM -0700, Joel Becker wrote:
> This is version 3 of the ocfs2 tail zeroing fixes. This version
> has some major changes. Tao correctly pointed out that we can have
> multiple extents past i_size due to unwritten extents. I've reworked
> the zeroing code to walk them all. Since I had to do that, and I had to
> handle refcounted extents, I end up fixing a refcount bug with
> non-sparse extentds.
> There are now three patches. The first changes our zeroing
> code to go page-by-page at the high level. The second actually changes
> the zeroing code. The final patch, limiting zeroing to the end of a
> write, is unchanged from v2.
Version 4 of these fixes are now in the fixes and linux-next
branches of ocfs2.git. I'm just appending the diff to this file rather
than resending all the patches. They've passed the first round of heavy
testing from our testers, and we're going to keep pounding on them as we
get towards 2.6.35.
Linus, I'm going to let all of the ocfs2 fixes stew in
linux-next for a few days before I send the pull request. Figure it by
the end of the week.
Joel
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9b3381a..356e976 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1107,6 +1107,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
int ret = 0, i;
unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
+ loff_t last_byte;
target_index = user_pos >> PAGE_CACHE_SHIFT;
@@ -1120,8 +1121,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
- /* This is the index *past* the write */
- end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1;
+ /*
+ * We need the index *past* the last page we could possibly
+ * touch. This is the page past the end of the write or
+ * i_size, whichever is greater.
+ */
+ last_byte = max(user_pos + user_len, i_size_read(inode));
+ BUG_ON(last_byte < 1);
+ end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
if ((start + wc->w_num_pages) > end_index)
wc->w_num_pages = end_index - start;
} else {
@@ -1619,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
return ret;
}
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int ret = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+ if (pos > i_size_read(inode))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+ return ret;
+}
+
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
@@ -1655,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_extend(inode, di_bh, pos);
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
else
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
wc);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1fdc45a..ac15911 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -734,7 +734,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
handle_t *handle = NULL;
int ret = 0;
- if (ocfs2_should_order_data(inode))
+ if (!ocfs2_should_order_data(inode))
goto out;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
@@ -771,7 +771,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
unsigned zero_from, zero_to, block_start, block_end;
BUG_ON(abs_from >= abs_to);
- BUG_ON(abs_to > ((index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
page = grab_cache_page(mapping, index);
@@ -793,8 +793,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
index, zero_from, zero_to);
/* We know that zero_from is block aligned */
- for (block_start = zero_from;
- (block_start < PAGE_CACHE_SIZE) && (block_start < zero_to);
+ for (block_start = zero_from; block_start < zero_to;
block_start = block_end) {
block_end = block_start + (1 << inode->i_blkbits);
@@ -966,6 +965,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
struct super_block *sb = inode->i_sb;
zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ mlog(0, "zero_start %llu for i_size %llu\n",
+ (unsigned long long)zero_start,
+ (unsigned long long)i_size_read(inode));
while (zero_start < zero_to_size) {
ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
zero_to_size,
--
"The only way to get rid of a temptation is to yield to it."
- Oscar Wilde
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH 1/3] ocfs2: When zero extending, do it by page.
2010-07-03 21:32 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-07 11:16 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
ocfs2_zero_extend() does its zeroing block by block, but it calls a
function named ocfs2_write_zero_page(). Let's have
ocfs2_write_zero_page() handle the page level. From
ocfs2_zero_extend()'s perspective, it is now page-at-a-time.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 30 --------------
fs/ocfs2/file.c | 119 +++++++++++++++++++++++++++++++++++++++----------------
2 files changed, 85 insertions(+), 64 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca2..9a5c931 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -459,36 +459,6 @@ int walk_page_buffers( handle_t *handle,
return ret;
}
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle;
- int ret = 0;
-
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
- mlog_errno(ret);
- }
-out:
- if (ret) {
- if (!IS_ERR(handle))
- ocfs2_commit_trans(osb, handle);
- handle = ERR_PTR(ret);
- }
- return handle;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea6..a6e0eb6 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
return status;
}
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ if (ocfs2_should_order_data(inode))
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out:
+ if (ret) {
+ if (!IS_ERR(handle))
+ ocfs2_commit_trans(osb, handle);
+ handle = ERR_PTR(ret);
+ }
+ return handle;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
- u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+ u64 abs_to)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index;
- unsigned int offset;
+ unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
handle_t *handle = NULL;
int ret;
+ unsigned zero_from, zero_to, block_start, block_end;
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
+ BUG_ON(abs_from >= abs_to);
+ BUG_ON(abs_to > ((index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_from & (inode->i_blkbits - 1));
page = grab_cache_page(mapping, index);
if (!page) {
@@ -754,31 +781,52 @@ static int ocfs2_write_zero_page(struct inode *inode,
goto out;
}
- ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /* Get the offsets within the page that we want to zero */
+ zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+ zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ if (!zero_to)
+ zero_to = PAGE_CACHE_SIZE;
- if (ocfs2_should_order_data(inode)) {
- handle = ocfs2_start_walk_page_trans(inode, page, offset,
- offset);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ /* We know that zero_from is block aligned */
+ for (block_start = zero_from;
+ (block_start < PAGE_CACHE_SIZE) && (block_start < zero_to);
+ block_start = block_end) {
+ block_end = block_start + (1 << inode->i_blkbits);
+
+ /*
+ * block_start is block-aligned. Bump it by one to
+ * force ocfs2_{prepare,commit}_write() to zero the
+ * whole block.
+ */
+ ret = ocfs2_prepare_write_nolock(inode, page,
+ block_start + 1,
+ block_start + 1);
+ if (ret < 0) {
+ mlog_errno(ret);
goto out_unlock;
}
- }
- /* must not update i_size! */
- ret = block_commit_write(page, offset, offset);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ if (!handle) {
+ handle = ocfs2_zero_start_ordered_transaction(inode);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ break;
+ }
+ }
+
+ /* must not update i_size! */
+ ret = block_commit_write(page, block_start + 1,
+ block_start + 1);
+ if (ret < 0)
+ mlog_errno(ret);
+ else
+ ret = 0;
+ }
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
out_unlock:
unlock_page(page);
page_cache_release(page);
@@ -790,18 +838,21 @@ static int ocfs2_zero_extend(struct inode *inode,
u64 zero_to_size)
{
int ret = 0;
- u64 start_off;
+ u64 start_off, next_off;
struct super_block *sb = inode->i_sb;
start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
while (start_off < zero_to_size) {
- ret = ocfs2_write_zero_page(inode, start_off);
+ next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_off > zero_to_size)
+ next_off = zero_to_size;
+ ret = ocfs2_write_zero_page(inode, start_off, next_off);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- start_off += sb->s_blocksize;
+ start_off = next_off;
/*
* Very large extends have the potential to lock up
--
1.7.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/3] ocfs2: When zero extending, do it by page.
@ 2010-07-07 11:16 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
ocfs2_zero_extend() does its zeroing block by block, but it calls a
function named ocfs2_write_zero_page(). Let's have
ocfs2_write_zero_page() handle the page level. From
ocfs2_zero_extend()'s perspective, it is now page-at-a-time.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 30 --------------
fs/ocfs2/file.c | 119 +++++++++++++++++++++++++++++++++++++++----------------
2 files changed, 85 insertions(+), 64 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca2..9a5c931 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -459,36 +459,6 @@ int walk_page_buffers( handle_t *handle,
return ret;
}
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle;
- int ret = 0;
-
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
- mlog_errno(ret);
- }
-out:
- if (ret) {
- if (!IS_ERR(handle))
- ocfs2_commit_trans(osb, handle);
- handle = ERR_PTR(ret);
- }
- return handle;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea6..a6e0eb6 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
return status;
}
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ if (ocfs2_should_order_data(inode))
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out:
+ if (ret) {
+ if (!IS_ERR(handle))
+ ocfs2_commit_trans(osb, handle);
+ handle = ERR_PTR(ret);
+ }
+ return handle;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
- u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+ u64 abs_to)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index;
- unsigned int offset;
+ unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
handle_t *handle = NULL;
int ret;
+ unsigned zero_from, zero_to, block_start, block_end;
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
+ BUG_ON(abs_from >= abs_to);
+ BUG_ON(abs_to > ((index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_from & (inode->i_blkbits - 1));
page = grab_cache_page(mapping, index);
if (!page) {
@@ -754,31 +781,52 @@ static int ocfs2_write_zero_page(struct inode *inode,
goto out;
}
- ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /* Get the offsets within the page that we want to zero */
+ zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+ zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ if (!zero_to)
+ zero_to = PAGE_CACHE_SIZE;
- if (ocfs2_should_order_data(inode)) {
- handle = ocfs2_start_walk_page_trans(inode, page, offset,
- offset);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ /* We know that zero_from is block aligned */
+ for (block_start = zero_from;
+ (block_start < PAGE_CACHE_SIZE) && (block_start < zero_to);
+ block_start = block_end) {
+ block_end = block_start + (1 << inode->i_blkbits);
+
+ /*
+ * block_start is block-aligned. Bump it by one to
+ * force ocfs2_{prepare,commit}_write() to zero the
+ * whole block.
+ */
+ ret = ocfs2_prepare_write_nolock(inode, page,
+ block_start + 1,
+ block_start + 1);
+ if (ret < 0) {
+ mlog_errno(ret);
goto out_unlock;
}
- }
- /* must not update i_size! */
- ret = block_commit_write(page, offset, offset);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ if (!handle) {
+ handle = ocfs2_zero_start_ordered_transaction(inode);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ break;
+ }
+ }
+
+ /* must not update i_size! */
+ ret = block_commit_write(page, block_start + 1,
+ block_start + 1);
+ if (ret < 0)
+ mlog_errno(ret);
+ else
+ ret = 0;
+ }
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
out_unlock:
unlock_page(page);
page_cache_release(page);
@@ -790,18 +838,21 @@ static int ocfs2_zero_extend(struct inode *inode,
u64 zero_to_size)
{
int ret = 0;
- u64 start_off;
+ u64 start_off, next_off;
struct super_block *sb = inode->i_sb;
start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
while (start_off < zero_to_size) {
- ret = ocfs2_write_zero_page(inode, start_off);
+ next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_off > zero_to_size)
+ next_off = zero_to_size;
+ ret = ocfs2_write_zero_page(inode, start_off, next_off);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- start_off += sb->s_blocksize;
+ start_off = next_off;
/*
* Very large extends have the potential to lock up
--
1.7.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH 1/3] ocfs2: When zero extending, do it by page.
2010-07-07 11:16 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-07 15:19 ` Tao Ma
-1 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-07 15:19 UTC (permalink / raw)
To: Joel Becker
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
Hi Joel,
Joel Becker wrote:
> ocfs2_zero_extend() does its zeroing block by block, but it calls a
> function named ocfs2_write_zero_page(). Let's have
> ocfs2_write_zero_page() handle the page level. From
> ocfs2_zero_extend()'s perspective, it is now page-at-a-time.
>
> Signed-off-by: Joel Becker <joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 30 --------------
> fs/ocfs2/file.c | 119 +++++++++++++++++++++++++++++++++++++++----------------
> 2 files changed, 85 insertions(+), 64 deletions(-)
>
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index 3623ca2..9a5c931 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -459,36 +459,6 @@ int walk_page_buffers( handle_t *handle,
> return ret;
> }
>
> -handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
> - struct page *page,
> - unsigned from,
> - unsigned to)
> -{
> - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> - handle_t *handle;
> - int ret = 0;
> -
> - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> - if (IS_ERR(handle)) {
> - ret = -ENOMEM;
> - mlog_errno(ret);
> - goto out;
> - }
> -
> - if (ocfs2_should_order_data(inode)) {
> - ret = ocfs2_jbd2_file_inode(handle, inode);
> - if (ret < 0)
> - mlog_errno(ret);
> - }
> -out:
> - if (ret) {
> - if (!IS_ERR(handle))
> - ocfs2_commit_trans(osb, handle);
> - handle = ERR_PTR(ret);
> - }
> - return handle;
> -}
> -
> static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
> {
> sector_t status;
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 6a13ea6..a6e0eb6 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -724,28 +724,55 @@ leave:
> return status;
> }
>
> +/*
> + * While a write will already be ordering the data, a truncate will not.
> + * Thus, we need to explicitly order the zeroed pages.
> + */
> +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
> +{
> + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> + handle_t *handle = NULL;
> + int ret = 0;
> +
> + if (ocfs2_should_order_data(inode))
>
This should be if (!ocfs2_should_order_data(inode)) I guess? ;)
> + goto out;
> +
> + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> + if (IS_ERR(handle)) {
> + ret = -ENOMEM;
> + mlog_errno(ret);
> + goto out;
> + }
> +
>
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/3] ocfs2: When zero extending, do it by page.
@ 2010-07-07 15:19 ` Tao Ma
0 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-07 15:19 UTC (permalink / raw)
To: Joel Becker
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
Hi Joel,
Joel Becker wrote:
> ocfs2_zero_extend() does its zeroing block by block, but it calls a
> function named ocfs2_write_zero_page(). Let's have
> ocfs2_write_zero_page() handle the page level. From
> ocfs2_zero_extend()'s perspective, it is now page-at-a-time.
>
> Signed-off-by: Joel Becker <joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 30 --------------
> fs/ocfs2/file.c | 119 +++++++++++++++++++++++++++++++++++++++----------------
> 2 files changed, 85 insertions(+), 64 deletions(-)
>
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index 3623ca2..9a5c931 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -459,36 +459,6 @@ int walk_page_buffers( handle_t *handle,
> return ret;
> }
>
> -handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
> - struct page *page,
> - unsigned from,
> - unsigned to)
> -{
> - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> - handle_t *handle;
> - int ret = 0;
> -
> - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> - if (IS_ERR(handle)) {
> - ret = -ENOMEM;
> - mlog_errno(ret);
> - goto out;
> - }
> -
> - if (ocfs2_should_order_data(inode)) {
> - ret = ocfs2_jbd2_file_inode(handle, inode);
> - if (ret < 0)
> - mlog_errno(ret);
> - }
> -out:
> - if (ret) {
> - if (!IS_ERR(handle))
> - ocfs2_commit_trans(osb, handle);
> - handle = ERR_PTR(ret);
> - }
> - return handle;
> -}
> -
> static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
> {
> sector_t status;
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 6a13ea6..a6e0eb6 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -724,28 +724,55 @@ leave:
> return status;
> }
>
> +/*
> + * While a write will already be ordering the data, a truncate will not.
> + * Thus, we need to explicitly order the zeroed pages.
> + */
> +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
> +{
> + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> + handle_t *handle = NULL;
> + int ret = 0;
> +
> + if (ocfs2_should_order_data(inode))
>
This should be if (!ocfs2_should_order_data(inode)) I guess? ;)
> + goto out;
> +
> + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> + if (IS_ERR(handle)) {
> + ret = -ENOMEM;
> + mlog_errno(ret);
> + goto out;
> + }
> +
>
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/3] ocfs2: When zero extending, do it by page.
2010-07-07 15:19 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-07 20:04 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 20:04 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Wed, Jul 07, 2010 at 11:19:27PM +0800, Tao Ma wrote:
> >+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
> >+{
> >+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> >+ handle_t *handle = NULL;
> >+ int ret = 0;
> >+
> >+ if (ocfs2_should_order_data(inode))
> This should be if (!ocfs2_should_order_data(inode)) I guess? ;)
Of course it should. Fixed ;-)
Joel
--
"Too much walking shoes worn thin.
Too much trippin' and my soul's worn thin.
Time to catch a ride it leaves today
Her name is what it means.
Too much walking shoes worn thin."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/3] ocfs2: When zero extending, do it by page.
@ 2010-07-07 20:04 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 20:04 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Wed, Jul 07, 2010 at 11:19:27PM +0800, Tao Ma wrote:
> >+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
> >+{
> >+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> >+ handle_t *handle = NULL;
> >+ int ret = 0;
> >+
> >+ if (ocfs2_should_order_data(inode))
> This should be if (!ocfs2_should_order_data(inode)) I guess? ;)
Of course it should. Fixed ;-)
Joel
--
"Too much walking shoes worn thin.
Too much trippin' and my soul's worn thin.
Time to catch a ride it leaves today
Her name is what it means.
Too much walking shoes worn thin."
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/3] ocfs2: When zero extending, do it by page.
2010-07-07 11:16 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-08 3:44 ` Tao Ma
-1 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-08 3:44 UTC (permalink / raw)
To: Joel Becker
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
Hi Joel,
On 07/07/2010 07:16 PM, Joel Becker wrote:
> ocfs2_zero_extend() does its zeroing block by block, but it calls a
> function named ocfs2_write_zero_page(). Let's have
> ocfs2_write_zero_page() handle the page level. From
> ocfs2_zero_extend()'s perspective, it is now page-at-a-time.
>
> Signed-off-by: Joel Becker<joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 30 --------------
> fs/ocfs2/file.c | 119 +++++++++++++++++++++++++++++++++++++++----------------
> 2 files changed, 85 insertions(+), 64 deletions(-)
>
<snip>
> -static int ocfs2_write_zero_page(struct inode *inode,
> - u64 size)
> +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
> + u64 abs_to)
> {
> struct address_space *mapping = inode->i_mapping;
> struct page *page;
> - unsigned long index;
> - unsigned int offset;
> + unsigned long index = abs_from>> PAGE_CACHE_SHIFT;
> handle_t *handle = NULL;
> int ret;
> + unsigned zero_from, zero_to, block_start, block_end;
>
> - offset = (size& (PAGE_CACHE_SIZE-1)); /* Within page */
> - /* ugh. in prepare/commit_write, if from==to==start of block, we
> - ** skip the prepare. make sure we never send an offset for the start
> - ** of a block
> - */
> - if ((offset& (inode->i_sb->s_blocksize - 1)) == 0) {
> - offset++;
> - }
> - index = size>> PAGE_CACHE_SHIFT;
> + BUG_ON(abs_from>= abs_to);
> + BUG_ON(abs_to> ((index + 1)<< PAGE_CACHE_SHIFT));
Sorry for not noticing this yesterday night. This can't work and will
overflow and bug out. I met with a similar bug in reflink test. See
commit d622b89.
> + BUG_ON(abs_from& (inode->i_blkbits - 1));
>
> page = grab_cache_page(mapping, index);
> if (!page) {
> @@ -754,31 +781,52 @@ static int ocfs2_write_zero_page(struct inode *inode,
> goto out;
> }
>
> - ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
> - if (ret< 0) {
> - mlog_errno(ret);
> - goto out_unlock;
> - }
> + /* Get the offsets within the page that we want to zero */
> + zero_from = abs_from& (PAGE_CACHE_SIZE - 1);
> + zero_to = abs_to& (PAGE_CACHE_SIZE - 1);
> + if (!zero_to)
> + zero_to = PAGE_CACHE_SIZE;
>
> - if (ocfs2_should_order_data(inode)) {
> - handle = ocfs2_start_walk_page_trans(inode, page, offset,
> - offset);
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - handle = NULL;
> + /* We know that zero_from is block aligned */
> + for (block_start = zero_from;
> + (block_start< PAGE_CACHE_SIZE)&& (block_start< zero_to);
> + block_start = block_end) {
Do we really need to check block_start < PAGE_CACHE_SIZE? I think just
check block_start < zero_to is enough since you have limit zero_to with
PAGE_CACHE_SIZE. What's more, it looks more natural(see below), does it?
for (block_start = zero_form; block_start < zero_to; block_start =
block_end) {
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/3] ocfs2: When zero extending, do it by page.
@ 2010-07-08 3:44 ` Tao Ma
0 siblings, 0 replies; 78+ messages in thread
From: Tao Ma @ 2010-07-08 3:44 UTC (permalink / raw)
To: Joel Becker
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
Hi Joel,
On 07/07/2010 07:16 PM, Joel Becker wrote:
> ocfs2_zero_extend() does its zeroing block by block, but it calls a
> function named ocfs2_write_zero_page(). Let's have
> ocfs2_write_zero_page() handle the page level. From
> ocfs2_zero_extend()'s perspective, it is now page-at-a-time.
>
> Signed-off-by: Joel Becker<joel.becker@oracle.com>
> ---
> fs/ocfs2/aops.c | 30 --------------
> fs/ocfs2/file.c | 119 +++++++++++++++++++++++++++++++++++++++----------------
> 2 files changed, 85 insertions(+), 64 deletions(-)
>
<snip>
> -static int ocfs2_write_zero_page(struct inode *inode,
> - u64 size)
> +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
> + u64 abs_to)
> {
> struct address_space *mapping = inode->i_mapping;
> struct page *page;
> - unsigned long index;
> - unsigned int offset;
> + unsigned long index = abs_from>> PAGE_CACHE_SHIFT;
> handle_t *handle = NULL;
> int ret;
> + unsigned zero_from, zero_to, block_start, block_end;
>
> - offset = (size& (PAGE_CACHE_SIZE-1)); /* Within page */
> - /* ugh. in prepare/commit_write, if from==to==start of block, we
> - ** skip the prepare. make sure we never send an offset for the start
> - ** of a block
> - */
> - if ((offset& (inode->i_sb->s_blocksize - 1)) == 0) {
> - offset++;
> - }
> - index = size>> PAGE_CACHE_SHIFT;
> + BUG_ON(abs_from>= abs_to);
> + BUG_ON(abs_to> ((index + 1)<< PAGE_CACHE_SHIFT));
Sorry for not noticing this yesterday night. This can't work and will
overflow and bug out. I met with a similar bug in reflink test. See
commit d622b89.
> + BUG_ON(abs_from& (inode->i_blkbits - 1));
>
> page = grab_cache_page(mapping, index);
> if (!page) {
> @@ -754,31 +781,52 @@ static int ocfs2_write_zero_page(struct inode *inode,
> goto out;
> }
>
> - ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
> - if (ret< 0) {
> - mlog_errno(ret);
> - goto out_unlock;
> - }
> + /* Get the offsets within the page that we want to zero */
> + zero_from = abs_from& (PAGE_CACHE_SIZE - 1);
> + zero_to = abs_to& (PAGE_CACHE_SIZE - 1);
> + if (!zero_to)
> + zero_to = PAGE_CACHE_SIZE;
>
> - if (ocfs2_should_order_data(inode)) {
> - handle = ocfs2_start_walk_page_trans(inode, page, offset,
> - offset);
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - handle = NULL;
> + /* We know that zero_from is block aligned */
> + for (block_start = zero_from;
> + (block_start< PAGE_CACHE_SIZE)&& (block_start< zero_to);
> + block_start = block_end) {
Do we really need to check block_start < PAGE_CACHE_SIZE? I think just
check block_start < zero_to is enough since you have limit zero_to with
PAGE_CACHE_SIZE. What's more, it looks more natural(see below), does it?
for (block_start = zero_form; block_start < zero_to; block_start =
block_end) {
Regards,
Tao
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH 1/3] ocfs2: When zero extending, do it by page.
2010-07-08 3:44 ` [Ocfs2-devel] " Tao Ma
@ 2010-07-08 9:51 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-08 9:51 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Thu, Jul 08, 2010 at 11:44:59AM +0800, Tao Ma wrote:
> On 07/07/2010 07:16 PM, Joel Becker wrote:
> >+ BUG_ON(abs_to> ((index + 1)<< PAGE_CACHE_SHIFT));
> Sorry for not noticing this yesterday night. This can't work and
> will overflow and bug out. I met with a similar bug in reflink test.
> See commit d622b89.
Good catch. It's obvious, now that you mention it.
> >+ /* We know that zero_from is block aligned */
> >+ for (block_start = zero_from;
> >+ (block_start< PAGE_CACHE_SIZE)&& (block_start< zero_to);
> >+ block_start = block_end) {
> Do we really need to check block_start < PAGE_CACHE_SIZE? I think
> just check block_start < zero_to is enough since you have limit
> zero_to with PAGE_CACHE_SIZE. What's more, it looks more natural(see
> below), does it?
>
> for (block_start = zero_form; block_start < zero_to; block_start =
> block_end) {
Yup. The code looked different halfway through, so I didn't
realize I was checking the same thing twice.
Joel
--
"Depend on the rabbit's foot if you will, but remember, it didn't
help the rabbit."
- R. E. Shay
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 1/3] ocfs2: When zero extending, do it by page.
@ 2010-07-08 9:51 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-08 9:51 UTC (permalink / raw)
To: Tao Ma
Cc: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel,
Dave Chinner, Christoph Hellwig, Mark Fasheh
On Thu, Jul 08, 2010 at 11:44:59AM +0800, Tao Ma wrote:
> On 07/07/2010 07:16 PM, Joel Becker wrote:
> >+ BUG_ON(abs_to> ((index + 1)<< PAGE_CACHE_SHIFT));
> Sorry for not noticing this yesterday night. This can't work and
> will overflow and bug out. I met with a similar bug in reflink test.
> See commit d622b89.
Good catch. It's obvious, now that you mention it.
> >+ /* We know that zero_from is block aligned */
> >+ for (block_start = zero_from;
> >+ (block_start< PAGE_CACHE_SIZE)&& (block_start< zero_to);
> >+ block_start = block_end) {
> Do we really need to check block_start < PAGE_CACHE_SIZE? I think
> just check block_start < zero_to is enough since you have limit
> zero_to with PAGE_CACHE_SIZE. What's more, it looks more natural(see
> below), does it?
>
> for (block_start = zero_form; block_start < zero_to; block_start =
> block_end) {
Yup. The code looked different halfway through, so I didn't
realize I was checking the same thing twice.
Joel
--
"Depend on the rabbit's foot if you will, but remember, it didn't
help the rabbit."
- R. E. Shay
Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH 2/3] ocfs2: Zero the tail cluster when extending past i_size.
2010-07-03 21:32 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-07 11:16 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
ocfs2's allocation unit is the cluster. This can be larger than a block
or even a memory page. This means that a file may have many blocks in
its last extent that are beyond the block containing i_size. There also
may be more unwritten extents after that.
When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks. Unfortunately,
block_write_full_page() drops the pages past i_size. This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster. This is a bug.
We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size. They will use
ocfs2_zero_extend() to ensure the data is properly zeroed.
Older versions of ocfs2_zero_extend() simply zeroed every block between
i_size and the zeroing position. This presumes three things:
1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.
(1) and (2) hold true for non-sparse filesystems, which used to be the
only users of ocfs2_zero_extend(). (3) is another bug.
Since we're now using ocfs2_zero_extend() for sparse filesystems as
well, we teach ocfs2_zero_extend() to check every extent between
i_size and the zeroing position. If the extent is unwritten, it is
ignored. If it is refcounted, it is CoWed. Then it is zeroed.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 30 ++++----
fs/ocfs2/file.c | 198 ++++++++++++++++++++++++++++++++++++++--------
fs/ocfs2/file.h | 6 +-
fs/ocfs2/quota_global.c | 2 +-
fs/ocfs2/quota_local.c | 4 +-
fs/ocfs2/refcounttree.c | 6 ++
6 files changed, 192 insertions(+), 54 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9a5c931..8d6dc3f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
-
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
-
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
}
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
+
bail:
if (err < 0)
err = -EIO;
@@ -1590,21 +1589,20 @@ out:
* write path can treat it as an non-allocating write, which has no
* special case code for sparse/nonsparse files.
*/
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
- unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, unsigned len,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
- ret = ocfs2_extend_no_holes(inode, newsize, pos);
+ ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
if (ret)
mlog_errno(ret);
@@ -1649,7 +1647,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+ wc);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a6e0eb6..1fdc45a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -787,6 +787,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
if (!zero_to)
zero_to = PAGE_CACHE_SIZE;
+ mlog(0,
+ "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
+ (unsigned long long)abs_from, (unsigned long long)abs_to,
+ index, zero_from, zero_to);
+
/* We know that zero_from is block aligned */
for (block_start = zero_from;
(block_start < PAGE_CACHE_SIZE) && (block_start < zero_to);
@@ -834,25 +839,114 @@ out:
return ret;
}
-static int ocfs2_zero_extend(struct inode *inode,
- u64 zero_to_size)
+/*
+ * Find the next range to zero. We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache. We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed. range_start and range_end return the next zeroing
+ * range. A subsequent call should pass the previous range_end as its
+ * zero_start. If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over. Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 zero_start, u64 zero_end,
+ u64 *range_start, u64 *range_end)
{
- int ret = 0;
- u64 start_off, next_off;
- struct super_block *sb = inode->i_sb;
+ int rc = 0, needs_cow = 0;
+ u32 p_cpos, zero_clusters = 0;
+ u32 zero_cpos =
+ zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
- start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
- while (start_off < zero_to_size) {
- next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
- if (next_off > zero_to_size)
- next_off = zero_to_size;
- ret = ocfs2_write_zero_page(inode, start_off, next_off);
- if (ret < 0) {
- mlog_errno(ret);
+ while (zero_cpos < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ zero_clusters = num_clusters;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ break;
+ }
+
+ zero_cpos += num_clusters;
+ }
+ if (!zero_clusters) {
+ *range_end = 0;
+ goto out;
+ }
+
+ while ((zero_cpos + zero_clusters) < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+ &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+ break;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ zero_clusters += num_clusters;
+ }
+ if ((zero_cpos + zero_clusters) > last_cpos)
+ zero_clusters = last_cpos - zero_cpos;
+
+ if (needs_cow) {
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+ UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
goto out;
}
+ }
- start_off = next_off;
+ *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+ *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+ zero_cpos + zero_clusters);
+
+out:
+ return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+ u64 range_end)
+{
+ int rc = 0;
+ u64 next_pos;
+ u64 zero_pos = range_start;
+
+ mlog(0, "range_start = %llu, range_end = %llu\n",
+ (unsigned long long)range_start,
+ (unsigned long long)range_end);
+ BUG_ON(range_start >= range_end);
+
+ while (zero_pos < range_end) {
+ next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_pos > range_end)
+ next_pos = range_end;
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+ if (rc < 0) {
+ mlog_errno(rc);
+ break;
+ }
+ zero_pos = next_pos;
/*
* Very large extends have the potential to lock up
@@ -861,16 +955,60 @@ static int ocfs2_zero_extend(struct inode *inode,
cond_resched();
}
-out:
+ return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to_size)
+{
+ int ret = 0;
+ u64 zero_start, range_start = 0, range_end = 0;
+ struct super_block *sb = inode->i_sb;
+
+ zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ while (zero_start < zero_to_size) {
+ ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+ zero_to_size,
+ &range_start,
+ &range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ if (!range_end)
+ break;
+ /* Trim the ends */
+ if (range_start < zero_start)
+ range_start = zero_start;
+ if (range_end > zero_to_size)
+ range_end = zero_to_size;
+
+ ret = ocfs2_zero_extend_range(inode, range_start,
+ range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ zero_start = range_end;
+ }
+
return ret;
}
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to)
{
int ret;
u32 clusters_to_add;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ /*
+ * Only quota files call this without a bh, and they can't be
+ * refcounted.
+ */
+ BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
if (clusters_to_add < oi->ip_clusters)
clusters_to_add = 0;
@@ -891,7 +1029,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
* still need to zero the area between the old i_size and the
* new i_size.
*/
- ret = ocfs2_zero_extend(inode, zero_to);
+ ret = ocfs2_zero_extend(inode, di_bh, zero_to);
if (ret < 0)
mlog_errno(ret);
@@ -913,27 +1051,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);
@@ -950,14 +1076,16 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+ new_i_size);
up_write(&oi->ip_alloc_sem);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f..97bf761 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
- u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe..4607923 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
* locking allocators ranks above a transaction start
*/
WARN_ON(journal_current_handle());
- status = ocfs2_extend_no_holes(gqinode,
+ status = ocfs2_extend_no_holes(gqinode, NULL,
gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
gqinode->i_size);
if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4..dc78764 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
u64 p_blkno;
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + 2 * sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
return ocfs2_local_quota_add_chunk(sb, type, offset);
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36..32949df 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4166,6 +4166,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
struct buffer_head *new_bh = NULL;
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = filemap_fdatawrite(inode->i_mapping);
if (ret) {
mlog_errno(ret);
--
1.7.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 2/3] ocfs2: Zero the tail cluster when extending past i_size.
@ 2010-07-07 11:16 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
ocfs2's allocation unit is the cluster. This can be larger than a block
or even a memory page. This means that a file may have many blocks in
its last extent that are beyond the block containing i_size. There also
may be more unwritten extents after that.
When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks. Unfortunately,
block_write_full_page() drops the pages past i_size. This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster. This is a bug.
We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size. They will use
ocfs2_zero_extend() to ensure the data is properly zeroed.
Older versions of ocfs2_zero_extend() simply zeroed every block between
i_size and the zeroing position. This presumes three things:
1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.
(1) and (2) hold true for non-sparse filesystems, which used to be the
only users of ocfs2_zero_extend(). (3) is another bug.
Since we're now using ocfs2_zero_extend() for sparse filesystems as
well, we teach ocfs2_zero_extend() to check every extent between
i_size and the zeroing position. If the extent is unwritten, it is
ignored. If it is refcounted, it is CoWed. Then it is zeroed.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 30 ++++----
fs/ocfs2/file.c | 198 ++++++++++++++++++++++++++++++++++++++--------
fs/ocfs2/file.h | 6 +-
fs/ocfs2/quota_global.c | 2 +-
fs/ocfs2/quota_local.c | 4 +-
fs/ocfs2/refcounttree.c | 6 ++
6 files changed, 192 insertions(+), 54 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9a5c931..8d6dc3f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
-
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
-
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
}
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
+
bail:
if (err < 0)
err = -EIO;
@@ -1590,21 +1589,20 @@ out:
* write path can treat it as an non-allocating write, which has no
* special case code for sparse/nonsparse files.
*/
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
- unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, unsigned len,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
- ret = ocfs2_extend_no_holes(inode, newsize, pos);
+ ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
if (ret)
mlog_errno(ret);
@@ -1649,7 +1647,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+ wc);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a6e0eb6..1fdc45a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -787,6 +787,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
if (!zero_to)
zero_to = PAGE_CACHE_SIZE;
+ mlog(0,
+ "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
+ (unsigned long long)abs_from, (unsigned long long)abs_to,
+ index, zero_from, zero_to);
+
/* We know that zero_from is block aligned */
for (block_start = zero_from;
(block_start < PAGE_CACHE_SIZE) && (block_start < zero_to);
@@ -834,25 +839,114 @@ out:
return ret;
}
-static int ocfs2_zero_extend(struct inode *inode,
- u64 zero_to_size)
+/*
+ * Find the next range to zero. We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache. We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed. range_start and range_end return the next zeroing
+ * range. A subsequent call should pass the previous range_end as its
+ * zero_start. If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over. Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 zero_start, u64 zero_end,
+ u64 *range_start, u64 *range_end)
{
- int ret = 0;
- u64 start_off, next_off;
- struct super_block *sb = inode->i_sb;
+ int rc = 0, needs_cow = 0;
+ u32 p_cpos, zero_clusters = 0;
+ u32 zero_cpos =
+ zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
- start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
- while (start_off < zero_to_size) {
- next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
- if (next_off > zero_to_size)
- next_off = zero_to_size;
- ret = ocfs2_write_zero_page(inode, start_off, next_off);
- if (ret < 0) {
- mlog_errno(ret);
+ while (zero_cpos < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ zero_clusters = num_clusters;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ break;
+ }
+
+ zero_cpos += num_clusters;
+ }
+ if (!zero_clusters) {
+ *range_end = 0;
+ goto out;
+ }
+
+ while ((zero_cpos + zero_clusters) < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+ &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+ break;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ zero_clusters += num_clusters;
+ }
+ if ((zero_cpos + zero_clusters) > last_cpos)
+ zero_clusters = last_cpos - zero_cpos;
+
+ if (needs_cow) {
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+ UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
goto out;
}
+ }
- start_off = next_off;
+ *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+ *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+ zero_cpos + zero_clusters);
+
+out:
+ return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+ u64 range_end)
+{
+ int rc = 0;
+ u64 next_pos;
+ u64 zero_pos = range_start;
+
+ mlog(0, "range_start = %llu, range_end = %llu\n",
+ (unsigned long long)range_start,
+ (unsigned long long)range_end);
+ BUG_ON(range_start >= range_end);
+
+ while (zero_pos < range_end) {
+ next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_pos > range_end)
+ next_pos = range_end;
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+ if (rc < 0) {
+ mlog_errno(rc);
+ break;
+ }
+ zero_pos = next_pos;
/*
* Very large extends have the potential to lock up
@@ -861,16 +955,60 @@ static int ocfs2_zero_extend(struct inode *inode,
cond_resched();
}
-out:
+ return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to_size)
+{
+ int ret = 0;
+ u64 zero_start, range_start = 0, range_end = 0;
+ struct super_block *sb = inode->i_sb;
+
+ zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ while (zero_start < zero_to_size) {
+ ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+ zero_to_size,
+ &range_start,
+ &range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ if (!range_end)
+ break;
+ /* Trim the ends */
+ if (range_start < zero_start)
+ range_start = zero_start;
+ if (range_end > zero_to_size)
+ range_end = zero_to_size;
+
+ ret = ocfs2_zero_extend_range(inode, range_start,
+ range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ zero_start = range_end;
+ }
+
return ret;
}
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to)
{
int ret;
u32 clusters_to_add;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ /*
+ * Only quota files call this without a bh, and they can't be
+ * refcounted.
+ */
+ BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
if (clusters_to_add < oi->ip_clusters)
clusters_to_add = 0;
@@ -891,7 +1029,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
* still need to zero the area between the old i_size and the
* new i_size.
*/
- ret = ocfs2_zero_extend(inode, zero_to);
+ ret = ocfs2_zero_extend(inode, di_bh, zero_to);
if (ret < 0)
mlog_errno(ret);
@@ -913,27 +1051,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);
@@ -950,14 +1076,16 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+ new_i_size);
up_write(&oi->ip_alloc_sem);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f..97bf761 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
- u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe..4607923 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
* locking allocators ranks above a transaction start
*/
WARN_ON(journal_current_handle());
- status = ocfs2_extend_no_holes(gqinode,
+ status = ocfs2_extend_no_holes(gqinode, NULL,
gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
gqinode->i_size);
if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4..dc78764 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
u64 p_blkno;
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + 2 * sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
return ocfs2_local_quota_add_chunk(sb, type, offset);
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36..32949df 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4166,6 +4166,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
struct buffer_head *new_bh = NULL;
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = filemap_fdatawrite(inode->i_mapping);
if (ret) {
mlog_errno(ret);
--
1.7.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH 3/3] ocfs2: No need to zero pages past i_size.
2010-07-03 21:32 ` [Ocfs2-devel] " Joel Becker
@ 2010-07-07 11:16 ` Joel Becker
-1 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
When ocfs2 fills a hole, it does so by allocating clusters. When a
cluster is larger than the write, ocfs2 must zero the portions of the
cluster outside of the write. If the clustersize is smaller than a
pagecache page, this is handled by the normal pagecache mechanisms, but
when the clustersize is larger than a page, ocfs2's write code will zero
the pages adjacent to the write. This makes sure the entire cluster is
zeroed correctly.
Currently ocfs2 behaves exactly the same when writing past i_size.
However, this means ocfs2 is writing zeroed pages for portions of a new
cluster that are beyond i_size. The page writeback code isn't expecting
this. It treats all pages past the one containing i_size as left behind
due to a previous truncate operation.
Thankfully, ocfs2 calculates the number of pages it will be working on
up front. The rest of the write code merely honors the original
calculation. We can simply trim the number of pages to only cover the
actual file data.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 15 +++++++++++----
1 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8d6dc3f..9b3381a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1100,11 +1100,12 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
target_index = user_pos >> PAGE_CACHE_SHIFT;
@@ -1112,11 +1113,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole clusters worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /* This is the index *past* the write */
+ end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
@@ -1761,7 +1768,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
--
1.7.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [Ocfs2-devel] [PATCH 3/3] ocfs2: No need to zero pages past i_size.
@ 2010-07-07 11:16 ` Joel Becker
0 siblings, 0 replies; 78+ messages in thread
From: Joel Becker @ 2010-07-07 11:16 UTC (permalink / raw)
To: Dave Chinner, Linus Torvalds, Linux Kernel, ocfs2-devel, Tao Ma,
Dave Chinner, Christoph Hellwig, Mark Fasheh
When ocfs2 fills a hole, it does so by allocating clusters. When a
cluster is larger than the write, ocfs2 must zero the portions of the
cluster outside of the write. If the clustersize is smaller than a
pagecache page, this is handled by the normal pagecache mechanisms, but
when the clustersize is larger than a page, ocfs2's write code will zero
the pages adjacent to the write. This makes sure the entire cluster is
zeroed correctly.
Currently ocfs2 behaves exactly the same when writing past i_size.
However, this means ocfs2 is writing zeroed pages for portions of a new
cluster that are beyond i_size. The page writeback code isn't expecting
this. It treats all pages past the one containing i_size as left behind
due to a previous truncate operation.
Thankfully, ocfs2 calculates the number of pages it will be working on
up front. The rest of the write code merely honors the original
calculation. We can simply trim the number of pages to only cover the
actual file data.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
fs/ocfs2/aops.c | 15 +++++++++++----
1 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8d6dc3f..9b3381a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1100,11 +1100,12 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
target_index = user_pos >> PAGE_CACHE_SHIFT;
@@ -1112,11 +1113,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole clusters worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /* This is the index *past* the write */
+ end_index = ((user_pos + user_len - 1) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
@@ -1761,7 +1768,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
--
1.7.1
^ permalink raw reply related [flat|nested] 78+ messages in thread