linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] fs: Fix page_mkwrite off-by-one errors
@ 2019-12-18 13:09 Andreas Gruenbacher
  2019-12-18 14:10 ` Jan Kara
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Andreas Gruenbacher @ 2019-12-18 13:09 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Andreas Gruenbacher, Alexander Viro, Christoph Hellwig,
	Linus Torvalds, linux-kernel, Jeff Layton, Sage Weil,
	Ilya Dryomov, Theodore Ts'o, Andreas Dilger, Jaegeuk Kim,
	Chao Yu, linux-xfs, linux-fsdevel, Richard Weinberger,
	Artem Bityutskiy, Adrian Hunter, ceph-devel, linux-ext4,
	linux-f2fs-devel, linux-mtd, Chris Mason, Josef Bacik,
	David Sterba, linux-btrfs, Jan Kara

Hi Darrick,

can this fix go in via the xfs tree?

Thanks,
Andreas

--

The check in block_page_mkwrite that is meant to determine whether an
offset is within the inode size is off by one.  This bug has been copied
into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
ceph).

Fix that by introducing a new page_mkwrite_check_truncate helper that
checks for truncate and computes the bytes in the page up to EOF.  Use
the helper in the above mentioned filesystems.

In addition, use the new helper in btrfs as well.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)
---
 fs/btrfs/inode.c        | 15 ++++-----------
 fs/buffer.c             | 16 +++-------------
 fs/ceph/addr.c          |  2 +-
 fs/ext4/inode.c         | 14 ++++----------
 fs/f2fs/file.c          | 19 +++++++------------
 fs/iomap/buffered-io.c  | 18 +++++-------------
 fs/ubifs/file.c         |  3 +--
 include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
 8 files changed, 53 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 56032c518b26..86c6fcd8139d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
-	size = i_size_read(inode);
 
-	if ((page->mapping != inode->i_mapping) ||
-	    (page_start >= size)) {
-		/* page got truncated out from underneath us */
+	ret2 = page_mkwrite_check_truncate(page, inode);
+	if (ret2 < 0)
 		goto out_unlock;
-	}
+	zero_start = ret2;
 	wait_on_page_writeback(page);
 
 	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
@@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 		goto again;
 	}
 
+	size = i_size_read(inode);
 	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
 		reserved_space = round_up(size - page_start,
 					  fs_info->sectorsize);
@@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 	}
 	ret2 = 0;
 
-	/* page is wholly or partially inside EOF */
-	if (page_start + PAGE_SIZE > size)
-		zero_start = offset_in_page(size);
-	else
-		zero_start = PAGE_SIZE;
-
 	if (zero_start != PAGE_SIZE) {
 		kaddr = kmap(page);
 		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
diff --git a/fs/buffer.c b/fs/buffer.c
index d8c7242426bb..53aabde57ca7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vma->vm_file);
 	unsigned long end;
-	loff_t size;
 	int ret;
 
 	lock_page(page);
-	size = i_size_read(inode);
-	if ((page->mapping != inode->i_mapping) ||
-	    (page_offset(page) > size)) {
-		/* We overload EFAULT to mean page got truncated */
-		ret = -EFAULT;
+	ret = page_mkwrite_check_truncate(page, inode);
+	if (ret < 0)
 		goto out_unlock;
-	}
-
-	/* page is wholly or partially inside EOF */
-	if (((page->index + 1) << PAGE_SHIFT) > size)
-		end = size & ~PAGE_MASK;
-	else
-		end = PAGE_SIZE;
+	end = ret;
 
 	ret = __block_write_begin(page, 0, end, get_block);
 	if (!ret)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..ef958aa4adb4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 	do {
 		lock_page(page);
 
-		if ((off > size) || (page->mapping != inode->i_mapping)) {
+		if (page_mkwrite_check_truncate(page, inode) < 0) {
 			unlock_page(page);
 			ret = VM_FAULT_NOPAGE;
 			break;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f28de0c1b6..51ab1d2cac80 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = vmf->page;
-	loff_t size;
 	unsigned long len;
 	int err;
 	vm_fault_t ret;
@@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 	}
 
 	lock_page(page);
-	size = i_size_read(inode);
-	/* Page got truncated from under us? */
-	if (page->mapping != mapping || page_offset(page) > size) {
+	err = page_mkwrite_check_truncate(page, inode);
+	if (err < 0) {
 		unlock_page(page);
-		ret = VM_FAULT_NOPAGE;
-		goto out;
+		goto out_ret;
 	}
+	len = err;
 
-	if (page->index == size >> PAGE_SHIFT)
-		len = size & ~PAGE_MASK;
-	else
-		len = PAGE_SIZE;
 	/*
 	 * Return if we have all the buffers mapped. This avoids the need to do
 	 * journal_start/journal_stop which can block and take a long time
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 85af112e868d..0e77b2e6f873 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dnode_of_data dn = { .node_changed = false };
-	int err;
+	int offset, err;
 
 	if (unlikely(f2fs_cp_error(sbi))) {
 		err = -EIO;
@@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	file_update_time(vmf->vma->vm_file);
 	down_read(&F2FS_I(inode)->i_mmap_sem);
 	lock_page(page);
-	if (unlikely(page->mapping != inode->i_mapping ||
-			page_offset(page) > i_size_read(inode) ||
-			!PageUptodate(page))) {
+	err = -EFAULT;
+	if (likely(PageUptodate(page)))
+		err = page_mkwrite_check_truncate(page, inode);
+	if (unlikely(err < 0)) {
 		unlock_page(page);
-		err = -EFAULT;
 		goto out_sem;
 	}
+	offset = err;
 
 	/* block allocation */
 	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
@@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	if (PageMappedToDisk(page))
 		goto out_sem;
 
-	/* page is wholly or partially inside EOF */
-	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
-						i_size_read(inode)) {
-		loff_t offset;
-
-		offset = i_size_read(inode) & ~PAGE_MASK;
+	if (offset != PAGE_SIZE)
 		zero_user_segment(page, offset, PAGE_SIZE);
-	}
 	set_page_dirty(page);
 	if (!PageUptodate(page))
 		SetPageUptodate(page);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d33c7bc5ee92..1aaf157fd6e9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	unsigned long length;
-	loff_t offset, size;
+	loff_t offset;
 	ssize_t ret;
 
 	lock_page(page);
-	size = i_size_read(inode);
-	offset = page_offset(page);
-	if (page->mapping != inode->i_mapping || offset > size) {
-		/* We overload EFAULT to mean page got truncated */
-		ret = -EFAULT;
+	ret = page_mkwrite_check_truncate(page, inode);
+	if (ret < 0)
 		goto out_unlock;
-	}
-
-	/* page is wholly or partially inside EOF */
-	if (offset > size - PAGE_SIZE)
-		length = offset_in_page(size);
-	else
-		length = PAGE_SIZE;
+	length = ret;
 
+	offset = page_offset(page);
 	while (length > 0) {
 		ret = iomap_apply(inode, offset, length,
 				IOMAP_WRITE | IOMAP_FAULT, ops, page,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index cd52585c8f4f..91f7a1f2db0d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 	}
 
 	lock_page(page);
-	if (unlikely(page->mapping != inode->i_mapping ||
-		     page_offset(page) > i_size_read(inode))) {
+	if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
 		/* Page got truncated out from underneath us */
 		goto sigbus;
 	}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 37a4d9e32cd3..ccb14b6a16b5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
 			       PAGE_SHIFT;
 }
 
+/**
+ * page_mkwrite_check_truncate - check if page was truncated
+ * @page: the page to check
+ * @inode: the inode to check the page against
+ *
+ * Returns the number of bytes in the page up to EOF,
+ * or -EFAULT if the page was truncated.
+ */
+static inline int page_mkwrite_check_truncate(struct page *page,
+					      struct inode *inode)
+{
+	loff_t size = i_size_read(inode);
+	pgoff_t index = size >> PAGE_SHIFT;
+	int offset = offset_in_page(size);
+
+	if (page->mapping != inode->i_mapping)
+		return -EFAULT;
+
+	/* page is wholly inside EOF */
+	if (page->index < index)
+		return PAGE_SIZE;
+	/* page is wholly past EOF */
+	if (page->index > index || !offset)
+		return -EFAULT;
+	/* page is partially inside EOF */
+	return offset;
+}
+
 #endif /* _LINUX_PAGEMAP_H */
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 13:09 [PATCH v3] fs: Fix page_mkwrite off-by-one errors Andreas Gruenbacher
@ 2019-12-18 14:10 ` Jan Kara
  2019-12-18 18:52 ` Darrick J. Wong
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Jan Kara @ 2019-12-18 14:10 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Darrick J. Wong, Alexander Viro, Christoph Hellwig,
	Linus Torvalds, linux-kernel, Jeff Layton, Sage Weil,
	Ilya Dryomov, Theodore Ts'o, Andreas Dilger, Jaegeuk Kim,
	Chao Yu, linux-xfs, linux-fsdevel, Richard Weinberger,
	Artem Bityutskiy, Adrian Hunter, ceph-devel, linux-ext4,
	linux-f2fs-devel, linux-mtd, Chris Mason, Josef Bacik,
	David Sterba, linux-btrfs, Jan Kara

On Wed 18-12-19 14:09:35, Andreas Gruenbacher wrote:
> Hi Darrick,
> 
> can this fix go in via the xfs tree?
> 
> Thanks,
> Andreas
> 
> --
> 
> The check in block_page_mkwrite that is meant to determine whether an
> offset is within the inode size is off by one.  This bug has been copied
> into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
> ceph).
> 
> Fix that by introducing a new page_mkwrite_check_truncate helper that
> checks for truncate and computes the bytes in the page up to EOF.  Use
> the helper in the above mentioned filesystems.
> 
> In addition, use the new helper in btrfs as well.
> 
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
> Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)

The patch looks good to me (I didn't really check the btrfs part). I'd just
note that page_mkwrite_check_truncate() doesn't seem small enough to be
worth inlining... Other than that, feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza


> ---
>  fs/btrfs/inode.c        | 15 ++++-----------
>  fs/buffer.c             | 16 +++-------------
>  fs/ceph/addr.c          |  2 +-
>  fs/ext4/inode.c         | 14 ++++----------
>  fs/f2fs/file.c          | 19 +++++++------------
>  fs/iomap/buffered-io.c  | 18 +++++-------------
>  fs/ubifs/file.c         |  3 +--
>  include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
>  8 files changed, 53 insertions(+), 62 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 56032c518b26..86c6fcd8139d 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
>  again:
>  	lock_page(page);
> -	size = i_size_read(inode);
>  
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_start >= size)) {
> -		/* page got truncated out from underneath us */
> +	ret2 = page_mkwrite_check_truncate(page, inode);
> +	if (ret2 < 0)
>  		goto out_unlock;
> -	}
> +	zero_start = ret2;
>  	wait_on_page_writeback(page);
>  
>  	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
> @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  		goto again;
>  	}
>  
> +	size = i_size_read(inode);
>  	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
>  		reserved_space = round_up(size - page_start,
>  					  fs_info->sectorsize);
> @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	}
>  	ret2 = 0;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (page_start + PAGE_SIZE > size)
> -		zero_start = offset_in_page(size);
> -	else
> -		zero_start = PAGE_SIZE;
> -
>  	if (zero_start != PAGE_SIZE) {
>  		kaddr = kmap(page);
>  		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
> diff --git a/fs/buffer.c b/fs/buffer.c
> index d8c7242426bb..53aabde57ca7 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vma->vm_file);
>  	unsigned long end;
> -	loff_t size;
>  	int ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_offset(page) > size)) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (((page->index + 1) << PAGE_SHIFT) > size)
> -		end = size & ~PAGE_MASK;
> -	else
> -		end = PAGE_SIZE;
> +	end = ret;
>  
>  	ret = __block_write_begin(page, 0, end, get_block);
>  	if (!ret)
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 7ab616601141..ef958aa4adb4 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  	do {
>  		lock_page(page);
>  
> -		if ((off > size) || (page->mapping != inode->i_mapping)) {
> +		if (page_mkwrite_check_truncate(page, inode) < 0) {
>  			unlock_page(page);
>  			ret = VM_FAULT_NOPAGE;
>  			break;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 28f28de0c1b6..51ab1d2cac80 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = vmf->page;
> -	loff_t size;
>  	unsigned long len;
>  	int err;
>  	vm_fault_t ret;
> @@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	/* Page got truncated from under us? */
> -	if (page->mapping != mapping || page_offset(page) > size) {
> +	err = page_mkwrite_check_truncate(page, inode);
> +	if (err < 0) {
>  		unlock_page(page);
> -		ret = VM_FAULT_NOPAGE;
> -		goto out;
> +		goto out_ret;
>  	}
> +	len = err;
>  
> -	if (page->index == size >> PAGE_SHIFT)
> -		len = size & ~PAGE_MASK;
> -	else
> -		len = PAGE_SIZE;
>  	/*
>  	 * Return if we have all the buffers mapped. This avoids the need to do
>  	 * journal_start/journal_stop which can block and take a long time
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 85af112e868d..0e77b2e6f873 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>  	struct dnode_of_data dn = { .node_changed = false };
> -	int err;
> +	int offset, err;
>  
>  	if (unlikely(f2fs_cp_error(sbi))) {
>  		err = -EIO;
> @@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	file_update_time(vmf->vma->vm_file);
>  	down_read(&F2FS_I(inode)->i_mmap_sem);
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -			page_offset(page) > i_size_read(inode) ||
> -			!PageUptodate(page))) {
> +	err = -EFAULT;
> +	if (likely(PageUptodate(page)))
> +		err = page_mkwrite_check_truncate(page, inode);
> +	if (unlikely(err < 0)) {
>  		unlock_page(page);
> -		err = -EFAULT;
>  		goto out_sem;
>  	}
> +	offset = err;
>  
>  	/* block allocation */
>  	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> @@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	if (PageMappedToDisk(page))
>  		goto out_sem;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
> -						i_size_read(inode)) {
> -		loff_t offset;
> -
> -		offset = i_size_read(inode) & ~PAGE_MASK;
> +	if (offset != PAGE_SIZE)
>  		zero_user_segment(page, offset, PAGE_SIZE);
> -	}
>  	set_page_dirty(page);
>  	if (!PageUptodate(page))
>  		SetPageUptodate(page);
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index d33c7bc5ee92..1aaf157fd6e9 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	unsigned long length;
> -	loff_t offset, size;
> +	loff_t offset;
>  	ssize_t ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	offset = page_offset(page);
> -	if (page->mapping != inode->i_mapping || offset > size) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (offset > size - PAGE_SIZE)
> -		length = offset_in_page(size);
> -	else
> -		length = PAGE_SIZE;
> +	length = ret;
>  
> +	offset = page_offset(page);
>  	while (length > 0) {
>  		ret = iomap_apply(inode, offset, length,
>  				IOMAP_WRITE | IOMAP_FAULT, ops, page,
> diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
> index cd52585c8f4f..91f7a1f2db0d 100644
> --- a/fs/ubifs/file.c
> +++ b/fs/ubifs/file.c
> @@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -		     page_offset(page) > i_size_read(inode))) {
> +	if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
>  		/* Page got truncated out from underneath us */
>  		goto sigbus;
>  	}
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 37a4d9e32cd3..ccb14b6a16b5 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
>  			       PAGE_SHIFT;
>  }
>  
> +/**
> + * page_mkwrite_check_truncate - check if page was truncated
> + * @page: the page to check
> + * @inode: the inode to check the page against
> + *
> + * Returns the number of bytes in the page up to EOF,
> + * or -EFAULT if the page was truncated.
> + */
> +static inline int page_mkwrite_check_truncate(struct page *page,
> +					      struct inode *inode)
> +{
> +	loff_t size = i_size_read(inode);
> +	pgoff_t index = size >> PAGE_SHIFT;
> +	int offset = offset_in_page(size);
> +
> +	if (page->mapping != inode->i_mapping)
> +		return -EFAULT;
> +
> +	/* page is wholly inside EOF */
> +	if (page->index < index)
> +		return PAGE_SIZE;
> +	/* page is wholly past EOF */
> +	if (page->index > index || !offset)
> +		return -EFAULT;
> +	/* page is partially inside EOF */
> +	return offset;
> +}
> +
>  #endif /* _LINUX_PAGEMAP_H */
> -- 
> 2.20.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 13:09 [PATCH v3] fs: Fix page_mkwrite off-by-one errors Andreas Gruenbacher
  2019-12-18 14:10 ` Jan Kara
@ 2019-12-18 18:52 ` Darrick J. Wong
  2019-12-18 19:15   ` Andreas Gruenbacher
  2019-12-18 19:21   ` Matthew Wilcox
  2020-01-07 23:20 ` Darrick J. Wong
  2020-01-08 11:51 ` Jaegeuk Kim
  3 siblings, 2 replies; 10+ messages in thread
From: Darrick J. Wong @ 2019-12-18 18:52 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Alexander Viro, Christoph Hellwig, Linus Torvalds, linux-kernel,
	Jeff Layton, Sage Weil, Ilya Dryomov, Theodore Ts'o,
	Andreas Dilger, Jaegeuk Kim, Chao Yu, linux-xfs, linux-fsdevel,
	Richard Weinberger, Artem Bityutskiy, Adrian Hunter, ceph-devel,
	linux-ext4, linux-f2fs-devel, linux-mtd, Chris Mason,
	Josef Bacik, David Sterba, linux-btrfs, Jan Kara

On Wed, Dec 18, 2019 at 02:09:35PM +0100, Andreas Gruenbacher wrote:
> Hi Darrick,
> 
> can this fix go in via the xfs tree?

Er, I'd rather not touch five other filesystems via the XFS tree.
However, a more immediate problem that I think I see is...

> Thanks,
> Andreas
> 
> --
> 
> The check in block_page_mkwrite that is meant to determine whether an
> offset is within the inode size is off by one.  This bug has been copied
> into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
> ceph).
> 
> Fix that by introducing a new page_mkwrite_check_truncate helper that
> checks for truncate and computes the bytes in the page up to EOF.  Use
> the helper in the above mentioned filesystems.
> 
> In addition, use the new helper in btrfs as well.
> 
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
> Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)
> ---
>  fs/btrfs/inode.c        | 15 ++++-----------
>  fs/buffer.c             | 16 +++-------------
>  fs/ceph/addr.c          |  2 +-
>  fs/ext4/inode.c         | 14 ++++----------
>  fs/f2fs/file.c          | 19 +++++++------------
>  fs/iomap/buffered-io.c  | 18 +++++-------------
>  fs/ubifs/file.c         |  3 +--
>  include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
>  8 files changed, 53 insertions(+), 62 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 56032c518b26..86c6fcd8139d 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
>  again:
>  	lock_page(page);
> -	size = i_size_read(inode);
>  
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_start >= size)) {
> -		/* page got truncated out from underneath us */
> +	ret2 = page_mkwrite_check_truncate(page, inode);
> +	if (ret2 < 0)
>  		goto out_unlock;

...here we try to return -EFAULT as a vm_fault_t.  Notice how btrfs returns
VM_FAULT_* values directly and never calls block_page_mkwrite_return?  I
know dsterba acked this, but I cannot see how this is correct.

--D

> -	}
> +	zero_start = ret2;
>  	wait_on_page_writeback(page);
>  
>  	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
> @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  		goto again;
>  	}
>  
> +	size = i_size_read(inode);
>  	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
>  		reserved_space = round_up(size - page_start,
>  					  fs_info->sectorsize);
> @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	}
>  	ret2 = 0;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (page_start + PAGE_SIZE > size)
> -		zero_start = offset_in_page(size);
> -	else
> -		zero_start = PAGE_SIZE;
> -
>  	if (zero_start != PAGE_SIZE) {
>  		kaddr = kmap(page);
>  		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
> diff --git a/fs/buffer.c b/fs/buffer.c
> index d8c7242426bb..53aabde57ca7 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vma->vm_file);
>  	unsigned long end;
> -	loff_t size;
>  	int ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_offset(page) > size)) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (((page->index + 1) << PAGE_SHIFT) > size)
> -		end = size & ~PAGE_MASK;
> -	else
> -		end = PAGE_SIZE;
> +	end = ret;
>  
>  	ret = __block_write_begin(page, 0, end, get_block);
>  	if (!ret)
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 7ab616601141..ef958aa4adb4 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  	do {
>  		lock_page(page);
>  
> -		if ((off > size) || (page->mapping != inode->i_mapping)) {
> +		if (page_mkwrite_check_truncate(page, inode) < 0) {
>  			unlock_page(page);
>  			ret = VM_FAULT_NOPAGE;
>  			break;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 28f28de0c1b6..51ab1d2cac80 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = vmf->page;
> -	loff_t size;
>  	unsigned long len;
>  	int err;
>  	vm_fault_t ret;
> @@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	/* Page got truncated from under us? */
> -	if (page->mapping != mapping || page_offset(page) > size) {
> +	err = page_mkwrite_check_truncate(page, inode);
> +	if (err < 0) {
>  		unlock_page(page);
> -		ret = VM_FAULT_NOPAGE;
> -		goto out;
> +		goto out_ret;
>  	}
> +	len = err;
>  
> -	if (page->index == size >> PAGE_SHIFT)
> -		len = size & ~PAGE_MASK;
> -	else
> -		len = PAGE_SIZE;
>  	/*
>  	 * Return if we have all the buffers mapped. This avoids the need to do
>  	 * journal_start/journal_stop which can block and take a long time
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 85af112e868d..0e77b2e6f873 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>  	struct dnode_of_data dn = { .node_changed = false };
> -	int err;
> +	int offset, err;
>  
>  	if (unlikely(f2fs_cp_error(sbi))) {
>  		err = -EIO;
> @@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	file_update_time(vmf->vma->vm_file);
>  	down_read(&F2FS_I(inode)->i_mmap_sem);
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -			page_offset(page) > i_size_read(inode) ||
> -			!PageUptodate(page))) {
> +	err = -EFAULT;
> +	if (likely(PageUptodate(page)))
> +		err = page_mkwrite_check_truncate(page, inode);
> +	if (unlikely(err < 0)) {
>  		unlock_page(page);
> -		err = -EFAULT;
>  		goto out_sem;
>  	}
> +	offset = err;
>  
>  	/* block allocation */
>  	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> @@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	if (PageMappedToDisk(page))
>  		goto out_sem;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
> -						i_size_read(inode)) {
> -		loff_t offset;
> -
> -		offset = i_size_read(inode) & ~PAGE_MASK;
> +	if (offset != PAGE_SIZE)
>  		zero_user_segment(page, offset, PAGE_SIZE);
> -	}
>  	set_page_dirty(page);
>  	if (!PageUptodate(page))
>  		SetPageUptodate(page);
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index d33c7bc5ee92..1aaf157fd6e9 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	unsigned long length;
> -	loff_t offset, size;
> +	loff_t offset;
>  	ssize_t ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	offset = page_offset(page);
> -	if (page->mapping != inode->i_mapping || offset > size) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (offset > size - PAGE_SIZE)
> -		length = offset_in_page(size);
> -	else
> -		length = PAGE_SIZE;
> +	length = ret;
>  
> +	offset = page_offset(page);
>  	while (length > 0) {
>  		ret = iomap_apply(inode, offset, length,
>  				IOMAP_WRITE | IOMAP_FAULT, ops, page,
> diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
> index cd52585c8f4f..91f7a1f2db0d 100644
> --- a/fs/ubifs/file.c
> +++ b/fs/ubifs/file.c
> @@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -		     page_offset(page) > i_size_read(inode))) {
> +	if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
>  		/* Page got truncated out from underneath us */
>  		goto sigbus;
>  	}
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 37a4d9e32cd3..ccb14b6a16b5 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
>  			       PAGE_SHIFT;
>  }
>  
> +/**
> + * page_mkwrite_check_truncate - check if page was truncated
> + * @page: the page to check
> + * @inode: the inode to check the page against
> + *
> + * Returns the number of bytes in the page up to EOF,
> + * or -EFAULT if the page was truncated.
> + */
> +static inline int page_mkwrite_check_truncate(struct page *page,
> +					      struct inode *inode)
> +{
> +	loff_t size = i_size_read(inode);
> +	pgoff_t index = size >> PAGE_SHIFT;
> +	int offset = offset_in_page(size);
> +
> +	if (page->mapping != inode->i_mapping)
> +		return -EFAULT;
> +
> +	/* page is wholly inside EOF */
> +	if (page->index < index)
> +		return PAGE_SIZE;
> +	/* page is wholly past EOF */
> +	if (page->index > index || !offset)
> +		return -EFAULT;
> +	/* page is partially inside EOF */
> +	return offset;
> +}
> +
>  #endif /* _LINUX_PAGEMAP_H */
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 18:52 ` Darrick J. Wong
@ 2019-12-18 19:15   ` Andreas Gruenbacher
  2019-12-18 19:23     ` Darrick J. Wong
  2019-12-18 19:21   ` Matthew Wilcox
  1 sibling, 1 reply; 10+ messages in thread
From: Andreas Gruenbacher @ 2019-12-18 19:15 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Alexander Viro, Christoph Hellwig, Linus Torvalds, LKML,
	Jeff Layton, Sage Weil, Ilya Dryomov, Theodore Ts'o,
	Andreas Dilger, Jaegeuk Kim, Chao Yu, linux-xfs, linux-fsdevel,
	Richard Weinberger, Artem Bityutskiy, Adrian Hunter,
	Ceph Development, linux-ext4, linux-f2fs-devel, linux-mtd,
	Chris Mason, Josef Bacik, David Sterba, linux-btrfs, Jan Kara

On Wed, Dec 18, 2019 at 7:55 PM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Wed, Dec 18, 2019 at 02:09:35PM +0100, Andreas Gruenbacher wrote:
> > Hi Darrick,
> >
> > can this fix go in via the xfs tree?
>
> Er, I'd rather not touch five other filesystems via the XFS tree.
> However, a more immediate problem that I think I see is...
>
> > Thanks,
> > Andreas
> >
> > --
> >
> > The check in block_page_mkwrite that is meant to determine whether an
> > offset is within the inode size is off by one.  This bug has been copied
> > into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
> > ceph).
> >
> > Fix that by introducing a new page_mkwrite_check_truncate helper that
> > checks for truncate and computes the bytes in the page up to EOF.  Use
> > the helper in the above mentioned filesystems.
> >
> > In addition, use the new helper in btrfs as well.
> >
> > Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> > Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
> > Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)
> > ---
> >  fs/btrfs/inode.c        | 15 ++++-----------
> >  fs/buffer.c             | 16 +++-------------
> >  fs/ceph/addr.c          |  2 +-
> >  fs/ext4/inode.c         | 14 ++++----------
> >  fs/f2fs/file.c          | 19 +++++++------------
> >  fs/iomap/buffered-io.c  | 18 +++++-------------
> >  fs/ubifs/file.c         |  3 +--
> >  include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
> >  8 files changed, 53 insertions(+), 62 deletions(-)
> >
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 56032c518b26..86c6fcd8139d 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >       ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
> >  again:
> >       lock_page(page);
> > -     size = i_size_read(inode);
> >
> > -     if ((page->mapping != inode->i_mapping) ||
> > -         (page_start >= size)) {
> > -             /* page got truncated out from underneath us */
> > +     ret2 = page_mkwrite_check_truncate(page, inode);
> > +     if (ret2 < 0)
> >               goto out_unlock;
>
> ...here we try to return -EFAULT as vm_fault_t.  Notice how btrfs returns
> VM_FAULT_* values directly and never calls block_page_mkwrite_return?  I
> know dsterba acked this, but I cannot see how this is correct?

Well, page_mkwrite_check_truncate can only fail with -EFAULT, in which
case btrfs_page_mkwrite will return VM_FAULT_NOPAGE. It would be
cleaner not to discard page_mkwrite_check_truncate's return value
though.

> > -     }
> > +     zero_start = ret2;
> >       wait_on_page_writeback(page);
> >
> >       lock_extent_bits(io_tree, page_start, page_end, &cached_state);
> > @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >               goto again;
> >       }
> >
> > +     size = i_size_read(inode);
> >       if (page->index == ((size - 1) >> PAGE_SHIFT)) {
> >               reserved_space = round_up(size - page_start,
> >                                         fs_info->sectorsize);
> > @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >       }
> >       ret2 = 0;
> >
> > -     /* page is wholly or partially inside EOF */
> > -     if (page_start + PAGE_SIZE > size)
> > -             zero_start = offset_in_page(size);
> > -     else
> > -             zero_start = PAGE_SIZE;
> > -
> >       if (zero_start != PAGE_SIZE) {
> >               kaddr = kmap(page);
> >               memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
> > diff --git a/fs/buffer.c b/fs/buffer.c
> > index d8c7242426bb..53aabde57ca7 100644
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
> >       struct page *page = vmf->page;
> >       struct inode *inode = file_inode(vma->vm_file);
> >       unsigned long end;
> > -     loff_t size;
> >       int ret;
> >
> >       lock_page(page);
> > -     size = i_size_read(inode);
> > -     if ((page->mapping != inode->i_mapping) ||
> > -         (page_offset(page) > size)) {
> > -             /* We overload EFAULT to mean page got truncated */
> > -             ret = -EFAULT;
> > +     ret = page_mkwrite_check_truncate(page, inode);
> > +     if (ret < 0)
> >               goto out_unlock;
> > -     }
> > -
> > -     /* page is wholly or partially inside EOF */
> > -     if (((page->index + 1) << PAGE_SHIFT) > size)
> > -             end = size & ~PAGE_MASK;
> > -     else
> > -             end = PAGE_SIZE;
> > +     end = ret;
> >
> >       ret = __block_write_begin(page, 0, end, get_block);
> >       if (!ret)
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > index 7ab616601141..ef958aa4adb4 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
> >       do {
> >               lock_page(page);
> >
> > -             if ((off > size) || (page->mapping != inode->i_mapping)) {
> > +             if (page_mkwrite_check_truncate(page, inode) < 0) {
> >                       unlock_page(page);
> >                       ret = VM_FAULT_NOPAGE;
> >                       break;
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 28f28de0c1b6..51ab1d2cac80 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
> >  {
> >       struct vm_area_struct *vma = vmf->vma;
> >       struct page *page = vmf->page;
> > -     loff_t size;
> >       unsigned long len;
> >       int err;
> >       vm_fault_t ret;
> > @@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
> >       }
> >
> >       lock_page(page);
> > -     size = i_size_read(inode);
> > -     /* Page got truncated from under us? */
> > -     if (page->mapping != mapping || page_offset(page) > size) {
> > +     err = page_mkwrite_check_truncate(page, inode);
> > +     if (err < 0) {
> >               unlock_page(page);
> > -             ret = VM_FAULT_NOPAGE;
> > -             goto out;
> > +             goto out_ret;
> >       }
> > +     len = err;
> >
> > -     if (page->index == size >> PAGE_SHIFT)
> > -             len = size & ~PAGE_MASK;
> > -     else
> > -             len = PAGE_SIZE;
> >       /*
> >        * Return if we have all the buffers mapped. This avoids the need to do
> >        * journal_start/journal_stop which can block and take a long time
> > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > index 85af112e868d..0e77b2e6f873 100644
> > --- a/fs/f2fs/file.c
> > +++ b/fs/f2fs/file.c
> > @@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
> >       struct inode *inode = file_inode(vmf->vma->vm_file);
> >       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> >       struct dnode_of_data dn = { .node_changed = false };
> > -     int err;
> > +     int offset, err;
> >
> >       if (unlikely(f2fs_cp_error(sbi))) {
> >               err = -EIO;
> > @@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
> >       file_update_time(vmf->vma->vm_file);
> >       down_read(&F2FS_I(inode)->i_mmap_sem);
> >       lock_page(page);
> > -     if (unlikely(page->mapping != inode->i_mapping ||
> > -                     page_offset(page) > i_size_read(inode) ||
> > -                     !PageUptodate(page))) {
> > +     err = -EFAULT;
> > +     if (likely(PageUptodate(page)))
> > +             err = page_mkwrite_check_truncate(page, inode);
> > +     if (unlikely(err < 0)) {
> >               unlock_page(page);
> > -             err = -EFAULT;
> >               goto out_sem;
> >       }
> > +     offset = err;
> >
> >       /* block allocation */
> >       __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> > @@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
> >       if (PageMappedToDisk(page))
> >               goto out_sem;
> >
> > -     /* page is wholly or partially inside EOF */
> > -     if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
> > -                                             i_size_read(inode)) {
> > -             loff_t offset;
> > -
> > -             offset = i_size_read(inode) & ~PAGE_MASK;
> > +     if (offset != PAGE_SIZE)
> >               zero_user_segment(page, offset, PAGE_SIZE);
> > -     }
> >       set_page_dirty(page);
> >       if (!PageUptodate(page))
> >               SetPageUptodate(page);
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > index d33c7bc5ee92..1aaf157fd6e9 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
> >       struct page *page = vmf->page;
> >       struct inode *inode = file_inode(vmf->vma->vm_file);
> >       unsigned long length;
> > -     loff_t offset, size;
> > +     loff_t offset;
> >       ssize_t ret;
> >
> >       lock_page(page);
> > -     size = i_size_read(inode);
> > -     offset = page_offset(page);
> > -     if (page->mapping != inode->i_mapping || offset > size) {
> > -             /* We overload EFAULT to mean page got truncated */
> > -             ret = -EFAULT;
> > +     ret = page_mkwrite_check_truncate(page, inode);
> > +     if (ret < 0)
> >               goto out_unlock;
> > -     }
> > -
> > -     /* page is wholly or partially inside EOF */
> > -     if (offset > size - PAGE_SIZE)
> > -             length = offset_in_page(size);
> > -     else
> > -             length = PAGE_SIZE;
> > +     length = ret;
> >
> > +     offset = page_offset(page);
> >       while (length > 0) {
> >               ret = iomap_apply(inode, offset, length,
> >                               IOMAP_WRITE | IOMAP_FAULT, ops, page,
> > diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
> > index cd52585c8f4f..91f7a1f2db0d 100644
> > --- a/fs/ubifs/file.c
> > +++ b/fs/ubifs/file.c
> > @@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
> >       }
> >
> >       lock_page(page);
> > -     if (unlikely(page->mapping != inode->i_mapping ||
> > -                  page_offset(page) > i_size_read(inode))) {
> > +     if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
> >               /* Page got truncated out from underneath us */
> >               goto sigbus;
> >       }
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index 37a4d9e32cd3..ccb14b6a16b5 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
> >                              PAGE_SHIFT;
> >  }
> >
> > +/**
> > + * page_mkwrite_check_truncate - check if page was truncated
> > + * @page: the page to check
> > + * @inode: the inode to check the page against
> > + *
> > + * Returns the number of bytes in the page up to EOF,
> > + * or -EFAULT if the page was truncated.
> > + */
> > +static inline int page_mkwrite_check_truncate(struct page *page,
> > +                                           struct inode *inode)
> > +{
> > +     loff_t size = i_size_read(inode);
> > +     pgoff_t index = size >> PAGE_SHIFT;
> > +     int offset = offset_in_page(size);
> > +
> > +     if (page->mapping != inode->i_mapping)
> > +             return -EFAULT;
> > +
> > +     /* page is wholly inside EOF */
> > +     if (page->index < index)
> > +             return PAGE_SIZE;
> > +     /* page is wholly past EOF */
> > +     if (page->index > index || !offset)
> > +             return -EFAULT;
> > +     /* page is partially inside EOF */
> > +     return offset;
> > +}
> > +
> >  #endif /* _LINUX_PAGEMAP_H */
> > --
> > 2.20.1
> >
>


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 18:52 ` Darrick J. Wong
  2019-12-18 19:15   ` Andreas Gruenbacher
@ 2019-12-18 19:21   ` Matthew Wilcox
  1 sibling, 0 replies; 10+ messages in thread
From: Matthew Wilcox @ 2019-12-18 19:21 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Andreas Gruenbacher, Alexander Viro, Christoph Hellwig,
	Linus Torvalds, linux-kernel, Jeff Layton, Sage Weil,
	Ilya Dryomov, Theodore Ts'o, Andreas Dilger, Jaegeuk Kim,
	Chao Yu, linux-xfs, linux-fsdevel, Richard Weinberger,
	Artem Bityutskiy, Adrian Hunter, ceph-devel, linux-ext4,
	linux-f2fs-devel, linux-mtd, Chris Mason, Josef Bacik,
	David Sterba, linux-btrfs, Jan Kara

On Wed, Dec 18, 2019 at 10:52:16AM -0800, Darrick J. Wong wrote:
> > @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
> >  again:
> >  	lock_page(page);
> > -	size = i_size_read(inode);
> >  
> > -	if ((page->mapping != inode->i_mapping) ||
> > -	    (page_start >= size)) {
> > -		/* page got truncated out from underneath us */
> > +	ret2 = page_mkwrite_check_truncate(page, inode);
> > +	if (ret2 < 0)
> >  		goto out_unlock;
> 
> ...here we try to return -EFAULT as vm_fault_t.  Notice how btrfs returns
> VM_FAULT_* values directly and never calls block_page_mkwrite_return?  I
> know dsterba acked this, but I cannot see how this is correct?

I think you misread it.  'ret2' is never returned; we'll end up returning
VM_FAULT_NOPAGE here.  Arguably it should be SIGBUS or something, but
I think retrying the fault will also end up giving a SIGBUS.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 19:15   ` Andreas Gruenbacher
@ 2019-12-18 19:23     ` Darrick J. Wong
  2019-12-22  1:59       ` Theodore Y. Ts'o
  0 siblings, 1 reply; 10+ messages in thread
From: Darrick J. Wong @ 2019-12-18 19:23 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Alexander Viro, Christoph Hellwig, Linus Torvalds, LKML,
	Jeff Layton, Sage Weil, Ilya Dryomov, Theodore Ts'o,
	Andreas Dilger, Jaegeuk Kim, Chao Yu, linux-xfs, linux-fsdevel,
	Richard Weinberger, Artem Bityutskiy, Adrian Hunter,
	Ceph Development, linux-ext4, linux-f2fs-devel, linux-mtd,
	Chris Mason, Josef Bacik, David Sterba, linux-btrfs, Jan Kara

On Wed, Dec 18, 2019 at 08:15:36PM +0100, Andreas Gruenbacher wrote:
> On Wed, Dec 18, 2019 at 7:55 PM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Wed, Dec 18, 2019 at 02:09:35PM +0100, Andreas Gruenbacher wrote:
> > > Hi Darrick,
> > >
> > > can this fix go in via the xfs tree?
> >
> > Er, I'd rather not touch five other filesystems via the XFS tree.
> > However, a more immediate problem that I think I see is...
> >
> > > Thanks,
> > > Andreas
> > >
> > > --
> > >
> > > The check in block_page_mkwrite that is meant to determine whether an
> > > offset is within the inode size is off by one.  This bug has been copied
> > > into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
> > > ceph).
> > >
> > > Fix that by introducing a new page_mkwrite_check_truncate helper that
> > > checks for truncate and computes the bytes in the page up to EOF.  Use
> > > the helper in the above mentioned filesystems.
> > >
> > > In addition, use the new helper in btrfs as well.
> > >
> > > Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> > > Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
> > > Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)
> > > ---
> > >  fs/btrfs/inode.c        | 15 ++++-----------
> > >  fs/buffer.c             | 16 +++-------------
> > >  fs/ceph/addr.c          |  2 +-
> > >  fs/ext4/inode.c         | 14 ++++----------
> > >  fs/f2fs/file.c          | 19 +++++++------------
> > >  fs/iomap/buffered-io.c  | 18 +++++-------------
> > >  fs/ubifs/file.c         |  3 +--
> > >  include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
> > >  8 files changed, 53 insertions(+), 62 deletions(-)
> > >
> > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > > index 56032c518b26..86c6fcd8139d 100644
> > > --- a/fs/btrfs/inode.c
> > > +++ b/fs/btrfs/inode.c
> > > @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> > >       ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
> > >  again:
> > >       lock_page(page);
> > > -     size = i_size_read(inode);
> > >
> > > -     if ((page->mapping != inode->i_mapping) ||
> > > -         (page_start >= size)) {
> > > -             /* page got truncated out from underneath us */
> > > +     ret2 = page_mkwrite_check_truncate(page, inode);
> > > +     if (ret2 < 0)
> > >               goto out_unlock;
> >
> > ...here we try to return -EFAULT as vm_fault_t.  Notice how btrfs returns
> > VM_FAULT_* values directly and never calls block_page_mkwrite_return?  I
> > know dsterba acked this, but I cannot see how this is correct?
> 
> Well, page_mkwrite_check_truncate can only fail with -EFAULT, in which
> case btrfs_page_mkwrite will return VM_FAULT_NOPAGE. It would be
> cleaner not to discard page_mkwrite_check_truncate's return value
> though.

*OH*, because we're stuffing the value in ret2, not ret.  Ok, that makes
more sense.  Er, I guess I don't mind pushing via iomap tree, but could
we get some acks from Ted and any of the ceph maintainers?

--D

> > > -     }
> > > +     zero_start = ret2;
> > >       wait_on_page_writeback(page);
> > >
> > >       lock_extent_bits(io_tree, page_start, page_end, &cached_state);
> > > @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> > >               goto again;
> > >       }
> > >
> > > +     size = i_size_read(inode);
> > >       if (page->index == ((size - 1) >> PAGE_SHIFT)) {
> > >               reserved_space = round_up(size - page_start,
> > >                                         fs_info->sectorsize);
> > > @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> > >       }
> > >       ret2 = 0;
> > >
> > > -     /* page is wholly or partially inside EOF */
> > > -     if (page_start + PAGE_SIZE > size)
> > > -             zero_start = offset_in_page(size);
> > > -     else
> > > -             zero_start = PAGE_SIZE;
> > > -
> > >       if (zero_start != PAGE_SIZE) {
> > >               kaddr = kmap(page);
> > >               memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
> > > diff --git a/fs/buffer.c b/fs/buffer.c
> > > index d8c7242426bb..53aabde57ca7 100644
> > > --- a/fs/buffer.c
> > > +++ b/fs/buffer.c
> > > @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
> > >       struct page *page = vmf->page;
> > >       struct inode *inode = file_inode(vma->vm_file);
> > >       unsigned long end;
> > > -     loff_t size;
> > >       int ret;
> > >
> > >       lock_page(page);
> > > -     size = i_size_read(inode);
> > > -     if ((page->mapping != inode->i_mapping) ||
> > > -         (page_offset(page) > size)) {
> > > -             /* We overload EFAULT to mean page got truncated */
> > > -             ret = -EFAULT;
> > > +     ret = page_mkwrite_check_truncate(page, inode);
> > > +     if (ret < 0)
> > >               goto out_unlock;
> > > -     }
> > > -
> > > -     /* page is wholly or partially inside EOF */
> > > -     if (((page->index + 1) << PAGE_SHIFT) > size)
> > > -             end = size & ~PAGE_MASK;
> > > -     else
> > > -             end = PAGE_SIZE;
> > > +     end = ret;
> > >
> > >       ret = __block_write_begin(page, 0, end, get_block);
> > >       if (!ret)
> > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > > index 7ab616601141..ef958aa4adb4 100644
> > > --- a/fs/ceph/addr.c
> > > +++ b/fs/ceph/addr.c
> > > @@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
> > >       do {
> > >               lock_page(page);
> > >
> > > -             if ((off > size) || (page->mapping != inode->i_mapping)) {
> > > +             if (page_mkwrite_check_truncate(page, inode) < 0) {
> > >                       unlock_page(page);
> > >                       ret = VM_FAULT_NOPAGE;
> > >                       break;
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 28f28de0c1b6..51ab1d2cac80 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
> > >  {
> > >       struct vm_area_struct *vma = vmf->vma;
> > >       struct page *page = vmf->page;
> > > -     loff_t size;
> > >       unsigned long len;
> > >       int err;
> > >       vm_fault_t ret;
> > > @@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
> > >       }
> > >
> > >       lock_page(page);
> > > -     size = i_size_read(inode);
> > > -     /* Page got truncated from under us? */
> > > -     if (page->mapping != mapping || page_offset(page) > size) {
> > > +     err = page_mkwrite_check_truncate(page, inode);
> > > +     if (err < 0) {
> > >               unlock_page(page);
> > > -             ret = VM_FAULT_NOPAGE;
> > > -             goto out;
> > > +             goto out_ret;
> > >       }
> > > +     len = err;
> > >
> > > -     if (page->index == size >> PAGE_SHIFT)
> > > -             len = size & ~PAGE_MASK;
> > > -     else
> > > -             len = PAGE_SIZE;
> > >       /*
> > >        * Return if we have all the buffers mapped. This avoids the need to do
> > >        * journal_start/journal_stop which can block and take a long time
> > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > index 85af112e868d..0e77b2e6f873 100644
> > > --- a/fs/f2fs/file.c
> > > +++ b/fs/f2fs/file.c
> > > @@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
> > >       struct inode *inode = file_inode(vmf->vma->vm_file);
> > >       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > >       struct dnode_of_data dn = { .node_changed = false };
> > > -     int err;
> > > +     int offset, err;
> > >
> > >       if (unlikely(f2fs_cp_error(sbi))) {
> > >               err = -EIO;
> > > @@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
> > >       file_update_time(vmf->vma->vm_file);
> > >       down_read(&F2FS_I(inode)->i_mmap_sem);
> > >       lock_page(page);
> > > -     if (unlikely(page->mapping != inode->i_mapping ||
> > > -                     page_offset(page) > i_size_read(inode) ||
> > > -                     !PageUptodate(page))) {
> > > +     err = -EFAULT;
> > > +     if (likely(PageUptodate(page)))
> > > +             err = page_mkwrite_check_truncate(page, inode);
> > > +     if (unlikely(err < 0)) {
> > >               unlock_page(page);
> > > -             err = -EFAULT;
> > >               goto out_sem;
> > >       }
> > > +     offset = err;
> > >
> > >       /* block allocation */
> > >       __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> > > @@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
> > >       if (PageMappedToDisk(page))
> > >               goto out_sem;
> > >
> > > -     /* page is wholly or partially inside EOF */
> > > -     if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
> > > -                                             i_size_read(inode)) {
> > > -             loff_t offset;
> > > -
> > > -             offset = i_size_read(inode) & ~PAGE_MASK;
> > > +     if (offset != PAGE_SIZE)
> > >               zero_user_segment(page, offset, PAGE_SIZE);
> > > -     }
> > >       set_page_dirty(page);
> > >       if (!PageUptodate(page))
> > >               SetPageUptodate(page);
> > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > > index d33c7bc5ee92..1aaf157fd6e9 100644
> > > --- a/fs/iomap/buffered-io.c
> > > +++ b/fs/iomap/buffered-io.c
> > > @@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
> > >       struct page *page = vmf->page;
> > >       struct inode *inode = file_inode(vmf->vma->vm_file);
> > >       unsigned long length;
> > > -     loff_t offset, size;
> > > +     loff_t offset;
> > >       ssize_t ret;
> > >
> > >       lock_page(page);
> > > -     size = i_size_read(inode);
> > > -     offset = page_offset(page);
> > > -     if (page->mapping != inode->i_mapping || offset > size) {
> > > -             /* We overload EFAULT to mean page got truncated */
> > > -             ret = -EFAULT;
> > > +     ret = page_mkwrite_check_truncate(page, inode);
> > > +     if (ret < 0)
> > >               goto out_unlock;
> > > -     }
> > > -
> > > -     /* page is wholly or partially inside EOF */
> > > -     if (offset > size - PAGE_SIZE)
> > > -             length = offset_in_page(size);
> > > -     else
> > > -             length = PAGE_SIZE;
> > > +     length = ret;
> > >
> > > +     offset = page_offset(page);
> > >       while (length > 0) {
> > >               ret = iomap_apply(inode, offset, length,
> > >                               IOMAP_WRITE | IOMAP_FAULT, ops, page,
> > > diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
> > > index cd52585c8f4f..91f7a1f2db0d 100644
> > > --- a/fs/ubifs/file.c
> > > +++ b/fs/ubifs/file.c
> > > @@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
> > >       }
> > >
> > >       lock_page(page);
> > > -     if (unlikely(page->mapping != inode->i_mapping ||
> > > -                  page_offset(page) > i_size_read(inode))) {
> > > +     if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
> > >               /* Page got truncated out from underneath us */
> > >               goto sigbus;
> > >       }
> > > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > > index 37a4d9e32cd3..ccb14b6a16b5 100644
> > > --- a/include/linux/pagemap.h
> > > +++ b/include/linux/pagemap.h
> > > @@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
> > >                              PAGE_SHIFT;
> > >  }
> > >
> > > +/**
> > > + * page_mkwrite_check_truncate - check if page was truncated
> > > + * @page: the page to check
> > > + * @inode: the inode to check the page against
> > > + *
> > > + * Returns the number of bytes in the page up to EOF,
> > > + * or -EFAULT if the page was truncated.
> > > + */
> > > +static inline int page_mkwrite_check_truncate(struct page *page,
> > > +                                           struct inode *inode)
> > > +{
> > > +     loff_t size = i_size_read(inode);
> > > +     pgoff_t index = size >> PAGE_SHIFT;
> > > +     int offset = offset_in_page(size);
> > > +
> > > +     if (page->mapping != inode->i_mapping)
> > > +             return -EFAULT;
> > > +
> > > +     /* page is wholly inside EOF */
> > > +     if (page->index < index)
> > > +             return PAGE_SIZE;
> > > +     /* page is wholly past EOF */
> > > +     if (page->index > index || !offset)
> > > +             return -EFAULT;
> > > +     /* page is partially inside EOF */
> > > +     return offset;
> > > +}
> > > +
> > >  #endif /* _LINUX_PAGEMAP_H */
> > > --
> > > 2.20.1
> > >
> >
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 19:23     ` Darrick J. Wong
@ 2019-12-22  1:59       ` Theodore Y. Ts'o
  0 siblings, 0 replies; 10+ messages in thread
From: Theodore Y. Ts'o @ 2019-12-22  1:59 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Andreas Gruenbacher, Alexander Viro, Christoph Hellwig,
	Linus Torvalds, LKML, Jeff Layton, Sage Weil, Ilya Dryomov,
	Andreas Dilger, Jaegeuk Kim, Chao Yu, linux-xfs, linux-fsdevel,
	Richard Weinberger, Artem Bityutskiy, Adrian Hunter,
	Ceph Development, linux-ext4, linux-f2fs-devel, linux-mtd,
	Chris Mason, Josef Bacik, David Sterba, linux-btrfs, Jan Kara

On Wed, Dec 18, 2019 at 11:23:31AM -0800, Darrick J. Wong wrote:
> *OH*, because we're stuffing the value in ret2, not ret.  Ok, that makes
> more sense.  Er, I guess I don't mind pushing via iomap tree, but could
> we get some acks from Ted and any of the ceph maintainers?

Acked-by: Theodore Ts'o <tytso@mit.edu>

My only nit is the same one Jan raised: should
page_mkwrite_check_truncate() be an inline function?

			      	    - Ted

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 13:09 [PATCH v3] fs: Fix page_mkwrite off-by-one errors Andreas Gruenbacher
  2019-12-18 14:10 ` Jan Kara
  2019-12-18 18:52 ` Darrick J. Wong
@ 2020-01-07 23:20 ` Darrick J. Wong
  2020-01-08  9:09   ` Chao Yu
  2020-01-08 11:51 ` Jaegeuk Kim
  3 siblings, 1 reply; 10+ messages in thread
From: Darrick J. Wong @ 2020-01-07 23:20 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Alexander Viro, Christoph Hellwig, Linus Torvalds, linux-kernel,
	Jeff Layton, Sage Weil, Ilya Dryomov, Theodore Ts'o,
	Andreas Dilger, Jaegeuk Kim, Chao Yu, linux-xfs, linux-fsdevel,
	Richard Weinberger, Artem Bityutskiy, Adrian Hunter, ceph-devel,
	linux-ext4, linux-f2fs-devel, linux-mtd, Chris Mason,
	Josef Bacik, David Sterba, linux-btrfs, Jan Kara

On Wed, Dec 18, 2019 at 02:09:35PM +0100, Andreas Gruenbacher wrote:
> Hi Darrick,
> 
> can this fix go in via the xfs tree?
> 
> Thanks,
> Andreas
> 
> --
> 
> The check in block_page_mkwrite that is meant to determine whether an
> offset is within the inode size is off by one.  This bug has been copied
> into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
> ceph).
> 
> Fix that by introducing a new page_mkwrite_check_truncate helper that
> checks for truncate and computes the bytes in the page up to EOF.  Use
> the helper in the above mentioned filesystems.
> 
> In addition, use the new helper in btrfs as well.
> 
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
> Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)
> ---
>  fs/btrfs/inode.c        | 15 ++++-----------
>  fs/buffer.c             | 16 +++-------------
>  fs/ceph/addr.c          |  2 +-
>  fs/ext4/inode.c         | 14 ++++----------
>  fs/f2fs/file.c          | 19 +++++++------------

Well, the f2fs developers never acked this and there was a conflict when
I put this into for-next, so I removed the f2fs part (and fixed the
unused variable warning in the ext4 part)...

--D

>  fs/iomap/buffered-io.c  | 18 +++++-------------
>  fs/ubifs/file.c         |  3 +--
>  include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
>  8 files changed, 53 insertions(+), 62 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 56032c518b26..86c6fcd8139d 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
>  again:
>  	lock_page(page);
> -	size = i_size_read(inode);
>  
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_start >= size)) {
> -		/* page got truncated out from underneath us */
> +	ret2 = page_mkwrite_check_truncate(page, inode);
> +	if (ret2 < 0)
>  		goto out_unlock;
> -	}
> +	zero_start = ret2;
>  	wait_on_page_writeback(page);
>  
>  	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
> @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  		goto again;
>  	}
>  
> +	size = i_size_read(inode);
>  	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
>  		reserved_space = round_up(size - page_start,
>  					  fs_info->sectorsize);
> @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	}
>  	ret2 = 0;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (page_start + PAGE_SIZE > size)
> -		zero_start = offset_in_page(size);
> -	else
> -		zero_start = PAGE_SIZE;
> -
>  	if (zero_start != PAGE_SIZE) {
>  		kaddr = kmap(page);
>  		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
> diff --git a/fs/buffer.c b/fs/buffer.c
> index d8c7242426bb..53aabde57ca7 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vma->vm_file);
>  	unsigned long end;
> -	loff_t size;
>  	int ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_offset(page) > size)) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (((page->index + 1) << PAGE_SHIFT) > size)
> -		end = size & ~PAGE_MASK;
> -	else
> -		end = PAGE_SIZE;
> +	end = ret;
>  
>  	ret = __block_write_begin(page, 0, end, get_block);
>  	if (!ret)
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 7ab616601141..ef958aa4adb4 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  	do {
>  		lock_page(page);
>  
> -		if ((off > size) || (page->mapping != inode->i_mapping)) {
> +		if (page_mkwrite_check_truncate(page, inode) < 0) {
>  			unlock_page(page);
>  			ret = VM_FAULT_NOPAGE;
>  			break;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 28f28de0c1b6..51ab1d2cac80 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = vmf->page;
> -	loff_t size;
>  	unsigned long len;
>  	int err;
>  	vm_fault_t ret;
> @@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	/* Page got truncated from under us? */
> -	if (page->mapping != mapping || page_offset(page) > size) {
> +	err = page_mkwrite_check_truncate(page, inode);
> +	if (err < 0) {
>  		unlock_page(page);
> -		ret = VM_FAULT_NOPAGE;
> -		goto out;
> +		goto out_ret;
>  	}
> +	len = err;
>  
> -	if (page->index == size >> PAGE_SHIFT)
> -		len = size & ~PAGE_MASK;
> -	else
> -		len = PAGE_SIZE;
>  	/*
>  	 * Return if we have all the buffers mapped. This avoids the need to do
>  	 * journal_start/journal_stop which can block and take a long time
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 85af112e868d..0e77b2e6f873 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>  	struct dnode_of_data dn = { .node_changed = false };
> -	int err;
> +	int offset, err;
>  
>  	if (unlikely(f2fs_cp_error(sbi))) {
>  		err = -EIO;
> @@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	file_update_time(vmf->vma->vm_file);
>  	down_read(&F2FS_I(inode)->i_mmap_sem);
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -			page_offset(page) > i_size_read(inode) ||
> -			!PageUptodate(page))) {
> +	err = -EFAULT;
> +	if (likely(PageUptodate(page)))
> +		err = page_mkwrite_check_truncate(page, inode);
> +	if (unlikely(err < 0)) {
>  		unlock_page(page);
> -		err = -EFAULT;
>  		goto out_sem;
>  	}
> +	offset = err;
>  
>  	/* block allocation */
>  	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> @@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	if (PageMappedToDisk(page))
>  		goto out_sem;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
> -						i_size_read(inode)) {
> -		loff_t offset;
> -
> -		offset = i_size_read(inode) & ~PAGE_MASK;
> +	if (offset != PAGE_SIZE)
>  		zero_user_segment(page, offset, PAGE_SIZE);
> -	}
>  	set_page_dirty(page);
>  	if (!PageUptodate(page))
>  		SetPageUptodate(page);
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index d33c7bc5ee92..1aaf157fd6e9 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	unsigned long length;
> -	loff_t offset, size;
> +	loff_t offset;
>  	ssize_t ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	offset = page_offset(page);
> -	if (page->mapping != inode->i_mapping || offset > size) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (offset > size - PAGE_SIZE)
> -		length = offset_in_page(size);
> -	else
> -		length = PAGE_SIZE;
> +	length = ret;
>  
> +	offset = page_offset(page);
>  	while (length > 0) {
>  		ret = iomap_apply(inode, offset, length,
>  				IOMAP_WRITE | IOMAP_FAULT, ops, page,
> diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
> index cd52585c8f4f..91f7a1f2db0d 100644
> --- a/fs/ubifs/file.c
> +++ b/fs/ubifs/file.c
> @@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -		     page_offset(page) > i_size_read(inode))) {
> +	if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
>  		/* Page got truncated out from underneath us */
>  		goto sigbus;
>  	}
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 37a4d9e32cd3..ccb14b6a16b5 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
>  			       PAGE_SHIFT;
>  }
>  
> +/**
> + * page_mkwrite_check_truncate - check if page was truncated
> + * @page: the page to check
> + * @inode: the inode to check the page against
> + *
> + * Returns the number of bytes in the page up to EOF,
> + * or -EFAULT if the page was truncated.
> + */
> +static inline int page_mkwrite_check_truncate(struct page *page,
> +					      struct inode *inode)
> +{
> +	loff_t size = i_size_read(inode);
> +	pgoff_t index = size >> PAGE_SHIFT;
> +	int offset = offset_in_page(size);
> +
> +	if (page->mapping != inode->i_mapping)
> +		return -EFAULT;
> +
> +	/* page is wholly inside EOF */
> +	if (page->index < index)
> +		return PAGE_SIZE;
> +	/* page is wholly past EOF */
> +	if (page->index > index || !offset)
> +		return -EFAULT;
> +	/* page is partially inside EOF */
> +	return offset;
> +}
> +
>  #endif /* _LINUX_PAGEMAP_H */
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2020-01-07 23:20 ` Darrick J. Wong
@ 2020-01-08  9:09   ` Chao Yu
  0 siblings, 0 replies; 10+ messages in thread
From: Chao Yu @ 2020-01-08  9:09 UTC (permalink / raw)
  To: Darrick J. Wong, Andreas Gruenbacher
  Cc: Alexander Viro, Christoph Hellwig, Linus Torvalds, linux-kernel,
	Jeff Layton, Sage Weil, Ilya Dryomov, Theodore Ts'o,
	Andreas Dilger, Jaegeuk Kim, Chao Yu, linux-xfs, linux-fsdevel,
	Richard Weinberger, Artem Bityutskiy, Adrian Hunter, ceph-devel,
	linux-ext4, linux-f2fs-devel, linux-mtd, Chris Mason,
	Josef Bacik, David Sterba, linux-btrfs, Jan Kara

On 2020/1/8 7:20, Darrick J. Wong wrote:
> On Wed, Dec 18, 2019 at 02:09:35PM +0100, Andreas Gruenbacher wrote:
>> Hi Darrick,
>>
>> can this fix go in via the xfs tree?
>>
>> Thanks,
>> Andreas
>>
>> --
>>
>> The check in block_page_mkwrite that is meant to determine whether an
>> offset is within the inode size is off by one.  This bug has been copied
>> into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
>> ceph).
>>
>> Fix that by introducing a new page_mkwrite_check_truncate helper that
>> checks for truncate and computes the bytes in the page up to EOF.  Use
>> the helper in the above mentioned filesystems.
>>
>> In addition, use the new helper in btrfs as well.
>>
>> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
>> Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
>> Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)
>> ---
>>  fs/btrfs/inode.c        | 15 ++++-----------
>>  fs/buffer.c             | 16 +++-------------
>>  fs/ceph/addr.c          |  2 +-
>>  fs/ext4/inode.c         | 14 ++++----------
>>  fs/f2fs/file.c          | 19 +++++++------------
> 
> Well, the f2fs developers never acked this and there was a conflict when
> I put this into for-next, so I removed the f2fs part (and fixed the
> unused variable warning in the ext4 part)...

Sorry for the late reply.

Acked-by: Chao Yu <yuchao0@huawei.com>

BTW, to avoid such a conflict, does f2fs need to rebase/fix its latest code
on top of the current patch?

Thanks,

> 
> --D
> 
>>  fs/iomap/buffered-io.c  | 18 +++++-------------
>>  fs/ubifs/file.c         |  3 +--
>>  include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
>>  8 files changed, 53 insertions(+), 62 deletions(-)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 56032c518b26..86c6fcd8139d 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>>  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
>>  again:
>>  	lock_page(page);
>> -	size = i_size_read(inode);
>>  
>> -	if ((page->mapping != inode->i_mapping) ||
>> -	    (page_start >= size)) {
>> -		/* page got truncated out from underneath us */
>> +	ret2 = page_mkwrite_check_truncate(page, inode);
>> +	if (ret2 < 0)
>>  		goto out_unlock;
>> -	}
>> +	zero_start = ret2;
>>  	wait_on_page_writeback(page);
>>  
>>  	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
>> @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>>  		goto again;
>>  	}
>>  
>> +	size = i_size_read(inode);
>>  	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
>>  		reserved_space = round_up(size - page_start,
>>  					  fs_info->sectorsize);
>> @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>>  	}
>>  	ret2 = 0;
>>  
>> -	/* page is wholly or partially inside EOF */
>> -	if (page_start + PAGE_SIZE > size)
>> -		zero_start = offset_in_page(size);
>> -	else
>> -		zero_start = PAGE_SIZE;
>> -
>>  	if (zero_start != PAGE_SIZE) {
>>  		kaddr = kmap(page);
>>  		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
>> diff --git a/fs/buffer.c b/fs/buffer.c
>> index d8c7242426bb..53aabde57ca7 100644
>> --- a/fs/buffer.c
>> +++ b/fs/buffer.c
>> @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
>>  	struct page *page = vmf->page;
>>  	struct inode *inode = file_inode(vma->vm_file);
>>  	unsigned long end;
>> -	loff_t size;
>>  	int ret;
>>  
>>  	lock_page(page);
>> -	size = i_size_read(inode);
>> -	if ((page->mapping != inode->i_mapping) ||
>> -	    (page_offset(page) > size)) {
>> -		/* We overload EFAULT to mean page got truncated */
>> -		ret = -EFAULT;
>> +	ret = page_mkwrite_check_truncate(page, inode);
>> +	if (ret < 0)
>>  		goto out_unlock;
>> -	}
>> -
>> -	/* page is wholly or partially inside EOF */
>> -	if (((page->index + 1) << PAGE_SHIFT) > size)
>> -		end = size & ~PAGE_MASK;
>> -	else
>> -		end = PAGE_SIZE;
>> +	end = ret;
>>  
>>  	ret = __block_write_begin(page, 0, end, get_block);
>>  	if (!ret)
>> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>> index 7ab616601141..ef958aa4adb4 100644
>> --- a/fs/ceph/addr.c
>> +++ b/fs/ceph/addr.c
>> @@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>>  	do {
>>  		lock_page(page);
>>  
>> -		if ((off > size) || (page->mapping != inode->i_mapping)) {
>> +		if (page_mkwrite_check_truncate(page, inode) < 0) {
>>  			unlock_page(page);
>>  			ret = VM_FAULT_NOPAGE;
>>  			break;
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 28f28de0c1b6..51ab1d2cac80 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	struct page *page = vmf->page;
>> -	loff_t size;
>>  	unsigned long len;
>>  	int err;
>>  	vm_fault_t ret;
>> @@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>>  	}
>>  
>>  	lock_page(page);
>> -	size = i_size_read(inode);
>> -	/* Page got truncated from under us? */
>> -	if (page->mapping != mapping || page_offset(page) > size) {
>> +	err = page_mkwrite_check_truncate(page, inode);
>> +	if (err < 0) {
>>  		unlock_page(page);
>> -		ret = VM_FAULT_NOPAGE;
>> -		goto out;
>> +		goto out_ret;
>>  	}
>> +	len = err;
>>  
>> -	if (page->index == size >> PAGE_SHIFT)
>> -		len = size & ~PAGE_MASK;
>> -	else
>> -		len = PAGE_SIZE;
>>  	/*
>>  	 * Return if we have all the buffers mapped. This avoids the need to do
>>  	 * journal_start/journal_stop which can block and take a long time
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 85af112e868d..0e77b2e6f873 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>>  	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>>  	struct dnode_of_data dn = { .node_changed = false };
>> -	int err;
>> +	int offset, err;
>>  
>>  	if (unlikely(f2fs_cp_error(sbi))) {
>>  		err = -EIO;
>> @@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>>  	file_update_time(vmf->vma->vm_file);
>>  	down_read(&F2FS_I(inode)->i_mmap_sem);
>>  	lock_page(page);
>> -	if (unlikely(page->mapping != inode->i_mapping ||
>> -			page_offset(page) > i_size_read(inode) ||
>> -			!PageUptodate(page))) {
>> +	err = -EFAULT;
>> +	if (likely(PageUptodate(page)))
>> +		err = page_mkwrite_check_truncate(page, inode);
>> +	if (unlikely(err < 0)) {
>>  		unlock_page(page);
>> -		err = -EFAULT;
>>  		goto out_sem;
>>  	}
>> +	offset = err;
>>  
>>  	/* block allocation */
>>  	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
>> @@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>>  	if (PageMappedToDisk(page))
>>  		goto out_sem;
>>  
>> -	/* page is wholly or partially inside EOF */
>> -	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
>> -						i_size_read(inode)) {
>> -		loff_t offset;
>> -
>> -		offset = i_size_read(inode) & ~PAGE_MASK;
>> +	if (offset != PAGE_SIZE)
>>  		zero_user_segment(page, offset, PAGE_SIZE);
>> -	}
>>  	set_page_dirty(page);
>>  	if (!PageUptodate(page))
>>  		SetPageUptodate(page);
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index d33c7bc5ee92..1aaf157fd6e9 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
>>  	struct page *page = vmf->page;
>>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>>  	unsigned long length;
>> -	loff_t offset, size;
>> +	loff_t offset;
>>  	ssize_t ret;
>>  
>>  	lock_page(page);
>> -	size = i_size_read(inode);
>> -	offset = page_offset(page);
>> -	if (page->mapping != inode->i_mapping || offset > size) {
>> -		/* We overload EFAULT to mean page got truncated */
>> -		ret = -EFAULT;
>> +	ret = page_mkwrite_check_truncate(page, inode);
>> +	if (ret < 0)
>>  		goto out_unlock;
>> -	}
>> -
>> -	/* page is wholly or partially inside EOF */
>> -	if (offset > size - PAGE_SIZE)
>> -		length = offset_in_page(size);
>> -	else
>> -		length = PAGE_SIZE;
>> +	length = ret;
>>  
>> +	offset = page_offset(page);
>>  	while (length > 0) {
>>  		ret = iomap_apply(inode, offset, length,
>>  				IOMAP_WRITE | IOMAP_FAULT, ops, page,
>> diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
>> index cd52585c8f4f..91f7a1f2db0d 100644
>> --- a/fs/ubifs/file.c
>> +++ b/fs/ubifs/file.c
>> @@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
>>  	}
>>  
>>  	lock_page(page);
>> -	if (unlikely(page->mapping != inode->i_mapping ||
>> -		     page_offset(page) > i_size_read(inode))) {
>> +	if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
>>  		/* Page got truncated out from underneath us */
>>  		goto sigbus;
>>  	}
>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>> index 37a4d9e32cd3..ccb14b6a16b5 100644
>> --- a/include/linux/pagemap.h
>> +++ b/include/linux/pagemap.h
>> @@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
>>  			       PAGE_SHIFT;
>>  }
>>  
>> +/**
>> + * page_mkwrite_check_truncate - check if page was truncated
>> + * @page: the page to check
>> + * @inode: the inode to check the page against
>> + *
>> + * Returns the number of bytes in the page up to EOF,
>> + * or -EFAULT if the page was truncated.
>> + */
>> +static inline int page_mkwrite_check_truncate(struct page *page,
>> +					      struct inode *inode)
>> +{
>> +	loff_t size = i_size_read(inode);
>> +	pgoff_t index = size >> PAGE_SHIFT;
>> +	int offset = offset_in_page(size);
>> +
>> +	if (page->mapping != inode->i_mapping)
>> +		return -EFAULT;
>> +
>> +	/* page is wholly inside EOF */
>> +	if (page->index < index)
>> +		return PAGE_SIZE;
>> +	/* page is wholly past EOF */
>> +	if (page->index > index || !offset)
>> +		return -EFAULT;
>> +	/* page is partially inside EOF */
>> +	return offset;
>> +}
>> +
>>  #endif /* _LINUX_PAGEMAP_H */
>> -- 
>> 2.20.1
>>
> .
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] fs: Fix page_mkwrite off-by-one errors
  2019-12-18 13:09 [PATCH v3] fs: Fix page_mkwrite off-by-one errors Andreas Gruenbacher
                   ` (2 preceding siblings ...)
  2020-01-07 23:20 ` Darrick J. Wong
@ 2020-01-08 11:51 ` Jaegeuk Kim
  3 siblings, 0 replies; 10+ messages in thread
From: Jaegeuk Kim @ 2020-01-08 11:51 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Darrick J. Wong, Alexander Viro, Christoph Hellwig,
	Linus Torvalds, linux-kernel, Jeff Layton, Sage Weil,
	Ilya Dryomov, Theodore Ts'o, Andreas Dilger, Chao Yu,
	linux-xfs, linux-fsdevel, Richard Weinberger, Artem Bityutskiy,
	Adrian Hunter, ceph-devel, linux-ext4, linux-f2fs-devel,
	linux-mtd, Chris Mason, Josef Bacik, David Sterba, linux-btrfs,
	Jan Kara

Hi Andreas,

On 12/18, Andreas Gruenbacher wrote:
> Hi Darrick,
> 
> can this fix go in via the xfs tree?
> 
> Thanks,
> Andreas
> 
> --
> 
> The check in block_page_mkwrite that is meant to determine whether an
> offset is within the inode size is off by one.  This bug has been copied
> into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
> ceph).
> 
> Fix that by introducing a new page_mkwrite_check_truncate helper that
> checks for truncate and computes the bytes in the page up to EOF.  Use
> the helper in the above mentioned filesystems.
> 
> In addition, use the new helper in btrfs as well.
> 
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> Acked-by: David Sterba <dsterba@suse.com> (btrfs part)
> Acked-by: Richard Weinberger <richard@nod.at> (ubifs part)
> ---
>  fs/btrfs/inode.c        | 15 ++++-----------
>  fs/buffer.c             | 16 +++-------------
>  fs/ceph/addr.c          |  2 +-
>  fs/ext4/inode.c         | 14 ++++----------
>  fs/f2fs/file.c          | 19 +++++++------------
>  fs/iomap/buffered-io.c  | 18 +++++-------------
>  fs/ubifs/file.c         |  3 +--
>  include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++
>  8 files changed, 53 insertions(+), 62 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 56032c518b26..86c6fcd8139d 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
>  again:
>  	lock_page(page);
> -	size = i_size_read(inode);
>  
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_start >= size)) {
> -		/* page got truncated out from underneath us */
> +	ret2 = page_mkwrite_check_truncate(page, inode);
> +	if (ret2 < 0)
>  		goto out_unlock;
> -	}
> +	zero_start = ret2;
>  	wait_on_page_writeback(page);
>  
>  	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
> @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  		goto again;
>  	}
>  
> +	size = i_size_read(inode);
>  	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
>  		reserved_space = round_up(size - page_start,
>  					  fs_info->sectorsize);
> @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	}
>  	ret2 = 0;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (page_start + PAGE_SIZE > size)
> -		zero_start = offset_in_page(size);
> -	else
> -		zero_start = PAGE_SIZE;
> -
>  	if (zero_start != PAGE_SIZE) {
>  		kaddr = kmap(page);
>  		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
> diff --git a/fs/buffer.c b/fs/buffer.c
> index d8c7242426bb..53aabde57ca7 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vma->vm_file);
>  	unsigned long end;
> -	loff_t size;
>  	int ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	if ((page->mapping != inode->i_mapping) ||
> -	    (page_offset(page) > size)) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (((page->index + 1) << PAGE_SHIFT) > size)
> -		end = size & ~PAGE_MASK;
> -	else
> -		end = PAGE_SIZE;
> +	end = ret;
>  
>  	ret = __block_write_begin(page, 0, end, get_block);
>  	if (!ret)
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 7ab616601141..ef958aa4adb4 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  	do {
>  		lock_page(page);
>  
> -		if ((off > size) || (page->mapping != inode->i_mapping)) {
> +		if (page_mkwrite_check_truncate(page, inode) < 0) {
>  			unlock_page(page);
>  			ret = VM_FAULT_NOPAGE;
>  			break;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 28f28de0c1b6..51ab1d2cac80 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = vmf->page;
> -	loff_t size;
>  	unsigned long len;
>  	int err;
>  	vm_fault_t ret;
> @@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	/* Page got truncated from under us? */
> -	if (page->mapping != mapping || page_offset(page) > size) {
> +	err = page_mkwrite_check_truncate(page, inode);
> +	if (err < 0) {
>  		unlock_page(page);
> -		ret = VM_FAULT_NOPAGE;
> -		goto out;
> +		goto out_ret;
>  	}
> +	len = err;
>  
> -	if (page->index == size >> PAGE_SHIFT)
> -		len = size & ~PAGE_MASK;
> -	else
> -		len = PAGE_SIZE;
>  	/*
>  	 * Return if we have all the buffers mapped. This avoids the need to do
>  	 * journal_start/journal_stop which can block and take a long time
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 85af112e868d..0e77b2e6f873 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -51,7 +51,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>  	struct dnode_of_data dn = { .node_changed = false };
> -	int err;
> +	int offset, err;
>  
>  	if (unlikely(f2fs_cp_error(sbi))) {
>  		err = -EIO;
> @@ -70,13 +70,14 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	file_update_time(vmf->vma->vm_file);
>  	down_read(&F2FS_I(inode)->i_mmap_sem);
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -			page_offset(page) > i_size_read(inode) ||
> -			!PageUptodate(page))) {
> +	err = -EFAULT;
> +	if (likely(PageUptodate(page)))
> +		err = page_mkwrite_check_truncate(page, inode);
> +	if (unlikely(err < 0)) {
>  		unlock_page(page);
> -		err = -EFAULT;
>  		goto out_sem;
>  	}
> +	offset = err;

This is a bit odd, so how about this?

	offset = -EFAULT;
	if (likely(PageUptodate(page)))
		offset = page_mkwrite_check_truncate(page, inode);

	if (unlikely(offset < 0)) {
		unlock_page(page);
		err = offset;
		goto out_sem;
	}

I think Linus will simply resolve the merge conflict later.

Thanks,

>  
>  	/* block allocation */
>  	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> @@ -101,14 +102,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	if (PageMappedToDisk(page))
>  		goto out_sem;
>  
> -	/* page is wholly or partially inside EOF */
> -	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
> -						i_size_read(inode)) {
> -		loff_t offset;
> -
> -		offset = i_size_read(inode) & ~PAGE_MASK;
> +	if (offset != PAGE_SIZE)
>  		zero_user_segment(page, offset, PAGE_SIZE);
> -	}
>  	set_page_dirty(page);
>  	if (!PageUptodate(page))
>  		SetPageUptodate(page);
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index d33c7bc5ee92..1aaf157fd6e9 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1062,24 +1062,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
>  	struct page *page = vmf->page;
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	unsigned long length;
> -	loff_t offset, size;
> +	loff_t offset;
>  	ssize_t ret;
>  
>  	lock_page(page);
> -	size = i_size_read(inode);
> -	offset = page_offset(page);
> -	if (page->mapping != inode->i_mapping || offset > size) {
> -		/* We overload EFAULT to mean page got truncated */
> -		ret = -EFAULT;
> +	ret = page_mkwrite_check_truncate(page, inode);
> +	if (ret < 0)
>  		goto out_unlock;
> -	}
> -
> -	/* page is wholly or partially inside EOF */
> -	if (offset > size - PAGE_SIZE)
> -		length = offset_in_page(size);
> -	else
> -		length = PAGE_SIZE;
> +	length = ret;
>  
> +	offset = page_offset(page);
>  	while (length > 0) {
>  		ret = iomap_apply(inode, offset, length,
>  				IOMAP_WRITE | IOMAP_FAULT, ops, page,
> diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
> index cd52585c8f4f..91f7a1f2db0d 100644
> --- a/fs/ubifs/file.c
> +++ b/fs/ubifs/file.c
> @@ -1563,8 +1563,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
>  	}
>  
>  	lock_page(page);
> -	if (unlikely(page->mapping != inode->i_mapping ||
> -		     page_offset(page) > i_size_read(inode))) {
> +	if (unlikely(page_mkwrite_check_truncate(page, inode) < 0)) {
>  		/* Page got truncated out from underneath us */
>  		goto sigbus;
>  	}
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 37a4d9e32cd3..ccb14b6a16b5 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -636,4 +636,32 @@ static inline unsigned long dir_pages(struct inode *inode)
>  			       PAGE_SHIFT;
>  }
>  
> +/**
> + * page_mkwrite_check_truncate - check if page was truncated
> + * @page: the page to check
> + * @inode: the inode to check the page against
> + *
> + * Returns the number of bytes in the page up to EOF,
> + * or -EFAULT if the page was truncated.
> + */
> +static inline int page_mkwrite_check_truncate(struct page *page,
> +					      struct inode *inode)
> +{
> +	loff_t size = i_size_read(inode);
> +	pgoff_t index = size >> PAGE_SHIFT;
> +	int offset = offset_in_page(size);
> +
> +	if (page->mapping != inode->i_mapping)
> +		return -EFAULT;
> +
> +	/* page is wholly inside EOF */
> +	if (page->index < index)
> +		return PAGE_SIZE;
> +	/* page is wholly past EOF */
> +	if (page->index > index || !offset)
> +		return -EFAULT;
> +	/* page is partially inside EOF */
> +	return offset;
> +}
> +
>  #endif /* _LINUX_PAGEMAP_H */
> -- 
> 2.20.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-01-08 11:51 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-18 13:09 [PATCH v3] fs: Fix page_mkwrite off-by-one errors Andreas Gruenbacher
2019-12-18 14:10 ` Jan Kara
2019-12-18 18:52 ` Darrick J. Wong
2019-12-18 19:15   ` Andreas Gruenbacher
2019-12-18 19:23     ` Darrick J. Wong
2019-12-22  1:59       ` Theodore Y. Ts'o
2019-12-18 19:21   ` Matthew Wilcox
2020-01-07 23:20 ` Darrick J. Wong
2020-01-08  9:09   ` Chao Yu
2020-01-08 11:51 ` Jaegeuk Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).