Hi all, Today's linux-next merge of the xfs tree got a conflict in fs/xfs/xfs_file.c between commits 99733fa372ea ("xfs_file_aio_write_checks: switch to iocb/iov_iter") and 3309dd04cbcd ("switch generic_write_checks() to iocb and iter") from Linus' tree and commits b9d59846f737 ("xfs: DIO write completion size updates race"), 40c63fbc55a9 ("xfs: direct IO EOF zeroing needs to drain AIO") and 0cefb29e6a63 ("xfs: using generic_file_direct_write() is unnecessary") from the xfs tree. I fixed it up (I have no idea if this is right - see below) and can carry the fix as necessary (no action is required). -- Cheers, Stephen Rothwell sfr@canb.auug.org.au diff --cc fs/xfs/xfs_file.c index 1f12ad0a8585,3a5d305e60c9..000000000000 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@@ -544,22 -545,21 +544,22 @@@ xfs_zero_eof */ STATIC ssize_t xfs_file_aio_write_checks( - struct file *file, - loff_t *pos, - size_t *count, + struct kiocb *iocb, + struct iov_iter *from, int *iolock) { + struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - int error = 0; + ssize_t error = 0; + size_t count = iov_iter_count(from); restart: - error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); - if (error) + error = generic_write_checks(iocb, from); + if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, true); if (error) return error; @@@ -569,21 -569,41 +569,42 @@@ * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. + * + * We need to serialise against EOF updates that occur in IO + * completions here. We want to make sure that nobody is changing the + * size while we do this check until we have placed an IO barrier (i.e. + * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. + * The spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value + * and hence be able to correctly determine if we need to run zeroing. */ + spin_lock(&ip->i_flags_lock); - if (*pos > i_size_read(inode)) { + if (iocb->ki_pos > i_size_read(inode)) { bool zero = false; + spin_unlock(&ip->i_flags_lock); if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + + /* + * We now have an IO submission barrier in place, but + * AIO can do EOF updates during IO completion and hence + * we now need to wait for all of them to drain. Non-AIO + * DIO will have drained before we are given the + * XFS_IOLOCK_EXCL, and so for most cases this wait is a + * no-op. + */ + inode_dio_wait(inode); goto restart; } - error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); + error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); if (error) return error; - } + } else + spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from @@@ -680,11 -702,11 +703,12 @@@ xfs_file_dio_aio_write xfs_rw_ilock(ip, iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; - iov_iter_truncate(from, count); + count = iov_iter_count(from); + pos = iocb->ki_pos; + end = pos + count - 1; if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, @@@ -1385,8 -1449,59 +1449,57 @@@ xfs_file_llseek } } + /* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_sem (MM) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ + STATIC int + xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) + { + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_fault(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = filemap_fault(vma, vmf); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; + } + + /* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. + */ + STATIC int + xfs_filemap_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) + { + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_page_mkwrite(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; + } + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = xfs_file_read_iter, .write_iter = xfs_file_write_iter, .splice_read = xfs_file_splice_read,