The last patch is what started the series: XFS currently uses the direct I/O locking strategy for DAX because DAX was overloaded onto the direct I/O path. For XFS this means that we only take a shared inode lock instead of the normal exclusive one for writes IFF they are properly aligned. While this is fine for O_DIRECT which requires explicit opt-in from the application it's not fine for DAX where we'll suddenly lose expected and required synchronization if the file system happens to use DAX underneath. Patches 1-7 just untangle the code so that we can deal with DAX on its own easily.
Instead check the file pointer for the invisble I/O flag directly, and use the chance to drop redundant arguments from the xfs_ioc_space prototype. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_ioctl.c | 22 ++++++++-------------- fs/xfs/xfs_ioctl.h | 3 --- fs/xfs/xfs_ioctl32.c | 6 +----- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index dbca737..6ab5a24 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -595,13 +595,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf) { + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; @@ -626,7 +625,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (filp->f_mode & FMODE_NOCMTIME) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) STATIC int xfs_ioc_getbmap( - struct xfs_inode *ip, - int ioflags, + struct file *file, unsigned int cmd, void __user *arg) { @@ -1479,10 +1477,10 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? 
BMV_IF_ATTRFORK : 0); - if (ioflags & XFS_IO_INVIS) + if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, (__force struct getbmap *)arg+1); if (error) return error; @@ -1619,12 +1617,8 @@ xfs_file_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_ioctl(ip); switch (cmd) { @@ -1643,7 +1637,7 @@ xfs_file_ioctl( if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_DIOINFO: { struct dioattr da; @@ -1702,7 +1696,7 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 77c02c7..8b52881 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -20,10 +20,7 @@ extern int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1a05d8a..321f577 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -532,12 +532,8 @@ xfs_file_compat_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_compat_ioctl(ip); switch (cmd) { @@ -589,7 +585,7 @@ xfs_file_compat_ioctl( if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(ip, inode, filp, ioflags, 
cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); -- 2.1.4
Now that we have the direct I/O kiocb flag there is no real need to sample the value inside of XFS, and the invis flag was always just partially used and isn't worth keeping this infrastructure around for. This also splits the read tracepoint into buffered vs direct as we've done for writes a long time ago. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 26 +++++++++----------------- fs/xfs/xfs_inode.h | 10 ---------- fs/xfs/xfs_trace.h | 19 ++++++++----------- 3 files changed, 17 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 713991c..b32e6b0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -249,18 +249,12 @@ xfs_file_read_iter( struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; - int ioflags = 0; xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(mp, xs_read_calls); - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) - ioflags |= XFS_IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - - if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { + if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -293,7 +287,7 @@ xfs_file_read_iter( * serialisation. 
*/ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -327,7 +321,10 @@ xfs_file_read_iter( xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, pos, ioflags); + if (iocb->ki_flags & IOCB_DIRECT) + trace_xfs_file_direct_read(ip, size, pos); + else + trace_xfs_file_buffered_read(ip, size, pos); ret = generic_file_read_iter(iocb, to); if (ret > 0) @@ -346,18 +343,14 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; ssize_t ret; XFS_STATS_INC(ip->i_mount, xs_read_calls); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + trace_xfs_file_splice_read(ip, count, *ppos); /* * DAX inodes cannot ues the page cache for splice, so we have to push @@ -620,7 +613,7 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; ret = mapping->a_ops->direct_IO(iocb, &data); @@ -670,8 +663,7 @@ xfs_file_buffered_aio_write( current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, iov_iter_count(from), - iocb->ki_pos, 0); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0c19d3d..8eb78ec 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -473,14 +473,4 @@ do { \ extern struct kmem_zone *xfs_inode_zone; -/* - * Flags for read/write calls - */ -#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ -#define XFS_IO_INVIS 
0x00002 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { XFS_IO_ISDIRECT, "DIRECT" }, \ - { XFS_IO_INVIS, "INVIS"} - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6787a9f..2504f94 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1135,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), - TP_ARGS(ip, count, offset, flags), + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), + TP_ARGS(ip, count, offset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1151,23 +1150,21 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " - "offset 0x%llx count 0x%zx ioflags %s", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, - __entry->count, - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) + __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags)) -DEFINE_RW_EVENT(xfs_file_read); + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ + TP_ARGS(ip, count, offset)) +DEFINE_RW_EVENT(xfs_file_buffered_read); +DEFINE_RW_EVENT(xfs_file_direct_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); -- 2.1.4
All the three low-level read implementations that we might call already take care of not overflowing the maximum supported bytes, no need to duplicate it here. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b32e6b0..09a5a78 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -249,7 +249,6 @@ xfs_file_read_iter( struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; - xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(mp, xs_read_calls); @@ -266,13 +265,6 @@ xfs_file_read_iter( } } - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; - - if (n < size) - size = n; - if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; -- 2.1.4
Similar to what we did on the write side a while ago. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 83 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 09a5a78..e584333 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -239,35 +239,33 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_read_iter( +xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = iov_iter_count(to); + size_t count = iov_iter_count(to); + struct xfs_buftarg *target; ssize_t ret = 0; - loff_t pos = iocb->ki_pos; - XFS_STATS_INC(mp, xs_read_calls); + trace_xfs_file_direct_read(ip, count, iocb->ki_pos); - if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; + if (XFS_IS_REALTIME_INODE(ip)) + target = ip->i_mount->m_rtdev_targp; + else + target = ip->i_mount->m_ddev_targp; + + if (!IS_DAX(inode)) { /* DIO must be aligned to device logical sector size */ - if ((pos | size) & target->bt_logical_sectormask) { - if (pos == i_size_read(inode)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == i_size_read(inode)) return 0; return -EINVAL; } } - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * Locking is a bit tricky here. If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file @@ -279,7 +277,7 @@ xfs_file_read_iter( * serialisation. 
*/ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) { + if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -294,8 +292,8 @@ xfs_file_read_iter( * flush and reduce the chances of repeated iolock cycles going * forward. */ - if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (mapping->nrpages) { + ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -306,23 +304,56 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } + ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; + + trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + if (iocb->ki_flags & IOCB_DIRECT) - trace_xfs_file_direct_read(ip, size, pos); + ret = xfs_file_dio_aio_read(iocb, to); else - trace_xfs_file_buffered_read(ip, size, pos); + ret = xfs_file_buffered_aio_read(iocb, to); - ret = generic_file_read_iter(iocb, to); if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); - - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -578,7 +609,7 @@ 
xfs_file_dio_aio_write( end = iocb->ki_pos + count - 1; /* - * See xfs_file_read_iter() for why we do a full-file flush here. + * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); -- 2.1.4
XFS already implement it's own flushing of the pagecache because it implements proper synchronization for direct I/O reads. This means calling generic_file_read_iter for direct I/O is rather useless, as it doesn't do much but updating the atime and iocb position for us. This also gets rid of the buffered I/O fallback that isn't used for XFS. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e584333..f761f49 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -246,12 +246,17 @@ xfs_file_dio_aio_read( struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); + loff_t isize = i_size_read(inode); size_t count = iov_iter_count(to); + struct iov_iter data; struct xfs_buftarg *target; ssize_t ret = 0; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); + if (!count) + return 0; /* skip atime */ + if (XFS_IS_REALTIME_INODE(ip)) target = ip->i_mount->m_rtdev_targp; else @@ -260,7 +265,7 @@ xfs_file_dio_aio_read( if (!IS_DAX(inode)) { /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == i_size_read(inode)) + if (iocb->ki_pos == isize) return 0; return -EINVAL; } @@ -311,9 +316,15 @@ xfs_file_dio_aio_read( xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - ret = generic_file_read_iter(iocb, to); + data = *to; + ret = mapping->a_ops->direct_IO(iocb, &data); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + file_accessed(iocb->ki_filp); return ret; } -- 2.1.4
We control both the callers and callees of ->direct_IO, so remove the indirect calls. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_aops.c | 24 +++++------------------- fs/xfs/xfs_aops.h | 3 +++ fs/xfs/xfs_file.c | 17 +++++++++++++++-- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 80714eb..b368277 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1303,7 +1303,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1374,24 +1374,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); + /* + * We just need the method present so that open/fcntl allow direct I/O. 
+ */ + return -EINVAL; } STATIC sector_t diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 814aab7..bf2d9a1 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private); + extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f761f49..24b267d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -317,7 +317,13 @@ xfs_file_dio_aio_read( } data = *to; - ret = mapping->a_ops->direct_IO(iocb, &data); + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + NULL, 0); + } else { + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + } if (ret > 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); @@ -650,7 +656,14 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data); + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + } else { + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); + } /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { -- 2.1.4
So far the DAX code overloaded the direct I/O code path. There is very little in common between the two, and untangling them allows to clean up both variants. As a ѕide effect we also get separate trace points for both I/O types. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 139 ++++++++++++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_trace.h | 2 + 2 files changed, 112 insertions(+), 29 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 24b267d..0e74325 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -262,13 +262,11 @@ xfs_file_dio_aio_read( else target = ip->i_mount->m_ddev_targp; - if (!IS_DAX(inode)) { - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; } /* @@ -317,13 +315,37 @@ xfs_file_dio_aio_read( } data = *to; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - NULL, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + 
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); if (ret > 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); @@ -356,7 +378,8 @@ xfs_file_read_iter( struct kiocb *iocb, struct iov_iter *to) { - struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; ssize_t ret = 0; XFS_STATS_INC(mp, xs_read_calls); @@ -364,7 +387,9 @@ xfs_file_read_iter( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (iocb->ki_flags & IOCB_DIRECT) + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_read(iocb, to); else ret = xfs_file_buffered_aio_read(iocb, to); @@ -586,8 +611,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -656,14 +680,9 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - xfs_end_io_direct_write, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - } + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -680,10 +699,70 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. 
+ * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. + */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. 
*/ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + if (mapping->nrpages) { + ret = invalidate_inode_pages2(mapping); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -765,7 +844,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2504f94..1451690 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name, \ TP_ARGS(ip, count, offset)) DEFINE_RW_EVENT(xfs_file_buffered_read); DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, -- 2.1.4
So far DAX writes inherited the locking from direct I/O writes, but the direct I/O model of using shared locks for writes is actually wrong for DAX. For direct I/O we're out of any standards and don't have to provide the Posix required exclusion between writers, but for DAX which gets transparently enable on applications without any knowledge of it we can't simply drop the requirement. Even worse this only happens for aligned writes and thus doesn't show up for many typical use cases. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0e74325..413c9e0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -714,24 +714,11 @@ xfs_file_dax_write( struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; - int unaligned_io = 0; - int iolock; + int iolock = XFS_IOLOCK_EXCL; struct iov_iter data; - /* "unaligned" here means not aligned to a filesystem block */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { - unaligned_io = 1; - iolock = XFS_IOLOCK_EXCL; - } else if (mapping->nrpages) { - iolock = XFS_IOLOCK_EXCL; - } else { - iolock = XFS_IOLOCK_SHARED; - } xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -747,11 +734,6 @@ xfs_file_dax_write( WARN_ON_ONCE(ret); } - if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; - } - trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); data = *from; -- 2.1.4
On 06/22/2016 06:27 PM, Christoph Hellwig wrote: > So far DAX writes inherited the locking from direct I/O writes, but the direct > I/O model of using shared locks for writes is actually wrong for DAX. For > direct I/O we're out of any standards and don't have to provide the Posix > required exclusion between writers, but for DAX which gets transparently > enable on applications without any knowledge of it we can't simply drop the > requirement. Even worse this only happens for aligned writes and thus > doesn't show up for many typical use cases. > Hi Sir Christoph You raise a very interesting point and I would please like to ask questions. Is this a theoretical standards problem or a real applications problem that you know of? You say above: " Posix required exclusion between writers" As I understand, what it means is that if two threads/processes A & B write to the same offset-length, in parallel. then a consistent full version will hold of either A or B, which ever comes last. But never a torn version of both. Is this really POSIX. I mean I knew POSIX is silly but so much so? What about NFS CEPH Luster and all these network shared stuff. Does POSIX say "On a single Node?". (Trond been yelling about file locks for *any* kind of synchronization for years.) And even with the write-lock to serialize writers (Or i_mute in case of ext4) I do not see how this serialization works, because in a cached environment a write_back can start and crash while the second thread above starts his memcopy and on disk we still get a torn version of the record that was half from A half from B. (Or maybe I do not understand what your automicity means) Is not a rant I would really like to know what application uses this "single-writer" facility and how does it actually works for them? I honestly don't see how it works. (And do they really check that they are only working on a local file system?) Sorry for my slowness please explain? 
BTW: I think that all the patches except this one makes a lot of sense because of all the hidden quirks of direct_IO code paths. Just for example the difference between "aligned and none align writes" as you mentioned above. My $0.017: Who In the real world would actually break without this patch, which is not already broken? And why sacrifice the vast majority of good applications for the sake of an already broken (theoretical?) applications. Thank you Boaz > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/xfs_file.c | 20 +------------------- > 1 file changed, 1 insertion(+), 19 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 0e74325..413c9e0 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -714,24 +714,11 @@ xfs_file_dax_write( > struct address_space *mapping = iocb->ki_filp->f_mapping; > struct inode *inode = mapping->host; > struct xfs_inode *ip = XFS_I(inode); > - struct xfs_mount *mp = ip->i_mount; > ssize_t ret = 0; > - int unaligned_io = 0; > - int iolock; > + int iolock = XFS_IOLOCK_EXCL; > struct iov_iter data; > > - /* "unaligned" here means not aligned to a filesystem block */ > - if ((iocb->ki_pos & mp->m_blockmask) || > - ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { > - unaligned_io = 1; > - iolock = XFS_IOLOCK_EXCL; > - } else if (mapping->nrpages) { > - iolock = XFS_IOLOCK_EXCL; > - } else { > - iolock = XFS_IOLOCK_SHARED; > - } > xfs_rw_ilock(ip, iolock); > - > ret = xfs_file_aio_write_checks(iocb, from, &iolock); > if (ret) > goto out; > @@ -747,11 +734,6 @@ xfs_file_dax_write( > WARN_ON_ONCE(ret); > } > > - if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { > - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); > - iolock = XFS_IOLOCK_SHARED; > - } > - > trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); > > data = *from; >
On Wed, Jun 22, 2016 at 05:27:08PM +0200, Christoph Hellwig wrote:
> The last patch is what started the series: XFS currently uses the
> direct I/O locking strategy for DAX because DAX was overloaded onto
> the direct I/O path. For XFS this means that we only take a shared
> inode lock instead of the normal exclusive one for writes IFF they
> are properly aligned. While this is fine for O_DIRECT which requires
> explicit opt-in from the application it's not fine for DAX where we'll
> suddenly lose expected and required synchronization if the file system
> happens to use DAX underneath.
Except we did that *intentionally* - by definition there is no
cache to bypass with DAX and so all IO is "direct". That, combined
with the fact that all Linux filesystems except XFS break the POSIX
exclusive writer rule you are quoting to begin with, it seemed
pointless to enforce it for DAX....
So, before taking any patches to change that behaviour in XFS, a
wider discussion about the policy needs to be had. I don't think
we should care about POSIX here - if you have an application that
needs this serialisation, turn off DAX. That's why I made it a
per-inode inheritable flag and why the mount option will go away
over time.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
On Thu, Jun 23, 2016 at 4:24 PM, Dave Chinner <david@fromorbit.com> wrote: > On Wed, Jun 22, 2016 at 05:27:08PM +0200, Christoph Hellwig wrote: >> The last patch is what started the series: XFS currently uses the >> direct I/O locking strategy for DAX because DAX was overloaded onto >> the direct I/O path. For XFS this means that we only take a shared >> inode lock instead of the normal exclusive one for writes IFF they >> are properly aligned. While this is fine for O_DIRECT which requires >> explicit opt-in from the application it's not fine for DAX where we'll >> suddenly lose expected and required synchronization of the file system >> happens to use DAX undeneath. > > Except we did that *intentionally* - by definition there is no > cache to bypass with DAX and so all IO is "direct". That, combined > with the fact that all Linux filesystems except XFS break the POSIX > exclusive writer rule you are quoting to begin with, it seemed > pointless to enforce it for DAX.... If we're going to be strict about POSIX fsync() semantics we should be strict about this exclusive write semantic. In other words why is it ok to loosen one and not the other, if application compatibility is the concern? > > So, before taking any patches to change that behaviour in XFS, a > wider discussion about the policy needs to be had. I don't think > we should care about POSIX here - if you have an application that > needs this serialisation, turn off DAX. s/needs this serialisation/needs the kernel to flush cpu cache/
On Thu, Jun 23, 2016 at 06:14:47PM -0700, Dan Williams wrote:
> On Thu, Jun 23, 2016 at 4:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Wed, Jun 22, 2016 at 05:27:08PM +0200, Christoph Hellwig wrote:
> >> The last patch is what started the series: XFS currently uses the
> >> direct I/O locking strategy for DAX because DAX was overloaded onto
> >> the direct I/O path. For XFS this means that we only take a shared
> >> inode lock instead of the normal exclusive one for writes IFF they
> >> are properly aligned. While this is fine for O_DIRECT which requires
> >> explicit opt-in from the application it's not fine for DAX where we'll
> >> suddenly lose expected and required synchronization if the file system
> >> happens to use DAX underneath.
> >
> > Except we did that *intentionally* - by definition there is no
> > cache to bypass with DAX and so all IO is "direct". That, combined
> > with the fact that all Linux filesystems except XFS break the POSIX
> > exclusive writer rule you are quoting to begin with, it seemed
> > pointless to enforce it for DAX....
>
> If we're going to be strict about POSIX fsync() semantics we should be
> strict about this exclusive write semantic. In other words why is it
> ok to loosen one and not the other, if application compatibility is
> the concern?
This is a POSIX compliant fsync() implementation:
int fsync(int fd)
{
return 0;
}
That's not what we require from Linux filesystems and storage
subsystems. Our data integrity requirements are not actually
defined by POSIX - we go way beyond what POSIX actually requires us
to implement. If all we cared about is POSIX, then the above is how
we'd implement fsync() simply because it's fast. Everyone implements
fsync differently, so portable applications can't actually rely on
the POSIX standard fsync() implementation to keep their data safe...
IOWs, we don't give a shit about what POSIX says about fsync
because, in practice, it's useless. Instead, we implement something
that *works* and provides users with real data integrity guarantees.
If you like the POSIX specs for data integrity, go use
sync_file_range() - it doesn't guarantee data integrity, just like
posix compliant fsync(). And yes, applications that use
sync_file_range() are known to lose data when systems crash...
The POSIX exclusive write requirement is a different case. No linux
filesystem except XFS has ever met that requirement (in 20 something
years), yet I don't see applications falling over with corrupt data
from non-exclusive writes all the time, nor do I see application
developers shouting at us to provide it. i.e. reality tells us this
isn't a POSIX behaviour that applications rely on because everyone
implements it differently.
So, like fsync(), if everyone implements it differently,
applications don't rely on posix semantics to serialise access to
overlapping ranges of a file. And if that's the case, then
why even bother exclusive write locking in the filesystem when there
is no need for serialisation of page cache contents?
We don't do it because POSIX says so, because we already ignore what
POSIX says about this topic for technical reasons. So why should we
make DAX conform to POSIX exclusive writer behaviour when DAX is
being specifically aimed at high performance, highly concurrent
applications where exclusive writer behaviour will cause major
performance issues?
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
On Fri, Jun 24, 2016 at 09:24:46AM +1000, Dave Chinner wrote: > Except we did that *intentionally* - by definition there is no > cache to bypass with DAX and so all IO is "direct". That, combined > with the fact that all Linux filesystems except XFS break the POSIX > exclusive writer rule you are quoting to begin with, it seemed > pointless to enforce it for DAX.... No file system breaks the exclusive writer rule - most filesystems don't make writers atomic vs readers. More importantly every other filesystem (well there only are ext2 and ext4..) excludes DAX writers against other DAX writers. > So, before taking any patches to change that behaviour in XFS, a > wider discussion about the policy needs to be had. I don't think > we should care about POSIX here - if you have an application that > needs this serialisation, turn off DAX. That's why I made it a > per-inode inheritable flag and why the mount option will go away > over time. Sorry, but this is simply broken - allowing apps to opt-in behavior (e.g. like we're using O_DIRECT) is always fine. Requiring filesystem-specific tuning that has an effect outside the app to get existing documented behavior is not how to design APIs. Maybe we'll need to opt-in to use DAX for mmap, but giving the same existing behavior for read and write and avoiding a copy to the pagecache is an obvious win.
On Fri, Jun 24, 2016 at 05:13:18PM +1000, Dave Chinner wrote: > This is a POSIX compliant fsync() implementation: > > int fsync(int fd) > { > return 0; > } Depends on what you mean with "Posix". Modern Posix which includex XPG has the _POSIX_SYNCHRONIZED_IO option, which Linux implements. For that Posix says about fsync: [SIO] [Option Start] If _POSIX_SYNCHRONIZED_IO is defined, the fsync() function shall force all currently queued I/O operations associated with the file indicated by file descriptor fildes to the synchronized I/O completion state. All I/O operations shall be completed as defined for synchronized I/O file integrity completion. [Option End] Whereas synchronized I/O file integrity completion is defined as: 3.378 Synchronized I/O Data Integrity Completion For read, when the operation has been completed or diagnosed if unsuccessful. The read is complete only when an image of the data has been successfully transferred to the requesting process. If there were any pending write requests affecting the data to be read at the time that the synchronized read operation was requested, these write requests are successfully transferred prior to reading the data. For write, when the operation has been completed or diagnosed if unsuccessful. The write is complete only when the data specified in the write request is successfully transferred and all file system information required to retrieve the data is successfully transferred. File attributes that are not necessary for data retrieval (access time, modification time, status change time) need not be successfully transferred prior to returning to the calling process. 3.379 Synchronized I/O File Integrity Completion Identical to a synchronized I/O data integrity completion with the addition that all file attributes relative to the I/O operation (including access time, modification time, status change time) are successfully transferred prior to returning to the calling process. 
So in this case Posix very much requires data to be on a stable medium. > The POSIX exclusive write requirement is a different case. No linux > filesystem except XFS has ever met that requirement (in 20 something > years), yet I don't see applications falling over with corrupt data > from non-exclusive writes all the time, nor do I see application > developers shouting at us to provide it. i.e. reality tells us this > isn't a POSIX behaviour that applications rely on because everyone > implements it differently. Every file system excludes writes from other writes.
On Fri, Jun 24, 2016 at 09:26:12AM +0200, Christoph Hellwig wrote: > On Fri, Jun 24, 2016 at 09:24:46AM +1000, Dave Chinner wrote: > > Except we did that *intentionally* - by definition there is no > > cache to bypass with DAX and so all IO is "direct". That, combined > > with the fact that all Linux filesystems except XFS break the POSIX > > exclusive writer rule you are quoting to begin with, it seemed > > pointless to enforce it for DAX.... > > No file system breaks the exclusive writer rule - most filesystem > don't make writers atomic vs readers. > > More importantly every other filesystem (well there only are ext2 > and ext4..) exludes DAX writers against other DAX writers. > > > So, before taking any patches to change that behaviour in XFS, a > > wider discussion about the policy needs to be had. I don't think > > we should care about POSIX here - if you have an application that > > needs this serialisation, turn off DAX. That's why I made it a > > per-inode inheritable flag and why the mount option will go away > > over time. > > Sorry, but this is simply broken - allowing apps to opt-in behavior > (e.g. like we're using O_DIRECT) is always fine. Requriring > filesystem-specific tuning that has affect outside the app to get > existing documented behavior is not how to design APIs. Using DAX is an *admin decision*, not an application decision. Indeed, it's a mount option right now, and that's most definitely not something the application can turn on or off! Inode flags allow the admin to decide that two apps working on the same filesystem can use (or not use) DAX independently, rather than needing to put them on different filesystems. > Maybe we'll need to opt-in to use DAX for mmap, but giving the same > existing behavior for read and write and avoiding a copy to the pagecache > is an obvious win. You can't use DAX just for mmap. It's an inode scope behaviour - once it's turned on, all accesses to that inode - regardless of user interface - must use DAX. 
It's all or nothing, not a per file descript/mmap context option. Cheers, Dave. -- Dave Chinner david@fromorbit.com
On Sat, Jun 25, 2016 at 09:00:45AM +1000, Dave Chinner wrote: > > > > Sorry, but this is simply broken - allowing apps to opt-in behavior > > (e.g. like we're using O_DIRECT) is always fine. Requriring > > filesystem-specific tuning that has affect outside the app to get > > existing documented behavior is not how to design APIs. > > Using DAX is an *admin decision*, not an application decision. Of course - that's exactly my point. > Indeed, it's a mount option right now, and that's most definitely not > something the application can turn on or off! Inode flags allow the > admin to decide that two apps working on the same filesystem can use > (or not use) DAX independently, rather than needing to put them on > different filesystems. Right. And an existing application can get DAX turned on under its back, and will now suddenly get different synchronization behavior. That is if its writes happen to be aligned to the fs block size. > > Maybe we'll need to opt-in to use DAX for mmap, but giving the same > > existing behavior for read and write and avoiding a copy to the pagecache > > is an obvious win. > > You can't use DAX just for mmap. It's an inode scope behaviour - > once it's turned on, all accesses to that inode - regardless of user > interface - must use DAX. It's all or nothing, not a per file > descript/mmap context option. Right now it is. But when discussing mmap behavior one option was to require an opt-in to get DAX-specific mmap semantics. For plain read/write we have no such option and thus absolutely need to behave as all normal reads and writes behave. If you think the exclusive lock for writes hurts we have two options: a) implement range locks (although they might be more expensive for typical loads) b) add a new O_* or RWF_* option to not require the synchronization for apps that don't want it. Neither of those cases really is DAX-specific.
On 06/28/2016 04:10 PM, Christoph Hellwig wrote: > On Sat, Jun 25, 2016 at 09:00:45AM +1000, Dave Chinner wrote: <> > > Right. And an existing application can get DAX turned on under its > back, and will now suddently get different synchronization behavior. > That is if it's writes happen to be aligned to the fs block size. > Is there an actual application that does that? or is this purely theoretical right now? Thanks Boaz
On Tue, Jun 28, 2016 at 04:27:03PM +0300, Boaz Harrosh wrote: > > Right. And an existing application can get DAX turned on under its > > back, and will now suddently get different synchronization behavior. > > That is if it's writes happen to be aligned to the fs block size. > > > > Is there an actual application that does that? or is this purely > theoretical right now? Lots of them. The typical case is multithreaded (or preforked like old apache) daemons that use O_APPEND to write to a common log file. Then again those log records will usually not be 4k aligned, so they'd still accidentally get the exclusive locking even without this patch. But beware the case where a log record actually matches the alignment.. > > > Thanks > Boaz ---end quoted text---
On 06/28/2016 04:39 PM, Christoph Hellwig wrote: > On Tue, Jun 28, 2016 at 04:27:03PM +0300, Boaz Harrosh wrote: >>> Right. And an existing application can get DAX turned on under its >>> back, and will now suddently get different synchronization behavior. >>> That is if it's writes happen to be aligned to the fs block size. >>> >> >> Is there an actual application that does that? or is this purely >> theoretical right now? > > Lots of them. The typical case is multithreaded (or preforked like old > apache) daemons that use O_APPEND to write to a common log file. Then > again those log records will usually not be 4k aligned, so they'd still > accidentally get the exclusive locking even without this patch. But > beware the case where a log record actually matches the alignment.. > But O_APPEND is exclusion problem of i_size update not of the actual memcpy done in parallel or not. Actually with O_APPEND each write request should write a different region of the file, there are no overlapping writes. And no issue of which version of the write came last and got its data written. If it is the issue of isize update vs O_APPEND is a different issue don't you think? One way that we solved it is to update isize = isize + len; atomically before starting the IO, then on the error case sub the unwritten bytes. And still allow concurrent writers. I agree that isize updates needs to be atomic, but why does the memcpy? And BTW in NFS O_APPEND concurrent readers to a writer may see ZEROs in the interim. I still don't see how an application can use the fact that two writers will not give them mixed records. And surly it does not work on a shared FS. So I was really wondering if you know of any such app Thanks Boaz >> >> >> Thanks >> Boaz > ---end quoted text--- >
On Tue, Jun 28, 2016 at 04:56:30PM +0300, Boaz Harrosh wrote: > Actually with O_APPEND each write request should write a different region > of the file, there are no overlapping writes. And no issue of which version of the > write came last and got its data written. You have one fd for multiple threads or processes (it doesn't matter if you're using O_APPEND or not), and all of them write to it. i_size is only updated once the write finishes, so having multiple concurrent writes will mean multiple records go into the same regions. Now to be fair in current XFS writes beyond i_size will always take the lock exclusively, so for this case we will not get concurrent writes and thus data corruption anyway. But if you have a cycling log that gets overwritten (say a database journal) we're back to square one. > I still don't see how an application can use the fact that two writers > will not give them mixed records. And surly it does not work on a shared > FS. So I was really wondering if you know of any such app If it doesn't work for two threads using the same fd on a shared fs the fs is broken.
On 06/28/2016 06:39 PM, Christoph Hellwig wrote: > On Tue, Jun 28, 2016 at 04:56:30PM +0300, Boaz Harrosh wrote: >> Actually with O_APPEND each write request should write a different region >> of the file, there are no overlapping writes. And no issue of which version of the >> write came last and got its data written. > > You have one fd for multiple threads or processes (it doesn't matter if > you're using O_APPEND or not), and all of them write to it. > Yes so? but they do not write to the same (overlapping) region of the file, each thread usually writes to his own record. > i_size is only updated once the write finishes, so having multiple > concurrent writes will mean multiple records go into the same regions. > Now to be fair in current XFS writes beyond i_size will always take > the lock exclusively, so for this case we will not get concurrent > writes and thus data corruption anyway. Exactly that I understand. And it must be so. > But if you have a cycling > log that gets overwritten (say a database journal) we're back to > square one. > No! In this "cycling log" case the application (the DB) has an Head and a Tail pointers each thread grabs the next available record and writes to it. The IO is not overlapping, each thread writes to his own record, and even if they write at the same time they do not over-write each other. As long as they properly sync on the Head pointer the write itself can happen in parallel. This is not a good example and will work perfectly well with the old (current) DAX code. (Even if such records where 4k aligned) >> I still don't see how an application can use the fact that two writers >> will not give them mixed records. And surly it does not work on a shared >> FS. So I was really wondering if you know of any such app > > If it doesn't work for two threads using the same fd on a shared fs > the fs is broken. What works? the above cycling log, sure it will work, also on current dax code. 
I wish you could write a test to demonstrate this bug. Sorry for my slowness but I don't see it. I do not see how the fact that there is only a single memcpy in progress can help an application. Yes sure isize update must be synced, which it is in current code. Thanks Christoph, I've taken too much of your time, I guess the QA will need to bang every possible application and see what breaks when multiple parallel writers are allowed on a single file. So far for whatever I use in a VM it all works just the same. Boaz
On Wed, Jun 22, 2016 at 05:27:15PM +0200, Christoph Hellwig wrote: > So far the DAX code overloaded the direct I/O code path. There is very little > in common between the two, and untangling them allows to clean up both variants. > > As a ѕide effect we also get separate trace points for both I/O types. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/xfs_file.c | 139 ++++++++++++++++++++++++++++++++++++++++++----------- > fs/xfs/xfs_trace.h | 2 + > 2 files changed, 112 insertions(+), 29 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 24b267d..0e74325 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -262,13 +262,11 @@ xfs_file_dio_aio_read( > else > target = ip->i_mount->m_ddev_targp; > > - if (!IS_DAX(inode)) { > - /* DIO must be aligned to device logical sector size */ > - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { > - if (iocb->ki_pos == isize) > - return 0; > - return -EINVAL; > - } > + /* DIO must be aligned to device logical sector size */ > + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { > + if (iocb->ki_pos == isize) > + return 0; > + return -EINVAL; > } > > /* > @@ -317,13 +315,37 @@ xfs_file_dio_aio_read( > } > > data = *to; > - if (IS_DAX(inode)) { > - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > - NULL, 0); > - } else { > - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > - xfs_get_blocks_direct, NULL, NULL, 0); > + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > + xfs_get_blocks_direct, NULL, NULL, 0); > + if (ret > 0) { > + iocb->ki_pos += ret; > + iov_iter_advance(to, ret); > } > + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); > + > + file_accessed(iocb->ki_filp); So I noticed that generic/323 starts crashing in file_accessed -> touch_atime because iocb->ki_filp->f_path.dentry == NULL. 
For a while I thought it was some weird reflink bug, but I finally had time to go build a vanilla 4.8-rc8 kernel and that blew up here too. I'm not sure why this line got inserted here, since it wasn't there prior to this patch, AFAICT. <shrug> --D > + return ret; > +} > + > +STATIC ssize_t > +xfs_file_dax_read( > + struct kiocb *iocb, > + struct iov_iter *to) > +{ > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + struct inode *inode = mapping->host; > + struct xfs_inode *ip = XFS_I(inode); > + struct iov_iter data = *to; > + size_t count = iov_iter_count(to); > + ssize_t ret = 0; > + > + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); > + > + if (!count) > + return 0; /* skip atime */ > + > + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); > + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); > if (ret > 0) { > iocb->ki_pos += ret; > iov_iter_advance(to, ret); > @@ -356,7 +378,8 @@ xfs_file_read_iter( > struct kiocb *iocb, > struct iov_iter *to) > { > - struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; > + struct inode *inode = file_inode(iocb->ki_filp); > + struct xfs_mount *mp = XFS_I(inode)->i_mount; > ssize_t ret = 0; > > XFS_STATS_INC(mp, xs_read_calls); > @@ -364,7 +387,9 @@ xfs_file_read_iter( > if (XFS_FORCED_SHUTDOWN(mp)) > return -EIO; > > - if (iocb->ki_flags & IOCB_DIRECT) > + if (IS_DAX(inode)) > + ret = xfs_file_dax_read(iocb, to); > + else if (iocb->ki_flags & IOCB_DIRECT) > ret = xfs_file_dio_aio_read(iocb, to); > else > ret = xfs_file_buffered_aio_read(iocb, to); > @@ -586,8 +611,7 @@ xfs_file_dio_aio_write( > mp->m_rtdev_targp : mp->m_ddev_targp; > > /* DIO must be aligned to device logical sector size */ > - if (!IS_DAX(inode) && > - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) > + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) > return -EINVAL; > > /* "unaligned" here means not aligned to a filesystem block */ > @@ -656,14 +680,9 @@ xfs_file_dio_aio_write( > 
trace_xfs_file_direct_write(ip, count, iocb->ki_pos); > > data = *from; > - if (IS_DAX(inode)) { > - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > - xfs_end_io_direct_write, 0); > - } else { > - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > - xfs_get_blocks_direct, xfs_end_io_direct_write, > - NULL, DIO_ASYNC_EXTEND); > - } > + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > + xfs_get_blocks_direct, xfs_end_io_direct_write, > + NULL, DIO_ASYNC_EXTEND); > > /* see generic_file_direct_write() for why this is necessary */ > if (mapping->nrpages) { > @@ -680,10 +699,70 @@ out: > xfs_rw_iunlock(ip, iolock); > > /* > - * No fallback to buffered IO on errors for XFS. DAX can result in > - * partial writes, but direct IO will either complete fully or fail. > + * No fallback to buffered IO on errors for XFS, direct IO will either > + * complete fully or fail. > + */ > + ASSERT(ret < 0 || ret == count); > + return ret; > +} > + > +STATIC ssize_t > +xfs_file_dax_write( > + struct kiocb *iocb, > + struct iov_iter *from) > +{ > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + struct inode *inode = mapping->host; > + struct xfs_inode *ip = XFS_I(inode); > + struct xfs_mount *mp = ip->i_mount; > + ssize_t ret = 0; > + int unaligned_io = 0; > + int iolock; > + struct iov_iter data; > + > + /* "unaligned" here means not aligned to a filesystem block */ > + if ((iocb->ki_pos & mp->m_blockmask) || > + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { > + unaligned_io = 1; > + iolock = XFS_IOLOCK_EXCL; > + } else if (mapping->nrpages) { > + iolock = XFS_IOLOCK_EXCL; > + } else { > + iolock = XFS_IOLOCK_SHARED; > + } > + xfs_rw_ilock(ip, iolock); > + > + ret = xfs_file_aio_write_checks(iocb, from, &iolock); > + if (ret) > + goto out; > + > + /* > + * Yes, even DAX files can have page cache attached to them: A zeroed > + * page is inserted into the pagecache when we have to serve a write > + * fault on a 
hole. It should never be dirtied and can simply be > + * dropped from the pagecache once we get real data for the page. > */ > - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); > + if (mapping->nrpages) { > + ret = invalidate_inode_pages2(mapping); > + WARN_ON_ONCE(ret); > + } > + > + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { > + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); > + iolock = XFS_IOLOCK_SHARED; > + } > + > + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); > + > + data = *from; > + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > + xfs_end_io_direct_write, 0); > + if (ret > 0) { > + iocb->ki_pos += ret; > + iov_iter_advance(from, ret); > + } > +out: > + xfs_rw_iunlock(ip, iolock); > return ret; > } > > @@ -765,7 +844,9 @@ xfs_file_write_iter( > if (XFS_FORCED_SHUTDOWN(ip->i_mount)) > return -EIO; > > - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) > + if (IS_DAX(inode)) > + ret = xfs_file_dax_write(iocb, from); > + else if (iocb->ki_flags & IOCB_DIRECT) > ret = xfs_file_dio_aio_write(iocb, from); > else > ret = xfs_file_buffered_aio_write(iocb, from); > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 2504f94..1451690 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name, \ > TP_ARGS(ip, count, offset)) > DEFINE_RW_EVENT(xfs_file_buffered_read); > DEFINE_RW_EVENT(xfs_file_direct_read); > +DEFINE_RW_EVENT(xfs_file_dax_read); > DEFINE_RW_EVENT(xfs_file_buffered_write); > DEFINE_RW_EVENT(xfs_file_direct_write); > +DEFINE_RW_EVENT(xfs_file_dax_write); > DEFINE_RW_EVENT(xfs_file_splice_read); > > DECLARE_EVENT_CLASS(xfs_page_class, > -- > 2.1.4 > > _______________________________________________ > xfs mailing list > xfs@oss.sgi.com > http://oss.sgi.com/mailman/listinfo/xfs
On Wed, Sep 28, 2016 at 07:53:52PM -0700, Darrick J. Wong wrote:
> So I noticed that generic/323 starts crashing in file_accessed -> touch_atime
> because iocb->ki_filp->f_path.dentry == NULL. For a while I thought it was
> some weird reflink bug, but I finally had time to go build a vanilla 4.8-rc8
> kernel and that blew up here too. I'm not sure why this line got inserted
> here, since it wasn't there prior to this patch, AFAICT.
This line was there before near the end of xfs_file_dio_aio_read already,
e.g. line 376 just before the above commit, but it only got introduced
a bit earlier in "xfs: stop using generic_file_read_iter for direct I/O",
which copied it over from generic_file_read_iter. I think any new
issues in these commits could just be a minor timing change, as
we're not changing struct file refcounting in any way here.
generic/323 reproduces the last struct file reference being dropped
by aio completions, so it seems like we have an issue here, which
I suspect is something in the common code. I can't reproduce it
locally, but looking at the aio_complete -> kiocb_free callchain
and the lack of other struct file refcounting in aio.c it seems
inherently unsafe to reference struct file once the completion
may have run, that is after (__)blkdev_direct_IO returned.
I'll see if I can come up with a solution for that, most likely
that would involve moving the file_accessed call into __blkdev_direct_IO
before we drop the final reference on the dio structure.
Can you try the patch below? That just moves the file_accessed call before the I/O, similar to how we handle timestamp updates on the write side. generic_file_read_iter will also need a similar update.
On Thu, Sep 29, 2016 at 10:18:34PM +0200, Christoph Hellwig wrote: > Can you try the patch below? That just moves the file_accessed call > before the I/O, similar to how we handle timestamp updates on the write > side. generic_file_read_iter will also need a similar update. And now with patch: diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 349f328..6919412 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -270,6 +270,8 @@ xfs_file_dio_aio_read( return -EINVAL; } + file_accessed(iocb->ki_filp); + /* * Locking is a bit tricky here. If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file @@ -324,7 +326,6 @@ xfs_file_dio_aio_read( } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - file_accessed(iocb->ki_filp); return ret; }
On Thu, Sep 29, 2016 at 10:18:49PM +0200, Christoph Hellwig wrote:
> On Thu, Sep 29, 2016 at 10:18:34PM +0200, Christoph Hellwig wrote:
> > Can you try the patch below? That just moves the file_accessed call
> > before the I/O, similar to how we handle timestamp updates on the write
> > side. generic_file_read_iter will also need a similar update.
>
> And now with patch:
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 349f328..6919412 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -270,6 +270,8 @@ xfs_file_dio_aio_read(
> return -EINVAL;
> }
>
> + file_accessed(iocb->ki_filp);
> +
> /*
> * Locking is a bit tricky here. If we take an exclusive lock for direct
> * IO, we effectively serialise all new concurrent read IO to this file
> @@ -324,7 +326,6 @@ xfs_file_dio_aio_read(
> }
> xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
>
> - file_accessed(iocb->ki_filp);
> return ret;
> }
>
I noticed that g/323 only breaks when the XFS is on a real disk; on
pmem it works fine with or without the patch.
That said, it makes the crash go away and the patch looks ok, so:
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Tested-by: Darrick J. Wong <darrick.wong@oracle.com>
--D