From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mx3-rdu2.redhat.com ([66.187.233.73]:34300 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1752046AbeENPgk (ORCPT ); Mon, 14 May 2018 11:36:40 -0400 From: Andreas Gruenbacher To: cluster-devel@redhat.com, Christoph Hellwig Cc: linux-fsdevel@vger.kernel.org, Andreas Gruenbacher Subject: [PATCH v4 09/11] gfs2: iomap direct I/O support Date: Mon, 14 May 2018 17:36:22 +0200 Message-Id: <20180514153624.29598-10-agruenba@redhat.com> In-Reply-To: <20180514153624.29598-1-agruenba@redhat.com> References: <20180514153624.29598-1-agruenba@redhat.com> Sender: linux-fsdevel-owner@vger.kernel.org List-ID: With that, the direct_IO address space operation can be all but eliminated: only a dummy remains which indicates that the filesystem supports direct I/O. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 92 +-------------------------- fs/gfs2/bmap.c | 11 +++- fs/gfs2/file.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 170 insertions(+), 102 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 3d9633175aa8..d676ee63ab2b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -84,12 +84,6 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, return 0; } -static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - return gfs2_block_map(inode, lblock, bh_result, 0); -} - /** * gfs2_writepage_common - Common bits of writepage * @page: The page to be written @@ -1021,94 +1015,12 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset, try_to_release_page(page, 0); } -/** - * gfs2_ok_for_dio - check that dio is valid on this file - * @ip: The inode - * @offset: The offset at which we are reading or writing - * - * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) - * 1 (to accept the i/o request) - */ -static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset) -{ - /* - * Should we return an error here? I can't see that O_DIRECT for - * a stuffed file makes any sense. For now we'll silently fall - * back to buffered I/O - */ - if (gfs2_is_stuffed(ip)) - return 0; - - if (offset >= i_size_read(&ip->i_inode)) - return 0; - return 1; -} - - - static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct address_space *mapping = inode->i_mapping; - struct gfs2_inode *ip = GFS2_I(inode); - loff_t offset = iocb->ki_pos; - struct gfs2_holder gh; - int rv; - /* - * Deferred lock, even if its a write, since we do no allocation - * on this path. All we need change is atime, and this lock mode - * ensures that other nodes have flushed their buffered read caches - * (i.e. their page cache entries for this inode). We do not, - * unfortunately have the option of only flushing a range like - * the VFS does. + * We just need the method present so that open/fcntl allow direct I/O. */ - gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); - rv = gfs2_glock_nq(&gh); - if (rv) - goto out_uninit; - rv = gfs2_ok_for_dio(ip, offset); - if (rv != 1) - goto out; /* dio not valid, fall back to buffered i/o */ - - /* - * Now since we are holding a deferred (CW) lock at this point, you - * might be wondering why this is ever needed. There is a case however - * where we've granted a deferred local lock against a cached exclusive - * glock. That is ok provided all granted local locks are deferred, but - * it also means that it is possible to encounter pages which are - * cached and possibly also mapped. So here we check for that and sort - * them out ahead of the dio. The glock state machine will take care of - * everything else. - * - * If in fact the cached glock state (gl->gl_state) is deferred (CW) in - * the first place, mapping->nr_pages will always be zero. - */ - if (mapping->nrpages) { - loff_t lstart = offset & ~(PAGE_SIZE - 1); - loff_t len = iov_iter_count(iter); - loff_t end = PAGE_ALIGN(offset + len) - 1; - - rv = 0; - if (len == 0) - goto out; - if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) - unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); - rv = filemap_write_and_wait_range(mapping, lstart, end); - if (rv) - goto out; - if (iov_iter_rw(iter) == WRITE) - truncate_inode_pages_range(mapping, lstart, end); - } - - rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - gfs2_get_block_direct, NULL, NULL, 0); -out: - gfs2_glock_dq(&gh); -out_uninit: - gfs2_holder_uninit(&gh); - return rv; + return -EINVAL; } /** diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7ac08660c990..6f57e2bfab8e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1053,7 +1053,14 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, trace_gfs2_iomap_start(ip, pos, length, flags); if (flags & IOMAP_WRITE) { - ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap); + if (flags & IOMAP_DIRECT) { + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + release_metapath(&mp); + if (iomap->type != IOMAP_MAPPED) + ret = -ENOTBLK; + } else { + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap); + } } else { ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); release_metapath(&mp); @@ -1124,7 +1131,7 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_trans *tr = current->journal_info; - if (!(flags & IOMAP_WRITE)) + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE) return 0; gfs2_ordered_add_inode(ip); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c2467988f96f..5fb2ce779bf5 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -690,6 +690,122 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? ret : ret1; } +static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + size_t count = iov_iter_count(to); + struct gfs2_holder gh; + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + /* fall back to buffered I/O for stuffed files */ + ret = -ENOTBLK; + if (gfs2_is_stuffed(ip)) + goto out; + + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL); + +out: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; + struct gfs2_inode *ip = GFS2_I(inode); + size_t len = iov_iter_count(from); + loff_t offset = iocb->ki_pos; + struct gfs2_holder gh; + ssize_t ret; + + /* + * Deferred lock, even if its a write, since we do no allocation on + * this path. All we need to change is the atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately, have the option of only flushing a range like the + * VFS does. + */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + /* Silently fall back to buffered I/O for stuffed files */ + if (gfs2_is_stuffed(ip)) + goto out; + + /* Silently fall back to buffered I/O when writing beyond EOF */ + if (offset + len > i_size_read(&ip->i_inode)) + goto out; + + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & ~(PAGE_SIZE - 1); + loff_t len = iov_iter_count(from); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + ret = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + ret = filemap_write_and_wait_range(mapping, lstart, end); + if (ret) + goto out; + if (iov_iter_rw(from) == WRITE) + truncate_inode_pages_range(mapping, lstart, end); + } + + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL); + +out: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = gfs2_file_direct_read(iocb, to); + if (likely(ret != -ENOTBLK)) + goto out; + iocb->ki_flags &= ~IOCB_DIRECT; + } + ret = generic_file_read_iter(iocb, to); +out: + return ret; +} + /** * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context @@ -707,7 +823,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); - ssize_t ret; + ssize_t written = 0, ret; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -724,9 +840,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) gfs2_glock_dq_uninit(&gh); } - if (iocb->ki_flags & IOCB_DIRECT) - return generic_file_write_iter(iocb, from); - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) @@ -743,19 +856,55 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out2; - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = file->f_mapping; + loff_t pos, endbyte; + ssize_t buffered; + + written = gfs2_file_direct_write(iocb, from); + if (written < 0 || !iov_iter_count(from)) + goto out2; + + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + if (unlikely(ret < 0)) + goto out2; + buffered = ret; + + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + pos = iocb->ki_pos; + endbyte = pos + buffered - 1; + ret = filemap_write_and_wait_range(mapping, pos, endbyte); + if (!ret) { + iocb->ki_pos += buffered; + written += buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + if (likely(ret > 0)) + iocb->ki_pos += ret; + } out2: current->backing_dev_info = NULL; out: inode_unlock(inode); if (likely(ret > 0)) { - iocb->ki_pos += ret; - /* Handle various SYNC-type writes */ ret = generic_write_sync(iocb, ret); } - return ret; + return written ? written : ret; } static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, @@ -1157,7 +1306,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, - .read_iter = generic_file_read_iter, + .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, @@ -1187,7 +1336,7 @@ const struct file_operations gfs2_dir_fops = { const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, - .read_iter = generic_file_read_iter, + .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, -- 2.17.0