On 17.03.24 17:53, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> This is a regression test for "mm/madvise: make
> MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly".
>
> Cc: David Hildenbrand <david@redhat.com>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
Thanks for including this test, very helpful!
It's my first time reading fstests code, so I can't give feedback of
much value. That said, nothing jumped out at me :)
--
Cheers,
David / dhildenb
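For context on what the test exercises: MADV_POPULATE_READ and
MADV_POPULATE_WRITE (Linux 5.14+) prefault a mapped range, and the fix
under test makes them retry correctly when the fault path returns
VM_FAULT_RETRY. A minimal userspace sketch of the interface being
regression-tested (the file path is made up and this is not the fstests
test itself, just an illustration of the madvise call):

#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ	22	/* <linux/mman.h>, Linux >= 5.14 */
#endif

int main(void)
{
	size_t len = 16 * 4096;
	int fd = open("/tmp/popread.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		err(1, "open");
	if (ftruncate(fd, len) < 0)
		err(1, "ftruncate");

	char *p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	/*
	 * Prefault the whole mapping. With the VM_FAULT_RETRY fix this
	 * completes even when the fault handler has to drop and retake
	 * mmap_lock (e.g. while waiting on a locked page).
	 */
	if (madvise(p, len, MADV_POPULATE_READ) < 0)
		err(1, "madvise(MADV_POPULATE_READ)");

	munmap(p, len);
	close(fd);
	return 0;
}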
On Tue, Mar 19, 2024 at 09:46:00AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> The count is used purely to allocate the correct number of bvecs for
> submitting IO. Rename it to b_bvec_count.

Well, I think we should just kill it, as it simply is the rounded up
length in PAGE_SIZE units. The patch below passes a quick xfstests run
and is on top of this series:

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 2a6796c48454f7..8ecf88b5504c18 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -67,27 +67,17 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
 }
 
 /*
- * Return true if the buffer is vmapped.
- *
- * b_addr is always set, so we have to look at bp->b_bvec_count to determine if
- * the buffer was vmalloc()d or not.
+ * See comment above xfs_buf_alloc_folios() about the constraints placed on
+ * allocating vmapped buffers.
  */
-static inline int
-xfs_buf_is_vmapped(
-	struct xfs_buf	*bp)
+static inline unsigned int xfs_buf_vmap_len(struct xfs_buf *bp)
 {
-	return bp->b_bvec_count > 1;
+	return roundup(BBTOB(bp->b_length), PAGE_SIZE);
 }
 
-/*
- * See comment above xfs_buf_alloc_folios() about the constraints placed on
- * allocating vmapped buffers.
- */
-static inline int
-xfs_buf_vmap_len(
-	struct xfs_buf	*bp)
+static inline unsigned int xfs_buf_nr_pages(struct xfs_buf *bp)
 {
-	return (bp->b_bvec_count * PAGE_SIZE);
+	return DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
 }
 
 /*
@@ -304,13 +294,15 @@ xfs_buf_free(
 		goto free;
 	}
 
-	if (!(bp->b_flags & _XBF_KMEM))
-		mm_account_reclaimed_pages(bp->b_bvec_count);
-
-	if (bp->b_flags & _XBF_FOLIOS)
-		__folio_put(kmem_to_folio(bp->b_addr));
-	else
+	if (bp->b_flags & _XBF_FOLIOS) {
+		/* XXX: should this pass xfs_buf_nr_pages()? */
+		mm_account_reclaimed_pages(1);
+		folio_put(kmem_to_folio(bp->b_addr));
+	} else {
+		if (!(bp->b_flags & _XBF_KMEM))
+			mm_account_reclaimed_pages(xfs_buf_nr_pages(bp));
 		kvfree(bp->b_addr);
+	}
 
 	bp->b_flags &= _XBF_KMEM | _XBF_FOLIOS;
 
@@ -341,7 +333,6 @@ xfs_buf_alloc_kmem(
 		bp->b_addr = NULL;
 		return -ENOMEM;
 	}
-	bp->b_bvec_count = 1;
 	bp->b_flags |= _XBF_KMEM;
 	return 0;
 }
@@ -369,7 +360,6 @@ xfs_buf_alloc_folio(
 		return false;
 
 	bp->b_addr = folio_address(folio);
-	bp->b_bvec_count = 1;
 	bp->b_flags |= _XBF_FOLIOS;
 	return true;
 }
@@ -441,7 +431,6 @@ xfs_buf_alloc_folios(
 			count);
 		return -ENOMEM;
 	}
-	bp->b_bvec_count = count;
 	return 0;
 }
 
@@ -1470,7 +1459,9 @@ xfs_buf_bio_end_io(
 		cmpxchg(&bp->b_io_error, 0, error);
 	}
 
-	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+	if (!bp->b_error &&
+	    (bp->b_flags & XBF_READ) &&
+	    is_vmalloc_addr(bp->b_addr))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
 	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
@@ -1485,6 +1476,7 @@ xfs_buf_ioapply_map(
 	unsigned int	*buf_offset,
 	blk_opf_t	op)
 {
+	unsigned int	nr_vecs = 1;
 	struct bio	*bio;
 	int		size;
 
@@ -1494,7 +1486,9 @@ xfs_buf_ioapply_map(
 
 	atomic_inc(&bp->b_io_remaining);
 
-	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_bvec_count, op, GFP_NOIO);
+	if (is_vmalloc_addr(bp->b_addr))
+		nr_vecs = xfs_buf_nr_pages(bp);
+	bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, op, GFP_NOIO);
 	bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
 	bio->bi_end_io = xfs_buf_bio_end_io;
 	bio->bi_private = bp;
@@ -1511,7 +1505,7 @@ xfs_buf_ioapply_map(
 		*buf_offset += len;
 	} while (size);
 
-	if (xfs_buf_is_vmapped(bp))
+	if (is_vmalloc_addr(bp->b_addr))
 		flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
 	submit_bio(bio);
 }
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 32688525890bec..ad92d11f4ae173 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -195,7 +195,6 @@ struct xfs_buf {
 	int			b_map_count;
 	atomic_t		b_pin_count;	/* pin count */
 	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
-	unsigned int		b_bvec_count;	/* bvecs needed for IO */
 	int			b_error;	/* error code on I/O */
 
 	/*
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 30d53ddd6e6980..f082b1a64fc950 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,7 +169,6 @@ xmbuf_map_folio(
 	unlock_page(page);
 
 	bp->b_addr = page_address(page);
-	bp->b_bvec_count = 1;
 	return 0;
 }
 
@@ -182,7 +181,6 @@ xmbuf_unmap_folio(
 	folio_put(kmem_to_folio(bp->b_addr));
 
 	bp->b_addr = NULL;
-	bp->b_bvec_count = 0;
 }
 
 /* Is this a valid daddr within the buftarg? */
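For readers unfamiliar with the two helpers the replacement introduces:
roundup() rounds a byte count up to a multiple of its second argument,
while DIV_ROUND_UP() returns the number of whole units needed, so
xfs_buf_vmap_len() == xfs_buf_nr_pages() * PAGE_SIZE. A tiny userspace
sketch of that relationship (the macros mirror the kernel's math
helpers, but this is illustrative, not kernel code):

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define roundup(x, y)		(DIV_ROUND_UP(x, y) * (y))

int main(void)
{
	unsigned int page_size = 4096;
	unsigned int len = 6000;	/* a 6000 byte buffer */

	assert(DIV_ROUND_UP(len, page_size) == 2);	/* xfs_buf_nr_pages() analogue */
	assert(roundup(len, page_size) == 8192);	/* xfs_buf_vmap_len() analogue */
	printf("pages=%u vmap_len=%u\n",
			DIV_ROUND_UP(len, page_size),
			roundup(len, page_size));
	return 0;
}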
On Tue, Mar 19, 2024 at 01:15:21PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> It really is a unique snowflake, so peel off from normal buffer
> recovery earlier and shuffle all the unique bits into the inode
> buffer recovery function.
>
> Also, it looks like the handling of mismatched inode cluster buffer
> sizes is wrong - we have to write the recovered buffer -before- we
> mark it stale as we're not supposed to write stale buffers. I don't
> think we check that anywhere in the buffer IO path, but let's do it
> the right way anyway.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> fs/xfs/xfs_buf_item_recover.c | 99 ++++++++++++++++++++++-------------
> 1 file changed, 63 insertions(+), 36 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
> index dba57ee6fa6d..f994a303ad0a 100644
> --- a/fs/xfs/xfs_buf_item_recover.c
> +++ b/fs/xfs/xfs_buf_item_recover.c
> @@ -229,7 +229,7 @@ xlog_recover_validate_buf_type(
> * just avoid the verification stage for non-crc filesystems
> */
> if (!xfs_has_crc(mp))
> - return;
> + return 0;
>
> magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
> magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
> @@ -407,7 +407,7 @@ xlog_recover_validate_buf_type(
> * skipped.
> */
> if (current_lsn == NULLCOMMITLSN)
> - return 0;;
> + return 0;
Looks like these two should be in the previous patch.
Otherwise this looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
Looks good: Reviewed-by: Christoph Hellwig <hch@lst.de>
Add a strategic IS_ENABLED to let the compiler eliminate the unused
non-crc code if CONFIG_XFS_SUPPORT_V4 is disabled. This saves almost
20k worth of .text for my .config:

$ size xfs.o.*
   text	   data	    bss	    dec	    hex	filename
1351126	 294836	    592	1646554	 191fda	xfs.o.new
1371453	 294868	    592	1666913	 196f61	xfs.o.old

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_mount.h |  7 ++++++-
 fs/xfs/xfs_super.c | 22 +++++++++++++---------
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e880aa48de68bb..24fe6e7913c49f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,6 +327,12 @@ static inline void xfs_add_ ## name (struct xfs_mount *mp) \
 	xfs_sb_version_add ## name(&mp->m_sb); \
 }
 
+static inline bool xfs_has_crc(struct xfs_mount *mp)
+{
+	return IS_ENABLED(CONFIG_XFS_SUPPORT_V4) &&
+		(mp->m_features & XFS_FEAT_CRC);
+}
+
 /* Superblock features */
 __XFS_ADD_FEAT(attr, ATTR)
 __XFS_HAS_FEAT(nlink, NLINK)
@@ -341,7 +347,6 @@ __XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT)
 __XFS_ADD_FEAT(attr2, ATTR2)
 __XFS_HAS_FEAT(parent, PARENT)
 __XFS_ADD_FEAT(projid32, PROJID32)
-__XFS_HAS_FEAT(crc, CRC)
 __XFS_HAS_FEAT(v3inodes, V3INODES)
 __XFS_HAS_FEAT(pquotino, PQUOTINO)
 __XFS_HAS_FEAT(ftype, FTYPE)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 6828c48b15e9bd..7d972e1179255b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1580,17 +1580,21 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_free_sb;
 
-	/* V4 support is undergoing deprecation. */
-	if (!xfs_has_crc(mp)) {
-#ifdef CONFIG_XFS_SUPPORT_V4
+	/*
+	 * V4 support is undergoing deprecation.
+	 *
+	 * Note: this has to use an open coded m_features check as xfs_has_crc
+	 * always returns false for !CONFIG_XFS_SUPPORT_V4.
+	 */
+	if (!(mp->m_features & XFS_FEAT_CRC)) {
+		if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) {
+			xfs_warn(mp,
+	"Deprecated V4 format (crc=0) not supported by kernel.");
+			error = -EINVAL;
+			goto out_free_sb;
+		}
 		xfs_warn_once(mp,
 	"Deprecated V4 format (crc=0) will not be supported after September 2030.");
-#else
-		xfs_warn(mp,
-	"Deprecated V4 format (crc=0) not supported by kernel.");
-		error = -EINVAL;
-		goto out_free_sb;
-#endif
 	}
 
 	/* ASCII case insensitivity is undergoing deprecation. */
-- 
2.39.2
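For readers who haven't seen the IS_ENABLED() idiom: it expands to a
compile-time constant 0 or 1, so guarded branches are removed by
constant folding while the disabled code is still parsed and type
checked, unlike #ifdef. A generic sketch of the pattern (the config
symbol and helper functions here are invented for illustration, not
part of the patch above):

#include <linux/kconfig.h>

int foo_legacy_path(void);	/* invented helpers */
int foo_modern_path(void);

/*
 * Illustrative only: with CONFIG_FOO_LEGACY=n the branch below folds
 * away at compile time and foo_legacy_path() can be discarded by the
 * linker if nothing else calls it -- the same mechanism that sheds the
 * ~20k of non-crc text in the patch above.
 */
static int foo_init(int legacy_mode)
{
	if (IS_ENABLED(CONFIG_FOO_LEGACY) && legacy_mode)
		return foo_legacy_path();
	return foo_modern_path();
}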
Looks good: Reviewed-by: Christoph Hellwig <hch@lst.de>
This looks mostly good to me. One question: xfs_buf_free passes the folio count to mm_account_reclaimed_pages, but the name and implementation suggest it actually wants a count in page-size units.
So while this looks good to me,
> + for (i = 0; i < bp->b_folio_count; i++) {
> + if (bp->b_folios[i])
> + __folio_put(bp->b_folios[i]);
The __folio_put here really needs to be folio_put or page alloc
debugging gets very unhappy.
But even with that fixed on top of this patch the first mount just hangs
without a useful kernel backtrace in /proc/*/stack, although running
with the entire series applied it does pass the basic sanity checking
so far.
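On the __folio_put() point above: folio_put() drops a reference and
frees the folio only when the count reaches zero, whereas __folio_put()
is the "refcount already hit zero" slow path. Calling it directly skips
the decrement, which is what upsets page allocation debugging. The mm
definition is essentially this (paraphrased from include/linux/mm.h
for illustration):

static inline void folio_put(struct folio *folio)
{
	if (folio_put_testzero(folio))	/* decrement and test for zero */
		__folio_put(folio);	/* actually free the folio */
}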
On Mon, Mar 18, 2024 at 11:38:40PM -0700, Christoph Hellwig wrote:
> On Tue, Mar 19, 2024 at 09:45:53AM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > Convert the use of struct pages to struct folio everywhere. This
> > is just direct API conversion, no actual logic or code changes
> > should result.
> >
> > Note: this conversion currently assumes only single page folios are
> > allocated, and because some of the MM interfaces we use take
> > pointers to arrays of struct pages, the address of single page
> > folios and struct pages are the same. e.g alloc_pages_bulk_array(),
> > vm_map_ram(), etc.
>
> .. and this goes away by the end of the series. Maybe that's worth
> mentioning here?

Ah, yeah. I remembered to update the cover letter with this
information, so 1 out of 2 ain't bad :)

> Otherwise this looks good:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Thanks!

-Dave.
-- 
Dave Chinner
david@fromorbit.com
On Tue, Mar 19, 2024 at 09:45:53AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Convert the use of struct pages to struct folio everywhere. This
> is just direct API conversion, no actual logic or code changes
> should result.
>
> Note: this conversion currently assumes only single page folios are
> allocated, and because some of the MM interfaces we use take
> pointers to arrays of struct pages, the address of single page
> folios and struct pages are the same. e.g alloc_pages_bulk_array(),
> vm_map_ram(), etc.
.. and this goes away by the end of the series. Maybe that's worth
mentioning here?
Otherwise this looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
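The single-page-folio note quoted above works because an order-0 folio
and its struct page are the same object at the same address, so arrays
of one can be handed to interfaces expecting the other. A hedged sketch
of the property the conversion leans on (illustrative kernel-style
code, not part of the series):

/*
 * For order-0 allocations there are no tail pages, so page_folio()
 * is effectively a type-safe cast rather than a lookup: the folio and
 * the page share an address. This holds only until multi-page folios
 * are introduced later in the series.
 */
static struct folio *single_page_folio_example(gfp_t gfp)
{
	struct page *page = alloc_page(gfp);

	if (!page)
		return NULL;
	return page_folio(page);
}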
Didn't this just get merged into the for-next tree already?
Hello,

syzbot found the following issue on:

HEAD commit:    906a93befec8 Merge tag 'efi-fixes-for-v6.9-1' of git://git..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=12d6ea6e180000
kernel config:  https://syzkaller.appspot.com/x/.config?x=5206351398500a90
dashboard link: https://syzkaller.appspot.com/bug?extid=b44399433a41aaed7a9f
compiler:       gcc (Debian 12.2.0-14) 12.2.0, GNU ld (GNU Binutils for Debian) 2.40

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image (non-bootable): https://storage.googleapis.com/syzbot-assets/7bc7510fe41f/non_bootable_disk-906a93be.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/f096ab7eaede/vmlinux-906a93be.xz
kernel image: https://storage.googleapis.com/syzbot-assets/52e0859d6157/bzImage-906a93be.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+b44399433a41aaed7a9f@syzkaller.appspotmail.com

======================================================
WARNING: possible circular locking dependency detected
6.8.0-syzkaller-11405-g906a93befec8 #0 Not tainted
------------------------------------------------------
kswapd0/109 is trying to acquire lock:
ffff888022fc0958 (&qinf->qi_tree_lock){+.+.}-{3:3}, at: xfs_qm_dqfree_one+0x6f/0x1a0 fs/xfs/xfs_qm.c:1654

but task is already holding lock:
ffffffff8d930be0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x166/0x19a0 mm/vmscan.c:6782

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (fs_reclaim){+.+.}-{0:0}:
       __fs_reclaim_acquire mm/page_alloc.c:3698 [inline]
       fs_reclaim_acquire+0x102/0x160 mm/page_alloc.c:3712
       might_alloc include/linux/sched/mm.h:312 [inline]
       slab_pre_alloc_hook mm/slub.c:3746 [inline]
       slab_alloc_node mm/slub.c:3827 [inline]
       kmem_cache_alloc+0x4f/0x320 mm/slub.c:3852
       radix_tree_node_alloc.constprop.0+0x7c/0x350 lib/radix-tree.c:276
       radix_tree_extend+0x1a2/0x4d0 lib/radix-tree.c:425
       __radix_tree_create lib/radix-tree.c:613 [inline]
       radix_tree_insert+0x499/0x630 lib/radix-tree.c:712
       xfs_qm_dqget_cache_insert.constprop.0+0x38/0x2c0 fs/xfs/xfs_dquot.c:826
       xfs_qm_dqget+0x182/0x4a0 fs/xfs/xfs_dquot.c:901
       xfs_qm_scall_setqlim+0x172/0x1980 fs/xfs/xfs_qm_syscalls.c:300
       xfs_fs_set_dqblk+0x166/0x1e0 fs/xfs/xfs_quotaops.c:267
       quota_setquota+0x4c5/0x5f0 fs/quota/quota.c:310
       do_quotactl+0xb03/0x13e0 fs/quota/quota.c:802
       __do_sys_quotactl fs/quota/quota.c:961 [inline]
       __se_sys_quotactl fs/quota/quota.c:917 [inline]
       __x64_sys_quotactl+0x1b4/0x440 fs/quota/quota.c:917
       do_syscall_x64 arch/x86/entry/common.c:52 [inline]
       do_syscall_64+0xd2/0x260 arch/x86/entry/common.c:83
       entry_SYSCALL_64_after_hwframe+0x6d/0x75

-> #0 (&qinf->qi_tree_lock){+.+.}-{3:3}:
       check_prev_add kernel/locking/lockdep.c:3134 [inline]
       check_prevs_add kernel/locking/lockdep.c:3253 [inline]
       validate_chain kernel/locking/lockdep.c:3869 [inline]
       __lock_acquire+0x2478/0x3b30 kernel/locking/lockdep.c:5137
       lock_acquire kernel/locking/lockdep.c:5754 [inline]
       lock_acquire+0x1b1/0x540 kernel/locking/lockdep.c:5719
       __mutex_lock_common kernel/locking/mutex.c:608 [inline]
       __mutex_lock+0x175/0x9c0 kernel/locking/mutex.c:752
       xfs_qm_dqfree_one+0x6f/0x1a0 fs/xfs/xfs_qm.c:1654
       xfs_qm_shrink_scan+0x25c/0x3f0 fs/xfs/xfs_qm.c:531
       do_shrink_slab+0x44f/0x1160 mm/shrinker.c:435
       shrink_slab+0x18a/0x1310 mm/shrinker.c:662
       shrink_one+0x493/0x7c0 mm/vmscan.c:4774
       shrink_many mm/vmscan.c:4835 [inline]
       lru_gen_shrink_node mm/vmscan.c:4935 [inline]
       shrink_node+0x231f/0x3a80 mm/vmscan.c:5894
       kswapd_shrink_node mm/vmscan.c:6704 [inline]
       balance_pgdat+0x9a0/0x19a0 mm/vmscan.c:6895
       kswapd+0x5ea/0xb90 mm/vmscan.c:7164
       kthread+0x2c1/0x3a0 kernel/kthread.c:388
       ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(&qinf->qi_tree_lock);
                               lock(fs_reclaim);
  lock(&qinf->qi_tree_lock);

 *** DEADLOCK ***

1 lock held by kswapd0/109:
 #0: ffffffff8d930be0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x166/0x19a0 mm/vmscan.c:6782

stack backtrace:
CPU: 3 PID: 109 Comm: kswapd0 Not tainted 6.8.0-syzkaller-11405-g906a93befec8 #0
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:114
 check_noncircular+0x31a/0x400 kernel/locking/lockdep.c:2187
 check_prev_add kernel/locking/lockdep.c:3134 [inline]
 check_prevs_add kernel/locking/lockdep.c:3253 [inline]
 validate_chain kernel/locking/lockdep.c:3869 [inline]
 __lock_acquire+0x2478/0x3b30 kernel/locking/lockdep.c:5137
 lock_acquire kernel/locking/lockdep.c:5754 [inline]
 lock_acquire+0x1b1/0x540 kernel/locking/lockdep.c:5719
 __mutex_lock_common kernel/locking/mutex.c:608 [inline]
 __mutex_lock+0x175/0x9c0 kernel/locking/mutex.c:752
 xfs_qm_dqfree_one+0x6f/0x1a0 fs/xfs/xfs_qm.c:1654
 xfs_qm_shrink_scan+0x25c/0x3f0 fs/xfs/xfs_qm.c:531
 do_shrink_slab+0x44f/0x1160 mm/shrinker.c:435
 shrink_slab+0x18a/0x1310 mm/shrinker.c:662
 shrink_one+0x493/0x7c0 mm/vmscan.c:4774
 shrink_many mm/vmscan.c:4835 [inline]
 lru_gen_shrink_node mm/vmscan.c:4935 [inline]
 shrink_node+0x231f/0x3a80 mm/vmscan.c:5894
 kswapd_shrink_node mm/vmscan.c:6704 [inline]
 balance_pgdat+0x9a0/0x19a0 mm/vmscan.c:6895
 kswapd+0x5ea/0xb90 mm/vmscan.c:7164
 kthread+0x2c1/0x3a0 kernel/kthread.c:388
 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243
 </TASK>

---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup
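The report above is a classic shrinker lock inversion: qi_tree_lock is
taken both from within fs reclaim (the dquot shrinker) and around a
GFP_KERNEL radix tree node allocation that can itself enter fs reclaim.
Not a claim about how this particular report will be fixed, but the
conventional way to break this class of cycle is to scope the
allocation with memalloc_nofs_save() so it cannot recurse into
filesystem reclaim; a paraphrased sketch (the function below is a
simplified stand-in, not the real xfs_qm_dqget_cache_insert()):

#include <linux/sched/mm.h>

static int dqget_cache_insert_sketch(struct xfs_quotainfo *qi,
		struct radix_tree_root *tree, unsigned long id, void *dqp)
{
	unsigned int nofs_flags;
	int error;

	mutex_lock(&qi->qi_tree_lock);
	/* allocations in this section behave as GFP_NOFS */
	nofs_flags = memalloc_nofs_save();
	error = radix_tree_insert(tree, id, dqp);
	memalloc_nofs_restore(nofs_flags);
	mutex_unlock(&qi->qi_tree_lock);
	return error;
}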
From: Dave Chinner <dchinner@redhat.com>

Rather than passing a mix of xfs_mount and xlog (and sometimes both)
through the call chain of buffer log item recovery, just pass the
struct xlog and pull the xfs_mount from that only at the leaf
functions where it is needed.

This makes it all just a little cleaner.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_buf_item_recover.c | 94 +++++++++++++++++------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 9225baa62755..edd03b08c969 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -209,7 +209,7 @@ xlog_recover_buf_commit_pass1(
  */
 static int
 xlog_recover_validate_buf_type(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f,
 	xfs_lsn_t		current_lsn)
@@ -228,7 +228,7 @@ xlog_recover_validate_buf_type(
 	 * inconsistent state resulting in verification failures. Hence for now
 	 * just avoid the verification stage for non-crc filesystems
 	 */
-	if (!xfs_has_crc(mp))
+	if (!xfs_has_crc(log->l_mp))
 		return 0;
 
 	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
@@ -396,7 +396,7 @@ xlog_recover_validate_buf_type(
 		break;
 #endif /* CONFIG_XFS_RT */
 	default:
-		xfs_warn(mp, "Unknown buffer type %d!",
+		xfs_warn(log->l_mp, "Unknown buffer type %d!",
 			 xfs_blft_from_flags(buf_f));
 		break;
 	}
@@ -410,7 +410,7 @@ xlog_recover_validate_buf_type(
 		return 0;
 
 	if (warnmsg) {
-		xfs_warn(mp, warnmsg);
+		xfs_warn(log->l_mp, warnmsg);
 		xfs_buf_corruption_error(bp, __this_address);
 		return -EFSCORRUPTED;
 	}
@@ -428,7 +428,7 @@ xlog_recover_validate_buf_type(
 	 */
 	ASSERT(bp->b_ops);
 	bp->b_flags |= _XBF_LOGRECOVERY;
-	xfs_buf_item_init(bp, mp);
+	xfs_buf_item_init(bp, log->l_mp);
 	bp->b_log_item->bli_item.li_lsn = current_lsn;
 	return 0;
 }
@@ -441,7 +441,7 @@ xlog_recover_validate_buf_type(
  */
 static void
 xlog_recover_buffer(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xlog_recover_item *item,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f)
@@ -450,7 +450,7 @@ xlog_recover_buffer(
 	int			bit;
 	int			nbits;
 
-	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+	trace_xfs_log_recover_buf_reg_buf(log, buf_f);
 
 	bit = 0;
 	i = 1;	/* 0 is the buf format structure */
@@ -492,14 +492,14 @@
 
 static int
 xlog_recover_do_reg_buffer(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xlog_recover_item *item,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f,
 	xfs_lsn_t		current_lsn)
 {
-	xlog_recover_buffer(mp, item, bp, buf_f);
-	return xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
+	xlog_recover_buffer(log, item, bp, buf_f);
+	return xlog_recover_validate_buf_type(log, bp, buf_f, current_lsn);
 }
 
 /*
@@ -513,7 +513,6 @@ xlog_recover_do_reg_buffer(
  */
 static bool
 xlog_recover_this_dquot_buffer(
-	struct xfs_mount	*mp,
 	struct xlog		*log,
 	struct xlog_recover_item *item,
 	struct xfs_buf		*bp,
@@ -526,7 +525,7 @@ xlog_recover_this_dquot_buffer(
 	/*
 	 * Filesystems are required to send in quota flags at mount time.
 	 */
-	if (!mp->m_qflags)
+	if (!log->l_mp->m_qflags)
 		return false;
 
 	type = 0;
@@ -550,7 +549,7 @@
  */
 static int
 xlog_recover_verify_dquot_buf_item(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xlog_recover_item *item,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f)
@@ -564,7 +563,7 @@ xlog_recover_verify_dquot_buf_item(
 	case XFS_BLFT_GDQUOT_BUF:
 		break;
 	default:
-		xfs_alert(mp,
+		xfs_alert(log->l_mp,
 	"XFS: dquot buffer log format type mismatch in %s.",
 				__func__);
 		xfs_buf_corruption_error(bp, __this_address);
@@ -573,19 +572,19 @@
 
 	for (i = 1; i < item->ri_total; i++) {
 		if (item->ri_buf[i].i_addr == NULL) {
-			xfs_alert(mp,
+			xfs_alert(log->l_mp,
 				"XFS: NULL dquot in %s.", __func__);
 			return -EFSCORRUPTED;
 		}
 		if (item->ri_buf[i].i_len < sizeof(struct xfs_disk_dquot)) {
-			xfs_alert(mp,
+			xfs_alert(log->l_mp,
 				"XFS: dquot too small (%d) in %s.",
 				item->ri_buf[i].i_len, __func__);
 			return -EFSCORRUPTED;
 		}
-		fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+		fa = xfs_dquot_verify(log->l_mp, item->ri_buf[i].i_addr, -1);
 		if (fa) {
-			xfs_alert(mp,
+			xfs_alert(log->l_mp,
 	"dquot corrupt at %pS trying to replay into block 0x%llx",
 				fa, xfs_buf_daddr(bp));
 			return -EFSCORRUPTED;
@@ -612,11 +611,12 @@
  */
 static int
 xlog_recover_do_inode_buffer(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xlog_recover_item *item,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f)
 {
+	struct xfs_sb		*sbp = &log->l_mp->m_sb;
 	int			i;
 	int			item_index = 0;
 	int			bit = 0;
@@ -628,7 +628,7 @@ xlog_recover_do_inode_buffer(
 	xfs_agino_t		*logged_nextp;
 	xfs_agino_t		*buffer_nextp;
 
-	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+	trace_xfs_log_recover_buf_inode_buf(log, buf_f);
 
 	/*
 	 * If the magic number doesn't match, something has gone wrong. Don't
@@ -641,12 +641,12 @@ xlog_recover_do_inode_buffer(
 	 * Post recovery validation only works properly on CRC enabled
 	 * filesystems.
 	 */
-	if (xfs_has_crc(mp))
+	if (xfs_has_crc(log->l_mp))
 		bp->b_ops = &xfs_inode_buf_ops;
 
-	inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
+	inodes_per_buf = BBTOB(bp->b_length) >> sbp->sb_inodelog;
 	for (i = 0; i < inodes_per_buf; i++) {
-		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+		next_unlinked_offset = (i * sbp->sb_inodesize) +
 			offsetof(struct xfs_dinode, di_next_unlinked);
 
 		while (next_unlinked_offset >=
@@ -695,8 +695,8 @@ xlog_recover_do_inode_buffer(
 		 */
 		logged_nextp = item->ri_buf[item_index].i_addr +
 				next_unlinked_offset - reg_buf_offset;
-		if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
-			xfs_alert(mp,
+		if (XFS_IS_CORRUPT(log->l_mp, *logged_nextp == 0)) {
+			xfs_alert(log->l_mp,
 		"Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
 		"Trying to replay bad (0) inode di_next_unlinked field.",
 				item, bp);
@@ -711,8 +711,8 @@ xlog_recover_do_inode_buffer(
 		 * have to leave the inode in a consistent state for whoever
 		 * reads it next....
 		 */
-		xfs_dinode_calc_crc(mp,
-				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+		xfs_dinode_calc_crc(log->l_mp,
+				xfs_buf_offset(bp, i * sbp->sb_inodesize));
 
 	}
@@ -734,7 +734,7 @@ xlog_recover_do_inode_buffer(
 	 * it stale so that it won't be found on overlapping buffer lookups and
 	 * caller knows not to queue it for delayed write.
 	 */
-	if (BBTOB(bp->b_length) != M_IGEO(mp)->inode_cluster_size) {
+	if (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size) {
 		int error;
 
 		error = xfs_bwrite(bp);
@@ -792,7 +792,7 @@ xlog_recovery_is_dir_buf(
  */
 static void
 xlog_recover_do_partial_dabuf(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xlog_recover_item *item,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f)
@@ -802,7 +802,7 @@ xlog_recover_do_partial_dabuf(
 	 * and rely on post pass2 recovery cache purge to clean these out of
 	 * memory.
 	 */
-	xlog_recover_buffer(mp, item, bp, buf_f);
+	xlog_recover_buffer(log, item, bp, buf_f);
 }
 
 /*
@@ -827,7 +827,7 @@ xlog_recover_do_partial_dabuf(
  */
 static xfs_lsn_t
 xlog_recover_get_buf_lsn(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f)
 {
@@ -839,7 +839,7 @@ xlog_recover_get_buf_lsn(
 	uint16_t		blft;
 
 	/* v4 filesystems always recover immediately */
-	if (!xfs_has_crc(mp))
+	if (!xfs_has_crc(log->l_mp))
 		goto recover_immediately;
 
 	/*
@@ -916,7 +916,7 @@ xlog_recover_get_buf_lsn(
 	 * the relevant UUID in the superblock.
 	 */
 	lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
-	if (xfs_has_metauuid(mp))
+	if (xfs_has_metauuid(log->l_mp))
 		uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
 	else
 		uuid = &((struct xfs_dsb *)blk)->sb_uuid;
@@ -926,7 +926,7 @@ xlog_recover_get_buf_lsn(
 	}
 
 	if (lsn != (xfs_lsn_t)-1) {
-		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+		if (!uuid_equal(&log->l_mp->m_sb.sb_meta_uuid, uuid))
 			goto recover_immediately;
 		return lsn;
 	}
@@ -945,7 +945,7 @@ xlog_recover_get_buf_lsn(
 	}
 
 	if (lsn != (xfs_lsn_t)-1) {
-		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+		if (!uuid_equal(&log->l_mp->m_sb.sb_meta_uuid, uuid))
 			goto recover_immediately;
 		return lsn;
 	}
@@ -977,14 +977,14 @@
  */
 static bool
 xlog_recover_this_buffer(
-	struct xfs_mount	*mp,
+	struct xlog		*log,
 	struct xfs_buf		*bp,
 	struct xfs_buf_log_format *buf_f,
 	xfs_lsn_t		current_lsn)
 {
 	xfs_lsn_t		lsn;
 
-	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
+	lsn = xlog_recover_get_buf_lsn(log, bp, buf_f);
 	if (!lsn)
 		return true;
 	if (lsn == -1)
@@ -992,8 +992,8 @@ xlog_recover_this_buffer(
 	if (XFS_LSN_CMP(lsn, current_lsn) < 0)
 		return true;
 
-	trace_xfs_log_recover_buf_skip(mp->m_log, buf_f);
-	xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+	trace_xfs_log_recover_buf_skip(log, buf_f);
+	xlog_recover_validate_buf_type(log, bp, buf_f, NULLCOMMITLSN);
 
 	/*
 	 * We're skipping replay of this buffer log item due to the log
@@ -1065,22 +1065,22 @@ xlog_recover_buf_commit_pass2(
 	 * to simplify the rest of the code.
 	 */
 	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
-		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+		error = xlog_recover_do_inode_buffer(log, item, bp, buf_f);
 		if (error || (bp->b_flags & XBF_STALE))
 			goto out_release;
 		goto out_write;
 	}
 
 	if (buf_f->blf_flags & XFS_BLF_DQUOT_BUF) {
-		if (!xlog_recover_this_dquot_buffer(mp, log, item, bp, buf_f))
+		if (!xlog_recover_this_dquot_buffer(log, item, bp, buf_f))
 			goto out_release;
 
-		error = xlog_recover_verify_dquot_buf_item(mp, item, bp, buf_f);
+		error = xlog_recover_verify_dquot_buf_item(log, item, bp, buf_f);
 		if (error)
 			goto out_release;
 
-		xlog_recover_buffer(mp, item, bp, buf_f);
-		error = xlog_recover_validate_buf_type(mp, bp, buf_f,
+		xlog_recover_buffer(log, item, bp, buf_f);
+		error = xlog_recover_validate_buf_type(log, bp, buf_f,
 				NULLCOMMITLSN);
 		if (error)
 			goto out_release;
@@ -1095,14 +1095,14 @@ xlog_recover_buf_commit_pass2(
 	 */
 	if (xlog_recovery_is_dir_buf(buf_f) &&
 	    mp->m_dir_geo->blksize != BBTOB(buf_f->blf_len)) {
-		xlog_recover_do_partial_dabuf(mp, item, bp, buf_f);
+		xlog_recover_do_partial_dabuf(log, item, bp, buf_f);
 		goto out_write;
 	}
 
 	/*
 	 * Whole buffer recovery, dependent on the LSN in the on-disk structure.
 	 */
-	if (!xlog_recover_this_buffer(mp, bp, buf_f, current_lsn)) {
+	if (!xlog_recover_this_buffer(log, bp, buf_f, current_lsn)) {
 		/*
 		 * We may have verified this buffer even though we aren't
 		 * recovering it. Return the verifier error for early detection
@@ -1113,7 +1113,7 @@ xlog_recover_buf_commit_pass2(
 	}
 
-	error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+	error = xlog_recover_do_reg_buffer(log, item, bp, buf_f, current_lsn);
 	if (error)
 		goto out_release;
-- 
2.43.0
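The shape of this refactor is a common plumbing pattern: thread the
most specific context object (the xlog) through the call chain and
derive the broader context (log->l_mp) only at the leaf call sites
that need it, rather than passing (mount, log) pairs around. A generic
userspace sketch of the idiom, with invented names:

#include <stdio.h>

struct mount { const char *m_name; };
struct log   { struct mount *l_mp; };	/* back pointer, like xlog->l_mp */

/* leaf: the only place the wider mount context is needed */
static void leaf_report(struct log *log, const char *msg)
{
	printf("%s: %s\n", log->l_mp->m_name, msg);
}

/* mid-layers take only the log, never (mount, log) pairs */
static void recover_one_item(struct log *log)
{
	leaf_report(log, "recovering item");
}

int main(void)
{
	struct mount m = { "xfs0" };
	struct log l = { &m };

	recover_one_item(&l);
	return 0;
}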
From: Dave Chinner <dchinner@redhat.com>

When a compound buffer is logged (e.g. fragmented large directory
block) we record it in the log as a series of separate buffer log
format items in the journal. These get recovered individually, and
because they are for non-contiguous extent ranges, we cannot use
buffer addresses to detect that the buffer format items are from the
same directory block.

Further, we cannot use LSN checks to determine if the partial block
buffers should be recovered - apart from the first buffer we don't
have a header with an LSN in it to check.

Finally, we cannot add a verifier to a partial block buffer because,
again, it will fail the verifier checks and report corruption. We
already skip this step due to bad magic number detection, but we
should be able to do better here.

The one thing we can rely on, though, is that each buffer format item
is written consecutively in the journal. They are built at commit time
into a single log iovec and chained into the iclog write log vector
chain as an unbroken sequence. Hence all the parts of a compound
buffer should be consecutive buf log format items in the transaction
being recovered.

Unfortunately, we don't have the information available in recovery to
do a full compound buffer instantiation for recovery. We only have the
fragments that contained modifications in the journal, and so there
may be missing fragments that are still clean and hence are not in the
journal. Hence we cannot use journal state to rebuild the compound
buffer entirely and hence recover it as a complete entity and run a
verifier over it before writeback.

Hence the first thing we need to do is detect such partial buffer
recovery situations and track whether we need to skip all the partial
buffers due to the LSN check in the initial header fragment read from
disk.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_buf_item_recover.c | 178 +++++++++++++++++++++++++++-------
 1 file changed, 143 insertions(+), 35 deletions(-)

diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 90740fcf2fbe..9225baa62755 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -434,18 +434,17 @@ xlog_recover_validate_buf_type(
 }
 
 /*
- * Perform a 'normal' buffer recovery.  Each logged region of the
+ * Perform recovery of the logged regions.  Each logged region of the
  * buffer should be copied over the corresponding region in the
  * given buffer.  The bitmap in the buf log format structure indicates
  * where to place the logged data.
  */
-static int
-xlog_recover_do_reg_buffer(
+static void
+xlog_recover_buffer(
 	struct xfs_mount	*mp,
 	struct xlog_recover_item *item,
 	struct xfs_buf		*bp,
-	struct xfs_buf_log_format *buf_f,
-	xfs_lsn_t		current_lsn)
+	struct xfs_buf_log_format *buf_f)
 {
 	int			i;
 	int			bit;
@@ -489,7 +488,17 @@ xlog_recover_do_reg_buffer(
 
 	/* Shouldn't be any more regions */
 	ASSERT(i == item->ri_total);
+}
 
+static int
+xlog_recover_do_reg_buffer(
+	struct xfs_mount	*mp,
+	struct xlog_recover_item *item,
+	struct xfs_buf		*bp,
+	struct xfs_buf_log_format *buf_f,
+	xfs_lsn_t		current_lsn)
+{
+	xlog_recover_buffer(mp, item, bp, buf_f);
 	return xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
 }
 
@@ -735,6 +744,67 @@ xlog_recover_do_inode_buffer(
 	return 0;
 }
 
+static bool
+xlog_recovery_is_dir_buf(
+	struct xfs_buf_log_format *buf_f)
+{
+	switch (xfs_blft_from_flags(buf_f)) {
+	case XFS_BLFT_DIR_BLOCK_BUF:
+	case XFS_BLFT_DIR_DATA_BUF:
+	case XFS_BLFT_DIR_FREE_BUF:
+	case XFS_BLFT_DIR_LEAF1_BUF:
+	case XFS_BLFT_DIR_LEAFN_BUF:
+	case XFS_BLFT_DA_NODE_BUF:
+		return true;
+	default:
+		break;
+	}
+	return false;
+}
+
+/*
+ * Partial dabuf recovery.
+ *
+ * There are two main cases here - a buffer that contains the dabuf header and
+ * hence can be magic number and LSN checked, and then everything else.
+ *
+ * We can determine if the former should be replayed or not via LSN checks, but
+ * we cannot do that with the latter, so the only choice we have here is to
+ * always recover the changes regardless of whether this means metadata on disk
+ * will go backwards in time. This, at least, means that the changes in each
+ * checkpoint are applied consistently to the dabuf and we don't do really
+ * stupid things like skip the header fragment replay and then replay all the
+ * other changes to the dabuf block.
+ *
+ * While this is not ideal, finishing log recovery should then replay all the
+ * remaining changes across this buffer and so bring it back to being consistent
+ * on disk at the completion of recovery. Hence this "going backwards in time"
+ * situation will only be relevant to failed journal replay situations. These
+ * are rare and will require xfs_repair to be run, anyway, so the inconsistency
+ * that results will be corrected before the filesystem goes back into service,
+ * anyway.
+ *
+ * Important: This partial fragment recovery relies on log recovery purging the
+ * buffer cache after completion of this recovery phase. These partial buffers
+ * are never used at runtime (discontiguous buffers will be used instead), so
+ * they must be removed from the buffer cache to prevent them from causing
+ * overlapping range lookup failures for the entire dabuf range.
+ */
+static void
+xlog_recover_do_partial_dabuf(
+	struct xfs_mount	*mp,
+	struct xlog_recover_item *item,
+	struct xfs_buf		*bp,
+	struct xfs_buf_log_format *buf_f)
+{
+	/*
+	 * Always recover without verification or write verifiers. Use delwri
+	 * and rely on post pass2 recovery cache purge to clean these out of
+	 * memory.
+	 */
+	xlog_recover_buffer(mp, item, bp, buf_f);
+}
+
 /*
  * V5 filesystems know the age of the buffer on disk being recovered. We can
  * have newer objects on disk than we are replaying, and so for these cases we
@@ -886,6 +956,54 @@ xlog_recover_get_buf_lsn(
 }
 
+/*
+ * Recover the buffer only if we get an LSN from it and it's less than the lsn
+ * of the transaction we are replaying.
+ *
+ * Note that we have to be extremely careful of readahead here. Readahead does
+ * not attach verfiers to the buffers so if we don't actually do any replay
+ * after readahead because of the LSN we found in the buffer if more recent than
+ * that current transaction then we need to attach the verifier directly.
+ * Failure to do so can lead to future recovery actions (e.g. EFI and unlinked
+ * list recovery) can operate on the buffers and they won't get the verifier
+ * attached. This can lead to blocks on disk having the correct content but a
+ * stale CRC.
+ *
+ * It is safe to assume these clean buffers are currently up to date. If the
+ * buffer is dirtied by a later transaction being replayed, then the verifier
+ * will be reset to match whatever recover turns that buffer into.
+ *
+ * Return true if the buffer needs to be recovered, false if it doesn't.
+ */
+static bool
+xlog_recover_this_buffer(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct xfs_buf_log_format *buf_f,
+	xfs_lsn_t		current_lsn)
+{
+	xfs_lsn_t		lsn;
+
+	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
+	if (!lsn)
+		return true;
+	if (lsn == -1)
+		return true;
+	if (XFS_LSN_CMP(lsn, current_lsn) < 0)
+		return true;
+
+	trace_xfs_log_recover_buf_skip(mp->m_log, buf_f);
+	xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+
+	/*
+	 * We're skipping replay of this buffer log item due to the log
+	 * item LSN being behind the ondisk buffer. Verify the buffer
+	 * contents since we aren't going to run the write verifier.
+	 */
+	if (bp->b_ops)
+		bp->b_ops->verify_read(bp);
+	return false;
+}
+
 /*
  * This routine replays a modification made to a buffer at runtime.
  * There are actually two types of buffer, regular and inode, which
@@ -920,7 +1038,6 @@ xlog_recover_buf_commit_pass2(
 	struct xfs_mount	*mp = log->l_mp;
 	struct xfs_buf		*bp;
 	int			error;
-	xfs_lsn_t		lsn;
 
 	/*
 	 * In this pass we only want to recover all the buffers which have
@@ -962,7 +1079,8 @@ xlog_recover_buf_commit_pass2(
 		if (error)
 			goto out_release;
 
-		error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f,
+		xlog_recover_buffer(mp, item, bp, buf_f);
+		error = xlog_recover_validate_buf_type(mp, bp, buf_f,
 				NULLCOMMITLSN);
 		if (error)
 			goto out_release;
@@ -970,41 +1088,31 @@ xlog_recover_buf_commit_pass2(
 	}
 
 	/*
-	 * Recover the buffer only if we get an LSN from it and it's less than
-	 * the lsn of the transaction we are replaying.
-	 *
-	 * Note that we have to be extremely careful of readahead here.
-	 * Readahead does not attach verfiers to the buffers so if we don't
-	 * actually do any replay after readahead because of the LSN we found
-	 * in the buffer if more recent than that current transaction then we
-	 * need to attach the verifier directly. Failure to do so can lead to
-	 * future recovery actions (e.g. EFI and unlinked list recovery) can
-	 * operate on the buffers and they won't get the verifier attached. This
-	 * can lead to blocks on disk having the correct content but a stale
-	 * CRC.
-	 *
-	 * It is safe to assume these clean buffers are currently up to date.
-	 * If the buffer is dirtied by a later transaction being replayed, then
-	 * the verifier will be reset to match whatever recover turns that
-	 * buffer into.
+	 * Directory buffers can be larger than a single filesystem block and
+	 * if they are they can be fragmented. There are lots of concerns about
+	 * recovering these, so push them out of line where the concerns can be
+	 * documented clearly.
 	 */
-	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
-	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
-		trace_xfs_log_recover_buf_skip(log, buf_f);
-		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+	if (xlog_recovery_is_dir_buf(buf_f) &&
+	    mp->m_dir_geo->blksize != BBTOB(buf_f->blf_len)) {
+		xlog_recover_do_partial_dabuf(mp, item, bp, buf_f);
+		goto out_write;
+	}
 
+	/*
+	 * Whole buffer recovery, dependent on the LSN in the on-disk structure.
+	 */
+	if (!xlog_recover_this_buffer(mp, bp, buf_f, current_lsn)) {
 		/*
-		 * We're skipping replay of this buffer log item due to the log
-		 * item LSN being behind the ondisk buffer. Verify the buffer
-		 * contents since we aren't going to run the write verifier.
+		 * We may have verified this buffer even though we aren't
+		 * recovering it. Return the verifier error for early detection
+		 * of recovery inconsistencies.
 		 */
-		if (bp->b_ops) {
-			bp->b_ops->verify_read(bp);
-			error = bp->b_error;
-		}
+		error = bp->b_error;
 		goto out_release;
 	}
 
+	error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
 	if (error)
 		goto out_release;
-- 
2.43.0
Recently Zorro tripped over a failure with 64kB directory blocks on
s390x via generic/648. Recovery was reporting failures like this:

XFS (loop3): Mounting V5 Filesystem c1954438-a18d-4b4a-ad32-0e29c40713ed
XFS (loop3): Starting recovery (logdev: internal)
XFS (loop3): Bad dir block magic!
XFS: Assertion failed: 0, file: fs/xfs/xfs_buf_item_recover.c, line: 414
....

Or it was succeeding and later operations were detecting directory
block corruption during directory operations, such as:

XFS (loop3): Metadata corruption detected at __xfs_dir3_data_check+0x372/0x6c0 [xfs], xfs_dir3_block block 0x1020
XFS (loop3): Unmount and run xfs_repair
XFS (loop3): First 128 bytes of corrupted metadata buffer:
00000000: 58 44 42 33 00 00 00 00 00 00 00 00 00 00 10 20  XDB3........... 
....

Further triage and diagnosis pointed to the fact that the test was
generating a discontiguous (multi-extent) directory block and that
directory block was not being recovered correctly when it was
encountered.

Zorro captured a trace, and what we saw in the trace was a specific
pattern of buffer log items being processed through every phase of
recovery:

xfs_log_recover_buf_not_cancel: dev 7:0 daddr 0x2c2ce0, bbcount 0x10, flags 0x5000, size 2, map_size 2
xfs_log_recover_item_recover: dev 7:0 tid 0xce3ce480 lsn 0x300014178, pass 1, item 0x8ea70fc0, item type XFS_LI_BUF item region count/total 2/2
xfs_log_recover_buf_not_cancel: dev 7:0 daddr 0x331fb0, bbcount 0x58, flags 0x5000, size 2, map_size 11
xfs_log_recover_item_recover: dev 7:0 tid 0xce3ce480 lsn 0x300014178, pass 1, item 0x8f36c040, item type XFS_LI_BUF item region count/total 2/2

The item addresses, tid and LSN change, but the order of the two buf
log items does not. These are both "flags 0x5000", which means both log
items are XFS_BLFT_DIR_BLOCK_BUF types, they are both partial directory
block buffers, and they are discontiguous. They also have different
types of log items both before and after them, so it is likely these
are two extents within the same compound buffer.

The way we log compound buffers is that we create a buf log format
item for each extent in the buffer, and then we log each range as a
separate buf log format item. IOWs, to recovery, these fragments of
the directory block appear just like complete regular buffers that
need to be recovered.

Hence what we see above is that the first buffer (daddr 0x2c2ce0,
bbcount 0x10) is the first extent in the directory block, containing
the header and magic number, so it recovers and verifies just fine.
The second buffer is the tail of the directory block, and it does not
contain a magic number because it starts mid-way through the directory
block. Hence the magic number verification fails and the buffer is not
recovered.

Compound buffers were logged this way so that they didn't require a
change of log format to recover. Prior to compound buffers, the
directory code kept its own dabuf structure to map multiple extents in
a single directory block, and they got logged as separate buffer log
format items as well. So the problem isn't directly related to the
conversion of dabufs to compound buffers - the problem is related to
the buffer recovery verification code not knowing that directory
buffer fragments are valid recovery targets.

Hence the fixes in this patchset are to log recovery, and do not
change runtime behaviour at all.

The first thing we do is change the buffer recovery code to consider a
type mismatch between the BLF and the buffer contents as a fatal error
instead of a warning. If we just warn and continue, the recovered
metadata may still be corrupt and so we should just abort with
-EFSCORRUPTED when this occurs. That addresses the silent recovery
success followed by runtime detection of directory corruption issue
that was encountered.

We then need to untangle the buffer recovery code a bit. Inode buffer,
dquot buffer and regular buffer recovery are all a bit different, but
they are tightly intertwined. Neither dquot nor inode buffer recovery
needs discontiguous buffer recovery detection, and they also have
different constraints, so separate them out. We also always recover
inode and dquot buffers, so we don't need to check magic numbers or
decode internal LSNs to determine if they should be recovered.

With that done, we can then add code to the general buffer recovery to
detect partial block recovery situations. We check the BLF type to
determine if it is a directory buffer, and add a path for recovery of
partial directory block items. This allows recovery of regions of
directory blocks that do not start at offset 0 in the directory block.
This fixes the initial "bad dir block magic" issue reported, and
results in correct recovery of discontiguous directory blocks. IOWs,
this appears to be a log recovery problem and not a runtime issue. I
think the fix will be to allow directory blocks to fail the magic
number check if and only if the buffer length does not match the
directory block size (i.e. it is a partial directory block fragment
being recovered).

This passes repeated looping over '-g enospc -g recoveryloop' on 64kB
directory block size configurations, so the change to recovery hasn't
caused any obvious regressions in fixing this issue.

Thoughts?
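To make the detection logic described above concrete, here is the
check the series adds, distilled into one predicate (a reading aid
built from the patch hunks in this thread, not the committed code):

/*
 * A buf log format item is a partial directory block fragment when it
 * carries a directory BLF type but its logged length does not cover a
 * whole directory block. Such fragments may lack the magic number
 * header, so they are replayed unconditionally instead of being
 * magic/LSN checked.
 */
static bool
is_partial_dir_fragment(
	struct xfs_mount		*mp,
	struct xfs_buf_log_format	*buf_f)
{
	return xlog_recovery_is_dir_buf(buf_f) &&
	       mp->m_dir_geo->blksize != BBTOB(buf_f->blf_len);
}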
From: Dave Chinner <dchinner@redhat.com>

Dquot buffers are only logged when the dquot cluster is allocated.
Recovery of them is always done and not conditional on an LSN found
in the buffer because there aren't dquots stamped in the buffer when
the initialisation is replayed after allocation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_log_format.h |   6 ++
 fs/xfs/xfs_buf_item_recover.c  | 129 +++++++++++++++++----------------
 2 files changed, 72 insertions(+), 63 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 16872972e1e9..5ac0c3066930 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -508,6 +508,9 @@ struct xfs_log_dinode {
 #define	XFS_BLF_PDQUOT_BUF	(1<<3)
 #define	XFS_BLF_GDQUOT_BUF	(1<<4)
 
+#define XFS_BLF_DQUOT_BUF \
+	(XFS_BLF_UDQUOT_BUF | XFS_BLF_PDQUOT_BUF | XFS_BLF_GDQUOT_BUF)
+
 /*
  * This is the structure used to lay out a buf log item in the log.  The data
  * map describes which 128 byte chunks of the buffer have been logged.
@@ -571,6 +574,9 @@ enum xfs_blft {
 	XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
 };
 
+#define XFS_BLFT_DQUOT_BUF \
+	(XFS_BLFT_UDQUOT_BUF | XFS_BLFT_PDQUOT_BUF | XFS_BLFT_GDQUOT_BUF)
+
 static inline void
 xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
 {
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index f994a303ad0a..90740fcf2fbe 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -450,8 +450,6 @@ xlog_recover_do_reg_buffer(
 	int			i;
 	int			bit;
 	int			nbits;
-	xfs_failaddr_t		fa;
-	const size_t		size_disk_dquot = sizeof(struct xfs_disk_dquot);
 
 	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
 
@@ -481,39 +479,10 @@ xlog_recover_do_reg_buffer(
 		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
 			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
 
-		/*
-		 * Do a sanity check if this is a dquot buffer. Just checking
-		 * the first dquot in the buffer should do. XXXThis is
-		 * probably a good thing to do for other buf types also.
-		 */
-		fa = NULL;
-		if (buf_f->blf_flags &
-		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
-			if (item->ri_buf[i].i_addr == NULL) {
-				xfs_alert(mp,
-					"XFS: NULL dquot in %s.", __func__);
-				goto next;
-			}
-			if (item->ri_buf[i].i_len < size_disk_dquot) {
-				xfs_alert(mp,
-					"XFS: dquot too small (%d) in %s.",
-					item->ri_buf[i].i_len, __func__);
-				goto next;
-			}
-			fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
-			if (fa) {
-				xfs_alert(mp,
-	"dquot corrupt at %pS trying to replay into block 0x%llx",
-					fa, xfs_buf_daddr(bp));
-				goto next;
-			}
-		}
-
 		memcpy(xfs_buf_offset(bp,
 			(uint)bit << XFS_BLF_SHIFT),	/* dest */
 			item->ri_buf[i].i_addr,		/* source */
 			nbits<<XFS_BLF_SHIFT);		/* length */
- next:
 		i++;
 		bit += nbits;
 	}
@@ -566,6 +535,56 @@ xlog_recover_this_dquot_buffer(
 	return true;
 }
 
+/*
+ * Do a sanity check of each region in the item to ensure we are actually
+ * recovering a dquot buffer item.
+ */
+static int
+xlog_recover_verify_dquot_buf_item(
+	struct xfs_mount	*mp,
+	struct xlog_recover_item *item,
+	struct xfs_buf		*bp,
+	struct xfs_buf_log_format *buf_f)
+{
+	xfs_failaddr_t		fa;
+	int			i;
+
+	switch (xfs_blft_from_flags(buf_f)) {
+	case XFS_BLFT_UDQUOT_BUF:
+	case XFS_BLFT_PDQUOT_BUF:
+	case XFS_BLFT_GDQUOT_BUF:
+		break;
+	default:
+		xfs_alert(mp,
+	"XFS: dquot buffer log format type mismatch in %s.",
+				__func__);
+		xfs_buf_corruption_error(bp, __this_address);
+		return -EFSCORRUPTED;
+	}
+
+	for (i = 1; i < item->ri_total; i++) {
+		if (item->ri_buf[i].i_addr == NULL) {
+			xfs_alert(mp,
+				"XFS: NULL dquot in %s.", __func__);
+			return -EFSCORRUPTED;
+		}
+		if (item->ri_buf[i].i_len < sizeof(struct xfs_disk_dquot)) {
+			xfs_alert(mp,
+				"XFS: dquot too small (%d) in %s.",
+				item->ri_buf[i].i_len, __func__);
+			return -EFSCORRUPTED;
+		}
+		fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+		if (fa) {
+			xfs_alert(mp,
+"dquot corrupt at %pS trying to replay into block 0x%llx",
+				fa, xfs_buf_daddr(bp));
+			return -EFSCORRUPTED;
+		}
+	}
+	return 0;
+}
+
 /*
  * Perform recovery for a buffer full of inodes. We don't have inode cluster
  * buffer specific LSNs, so we always recover inode buffers if they contain
@@ -743,7 +762,6 @@ xlog_recover_get_buf_lsn(
 	struct xfs_buf_log_format *buf_f)
 {
 	uint32_t		magic32;
-	uint16_t		magic16;
 	uint16_t		magicda;
 	void			*blk = bp->b_addr;
 	uuid_t			*uuid;
@@ -862,27 +880,7 @@ xlog_recover_get_buf_lsn(
 		return lsn;
 	}
 
-	/*
-	 * We do individual object checks on dquot and inode buffers as they
-	 * have their own individual LSN records. Also, we could have a stale
-	 * buffer here, so we have to at least recognise these buffer types.
-	 *
-	 * A notd complexity here is inode unlinked list processing - it logs
-	 * the inode directly in the buffer, but we don't know which inodes have
-	 * been modified, and there is no global buffer LSN. Hence we need to
-	 * recover all inode buffer types immediately. This problem will be
-	 * fixed by logical logging of the unlinked list modifications.
-	 */
-	magic16 = be16_to_cpu(*(__be16 *)blk);
-	switch (magic16) {
-	case XFS_DQUOT_MAGIC:
-		goto recover_immediately;
-	default:
-		break;
-	}
-
 	/* unknown buffer contents, recover immediately */
-
 recover_immediately:
 	return (xfs_lsn_t)-1;
 
@@ -956,6 +954,21 @@ xlog_recover_buf_commit_pass2(
 		goto out_write;
 	}
 
+	if (buf_f->blf_flags & XFS_BLF_DQUOT_BUF) {
+		if (!xlog_recover_this_dquot_buffer(mp, log, item, bp, buf_f))
+			goto out_release;
+
+		error = xlog_recover_verify_dquot_buf_item(mp, item, bp, buf_f);
+		if (error)
+			goto out_release;
+
+		error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f,
+				NULLCOMMITLSN);
+		if (error)
+			goto out_release;
+		goto out_write;
+	}
+
 	/*
 	 * Recover the buffer only if we get an LSN from it and it's less than
 	 * the lsn of the transaction we are replaying.
@@ -992,17 +1005,7 @@ xlog_recover_buf_commit_pass2(
 		goto out_release;
 	}
 
-	if (buf_f->blf_flags &
-	   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
-		if (!xlog_recover_this_dquot_buffer(mp, log, item, bp, buf_f))
-			goto out_release;
-
-		error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f,
-				NULLCOMMITLSN);
-	} else {
-		error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f,
-				current_lsn);
-	}
+	error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
 	if (error)
 		goto out_release;
-- 
2.43.0
From: Dave Chinner <dchinner@redhat.com>

It really is a unique snowflake, so peel off from normal buffer
recovery earlier and shuffle all the unique bits into the inode
buffer recovery function.

Also, it looks like the handling of mismatched inode cluster buffer
sizes is wrong - we have to write the recovered buffer -before- we
mark it stale as we're not supposed to write stale buffers. I don't
think we check that anywhere in the buffer IO path, but let's do it
the right way anyway.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_buf_item_recover.c | 99 ++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 36 deletions(-)

diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index dba57ee6fa6d..f994a303ad0a 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -229,7 +229,7 @@ xlog_recover_validate_buf_type(
 	 * just avoid the verification stage for non-crc filesystems
 	 */
 	if (!xfs_has_crc(mp))
-		return;
+		return 0;
 
 	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
 	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
@@ -407,7 +407,7 @@ xlog_recover_validate_buf_type(
 	 * skipped.
 	 */
 	if (current_lsn == NULLCOMMITLSN)
-		return 0;;
+		return 0;
 
 	if (warnmsg) {
 		xfs_warn(mp, warnmsg);
@@ -567,18 +567,22 @@ xlog_recover_this_dquot_buffer(
 }
 
 /*
- * Perform recovery for a buffer full of inodes. In these buffers, the only
- * data which should be recovered is that which corresponds to the
- * di_next_unlinked pointers in the on disk inode structures. The rest of the
- * data for the inodes is always logged through the inodes themselves rather
- * than the inode buffer and is recovered in xlog_recover_inode_pass2().
+ * Perform recovery for a buffer full of inodes. We don't have inode cluster
+ * buffer specific LSNs, so we always recover inode buffers if they contain
+ * inodes.
+ *
+ * In these buffers, the only inode data which should be recovered is that which
+ * corresponds to the di_next_unlinked pointers in the on disk inode structures.
+ * The rest of the data for the inodes is always logged through the inodes
+ * themselves rather than the inode buffer and is recovered in
+ * xlog_recover_inode_pass2().
  *
  * The only time when buffers full of inodes are fully recovered is when the
- * buffer is full of newly allocated inodes. In this case the buffer will
- * not be marked as an inode buffer and so will be sent to
- * xlog_recover_do_reg_buffer() below during recovery.
+ * buffer is full of newly allocated inodes. In this case the buffer will not
+ * be marked as an inode buffer and so xlog_recover_do_reg_buffer() will be used
+ * instead.
  */
-STATIC int
+static int
 xlog_recover_do_inode_buffer(
 	struct xfs_mount	*mp,
 	struct xlog_recover_item *item,
@@ -598,6 +602,13 @@ xlog_recover_do_inode_buffer(
 
 	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
 
+	/*
+	 * If the magic number doesn't match, something has gone wrong. Don't
+	 * recover the buffer.
+	 */
+	if (cpu_to_be16(XFS_DINODE_MAGIC) != *((__be16 *)bp->b_addr))
+		return -EFSCORRUPTED;
+
 	/*
 	 * Post recovery validation only works properly on CRC enabled
 	 * filesystems.
@@ -677,6 +688,31 @@ xlog_recover_do_inode_buffer(
 
 	}
 
+	/*
+	 * Make sure that only inode buffers with good sizes remain valid after
+	 * recovering this buffer item.
+	 *
+	 * The kernel moves inodes in buffers of 1 block or inode_cluster_size
+	 * bytes, whichever is bigger. The inode buffers in the log can be a
+	 * different size if the log was generated by an older kernel using
+	 * unclustered inode buffers or a newer kernel running with a different
+	 * inode cluster size. Regardless, if the inode buffer size isn't
+	 * max(blocksize, inode_cluster_size) for *our* value of
+	 * inode_cluster_size, then we need to keep the buffer out of the buffer
+	 * cache so that the buffer won't overlap with future reads of those
+	 * inodes.
+	 *
+	 * To achieve this, we write the buffer to recover the inodes then mark
+	 * it stale so that it won't be found on overlapping buffer lookups and
+	 * caller knows not to queue it for delayed write.
+	 */
+	if (BBTOB(bp->b_length) != M_IGEO(mp)->inode_cluster_size) {
+		int error;
+
+		error = xfs_bwrite(bp);
+		xfs_buf_stale(bp);
+		return error;
+	}
 	return 0;
 }
 
@@ -840,7 +876,6 @@ xlog_recover_get_buf_lsn(
 	magic16 = be16_to_cpu(*(__be16 *)blk);
 	switch (magic16) {
 	case XFS_DQUOT_MAGIC:
-	case XFS_DINODE_MAGIC:
 		goto recover_immediately;
 	default:
 		break;
@@ -910,6 +945,17 @@ xlog_recover_buf_commit_pass2(
 	if (error)
 		return error;
 
+	/*
+	 * Inode buffer recovery is quite unique, so go out separate ways here
+	 * to simplify the rest of the code.
+	 */
+	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+		if (error || (bp->b_flags & XBF_STALE))
+			goto out_release;
+		goto out_write;
+	}
+
 	/*
 	 * Recover the buffer only if we get an LSN from it and it's less than
 	 * the lsn of the transaction we are replaying.
@@ -946,9 +992,7 @@ xlog_recover_buf_commit_pass2(
 		goto out_release;
 	}
 
-	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
-		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
-	} else if (buf_f->blf_flags &
+	if (buf_f->blf_flags &
 	   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
 		if (!xlog_recover_this_dquot_buffer(mp, log, item, bp, buf_f))
 			goto out_release;
@@ -965,28 +1009,11 @@ xlog_recover_buf_commit_pass2(
 	/*
 	 * Perform delayed write on the buffer. Asynchronous writes will be
 	 * slower when taking into account all the buffers to be flushed.
-	 *
-	 * Also make sure that only inode buffers with good sizes stay in
-	 * the buffer cache. The kernel moves inodes in buffers of 1 block
-	 * or inode_cluster_size bytes, whichever is bigger. The inode
-	 * buffers in the log can be a different size if the log was generated
-	 * by an older kernel using unclustered inode buffers or a newer kernel
-	 * running with a different inode cluster size. Regardless, if
-	 * the inode buffer size isn't max(blocksize, inode_cluster_size)
-	 * for *our* value of inode_cluster_size, then we need to keep
-	 * the buffer out of the buffer cache so that the buffer won't
-	 * overlap with future reads of those inodes.
 	 */
-	if (XFS_DINODE_MAGIC ==
-	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
-	    (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
-		xfs_buf_stale(bp);
-		error = xfs_bwrite(bp);
-	} else {
-		ASSERT(bp->b_mount == mp);
-		bp->b_flags |= _XBF_LOGRECOVERY;
-		xfs_buf_delwri_queue(bp, buffer_list);
-	}
+out_write:
+	ASSERT(bp->b_mount == mp);
+	bp->b_flags |= _XBF_LOGRECOVERY;
+	xfs_buf_delwri_queue(bp, buffer_list);
 
 out_release:
 	xfs_buf_relse(bp);
-- 
2.43.0
From: Dave Chinner <dchinner@redhat.com> We detect when a buffer log format type and the magic number in the buffer do not match. We issue a warning, but do not return an error nor do we write back the recovered buffer. If no further recover action is performed on that buffer, then recovery has left the buffer in an inconsistent (out of date) state on disk. i.e. the structure is corrupt on disk. If this mismatch occurs, return a -EFSCORRUPTED error and cause recovery to abort instead of letting recovery corrupt the filesystem and continue onwards. Signed-off-by: Dave Chinner <dchinner@redhat.com> --- fs/xfs/xfs_buf_item_recover.c | 51 +++++++++++++++++------------------ 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index d74bf7bb7794..dba57ee6fa6d 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -207,7 +207,7 @@ xlog_recover_buf_commit_pass1( * the first 32 bits of the buffer (most blocks), * inside a struct xfs_da_blkinfo at the start of the buffer. */ -static void +static int xlog_recover_validate_buf_type( struct xfs_mount *mp, struct xfs_buf *bp, @@ -407,11 +407,12 @@ xlog_recover_validate_buf_type( * skipped. */ if (current_lsn == NULLCOMMITLSN) - return; + return 0;; if (warnmsg) { xfs_warn(mp, warnmsg); - ASSERT(0); + xfs_buf_corruption_error(bp, __this_address); + return -EFSCORRUPTED; } /* @@ -425,14 +426,11 @@ xlog_recover_validate_buf_type( * the buffer. Therefore, initialize a bli purely to carry the LSN to * the verifier. */ - if (bp->b_ops) { - struct xfs_buf_log_item *bip; - - bp->b_flags |= _XBF_LOGRECOVERY; - xfs_buf_item_init(bp, mp); - bip = bp->b_log_item; - bip->bli_item.li_lsn = current_lsn; - } + ASSERT(bp->b_ops); + bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_item_init(bp, mp); + bp->b_log_item->bli_item.li_lsn = current_lsn; + return 0; } /* @@ -441,7 +439,7 @@ xlog_recover_validate_buf_type( * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. */ -STATIC void +static int xlog_recover_do_reg_buffer( struct xfs_mount *mp, struct xlog_recover_item *item, @@ -523,20 +521,20 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); + return xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); } /* - * Perform a dquot buffer recovery. + * Test if this dquot buffer item should be recovered. * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. * - * Return false if the buffer was tossed and true if we recovered the buffer to - * indicate to the caller if the buffer needs writing. + * Return false if the buffer should be tossed and true if the buffer needs + * to be recovered. 
  */
-STATIC bool
-xlog_recover_do_dquot_buffer(
+static bool
+xlog_recover_this_dquot_buffer(
 	struct xfs_mount	*mp,
 	struct xlog		*log,
 	struct xlog_recover_item *item,
@@ -565,8 +563,6 @@ xlog_recover_do_dquot_buffer(
 	 */
 	if (log->l_quotaoffs_flag & type)
 		return false;
-
-	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
 	return true;
 }
 
@@ -952,18 +948,19 @@ xlog_recover_buf_commit_pass2(
 
 	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
-		if (error)
-			goto out_release;
 	} else if (buf_f->blf_flags &
 	    (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
-		bool	dirty;
-
-		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
-		if (!dirty)
+		if (!xlog_recover_this_dquot_buffer(mp, log, item, bp, buf_f))
 			goto out_release;
+
+		error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f,
+				NULLCOMMITLSN);
 	} else {
-		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+		error = xlog_recover_do_reg_buffer(mp, item, bp, buf_f,
+				current_lsn);
 	}
+	if (error)
+		goto out_release;
 
 	/*
 	 * Perform delayed write on the buffer. Asynchronous writes will be
--
2.43.0
From: Zhang Yi <yi.zhang@huawei.com>

The current clone operation could be non-atomic if the destination
offset of a file is beyond EOF; the user could get a file with
corrupted (zeroed) data on crash. The problem stems from
preallocation.

If you write some data into a file at [A, B) (the position letters
increase in order), xfs may preallocate some blocks, so we get a
delayed extent [A, D). Then the writeback path allocates blocks but
converts only [A, C) of this delayed extent for lack of enough
contiguous physical blocks, so the extent [C, D) remains delayed.
After that, both the in-memory and the on-disk file size are B.

If we clone a file range into [E, F) from another file,
xfs_reflink_zero_posteof() would call iomap_zero_range() to zero out
the range [B, E) beyond EOF and flush that range. Since [C, D) is
still a delayed extent, it will be zeroed and the file's in-memory
and on-disk size will be updated to D after flushing and before doing
the clone operation. This is wrong, because the user can see the size
change and read zeros in the middle of the clone operation.

We need to keep the in-memory and on-disk size unchanged before the
clone operation starts, so instead of writing zeroes through the page
cache for delayed ranges beyond EOF, we convert these ranges to
unwritten and invalidate any cached data over that range beyond EOF.

Suggested-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ccf83e72d8ca..1a6d05830433 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1035,6 +1035,24 @@ xfs_buffered_write_iomap_begin(
 	}
 
 	if (imap.br_startoff <= offset_fsb) {
+		/*
+		 * For zeroing out a delayed allocation extent, we trim it if
+		 * it is partially beyond the EOF block, or convert it to an
+		 * unwritten extent if it is entirely beyond the EOF block.
+		 */
+		if ((flags & IOMAP_ZERO) &&
+		    isnullstartblock(imap.br_startblock)) {
+			xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+			if (offset_fsb >= eof_fsb)
+				goto convert_delay;
+			if (end_fsb > eof_fsb) {
+				end_fsb = eof_fsb;
+				xfs_trim_extent(&imap, offset_fsb,
+						end_fsb - offset_fsb);
+			}
+		}
+
 		/*
 		 * For reflink files we may need a delalloc reservation when
 		 * overwriting shared extents. This includes zeroing of
@@ -1158,6 +1176,17 @@ xfs_buffered_write_iomap_begin(
 	xfs_iunlock(ip, lockmode);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
 
+convert_delay:
+	xfs_iunlock(ip, lockmode);
+	truncate_pagecache(inode, offset);
+	error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
+			iomap, NULL);
+	if (error)
+		return error;
+
+	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
+	return 0;
+
 found_cow:
 	seq = xfs_iomap_inode_sequence(ip, 0);
 	if (imap.br_startoff <= offset_fsb) {
--
2.39.2
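As an illustration of the failure window described above, a minimal userspace reproducer might look like the sketch below. This is an untested illustration, not the reproducer used for the patch: the mount point, file names, write size and clone offset are all assumed values, and actually observing the transient size change additionally requires fragmented free space so that writeback leaves a delayed extent behind.

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	static char buf[1 << 16];
	struct file_clone_range fcr = { 0 };
	int src, dst;

	src = open("/mnt/src", O_CREAT | O_RDWR | O_TRUNC, 0644);
	dst = open("/mnt/dst", O_CREAT | O_RDWR | O_TRUNC, 0644);
	if (src < 0 || dst < 0)
		return 1;

	/* Buffered writes; xfs may speculatively preallocate delalloc
	 * blocks extending well past the 64k actually written. */
	memset(buf, 0xab, sizeof(buf));
	pwrite(src, buf, sizeof(buf), 0);
	pwrite(dst, buf, sizeof(buf), 0);

	/* Clone into dst far beyond its EOF.  Before this patch, zeroing
	 * the post-EOF range could write back still-delayed blocks and
	 * transiently push the file size past the old EOF before the
	 * clone itself ran. */
	fcr.src_fd = src;
	fcr.src_offset = 0;
	fcr.src_length = sizeof(buf);
	fcr.dest_offset = 16 << 20;
	if (ioctl(dst, FICLONERANGE, &fcr))
		perror("FICLONERANGE");

	close(src);
	close(dst);
	return 0;
}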
From: Zhang Yi <yi.zhang@huawei.com>

Unsharing and zeroing can only happen within EOF, so there is never a
need to perform post-EOF pagecache truncation if write begin fails.
Also, a partial write can theoretically never happen in
iomap_write_end(), so remove both of them.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..7e32a204650b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -786,7 +786,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 
 out_unlock:
 	__iomap_put_folio(iter, pos, 0, folio);
-	iomap_write_failed(iter->inode, pos, len);
 
 	return status;
 }
@@ -863,8 +862,6 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 
 	if (old_size < pos)
 		pagecache_isize_extended(iter->inode, old_size, pos);
-	if (ret < len)
-		iomap_write_failed(iter->inode, pos + ret, len - ret);
 	return ret;
 }
 
@@ -912,8 +909,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		}
 
 		status = iomap_write_begin(iter, pos, bytes, &folio);
-		if (unlikely(status))
+		if (unlikely(status)) {
+			iomap_write_failed(iter->inode, pos, bytes);
 			break;
+		}
 		if (iter->iomap.flags & IOMAP_F_STALE)
 			break;
 
@@ -927,6 +926,9 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
 		status = iomap_write_end(iter, pos, bytes, copied, folio);
 
+		if (status < bytes)
+			iomap_write_failed(iter->inode, pos + status,
+					   bytes - status);
 		if (unlikely(copied != status))
 			iov_iter_revert(i, copied - status);
 
--
2.39.2
From: Zhang Yi <yi.zhang@huawei.com>

xfs_bmapi_convert_delalloc() only attempts to allocate the entire
delalloc extent, so it may require multiple invocations to allocate
the target offset. xfs_convert_blocks() adds a loop to do this job,
and we call it in the writeback path, but xfs_convert_blocks() isn't
a common helper. Let's do the looping in xfs_bmapi_convert_delalloc()
itself and drop xfs_convert_blocks(), preparing for converting
post-EOF delalloc blocks in the buffered write begin path.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 34 +++++++++++++++++++++++--
 fs/xfs/xfs_aops.c        | 54 +++++++++++-----------------------------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 07dc35de8ce5..042e8d3ab0ba 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4516,8 +4516,8 @@ xfs_bmapi_write(
  * invocations to allocate the target offset if a large enough physical extent
  * is not available.
  */
-int
-xfs_bmapi_convert_delalloc(
+static int
+__xfs_bmapi_convert_delalloc(
 	struct xfs_inode	*ip,
 	int			whichfork,
 	xfs_off_t		offset,
@@ -4648,6 +4648,36 @@ xfs_bmapi_convert_delalloc(
 	return error;
 }
 
+/*
+ * Pass in a delalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in iomap.
+ */
+int
+xfs_bmapi_convert_delalloc(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	loff_t			offset,
+	struct iomap		*iomap,
+	unsigned int		*seq)
+{
+	int			error;
+
+	/*
+	 * Attempt to allocate whatever delalloc extent currently backs offset
+	 * and put the result into iomap. Allocate in a loop because it may
+	 * take several attempts to allocate real blocks for a contiguous
+	 * delalloc extent if free space is sufficiently fragmented.
+	 */
+	do {
+		error = __xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+				iomap, seq);
+		if (error)
+			return error;
+	} while (iomap->offset + iomap->length <= offset);
+
+	return 0;
+}
+
 int
 xfs_bmapi_remap(
 	struct xfs_trans	*tp,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..6479e0dac69d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -233,45 +233,6 @@ xfs_imap_valid(
 	return true;
 }
 
-/*
- * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->iomap.
- *
- * The current page is held locked so nothing could have removed the block
- * backing offset_fsb, although it could have moved from the COW to the data
- * fork by another thread.
- */
-static int
-xfs_convert_blocks(
-	struct iomap_writepage_ctx *wpc,
-	struct xfs_inode	*ip,
-	int			whichfork,
-	loff_t			offset)
-{
-	int			error;
-	unsigned		*seq;
-
-	if (whichfork == XFS_COW_FORK)
-		seq = &XFS_WPC(wpc)->cow_seq;
-	else
-		seq = &XFS_WPC(wpc)->data_seq;
-
-	/*
-	 * Attempt to allocate whatever delalloc extent currently backs offset
-	 * and put the result into wpc->iomap. Allocate in a loop because it
-	 * may take several attempts to allocate real blocks for a contiguous
-	 * delalloc extent if free space is sufficiently fragmented.
- */
-	do {
-		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
-				&wpc->iomap, seq);
-		if (error)
-			return error;
-	} while (wpc->iomap.offset + wpc->iomap.length <= offset);
-
-	return 0;
-}
-
 static int
 xfs_map_blocks(
 	struct iomap_writepage_ctx *wpc,
@@ -289,6 +250,7 @@ xfs_map_blocks(
 	struct xfs_iext_cursor	icur;
 	int			retries = 0;
 	int			error = 0;
+	unsigned int		*seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -386,7 +348,19 @@ xfs_map_blocks(
 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
 	return 0;
 allocate_blocks:
-	error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+	/*
+	 * Convert a delalloc extent to a real one. The current page is held
+	 * locked so nothing could have removed the block backing offset_fsb,
+	 * although it could have moved from the COW to the data fork by
+	 * another thread.
+	 */
+	if (whichfork == XFS_COW_FORK)
+		seq = &XFS_WPC(wpc)->cow_seq;
+	else
+		seq = &XFS_WPC(wpc)->data_seq;
+
+	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+			&wpc->iomap, seq);
 	if (error) {
 		/*
 		 * If we failed to find the extent in the COW fork we might have
--
2.39.2
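To make the convergence condition of the folded-in loop concrete, here is a small standalone model of the pattern. It is not XFS code: convert_some() is a made-up stand-in for __xfs_bmapi_convert_delalloc() that only makes 4k of progress per call, as if free space were badly fragmented.

#include <stdio.h>

struct mapping {
	long long offset;	/* start of the range converted by this call */
	long long length;	/* bytes converted by this call */
};

/* Stand-in allocator: converts the delalloc extent from its current
 * start, but only one 4k block at a time. */
static int convert_some(long long *cursor, struct mapping *map)
{
	map->offset = *cursor;
	map->length = 4096;
	*cursor += map->length;
	return 0;
}

int main(void)
{
	struct mapping map;
	long long cursor = 0;
	long long target = 20000;	/* offset we need a real mapping for */

	/* Same shape as the loop now inside xfs_bmapi_convert_delalloc():
	 * retry until the returned mapping extends past the target offset. */
	do {
		if (convert_some(&cursor, &map))
			return 1;
	} while (map.offset + map.length <= target);

	printf("converged: [%lld, %lld) covers %lld\n",
	       map.offset, map.offset + map.length, target);
	return 0;
}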
From: Zhang Yi <yi.zhang@huawei.com>

Commit 1aa91d9c9933 ("xfs: Add async buffered write support") replaced
xfs_ilock(XFS_ILOCK_EXCL) with xfs_ilock_for_iomap() when locking the
inode for writing, and a new variable, lockmode, is used to indicate
the lock mode. Although lockmode should always be XFS_ILOCK_EXCL, it's
still better to use this variable instead of using XFS_ILOCK_EXCL
directly when unlocking the inode.

Fixes: 1aa91d9c9933 ("xfs: Add async buffered write support")
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18c8f168b153..ccf83e72d8ca 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1149,13 +1149,13 @@ xfs_buffered_write_iomap_begin(
 	 * them out if the write happens to fail.
 	 */
 	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
 
 found_imap:
 	seq = xfs_iomap_inode_sequence(ip, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
 
 found_cow:
@@ -1165,17 +1165,17 @@ xfs_buffered_write_iomap_begin(
 		if (error)
 			goto out_unlock;
 		seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
 					 IOMAP_F_SHARED, seq);
 	}
 
 	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
 
 out_unlock:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	return error;
 }
 
--
2.39.2
From: Zhang Yi <yi.zhang@huawei.com>

Increasing i_size in iomap_zero_range() and iomap_unshare_iter() is
not needed; the caller should handle it. In particular, when
truncating a partial block, we should not increase i_size beyond the
new EOF here. This doesn't affect xfs and gfs2 now because they set
the new file size after zeroing out, so a transient increase in
i_size doesn't matter, but it will affect ext4 because it sets the
file size before truncating. So move the i_size updating logic to
iomap_write_iter().

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c | 50 +++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7e32a204650b..e9112dc78d15 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -837,32 +837,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 		size_t copied, struct folio *folio)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
-	loff_t old_size = iter->inode->i_size;
-	size_t ret;
-
-	if (srcmap->type == IOMAP_INLINE) {
-		ret = iomap_write_end_inline(iter, folio, pos, copied);
-	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
-		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
-				copied, &folio->page, NULL);
-	} else {
-		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
-	}
-
-	/*
-	 * Update the in-memory inode size after copying the data into the page
-	 * cache. It's up to the file system to write the updated size to disk,
-	 * preferably after I/O completion so that no stale data is exposed.
-	 */
-	if (pos + ret > old_size) {
-		i_size_write(iter->inode, pos + ret);
-		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
-	}
-	__iomap_put_folio(iter, pos, ret, folio);
-	if (old_size < pos)
-		pagecache_isize_extended(iter->inode, old_size, pos);
-	return ret;
+	if (srcmap->type == IOMAP_INLINE)
+		return iomap_write_end_inline(iter, folio, pos, copied);
+	if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
+		return block_write_end(NULL, iter->inode->i_mapping, pos, len,
+				copied, &folio->page, NULL);
+	return __iomap_write_end(iter->inode, pos, len, copied, folio);
 }
 
 static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -877,6 +858,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 
 	do {
 		struct folio *folio;
+		loff_t old_size;
 		size_t offset;		/* Offset into folio */
 		size_t bytes;		/* Bytes to write to folio */
 		size_t copied;		/* Bytes copied from user */
@@ -926,6 +908,22 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
 		status = iomap_write_end(iter, pos, bytes, copied, folio);
 
+		/*
+		 * Update the in-memory inode size after copying the data into
+		 * the page cache. It's up to the file system to write the
+		 * updated size to disk, preferably after I/O completion so that
+		 * no stale data is exposed. Only once that's done can we
+		 * unlock and release the folio.
+		 */
+		old_size = iter->inode->i_size;
+		if (pos + status > old_size) {
+			i_size_write(iter->inode, pos + status);
+			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
+		}
+		__iomap_put_folio(iter, pos, status, folio);
+
+		if (old_size < pos)
+			pagecache_isize_extended(iter->inode, old_size, pos);
 		if (status < bytes)
 			iomap_write_failed(iter->inode, pos + status,
 					   bytes - status);
@@ -1298,6 +1296,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
 			bytes = folio_size(folio) - offset;
 
 		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+		__iomap_put_folio(iter, pos, bytes, folio);
 		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;
 
@@ -1362,6 +1361,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		folio_mark_accessed(folio);
 
 		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+		__iomap_put_folio(iter, pos, bytes, folio);
 		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;
 
--
2.39.2
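The ordering argument in the moved comment can be modeled in userspace. The sketch below is an assumption-laden analogy, not kernel code: a mutex stands in for the folio lock, a plain variable stands in for i_size, and the comments name the kernel calls each step corresponds to.

#include <pthread.h>
#include <stdio.h>

static long long isize;				/* models inode->i_size */
static pthread_mutex_t folio_lock = PTHREAD_MUTEX_INITIALIZER;

static void write_end(long long pos, long long written)
{
	long long old_size = isize;

	/* i_size_write(): publish the new size while the folio is still
	 * locked, so any reader that can see the folio also sees a size
	 * covering the data just copied into it. */
	if (pos + written > old_size)
		isize = pos + written;

	/* __iomap_put_folio(): only now unlock and release the folio. */
	pthread_mutex_unlock(&folio_lock);

	/* pagecache_isize_extended(): with the new size visible, let the
	 * mm revisit the page that straddled the old EOF. */
	if (old_size < pos)
		printf("EOF moved from %lld, write at %lld\n", old_size, pos);
}

int main(void)
{
	pthread_mutex_lock(&folio_lock);	/* iomap_write_begin() */
	write_end(0, 4096);
	printf("i_size = %lld\n", isize);
	return 0;
}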
From: Zhang Yi <yi.zhang@huawei.com>

Allow callers to pass a NULL seq argument if they don't care about
the fork sequence number.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f362345467fa..07dc35de8ce5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4574,7 +4574,8 @@ xfs_bmapi_convert_delalloc(
 	if (!isnullstartblock(bma.got.br_startblock)) {
 		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
 				xfs_iomap_inode_sequence(ip, flags));
-		*seq = READ_ONCE(ifp->if_seq);
+		if (seq)
+			*seq = READ_ONCE(ifp->if_seq);
 		goto out_trans_cancel;
 	}
 
@@ -4623,7 +4624,8 @@ xfs_bmapi_convert_delalloc(
 	ASSERT(!isnullstartblock(bma.got.br_startblock));
 	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
 			xfs_iomap_inode_sequence(ip, flags));
-	*seq = READ_ONCE(ifp->if_seq);
+	if (seq)
+		*seq = READ_ONCE(ifp->if_seq);
 
 	if (whichfork == XFS_COW_FORK)
 		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
--
2.39.2
From: Zhang Yi <yi.zhang@huawei.com>

Since iomap_write_end() can never return a partial write length, the
comparison between written, copied and bytes becomes useless; just
merge them into the branch that handles the case where nothing was
written.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 004673ea8bc1..f2fb89056259 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -937,11 +937,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 
 		if (old_size < pos)
 			pagecache_isize_extended(iter->inode, old_size, pos);
-		if (written < bytes)
-			iomap_write_failed(iter->inode, pos + written,
-					bytes - written);
-		if (unlikely(copied != written))
-			iov_iter_revert(i, copied - written);
 
 		cond_resched();
 		if (unlikely(written == 0)) {
@@ -951,6 +946,9 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 			 * halfway through, might be a race with munmap,
 			 * might be severe memory pressure.
 			 */
+			iomap_write_failed(iter->inode, pos, bytes);
+			iov_iter_revert(i, copied);
+
 			if (chunk > PAGE_SIZE)
 				chunk /= 2;
 			if (copied) {
--
2.43.0