From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ipmail06.adl6.internode.on.net ([150.101.137.145]:33269 "EHLO ipmail06.adl6.internode.on.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750743AbdJEEFf (ORCPT ); Thu, 5 Oct 2017 00:05:35 -0400 Date: Thu, 5 Oct 2017 15:04:52 +1100 From: Dave Chinner Subject: Re: [PATCH 16/25] xfs: scrub inodes Message-ID: <20171005040452.GM3666@dastard> References: <150706324963.19351.17715069858921948692.stgit@magnolia> <150706335008.19351.13005092237576865356.stgit@magnolia> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <150706335008.19351.13005092237576865356.stgit@magnolia> Sender: linux-xfs-owner@vger.kernel.org List-ID: List-Id: xfs To: "Darrick J. Wong" Cc: linux-xfs@vger.kernel.org On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote: > @@ -559,3 +563,64 @@ xfs_scrub_setup_ag_btree( > > return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa); > } > + > +/* > + * Given an inode and the scrub control structure, grab either the > + * inode referenced in the control structure or the inode passed in. > + * The inode is not locked. > + */ > +int > +xfs_scrub_get_inode( > + struct xfs_scrub_context *sc, > + struct xfs_inode *ip_in) > +{ > + struct xfs_mount *mp = sc->mp; > + struct xfs_inode *ips = NULL; *ip? > + int error; > + > + if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) > + return -EINVAL; What's this checking? > + > + /* We want to scan the inode we already had opened. */ > + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { > + sc->ip = ip_in; > + return 0; > + } > + > + /* Look up the inode, see if the generation number matches. */ > + if (xfs_internal_inum(mp, sc->sm->sm_ino)) > + return -ENOENT; maybe xfs_internal_inum should be moved to the same place as all the inode/agbno/bno verification functions.... > + error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED, > + 0, &ips); I think we also want XFS_IGET_DONTCACHE here, so we don't trash the inode cache with inodes that we use once for scrub and never touch again. > + if (error == -ENOENT || error == -EINVAL) { > + /* inode doesn't exist... */ > + return -ENOENT; > + } else if (error) { > + trace_xfs_scrub_op_error(sc, > + XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), > + XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), > + error, __return_address); > + return error; > + } > + if (VFS_I(ips)->i_generation != sc->sm->sm_gen) { > + iput(VFS_I(ips)); > + return -ENOENT; > + } > + > + sc->ip = ips; > + return 0; > +} > + > +/* Push everything out of the log onto disk. */ > +int > +xfs_scrub_checkpoint_log( > + struct xfs_mount *mp) > +{ > + int error; > + > + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); > + if (error) > + return error; > + xfs_ail_push_all_sync(mp->m_ail); > + return 0; > +} Oooo, that's a nasty thing to do on busy systems with large dirty logs. I hope this is a "last resort" kind of thing.... > +/* Set us up with an inode. */ What state are we trying to get the inode into here? We grab all the various locks, but we can still have data changing via mmap pages that are already faulted in and delalloc extents in the incore extent list that aren't reflected on disk... A comment explaining what we expect here would be nice. > +int > +xfs_scrub_setup_inode( > + struct xfs_scrub_context *sc, > + struct xfs_inode *ip) > +{ > + struct xfs_mount *mp = sc->mp; > + int error; > + > + /* > + * Try to get the inode. If the verifiers fail, we try again > + * in raw mode. > + */ > + error = xfs_scrub_get_inode(sc, ip); > + switch (error) { > + case 0: > + break; > + case -EFSCORRUPTED: > + case -EFSBADCRC: > + return xfs_scrub_checkpoint_log(mp); > + default: > + return error; > + } > + > + /* Got the inode, lock it and we're ready to go. */ > + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; > + xfs_ilock(sc->ip, sc->ilock_flags); > + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); > + if (error) > + goto out_unlock; > + sc->ilock_flags |= XFS_ILOCK_EXCL; > + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); Should the inode be joined to the transaction so that cancelling the transaction unlocks the inode? Then the need for the ilock_flags variable goes away.... > + > + return error; > +out_unlock: > + xfs_iunlock(sc->ip, sc->ilock_flags); > + if (sc->ip != ip) > + iput(VFS_I(sc->ip)); > + sc->ip = NULL; > + return error; > +} > + > +/* Inode core */ > + > +/* Scrub an inode. */ > +int > +xfs_scrub_inode( > + struct xfs_scrub_context *sc) > +{ > + struct xfs_imap imap; > + struct xfs_dinode di; > + struct xfs_mount *mp = sc->mp; > + struct xfs_buf *bp = NULL; > + struct xfs_dinode *dip; > + xfs_ino_t ino; > + size_t fork_recs; > + unsigned long long isize; > + uint64_t flags2; > + uint32_t nextents; > + uint32_t extsize; > + uint32_t cowextsize; > + uint16_t flags; > + uint16_t mode; > + bool has_shared; > + int error = 0; > + > + /* Did we get the in-core inode, or are we doing this manually? */ > + if (sc->ip) { > + ino = sc->ip->i_ino; > + xfs_inode_to_disk(sc->ip, &di, 0); > + dip = &di; > + } else { > + /* Map & read inode. */ > + ino = sc->sm->sm_ino; > + error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); > + if (error == -EINVAL) { > + /* > + * Inode could have gotten deleted out from under us; > + * just forget about it. > + */ > + error = -ENOENT; > + goto out; > + } > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > + XFS_INO_TO_AGBNO(mp, ino), &error)) > + goto out; > + > + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, > + imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, > + NULL); > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > + XFS_INO_TO_AGBNO(mp, ino), &error)) > + goto out; > + > + /* Is this really an inode? */ > + bp->b_ops = &xfs_inode_buf_ops; > + dip = xfs_buf_offset(bp, imap.im_boffset); > + if (!xfs_dinode_verify(mp, ino, dip) || > + !xfs_dinode_good_version(mp, dip->di_version)) { > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + goto out; > + } > + > + /* ...and is it the one we asked for? */ > + if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { > + error = -ENOENT; > + goto out; > + } > + } Can we factor the manual mapping into a separate function? Just makes it a bit cleaner and gets rid of a bunch of local variables from this function that are just used to map and read the inode. ANd reading on and getting ahead of the code, could we split it further into xfs_scrub_dinode(sc, ino, dip, bp); > + > + flags = be16_to_cpu(dip->di_flags); > + if (dip->di_version >= 3) > + flags2 = be64_to_cpu(dip->di_flags2); > + else > + flags2 = 0; > + > + /* di_mode */ > + mode = be16_to_cpu(dip->di_mode); > + if (mode & ~(S_IALLUGO | S_IFMT)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + /* v1/v2 fields */ > + switch (dip->di_version) { > + case 1: > + if (dip->di_nlink != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_mode == 0 && sc->ip) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; We don't really support v1 inode format anymore - we convert it to version 2 automatically in xfs_inode_from_disk() so the in-memory inode is always v2 or v3, never v1. And when we write it back out, we write it as a v2 inode, never as a v1 inode. Hence I'm not sure whether we should be worrying about scrubbing such inodes - they are going to be in an ever shrinking minority of filesystems. At minimum, they should always return "preen". > + case 2: > + case 3: > + if (dip->di_onlink != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_mode == 0 && sc->ip) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_projid_hi != 0 && > + !xfs_sb_version_hasprojid32bit(&mp->m_sb)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + default: > + ASSERT(0); If we don't understand the version, it's corrupt. > + break; > + } > + > + /* > + * di_uid/di_gid -- -1 isn't invalid, but there's no way that > + * userspace could have created that. > + */ > + if (dip->di_uid == cpu_to_be32(-1U) || > + dip->di_gid == cpu_to_be32(-1U)) > + xfs_scrub_ino_set_warning(sc, bp); > + > + /* di_format */ > + switch (dip->di_format) { > + case XFS_DINODE_FMT_DEV: > + if (!S_ISCHR(mode) && !S_ISBLK(mode) && > + !S_ISFIFO(mode) && !S_ISSOCK(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_LOCAL: > + if (!S_ISDIR(mode) && !S_ISLNK(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_EXTENTS: > + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_BTREE: > + if (!S_ISREG(mode) && !S_ISDIR(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_UUID: > + default: > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + } > + > + /* di_size */ > + isize = be64_to_cpu(dip->di_size); > + if (isize & (1ULL << 63)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); Should we be checking against the on disk format size, or the mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)? 32 or 64 bit systems are going to have different maximum valid file sizes.. Directories have a maximum bound size, too - the data space, leaf space and freespace space, each of which are 32GB in size, IIRC. And symlinks have a different maximum size, too. > + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + /* di_nblocks */ > + if (flags2 & XFS_DIFLAG2_REFLINK) { > + ; /* nblocks can exceed dblocks */ > + } else if (flags & XFS_DIFLAG_REALTIME) { > + if (be64_to_cpu(dip->di_nblocks) >= > + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); That doesn't seem right. the file can be on either the data or the rt device, so the maximum file blocks is the size of one device or the other, not both combined. > + } else { > + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + } > + > + /* di_extsize */ > + extsize = be32_to_cpu(dip->di_extsize); > + if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) { > + if (extsize <= 0 || extsize > MAXEXTLEN) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (!(flags & XFS_DIFLAG_REALTIME) && > + extsize > mp->m_sb.sb_agblocks / 2) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + } else { > + if (extsize != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + } There more to validating the extentsize hints than this. From xfs_ioctl_setattr_check_extsize(): /* * extent size hint validation is somewhat cumbersome. Rules are: * * 1. extent size hint is only valid for directories and regular files * 2. FS_XFLAG_EXTSIZE is only valid for regular files * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. can only be changed on regular files if no extents are allocated * 5. can be changed on directories at any time * 6. extsize hint of 0 turns off hints, clears inode flags. * 7. Extent size must be a multiple of the appropriate block size. * 8. for non-realtime files, the extent size hint must be limited * to half the AG size to avoid alignment extending the extent beyond the * limits of the AG. */ Maybe there's some commonality between these two functions... > + > + /* di_flags */ > + if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); What about project id flags? Also, there are flags that are allowed only on regular files, and there are flags that are only allowed on directories. Those should probably also be checked for preening. > + > + /* di_nextents */ > + nextents = be32_to_cpu(dip->di_nextents); > + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > + switch (dip->di_format) { > + case XFS_DINODE_FMT_EXTENTS: > + if (nextents > fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_BTREE: > + if (nextents <= fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_LOCAL: > + case XFS_DINODE_FMT_DEV: > + case XFS_DINODE_FMT_UUID: > + default: > + if (nextents != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + } > + > + /* di_anextents */ > + nextents = be16_to_cpu(dip->di_anextents); > + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > + switch (dip->di_aformat) { > + case XFS_DINODE_FMT_EXTENTS: > + if (nextents > fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_BTREE: > + if (nextents <= fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_LOCAL: > + case XFS_DINODE_FMT_DEV: > + case XFS_DINODE_FMT_UUID: > + default: > + if (nextents != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + } Don't we need a check here first to see whether an attribute fork exists or not? > + > + /* di_forkoff */ > + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + if (dip->di_anextents != 0 && dip->di_forkoff == 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + /* di_aformat */ > + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && > + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && > + dip->di_aformat != XFS_DINODE_FMT_BTREE) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); Shouldn't this come before we use dip->di_aformat in a switch statement? Hmmm - aren't we missing the same checks for the data fork? Cheers, Dave. -- Dave Chinner david@fromorbit.com