All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] xfsprogs: fixes for 3.2.1
@ 2014-07-04  5:57 Dave Chinner
  2014-07-04  5:57 ` [PATCH 1/6] repair: support more than 25 ACLs Dave Chinner
                   ` (5 more replies)
  0 siblings, 6 replies; 18+ messages in thread
From: Dave Chinner @ 2014-07-04  5:57 UTC (permalink / raw)
  To: xfs

Hi folks,

This is the current list of patches I have outstanding for xfsprogs.
Several of these are critical for a 3.2.1 release as they result in
xfs_repair failing to repair filesystems correctly.

These have been tested on several complex  filesytems that broke
reapir in the 3.2.0 release, as well as all the usual xfstests
testing.

Comments welcome.

-Dave.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/6] repair: support more than 25 ACLs
  2014-07-04  5:57 [PATCH 0/6] xfsprogs: fixes for 3.2.1 Dave Chinner
@ 2014-07-04  5:57 ` Dave Chinner
  2014-07-04 14:23   ` Christoph Hellwig
  2014-07-04  5:57 ` [PATCH 2/6] xfs_db: write command broken on 64 bit values Dave Chinner
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-04  5:57 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

v5 superblock supports many more than 25 ACLs on an inode, but
xfs_repair still thinks that the maximum is 25. This slipped through
becase the reapir code does not share any of the kernel side ACL
code in libxfs, and instead has all it's own internal ACL
definitions.

Fix the repair code to support more than 25 ACLs and update
the ACL definitions to match the kernel definitions. In doing so,
this tickles a off-by-one bug on remote attribute maximum sizes
that is already fixed in the kernel code. So in addition to fixing
the repair code, this patch pulls in parts of the following kernel
commits:

bba719b5 xfs: fix off-by-one error in xfs_attr3_rmt_verify
0a8aa193 xfs: increase number of ACL entries for V5 superblocks

Reported-by: Michael L. Semon <mlsemon35@gmail.com>
Tested-by: Michael L. Semon <mlsemon35@gmail.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 libxfs/xfs_attr_remote.c |  2 +-
 repair/attr_repair.c     | 73 ++++++++++++++++++++++++++++++------------------
 repair/attr_repair.h     | 46 +++++++++++++++++++++---------
 3 files changed, 80 insertions(+), 41 deletions(-)

diff --git a/libxfs/xfs_attr_remote.c b/libxfs/xfs_attr_remote.c
index 5cf5c73..08b983b 100644
--- a/libxfs/xfs_attr_remote.c
+++ b/libxfs/xfs_attr_remote.c
@@ -85,7 +85,7 @@ xfs_attr3_rmt_verify(
 	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
 		return false;
 	if (be32_to_cpu(rmt->rm_offset) +
-				be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
+				be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
 		return false;
 	if (rmt->rm_owner == 0)
 		return false;
diff --git a/repair/attr_repair.c b/repair/attr_repair.c
index 5dd7e5f..a27a3ec 100644
--- a/repair/attr_repair.c
+++ b/repair/attr_repair.c
@@ -25,7 +25,7 @@
 #include "protos.h"
 #include "dir2.h"
 
-static int xfs_acl_valid(xfs_acl_disk_t *daclp);
+static int xfs_acl_valid(struct xfs_mount *mp, struct xfs_acl *daclp);
 static int xfs_mac_valid(xfs_mac_label_t *lp);
 
 /*
@@ -734,11 +734,15 @@ verify_da_path(xfs_mount_t	*mp,
  * If value is non-zero, then a remote attribute is being passed in
  */
 static int
-valuecheck(char *namevalue, char *value, int namelen, int valuelen)
+valuecheck(
+	struct xfs_mount *mp,
+	char		*namevalue,
+	char		*value,
+	int		namelen,
+	int		valuelen)
 {
 	/* for proper alignment issues, get the structs and memmove the values */
 	xfs_mac_label_t macl;
-	xfs_acl_t thisacl;
 	void *valuep;
 	int clearit = 0;
 
@@ -746,18 +750,23 @@ valuecheck(char *namevalue, char *value, int namelen, int valuelen)
 			(strncmp(namevalue, SGI_ACL_DEFAULT,
 				SGI_ACL_DEFAULT_SIZE) == 0)) {
 		if (value == NULL) {
-			memset(&thisacl, 0, sizeof(xfs_acl_t));
-			memmove(&thisacl, namevalue+namelen, valuelen);
-			valuep = &thisacl;
+			valuep = malloc(valuelen);
+			if (!valuep)
+				do_error(_("No memory for ACL check!\n"));
+			memcpy(valuep, namevalue + namelen, valuelen);
 		} else
 			valuep = value;
 
-		if (xfs_acl_valid((xfs_acl_disk_t *)valuep) != 0) {
+		if (xfs_acl_valid(mp, valuep) != 0) {
 			clearit = 1;
 			do_warn(
 	_("entry contains illegal value in attribute named SGI_ACL_FILE "
 	  "or SGI_ACL_DEFAULT\n"));
 		}
+
+		if (valuep != value)
+			free(valuep);
+
 	} else if (strncmp(namevalue, SGI_MAC_FILE, SGI_MAC_FILE_SIZE) == 0) {
 		if (value == NULL) {
 			memset(&macl, 0, sizeof(xfs_mac_label_t));
@@ -800,6 +809,7 @@ valuecheck(char *namevalue, char *value, int namelen, int valuelen)
  */
 static int
 process_shortform_attr(
+	struct xfs_mount *mp,
 	xfs_ino_t	ino,
 	xfs_dinode_t	*dip,
 	int		*repair)
@@ -904,7 +914,7 @@ process_shortform_attr(
 
 		/* Only check values for root security attributes */
 		if (currententry->flags & XFS_ATTR_ROOT)
-		       junkit = valuecheck((char *)&currententry->nameval[0],
+		       junkit = valuecheck(mp, (char *)&currententry->nameval[0],
 					NULL, currententry->namelen, 
 					currententry->valuelen);
 
@@ -1039,6 +1049,7 @@ rmtval_get(xfs_mount_t *mp, xfs_ino_t ino, blkmap_t *blkmap,
 
 static int
 process_leaf_attr_local(
+	struct xfs_mount	*mp,
 	xfs_attr_leafblock_t	*leaf,
 	int			i,
 	xfs_attr_leaf_entry_t	*entry,
@@ -1076,7 +1087,7 @@ process_leaf_attr_local(
 
 	/* Only check values for root security attributes */
 	if (entry->flags & XFS_ATTR_ROOT) {
-		if (valuecheck((char *)&local->nameval[0], NULL, 
+		if (valuecheck(mp, (char *)&local->nameval[0], NULL, 
 				local->namelen, be16_to_cpu(local->valuelen))) {
 			do_warn(
 	_("bad security value for attribute entry %d in attr block %u, inode %" PRIu64 "\n"),
@@ -1134,7 +1145,7 @@ process_leaf_attr_remote(
 			i, ino);
 		goto bad_free_out;
 	}
-	if (valuecheck((char *)&remotep->name[0], value, remotep->namelen,
+	if (valuecheck(mp, (char *)&remotep->name[0], value, remotep->namelen,
 				be32_to_cpu(remotep->valuelen))) {
 		do_warn(
 	_("remote attribute value check failed for entry %d, inode %" PRIu64 "\n"),
@@ -1216,15 +1227,15 @@ process_leaf_attr_block(
 			break;	/* got an overlap */
 		}
 
-		if (entry->flags & XFS_ATTR_LOCAL) 
-			thissize = process_leaf_attr_local(leaf, i, entry,
+		if (entry->flags & XFS_ATTR_LOCAL)
+			thissize = process_leaf_attr_local(mp, leaf, i, entry,
 						last_hashval, da_bno, ino);
 		else
 			thissize = process_leaf_attr_remote(leaf, i, entry,
 						last_hashval, da_bno, ino,
 						mp, blkmap);
 		if (thissize < 0) {
-			clearit = 1;				
+			clearit = 1;
 			break;
 		}
 
@@ -1608,23 +1619,27 @@ process_longform_attr(
 
 
 static int
-xfs_acl_from_disk(struct xfs_acl **aclp, struct xfs_acl_disk *dacl)
+xfs_acl_from_disk(
+	struct xfs_mount	*mp,
+	struct xfs_icacl	**aclp,
+	struct xfs_acl		*dacl)
 {
+	struct xfs_icacl	*acl;
+	struct xfs_icacl_entry	*ace;
+	struct xfs_acl_entry	*dace;
 	int			count;
-	xfs_acl_t		*acl;
-	xfs_acl_entry_t 	*ace;
-	xfs_acl_entry_disk_t	*dace, *end;
+	int			i;
 
 	count = be32_to_cpu(dacl->acl_cnt);
-	if (count > XFS_ACL_MAX_ENTRIES) {
+	if (count > XFS_ACL_MAX_ENTRIES(mp)) {
 		do_warn(_("Too many ACL entries, count %d\n"), count);
 		*aclp = NULL;
 		return EINVAL;
 	}
 
 
-	end = &dacl->acl_entry[0] + count;
-	acl = malloc((int)((char *)end - (char *)dacl));
+	acl = malloc(sizeof(struct xfs_icacl) +
+		     count * sizeof(struct xfs_icacl_entry));
 	if (!acl) {
 		do_warn(_("cannot malloc enough for ACL attribute\n"));
 		do_warn(_("SKIPPING this ACL\n"));
@@ -1633,8 +1648,10 @@ xfs_acl_from_disk(struct xfs_acl **aclp, struct xfs_acl_disk *dacl)
 	}
 
 	acl->acl_cnt = count;
-	ace = &acl->acl_entry[0];
-	for (dace = &dacl->acl_entry[0]; dace < end; ace++, dace++) {
+	for (i = 0; i < count; i++) {
+		ace = &acl->acl_entry[i];
+		dace = &dacl->acl_entry[i];
+
 		ace->ae_tag = be32_to_cpu(dace->ae_tag);
 		ace->ae_id = be32_to_cpu(dace->ae_id);
 		ace->ae_perm = be16_to_cpu(dace->ae_perm);
@@ -1667,7 +1684,7 @@ process_attributes(
 	if (aformat == XFS_DINODE_FMT_LOCAL) {
 		ASSERT(be16_to_cpu(asf->hdr.totsize) <=
 			XFS_DFORK_ASIZE(dip, mp));
-		err = process_shortform_attr(ino, dip, repair);
+		err = process_shortform_attr(mp, ino, dip, repair);
 	} else if (aformat == XFS_DINODE_FMT_EXTENTS ||
 					aformat == XFS_DINODE_FMT_BTREE)  {
 			err = process_longform_attr(mp, ino, dip, blkmap,
@@ -1686,17 +1703,19 @@ process_attributes(
  * Validate an ACL
  */
 static int
-xfs_acl_valid(xfs_acl_disk_t *daclp)
+xfs_acl_valid(
+	struct xfs_mount *mp,
+	struct xfs_acl	*daclp)
 {
-	xfs_acl_t	*aclp = NULL;
-	xfs_acl_entry_t *entry, *e;
+	struct xfs_icacl	*aclp = NULL;
+	struct xfs_icacl_entry	*entry, *e;
 	int user = 0, group = 0, other = 0, mask = 0, mask_required = 0;
 	int i, j;
 
 	if (daclp == NULL)
 		goto acl_invalid;
 
-	switch (xfs_acl_from_disk(&aclp, daclp)) {
+	switch (xfs_acl_from_disk(mp, &aclp, daclp)) {
 	case ENOMEM:
 		return 0;
 	case EINVAL:
diff --git a/repair/attr_repair.h b/repair/attr_repair.h
index f42536a..0d0c62c 100644
--- a/repair/attr_repair.h
+++ b/repair/attr_repair.h
@@ -37,29 +37,49 @@ typedef __int32_t	xfs_acl_type_t;
 typedef __int32_t	xfs_acl_tag_t;
 typedef __int32_t	xfs_acl_id_t;
 
-typedef struct xfs_acl_entry {
+/*
+ * "icacl" = in-core ACL. There is no equivalent in the XFS kernel code,
+ * so they are magic names just for repair. The "acl" types are what the kernel
+ * code uses for the on-disk format names, so use them here too for the on-disk
+ * ACL format definitions.
+ */
+struct xfs_icacl_entry {
 	xfs_acl_tag_t	ae_tag;
 	xfs_acl_id_t	ae_id;
 	xfs_acl_perm_t	ae_perm;
-} xfs_acl_entry_t;
+};
 
-#define XFS_ACL_MAX_ENTRIES	25
-typedef struct xfs_acl {
-	__int32_t	acl_cnt;
-	xfs_acl_entry_t	acl_entry[XFS_ACL_MAX_ENTRIES];
-} xfs_acl_t;
+struct xfs_icacl {
+	__int32_t		acl_cnt;
+	struct xfs_icacl_entry	acl_entry[0];
+};
 
-typedef struct xfs_acl_entry_disk {
+struct xfs_acl_entry {
 	__be32		ae_tag;
 	__be32		ae_id;
 	__be16		ae_perm;
-} xfs_acl_entry_disk_t;
+	__be16		ae_pad;
+};
 
-typedef struct xfs_acl_disk {
-	__be32		acl_cnt;
-	xfs_acl_entry_disk_t	acl_entry[XFS_ACL_MAX_ENTRIES];
-} xfs_acl_disk_t;
+struct xfs_acl {
+	__be32			acl_cnt;
+	struct xfs_acl_entry	acl_entry[0];
+};
 
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+	(xfs_sb_version_hascrc(&mp->m_sb) \
+		?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+						sizeof(struct xfs_acl_entry) \
+		: 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+	(sizeof(struct xfs_acl) + \
+		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
 
 #define SGI_ACL_FILE	"SGI_ACL_FILE"
 #define SGI_ACL_DEFAULT	"SGI_ACL_DEFAULT"
-- 
2.0.0

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/6] xfs_db: write command broken on 64 bit values
  2014-07-04  5:57 [PATCH 0/6] xfsprogs: fixes for 3.2.1 Dave Chinner
  2014-07-04  5:57 ` [PATCH 1/6] repair: support more than 25 ACLs Dave Chinner
@ 2014-07-04  5:57 ` Dave Chinner
  2014-07-04 14:08   ` Christoph Hellwig
  2014-07-04  5:57 ` [PATCH 3/6] repair: handle directory block corruption in phase 6 Dave Chinner
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-04  5:57 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

convert_args() has problesm with 64 bit fields because it tries to
shift them by 64 bits. The result of doing so is undefined by the C
standard, and so results in the unexpected behaviour of the result
being being the original value unchanged rather than 0. Hence you
can't write 64 bit fields because the code thinks that all values
other than 0 are out of range.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 db/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db/write.c b/db/write.c
index ca8bd0f..0157a44 100644
--- a/db/write.c
+++ b/db/write.c
@@ -565,7 +565,7 @@ convert_arg(
 		return NULL;
 
 	/* Does the value fit into the range of the destination bitfield? */
-	if ((val >> bit_length) > 0)
+	if (bit_length < 64 && (val >> bit_length) > 0)
 		return NULL;
 	/*
 	 * If the length of the field is not a multiple of a byte, push
-- 
2.0.0

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/6] repair: handle directory block corruption in phase 6
  2014-07-04  5:57 [PATCH 0/6] xfsprogs: fixes for 3.2.1 Dave Chinner
  2014-07-04  5:57 ` [PATCH 1/6] repair: support more than 25 ACLs Dave Chinner
  2014-07-04  5:57 ` [PATCH 2/6] xfs_db: write command broken on 64 bit values Dave Chinner
@ 2014-07-04  5:57 ` Dave Chinner
  2014-07-04 14:24   ` Christoph Hellwig
  2014-07-04  5:57 ` [PATCH 4/6] libxfs: reused invalidated buffers leak state and data Dave Chinner
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-04  5:57 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

This should only occur in no-modify mode, but when we fail to find
the last extent in a directory btree due to corruption we need to
trash the directory if it's the first data block we find the error
on. That is because there is nothing to recover from the directory,
and if we try to scan it xfs_reapir segv's because nothing has been
read from disk.

Also catch a memory allocation failure in this code, too.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 repair/phase6.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/repair/phase6.c b/repair/phase6.c
index 9b10f16..47ecad4 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -2179,7 +2179,7 @@ longform_dir2_entry_check(xfs_mount_t	*mp,
 	freetab = malloc(FREETAB_SIZE(ip->i_d.di_size / mp->m_dirblksize));
 	if (!freetab) {
 		do_error(
-		_("malloc failed in longform_dir2_entry_check (%" PRId64 " bytes)\n"),
+_("malloc failed in longform_dir2_entry_check (%" PRId64 " bytes)\n"),
 			FREETAB_SIZE(ip->i_d.di_size / mp->m_dirblksize));
 		exit(1);
 	}
@@ -2191,6 +2191,11 @@ longform_dir2_entry_check(xfs_mount_t	*mp,
 	}
 	num_bps = freetab->naents;
 	bplist = calloc(num_bps, sizeof(struct xfs_buf*));
+	if (!bplist)
+		do_error(
+_("calloc failed in longform_dir2_entry_check (%zu bytes)\n"),
+			num_bps * sizeof(struct xfs_buf*));
+
 	/* is this a block, leaf, or node directory? */
 	libxfs_dir2_isblock(NULL, ip, &isblock);
 	libxfs_dir2_isleaf(NULL, ip, &isleaf);
@@ -2203,8 +2208,18 @@ longform_dir2_entry_check(xfs_mount_t	*mp,
 		int			 error;
 
 		next_da_bno = da_bno + mp->m_dirblkfsbs - 1;
-		if (bmap_next_offset(NULL, ip, &next_da_bno, XFS_DATA_FORK))
+		if (bmap_next_offset(NULL, ip, &next_da_bno, XFS_DATA_FORK)) {
+			/*
+			 * if this is the first block, there isn't anything we
+			 * can recover so we just trash it.
+			 */
+			 if (da_bno == 0) {
+				fixit++;
+				goto out_fix;
+			}
 			break;
+		}
+
 		db = xfs_dir2_da_to_db(mp, da_bno);
 		if (db >= num_bps) {
 			/* more data blocks than expected */
-- 
2.0.0

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/6] libxfs: reused invalidated buffers leak state and data
  2014-07-04  5:57 [PATCH 0/6] xfsprogs: fixes for 3.2.1 Dave Chinner
                   ` (2 preceding siblings ...)
  2014-07-04  5:57 ` [PATCH 3/6] repair: handle directory block corruption in phase 6 Dave Chinner
@ 2014-07-04  5:57 ` Dave Chinner
  2014-07-04 14:15   ` Christoph Hellwig
  2014-07-04  5:57 ` [PATCH 5/6] repair: fix quota inode handling in secondary superblocks Dave Chinner
  2014-07-04  5:57 ` [PATCH 6/6] repair: get rid of BADFSINO Dave Chinner
  5 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-04  5:57 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

When rebuilding a bad directory, repair first truncates away all the
blocks in the directory, good or bad. This removes blocks from the
bmap btree, and when those blocks are freed the bmap btree code
invalidates them. This marks the buffers LIBXFS_B_STALE so that we
don't try to write stale data from that buffer at a later time.

However, when rebuilding the directory, blocks may get reallocated
and we reuse the underlying buffers. This has two problems.

The first is that if the buffer was previously detected as having a
verifier error (i.e. an error that is leading to the block being
freed and the buffer being invalidated) then the error might still
be held in b_error. Hence the libxfs code needs to ensure that
b_error does not leak from one buffer usage context to another
after invalidation.

The second problem is that when new data is written into a buffer,
it no longer has stale contents. Hence when we write the buffer, we
need to clear the LIBXFS_B_STALE flag to ensure that the new data
gets written.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 libxfs/rdwr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index 981f2ba..c81c82f 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -632,6 +632,12 @@ libxfs_putbuf(xfs_buf_t *bp)
 			pthread_mutex_unlock(&bp->b_lock);
 		}
 	}
+	/*
+	 * ensure that any errors on this use of the buffer don't carry
+	 * over to the next user.
+	 */
+	bp->b_error = 0;
+
 	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
 }
 
@@ -928,6 +934,7 @@ libxfs_writebuf_int(xfs_buf_t *bp, int flags)
 	 * subsequent reads after this write from seeing stale errors.
 	 */
 	bp->b_error = 0;
+	bp->b_flags &= ~LIBXFS_B_STALE;
 	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
 	return 0;
 }
@@ -946,6 +953,7 @@ libxfs_writebuf(xfs_buf_t *bp, int flags)
 	 * subsequent reads after this write from seeing stale errors.
 	 */
 	bp->b_error = 0;
+	bp->b_flags &= ~LIBXFS_B_STALE;
 	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
 	libxfs_putbuf(bp);
 	return 0;
-- 
2.0.0

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 5/6] repair: fix quota inode handling in secondary superblocks
  2014-07-04  5:57 [PATCH 0/6] xfsprogs: fixes for 3.2.1 Dave Chinner
                   ` (3 preceding siblings ...)
  2014-07-04  5:57 ` [PATCH 4/6] libxfs: reused invalidated buffers leak state and data Dave Chinner
@ 2014-07-04  5:57 ` Dave Chinner
  2014-07-04 14:35   ` Christoph Hellwig
  2014-07-04  5:57 ` [PATCH 6/6] repair: get rid of BADFSINO Dave Chinner
  5 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-04  5:57 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Changes to support separate project quota inodes changed the way
quota inodes got written to the superblock. The current code is
tailored for the needs to the kernel, where the inodes should only
be written if certain falgs are set saying a quota type is enabled.

Unfortunately, when recovering a corrupt secondary superblock, we
need to unconditionally write the quota inode fields after we
unconditionally zero the quota flags field. The result of this bug
is that the bad quota inode fields cannot be cleared and hence
always are reported by bad by repair in subsequent runs.

Fix this by directly clearing the quota inodes in the superblock
buffers so that we do need to set special flags to get
xfs_sb_to_disk() to do the right thing as setting flags leave bad
flag values in the superblock instead of bad inode numbers....

Also, when clearing the inode numbers, write them as NULLFSINO
rather than 0 as this is what the kernel will write them as if quota
is turned off.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 include/libxfs.h  |  1 +
 libxfs/rdwr.c     |  4 ++--
 repair/agheader.c | 44 +++++++++++++++++++++++++++++---------------
 repair/sb.c       |  2 ++
 repair/scan.c     |  1 +
 5 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/include/libxfs.h b/include/libxfs.h
index 7203d79..45a924f 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -759,6 +759,7 @@ bool xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
 /* xfs_sb.h */
 #define libxfs_mod_sb			xfs_mod_sb
 #define libxfs_sb_from_disk		xfs_sb_from_disk
+#define libxfs_sb_quota_from_disk	xfs_sb_quota_from_disk
 #define libxfs_sb_to_disk		xfs_sb_to_disk
 
 /* xfs_symlink.h */
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index c81c82f..e681f2a 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -913,10 +913,10 @@ libxfs_writebufr(xfs_buf_t *bp)
 	}
 
 #ifdef IO_DEBUG
-	printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p\n",
+	printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
 			pthread_self(), __FUNCTION__, bp->b_bcount,
 			(long long)LIBXFS_BBTOOFF64(bp->b_bn),
-			(long long)bp->b_bn, bp);
+			(long long)bp->b_bn, bp, error);
 #endif
 	if (!error) {
 		bp->b_flags |= LIBXFS_B_UPTODATE;
diff --git a/repair/agheader.c b/repair/agheader.c
index fc5dac9..416dbd8 100644
--- a/repair/agheader.c
+++ b/repair/agheader.c
@@ -245,13 +245,17 @@ compare_sb(xfs_mount_t *mp, xfs_sb_t *sb)
  * superblocks, not just the secondary superblocks.
  */
 static int
-secondary_sb_wack(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
-	xfs_agnumber_t i)
+secondary_sb_wack(
+	struct xfs_mount *mp,
+	struct xfs_buf	*sbuf,
+	struct xfs_sb	*sb,
+	xfs_agnumber_t	i)
 {
-	int do_bzero;
-	int size;
-	char *ip;
-	int rval;
+	struct xfs_dsb	*dsb = XFS_BUF_TO_SBP(sbuf);
+	int		do_bzero = 0;
+	int		size;
+	char		*ip;
+	int		rval = 0;;
 
 	rval = do_bzero = 0;
 
@@ -334,12 +338,18 @@ secondary_sb_wack(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
 	}
 
 	/*
-	 * quota inodes and flags in secondary superblocks
-	 * are never set by mkfs.  However, they could be set
-	 * in a secondary if a fs with quotas was growfs'ed since
-	 * growfs copies the new primary into the secondaries.
+	 * quota inodes and flags in secondary superblocks are never set by
+	 * mkfs.  However, they could be set in a secondary if a fs with quotas
+	 * was growfs'ed since growfs copies the new primary into the
+	 * secondaries.
+	 *
+	 * Also, the in-core inode flags now have different meaning to the
+	 * on-disk flags, and so libxfs_sb_to_disk cannot directly write the
+	 * sb_gquotino/sb_pquotino fields without specific sb_qflags being set.
+	 * Hence we need to zero those fields directly in the sb buffer here.
 	 */
-	if (sb->sb_inprogress == 1 && sb->sb_uquotino)  {
+
+	if (sb->sb_inprogress == 1 && sb->sb_uquotino != NULLFSINO)  {
 		if (!no_modify)
 			sb->sb_uquotino = 0;
 		if (sb->sb_versionnum & XR_PART_SECSB_VNMASK || !do_bzero)  {
@@ -352,9 +362,11 @@ secondary_sb_wack(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
 			rval |= XR_AG_SB_SEC;
 	}
 
-	if (sb->sb_inprogress == 1 && sb->sb_gquotino)  {
-		if (!no_modify)
+	if (sb->sb_inprogress == 1 && sb->sb_gquotino != NULLFSINO)  {
+		if (!no_modify) {
 			sb->sb_gquotino = 0;
+			dsb->sb_gquotino = 0;
+		}
 		if (sb->sb_versionnum & XR_PART_SECSB_VNMASK || !do_bzero)  {
 			rval |= XR_AG_SB;
 			do_warn(
@@ -365,9 +377,11 @@ secondary_sb_wack(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
 			rval |= XR_AG_SB_SEC;
 	}
 
-	if (sb->sb_inprogress == 1 && sb->sb_pquotino)  {
-		if (!no_modify)
+	if (sb->sb_inprogress == 1 && sb->sb_pquotino != NULLFSINO)  {
+		if (!no_modify) {
 			sb->sb_pquotino = 0;
+			dsb->sb_pquotino = 0;
+		}
 		if (sb->sb_versionnum & XR_PART_SECSB_VNMASK || !do_bzero)  {
 			rval |= XR_AG_SB;
 			do_warn(
diff --git a/repair/sb.c b/repair/sb.c
index 5e0b0f2..bc421cc 100644
--- a/repair/sb.c
+++ b/repair/sb.c
@@ -138,6 +138,7 @@ find_secondary_sb(xfs_sb_t *rsb)
 		for (i = 0; !done && i < bsize; i += BBSIZE)  {
 			c_bufsb = (char *)sb + i;
 			libxfs_sb_from_disk(&bufsb, (xfs_dsb_t *)c_bufsb);
+			libxfs_sb_quota_from_disk(&bufsb);
 
 			if (verify_sb(c_bufsb, &bufsb, 0) != XR_OK)
 				continue;
@@ -538,6 +539,7 @@ get_sb(xfs_sb_t *sbp, xfs_off_t off, int size, xfs_agnumber_t agno)
 		do_error("%s\n", strerror(error));
 	}
 	libxfs_sb_from_disk(sbp, buf);
+	libxfs_sb_quota_from_disk(sbp);
 
 	rval = verify_sb((char *)buf, sbp, agno == 0);
 	free(buf);
diff --git a/repair/scan.c b/repair/scan.c
index 1b64d8b..f29ff8d 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -1496,6 +1496,7 @@ scan_ag(
 		goto out_free_sb;
 	}
 	libxfs_sb_from_disk(sb, XFS_BUF_TO_SBP(sbbuf));
+	libxfs_sb_quota_from_disk(sb);
 
 	agfbuf = libxfs_readbuf(mp->m_dev,
 			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-- 
2.0.0

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 6/6] repair: get rid of BADFSINO
  2014-07-04  5:57 [PATCH 0/6] xfsprogs: fixes for 3.2.1 Dave Chinner
                   ` (4 preceding siblings ...)
  2014-07-04  5:57 ` [PATCH 5/6] repair: fix quota inode handling in secondary superblocks Dave Chinner
@ 2014-07-04  5:57 ` Dave Chinner
  2014-07-04 14:15   ` Christoph Hellwig
  5 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-04  5:57 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

When we find a bad dirent, we "clear" the inode the inode number by
writing BADFSINO to the inode number in the entry:

#define BADFSINO        ((xfs_ino_t)0xfeffffffffffffffULL)

We then capture this bad inode number later in the same function
either in the same pass or in a later phase and junk the entry.
When we junk the entry, we write a "/" over the first character of
the dirent name, which is then detected up later by the directory
rebuild and ignored.

The issue with this is that the directory buffer can be written to
disk between the dirent being marked with BADFSINO and the directory
rebuild processing in phase 6, resulting in the directory block
verifier firing this error:

Invalid inode number 0xfeffffffffffffff
xfs_dir_ino_validate: XFS_ERROR_REPORT
Metadata corruption detected at block 0x11fbb698/0x1000
libxfs_writebufr: write verifer failed on bno 0x11fbb698/0x1000

And so will not write the *corrupt block* to disk. The result is
that we don't repair a corruption in the directory block correctly
and subsequent repair runs continue to find problems with the
directory.

We really don't need both BADFSINO *and* overwriting the dirent name
with "/" to mark an entry as junked. They both mean exactly the same
thing, so get rid of BADFSINO and only use the name junking to mark
dirents as bad. This prevents the directory data block verifier from
triggering on bad inode numbers, and so the later reread of the
block will find the junked entries correctly.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 repair/dir2.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/repair/dir2.c b/repair/dir2.c
index 14c1435..f32bba7 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -28,13 +28,6 @@
 #include "progress.h"
 
 /*
- * Tag bad directory entries with this.
- * We can't tag them with -1 since that will look like a
- * data_unused_t instead of a data_entry_t.
- */
-#define	BADFSINO	((xfs_ino_t)0xfeffffffffffffffULL)
-
-/*
  * Known bad inode list.  These are seen when the leaf and node
  * block linkages are incorrect.
  */
@@ -1314,7 +1307,7 @@ process_dir2_data(
 		 * Conditions must either set clearino to zero or set
 		 * clearreason why it's being cleared.
 		 */
-		if (!ino_discovery && ent_ino == BADFSINO) {
+		if (!ino_discovery && dep->name[0] == '/') {
 			/*
 			 * Don't do a damned thing.  We already found this
 			 * (or did it ourselves) during phase 3.
@@ -1401,8 +1394,7 @@ _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64
 				do_warn(
 _("\tclearing inode number in entry at offset %" PRIdPTR "...\n"),
 					(intptr_t)ptr - (intptr_t)d);
-				dep->inumber = cpu_to_be64(BADFSINO);
-				ent_ino = BADFSINO;
+				dep->name[0] = '/';
 				*dirty = 1;
 			} else {
 				do_warn(
@@ -1415,7 +1407,7 @@ _("\twould clear inode number in entry at offset %" PRIdPTR "...\n"),
 		 * discovery is turned on).  Otherwise, we'd complain a lot
 		 * during phase 4.
 		 */
-		junkit = ent_ino == BADFSINO;
+		junkit = dep->name[0] == '/';
 		nm_illegal = namecheck((char *)dep->name, dep->namelen);
 		if (ino_discovery && nm_illegal) {
 			do_warn(
@@ -1424,14 +1416,15 @@ _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64 " has ille
 				dep->namelen, dep->namelen, dep->name);
 			junkit = 1;
 		}
+
 		/*
-		 * Now we can mark entries with BADFSINO's bad.
+		 * Ensure we write back bad entries for later processing
 		 */
-		if (!no_modify && ent_ino == BADFSINO) {
-			dep->name[0] = '/';
+		if (!no_modify && dep->name[0] == '/') {
 			*dirty = 1;
 			junkit = 0;
 		}
+
 		/*
 		 * Special .. entry processing.
 		 */
-- 
2.0.0

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/6] xfs_db: write command broken on 64 bit values
  2014-07-04  5:57 ` [PATCH 2/6] xfs_db: write command broken on 64 bit values Dave Chinner
@ 2014-07-04 14:08   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-04 14:08 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Fri, Jul 04, 2014 at 03:57:11PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> convert_args() has problesm with 64 bit fields because it tries to
> shift them by 64 bits. The result of doing so is undefined by the C
> standard, and so results in the unexpected behaviour of the result
> being being the original value unchanged rather than 0. Hence you
> can't write 64 bit fields because the code thinks that all values
> other than 0 are out of range.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] libxfs: reused invalidated buffers leak state and data
  2014-07-04  5:57 ` [PATCH 4/6] libxfs: reused invalidated buffers leak state and data Dave Chinner
@ 2014-07-04 14:15   ` Christoph Hellwig
  2014-07-04 22:22     ` Dave Chinner
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-04 14:15 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Fri, Jul 04, 2014 at 03:57:13PM +1000, Dave Chinner wrote:
> @@ -632,6 +632,12 @@ libxfs_putbuf(xfs_buf_t *bp)
>  			pthread_mutex_unlock(&bp->b_lock);
>  		}
>  	}
> +	/*
> +	 * ensure that any errors on this use of the buffer don't carry
> +	 * over to the next user.
> +	 */
> +	bp->b_error = 0;
> +
>  	cache_node_put(libxfs_bcache, (struct cache_node *)bp);

This seems a bit fishy to me.  For one I'm pretty sure it needs to be
done before unlocking b_lock, second it's different behavior from the
kernel where we explicitly clear it in the caller for the rare case
we want to reuse a buffer that had an error (xfs_buf_iodone_callbacks
seems to be the only one).  Any reason to do this differently in
userspace?

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/6] repair: get rid of BADFSINO
  2014-07-04  5:57 ` [PATCH 6/6] repair: get rid of BADFSINO Dave Chinner
@ 2014-07-04 14:15   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-04 14:15 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/6] repair: support more than 25 ACLs
  2014-07-04  5:57 ` [PATCH 1/6] repair: support more than 25 ACLs Dave Chinner
@ 2014-07-04 14:23   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-04 14:23 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

and yes, xfs_acl.h should be shared with userspace, it's really just
four kernel-only prototypes in there that can be moved easily.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] repair: handle directory block corruption in phase 6
  2014-07-04  5:57 ` [PATCH 3/6] repair: handle directory block corruption in phase 6 Dave Chinner
@ 2014-07-04 14:24   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-04 14:24 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

> @@ -2179,7 +2179,7 @@ longform_dir2_entry_check(xfs_mount_t	*mp,
>  	freetab = malloc(FREETAB_SIZE(ip->i_d.di_size / mp->m_dirblksize));
>  	if (!freetab) {
>  		do_error(
> -		_("malloc failed in longform_dir2_entry_check (%" PRId64 " bytes)\n"),
> +_("malloc failed in longform_dir2_entry_check (%" PRId64 " bytes)\n"),

Use __func__ to make this both more readable and future proof (same for
the other error message).


Otherwise looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 5/6] repair: fix quota inode handling in secondary superblocks
  2014-07-04  5:57 ` [PATCH 5/6] repair: fix quota inode handling in secondary superblocks Dave Chinner
@ 2014-07-04 14:35   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-04 14:35 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Fri, Jul 04, 2014 at 03:57:14PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Changes to support separate project quota inodes changed the way
> quota inodes got written to the superblock. The current code is
> tailored for the needs to the kernel, where the inodes should only
> be written if certain falgs are set saying a quota type is enabled.
> 
> Unfortunately, when recovering a corrupt secondary superblock, we
> need to unconditionally write the quota inode fields after we
> unconditionally zero the quota flags field. The result of this bug
> is that the bad quota inode fields cannot be cleared and hence
> always are reported by bad by repair in subsequent runs.
> 
> Fix this by directly clearing the quota inodes in the superblock
> buffers so that we do need to set special flags to get
> xfs_sb_to_disk() to do the right thing as setting flags leave bad
> flag values in the superblock instead of bad inode numbers....
> 
> Also, when clearing the inode numbers, write them as NULLFSINO
> rather than 0 as this is what the kernel will write them as if quota
> is turned off.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] libxfs: reused invalidated buffers leak state and data
  2014-07-04 14:15   ` Christoph Hellwig
@ 2014-07-04 22:22     ` Dave Chinner
  2014-07-05  9:48       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-04 22:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Fri, Jul 04, 2014 at 07:15:09AM -0700, Christoph Hellwig wrote:
> On Fri, Jul 04, 2014 at 03:57:13PM +1000, Dave Chinner wrote:
> > @@ -632,6 +632,12 @@ libxfs_putbuf(xfs_buf_t *bp)
> >  			pthread_mutex_unlock(&bp->b_lock);
> >  		}
> >  	}
> > +	/*
> > +	 * ensure that any errors on this use of the buffer don't carry
> > +	 * over to the next user.
> > +	 */
> > +	bp->b_error = 0;
> > +
> >  	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
> 
> This seems a bit fishy to me.  For one I'm pretty sure it needs to be
> done before unlocking b_lock,

Fair enough.

> second it's different behavior from the
> kernel where we explicitly clear it in the caller for the rare case
> we want to reuse a buffer that had an error (xfs_buf_iodone_callbacks
> seems to be the only one).  Any reason to do this differently in
> userspace?

The userspace code that uses the buffer cache is much less
constrained than the kernel code. The userspace code is pretty nasty
in places, especially when it comes to buffer error handling.

We can't clear errors or zero buffer contents in libxfs_getbuf-*
like we do in the kernel, because those functions are used by the
libxfs_readbuf_* functions and hence need to leave the buffers
unchanged on cache hits. This is actually the only way to gather a
write error from a libxfs_writebuf() call - you need to
get the buffer again so you can check bp->b_error field - assuming
that the buffer is still in the cache when you check, that is.

This is very different to the kernel code which idoes not release
buffers on a write so we can wait on IO and check errors. The kernel
buffer cache also guarantees a buffer of a known initial state from
xfs_buf_get() even on a cache hit.

Hence the userspace buffer cache is behaving quite differently to
the kernel buffer cache and as a result it's leaking errors from
reads, invalidations and writes through
xfs_da_get_buf/libxfs_getbuf.  Current no userspace outside libxfs
code clears bp->b_error - very little code even checks it - so th
elibxfs code is tripping on stale errors left by the usrspace code.
libxfs_writebuf() already zeros bp->b_error to prevent propagation
of stale errors into future reads, so this patch is really just
closing the hole in the other buffer release path that the code
usually takes.

Doing a full audit and addition of error handling of all the
userspace code is a little beyond my resources right now. The only
thing I can really do quickly about the problem is clear the error
when we've finished with the buffer.

I'm open to other ways of fixing this, but right now we've got to
fix xfs_repair because it's currently breaking filesystems worse
than before xfs_repair was run...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] libxfs: reused invalidated buffers leak state and data
  2014-07-04 22:22     ` Dave Chinner
@ 2014-07-05  9:48       ` Christoph Hellwig
  2014-07-06 23:54         ` Dave Chinner
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-05  9:48 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Sat, Jul 05, 2014 at 08:22:10AM +1000, Dave Chinner wrote:
> I'm open to other ways of fixing this, but right now we've got to
> fix xfs_repair because it's currently breaking filesystems worse
> than before xfs_repair was run...

Ok, so clearly mark this as difference from kernel code in a long
comment explaining the situation similar to wrote you above.  It's
pretty obvious that the buffer cache in userspace will eventually need
a major overhaul sooner or later.  I wonder how feasible porting the
kernel one would be..

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] libxfs: reused invalidated buffers leak state and data
  2014-07-05  9:48       ` Christoph Hellwig
@ 2014-07-06 23:54         ` Dave Chinner
  2014-07-07  0:09           ` Dave Chinner
  0 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-06 23:54 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Sat, Jul 05, 2014 at 02:48:07AM -0700, Christoph Hellwig wrote:
> On Sat, Jul 05, 2014 at 08:22:10AM +1000, Dave Chinner wrote:
> > I'm open to other ways of fixing this, but right now we've got to
> > fix xfs_repair because it's currently breaking filesystems worse
> > than before xfs_repair was run...
> 
> Ok, so clearly mark this as difference from kernel code in a long
> comment explaining the situation similar to wrote you above. 

Will do.

> It's
> pretty obvious that the buffer cache in userspace will eventually need
> a major overhaul sooner or later.  I wonder how feasible porting the
> kernel one would be..

It's not so much the porting that's the issue - it's cleaning up all
the applications that use it that is the headache. And,
realistically, I don't think there's much of the kernel code that
can be used - we have to rewrite all the allocation, freeing,
locking and IO parts of it, so we'd pretty much be re-implementing
most of it, anyway....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] libxfs: reused invalidated buffers leak state and data
  2014-07-06 23:54         ` Dave Chinner
@ 2014-07-07  0:09           ` Dave Chinner
  2014-07-07 10:05             ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2014-07-07  0:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Mon, Jul 07, 2014 at 09:54:44AM +1000, Dave Chinner wrote:
> On Sat, Jul 05, 2014 at 02:48:07AM -0700, Christoph Hellwig wrote:
> > On Sat, Jul 05, 2014 at 08:22:10AM +1000, Dave Chinner wrote:
> > > I'm open to other ways of fixing this, but right now we've got to
> > > fix xfs_repair because it's currently breaking filesystems worse
> > > than before xfs_repair was run...
> > 
> > Ok, so clearly mark this as difference from kernel code in a long
> > comment explaining the situation similar to wrote you above. 
> 
> Will do.

Ok, I added this to the top of the libxfs/rdwr.c file:

/*
 * Important design/architecture note:
 *
 * The userspace code that uses the buffer cache is much less constrained than
 * the kernel code. The userspace code is pretty nasty in places, especially
 * when it comes to buffer error handling.  Very little of the userspace code
 * outside libxfs clears bp->b_error - very little code even checks it - so the
 * libxfs code is tripping on stale errors left by the userspace code.
 *
 * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
 * in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check bp->b_error field -
 * assuming that the buffer is still in the cache when you check, that is.
 *
 * This is very different to the kernel code which does not release buffers on a
 * write so we can wait on IO and check errors. The kernel buffer cache also
 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
 * cache hit.
 *
 * IOWs, userspace is behaving quite differently to the kernel and as a result
 * it leaks errors from reads, invalidations and writes through
 * libxfs_getbuf/libxfs_readbuf.
 *
 * The result of this is that until the userspace code outside libxfs is cleaned
 * up, functions that release buffers from userspace control (i.e
 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
 * propagation of stale errors into future buffer operations.
 */

Is that sufficient for the moment?

-Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] libxfs: reused invalidated buffers leak state and data
  2014-07-07  0:09           ` Dave Chinner
@ 2014-07-07 10:05             ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2014-07-07 10:05 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, xfs

On Mon, Jul 07, 2014 at 10:09:29AM +1000, Dave Chinner wrote:
> Is that sufficient for the moment?

Looks good to me.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2014-07-07 10:05 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-04  5:57 [PATCH 0/6] xfsprogs: fixes for 3.2.1 Dave Chinner
2014-07-04  5:57 ` [PATCH 1/6] repair: support more than 25 ACLs Dave Chinner
2014-07-04 14:23   ` Christoph Hellwig
2014-07-04  5:57 ` [PATCH 2/6] xfs_db: write command broken on 64 bit values Dave Chinner
2014-07-04 14:08   ` Christoph Hellwig
2014-07-04  5:57 ` [PATCH 3/6] repair: handle directory block corruption in phase 6 Dave Chinner
2014-07-04 14:24   ` Christoph Hellwig
2014-07-04  5:57 ` [PATCH 4/6] libxfs: reused invalidated buffers leak state and data Dave Chinner
2014-07-04 14:15   ` Christoph Hellwig
2014-07-04 22:22     ` Dave Chinner
2014-07-05  9:48       ` Christoph Hellwig
2014-07-06 23:54         ` Dave Chinner
2014-07-07  0:09           ` Dave Chinner
2014-07-07 10:05             ` Christoph Hellwig
2014-07-04  5:57 ` [PATCH 5/6] repair: fix quota inode handling in secondary superblocks Dave Chinner
2014-07-04 14:35   ` Christoph Hellwig
2014-07-04  5:57 ` [PATCH 6/6] repair: get rid of BADFSINO Dave Chinner
2014-07-04 14:15   ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.