* [rfc] larger batches for crc32c
@ 2016-10-27 16:17 Nicholas Piggin
  2016-10-27 18:29 ` Darrick J. Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Nicholas Piggin @ 2016-10-27 16:17 UTC (permalink / raw)
  To: linux-xfs; +Cc: Christoph Hellwig, Dave Chinner

Hi guys,

We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
on powerpc. I could reproduce similar overheads with dbench as well.

1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
        |
        ---__crc32c_le
           |          
            --1.11%--chksum_update
                      |          
                       --1.11%--crypto_shash_update
                                 crc32c
                                 xlog_cksum
                                 xlog_sync
                                 _xfs_log_force_lsn
                                 xfs_file_fsync
                                 vfs_fsync_range
                                 do_fsync
                                 sys_fsync
                                 system_call
                                 0x17738
                                 0x17704
                                 os_file_flush_func
                                 fil_flush

As a rule, it helps the crc implementation if it can operate on as large a
chunk as possible (alignment, startup overhead, etc). So I did a quick hack
at getting XFS checksumming to feed crc32c() with larger chunks, by setting
the existing crc to 0 before running over the entire buffer. Together with
some small work on the powerpc crc implementation, crc drops below 0.1%.

I don't know if something like this would be acceptable? It's not pretty,
but I didn't see an easier way.
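
For reference, the only equivalence the hack relies on is that CRCing
the buffer with the checksum field zeroed in place gives the same
result as the current head / zero-word / tail split, since the byte
stream fed to crc32c is identical. A quick userspace sketch showing
that (the crc32c here is a standalone bitwise version, not the kernel
one, and the buffer layout is made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bitwise CRC32C (Castagnoli, reflected poly 0x82F63B78); like the
 * kernel's crc32c() it just updates the running crc passed in */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78 : 0);
	}
	return crc;
}

int main(void)
{
	unsigned char buf[64];		/* pretend metadata block */
	const size_t crc_off = 12;	/* pretend CRC field offset */
	uint32_t zero = 0, split, whole;

	memset(buf, 0xab, sizeof(buf));	/* CRC field contents don't matter */

	/* current style: head, a zero word in place of the field, tail */
	split = crc32c(~0U, buf, crc_off);
	split = crc32c(split, &zero, sizeof(zero));
	split = crc32c(split, &buf[crc_off + 4], sizeof(buf) - crc_off - 4);

	/* hacked style: zero the field in place, one call over the lot */
	memset(&buf[crc_off], 0, sizeof(zero));
	whole = crc32c(~0U, buf, sizeof(buf));

	printf("split %08x whole %08x%s\n", ~split, ~whole,
	       split == whole ? " (equal)" : " (MISMATCH)");
	return 0;
}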

diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676..88f4431 100644
--- a/fs/xfs/libxfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -9,20 +9,9 @@
  * cksum_offset parameter.
  */
 static inline __uint32_t
-xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+xfs_start_cksum(void *buffer, size_t length)
 {
-	__uint32_t zero = 0;
-	__uint32_t crc;
-
-	/* Calculate CRC up to the checksum. */
-	crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
-
-	/* Skip checksum field */
-	crc = crc32c(crc, &zero, sizeof(__u32));
-
-	/* Calculate the rest of the CRC. */
-	return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
-		      length - (cksum_offset + sizeof(__be32)));
+	return crc32c(XFS_CRC_SEED, buffer, length);
 }
 
 /*
@@ -42,22 +31,29 @@ xfs_end_cksum(__uint32_t crc)
  * Helper to generate the checksum for a buffer.
  */
 static inline void
-xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+xfs_update_cksum(void *buffer, size_t length, unsigned long cksum_offset)
 {
-	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+	__le32 *crcp = buffer + cksum_offset;
 
-	*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+	*crcp = 0; /* set to 0 before calculating checksum */
+	*crcp = xfs_end_cksum(xfs_start_cksum(buffer, length));
 }
 
 /*
  * Helper to verify the checksum for a buffer.
  */
 static inline int
-xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+xfs_verify_cksum(void *buffer, size_t length, unsigned long cksum_offset)
 {
-	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+	__le32 *crcp = buffer + cksum_offset;
+	__le32 crc = *crcp;
+	__le32 new_crc;
+
+	*crcp = 0;
+	new_crc = xfs_end_cksum(xfs_start_cksum(buffer, length));
+	*crcp = crc; /* restore original CRC in place */
 
-	return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+	return new_crc == crc;
 }
 
 #endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 8de9a3a..efd8daa 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -419,15 +419,11 @@ xfs_dinode_calc_crc(
 	struct xfs_mount	*mp,
 	struct xfs_dinode	*dip)
 {
-	__uint32_t		crc;
-
 	if (dip->di_version < 3)
 		return;
 
 	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
-			      XFS_DINODE_CRC_OFF);
-	dip->di_crc = xfs_end_cksum(crc);
+	xfs_update_cksum(dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF);
 }
 
 /*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3b74fa0..4e242f0 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1658,7 +1658,7 @@ xlog_pack_data(
  * This is a little more complicated than it should be because the various
  * headers and the actual data are non-contiguous.
  */
-__le32
+void
 xlog_cksum(
 	struct xlog		*log,
 	struct xlog_rec_header	*rhead,
@@ -1667,10 +1667,9 @@ xlog_cksum(
 {
 	__uint32_t		crc;
 
-	/* first generate the crc for the record header ... */
-	crc = xfs_start_cksum((char *)rhead,
-			      sizeof(struct xlog_rec_header),
-			      offsetof(struct xlog_rec_header, h_crc));
+	/* first generate the crc for the record header with 0 crc... */
+	rhead->h_crc = 0;
+	crc = xfs_start_cksum(rhead, sizeof(struct xlog_rec_header));
 
 	/* ... then for additional cycle data for v2 logs ... */
 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
@@ -1691,7 +1690,7 @@ xlog_cksum(
 	/* ... and finally for the payload */
 	crc = crc32c(crc, dp, size);
 
-	return xfs_end_cksum(crc);
+	rhead->h_crc = xfs_end_cksum(crc);
 }
 
 /*
@@ -1840,8 +1839,7 @@ xlog_sync(
 	}
 
 	/* calculcate the checksum */
-	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
-					    iclog->ic_datap, size);
+	xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size);
 #ifdef DEBUG
 	/*
 	 * Intentionally corrupt the log record CRC based on the error injection
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2b6eec5..18ba385 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -432,7 +432,7 @@ xlog_recover_finish(
 extern int
 xlog_recover_cancel(struct xlog *);
 
-extern __le32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+extern void	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
 			    char *dp, int size);
 
 extern kmem_zone_t *xfs_log_ticket_zone;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9b3d7c7..3f62d9a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5114,8 +5114,13 @@ xlog_recover_process(
 {
 	int			error;
 	__le32			crc;
+	__le32			old_crc;
 
-	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+	old_crc = rhead->h_crc;
+	rhead->h_crc = 0;
+	xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+	crc = rhead->h_crc;
+	rhead->h_crc = old_crc;
 
 	/*
 	 * Nothing else to do if this is a CRC verification pass. Just return
-- 
2.9.3



* Re: [rfc] larger batches for crc32c
  2016-10-27 16:17 [rfc] larger batches for crc32c Nicholas Piggin
@ 2016-10-27 18:29 ` Darrick J. Wong
  2016-10-28  3:21   ` Nicholas Piggin
  2016-10-27 21:42 ` Dave Chinner
  2016-11-04  0:12 ` [rfc] larger batches for crc32c Dave Chinner
  2 siblings, 1 reply; 20+ messages in thread
From: Darrick J. Wong @ 2016-10-27 18:29 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner

On Fri, Oct 28, 2016 at 03:17:47AM +1100, Nicholas Piggin wrote:
> Hi guys,
> 
> We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
> on powerpc. I could reproduce similar overheads with dbench as well.
> 
> 1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
>         |
>         ---__crc32c_le
>            |          
>             --1.11%--chksum_update
>                       |          
>                        --1.11%--crypto_shash_update
>                                  crc32c
>                                  xlog_cksum
>                                  xlog_sync
>                                  _xfs_log_force_lsn
>                                  xfs_file_fsync
>                                  vfs_fsync_range
>                                  do_fsync
>                                  sys_fsync
>                                  system_call
>                                  0x17738
>                                  0x17704
>                                  os_file_flush_func
>                                  fil_flush
> 
> As a rule, it helps the crc implementation if it can operate on as large a
> chunk as possible (alignment, startup overhead, etc). So I did a quick hack
> at getting XFS checksumming to feed crc32c() with larger chunks, by setting
> the existing crc to 0 before running over the entire buffer. Together with
> some small work on the powerpc crc implementation, crc drops below 0.1%.
> 
> I don't know if something like this would be acceptable? It's not pretty,
> but I didn't see an easier way.
> 
> diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
> index fad1676..88f4431 100644
> --- a/fs/xfs/libxfs/xfs_cksum.h
> +++ b/fs/xfs/libxfs/xfs_cksum.h
> @@ -9,20 +9,9 @@
>   * cksum_offset parameter.
>   */
>  static inline __uint32_t
> -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
> +xfs_start_cksum(void *buffer, size_t length)
>  {
> -	__uint32_t zero = 0;
> -	__uint32_t crc;
> -
> -	/* Calculate CRC up to the checksum. */
> -	crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
> -
> -	/* Skip checksum field */
> -	crc = crc32c(crc, &zero, sizeof(__u32));
> -
> -	/* Calculate the rest of the CRC. */
> -	return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
> -		      length - (cksum_offset + sizeof(__be32)));
> +	return crc32c(XFS_CRC_SEED, buffer, length);
>  }
>  
>  /*
> @@ -42,22 +31,29 @@ xfs_end_cksum(__uint32_t crc)
>   * Helper to generate the checksum for a buffer.
>   */
>  static inline void
> -xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
> +xfs_update_cksum(void *buffer, size_t length, unsigned long cksum_offset)
>  {
> -	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
> +	__le32 *crcp = buffer + cksum_offset;
>  
> -	*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
> +	*crcp = 0; /* set to 0 before calculating checksum */
> +	*crcp = xfs_end_cksum(xfs_start_cksum(buffer, length));
>  }
>  
>  /*
>   * Helper to verify the checksum for a buffer.
>   */
>  static inline int
> -xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
> +xfs_verify_cksum(void *buffer, size_t length, unsigned long cksum_offset)
>  {
> -	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
> +	__le32 *crcp = buffer + cksum_offset;
> +	__le32 crc = *crcp;
> +	__le32 new_crc;
> +
> +	*crcp = 0;
> +	new_crc = xfs_end_cksum(xfs_start_cksum(buffer, length));
> +	*crcp = crc; /* restore original CRC in place */

So... this temporarily modifies the metadata buffer as part of verifying
checksums in the read path.  I'm not sure about this, but what prevents
a situation where a read verifier races with the buffer actually being
written back to disk by the log?

The reason I ask is that we just ripped this optimization out of ext4
because it turned out that it does actually allow multiple readers of a
directory block (I think because the VFS now allows it).  ext4 allows
for multiple threads to call the read verifier on a block, which led to
races and verifier errors.  Normally the journal mediates write access
to buffers, but we don't allocate transactions for a readdir.  Worse
yet, I think the Samsung engineers who reported the ext4 issue also saw
zeroed checksums on disk because the journal can be writing buffers back
to disk concurrently with other processes having read access to the
buffer.

I /think/ XFS buffer locking rules prevent verifier races by only
allowing one owner of a buffer at a time.  However, I think if we ever
wanted to allow multiple threads to have read access to a buffer then
this is a bug waiting to happen.

--D

>  
> -	return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
> +	return new_crc == crc;
>  }
>  
>  #endif /* _XFS_CKSUM_H */
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> index 8de9a3a..efd8daa 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.c
> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> @@ -419,15 +419,11 @@ xfs_dinode_calc_crc(
>  	struct xfs_mount	*mp,
>  	struct xfs_dinode	*dip)
>  {
> -	__uint32_t		crc;
> -
>  	if (dip->di_version < 3)
>  		return;
>  
>  	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
> -	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
> -			      XFS_DINODE_CRC_OFF);
> -	dip->di_crc = xfs_end_cksum(crc);
> +	xfs_update_cksum(dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 3b74fa0..4e242f0 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -1658,7 +1658,7 @@ xlog_pack_data(
>   * This is a little more complicated than it should be because the various
>   * headers and the actual data are non-contiguous.
>   */
> -__le32
> +void
>  xlog_cksum(
>  	struct xlog		*log,
>  	struct xlog_rec_header	*rhead,
> @@ -1667,10 +1667,9 @@ xlog_cksum(
>  {
>  	__uint32_t		crc;
>  
> -	/* first generate the crc for the record header ... */
> -	crc = xfs_start_cksum((char *)rhead,
> -			      sizeof(struct xlog_rec_header),
> -			      offsetof(struct xlog_rec_header, h_crc));
> +	/* first generate the crc for the record header with 0 crc... */
> +	rhead->h_crc = 0;
> +	crc = xfs_start_cksum(rhead, sizeof(struct xlog_rec_header));
>  
>  	/* ... then for additional cycle data for v2 logs ... */
>  	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
> @@ -1691,7 +1690,7 @@ xlog_cksum(
>  	/* ... and finally for the payload */
>  	crc = crc32c(crc, dp, size);
>  
> -	return xfs_end_cksum(crc);
> +	rhead->h_crc = xfs_end_cksum(crc);
>  }
>  
>  /*
> @@ -1840,8 +1839,7 @@ xlog_sync(
>  	}
>  
>  	/* calculcate the checksum */
> -	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
> -					    iclog->ic_datap, size);
> +	xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size);
>  #ifdef DEBUG
>  	/*
>  	 * Intentionally corrupt the log record CRC based on the error injection
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 2b6eec5..18ba385 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -432,7 +432,7 @@ xlog_recover_finish(
>  extern int
>  xlog_recover_cancel(struct xlog *);
>  
> -extern __le32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
> +extern void	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
>  			    char *dp, int size);
>  
>  extern kmem_zone_t *xfs_log_ticket_zone;
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 9b3d7c7..3f62d9a 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -5114,8 +5114,13 @@ xlog_recover_process(
>  {
>  	int			error;
>  	__le32			crc;
> +	__le32			old_crc;
>  
> -	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
> +	old_crc = rhead->h_crc;
> +	rhead->h_crc = 0;
> +	xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
> +	crc = rhead->h_crc;
> +	rhead->h_crc = old_crc;
>  
>  	/*
>  	 * Nothing else to do if this is a CRC verification pass. Just return
> -- 
> 2.9.3
> 


* Re: [rfc] larger batches for crc32c
  2016-10-27 16:17 [rfc] larger batches for crc32c Nicholas Piggin
  2016-10-27 18:29 ` Darrick J. Wong
@ 2016-10-27 21:42 ` Dave Chinner
  2016-10-27 23:16   ` Dave Chinner
  2016-10-28  2:12   ` Nicholas Piggin
  2016-11-04  0:12 ` [rfc] larger batches for crc32c Dave Chinner
  2 siblings, 2 replies; 20+ messages in thread
From: Dave Chinner @ 2016-10-27 21:42 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner

On Fri, Oct 28, 2016 at 03:17:47AM +1100, Nicholas Piggin wrote:
> Hi guys,
> 
> We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
> on powerpc. I could reproduce similar overheads with dbench as well.
> 
> 1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
>         |
>         ---__crc32c_le
>            |          
>             --1.11%--chksum_update
>                       |          
>                        --1.11%--crypto_shash_update
>                                  crc32c
>                                  xlog_cksum
>                                  xlog_sync
>                                  _xfs_log_force_lsn
>                                  xfs_file_fsync
>                                  vfs_fsync_range
>                                  do_fsync
>                                  sys_fsync
>                                  system_call
>                                  0x17738
>                                  0x17704
>                                  os_file_flush_func
>                                  fil_flush

2-3% is the typical CRC CPU overhead I see on metadata/log intensive
workloads on x86-64, so this doesn't seem unreasonable.

Looking more closely at xlog_cksum, it does:

	crc = xfs_start_cksum(sizeof(struct xlog_rec_header) ...
	for (i = 0; i < xheads; i++) {
		...
		crc = crc32c(crc, ... sizeof(struct xlog_rec_ext_header));
		...
	}
	/* ... and finally for the payload */
	crc = crc32c(crc, dp, size);

	return xfs_end_cksum(crc);

The vast majority of the work it does is in the ".. and finally for
the payload " call. The first is a sector, the loop (up to 8 times
for 256k log buffer sizes) is over single sectors, and the payload
is up to 256kb of data. So the payload CRC is the vast majority of
the data being CRCed and so should dominate the CPU usage here.  It
doesn't look like optimising xfs_start_cksum() would make much
difference...
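
(Rough arithmetic: for a 256k log buffer that's one 512 byte header
sector plus up to 8 extended header sectors - call it ~4.5k of header
data - against up to ~256k of payload, so well over 95% of the bytes
being CRC'd already go through that single payload call.)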

> As a rule, it helps the crc implementation if it can operate on as large a
> chunk as possible (alignment, startup overhead, etc). So I did a quick hack
> at getting XFS checksumming to feed crc32c() with larger chunks, by setting
> the existing crc to 0 before running over the entire buffer. Together with
> some small work on the powerpc crc implementation, crc drops below 0.1%.

I wouldn't have expected reducing call numbers and small alignment
changes to make that amount of difference given the amount of data
we are actually checksumming. How much of that difference was due to
the improved CRC implementation?

FWIW, can you provide some additional context by grabbing the log
stats that tell us the load on the log that is generating this
profile?  A sample over a minute of a typical workload (with a
corresponding CPU profile) would probably be sufficient. You can get
them simply by zeroing the xfs stats via
/proc/sys/fs/xfs/stats_clear at the start of the sample period and
then dumping /proc/fs/xfs/stat at the end.

> I don't know if something like this would be acceptable? It's not pretty,
> but I didn't see an easier way.

ISTR we made the choice not to do that to avoid potential problems
with potential race conditions and bugs (i.e. don't modify anything
in objects on read access) but I can't point you at anything
specific...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [rfc] larger batches for crc32c
  2016-10-27 21:42 ` Dave Chinner
@ 2016-10-27 23:16   ` Dave Chinner
  2016-10-28  2:12   ` Nicholas Piggin
  1 sibling, 0 replies; 20+ messages in thread
From: Dave Chinner @ 2016-10-27 23:16 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner

On Fri, Oct 28, 2016 at 08:42:44AM +1100, Dave Chinner wrote:
> On Fri, Oct 28, 2016 at 03:17:47AM +1100, Nicholas Piggin wrote:
> > Hi guys,
> > 
> > We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
> > on powerpc. I could reproduce similar overheads with dbench as well.
> > 
> > 1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
> >         |
> >         ---__crc32c_le
> >            |          
> >             --1.11%--chksum_update
> >                       |          
> >                        --1.11%--crypto_shash_update
> >                                  crc32c
> >                                  xlog_cksum
> >                                  xlog_sync
> >                                  _xfs_log_force_lsn
> >                                  xfs_file_fsync
> >                                  vfs_fsync_range
> >                                  do_fsync
> >                                  sys_fsync
> >                                  system_call
> >                                  0x17738
> >                                  0x17704
> >                                  os_file_flush_func
> >                                  fil_flush
> 
> 2-3% is the typical CRC CPU overhead I see on metadata/log intensive
> workloads on x86-64, so this doesn't seem unreasonable.

FWIW, I just noticed that qemu is now passing through hardware CRC
feature bits to the guest, so it's now using the hardware
accelerated intel module for CRCs. That "2-3%" I mentioned I used
to see from __crc32c_le is now:

   0.35%  [kernel]  [k] crc32c_pcl_intel_update

And there's about 250MB/s of log+metadata IO being CRC'd in this
workload, so I don't think that the way XFS splits the CRCs into
multiple segments is really all that significant...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [rfc] larger batches for crc32c
  2016-10-27 21:42 ` Dave Chinner
  2016-10-27 23:16   ` Dave Chinner
@ 2016-10-28  2:12   ` Nicholas Piggin
  2016-10-28  4:29     ` Dave Chinner
  2016-10-28  5:02     ` Nicholas Piggin
  1 sibling, 2 replies; 20+ messages in thread
From: Nicholas Piggin @ 2016-10-28  2:12 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner, Darrick J. Wong

On Fri, 28 Oct 2016 08:42:44 +1100
Dave Chinner <david@fromorbit.com> wrote:

> On Fri, Oct 28, 2016 at 03:17:47AM +1100, Nicholas Piggin wrote:
> > Hi guys,
> > 
> > We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
> > on powerpc. I could reproduce similar overheads with dbench as well.
> > 
> > 1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
> >         |
> >         ---__crc32c_le
> >            |          
> >             --1.11%--chksum_update
> >                       |          
> >                        --1.11%--crypto_shash_update
> >                                  crc32c
> >                                  xlog_cksum
> >                                  xlog_sync
> >                                  _xfs_log_force_lsn
> >                                  xfs_file_fsync
> >                                  vfs_fsync_range
> >                                  do_fsync
> >                                  sys_fsync
> >                                  system_call
> >                                  0x17738
> >                                  0x17704
> >                                  os_file_flush_func
> >                                  fil_flush  
> 
> 2-3% is the typical CRC CPU overhead I see on metadata/log intensive
> workloads on x86-64, so this doesn't seem unreasonable.
> 
> Looking more closely at xlog_cksum, it does:
> 
> 	crc = xfs_start_cksum(sizeof(struct xlog_rec_header) ...
> 	for (i = 0; i < xheads; i++) {
> 		...
> 		crc = crc32c(crc, ... sizeof(struct xlog_rec_ext_header));
> 		...
> 	}
> 	/* ... and finally for the payload */
> 	crc = crc32c(crc, dp, size);
> 
> 	return xfs_end_cksum(crc);
> 
> The vast majority of the work it does is in the ".. and finally for
> the payload " call. The first is a sector, the loop (up to 8 times
> for 256k log buffer sizes) is over single sectors, and the payload
> is up to 256kb of data. So the payload CRC is the vast majority of
> the data being CRCed and so should dominate the CPU usage here.  It
> doesn't look like optimising xfs_start_cksum() would make much
> difference...
> 
> > As a rule, it helps the crc implementation if it can operate on as large a
> > chunk as possible (alignment, startup overhead, etc). So I did a quick hack
> > at getting XFS checksumming to feed crc32c() with larger chunks, by setting
> > the existing crc to 0 before running over the entire buffer. Together with
> > some small work on the powerpc crc implementation, crc drops below 0.1%.  
> 
> I wouldn't have expected reducing call numbers and small alignment
> changes to make that amount of difference given the amount of data
> we are actually checksumming. How much of that difference was due to
> the improved CRC implementation?

Sorry, I should have been more clear about what was happening. Not enough
sleep. The larger sizes allow the vectorized crc implementation to kick in.

We should still be able to improve things on the powerpc side without any
changes to XFS, it was just something I noticed would be nice to improve.

As Darrick noted, it is modifying the field in-place for the verifiers,
which is nasty and the reason I don't propose it as a real patch (because I
don't know enough about XFS to say whether it's completely safe).


> FWIW, can you provide some additional context by grabbing the log
> stats that tell us the load on the log that is generating this
> profile?  A sample over a minute of a typical workload (with a
> corresponding CPU profile) would probably be sufficient. You can get
> them simply by zeroing the xfs stats via
> /proc/sys/fs/xfs/stats_clear at the start of the sample period and
> then dumping /proc/fs/xfs/stat at the end.

Yeah I'll get some better information for you.

> > I don't know if something like this would be acceptable? It's not pretty,
> > but I didn't see an easier way.  
> 
> ISTR we made the choice not to do that to avoid potential problems
> with potential race conditions and bugs (i.e. don't modify anything
> in objects on read access) but I can't point you at anything
> specific...

Sounds pretty reasonable, especially for the verifiers. For the paths
that create/update the checksums (including this log checksum), it seems
like it should be less controversial.

But yes let me get more data first. Thanks for taking a look.

Thanks,
Nick


* Re: [rfc] larger batches for crc32c
  2016-10-27 18:29 ` Darrick J. Wong
@ 2016-10-28  3:21   ` Nicholas Piggin
  0 siblings, 0 replies; 20+ messages in thread
From: Nicholas Piggin @ 2016-10-28  3:21 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner

On Thu, 27 Oct 2016 11:29:00 -0700
"Darrick J. Wong" <darrick.wong@oracle.com> wrote:

> On Fri, Oct 28, 2016 at 03:17:47AM +1100, Nicholas Piggin wrote:
> > Hi guys,
> > 
> > We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
> > on powerpc. I could reproduce similar overheads with dbench as well.
> > 
> > 1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
> >         |
> >         ---__crc32c_le
> >            |          
> >             --1.11%--chksum_update
> >                       |          
> >                        --1.11%--crypto_shash_update
> >                                  crc32c
> >                                  xlog_cksum
> >                                  xlog_sync
> >                                  _xfs_log_force_lsn
> >                                  xfs_file_fsync
> >                                  vfs_fsync_range
> >                                  do_fsync
> >                                  sys_fsync
> >                                  system_call
> >                                  0x17738
> >                                  0x17704
> >                                  os_file_flush_func
> >                                  fil_flush
> > 
> > As a rule, it helps the crc implementation if it can operate on as large a
> > chunk as possible (alignment, startup overhead, etc). So I did a quick hack
> > at getting XFS checksumming to feed crc32c() with larger chunks, by setting
> > the existing crc to 0 before running over the entire buffer. Together with
> > some small work on the powerpc crc implementation, crc drops below 0.1%.
> > 
> > I don't know if something like this would be acceptable? It's not pretty,
> > but I didn't see an easier way.
> > 
> > diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
> > index fad1676..88f4431 100644
> > --- a/fs/xfs/libxfs/xfs_cksum.h
> > +++ b/fs/xfs/libxfs/xfs_cksum.h
> > @@ -9,20 +9,9 @@
> >   * cksum_offset parameter.
> >   */
> >  static inline __uint32_t
> > -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
> > +xfs_start_cksum(void *buffer, size_t length)
> >  {
> > -	__uint32_t zero = 0;
> > -	__uint32_t crc;
> > -
> > -	/* Calculate CRC up to the checksum. */
> > -	crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
> > -
> > -	/* Skip checksum field */
> > -	crc = crc32c(crc, &zero, sizeof(__u32));
> > -
> > -	/* Calculate the rest of the CRC. */
> > -	return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
> > -		      length - (cksum_offset + sizeof(__be32)));
> > +	return crc32c(XFS_CRC_SEED, buffer, length);
> >  }
> >  
> >  /*
> > @@ -42,22 +31,29 @@ xfs_end_cksum(__uint32_t crc)
> >   * Helper to generate the checksum for a buffer.
> >   */
> >  static inline void
> > -xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
> > +xfs_update_cksum(void *buffer, size_t length, unsigned long cksum_offset)
> >  {
> > -	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
> > +	__le32 *crcp = buffer + cksum_offset;
> >  
> > -	*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
> > +	*crcp = 0; /* set to 0 before calculating checksum */
> > +	*crcp = xfs_end_cksum(xfs_start_cksum(buffer, length));
> >  }
> >  
> >  /*
> >   * Helper to verify the checksum for a buffer.
> >   */
> >  static inline int
> > -xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
> > +xfs_verify_cksum(void *buffer, size_t length, unsigned long cksum_offset)
> >  {
> > -	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
> > +	__le32 *crcp = buffer + cksum_offset;
> > +	__le32 crc = *crcp;
> > +	__le32 new_crc;
> > +
> > +	*crcp = 0;
> > +	new_crc = xfs_end_cksum(xfs_start_cksum(buffer, length));
> > +	*crcp = crc; /* restore original CRC in place */  
> 
> So... this temporarily modifies the metadata buffer as part of verifying
> checksums in the read path.  I'm not sure about this, but what prevents
> a situation where a read verifier races with the buffer actually being
> written back to disk by the log?

It does. Yes, that's probably the nastiest bit of it. If it were to
be turned into a "real" patch, perhaps the burden should be on the
callers to clear the checksum themselves and do the comparison.
Although it may turn out to be a premature optimisation, because I
have only been looking at log-heavy write operations so far.
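
One way to keep the verify side non-mutating would be to copy just the
sector that contains the CRC field and zero it in the copy, so the big
payload call still runs over the original buffer in one chunk.
Something like the sketch below (untested; the helper name and the
assumption that cksum_offset always sits in the first 512 bytes are
mine):

/* sketch only: verify without writing to the buffer */
static int
xfs_verify_cksum_nowrite(const char *buffer, size_t length,
			 unsigned long cksum_offset)
{
	char		head[512];	/* covers all current CRC offsets */
	size_t		hlen = min_t(size_t, length, sizeof(head));
	__uint32_t	crc;

	memcpy(head, buffer, hlen);
	*(__le32 *)(head + cksum_offset) = 0;

	crc = crc32c(XFS_CRC_SEED, head, hlen);
	if (length > hlen)
		crc = crc32c(crc, buffer + hlen, length - hlen);

	return *(const __le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
}

The extra copy and the stack usage would need more thought, but it
avoids dirtying shared buffers in the read path.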

> 
> The reason I ask is that we just ripped this optimization out of ext4
> because it turned out that it does actually allow multiple readers of a
> directory block (I think because the VFS now allows it).  ext4 allows
> for multiple threads to call the read verifier on a block, which lead to
> races and verifier errors.  Normally the journal mediates write access
> to buffers, but we don't allocate transactions for a readdir.  Worse
> yet, I think the Samsung engineers who reported the ext4 issue also saw
> zeroed checksums on disk because the journal can be writing buffers back
> to disk concurrently with other processes having read access to the
> buffer.
> 
> I /think/ XFS buffer locking rules prevent verifier races by only
> allowing one owner of a buffer at a time.  However, I think if we ever
> wanted to allow multiple threads to have read access to a buffer then
> this is a bug waiting to happen.


XFS *looked* like it's pretty tight there, at least the inode and
buffer paths I looked at (though not knowing much about XFS). But I
agree with you and Dave it's not something that can be taken lightly.

As an aside, it's interesting ext4 is doing concurrent verifications
on the same block. Is that to avoid checksumming in IO completion?

Thanks,
Nick


* Re: [rfc] larger batches for crc32c
  2016-10-28  2:12   ` Nicholas Piggin
@ 2016-10-28  4:29     ` Dave Chinner
  2016-10-28  5:02     ` Nicholas Piggin
  1 sibling, 0 replies; 20+ messages in thread
From: Dave Chinner @ 2016-10-28  4:29 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-xfs, Christoph Hellwig, Dave Chinner, Darrick J. Wong

On Fri, Oct 28, 2016 at 01:12:34PM +1100, Nicholas Piggin wrote:
> On Fri, 28 Oct 2016 08:42:44 +1100
> Dave Chinner <david@fromorbit.com> wrote:
> > > As a rule, it helps the crc implementation if it can operate on as large a
> > > chunk as possible (alignment, startup overhead, etc). So I did a quick hack
> > > at getting XFS checksumming to feed crc32c() with larger chunks, by setting
> > > the existing crc to 0 before running over the entire buffer. Together with
> > > some small work on the powerpc crc implementation, crc drops below 0.1%.  
> > 
> > I wouldn't have expected reducing call numbers and small alignment
> > changes to make that amount of difference given the amount of data
> > we are actually checksumming. How much of that difference was due to
> > the improved CRC implementation?
> 
> Sorry, I should have been more clear about what was happening. Not enough
> sleep. The larger sizes allows the vectorized crc implementation to kick in.

Ah, ok. So it never gets out of the slow, branchy lead-in/lead-out
code for the smaller chunks. Fair enough.

For the verify side it probably doesn't matter that much - the
latency of the initial memory fetches on the data to be verified is
likely to be the dominant factor for performance...

> > FWIW, can you provide some additional context by grabbing the log
> > stats that tell us the load on the log that is generating this
> > profile?  A sample over a minute of a typical workload (with a
> > corresponding CPU profile) would probably be sufficient. You can get
> > them simply by zeroing the xfs stats via
> > /proc/sys/fs/xfs/stats_clear at the start of the sample period and
> > then dumping /proc/fs/xfs/stat at the end.
> 
> Yeah I'll get some better information for you.
> 
> > > I don't know if something like this would be acceptable? It's not pretty,
> > > but I didn't see an easier way.  
> > 
> > ISTR we made the choice not to do that to avoid potential problems
> > with potential race conditions and bugs (i.e. don't modify anything
> > in objects on read access) but I can't point you at anything
> > specific...
> 
> Sounds pretty reasonable, especially for the verifiers. For the paths
> that create/update the checksums (including this log checksum), it seems
> like it should be less controversial.

Yup. For the paths that update the checksum we have to have an
exclusive lock on the object (and will always require that), so I
can't see a problem with changing the update code to use this
method.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [rfc] larger batches for crc32c
  2016-10-28  2:12   ` Nicholas Piggin
  2016-10-28  4:29     ` Dave Chinner
@ 2016-10-28  5:02     ` Nicholas Piggin
  2016-10-31  3:08       ` Dave Chinner
  1 sibling, 1 reply; 20+ messages in thread
From: Nicholas Piggin @ 2016-10-28  5:02 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner, Darrick J. Wong

On Fri, 28 Oct 2016 13:12:34 +1100
Nicholas Piggin <npiggin@gmail.com> wrote:

> On Fri, 28 Oct 2016 08:42:44 +1100
> Dave Chinner <david@fromorbit.com> wrote:
> 

> > > I don't know if something like this would be acceptable? It's not pretty,
> > > but I didn't see an easier way.    
> > 
> > ISTR we made the choice not to do that to avoid potential problems
> > with potential race conditions and bugs (i.e. don't modify anything
> > in objects on read access) but I can't point you at anything
> > specific...  
> 
> Sounds pretty reasonable, especially for the verifiers. For the paths
> that create/update the checksums (including this log checksum), it seems
> like it should be less controversial.
> 
> But yes let me get more data first. Thanks for taking a look.

Okay, the XFS crc sizes indeed don't look so bad, so it's more the
crc implementation I suppose. I was seeing a lot of small calls to crc,
but as a fraction of the total number of bytes, it's not as significant
as I thought. That said, there is some improvement you may be able to
get even from the x86 implementation.

I took an ilog2 histogram of frequency and total bytes going to XFS
checksum, with total, head, and tail lengths. I'll give as percentages
of total for easier comparison (total calls were around 1 million and
500MB of data):

                frequency                   bytes
ilog2   total   | head | tail       total | head | tail
  3         0     1.51      0           0   0.01      0
  4         0        0      0           0      0      0
  5         0        0      0           0      0      0
  6         0    22.35      0           0   1.36      0
  7         0    76.10      0           0  14.40      0
  8         0     0.04     ~0           0   0.02     ~0
  9     22.25       ~0  98.39       13.81     ~0  71.07
 10     76.14        0      0       73.77      0      0
 11         0        0      0           0      0      0
 12         0        0   1.60           0      0  12.39
 13      1.60        0      0       12.42      0      0


Keep in mind you have to sum the number of bytes for head and tail to
get ~100%.
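
The buckets are just ilog2 bins of the length passed to crc32c().
Hypothetical instrumentation along these lines would produce them (the
counter names are made up for illustration, not what I actually ran):

static atomic64_t cksum_calls[16], cksum_bytes[16];

static void cksum_account(size_t len)
{
	int bucket = min(ilog2(len | 1), 15);	/* len | 1 avoids ilog2(0) */

	atomic64_inc(&cksum_calls[bucket]);
	atomic64_add(len, &cksum_bytes[bucket]);
}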

Now for x86-64, you need to be at 9-10 (depending on configuration) or
greater to exceed the breakeven point for their fastest implementation.
Split crc implementation will use the fast algorithm for about 85% of
bytes in the best case, 12% at worst. Combined gets there for 85% at
worst, and 100% at best. The slower x86 implementation still uses a
hardware instruction, so it doesn't do too badly.

For powerpc, the breakeven is at 512 + 16 bytes (9ish), but it falls
back to generic implementation for bytes below that.  I think we can
reduce the break even point on powerpc slightly and capture most of
the rest, so it's not so bad.
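
To make the breakeven point concrete, the arch implementations
essentially do a length-based dispatch along these lines (illustrative
only; the constant and helper names are made up, not the actual
powerpc or x86 code):

#define CRC32C_VEC_MIN	512	/* assumed breakeven, not a real constant */

u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	/* short buffers: vector setup costs more than it saves */
	if (len < CRC32C_VEC_MIN)
		return crc32c_scalar(crc, p, len);
	return crc32c_vectorised(crc, p, len);
}

Every byte that lands in a sub-breakeven call is charged the slow
scalar rate, which is why feeding whole buffers through a single call
helps.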

Anyway at least that's a data point to consider. Small improvement is
possible.

Thanks,
Nick



* Re: [rfc] larger batches for crc32c
  2016-10-28  5:02     ` Nicholas Piggin
@ 2016-10-31  3:08       ` Dave Chinner
  2016-11-01  3:39         ` Nicholas Piggin
  0 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-10-31  3:08 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-xfs, Christoph Hellwig, Dave Chinner, Darrick J. Wong

On Fri, Oct 28, 2016 at 04:02:18PM +1100, Nicholas Piggin wrote:
> Okay, the XFS crc sizes indeed don't look too so bad, so it's more the
> crc implementation I suppose. I was seeing a lot of small calls to crc,
> but as a fraction of the total number of bytes, it's not as significant
> as I thought. That said, there is some improvement you may be able to
> get even from x86 implementation.
> 
> I took an ilog2 histogram of frequency and total bytes going to XFS

Which means ilog2 = 3 is 8-15 bytes and 9 is 512-1023 bytes? 

> checksum, with total, head, and tail lengths. I'll give as percentages
> of total for easier comparison (total calls were around 1 million and
> 500MB of data):

Does this table match the profile you showed with all the overhead
being through the fsync->log write path?

Decode table - these are the offsets of crc fields in XFS structures.
('pahole fs/xfs/xfs.o |grep crc' and cleaned up)

	AGF header                 agf_crc;              /*   216     4 */
	short btree                bb_crc;               /*    44     4 */
	long btree                 bb_crc;               /*    56     4 */
	log buffer header          h_crc;                /*    32     4 */
	AG Free list               agfl_crc;             /*    32     4 */
	dir/attr leaf/node block   crc;                  /*    12     4 */
	remote attribute           rm_crc;               /*    12     4 */
	directory data block       crc;                  /*     4     4 */
	dquot                      dd_crc;               /*   108     4 */
	AGI header                 agi_crc;              /*   312     4 */
	inode                      di_crc;               /*   100     4 */
	superblock                 sb_crc;               /*   224     4 */
	symlink                    sl_crc;               /*    12     4 */

> 
>                 frequency                   bytes
> ilog2   total   | head | tail       total | head | tail
>   3         0     1.51      0           0   0.01      0

Directory data blocks.

>   4         0        0      0           0      0      0
>   5         0        0      0           0      0      0
>   6         0    22.35      0           0   1.36      0

log buffer headers, short/long btree blocks

>   7         0    76.10      0           0  14.40      0

Inodes.

>   8         0     0.04     ~0           0   0.02     ~0
>   9     22.25       ~0  98.39       13.81     ~0  71.07

Inode, log buffer header tails.

>  10     76.14        0      0       73.77      0      0

Full sector, no head, no tail (i.e. external crc store)? I think
only log buffers (the extended header sector CRCs) can do that.
That implies a large log buffer (e.g. 256k) is configured and
(possibly) log stripe unit padding is being done. What is the
xfs_info and mount options from the test filesystem?

>  11         0        0      0           0      0      0
>  12         0        0   1.60           0      0  12.39

Directory data block tails.

>  13      1.60        0      0       12.42      0      0

Larger than 4k? Probably only log buffers.

> Keep in mind you have to sum the number of bytes for head and tail to
> get ~100%.
> 
> Now for x86-64, you need to be at 9-10 (depending on configuration) or
> greater to exceed the breakeven point for their fastest implementation.
> Split crc implementation will use the fast algorithm for about 85% of
> bytes in the best case, 12% at worst. Combined gets there for 85% at
> worst, and 100% at best. The slower x86 implementation still uses a
> hardware instruction, so it doesn't do too badly.
> 
> For powerpc, the breakeven is at 512 + 16 bytes (9ish), but it falls
> back to generic implementation for bytes below that.

Which means for the most common objects we won't be able to reach
breakeven easily simply because of the size of the objects we are
running CRCs on. e.g. sectors and inodes/dquots by default are all
512 bytes or smaller. THere's only so much that can be optimised
here...

> I think we can
> reduce the break even point on powerpc slightly and capture most of
> the rest, so it's not so bad.
> 
> Anyway at least that's a data point to consider. Small improvement is
> possible.

Yup, but there's no huge gain to be made here - these numbers say to
me that the problem may not be the CRC overhead, but instead is the
amount of CRC work being done. Hence my request for mount options
+ xfs_info to determine if what you are seeing is simply a bad fs
configuration for optimal small log write performance. CRC overhead
may just be a symptom of a filesystem config issue...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [rfc] larger batches for crc32c
  2016-10-31  3:08       ` Dave Chinner
@ 2016-11-01  3:39         ` Nicholas Piggin
  2016-11-01  5:47           ` Dave Chinner
  0 siblings, 1 reply; 20+ messages in thread
From: Nicholas Piggin @ 2016-11-01  3:39 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner, Darrick J. Wong

On Mon, 31 Oct 2016 14:08:53 +1100
Dave Chinner <david@fromorbit.com> wrote:

> On Fri, Oct 28, 2016 at 04:02:18PM +1100, Nicholas Piggin wrote:
> > Okay, the XFS crc sizes indeed don't look too so bad, so it's more the
> > crc implementation I suppose. I was seeing a lot of small calls to crc,
> > but as a fraction of the total number of bytes, it's not as significant
> > as I thought. That said, there is some improvement you may be able to
> > get even from x86 implementation.
> > 
> > I took an ilog2 histogram of frequency and total bytes going to XFS  
> 
> Which means ilog2 = 3 is 8-15 bytes and 9 is 512-1023 bytes? 

Yes.

> > checksum, with total, head, and tail lengths. I'll give as percentages
> > of total for easier comparison (total calls were around 1 million and
> > 500MB of data):  
> 
> Does this table match the profile you showed with all the overhead
> being through the fsync->log write path?

Yes.

[snip interesting summary]

> Full sector, no head, no tail (i.e. external crc store)? I think
> only log buffers (the extended header sector CRCs) can do that.
> That implies a large log buffer (e.g. 256k) is configured and
> (possibly) log stripe unit padding is being done. What is the
> xfs_info and mount options from the test filesystem?

See the end of the mail.

[snip]

> > Keep in mind you have to sum the number of bytes for head and tail to
> > get ~100%.
> > 
> > Now for x86-64, you need to be at 9-10 (depending on configuration) or
> > greater to exceed the breakeven point for their fastest implementation.
> > Split crc implementation will use the fast algorithm for about 85% of
> > bytes in the best case, 12% at worst. Combined gets there for 85% at
> > worst, and 100% at best. The slower x86 implementation still uses a
> > hardware instruction, so it doesn't do too badly.
> > 
> > For powerpc, the breakeven is at 512 + 16 bytes (9ish), but it falls
> > back to generic implementation for bytes below that.  
> 
> Which means for the most common objects we won't be able to reach
> breakeven easily simply because of the size of the objects we are
> running CRCs on. e.g. sectors and inodes/dquots by default are all
> 512 bytes or smaller. THere's only so much that can be optimised
> here...

Well for this workload at least, the full checksum size seems always
>= 512. The small heads cut it down and drag a lot of crc32c calls
from 1024-2047 range (optimal for Intel) to 512-1023. I don't *think*
I've done the wrong thing here, but if it looks odd to you, I'll go
back and double check.

> 
> > I think we can
> > reduce the break even point on powerpc slightly and capture most of
> > the rest, so it's not so bad.
> > 
> > Anyway at least that's a data point to consider. Small improvement is
> > possible.  
> 
> Yup, but there's no huge gain to be made here - these numbers say to
> me that the problem may not be the CRC overhead, but instead is the
> amount of CRC work being done. Hence my request for mount options
> + xfs_info to determine if what you are seeing is simply a bad fs
> configuration for optimal small log write performance. CRC overhead
> may just be a symptom of a filesystem config issue...

Yes sorry, I forgot to send an xfs_info sample. mkfs.xfs is 4.3.0 from
Ubuntu 16.04.

npiggin@fstn3:/etc$ sudo mkfs.xfs -f /dev/ram0
specified blocksize 4096 is less than device physical sector size 65536
switching to logical sector size 512
meta-data=/dev/ram0              isize=512    agcount=4, agsize=4194304 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=0
data     =                       bsize=4096   blocks=16777216, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0 ftype=1
log      =internal log           bsize=4096   blocks=8192, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0

Mount options are standard:
/dev/ram0 on /mnt type xfs (rw,relatime,attr2,inode64,noquota)

xfs_info sample:
extent_alloc 64475 822567 74740 1164625
abt 0 0 0 0
blk_map 1356685 1125591 334183 64406 227190 2816523 0
bmbt 0 0 0 0
dir 79418 460612 460544 5685160
trans 0 3491960 0
ig 381191 378085 0 3106 0 2972 153329
log 89045 2859542 62 132145 143932
push_ail 3491960 24 619 53860 0 6433 13135 284324 0 445
xstrat 64342 0
rw 951375 2937203
attr 0 0 0 0
icluster 47412 38985 221903
vnodes 5294 0 0 0 381123 381123 381123 0
buf 4497307 6910 4497106 1054073 13012 201 0 0 0
abtb2 139597 675266 27639 27517 0 0 0 0 0 0 0 0 0 0 1411718
abtc2 240942 1207277 120532 120410 0 0 0 0 0 0 0 0 0 0 4618844
bmbt2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ibt2 762383 3048311 69 67 0 0 0 0 0 0 0 0 0 0 263
fibt2 1114420 2571311 143583 143582 0 0 0 0 0 0 0 0 0 0 1232534
rmapbt 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
refcntbt 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
qm 0 0 0 0 0 0 0 0
xpc 3366711296 24870568605 34799779740
debug 0

Thanks,
Nick


* Re: [rfc] larger batches for crc32c
  2016-11-01  3:39         ` Nicholas Piggin
@ 2016-11-01  5:47           ` Dave Chinner
  2016-11-02  2:18             ` [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c) L.A. Walsh
  0 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-11-01  5:47 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-xfs, Christoph Hellwig, Dave Chinner, Darrick J. Wong

On Tue, Nov 01, 2016 at 02:39:18PM +1100, Nicholas Piggin wrote:
> On Mon, 31 Oct 2016 14:08:53 +1100
> Dave Chinner <david@fromorbit.com> wrote:
> 
> > On Fri, Oct 28, 2016 at 04:02:18PM +1100, Nicholas Piggin wrote:
> > > Okay, the XFS crc sizes indeed don't look too so bad, so it's more the
> > > crc implementation I suppose. I was seeing a lot of small calls to crc,
> > > but as a fraction of the total number of bytes, it's not as significant
> > > as I thought. That said, there is some improvement you may be able to
> > > get even from x86 implementation.
> > > 
> > > I took an ilog2 histogram of frequency and total bytes going to XFS  
> > 
> > Which means ilog2 = 3 is 8-15 bytes and 9 is 512-1023 bytes? 
> 
> Yes.
> 
> > > checksum, with total, head, and tail lengths. I'll give as percentages
> > > of total for easier comparison (total calls were around 1 million and
> > > 500MB of data):  
> > 
> > Does this table match the profile you showed with all the overhead
> > being through the fsync->log write path?
> 
> Yes.
> 
> [snip interesting summary]
> 
> > Full sector, no head, no tail (i.e. external crc store)? I think
> > only log buffers (the extended header sector CRCs) can do that.
> > That implies a large log buffer (e.g. 256k) is configured and
> > (possibly) log stripe unit padding is being done. What is the
> > xfs_info and mount options from the test filesystem?
> 
> See the end of the mail.
> 
> [snip]
> 
> > > Keep in mind you have to sum the number of bytes for head and tail to
> > > get ~100%.
> > > 
> > > Now for x86-64, you need to be at 9-10 (depending on configuration) or
> > > greater to exceed the breakeven point for their fastest implementation.
> > > Split crc implementation will use the fast algorithm for about 85% of
> > > bytes in the best case, 12% at worst. Combined gets there for 85% at
> > > worst, and 100% at best. The slower x86 implementation still uses a
> > > hardware instruction, so it doesn't do too badly.
> > > 
> > > For powerpc, the breakeven is at 512 + 16 bytes (9ish), but it falls
> > > back to generic implementation for bytes below that.  
> > 
> > Which means for the most common objects we won't be able to reach
> > breakeven easily simply because of the size of the objects we are
> > running CRCs on. e.g. sectors and inodes/dquots by default are all
> > 512 bytes or smaller. THere's only so much that can be optimised
> > here...
> 
> Well for this workload at least, the full checksum size seems always
> >= 512. The small heads cut it down and drag a lot of crc32c calls
> from 1024-2047 range (optimal for Intel) to 512-1023. I don't *think*
> I've done the wrong thing here, but if it looks odd to you, I'll go
> back and double check.

That doesn't make immediate sense to me. Objects being CRC'd in XFS
are either:

	- sector size or smaller (so head/tail < 512)
	- block sized (4k) with a very small head (< 64 bytes)
	  and a ~4k tail.
	- sector padded log writes of up to iclogbuf size (default
	  32k) with a short head/tail for the first sector, then
	  a direct crc32c() call for the rest.

The only things that can fall into the 1024-2047 range are /very
small/ log writes, i.e. fsync()ing an inode change, but nothing
else.

FWIW, from the stats below, there are lots more transactions than log
forces and we know that the log forces driven by fsync are the
majority of the CRC traffic. The stats show about 1.4GB of data got
written to the log, so all that would have been CRCed. with ~140k
log forces, that's an average of 10kb per log write that was CRC'd.
Some are going to be much larger, some are going to be small. You'll
see the very small log writes be affected by head/tail size, but
otherwise the difference will be minimal because there are still
multiple crc32c calls for a log buffer...

So I think what you are seeing is simply the effect of variable log
write size due to frequent fsyncs preventing the log from fully
utilising its in-memory log write reduction optimisations to reduce
log traffic and fully utilise the log buffer space to minimise CRC
overhead....

I've left the decoding of the stats below if you want to read it,
otherwise there's nothing else of note here...

> > > I think we can
> > > reduce the break even point on powerpc slightly and capture most of
> > > the rest, so it's not so bad.
> > > 
> > > Anyway at least that's a data point to consider. Small improvement is
> > > possible.  
> > 
> > Yup, but there's no huge gain to be made here - these numbers say to
> > me that the problem may not be the CRC overhead, but instead is the
> > amount of CRC work being done. Hence my request for mount options
> > + xfs_info to determine if what you are seeing is simply a bad fs
> > configuration for optimal small log write performance. CRC overhead
> > may just be a symptom of a filesystem config issue...
> 
> Yes sorry, I forgot to send an xfs_info sample. mkfs.xfs is 4.3.0 from
> Ubuntu 16.04.
> 
> npiggin@fstn3:/etc$ sudo mkfs.xfs -f /dev/ram0
> specified blocksize 4096 is less than device physical sector size 65536
> switching to logical sector size 512
> meta-data=/dev/ram0              isize=512    agcount=4, agsize=4194304 blks
>          =                       sectsz=512   attr=2, projid32bit=1
>          =                       crc=1        finobt=1, sparse=0
> data     =                       bsize=4096   blocks=16777216, imaxpct=25
>          =                       sunit=0      swidth=0 blks
> naming   =version 2              bsize=4096   ascii-ci=0 ftype=1
> log      =internal log           bsize=4096   blocks=8192, version=2
>          =                       sectsz=512   sunit=0 blks, lazy-count=1
> realtime =none                   extsz=4096   blocks=0, rtextents=0
> 
> Mount options are standard:
> /dev/ram0 on /mnt type xfs (rw,relatime,attr2,inode64,noquota)

OK, nothing unusual here at all. Nothing to make me suspect a config
issue.

> xfs_info sample:
....
> dir 79418 460612 460544 5685160

directory operations:
	lookups = 79418
	creates = 460612
	removes = 460544
	getdents = 5685160

So, we have 80k directory lookups to instantiate an inode, but 460k
creates and removes, and 5.6M readdir calls. That's a lot of readdir
calls - this doesn't look like any part of a MySQL workload I'm
familiar with....

> trans 0 3491960 0

3.5 million asynchronous transaction commits. That's a lot; so far I
can account for about 1.5M of them (creates, removes and a handful
of data extents). Oh, a couple of million data writes - I bet the
rest are timestamp/i_version updates.

> log 89045 2859542 62 132145 143932

journal stats:
	log write IOs = 89045
	log write blocks = 2859542 = 1.4GB
	log stalls waitingfor iclogbuf = 62	(can ignore)
	log forces = 132145			(fsync load!)
	log force sleeps = 143932		(fsync waits!)

> push_ail 3491960 24 619 53860 0 6433 13135 284324 0 445

log space reservations = 3491960
  ... that blocked = 24
AIL pushes = 619
objects cleaned = 53860
buffers pushed = 0
objects skipped because pinned = 6433
objects skipped because locked = 13135
objects skipped because already pushed = 284324		(inodes!)
Push restarts = 0
log forces to unpin objects = 445

So, in terms of metadata writeback, there are inodes being flushed
here, and not much else. The directory blocks are probably being
freed before they get flushed (0 buffers pushed), which indicates
that the inode creates/removes are for temporary files.

> xstrat 64342 0

64342 data extent allocations during writeback. 

> rw 951375 2937203

->read_iter and ->write_iter calls.

> attr 0 0 0 0
> icluster 47412 38985 221903
> vnodes 5294 0 0 0 381123 381123 381123 0

only 5294 active inodes in cache at sample time, 380k inodes have
been fully reclaimed.

> buf 4497307 6910 4497106 1054073 13012 201 0 0 0

~4.5M metadata buffer lookups, 7,000 lookup misses.

> abtb2 139597 675266 27639 27517 0 0 0 0 0 0 0 0 0 0 1411718
> abtc2 240942 1207277 120532 120410 0 0 0 0 0 0 0 0 0 0 4618844
> bmbt2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
> ibt2 762383 3048311 69 67 0 0 0 0 0 0 0 0 0 0 263
> fibt2 1114420 2571311 143583 143582 0 0 0 0 0 0 0 0 0 0 1232534

The btree stats indicate lookup, compare, insert, delete and move
operations on the free space btrees and the inode btrees. This means
all the data being written is in just one to a few extents per inode,
and about 143583 inodes were created and removed in this period.

Let's tie that number back into the number of log forces from above:

log force sleeps = 143932               (fsync waits!)

It's almost identical. Whatever is creating/removing these files is
probably also causing all the fsync load and the log CRC overhead
you are measuring. Identifying what that is will give you some idea
of how to reduce the fsync load and hence potentially decrease the
log and CRC overhead that it is causing....

> rmapbt 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
> refcntbt 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
> qm 0 0 0 0 0 0 0 0
> xpc 3366711296 24870568605 34799779740

So in 64342 data extent allocations during writeback, we allocated
3366711296 bytes = 3.3GB. We wrote 24GB of data through ->write_iter
and read 34GB of data through ->read_iter.  That means average
read/write sizes are about 24k/12k. Pretty typical for a database, I
think.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c)
  2016-11-01  5:47           ` Dave Chinner
@ 2016-11-02  2:18             ` L.A. Walsh
  2016-11-03  8:29               ` Dave Chinner
  0 siblings, 1 reply; 20+ messages in thread
From: L.A. Walsh @ 2016-11-02  2:18 UTC (permalink / raw)
  To: linux-xfs

Snipping in various places:

Dave Chinner wrote:
> directory operations:
> 	lookups = 79418,  creates = 460612
> 	removes = 460544, getdents = 5685160
>
> SO, we have 80k directory lookups to instantiate an inode, but 460k
> creates and removes, and 5.6M readdir calls. That's a lot of readdir
> calls....
>   
>> trans 0 3491960 0
>>     
> 3.5 million asynchronous transaction commits. That's a lot, so far I
> can account for about 1.5M of them (creates, removes and a handful
> of data extents). Oh, a couple of million data writes - I bet the
> rest are timestamp/i_version updates.
>   
> [...]
> log force sleeps = 143932               (fsync waits!)
> ...
> Identifying what that is will give you some idea
> of how to reduce the fsync load and hence potentially decrease the
> log and CRC overhead that it is causing....
>   
----
    Back when the free-inode performance enhancement was introduced,
it was required that crc'ing of metadata also be enabled.  Have
these options been separated yet?  If so, save yourself some reading
and just get the "thanks!", and ignore the rest of this, but if
not... please read on, and please re-consider...

    I asked for a separation of these options on the basis that
crc'ing of the file system was not something I wanted on my files due
to performance and data security and recovery considerations.  I was
told that the crc'ing of the data wouldn't be noticeable (something I
was skeptical of), and that allowing the options separately wouldn't
be offered.

    My inner cynic thought that this was because many, if not most,
users wouldn't need the crc and it would only be another source of
problems -- one that would disable data-access if the crc counts
couldn't be verified. 

    I really, still, don't want the crc's on my disks, I don't see
that they would provide any positive benefit in my usage -- nor in
many uses of xfs -- which ISN'T to say I can't see the need and/or
desire to have them for many classes of xfs users -- just not one
that I belong to at this point.  I was more worried about performance
than anything else (always trying to eke the most out of a fixed budget!).

    I see the speedup from the free-inode tree in saving the
time during the real-time-search for free space, but only saw the crc
calculations as time-wasting overhead for the customer not needing
such integrity guarantees. 

    Is the free-space inode feature anything more than something to
offset the speed lost by crc calculations?  Wouldn't xfs be more
flexible if it could be tuned for performance OR integrity (or
a mix of both)?   Even if someone only wants to support the
combination in some official release, allowing those options to be
selected separately would allow better testing, as well as isolation
of features for specific workloads and benchmarks.

L. Walsh



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c)
  2016-11-02  2:18             ` [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c) L.A. Walsh
@ 2016-11-03  8:29               ` Dave Chinner
  2016-11-03 16:04                 ` L.A. Walsh
  0 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-11-03  8:29 UTC (permalink / raw)
  To: L.A. Walsh; +Cc: linux-xfs

Hi Linda,

On Tue, Nov 01, 2016 at 07:18:32PM -0700, L.A. Walsh wrote:
> Snipping in various places:
> 
> Dave Chinner wrote:
> >directory operations:
> >	lookups = 79418,  creates = 460612
> >	removes = 460544, getdents = 5685160
> >
> >SO, we have 80k directory lookups to instantiate an inode, but 460k
> >creates and removes, and 5.6M readdir calls. That's a lot of readdir
> >calls....
> >>trans 0 3491960 0
> >3.5 million asynchronous transaction commits. That's a lot, so far I
> >can account for about 1.5M of them (creates, removes and a handful
> >of data extents). Oh, a couple of million data writes - I bet the
> >rest are timestamp/i_version updates.
> >[...]
> >log force sleeps = 143932               (fsync waits!)
> >...
> >Identifying what that is will give you some idea
> >of how to reduce the fsync load and hence potentially decrease the
> >log and CRC overhead that it is causing....
> ----
>    Back when the free-inode performance enhancement was introduced,
> it was required that crc'ing of metadata also be enabled.  Have
> these options been separated yet?

No, and they won't be. The answer has not changed.

.....

>    I really, still, don't want the crc's on my disks, I don't see
> that they would provide any positive benefit in my usage -- nor in
> many uses of xfs -- which ISN'T to say I can't see the need and/or
> desire to have them for many classes of xfs users -- just not one
> that I belong to at this point.  I was more worried about performance
> than anything else (always trying to eek the most out of fixed budget!).

You may not want CRC's, but they /aren't for you/.  The benefits of
CRCs are realised when things go wrong, not when things go right.
IOWs, CRCs are most useful for the people supporting XFS as they
provide certainty about where errors and corruptions have
originated.

As most users never have things go wrong, all they think is "CRCs
are unnecessary overhead". It's just like backups - how many people
don't make backups because they cost money right now and there's no
tangible benefit until something goes wrong which almost never
happens?

>    I see the speedup from the free-inode tree in saving the
> time during the real-time-search for free space, but only saw the crc
> calculations as time-wasting overhead for the customer not needing
> such integrity guarantees.

Exactly my point. Humans are terrible at risk assessment and
mitigation because most people are unaware of the unconscious
cognitive biases that affect this sort of decision making.

>    Is the free-space inode feature anything more than something to
> offset the speed lost by crc calculations?

That's not what the free inode btree does. It actually slows down
allocation on an empty filesystem and trades off that increase in
"empty filesystem" overhead for significantly better aged filesystem
inode allocation performance. i.e. the finobt provides more
deterministic inode allocation overhead, not "faster" allocation.

Let me demonstrate with some numbers on empty filesystem create
rate:

			create rate	sys CPU time	write rate
			(files/s)	(seconds)	  (MB/s)
crc = 0, finobt = 0:	238943		  2629		 ~200
crc = 1, finobt = 0:	231582		  2711		  ~40
crc = 1, finobt = 1:	232563		  2766		  ~40
*hacked* crc disable:   231435		  2789		  ~40

Std deviation of the file create rate is about +/-10%, so the
differences between the create rate results (3%) are not
significant. Yes, there's a slight decrease in performance with CRCs
enabled, but that's because of the increased btree footprint  due to
the increased btree header size in the v5 format. Hence lookups,
modifications, etc take slightly longer and cost more CPU.

We can see that the system CPU time increased by 3.1% with the
"addition of CRCs".  The CPU usage increases by a further 2% with
the addition of the free inode btree, which should give you an idea
of how much CPU time even a small btree consumes. The allocated
inode btree is huge in comparison to the finobt in this workload,
which is why even a small change in header size (when CRCs are
enabled) makes a large difference in CPU usage.

To verify that CRCs have no significant impact on inode allocation,
let's look at the actual CPU being used by the CRC calculations in
this workload:

  0.28%  [kernel]  [k] crc32c_pcl_intel_update

That's only a small proportion of the entire increase in CPU
consumption that comes from "turning on CRCs". Indeed, the "*hacked* CRC
disable" results are from skipping CRC calculations in the code
altogether and returning "verify ok" without calculating them. The
create rate is identical to the crc=1,finobt=1 numbers and the CPU
usage is /slightly higher/ than when CRCs are enabled.

IOWs, for most workloads CRCs have no impact on filesystem
performance.  Yes, there are cases where there is some impact (as
Nick has pointed out) but these are situations where minor
optimisations can make a big difference, such as Nick's suggestion
to zero before write rather than do two partial calcs. This is
not a severe issue - it's just the normal process of iterative
improvement.
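
For anyone who wants to see why the zero-and-single-pass trick is
equivalent, here's a minimal userspace sketch. It uses a naive
bitwise software crc32c rather than the kernel implementation, and
the buffer size and field offset are made up purely for illustration:

/* crc_zero_demo.c - illustration only, not kernel code */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* naive bitwise CRC32C (Castagnoli, reflected) - slow, demo only */
static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78
					: (crc >> 1);
	}
	return crc;
}

int main(void)
{
	unsigned char buf[256];
	const size_t crc_off = 32;	/* made-up offset of the CRC field */
	uint32_t zero = 0, split, onepass;

	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char)i;

	/* old way: CRC up to the field, over 4 zero bytes, then the rest */
	split = crc32c_sw(~0U, buf, crc_off);
	split = crc32c_sw(split, &zero, sizeof(zero));
	split = crc32c_sw(split, buf + crc_off + 4,
			  sizeof(buf) - crc_off - 4);

	/* new way: zero the field in place, one pass over the whole buffer */
	memset(buf + crc_off, 0, 4);
	onepass = crc32c_sw(~0U, buf, sizeof(buf));

	assert(split == onepass);
	printf("split 0x%08x == one-pass 0x%08x\n", split, onepass);
	return 0;
}

Both orderings feed exactly the same byte stream to the CRC; the
single-pass version just lets the implementation see it as one large
chunk, which is what the accelerated powerpc/x86 code wants.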

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c)
  2016-11-03  8:29               ` Dave Chinner
@ 2016-11-03 16:04                 ` L.A. Walsh
  2016-11-03 18:15                   ` Eric Sandeen
  2016-11-03 23:00                   ` Dave Chinner
  0 siblings, 2 replies; 20+ messages in thread
From: L.A. Walsh @ 2016-11-03 16:04 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs



Dave Chinner wrote:
> 
> As most users never have things go wrong, all they think is "CRCs
> are unnecessary overhead". It's just like backups - how many people
> don't make backups because they cost money right now and there's no
> tangible benefit until something goes wrong which almost never
> happens?
----
	But it's not like backups.  You can't run a util
program upon discovering bad CRC's that will fix the file system
because the file system is no longer usable.  That means you
have to restore from backup.  Thus, for those keeping
backups, there is no benefit, as they'll have to restore
from backups in either case.

> Exactly my point. Humans are terrible at risk assessment and
> mitigation because most people are unaware of the unconcious
> cognitive biases that affect this sort of decision making.
---
	My risk is near 0 since my file systems are monitored
by a raid controller with read patrols made over the data on
a periodic basis.  I'll assert that the chance of data randomly
going corrupt is much higher because there is a LOT more data than
metadata.  On top of that, because I keep backups, my risk is,
at worst, the same without crc's as with them.

>  It actually slows down
> allocation on an empty filesystem and trades off that increase in
> "empty filesystem" overhead for significantly better aged filesystem
> inode allocation performance.
----
	Ok, let's see, ages of my file systems:
4 from 2009 (7 years)
1 from 2013 (3 years)
9 from 2014 (2 years)
---
	I don't think I have any empty or new filesystems
(FWIW, I store the creation time in the UUID).

> i.e. the finobt provides more
> deterministic inode allocation overhead, not "faster" allocation.
> 
> Let me demonstrate with some numbers on empty filesystem create
> rate:
> 
> 			create rate	sys CPU time	write rate
> 			(files/s)	(seconds)	  (MB/s)
> crc = 0, finobt = 0:	238943		  2629		 ~200
> crc = 1, finobt = 0:	231582		  2711		  ~40
> crc = 1, finobt = 1:	232563		  2766		  ~40
> *hacked* crc disable:   231435	  2789		  ~40


> We can see that the system CPU time increased by 3.1% with the
> "addition of CRCs".  The CPU usage increases by a further 2% with
> the addition of the free inode btree,
---
	On an empty file system or older ones that are >50%
used?  It's *nice* to be able to run benchmarks, but not allowing
crc to be disabled, disables that possibility -- and that's
sorta the point.  In order to prove your point, you created a
benchmark with crc's disabled. But the thing about benchmarks
is making it so others can reproduce your results.  That's
the problem.  If I could do the same benchmarks, and get
similar results, I'd give up on finobt as not being worth it.

	But I'm not able to run such tests on my workload
and/or filesystems.  The common advice about performance numbers
and how they are affected by options is to do benchmarks
on your own systems with your own workload and see if the option
helps.  That's what I want to do.  Why deny that?



> which should give you an idea
> of how much CPU time even a small btree consumes.
---
	In a non-real-world case on empty file systems. How
does it work in the real world on file systems like mine?
I know the MB/s isn't close, w/my max sustained I/O rates
being about 1GB/s (all using direct i/o -- rate drops
significantly if I use kernel buffering).  Even not
pre-allocating and defragmenting the test file will noticeably
affect I/O rates.  

	Showing the result on an empty file
system is when finobt would have the *least* effect, since
it is when the kernel has to search for space that things
slow down, but if the free space is pre-allocated in a 
dedicated b-tree, then the kernel doesn't have to search --
which would be a much bigger difference than on an empty
file system.


> The allocated
> inode btree is huge in comparison to the finobt in this workload,
> which is why even a small change in header size (when CRCs are
> enabled) makes a large difference in CPU usage.
> 
> To verify that CRC has no significant impact on inode allocation,
> let's look at the actual CPU being used by the CRC calculations in
> this workload are:
> 
>   0.28%  [kernel]  [k] crc32c_pcl_intel_update
---
	And how much is spent searching for free space?
On multi-gig files it can reduce I/O rates by 30% or more.

> 
> Only a small proportion of the entire increase in CPU consumption
> that comes from "turning on CRCS". Indeed, the "*hacked* CRC
> disable" results are from skipping CRC calculations in the code
> altogether and returning "verify ok" without calculating them. The
> create rate is identical to the crc=1,finobt=1 numbers and the CPU
> usage is /slightly higher/ than when CRCs are enabled.
> 
> IOWs, for most workloads CRCs have no impact on filesystem
> performance.  
---
	Too bad no one can test the effect on their
own workloads, though if not doing crc's takes more CPU, then
it sounds like an algorithm problem: crc calculations don't
take "negative time", and a benchmark showing they do indicates
something else is causing the slowdown.

> Cheers,
> Dave.
----
Sigh... and Cheers to you too! ;-)
Linda


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c)
  2016-11-03 16:04                 ` L.A. Walsh
@ 2016-11-03 18:15                   ` Eric Sandeen
  2016-11-03 23:00                   ` Dave Chinner
  1 sibling, 0 replies; 20+ messages in thread
From: Eric Sandeen @ 2016-11-03 18:15 UTC (permalink / raw)
  To: L.A. Walsh, Dave Chinner; +Cc: linux-xfs

On 11/3/16 11:04 AM, L.A. Walsh wrote:
> 
> 
> Dave Chinner wrote:
>>
>> As most users never have things go wrong, all they think is "CRCs
>> are unnecessary overhead". It's just like backups - how many people
>> don't make backups because they cost money right now and there's no
>> tangible benefit until something goes wrong which almost never
>> happens?
> ----
>     But it's not like backups.  You can't run a util
> program upon discovering bad CRC's that will fix the file system
> because the file system is no longer usable.

Sure you can - xfs_repair knows what to do in such a case.

It's not guaranteed that you get /all/ your data back, in every
corrupted fs situation, but that's not because of CRCs - a bad CRC
is just the messenger about filesystem corruption.

> That means you
> have to restore from backup.  Thus, for those keeping
> backups, there is no benefit, as they'll have to restore
> from backups in either case. 

Again, as Dave said, the format changes implemented for CRCs help
us support XFS.  It's not something /directly/ useful to the user,
other than possibly stopping a corruption before it gets worse, by
early detection.

If you have a corruption which is so bad that you have to go to
backups, that would be true with or without CRCs.

-Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c)
  2016-11-03 16:04                 ` L.A. Walsh
  2016-11-03 18:15                   ` Eric Sandeen
@ 2016-11-03 23:00                   ` Dave Chinner
  2016-11-04  6:56                     ` L.A. Walsh
  1 sibling, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-11-03 23:00 UTC (permalink / raw)
  To: L.A. Walsh; +Cc: linux-xfs

On Thu, Nov 03, 2016 at 09:04:42AM -0700, L.A. Walsh wrote:
> 
> 
> Dave Chinner wrote:
> >
> >As most users never have things go wrong, all they think is "CRCs
> >are unnecessary overhead". It's just like backups - how many people
> >don't make backups because they cost money right now and there's no
> >tangible benefit until something goes wrong which almost never
> >happens?
> ----
> 	But it's not like backups.  You can't run a util
> program upon discovering bad CRC's that will fix the file system
> because the file system is no longer usable.

xfs_repair will fix it, just like it will fix the same corruption
on non-CRC filesystems.

> >Exactly my point. Humans are terrible at risk assessment and
> >mitigation because most people are unaware of the unconcious
> >cognitive biases that affect this sort of decision making.
> ---
> 	My risk is near 0 since my file systems are monitored
> by a raid controller with read patrols made over the data on
> a period basis.

If I had a dollar for every time someone said "hardware raid
protects me" I'd have retired years ago.

Media scrubbing does not protect against misdirected writes,
corruptions to/from the storage, memory errors, software bugs, bad
compilers (yes, we've already had XFS CRCs find a compiler bug),
etc.

> I'll assert that the chance of data randomly
> going corrupt is much higher because there is ALOT more data than
> metadata.  On top of that, because I keep backups, my risk, is
> at worst, the same without crc's as with them.

The /scale of disaster/ for metadata corruption is far higher than
for file data - a single bit error can trash the entire filesystem.

You may not care about this, but plenty of other XFS users do.

> i.e. the finobt provides more
> >deterministic inode allocation overhead, not "faster" allocation.
> >
> >Let me demonstrate with some numbers on empty filesystem create
> >rate:
> >
> >			create rate	sys CPU time	write rate
> >			(files/s)	(seconds)	  (MB/s)
> >crc = 0, finobt = 0:	238943		  2629		 ~200
> >crc = 1, finobt = 0:	231582		  2711		  ~40
> >crc = 1, finobt = 1:	232563		  2766		  ~40
> >*hacked* crc disable:   231435	  2789		  ~40
> 
> 
> >We can see that the system CPU time increased by 3.1% with the
> >"addition of CRCs".  The CPU usage increases by a further 2% with
> >the addition of the free inode btree,
> ---
> 	On an empty file system or older ones that are >50%
> used?
>
> It's *nice* to be able to benchmarks, but not allowing
> crc to be disabled, disables that possibility -- and that's
> sorta the point. 

If you want to reproduce the above numbers, the script is below.
You don't need the "CRC disable" hack to test whether CRCs have
overhead or not, CPU profiles are sufficient for that. But, really,
I don't care about whether you can reproduce these tests, because
microbenchmarks don't matter to production systems.

That is, you haven't provided any numbers to back up your
assertions that CRCs have excessive overhead for /your workload/.
For me to care about what you are saying, you need to demonstrate a
performance degradation between v4 and v5 filesystem formats for
/your workloads/.

I can't do this for you. I don't know what your workload is, nor
what hardware you use.  *Give me numbers* that I can work with -
something we can measure and fix. You need to do the work to show
there's an issue - I can't do that for you, and no amount of
demanding that I do will change that.

> >IOWs, for most workloads CRCs have no impact on filesystem
> >performance.
> ---
> 	Too bad no one can test such the effect on their
> own workloads, though if not doing crc's takes more CPU, then
> it sounds like an algorithm problem: crc calculations don't
> take "negative time", and a benchmark showing they do indicates
> something else is causing the slowdown.

I'm guessing that you aren't aware of how memory access patterns
affect performance on modern CPUs. i.e. sequential memory access to
a structure is much faster than random memory access because the
hardware prefetchers detect the sequential accesses and minimise
cache miss latency.

e.g. for a typical 4k btree block, doing a binary search on a cache
cold block requires 10-12 cache misses for a complete search. However,
if we run a CRC check on it, we take a couple of cache misses before
the hardware prefetcher kicks in, and then it's just CPU time to run
the calc. Then the binary search doesn't have a cache miss at all.
Hence if the CRC calc is fast enough (and with h/w acceleration it is)
adding CRCs will make the code run faster....

This is actually a well known behaviour of modern CPUs - for
years we've been using memset() to initialise structures when it's
technically not necessary because it's the fastest way to prime the
CPU caches for upcoming accesses to that structure.
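
As a rough illustration of that access pattern (a made-up userspace
sketch, nothing like the real btree code - the point is only that the
checksum pass walks the block sequentially, so the random accesses of
the search that follows hit warm cache lines):

/* verify-then-search access pattern, illustrative only */
#include <stdint.h>
#include <stddef.h>

#define NKEYS	(4096 / sizeof(uint32_t))	/* one 4k "btree block" */

/* stand-in for the CRC: a sequential, prefetch-friendly pass */
static uint32_t verify_block(const uint32_t *blk)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < NKEYS; i++)
		sum ^= blk[i];
	return sum;
}

/* binary search: random access, ~10-12 cold misses on a 4k block */
static int lookup(const uint32_t *blk, uint32_t key)
{
	size_t lo = 0, hi = NKEYS;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (blk[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return (lo < NKEYS && blk[lo] == key) ? (int)lo : -1;
}

int verify_then_lookup(const uint32_t *blk, uint32_t expect, uint32_t key)
{
	if (verify_block(blk) != expect)  /* sequential pass warms the cache */
		return -1;
	return lookup(blk, key);	  /* ...so this mostly hits L1/L2 */
}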

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

#!/bin/bash

QUOTA=
MKFSOPTS=
NFILES=100000
DEV=/dev/vdc
LOGBSIZE=256k

while [ $# -gt 0 ]; do
	case "$1" in
	-q)	QUOTA="uquota,gquota,pquota" ;;
	-N)	NFILES=$2 ; shift ;;
	-d)	DEV=$2 ; shift ;;
	-l)	LOGBSIZE=$2; shift ;;
	--)	shift ; break ;;
	esac
	shift
done
MKFSOPTS="$MKFSOPTS $*"

echo QUOTA=$QUOTA
echo MKFSOPTS=$MKFSOPTS
echo DEV=$DEV

sudo umount /mnt/scratch > /dev/null 2>&1
sudo mkfs.xfs -f $MKFSOPTS $DEV
sudo mount -o nobarrier,logbsize=$LOGBSIZE,$QUOTA $DEV /mnt/scratch
sudo chmod 777 /mnt/scratch
cd /home/dave/src/fs_mark-3.3/
sudo sh -c "echo 1 > /proc/sys/fs/xfs/stats_clear"
time ./fs_mark  -D  10000  -S0  -n  $NFILES  -s  0  -L  32 \
	-d  /mnt/scratch/0  -d  /mnt/scratch/1 \
	-d  /mnt/scratch/2  -d  /mnt/scratch/3 \
	-d  /mnt/scratch/4  -d  /mnt/scratch/5 \
	-d  /mnt/scratch/6  -d  /mnt/scratch/7 \
	-d  /mnt/scratch/8  -d  /mnt/scratch/9 \
	-d  /mnt/scratch/10  -d  /mnt/scratch/11 \
	-d  /mnt/scratch/12  -d  /mnt/scratch/13 \
	-d  /mnt/scratch/14  -d  /mnt/scratch/15 \
	| tee >(stats --trim-outliers | tail -1 1>&2)
sync
sudo umount /mnt/scratch
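
(If you want to drive the comparison yourself, a hypothetical
invocation - script saved as fsmark-test.sh, device and file count
just examples - would look something like:

	sudo ./fsmark-test.sh -d /dev/vdc -N 100000 -- -m crc=1,finobt=1
	sudo ./fsmark-test.sh -d /dev/vdc -N 100000 -- -m crc=0

Everything after "--" is passed straight to mkfs.xfs, so that's where
the crc/finobt combinations go. It also assumes fs_mark is built in
the path hard-coded above and a "stats" summarising tool is in $PATH.)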

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfc] larger batches for crc32c
  2016-10-27 16:17 [rfc] larger batches for crc32c Nicholas Piggin
  2016-10-27 18:29 ` Darrick J. Wong
  2016-10-27 21:42 ` Dave Chinner
@ 2016-11-04  0:12 ` Dave Chinner
  2016-11-04  2:28   ` Nicholas Piggin
  2 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-11-04  0:12 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner

On Fri, Oct 28, 2016 at 03:17:47AM +1100, Nicholas Piggin wrote:
> Hi guys,
> 
> We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
> on powerpc. I could reproduce similar overheads with dbench as well.
> 
> 1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
>         |
>         ---__crc32c_le
>            |          
>             --1.11%--chksum_update
>                       |          
>                        --1.11%--crypto_shash_update
>                                  crc32c
>                                  xlog_cksum
>                                  xlog_sync
>                                  _xfs_log_force_lsn
>                                  xfs_file_fsync
>                                  vfs_fsync_range
>                                  do_fsync
>                                  sys_fsync
>                                  system_call
>                                  0x17738
>                                  0x17704
>                                  os_file_flush_func
>                                  fil_flush
> 
> As a rule, it helps the crc implementation if it can operate on as large a
> chunk as possible (alignment, startup overhead, etc). So I did a quick hack
> at getting XFS checksumming to feed crc32c() with larger chunks, by setting
> the existing crc to 0 before running over the entire buffer. Together with
> some small work on the powerpc crc implementation, crc drops below 0.1%.
> 
> I don't know if something like this would be acceptable? It's not pretty,
> but I didn't see an easier way.

Here's an alternative, slightly cleaner patch that optimises the CRC
update side but leaves the verify side as it is. I've not yet
decided exactly what is cleanest for the xlog_cksum() call in log
recovery, but that won't change the performance of the code.  Can
you give this a run through, Nick?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


xfs: optimise CRC updates

From: Dave Chinner <dchinner@redhat.com>

Nick Piggin reported that the CRC overhead in an fsync heavy
workload was higher than expected on a Power8 machine. Part of this
was to do with the fact that the power8 CRC implementation is not
efficient for CRC lengths of less than 512 bytes, and so the way we
split the CRCs over the CRC field means a lot of the CRCs are
reduced to being less than optimal size.

To optimise this, change the CRC update mechanism to zero the CRC
field first, and then compute the CRC in one pass over the buffer
and write the result back into the buffer. We can do this safely
because anything writing a CRC has exclusive access to the buffer
the CRC is being calculated over.

We leave the CRC verify code the same - it still splits the CRC
calculation - because we do not want read-only operations modifying
the underlying buffer. This is because read-only operations may not
have exclusive access to the buffer guaranteed, and so temporary
modifications could leak out to other processes accessing the
buffer concurrently.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_cksum.h     | 26 ++++++++++++++++++++++----
 fs/xfs/libxfs/xfs_inode_buf.c |  2 +-
 fs/xfs/xfs_log.c              |  2 +-
 fs/xfs/xfs_log_recover.c      | 12 +++++++-----
 4 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676ad8cd..a416c7cb23ea 100644
--- a/fs/xfs/libxfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -6,10 +6,11 @@
 /*
  * Calculate the intermediate checksum for a buffer that has the CRC field
  * inside it.  The offset of the 32bit crc fields is passed as the
- * cksum_offset parameter.
+ * cksum_offset parameter. We do not modify the buffer during verification,
+ * hence we have to split the CRC calculation across the cksum_offset.
  */
 static inline __uint32_t
-xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
 {
 	__uint32_t zero = 0;
 	__uint32_t crc;
@@ -26,6 +27,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 }
 
 /*
+ * Fast CRC method where the buffer is modified. Callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline __uint32_t
+xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	/* zero the CRC field */
+	*(__le32 *)(buffer + cksum_offset) = 0;
+
+	/* single pass CRC calculation for the entire buffer */
+	return crc32c(XFS_CRC_SEED, buffer, length);
+}
+
+/*
  * Convert the intermediate checksum to the final ondisk format.
  *
  * The CRC32c calculation uses LE format even on BE machines, but returns the
@@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc)
 
 /*
  * Helper to generate the checksum for a buffer.
+ *
+ * This modifies the buffer temporarily - callers must have exclusive
+ * access to the buffer while the calculation takes place.
  */
 static inline void
 xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 {
-	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+	__uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
 
 	*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
 }
@@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 static inline int
 xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 {
-	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+	__uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
 
 	return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
 }
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 134424fac434..d3aa50eb998f 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -436,7 +436,7 @@ xfs_dinode_calc_crc(
 		return;
 
 	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+	crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
 			      XFS_DINODE_CRC_OFF);
 	dip->di_crc = xfs_end_cksum(crc);
 }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3b74fa011bb1..3ebe444eb60f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1668,7 +1668,7 @@ xlog_cksum(
 	__uint32_t		crc;
 
 	/* first generate the crc for the record header ... */
-	crc = xfs_start_cksum((char *)rhead,
+	crc = xfs_start_cksum_update((char *)rhead,
 			      sizeof(struct xlog_rec_header),
 			      offsetof(struct xlog_rec_header, h_crc));
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9b3d7c76915d..56b7a2f6aaf2 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5113,19 +5113,21 @@ xlog_recover_process(
 	struct list_head	*buffer_list)
 {
 	int			error;
+	__le32			old_crc = rhead->h_crc;
 	__le32			crc;
 
+
 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
 
 	/*
 	 * Nothing else to do if this is a CRC verification pass. Just return
 	 * if this a record with a non-zero crc. Unfortunately, mkfs always
-	 * sets h_crc to 0 so we must consider this valid even on v5 supers.
+	 * sets old_crc to 0 so we must consider this valid even on v5 supers.
 	 * Otherwise, return EFSBADCRC on failure so the callers up the stack
 	 * know precisely what failed.
 	 */
 	if (pass == XLOG_RECOVER_CRCPASS) {
-		if (rhead->h_crc && crc != rhead->h_crc)
+		if (old_crc && crc != old_crc)
 			return -EFSBADCRC;
 		return 0;
 	}
@@ -5136,11 +5138,11 @@ xlog_recover_process(
 	 * zero CRC check prevents warnings from being emitted when upgrading
 	 * the kernel from one that does not add CRCs by default.
 	 */
-	if (crc != rhead->h_crc) {
-		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+	if (crc != old_crc) {
+		if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
 			xfs_alert(log->l_mp,
 		"log record CRC mismatch: found 0x%x, expected 0x%x.",
-					le32_to_cpu(rhead->h_crc),
+					le32_to_cpu(old_crc),
 					le32_to_cpu(crc));
 			xfs_hex_dump(dp, 32);
 		}

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [rfc] larger batches for crc32c
  2016-11-04  0:12 ` [rfc] larger batches for crc32c Dave Chinner
@ 2016-11-04  2:28   ` Nicholas Piggin
  0 siblings, 0 replies; 20+ messages in thread
From: Nicholas Piggin @ 2016-11-04  2:28 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, Christoph Hellwig, Dave Chinner

On Fri, 4 Nov 2016 11:12:48 +1100
Dave Chinner <david@fromorbit.com> wrote:

> On Fri, Oct 28, 2016 at 03:17:47AM +1100, Nicholas Piggin wrote:
> > Hi guys,
> > 
> > We're seeing crc32c_le show up in xfs log checksumming on a MySQL benchmark
> > on powerpc. I could reproduce similar overheads with dbench as well.
> > 
> > 1.11%  mysqld           [kernel.vmlinux]            [k] __crc32c_le
> >         |
> >         ---__crc32c_le
> >            |          
> >             --1.11%--chksum_update
> >                       |          
> >                        --1.11%--crypto_shash_update
> >                                  crc32c
> >                                  xlog_cksum
> >                                  xlog_sync
> >                                  _xfs_log_force_lsn
> >                                  xfs_file_fsync
> >                                  vfs_fsync_range
> >                                  do_fsync
> >                                  sys_fsync
> >                                  system_call
> >                                  0x17738
> >                                  0x17704
> >                                  os_file_flush_func
> >                                  fil_flush
> > 
> > As a rule, it helps the crc implementation if it can operate on as large a
> > chunk as possible (alignment, startup overhead, etc). So I did a quick hack
> > at getting XFS checksumming to feed crc32c() with larger chunks, by setting
> > the existing crc to 0 before running over the entire buffer. Together with
> > some small work on the powerpc crc implementation, crc drops below 0.1%.
> > 
> > I don't know if something like this would be acceptable? It's not pretty,
> > but I didn't see an easier way.  
> 
> Here's an alternative, slightly cleaner patch that optimises the CRC
> update side but leaves the verify side as it is. I've not yet
> decided exactly what is cleanest for the xlog_cksum() call in log
> recovery, but that won't change the performance of the code.  Can
> you give this a run through, Nick?

Hi Dave,

Yeah sorry for the slow response, I've been getting a more realistic
MySQL benchmark setup working (what I had reproduced appeared to
be the same overhead, but I wanted to get something better to retest
with). So your patch comes at a good time (and thanks for working on
it). I'll see if I can get something running and have results for you
by next week.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c)
  2016-11-03 23:00                   ` Dave Chinner
@ 2016-11-04  6:56                     ` L.A. Walsh
  2016-11-04 17:37                       ` Eric Sandeen
  0 siblings, 1 reply; 20+ messages in thread
From: L.A. Walsh @ 2016-11-04  6:56 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs



Dave Chinner wrote:
> If I had a dollar for every time someone said "hardware raid
> protects me" I'd have retired years ago.
----
	It's not a sole line of protection, but it does a fair
job of *detection*.  

> Media scrubbing does not protect against misdirected writes,
> corruptions to/from the storage, memory errors, software bugs, bad
> compilers (yes, we've already had XFS CRCs find a compiler bug),
> etc.
---
	Crc's don't _protect_ against such things either -- they
can allow detection, but for protection ... I make do with
xfsdump.  As for crc's -- I already had the feature
prevent me from creating and using new partitions where it
didn't like me setting the disk UUID when creating a
new partition -- something that I heard
would be fixed -- but something that prevented me from testing
the new finobt feature I was interested in.

I'm sure I'm not alone in wanting to test in my environment, but 
_both_ with and without the crc option.  

	I seem to remember, recently, that it took other kernel
developers to disable xfs panicking the entire system on problem
detection, with the idea of only disabling what was broken.

	Along the same lines, if I am using crc's and badness is
detected, I'd want to be able to keep the disk online, though
in "read-only" mode to allow me to explore what was wrong and find
the best solution. 
 
> You may not care about this, but plenty of other XFS users do.
---
	That's a crock -- I never said I didn't care about having
it -- just that I wanted to be able to run finobt with crc's being
disabled.  I noted after my last response that you said your crc
testing only jumped around the tests to compare crc's -- which
sounded like they were still being generated but not checked -- no 
wonder the no-crc option was the slowest of tested options.
 
> I can't do this for you. I don't know what your workload is, nor
> what hardware you use.  *Give me numbers* that I can work with -
> something we can measure and fix. You need to do the work to show
> there's an issue - I can't do that for you, and no amount of
> demanding that I do will change that.

I don't want to bother others with my testing, I just want the
platform to be open enough for me to quietly do my own testing
and go forward from there.  I may be ignorant, but I'm also
picky about things I care about -- thus I prefer to do _some_
things myself, thank-you.

I realize that closed platforms, where the undesirable
is bundled with the desirable and where end-users have no choice,
are the wave of the future.  I realize that in many cases, large
corporations are pushing these changes.

Microsoft's last "open-update" that allowed selecting what 
updates you wanted is history and now you are forced to take
everything or nothing.  Such offerings are not made for the benefit
of the users -- it's not something they want.  But it doesn't matter,
it's where large companies are pushing things so consumers will be
easier to control.  I wouldn't be surprised if this feature bundling
was being pushed by project sponsors and that developers had little
choice and I may not either, as I don't have the smallest fraction
of the time I would need to add in options or changes to the
SW I use, just to hold the "status quo", let alone make improvements. 

All I can usually do is ask and later, "re-ask" -- since policies
in effect at one point in time may not be present later.

Cheers!
-l




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c)
  2016-11-04  6:56                     ` L.A. Walsh
@ 2016-11-04 17:37                       ` Eric Sandeen
  0 siblings, 0 replies; 20+ messages in thread
From: Eric Sandeen @ 2016-11-04 17:37 UTC (permalink / raw)
  To: L.A. Walsh, Dave Chinner; +Cc: linux-xfs

On 11/4/16 1:56 AM, L.A. Walsh wrote:
> 
> 
> Dave Chinner wrote:
>> If I had a dollar for every time someone said "hardware raid
>> protects me" I'd have retired years ago.
> ----
>     It's not a sole line of protection, but it does a fair
> job of *detection*. 
>> Media scrubbing does not protect against misdirected writes,
>> corruptions to/from the storage, memory errors, software bugs, bad
>> compilers (yes, we've already had XFS CRCs find a compiler bug),
>> etc.
> ---
>     Crc's don't _protect_ against such things either -- they
> can allow detection, but for protection ... I make due with
> xfsdump.  As for crc's -- I already had the feature
> prevent me from creating and using new partitions where it
> didn't like me setting the disk UUID when creating a
> new partition -- something that I heard
> would be fixed -- but something that prevented me from testing
> the new finobt feature I was interested in.

That has been fixed for over a year, now, FWIW.

v4.3 kernel / v4.2.0 xfsprogs

> I'm sure I'm not alone in wanting to test in my environment, but _both_ with and without the crc option. 
>     I seem to remember, recently, that it took other kernel
> developers to disable xfs panicing the entire system on problem
> detection, with the idea of only disabling what was broken.

Sorry, what?

>     Along the same lines, if I am using crc's and badness is
> detected, I'd want to be able to keep the disk online, though
> in "read-only" mode to allow me to explore what was wrong and find
> the best solution.
>> You may not care about this, but plenty of other XFS users do.
> ---
>     That's a crock -- I never said I didn't care about having
> it -- just that I wanted to be able to run finobt with crc's being
> disabled. 

If you want an untestable a la carte mishmash of every option, there
are other filesystems which will do that for you.  They have lots of
weird broken corner cases as a result.

The xfs development crew has limited resources, and part of the decision
to /not/ make infinite combinations of features available was so that
we can get reliable coverage in testing and produce a robust result.

But we already went through all this on a similar thread 1 year ago...

> I noted after my last response, that you said your crc
> testing only jumped around the tests to compare crc's -- which
> sounded like they were still being generated but not checked -- no wonder the no-crc option was the slowest of tested options.
> 
>> I can't do this for you. I don't know what your workload is, nor
>> what hardware you use.  *Give me numbers* that I can work with -
>> something we can measure and fix. You need to do the work to show
>> there's an issue - I can't do that for you, and no amount of
>> demanding that I do will change that.
> 
> I don't want to bother others with my testing, I just want the
> platform to be open enough for me to quietly do my own testing
> and go forward from there.  I may be ignorant, but I'm also
> picky about things I care about -- thus I prefer to do _some_
> things myself, thank-you.
> 
> I realize that closed-platforms where the undesirable is bundled with the desirable and where end-users have no choice
> is the wave of the future.  I realize that in many cases, large
> corporations are pushing these changes.

Oh come on, now.

Open source doesn't mean everyone gets a pet pony in their favorite
color.  Failing to offer said ponies to each user doesn't make 
this a "closed platform" and I resent the assertion on behalf of
Dave and other xfs developers.  We're performing a herculean task
with a skeleton crew.  Dave, in particular, has performed tirelessly
for /years/ helping to create what is now arguably the highest-performing,
most robust, most scalable filesystem available on Linux.

A large part of XFS's success has been due to wise design and development
decisions, /including/ the decision to keep the test matrix under control.

And I'm not sure what to make of your mention of "large corporations"
but I can assure you that decisions like this are made within the xfs
developer community, without outside influence.

-Eric

> -l

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2016-11-04 17:37 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-27 16:17 [rfc] larger batches for crc32c Nicholas Piggin
2016-10-27 18:29 ` Darrick J. Wong
2016-10-28  3:21   ` Nicholas Piggin
2016-10-27 21:42 ` Dave Chinner
2016-10-27 23:16   ` Dave Chinner
2016-10-28  2:12   ` Nicholas Piggin
2016-10-28  4:29     ` Dave Chinner
2016-10-28  5:02     ` Nicholas Piggin
2016-10-31  3:08       ` Dave Chinner
2016-11-01  3:39         ` Nicholas Piggin
2016-11-01  5:47           ` Dave Chinner
2016-11-02  2:18             ` [rfe]: finobt option separable from crc option? (was [rfc] larger batches for crc32c) L.A. Walsh
2016-11-03  8:29               ` Dave Chinner
2016-11-03 16:04                 ` L.A. Walsh
2016-11-03 18:15                   ` Eric Sandeen
2016-11-03 23:00                   ` Dave Chinner
2016-11-04  6:56                     ` L.A. Walsh
2016-11-04 17:37                       ` Eric Sandeen
2016-11-04  0:12 ` [rfc] larger batches for crc32c Dave Chinner
2016-11-04  2:28   ` Nicholas Piggin
